需求:
获取西刺网代理ip信息,包括ip地址、端口号、ip类型
西刺网:http://www.xicidaili.com/nn/
那,如何解决这个问题?
分析页面结构和url设计得知:
数据都在本页面可以全部获取,没有单独的详情页面
翻页是通过修改当前页面 URL 末尾的页码后缀实现的,因此只要按页码拼接 URL 就可以依次抓取每一页
那,软件的运行环境?
python3.5
scrapy
twisted
requests
pymysql
以上是第三方包,通过pip安装
MySQL服务
其中db,user,password的值根据实际情况而定
#!/usr/bin/python3
# Scrape proxy records (IP address, port, type) from xicidaili.com and store
# them in MySQL. Pages are fetched synchronously with requests and parsed with
# a Scrapy Selector; inserts are submitted to a Twisted adbapi connection pool.
__author__ = 'beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

from random import randint, choice
from time import sleep

import requests
# Not referenced by name below; adbapi is given the driver as the string
# 'pymysql' when the pool is created.
import pymysql
from scrapy.selector import Selector
from twisted.enterprise import adbapi
from twisted.internet import reactor

# Basic database configuration -- adjust db/user/password to your environment.
db_settings = {
    'host': 'localhost',
    'db': 'db_name',
    'user': 'user_name',
    'password': 'password',
    'charset': 'utf8',
    'use_unicode': True
}

# Connection pool used for all inserts.
db_conn = adbapi.ConnectionPool('pymysql', **db_settings)


def go_sleep():
    """Block for a random amount of time to imitate a human visitor.

    Each round flips a coin (``randint(0, 1)``): on 1 it sleeps for a randomly
    chosen 0.1-0.6 seconds and flips again, on 0 it returns. The call may
    therefore return immediately.
    """
    while randint(0, 1):
        sleep(choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]))


def get_sql(ip, port, ip_type):
    """Build the upsert statement and its parameters for one proxy row.

    Args:
        ip: IP address text scraped from the row.
        port: Port text scraped from the row; converted with ``int``.
        ip_type: Proxy type text scraped from the row.

    Returns:
        A ``(sql, params)`` tuple, or ``None`` when any field is empty or the
        port cannot be converted to an integer.
    """
    if not (ip and port and ip_type):
        return None

    sql = (
        "insert into ip_server(ip, port, ip_type) value (%s, %s, %s) "
        "on DUPLICATE key update ip=values(ip), port=values(port), "
        "ip_type=values(ip_type)"
    )
    try:
        params = (ip, int(port), ip_type)
    except (TypeError, ValueError) as e:
        # A non-numeric port makes the row unusable; report it and skip.
        print(e)
        return None
    return sql, params


def go_insert(cursor, sql, params):
    """Execute one insert on the cursor supplied by the connection pool.

    Passed to ``db_conn.runInteraction``. Errors are printed rather than
    raised, so one failing row does not propagate to the caller.
    """
    try:
        cursor.execute(sql, params)
    except Exception as e:
        print(e)


def get_ip(pages=50):
    """Crawl proxy listings and queue each row for insertion into the database.

    Args:
        pages: Number of listing pages to fetch, starting at page 1. Defaults
            to 50.

    For every page the HTTP status code is printed. A page whose request fails
    or times out is reported and skipped.
    """
    # Request headers sent with every page fetch.
    headers = {
        'Referer': 'http://www.xicidaili.com/nn/',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/59.0.3071.115 Safari/537.36'
        )
    }
    # 1-based <td> positions of the fields inside a table row; named for
    # readability and hoisted because they never change between pages.
    ip_index, port_index, type_index = 2, 3, 6

    # range() excludes its upper bound, so add 1 to really fetch `pages` pages.
    for page in range(1, pages + 1):
        url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)
        go_sleep()
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(e)
            continue
        # Print the status code.
        print(response.status_code)

        # Parse the page.
        # NOTE(review): only rows carrying the `odd` class are selected;
        # confirm whether the remaining table rows should be matched as well.
        selectors = Selector(text=response.text)
        for tr in selectors.css('#ip_list .odd'):
            ip = tr.css('td:nth-child(%s)::text' % ip_index).extract_first()
            port = tr.css('td:nth-child(%s)::text' % port_index).extract_first()
            ip_type = tr.css('td:nth-child(%s)::text' % type_index).extract_first()

            statement = get_sql(ip, port, ip_type)
            if statement is None:
                # get_sql() returns None (not a tuple) for an unusable row, so
                # check before unpacking. As in the original flow, an invalid
                # row ends processing of the current page.
                break

            sql, params = statement
            try:
                # Hand the insert to the connection pool.
                db_conn.runInteraction(go_insert, sql, params)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    get_ip()
    # Run the reactor so the queued SQL operations can complete, and schedule
    # a stop 4 seconds after it starts.
    reactor.callLater(4, reactor.stop)
    reactor.run()