不要急于求成,你要做的只是比昨天的自己更优秀一点
--匿名
今天给大家讲一下 IpProxy(IP 代理):代理数据从 "http://www.xicidaili.com/nn" 爬取,以下是我转载的博客
https://www.jianshu.com/p/8975a3997ab6
需要解决的问题
1.ip,端口和协议都是在静态页面中爬取
2.验证代理ip是否可用
这里就给大家看看爬取的代码怎么写,其他的配置可以看我之前的博客,具体代码可以到我的 GitHub 上查看。QAQ!!
# -*- coding: utf-8 -*-
import pdb
import re
import telnetlib

import scrapy
from scrapy.linkextractors import LinkExtractor

from Iproxy.items import IproxyItem
from Iproxy.settings import USER_AGENT

# Captures the text between the first '>' and the next '<' of a serialized
# <td> element, i.e. the leading text of the cell.
_CELL_TEXT_RE = re.compile(r'>(.*?)<')

# Protocols this spider keeps, mapped to the URL prefix stored in the item.
_SCHEME_PREFIXES = {'HTTP': 'http://', 'HTTPS': 'https://'}


def _cell_text(row, column):
    """Return the leading text of the ``column``-th ``<td>`` of ``row``.

    Args:
        row: a selector positioned on a ``<tr>`` element.
        column: 1-based index of the ``<td>`` child to read.

    Returns:
        The captured text, or ``None`` when the cell is missing or the
        pattern does not match its markup.
    """
    cells = row.xpath('./td[%d]' % column).extract()
    if not cells:
        return None
    match = _CELL_TEXT_RE.search(cells[0])
    return match.group(1) if match else None


class IproxySpider(scrapy.Spider):
    """Scrape the proxy table at www.xicidaili.com/nn and yield reachable proxies.

    Each page is parsed for HTTP/HTTPS rows; every candidate is probed with a
    TCP connection and only proxies that accept the connection are yielded.
    Links inside ``div.pagination`` are followed with the same callback.
    """

    name = 'iproxy'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nn']

    # Not referenced by any method of this class; kept for code that reads it.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '11',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.xicidaili.com',
        'Origin': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/',
        'User-Agent': USER_AGENT,
        'X-Requested-With': 'XMLHttpRequest',
    }

    def telnet(self, item):
        """Check whether the proxy described by ``item`` accepts a connection.

        Args:
            item: mapping with the keys ``'origin_ip'`` and ``'port'``.

        Returns:
            ``True`` if a connection could be opened within 10 seconds,
            ``False`` on any failure.
        """
        try:
            connection = telnetlib.Telnet(
                item['origin_ip'], port=item['port'], timeout=10.0)
        except Exception:
            # Any failure (refused, timed out, unresolvable host, bad port,
            # missing key) means "unusable". ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            print('connect failure')
            return False
        # The probe only needs the connect to succeed; release the socket
        # right away instead of leaking one per checked proxy.
        connection.close()
        print('conncet success')
        return True

    def parse(self, response):
        """Yield one ``IproxyItem`` per reachable proxy, then follow pagination.

        Args:
            response: the downloaded listing page.

        Yields:
            ``IproxyItem`` objects with ``ip_name`` (scheme prefix + IP) and
            ``port`` set, followed by ``scrapy.Request`` objects for every
            link found inside ``div.pagination``.
        """
        for row in response.xpath('//tr[@class="odd"]'):
            # Column 6 holds the protocol; anything but HTTP/HTTPS is skipped.
            prefix = _SCHEME_PREFIXES.get(_cell_text(row, 6))
            if prefix is None:
                continue

            # Columns 2 and 3 hold the IP address and the port.
            ip = _cell_text(row, 2)
            port = _cell_text(row, 3)
            if not ip or not port:
                # Malformed row: skip it rather than abort the whole page.
                continue

            if not self.telnet({'origin_ip': ip, 'port': port}):
                continue

            # A fresh item per proxy: reusing one item across rows would
            # overwrite earlier results or re-emit stale ones.
            iplist = IproxyItem()
            iplist['ip_name'] = prefix + ip
            iplist['port'] = port
            print(iplist)
            yield iplist

        # Follow the pagination links with this same callback.
        links = LinkExtractor(restrict_css='div.pagination')
        for link in links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)