Simple use of the built-in proxy middleware
# -*- coding: utf-8 -*-
import scrapy
from wyb.items import WybItem
from scrapy.dupefilters import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse
from scrapy.http.cookies import CookieJar
from urllib.parse import urlencode
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Only crawl pages under this domain
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        import os
        # Proxy setup for the built-in HttpProxyMiddleware (scrapy.downloadermiddlewares.httpproxy)
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'
        # Option 1
        for url in self.start_urls:
            yield Request(url=url)
        # Option 2
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list

    def parse(self, response):
        """
        Handle the response to the first request.
        :param response:
        :return:
        """
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Keep requesting the pagination links, with parse as the callback again;
            # a proxy set in meta overrides the environment variables
            yield Request(url=page, callback=self.parse,
                          meta={"proxy": "http://root:woshinizuzong@192.168.10.10:8888/"})
Source code analysis
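The built-in middleware is scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware. Below is a condensed sketch of the Scrapy 1.x implementation, reproduced from memory rather than copied verbatim, so check the source of your installed version for the exact details:

import base64
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote, urlunparse
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        # Read HTTP_PROXY / HTTPS_PROXY / NO_PROXY from the environment once at startup,
        # which is why setting os.environ in start_requests works
        for type_, url in getproxies().items():
            self.proxies[type_] = self._get_proxy(url, type_)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        # Split "http://user:pass@host:port" into credentials and a bare proxy URL
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
        creds = self._basic_auth_header(user, password) if user else None
        return creds, proxy_url

    def process_request(self, request, spider):
        # A proxy set in request.meta always wins over the environment variables
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        # NO_PROXY is only honoured for http/https requests
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return
        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds

Reading this explains the behaviour used above: __init__ loads the proxies from the environment once via getproxies(), and process_request gives a proxy in request.meta priority over them.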
Summary:
For simple use, all you need is to set the os.environ process environment variables right when the requests start (for example in start_requests):
import os
# Proxy setup for the built-in HttpProxyMiddleware (scrapy.downloadermiddlewares.httpproxy)
os.environ['HTTP_PROXY'] = '1.1.1.2'
os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'


yield Request(url=page, callback=self.parse,
              meta={"proxy": "http://root:woshinizuzong@192.168.10.10:8888/"})  # a proxy in meta takes precedence over the environment variables
If you want to write your own downloader proxy middleware, these are the methods to define (they mirror the built-in HttpProxyMiddleware):
def __init__(self, auth_encoding='latin-1'):

@classmethod
def from_crawler(cls, crawler):

def _basic_auth_header(self, username, password):

def _get_proxy(self, url, orig_type):

def process_request(self, request, spider):

def _set_proxy(self, request, scheme):
Custom downloader proxy middleware, approach one
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import base64
import random
from six.moves.urllib.parse import unquote, urlunparse

try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes


class WybProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass)

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshinizuzong@192.168.10.10:8888/",
            "http://root:woshinizuzong@192.168.10.11:8888/",
            "http://root:woshinizuzong@192.168.10.12:8888/",
            "http://root:woshinizuzong@192.168.10.13:8888/",
            "http://root:woshinizuzong@192.168.10.14:8888/",
            "http://root:woshinizuzong@192.168.10.15:8888/",
        ]
        # url = "http://root:woshinizuzong@192.168.10.10:8888/"
        url = random.choice(PROXIES)
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
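To use this class it still has to be registered in DOWNLOADER_MIDDLEWARES (see the settings section below); since it sets request.meta['proxy'] itself, you may also want to disable the built-in HttpProxyMiddleware so the two do not interfere with each other.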
Custom downloader proxy middleware, approach two; once you have read the source, writing your own is easy.
import base64
import random

from scrapy.utils.python import to_bytes


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # user_pass is expected to be a 'user:password' pair for HTTP Basic auth
        PROXIES = [
            {'ip_port': '220.230.240.200:80', 'user_pass': 'woshinizuzong'},
            {'ip_port': '220.230.240.201:80', 'user_pass': 'woshinidie'},
            {'ip_port': '220.230.240.202:8888', 'user_pass': 'woshiniye'},
            {'ip_port': '220.230.240.203:80', 'user_pass': 'caonidaba'},
            {'ip_port': '220.230.240.204:80', 'user_pass': 'jiaowolaoba'},
            {'ip_port': '220.230.240.205:8888', 'user_pass': 'shuowodiaoda'},
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass'] is not None:
            # Base64-encode the credentials for the Proxy-Authorization header
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
In the settings file
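The middleware has to be enabled in the project's settings.py. A minimal sketch; the module path wyb.middlewares is an assumption about where the class above was saved, so adjust it to your project:

DOWNLOADER_MIDDLEWARES = {
    # hypothetical module path; point this at wherever WybProxyMiddleware actually lives
    'wyb.middlewares.WybProxyMiddleware': 750,
    # optionally disable the built-in proxy middleware so the two do not both set meta['proxy']
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}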
Out of curiosity, let's also look at Scrapy's own default settings.
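The relevant entries live in scrapy/settings/default_settings.py. Quoted from memory of Scrapy 1.x, so the exact values may differ in your version; the point is that HttpProxyMiddleware is enabled by default and sits at priority 750 among the downloader middlewares:

DOWNLOADER_MIDDLEWARES_BASE = {
    # ...
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    # ...
}

HTTPPROXY_ENABLED = True
HTTPPROXY_AUTH_ENCODING = 'latin-1'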