  • Scrapy downloader middleware: the proxy middleware HttpProxyMiddleware

    Simple use of the built-in proxy middleware

# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Restrict the crawl to this domain only
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        # Proxy setup, picked up by HttpProxyMiddleware among the downloader middlewares
        # (note: a proxy URL normally carries a scheme, e.g. 'http://1.1.1.2')
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'
        # Option 1: yield the requests one by one
        for url in self.start_urls:
            yield Request(url=url)
        # Option 2: return a list of requests
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list

    def parse(self, response):
        """
        Response to the initial request
        :param response:
        :return:
        """
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Keep requesting pages, with parse as the callback;
            # a proxy set in meta overrides the environment variables
            yield Request(url=page, callback=self.parse,
                          meta={"proxy": "http://root:woshinizuzong@192.168.10.10:8888/"})

     Source code analysis

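     The built-in middleware lives in scrapy/downloadermiddlewares/httpproxy.py. Abridged and lightly commented, it looks roughly like this (a sketch; details vary between Scrapy versions):

import base64
from urllib.parse import unquote, urlunparse
from urllib.request import getproxies, _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        # Read HTTP_PROXY / HTTPS_PROXY etc. from the environment once, at startup
        self.proxies = {}
        for type_, url in getproxies().items():
            self.proxies[type_] = self._get_proxy(url, type_)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        # Split 'http://user:pass@host:port' into credentials and a bare proxy URL
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
        creds = self._basic_auth_header(user, password) if user else None
        return creds, proxy_url

    def process_request(self, request, spider):
        # A proxy already set in request.meta wins over the environment
        if 'proxy' in request.meta:
            return
        scheme = urlparse_cached(request).scheme
        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds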

     Summary:

      For simple use, it is enough to set the os.environ process environment variables when the requests start

        import os
        # Proxy setup, handled by HttpProxyMiddleware among the downloader middlewares
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'


     yield Request(url=page, callback=self.parse, meta={"proxy": "http://root:woshinizuzong@192.168.10.10:8888/"})   # meta takes priority over environ
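
      The middleware discovers those variables through the standard library's urllib.request.getproxies(). A quick way to check what it will pick up (a minimal sketch, not from the original post; output is platform-dependent):

import os
from urllib.request import getproxies

os.environ['HTTP_PROXY'] = '1.1.1.2'
os.environ['HTTPS_PROXY'] = 'http://root:woshinizuzong@192.168.10.10:8888/'

# getproxies() is what HttpProxyMiddleware calls in __init__;
# its keys become the schemes the middleware knows how to proxy
print(getproxies())
# e.g. {'http': '1.1.1.2', 'https': 'http://root:woshinizuzong@192.168.10.10:8888/'}

      One caveat: in the source sketched above the environment is snapshotted in __init__, so in some Scrapy versions the variables must already be set by the time the crawler builds its middlewares (e.g. at module import time), not only inside start_requests.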

       To write a custom proxy downloader middleware, these are the methods to define

def __init__(self, auth_encoding='latin-1'):

@classmethod
def from_crawler(cls, crawler):

def _basic_auth_header(self, username, password):

def _get_proxy(self, url, orig_type):

def process_request(self, request, spider):

def _set_proxy(self, request, scheme):
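
     Of these, only process_request is strictly required; from_crawler is the hook for reading configuration and for disabling the middleware via NotConfigured. A minimal sketch of that pattern (not from the original post; the PROXY_LIST setting name is made up for illustration):

import random

from scrapy.exceptions import NotConfigured


class SettingsProxyMiddleware(object):
    """Minimal sketch: pull proxies from a custom (hypothetical) PROXY_LIST setting."""

    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        proxies = crawler.settings.getlist('PROXY_LIST')  # hypothetical setting name
        if not proxies:
            raise NotConfigured  # turn the middleware off if nothing is configured
        return cls(proxies)

    def process_request(self, request, spider):
        # Leave requests alone if a proxy was already chosen for them
        if 'proxy' not in request.meta:
            request.meta['proxy'] = random.choice(self.proxies)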

    Custom proxy downloader middleware, approach one

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import base64
import random
from six.moves.urllib.parse import unquote, urlunparse

try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes


class WybProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass)

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:woshinizuzong@192.168.10.10:8888/",
            "http://root:woshinizuzong@192.168.10.11:8888/",
            "http://root:woshinizuzong@192.168.10.12:8888/",
            "http://root:woshinizuzong@192.168.10.13:8888/",
            "http://root:woshinizuzong@192.168.10.14:8888/",
            "http://root:woshinizuzong@192.168.10.15:8888/",
        ]
        # url = "http://root:woshinizuzong@192.168.10.10:8888/"
        url = random.choice(PROXIES)
        # Split credentials out of the proxy URL, just as the built-in middleware does
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds

    Custom proxy downloader middleware, approach two — once you have read the source, rolling your own is easy

import base64
import random

from scrapy.utils.python import to_bytes


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '220.230.240.200:80', 'user_pass': 'woshinizuzong'},
            {'ip_port': '220.230.240.201:80', 'user_pass': 'woshinidie'},
            {'ip_port': '220.230.240.202:8888', 'user_pass': 'woshiniye'},
            {'ip_port': '220.230.240.203:80', 'user_pass': 'caonidaba'},
            {'ip_port': '220.230.240.204:80', 'user_pass': 'jiaowolaoba'},
            {'ip_port': '220.230.240.205:8888', 'user_pass': 'shuowodiaoda'},
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass'] is not None:
            # base64-encode the credentials for the Proxy-Authorization header
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass

     In the settings file

     
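     Registering the custom middleware in settings.py would look roughly like this (a sketch; the module path wyb.middlewares is assumed from the project name in the post):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Enable the custom middleware (module path assumed, adjust to your project)
    'wyb.middlewares.WybProxyMiddleware': 543,
    # Optionally disable the built-in one so the two don't fight over request.meta['proxy']
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}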

     Out of curiosity, a look at Scrapy's own default settings
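
     In scrapy/settings/default_settings.py the built-in proxy middleware is registered in DOWNLOADER_MIDDLEWARES_BASE; excerpted (priorities may differ slightly between versions):

# scrapy/settings/default_settings.py (excerpt)
DOWNLOADER_MIDDLEWARES_BASE = {
    # ...
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # ...
}

HTTPPROXY_ENABLED = True
HTTPPROXY_AUTH_ENCODING = 'latin-1'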

  • Original post: https://www.cnblogs.com/Alexephor/p/11440483.html