  • Crawler downloader middleware

    # Set a random request header and a proxy IP
    # Write a class in the project's middlewares.py file
    
    import random

    class MiddlewearproDownloaderMiddleware(object):
    
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        PROXY_http = [
            '153.180.102.104:80',
            '195.208.131.189:56055',
        ]
        PROXY_https = [
            '120.83.49.90:9000',
            '95.189.112.214:35508',
        ]
        
        # Intercept every normal request object
        def process_request(self, request, spider):
            # Pick a random User-Agent from the list above
            user_agent = random.choice(self.user_agent_list)
            request.headers['User-Agent'] = user_agent
    
        # Intercept every response object
        def process_response(self, request, response, spider):
    
            return response
    
        # Intercept request objects whose download raised an exception
        def process_exception(self, request, exception, spider):
            # Assign a random proxy IP matching the request's scheme
            if request.url.split(':')[0] == 'http':
                request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
            else:
                request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
            # Return the request so Scrapy reschedules it through the new proxy;
            # without this the proxy assignment is never actually used
            return request
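
    # Note: with the class above, the proxy only kicks in after a request has
    # already failed once, because it is assigned in process_exception.
    # A minimal variant (a sketch of mine, not from the original post) that
    # sends every request through a proxy assigns it in process_request
    # instead; it reuses the proxy lists and the `import random` from above:

    class ProxyEveryRequestMiddleware(object):   # hypothetical class name
        PROXY_http = ['153.180.102.104:80', '195.208.131.189:56055']
        PROXY_https = ['120.83.49.90:9000', '95.189.112.214:35508']

        def process_request(self, request, spider):
            # Choose the proxy pool that matches the request's scheme
            if request.url.split(':')[0] == 'http':
                request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
            else:
                request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)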
                
                
    # Enable the middleware in settings.py; if you renamed the class, only the
    # class name here needs to change
    DOWNLOADER_MIDDLEWARES = {
       'UA_demo.middlewares.MiddlewearproDownloaderMiddleware': 543,
    }
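
    # Optional, and my assumption rather than part of the original post:
    # Scrapy's built-in UserAgentMiddleware runs at priority 500, before this
    # one at 543, so the custom header wins either way; disabling the built-in
    # middleware just makes that explicit:
    DOWNLOADER_MIDDLEWARES = {
       'UA_demo.middlewares.MiddlewearproDownloaderMiddleware': 543,
       'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }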
    
    
    # Spider file: print the request header to verify the random UA
    # -*- coding: utf-8 -*-
    import scrapy
    import json
    
    class UaSpider(scrapy.Spider):
        name = 'httpbin'
        allowed_domains = ['httpbin.org']
        start_urls = ['http://httpbin.org/user-agent']
    
        def parse(self, response):
            user_agent = json.loads(response.text)["user-agent"]
            print(user_agent)
            # Scrapy deduplicates requests by default; without dont_filter=True
            # the second request to 'http://httpbin.org/user-agent' would be
            # filtered out and parse would run only once
            yield scrapy.Request(self.start_urls[0], dont_filter=True)
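
    # To try it without the `scrapy crawl httpbin` command line, a minimal
    # sketch that runs the spider programmatically (it assumes the standard
    # Scrapy project layout so get_project_settings() can locate settings.py):
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('httpbin')  # the spider name from UaSpider.name
    process.start()           # blocks until the crawl finishes; each parse()
                              # call should print a different User-Agent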
  • Original post: https://www.cnblogs.com/kenD/p/11123707.html