zoukankan      html  css  js  c++  java
  • scrapy 中间件

    scrapy 中间件

    中间件流程:

    class WxappSpiderMiddleware(object):
        """Spider middleware skeleton that forwards everything unchanged."""

        @classmethod
        def from_crawler(cls, crawler):
            """Create the middleware and hook it to the spider_opened signal."""
            middleware = cls()
            crawler.signals.connect(
                middleware.spider_opened,
                signal=signals.spider_opened,
            )
            return middleware

        def process_spider_input(self, response, spider):
            """Hook run before the spider handles a response; returns None."""
            return None

        def process_spider_output(self, response, result, spider):
            """Hook run on what the spider produced; re-yields every entry as-is."""
            yield from result

        def process_spider_exception(self, response, exception, spider):
            """Exception hook; takes no action and returns None."""
            return None

        def process_start_requests(self, start_requests, spider):
            """Hook run on the start requests; re-yields each one unchanged."""
            yield from start_requests

        def spider_opened(self, spider):
            """Handler connected to spider_opened: logs the spider's name."""
            message = 'Spider opened: %s' % spider.name
            spider.logger.info(message)
    

    随机请求头:

    
    http://httpbin.org/user-agent    查看自己的user-agent
    
            import random
    # 请求头--》 所有(http://useragentstring.com/pages/useragentstring.php?name=Chrome)
    class UserAgentDownloaderMiddleware(object):
        """Downloader middleware that sets a randomly chosen User-Agent per request."""

        # Pool of Chrome User-Agent strings. Every entry is listed exactly once so
        # that random.choice() picks each string with equal probability (the
        # Chrome/44 entry used to be repeated three times, skewing the selection).
        USER_AGENTS = [
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        ]

        def process_request(self, request, spider):
            """Overwrite the request's User-Agent header with a random pool entry.

            Args:
                request: object exposing a mutable ``headers`` mapping.
                spider: the spider issuing the request (unused here).

            Returns:
                None.
            """
            request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
            
     
    settings 配置:
    # Register the random User-Agent downloader middleware with the value 543.
    DOWNLOADER_MIDDLEWARES = {"wxapp.middlewares.UserAgentDownloaderMiddleware": 543}
    
    
    spider 配置:
    
    def parse(self, response):
        """Log the User-Agent reported in the response, then request the start URL again.

        The response body is decoded as JSON and its 'user-agent' field is read,
        so the body is expected to be a JSON object carrying that key.

        Args:
            response: response object exposing the body as ``response.text``.

        Yields:
            A new ``scrapy.Request`` for the first start URL.
        """
        user_agent = json.loads(response.text)['user-agent']
        # Surface the value so the effect of the random User-Agent can be observed.
        self.logger.info('user-agent: %s', user_agent)
        # Request the same URL again; dont_filter=True switches off duplicate
        # filtering so the repeated request is not dropped.
        # NOTE(review): reads the standard Scrapy `start_urls` attribute (the
        # original `start_url` looked like a typo) — confirm against the spider class.
        yield scrapy.Request(self.start_urls[0], dont_filter=True)
                             
    

    ip 代理中间件:

    出现验证码: 1. 识别   2. 更换代理
    
    代理服务商:
    	快代理
       
    httpbin.org/ip      打印当前代理ip
    
    
    class IPDownloaderMiddleware(object):
        """Downloader middleware that sends each request through a random proxy."""

        # Candidate proxies (pick high-anonymity, HTTPS-capable, stable ones).
        # Empty strings are unfilled placeholders and are never selected.
        PROXIES = [
            "ip:port",
            "",
            "",
            "",
            "",
        ]

        def process_request(self, request, spider):
            """Set ``request.meta['proxy']`` to a random non-blank PROXIES entry.

            Blank entries are skipped so an empty proxy is never applied. When no
            usable entry exists the request is left untouched.

            Args:
                request: object exposing a mutable ``meta`` mapping.
                spider: the spider issuing the request (unused here).

            Returns:
                None.
            """
            candidates = [proxy for proxy in self.PROXIES if proxy and proxy.strip()]
            if candidates:
                request.meta['proxy'] = random.choice(candidates)
            return None
            
            
     settings 配置:
    
    # Register the proxy downloader middleware with the value 543.
    DOWNLOADER_MIDDLEWARES = {"wxapp.middlewares.IPDownloaderMiddleware": 543}
    

    ip独享模式:

    import base64
    class IPDownloaderMiddleware(object):
        """Downloader middleware for a dedicated proxy that requires credentials."""

        def process_request(self, request, spider):
            """Route the request through the proxy and attach Basic credentials.

            Args:
                request: object exposing mutable ``meta`` and ``headers`` mappings.
                spider: the spider issuing the request (unused here).

            Returns:
                None.
            """
            proxy = "ip:port"
            user_password = "name:password"
            b64_user = base64.b64encode(user_password.encode('utf-8'))
            request.meta['proxy'] = proxy
            # The Basic scheme is "Basic <base64 token>": the space after the
            # scheme name is mandatory, otherwise the credential is malformed.
            request.headers['Proxy-Authorization'] = 'Basic ' + b64_user.decode('utf-8')
    

    注意:

    content 用extract()
    # Join the pieces in `content` into one string and strip surrounding whitespace
    # (presumably `content` is the list returned by extract() — confirm at the call site).
    content  = "".join(content).strip() 
    
    
  • 相关阅读:
    PHP里文件的查找方式及写法
    上传文件
    用ajax对数据进行查看人员信息
    ajax实现分页
    jquery做个日期选择适用于手机端
    PHP用ajia代码写三级联动下拉
    JSON数据的定义
    jquery做一些小的特效
    对jquery操作复选框
    使用 LUT 模拟光照
  • 原文地址:https://www.cnblogs.com/shaozheng/p/12776122.html
Copyright © 2011-2022 走看看