  • Scrapy crawler framework (2): POST requests, passing items via meta, speeding up the crawler, and UA/proxy pools

    Sending a POST request with Scrapy

    import scrapy

    class PostSpider(scrapy.Spider):
        name = 'post'

        start_urls = ['https://fanyi.baidu.com/sug']

        # By default, start_requests() sends a GET request for every url in
        # start_urls. Override it so the spider sends POST requests instead.
        def start_requests(self):
            data = {
                'kw': 'dog'
            }
            for url in self.start_urls:
                # FormRequest issues a POST; formdata becomes the request body
                yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)

        def parse(self, response):
            print(response.text)
    

    Key point:
    Override the parent class's start_requests() method. The default implementation yields scrapy.Request(url=url, callback=self.parse), which sends a GET request (note that scrapy.Request takes no formdata argument).
    Rewrite it as yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse), where formdata carries the POST parameters.
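
    The sug endpoint replies with JSON, so parse() can decode the body instead of dumping raw text. A minimal sketch (the exact shape of the JSON is an assumption, not verified against the live API):

    import json

    def parse(self, response):
        # the endpoint returns a JSON body (structure assumed, not verified)
        result = json.loads(response.text)
        print(result)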

    Passing an item between callbacks: add a meta argument to the request

    def parse(self, response):
        div_list = response.xpath('//div[@class="col-xs-1-5 movie-item"]')

        for div in div_list:
            item = MovieproItem()
            item['title'] = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            item['score'] = div.xpath('.//div[@class="meta"]/h1/em/text()').extract_first()

            detail_url = 'https:' + div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()

            # hand the half-filled item to the next callback through meta
            yield scrapy.Request(url=detail_url, callback=self.getdata, meta={'item': item})

    def getdata(self, response):
        # retrieve the item created in parse()
        item = response.meta['item']
        item["deactor"] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        item["desc"] = response.xpath('//div[@class="col-xs-12 movie-introduce"]/p/text()').extract_first()

        yield item
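
    The snippet above assumes an item class exposing the four fields it fills in. A minimal sketch of items.py (field names are taken from the code; the class layout itself is assumed):

    import scrapy

    class MovieproItem(scrapy.Item):
        title = scrapy.Field()
        score = scrapy.Field()
        deactor = scrapy.Field()   # field name kept as in the original code
        desc = scrapy.Field()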
    

    To speed up the crawler, add the following to settings.py:

    CONCURRENT_REQUESTS = 10   # number of concurrent requests handled by the downloader
    LOG_LEVEL = 'ERROR'        # only log errors, skipping verbose INFO/DEBUG output
    COOKIES_ENABLED = False    # don't track cookies when the site doesn't need them
    RETRY_ENABLED = False      # don't retry failed requests
    DOWNLOAD_TIMEOUT = 5       # give up on a download after 5 seconds
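
    If these options should apply to a single spider only, Scrapy also accepts them per spider through the custom_settings class attribute; a minimal sketch:

    import scrapy

    class PostSpider(scrapy.Spider):
        name = 'post'
        # per-spider overrides of the project-wide values shown above
        custom_settings = {
            'CONCURRENT_REQUESTS': 10,
            'LOG_LEVEL': 'ERROR',
            'RETRY_ENABLED': False,
        }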
    

    UA pool and proxy pool

    Configure the downloader middleware file (middlewares.py) as follows. Proxies for the pool can be collected from www.goubanjia.com; mind the difference between HTTP and HTTPS proxies.

    import random

    class ProxyproDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        proxy_http = ['http://39.137.168.229:8080', 'http://103.218.240.182:80', 'http://80.26.152.146:60133']
        proxy_https = ['https://221.6.201.18:9999', 'https://220.180.50.14:53281', 'https://140.227.200.38:3128']
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
    
        def process_request(self, request, spider):
            print('downloader middleware handling:', request)

            # choose a proxy whose scheme matches the request's scheme
            if request.url.split(':')[0] == 'http':
                request.meta['proxy'] = random.choice(self.proxy_http)
            else:
                request.meta['proxy'] = random.choice(self.proxy_https)

            # rotate the User-Agent on every outgoing request
            request.headers['User-Agent'] = random.choice(self.user_agent_list)
            return None

        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
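
    For the middleware to take effect it must be enabled in settings.py. A minimal sketch (the module path proxyPro.middlewares is an assumption; substitute your own project's path):

    DOWNLOADER_MIDDLEWARES = {
        # 543 is the priority Scrapy's project template uses for custom middleware
        'proxyPro.middlewares.ProxyproDownloaderMiddleware': 543,
    }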
    

      

      
