  • Middleware and request parameter passing in Scrapy

    I. Middleware

    Scrapy's downloader middleware sits between the downloader and the Scrapy engine; its main job is to intercept requests on their way out and responses on their way back.

    Below is a walkthrough of the middleware's methods.

import random


class MiddleproDownloaderMiddleware(object):
    # Pool of User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Candidate proxy IPs
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # Intercepts every request that has not raised an exception
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # UA spoofing: pick a random User-Agent from the pool for this request
        print('this is process_request')
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        print(request.headers['User-Agent'])

        # # Proxy pool: assign a proxy IP to the request
        # if request.url.split(':')[0] == 'http':
        #     request.meta['proxy'] = random.choice(self.PROXY_http)
        # else:
        #     request.meta['proxy'] = random.choice(self.PROXY_https)
        return None

    # Intercepts every response
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercepts requests whose download raised an exception
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain

        # Assign a proxy IP from the pool to the failed request
        print('this is process_exception!')
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_https)
        return request  # return the request so it is re-scheduled and retried with the new proxy
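    The middleware only takes effect after it is enabled in the project's settings.py. Below is a minimal sketch of that setting; the module path middlePro.middlewares and the priority value 543 are assumptions, adjust them to your own project:

# settings.py -- enable the custom downloader middleware
# NOTE: the module path and priority value below are assumed, not taken from the original project
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}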

    II. Passing parameters between requests

    We need this when the data we want is spread across more than one page, for example a news site: we scrape the title on the list page, then follow the link into the detail page to scrape the detail information.

    The spider file:

import scrapy
from middlePro.items import MiddleItem  # import path is assumed; adjust to your project's items module


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']

    def parse_detail(self, response):
        # Retrieve the item that was attached to the request in parse() via meta
        item = response.meta["item"]
        tag_list = []
        a_list = response.xpath('/html/body/div[2]/div[2]/div[2]/div[1]/a')
        for a in a_list:
            tag = a.xpath('./text()').extract_first()
            tag_list.append(tag)
        item['tags'] = tag_list  # assumes the item declares a 'tags' field for the detail-page data
        yield item

    def parse(self, response):
        li_list = response.xpath('//div[@class="today_news"]/ul/li')
        for li in li_list:
            item = MiddleItem()
            title = li.xpath('./a/text()').extract_first()
            item['title'] = title
            print(item)
            detail_url = li.xpath('./a/@href').extract_first() + '#p=E9DPI10E4T8E0001NOS'
            print(detail_url)
            # Pass the half-filled item along to the detail-page callback through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})

    After extracting the title from each list entry, the spider sends a second request for the detail page. We write a separate parse_detail callback to extract the detail-page information, and pass the item along as a parameter through the request's meta dict so both callbacks fill the same item.
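    Since the same item object is filled in by both callbacks, every field written in parse or parse_detail has to be declared on the item class. A minimal sketch of what the items.py for this spider could look like (the tags field is an assumption for holding the detail-page data; only title appears in the original code):

# items.py -- minimal sketch of the item used by the spider above
import scrapy


class MiddleItem(scrapy.Item):
    title = scrapy.Field()  # filled in parse() from the list page
    tags = scrapy.Field()   # assumed field, filled in parse_detail() from the detail page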
