class scrapy.http.Request(url[, callback, method="GET", headers, body, cookies, meta, encoding='utf-8', priority=0, dont_filter=False, errback])
参数详解:
- url : 目标请求地址
- callback : 指定处理该请求响应的回调函数, 默认为 spider 的 parse 方法
- method : 指定http方法, 默认为GET
- meta : request.meta 可以传一些键值对
- body : 请求正文, 二进制内容
- headers : http请求头
- cookies: 附带在请求中要一起发出的cookies对象
- encoding : 当前请求的编码方式, 默认为'utf-8'
- priority : 设置请求的优先级, 默认为0, 这个优先级是scheduler在线程中用于定义处理请求的顺序
- dont_filter : 默认为False, 设置为True则不过滤请求
- errback: 当请求发生任何异常时就会调用此回调函数
import scrapy
from scrapy.linkextractors import LinkExtractor


class DeepInSpider(scrapy.Spider):
    """Spider that extracts every link from the start page and follows each one.

    Links whose URL contains 'detail' are routed to :meth:`parse_detail`;
    all other links are requested with no callback (Scrapy then falls back
    to the default ``parse``).
    """

    name = 'example.com'
    start_urls = ['https://www.baidu.com']

    def parse(self, response):
        """Extract links from *response* and yield one Request per unique link.

        Fixes over the original draft:
        - ``senn`` was an undefined name (typo for ``seen``).
        - ``link.contains('detail)`` had an unterminated string and ``Link``
          objects have no ``contains`` method; replaced with a substring
          test on ``link.url``.
        - ``scrapy.Request(url=link, ...)`` passed a ``Link`` object where a
          URL string is required, and the same request was yielded twice;
          a single request with ``link.url`` is yielded instead.
        """
        link_extractor = LinkExtractor()
        seen = set()  # de-duplicate links within this response
        extracted = link_extractor.extract_links(response)
        links = [link for link in extracted if link not in seen]
        for link in links:
            print(link.url)
            seen.add(link)
            callback = None  # None -> Scrapy uses the default parse callback
            if 'detail' in link.url:
                callback = self.parse_detail
            yield scrapy.Request(url=link.url, callback=callback)

    def parse_detail(self, response):
        """Placeholder callback for detail pages; intentionally does nothing yet."""
        pass