- Full-site data crawling

1. Manual request sending:
    - yield scrapy.Request(url, callback): the callback parameter names the function that will parse the response
Example 1: crawling multi-page data from 阳光网 (wz.sun0769.com)
1) Spider file: sun.py
# -*- coding: utf-8 -*-
import scrapy
from sunLinePro.items import SunlineproItem


class SunSpider(scrapy.Spider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # generic URL template (do not modify)
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
    page = 1

    def parse(self, response):
        print('---------------------------page=', self.page)
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            item = SunlineproItem()
            item['title'] = title
            item['status'] = status
            yield item
        if self.page < 5:
            count = self.page * 30
            new_url = self.url % count
            self.page += 1
            # manually send a request to the next page's URL
            yield scrapy.Request(url=new_url, callback=self.parse)
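An alternative way to chain the page requests is response.follow (available since Scrapy 1.4), which builds the Request object and resolves relative URLs for you. A minimal sketch under the same pagination scheme; the spider name sunFollow is hypothetical and the item extraction is elided:

import scrapy


class SunFollowSpider(scrapy.Spider):
    # hypothetical spider; pagination mirrors SunSpider above
    name = 'sunFollow'
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
    page = 1

    def parse(self, response):
        # ... extract and yield items exactly as in SunSpider ...
        if self.page < 5:
            count = self.page * 30
            self.page += 1
            # response.follow constructs the Request for us
            yield response.follow(self.url % count, callback=self.parse)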
2) Settings file: settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'sunLinePro'

SPIDER_MODULES = ['sunLinePro.spiders']
NEWSPIDER_MODULE = 'sunLinePro.spiders'

# identify as a regular browser instead of the default Scrapy UA
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'  # only show error-level log output

ITEM_PIPELINES = {
    'sunLinePro.pipelines.SunlineproPipeline': 300,
}
3) Items file: items.py
# -*- coding: utf-8 -*-
import scrapy


class SunlineproItem(scrapy.Item):
    title = scrapy.Field()
    status = scrapy.Field()
4) Pipelines file: pipelines.py
# -*- coding: utf-8 -*-
class SunlineproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
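Printing items is only useful for debugging; a real pipeline usually persists them. A minimal sketch of a file-writing pipeline using the open_spider/close_spider hooks (the class name and the sun.txt file name are assumptions, not part of the project above):

class SunlineproFilePipeline(object):
    # hypothetical pipeline: writes each item to ./sun.txt
    fp = None

    def open_spider(self, spider):
        # called once, when the spider starts
        self.fp = open('./sun.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write('%s: %s\n' % (item['title'], item['status']))
        return item  # hand the item to the next pipeline, if any

    def close_spider(self, spider):
        # called once, when the spider finishes
        self.fp.close()

To enable it, register the class in ITEM_PIPELINES in settings.py alongside (or instead of) SunlineproPipeline.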
2. Sending POST requests and handling cookies in Scrapy:
- Sending POST requests:
    - override the parent class's start_requests(self) method
    - inside that method, simply yield scrapy.FormRequest(url, callback, formdata)
- Cookie handling: by default, Scrapy handles cookies automatically (see the settings sketch below)
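Because cookie handling is enabled by default, nothing needs to be configured for a normal session; the settings below are only relevant for turning it off or debugging it. A small sketch of the cookie-related options in settings.py (COOKIES_ENABLED is shown at its default; COOKIES_DEBUG is off by default):

# settings.py: cookie-related options
COOKIES_ENABLED = True   # default; Scrapy keeps session cookies across requests
COOKIES_DEBUG = True     # log every Cookie / Set-Cookie header for debugging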
POST request example
# -*- coding: utf-8 -*-
import scrapy


class PostdemoSpider(scrapy.Spider):
    name = 'postDemo'
    # allowed_domains = ['www.xxx.com']
    # https://fanyi.baidu.com/sug
    start_urls = ['https://fanyi.baidu.com/sug']

    # Parent-class method: sends a GET request for each element of start_urls
    # (Scrapy issues GET requests by default).
    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url=url, callback=self.parse)

    # Override: send POST requests instead
    def start_requests(self):
        for url in self.start_urls:
            data = {
                'kw': 'cat'
            }
            # manual POST requests are sent with FormRequest
            yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)

    def parse(self, response):
        print(response.text)
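Since the sug interface actually returns JSON, parse would normally decode the body instead of printing raw text. A hedged sketch (the spider name postJson and the use of the standard json module are assumptions):

# -*- coding: utf-8 -*-
import json

import scrapy


class PostJsonSpider(scrapy.Spider):
    # hypothetical variant of PostdemoSpider that decodes the JSON body
    name = 'postJson'
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, callback=self.parse,
                                     formdata={'kw': 'cat'})

    def parse(self, response):
        result = json.loads(response.text)  # the endpoint responds with JSON
        print(result)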