目录
Python网络爬虫之Scrapy框架(CrawlSpider)
CrawlSpider简介:
CrawlSpider其实是Spider的一个子类,除了继承到Spider的特性和功能外,还派生出了其自己独有的更加强大的特性和功能。
其中最显著的功能就是“LinkExtractors链接提取器”。Spider是所有爬虫的基类,其设计原则只是为了爬取start_urls列表中的网页;而如果需要从爬取到的网页中提取出url并继续进行爬取,则使用CrawlSpider更合适。
使用:
创建scrapy工程:scrapy startproject projectName
创建爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com
指令对比以前的指令多了 "-t crawl",表示创建的爬虫文件是基于CrawlSpider这个类的,而不再是Spider这个基类。
LinkExtractor:顾名思义,链接提取器:
LinkExtractor(
allow = r'Items/', # 满足括号中“正则表达式”的值会被提取,如果为空,则全部匹配。
deny = xxx, # 满足正则表达式的则不会被提取。
restrict_xpaths = xxx, # 满足xpath表达式的值会被提取
restrict_css = xxx, # 满足css表达式的值会被提取
deny_domains = xxx, # 不会被提取的链接的domains。
)
- 作用:提取response中符合规则的链接。
Rule:规则解析器。对链接提取器提取到的链接所对应的网页,按照指定规则(回调函数)解析其中的内容。
Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True)
参数介绍:
参数1:指定链接提取器
参数2:指定规则解析器解析数据的规则(回调函数)
参数3:是否将链接提取器继续作用到链接提取器提取出的链接网页中。当callback为None时,参数3的默认值为True;否则默认值为False。
(3). rules=( ):指定不同规则解析器。一个Rule对象表示一种提取规则。
(4). CrawlSpider整体爬取流程:
a)爬虫文件首先根据起始url,获取该url的网页内容
b)链接提取器会根据指定提取规则将步骤a中网页内容中的链接进行提取
c)规则解析器会根据指定解析规则将链接提取器中提取到的链接中的网页内容根据指定的规则进行解析
d)将解析数据封装到item中,然后提交给管道进行持久化存储
实战:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class CrawldemoSpider(CrawlSpider):
    """Crawl the picture section of qiushibaike.com page by page.

    Links extracted by the rules below are downloaded and each response
    is passed to ``parse_item``.
    """

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # Link extractors pull the URLs matching ``allow`` out of every
    # downloaded page, starting with the response for ``start_urls``.
    # Pagination links: the page number must be matched with ``\d+``
    # (a bare ``d+`` would only match the literal letter "d"). Per the
    # original note the links carry a random ``s=`` query value, so the
    # pattern stops at the page number.
    link = LinkExtractor(allow=r'/pic/page/\d+')
    # The first page, whose URL ends in /pic/.
    link1 = LinkExtractor(allow=r'/pic/$')

    # Each Rule pairs a link extractor with the callback used to parse
    # the pages behind the extracted links; follow=True applies the
    # extractors again to those pages.
    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Print the response received for a matched link."""
        print(response)
测试:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qiubaiBycrawl.items import QiubaibycrawlItem
import re
class QiubaitestSpider(CrawlSpider):
    """Collect the author and text of each post on the paginated listing."""

    name = 'qiubaiTest'
    # Start URL of the crawl.
    start_urls = ['http://www.qiushibaike.com/']

    # Link extractor for pagination links; ``\d+`` matches the page number
    # (a bare ``d+`` would only match the literal letter "d").
    page_link = LinkExtractor(allow=r'/8hr/page/\d+/')

    rules = (
        # Parse every extracted page with parse_item and keep following
        # pagination links found on those pages.
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one ``QiubaibycrawlItem`` per post block in ``response``.

        A missing author or content node yields an empty string instead
        of raising ``AttributeError`` on ``None.strip``.
        """
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            item = QiubaibycrawlItem()
            # Author of the post.
            author = div.xpath('./div/a[2]/h2/text()').extract_first(default='')
            item['author'] = author.strip('\n')
            # Text content of the post.
            content = div.xpath(
                './/div[@class="content"]/span/text()'
            ).extract_first(default='')
            item['content'] = content.strip('\n')
            # Hand the item over to the item pipeline.
            yield item
管道文件:
class QiubaibycrawlPipeline(object):
    """Item pipeline that appends every item to ``./data.txt``.

    The file is opened when the spider starts and closed when it ends.
    """

    def __init__(self):
        # File handle; opened in open_spider, closed in close_spider.
        self.fp = None

    def open_spider(self, spider):
        """Open the output file (truncating any previous content)."""
        print('开始爬虫')
        # Explicit UTF-8: the scraped text is Chinese and the platform
        # default encoding may not be able to represent it.
        self.fp = open('./data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``author:content`` on its own line and return the item.

        :param item: mapping with ``'author'`` and ``'content'`` string values
        :param spider: the spider that produced the item (unused)
        :return: the same ``item``, so later pipelines can process it
        """
        self.fp.write(item['author'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        print('结束爬虫')
        if self.fp is not None:
            self.fp.close()
spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bosspro.items import FirstItem, DetailItem
class BossSpider(CrawlSpider):
    """Scrape job titles from the listing pages and job descriptions
    from the detail pages.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/c101280600/?query=python%E5%BC%80%E5%8F%91&page=1&ka=page-1']

    # Pagination links; ``\d+`` matches the page number (a bare ``d+``
    # would only match the literal letter "d").
    link = LinkExtractor(allow=r'page=\d+')
    # Detail-page links, e.g.
    #   /job_detail/7f67d44f4e4a8fd11H1y2dq1GVc~.html
    #   /job_detail/aef99658c94e76da1H1509W9F1s~.html
    link_detail = LinkExtractor(allow=r'/job_detail/.*?html')

    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        """Yield one ``FirstItem`` with the job title per listing entry."""
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            item = FirstItem()
            # Fall back to '' so that downstream string concatenation does
            # not fail on None when the title node is missing.
            job_title = li.xpath(
                './/div[@class="job-title"]/text()'
            ).extract_first(default='')
            item['job_title'] = job_title
            yield item

    def parse_detail(self, response):
        """Yield a ``DetailItem`` holding the joined description text."""
        item = DetailItem()
        job_desc = response.xpath(
            '//div[@class="job-sec company-info"]//text()'
        ).extract()
        item['job_desc'] = "".join(job_desc)
        yield item
item:
import scrapy
class FirstItem(scrapy.Item):
    """Item produced from a listing page."""

    # Job title text taken from the listing entry.
    job_title = scrapy.Field()
class DetailItem(scrapy.Item):
    """Item produced from a job detail page."""

    # Job description text taken from the detail page.
    job_desc = scrapy.Field()
管道文件(pipelines):
class BossproPipeline(object):
    """Item pipeline writing job titles and job descriptions to two files.

    ``FirstItem`` objects go to ``title.txt`` (one title per line); every
    other item is treated as a ``DetailItem`` and its description is
    appended to ``desc.txt``.
    """

    # File handles; opened in open_spider, closed in close_spider.
    f1, f2 = None, None

    def open_spider(self, spider):
        """Open both output files (truncating any previous content)."""
        self.f1 = open('title.txt', 'w', encoding="utf-8")
        self.f2 = open('desc.txt', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Route one item to the file matching its class and return it.

        Each call receives exactly one item, of one specific item class.

        :param item: a ``FirstItem`` (``job_title``) or ``DetailItem`` (``job_desc``)
        :param spider: the spider that produced the item (unused)
        :return: the same ``item``, so later pipelines can process it
        """
        # Dispatch on the class name so this module does not have to
        # import the item classes.
        if item.__class__.__name__ == "FirstItem":
            job_title = item['job_title']
            self.f1.write(job_title + '\n')
        else:
            job_desc = item['job_desc']
            self.f2.write(job_desc)
        return item

    def close_spider(self, spider):
        """Close whichever output files were opened."""
        for handle in (self.f1, self.f2):
            if handle is not None:
                handle.close()
settings:
# Project name and the package in which spiders live / are generated.
BOT_NAME = 'bosspro'
NEWSPIDER_MODULE = 'bosspro.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# User-agent string sent with every request (a desktop Chrome signature).
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# Only log messages of level ERROR and above.
LOG_LEVEL = 'ERROR'

# Enabled item pipelines and their order value.
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'bosspro.pipelines.BossproPipeline': 300}