- CrawlSpider inherits from Spider. The Spider class is designed to crawl only the pages listed in start_urls, whereas CrawlSpider defines rules (Rule) that provide a convenient mechanism for following links: it extracts links from each crawled page and continues crawling from them.
- Creating the project differs from before: passing `-t crawl` to `genspider` uses the crawl template, which generates a CrawlSpider subclass instead of a plain Spider.

```bash
scrapy startproject ct
cd ct
scrapy genspider -t crawl chouti www.xxx.com
```
- A simple crawl of all of chouti.com's pagination URLs:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CtSpider(CrawlSpider):
    name = 'ct'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    # Link extractor:
    # allow: the rule (a regex) the extractor uses to pick out links
    link = LinkExtractor(allow=r'/all/hot/recent/\d+')

    rules = (
        # Rule parser: parses the page behind each extracted link
        # in the form specified by the callback
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
```
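Each followed page arrives in parse_item as an ordinary response, so extraction works the same as in any spider. As a minimal sketch, the variant below yields items instead of just printing the response; the spider name and the XPath selectors are hypothetical placeholders, not chouti.com's actual markup:

```python
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CtItemSpider(CrawlSpider):
    name = 'ct_items'  # hypothetical spider name
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    rules = (
        Rule(LinkExtractor(allow=r'/all/hot/recent/\d+'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Hypothetical selectors: placeholders, not chouti.com's real markup
        for entry in response.xpath('//div[@class="news-item"]'):
            yield {
                'title': entry.xpath('.//a/text()').extract_first(),
                'page': response.url,
            }
```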
- The same approach for Qiushibaike's picture section:
```python
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CtSpider(CrawlSpider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/pic/']

    # One extractor for the paginated URLs, one for the landing page itself
    link = LinkExtractor(allow=r'/pic/page/\d+\?s=\d+')
    link1 = LinkExtractor(allow=r'/pic/$')

    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
```
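Two rules are needed because the pagination links match `/pic/page/\d+\?s=\d+` while the section's landing page is just `/pic/` (the `$` anchor stops the second extractor from also matching every paginated URL). Both rules can safely share a callback: Scrapy's scheduler filters duplicate requests by default, so a link picked up by more than one extractor, or found on several pages, is still only crawled once.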