Introduction to CrawlSpider
CrawlSpider is a subclass of Spider, so it inherits all of Spider's methods and adds its own, which makes it more efficient and concise. Its most notable feature is the LinkExtractor (link extractor). Spider is the base class of all spiders and is designed only to crawl the pages listed in start_urls, whereas CrawlSpider is better suited to extracting URLs from a page and continuing to crawl them.
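To make the contrast concrete, here is a minimal sketch of how pagination is usually followed with a plain Spider: every follow-up request has to be built by hand. The spider name and XPath below are placeholders, not part of this post's project.

import scrapy

class ManualPagingSpider(scrapy.Spider):
    # Hypothetical plain-Spider version: follow-up requests must be yielded manually.
    name = 'manual_paging_demo'
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        # ... extract the data you need from the current page here ...

        # Find the "next page" link and request it by hand; this is exactly
        # the step that CrawlSpider's LinkExtractor + Rule automate.
        next_page = response.xpath('//a[@class="next"]/@href').get()  # placeholder XPath
        if next_page:
            yield response.follow(next_page, callback=self.parse)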
Using CrawlSpider
1. Create a Scrapy project:
scrapy startproject projectName
2. Create the spider file:
scrapy genspider -t crawl SpiderName www.xxx.com
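For the project built in the rest of this post, the invocation could look something like the line below (a hypothetical concrete example; the exact module and class names Scrapy generates depend on how it sanitizes the spider name):

scrapy genspider -t crawl 4567Tv www.xxx.com

Running it generates a spider template along these lines: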
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class A4567tvSpider(CrawlSpider):
    name = '4567Tv'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
LinkExtractor (link extractor): extracts links from a page according to a specified rule (a regular expression).
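To get a feel for what a pattern matches, a LinkExtractor can also be used on its own, for example inside scrapy shell, where response is already defined. A small sketch (the pattern is just the pagination regex used later in this post; extract_links returns Link objects with url and text attributes):

from scrapy.linkextractors import LinkExtractor

# Run inside `scrapy shell <page url>`; an empty allow pattern would match every link.
extractor = LinkExtractor(allow=r'/frim/index1-\d+\.html')
for link in extractor.extract_links(response):
    print(link.url, link.text)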
Rule (rule parser): sends requests for the links produced by the link extractor, then parses the returned page data according to the specified rule (the callback).
Each link extractor is paired with exactly one rule parser.
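Putting the two together, this is roughly the CrawlSpider counterpart of the manual pagination sketch above. The spider name, URL patterns, and callback are placeholders; note how follow controls whether links are also extracted from the newly fetched pages (when a callback is given, follow defaults to False):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class RulePagingSpider(CrawlSpider):
    # Hypothetical demo spider illustrating the LinkExtractor + Rule pairing.
    name = 'rule_paging_demo'
    start_urls = ['http://www.xxx.com/']

    rules = (
        # follow=True: links matched on the newly fetched pages are extracted again,
        # so the whole pagination chain gets crawled.
        Rule(LinkExtractor(allow=r'/page/\d+\.html'), callback='parse_page', follow=True),
        # With a callback and no follow argument, follow defaults to False:
        # these pages are parsed, but no further links are pulled from them.
        Rule(LinkExtractor(allow=r'/detail/\d+\.html'), callback='parse_page'),
    )

    def parse_page(self, response):
        # Placeholder callback; real parsing logic depends on the target site.
        self.logger.info('visited %s', response.url)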
Crawl the movie titles and actor names from the entire 4567tv.tv site and persist them:
spiders/4567tv.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from crawlProject.items import CrawlprojectItem

# example of a paginated list URL: "/frim/index1-2.html"
class A4567tvSpider(CrawlSpider):
    name = '4567Tv'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    # link extractor (regular expression); an empty pattern would match every link
    link = LinkExtractor(allow=r'/frim/index1-\d+\.html')
    link1 = LinkExtractor(allow=r'/movie/index\d+\.html')

    rules = (
        Rule(link, callback='parse_item', follow=True),  # follow=True keeps following the matched pagination links
        Rule(link1, callback='parse_detail'),
    )
    # rules: registers the rule parsers; each Rule object represents one extraction rule
    # Rule: rule parser; sends requests for the links produced by its link extractor
    # and parses the resulting pages with the specified callback

    def parse_item(self, response):
        first_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for url in first_list:
            title = url.xpath('./div/a/@title').extract_first()
            name = url.xpath('./div/div/p/text()').extract_first()
            item = CrawlprojectItem()
            item["title"] = title
            item["name"] = name
            yield item

    def parse_detail(self, response):
        # The second Rule above references this callback, but the original post
        # never defines it; a minimal placeholder keeps the spider runnable.
        pass

    # CrawlSpider crawl flow:
    """
    The spider first fetches the page content at the start URLs.
    The link extractor extracts links from that content according to its rule.
    The rule parser requests those links and parses the returned pages with the specified callback.
    The parsed data is packed into items and handed to the pipeline for persistent storage.
    """
items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class CrawlprojectItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    name = scrapy.Field()
pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class CrawlprojectPipeline(object):
    def __init__(self):
        self.fp = None

    def open_spider(self, spider):
        print("Spider started!!!")
        self.fp = open("./movies.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.fp.write(item["title"] + ":" + item["name"] + "\n")
        return item

    def close_spider(self, spider):
        print("Spider finished!!!")
        self.fp.close()
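As a side note, if structured output is preferred, the same items could also be written with Scrapy's built-in feed exports instead of a custom pipeline, for example (output filename is illustrative):

scrapy crawl 4567Tv -o movies.json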
settings.py:
# -*- coding: utf-8 -*-

# Scrapy settings for crawlProject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawlProject'

SPIDER_MODULES = ['crawlProject.spiders']
NEWSPIDER_MODULE = 'crawlProject.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawlProject (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = "ERROR"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'crawlProject.middlewares.CrawlprojectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'crawlProject.middlewares.CrawlprojectDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'crawlProject.pipelines.CrawlprojectPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
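With the pipeline registered in ITEM_PIPELINES as above, running the spider from the project root should write each title:actor pair into movies.txt:

scrapy crawl 4567Tv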