Example: crawling a movie site's film list and the details behind each film's sub-link. parse() extracts the name, kind and detail-page URL from the list page, then issues a Request whose meta dict carries the partially filled item to getinfo(), which fills in the remaining fields and yields the item to the pipeline.
spiders/parndemo.py
import scrapy

from parnpost import items


class ParndemoSpider(scrapy.Spider):
    name = 'parndemo'
    # allowed_domains = ['https://www.55xia.com/movie']
    start_urls = ['https://www.55xia.com/movie/']

    def getinfo(self, response):
        """Parse a movie's detail page and fill in the remaining fields."""
        actor = response.xpath(
            "/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()").extract_first()
        language = response.xpath(
            "/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()").extract_first()
        time = response.xpath(
            "/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()").extract_first()
        # Recover the partially filled item passed along via meta
        item = response.meta["item"]
        item["actor"] = actor
        item["language"] = language
        item["time"] = time
        # Hand the completed item over to the pipeline
        yield item

    def parse(self, response):
        # Fields to collect: name, kind, director, language, running time
        # The trailing /div selects every child div (one per movie)
        div_list = response.xpath("/html/body/div[1]/div[1]/div[2]/div")
        for div in div_list:
            # Instantiate one item per movie
            item = items.ParnpostItem()
            name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            kind = div.xpath('.//div[@class="otherinfo"]/text()').extract_first()
            # Link to this movie's detail page
            url = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
            # The href is scheme-relative, so prepend "https:"
            url = "https:" + url
            # Store the data extracted from the list page
            item["name"] = name
            item["kind"] = kind
            # Request the detail page; meta must be a dict and carries the item
            # to the callback so the remaining fields can be added there
            yield scrapy.Request(url=url, callback=self.getinfo, meta={"item": item})
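The absolute XPath expressions above (including the tbody segments that browsers insert) are easy to get wrong, so it helps to try them interactively before running the spider. A quick sketch of that check using the Scrapy shell, assuming the list page is reachable, with the same selectors parse() uses:

scrapy shell "https://www.55xia.com/movie/"
>>> # first movie title on the list page
>>> response.xpath('//div[@class="meta"]/h1/a/text()').extract_first()
>>> # its detail-page href (scheme-relative, hence the "https:" prefix in parse())
>>> response.xpath('//div[@class="meta"]/h1/a/@href').extract_first()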
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ParnpostPipeline(object):
    fp = None

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file
        self.fp = open("record.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Called for every item the spider yields: format one record per line
        name = item["name"]
        kind = item["kind"]
        actor = item["actor"]
        language = item["language"]
        time = item["time"]
        content = ("名称:" + str(name) + " " + "种类:" + str(kind) + " "
                   + "导演:" + str(actor) + " " + "语种:" + str(language) + " "
                   + "时间:" + str(time) + "\n")
        self.fp.write(content)
        return item

    def close_spider(self, spider):
        # Called once when the spider closes: release the file handle
        self.fp.close()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for parnpost project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'parnpost'

SPIDER_MODULES = ['parnpost.spiders']
NEWSPIDER_MODULE = 'parnpost.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'parnpost.middlewares.ParnpostSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'parnpost.middlewares.ParnpostDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'parnpost.pipelines.ParnpostPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ParnpostItem(scrapy.Item):
    # Define the fields for your item here, one Field per piece of data
    name = scrapy.Field()      # movie title (list page)
    kind = scrapy.Field()      # genre / category (list page)
    actor = scrapy.Field()     # first credit row on the detail page
    language = scrapy.Field()  # language (detail page)
    time = scrapy.Field()      # running time (detail page)
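With these four files in place (the standard layout produced by scrapy startproject parnpost), the crawl is started from the project root; each scraped movie then becomes one line in record.txt written by the pipeline:

scrapy crawl parndemo

If you would rather let Scrapy serialize the items itself, the built-in feed export can be used instead of (or alongside) the pipeline, e.g. scrapy crawl parndemo -o movies.json.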