# -*- coding: utf-8 -*-
import scrapy

from rihan.items import RihanItem


class RihanspiderSpider(scrapy.Spider):
    name = "rihanspider"
    # allowed_domains = ["*******"]
    start_urls = ['**************']

    def parse(self, response):
        # Each thumbnail on the list page links to a detail page.
        for each in response.css('.img li'):
            index_url = each.css('a::attr(href)').extract_first()
            yield scrapy.Request(url=index_url, callback=self.parse_detail)

        # Follow the list pagination link ("下一页" means "next page").
        next_page = response.xpath('//a[contains(.,"下一页")]/@href').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)

    def parse_detail(self, response):
        item = RihanItem()
        item['image_title'] = response.css('.width .weizhi h1::text').extract_first()
        for each in response.css('.content'):
            # Note: to use Scrapy's built-in image download pipeline, use
            # extract() here (a list of URLs) rather than extract_first().
            item['image_url'] = each.css('img::attr(src)').extract()
            yield item

        # Detail pages are also paginated; keep following their "下一页" link.
        next_page = response.xpath('//a[contains(.,"下一页")]/@href').extract_first()
        if next_page:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_url, callback=self.parse_detail)
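The spider imports RihanItem from rihan.items, which is not shown here. A minimal definition consistent with the two fields the spider populates would look like the sketch below; this is an assumed items.py, not necessarily the project's actual file.

# rihan/items.py (assumed): only the two fields used by the spider are defined.
import scrapy


class RihanItem(scrapy.Item):
    image_title = scrapy.Field()  # title scraped from the detail page header
    image_url = scrapy.Field()    # list of image URLs for the ImagesPipeline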
# settings.py: enable Scrapy's built-in image download pipeline, point it at the
# item field that holds the image URLs, and store the files under the project directory.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_URLS_FIELD = 'image_url'
IMAGES_STORE = r'.'
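With these settings, running scrapy crawl rihanspider downloads every URL in image_url into IMAGES_STORE under full/<sha1 of url>.jpg (the ImagesPipeline also requires Pillow to be installed). If you would rather group files by page title, one option is to subclass the pipeline. The sketch below is an illustration, not part of the original project, and RihanImagesPipeline is a hypothetical name.

# rihan/pipelines.py (illustrative sketch, not from the original project)
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class RihanImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Read the same field that IMAGES_URLS_FIELD points at in settings.py.
        for url in item.get('image_url', []):
            yield scrapy.Request(url, meta={'title': item.get('image_title') or 'untitled'})

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # Save as <IMAGES_STORE>/<page title>/<original file name>.
        filename = request.url.split('/')[-1]
        return '{}/{}'.format(request.meta['title'], filename)

To activate it, replace the built-in entry in ITEM_PIPELINES with 'rihan.pipelines.RihanImagesPipeline': 1. Titles containing characters such as / would need sanitizing before being used as directory names.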