scrapy 设置图片下载
1 settings.py 配置
# Pipeline configuration (goes in the project's settings.py).
ITEM_PIPELINES = {
    # 'img_spider.pipelines.ImgSpiderPipeline': 300,
    # Built-in image-download pipeline middleware.
    'scrapy.pipelines.images.ImagesPipeline': 300
}
# Directory where downloaded images are stored
# (original note: create this path in advance — TODO confirm; Scrapy
# normally creates it on demand).
IMAGES_STORE = "images"
# Thumbnail variants to generate alongside each stored full-size image.
IMAGES_THUMBS = {
    "small": (50, 50),
    "big": (1000, 1000)
}
2 360图片爬虫应用
import scrapy
import json
class SotuSpider(scrapy.Spider):
    """Crawl the 360 Image (image.so.com) 'beauty' channel listing API and
    hand image URLs to the ImagesPipeline via the ``image_urls`` item field.
    """

    name = 'sotu'
    allowed_domains = ['so.com']
    # JSON listing endpoint; ``sn`` (filled by format) is the pagination offset.
    start_urls = ['https://image.so.com/zjl?ch=beauty&sn={}&listtype=new&temp=1']
    # Current pagination offset, advanced on the class so it is shared
    # across all callbacks of the crawl.
    page_sn = 1
    # Stop scheduling new pages once page_sn reaches this bound.
    MAX_COUNT = 200

    def start_requests(self):
        """Issue the initial listing request for each start URL."""
        for url in self.start_urls:
            yield scrapy.Request(url=url.format(self.page_sn),
                                 callback=self.parse, encoding="UTF-8")

    def parse(self, response, **kwargs):
        """Parse one JSON listing page: yield the image URLs it contains
        and, while more results remain, schedule the next page."""
        # Decode and parse the JSON API response body.
        images_json = json.loads(response.body.decode("UTF-8"))
        # ``list`` may be missing or null on an empty page — fall back to [].
        images_url_list = [img.get('qhimg_url')
                           for img in images_json.get("list") or []]
        # Hand the URLs to the images pipeline middleware for download.
        yield {"image_urls": images_url_list}
        # Advance page_sn (the search start id) by the number of results
        # returned; default to 0 so a missing ``count`` ends the crawl
        # instead of raising TypeError.
        count = images_json.get("count", 0)
        if count > 0 and SotuSpider.page_sn < self.MAX_COUNT:
            SotuSpider.page_sn += count
            yield scrapy.Request(url=self.start_urls[0].format(SotuSpider.page_sn),
                                 callback=self.parse, encoding="UTF-8")