闲着无聊,最近刚好看完scrapy框架,想着找个网站练练手,想来想去,把书中的360图片抓取拓展为批量抓取各版块图片,并分类保存,该网站为动态加载网站(Ajax),基本上没有什么反爬措施,因此抓取起来很顺利。这个小项目重点在于重写图片保存路径,各模块代码具体为:
1、items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Images360Item(scrapy.Item):
    """One batch of image results scraped from a 360 image channel.

    Fields:
        image_urls: list of direct image URLs parsed from the Ajax response;
            consumed by the images pipeline to download the files.
        image_path: the channel name ('beauty', 'car', ...) used by the
            pipeline as the sub-directory when saving the images.
    """

    image_urls = scrapy.Field()
    image_path = scrapy.Field()
2、images360.py
# -*- coding: utf-8 -*-
import json

import scrapy

from ..items import Images360Item


class Images360Spider(scrapy.Spider):
    """Crawl image.so.com's Ajax API across several channels.

    For every channel in ``type_list`` the spider requests ``page_number``
    pages (30 images each, paged via the ``sn`` offset parameter) and yields
    one :class:`Images360Item` per response, tagged with the channel name so
    the images pipeline can save files into per-channel folders.
    """

    name = 'images360'
    allowed_domains = ['image.so.com']
    # Channels to crawl; each becomes the "ch" query parameter.
    type_list = ['beauty', 'video', 'wallpaper', 'design', 'funny', 'art',
                 'car', 'photography', 'food', 'home', 'pet']
    # Number of 30-image pages fetched per channel.
    page_number = 50

    def start_requests(self):
        """Yield one request per (channel, offset) pair.

        Fix: the loop variable was named ``type``, shadowing the builtin —
        renamed to ``channel`` (a local name, so callers are unaffected).
        """
        base_url = 'http://image.so.com/zjl?ch={}&sn={}'
        for channel in self.type_list:
            for offset in range(0, self.page_number * 30, 30):
                item = Images360Item()
                item['image_path'] = channel
                yield scrapy.Request(
                    url=base_url.format(channel, offset),
                    # Pass the partially-filled item along with the request
                    # so the callback can complete it.
                    meta={'item': item},
                    callback=self.get_image,
                )

    def get_image(self, response):
        """Parse the JSON payload and attach all image URLs to the item.

        The API returns ``{"list": [{"qhimg_url": ...}, ...]}`` — each
        entry's ``qhimg_url`` is the direct image link.
        """
        item = response.meta['item']
        item['image_urls'] = [
            image['qhimg_url'] for image in json.loads(response.text)['list']
        ]
        yield item
3、pipelines.py(重点)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Images360Pipeline(object):
    """No-op pass-through pipeline kept from the project template
    (disabled in ITEM_PIPELINES)."""

    def process_item(self, item, spider):
        return item


class MyImages360Pipeline(ImagesPipeline):
    """Download every URL in ``item['image_urls']`` and save each file
    under a sub-directory named after the item's channel."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL.

        The channel name travels in ``meta`` so that ``file_path`` can
        build the classified path for each downloaded file.
        """
        for image_url in item['image_urls']:
            yield Request(
                url=image_url,
                meta={'image_path': item['image_path']},
            )

    def file_path(self, request, response=None, info=None):
        """Return '<channel>/<original file name>' relative to IMAGES_STORE.

        BUG FIX: the original format string was '{}{}' — no separator —
        so instead of one folder per channel, every image was dumped flat
        into IMAGES_STORE with the channel fused onto the file name
        (e.g. 'beautyt01abc.jpg'). Scrapy interprets '/' in the returned
        path as a sub-directory, which restores the intended per-channel
        classification.
        """
        image_name = request.url.split('/')[-1]
        image_path = request.meta['image_path']
        return '{}/{}'.format(image_path, image_name)
4、settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for Images360 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Images360'

SPIDER_MODULES = ['Images360.spiders']
NEWSPIDER_MODULE = 'Images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Images360.middlewares.Images360SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'Images360.middlewares.Images360DownloaderMiddleware': 543,
#     'Images360.middlewares.RandomUserAgentDownloaderMiddleware': 200,
#     'Images360.middlewares.RandomProxyDownloaderMiddleware': 250,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'Images360.pipelines.Images360Pipeline': 300,
    'Images360.pipelines.MyImages360Pipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Root directory where the images pipeline stores downloaded files.
# BUG FIX: the original value r'E:360图片' is a *drive-relative* path on
# Windows (resolved against drive E's current directory) — the missing
# backslash was almost certainly lost in transcription. Restored the
# absolute path.
IMAGES_STORE = r'E:\360图片'

# Redis connection settings for the (currently commented-out) proxy
# middleware referenced in DOWNLOADER_MIDDLEWARES above.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PWD = '123456'
REDIS_PROXY_KEY = 'proxy'

LOG_LEVEL = 'WARNING'
最后附上抓取结果截图: