  • Python FilesPipeline: downloading image galleries

    # -*- coding: utf-8 -*-
    import re
    from time import sleep
    
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class AngelSpider(CrawlSpider):
        name = 'angel'
        allowed_domains = ['angelimg.spbeen.com']
        start_urls = ['http://angelimg.spbeen.com/']
    
        base_url = "http://angelimg.spbeen.com"
        rules = (
            Rule(LinkExtractor(allow=r'^http://angelimg\.spbeen\.com/ang/\d+$'), callback='parse_item', follow=False),
        )
    
        def parse_item(self, response):
            print(response.url)
            # The item is threaded through successive page requests via meta;
            # it is created only on the first page of a gallery.
            item = response.meta.get('item')
            if not item:
                item = {'files': [], 'file_urls': []}
                dir_name = response.xpath('.//div[@class="article"]/h2/text()').extract_first()
                # Keep the title before '【' and strip everything except CJK
                # characters, digits and ASCII letters for a safe directory name.
                item['dir_name'] = dir_name.split('【')[0]
                item['dir_name'] = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", item['dir_name'])
    
            img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
            item['file_urls'].append(img_url)
            # If there is a next page, request it; otherwise the gallery is complete and the item goes to the pipeline.
            next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
    
            #sleep(1)
            if next_url:
                next_url = self.base_url + next_url
                yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
            else:
                yield item
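
    To make the data flow concrete, this is roughly what the accumulated item looks like by the time the last page of a gallery yields it to the pipeline (the URLs and title below are made up for illustration):

    # Hypothetical item for a three-page gallery; 'files' is populated
    # later by FilesPipeline with the download results.
    {
        'dir_name': 'SomeGalleryTitle',
        'files': [],
        'file_urls': [
            'http://angelimg.spbeen.com/some/1.jpg',
            'http://angelimg.spbeen.com/some/2.jpg',
            'http://angelimg.spbeen.com/some/3.jpg',
        ],
    }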
    

      

    Pipeline: subclass FilesPipeline

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import hashlib
    import os
    
    from scrapy.pipelines.files import FilesPipeline
    
    class AngelimgPipeline(object):
        def process_item(self, item, spider):
            return item
    
    
    
    from scrapy.http import Request
    from scrapy.utils.python import to_bytes
    
    class DealFilePathPipeline(FilesPipeline):
        def get_media_requests(self, item, info):
            # Pass the item through meta so file_path() can read dir_name.
            return [Request(x, meta={'item': item}) for x in item.get(self.files_urls_field, [])]
    
    
        def file_path(self, request, response=None, info=None):
            ## start of deprecation warning block (can be removed in the future)
            def _warn():
                from scrapy.exceptions import ScrapyDeprecationWarning
                import warnings
                warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                              'file_path(request, response=None, info=None) instead',
                              category=ScrapyDeprecationWarning, stacklevel=1)
    
            # check if called from file_key with url as first argument
            if not isinstance(request, Request):
                _warn()
                url = request
            else:
                url = request.url
    
            # detect if file_key() method has been overridden
            if not hasattr(self.file_key, '_base'):
                _warn()
                return self.file_key(url)
            ## end of deprecation warning block
            item = request.meta.get('item', {})
            media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
            media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
            # Store each file under a directory named after its gallery.
            return 'full2/{}/{}{}'.format(item['dir_name'], media_guid, media_ext)
    
        # deprecated
        def file_key(self, url):
            return self.file_path(url)
    
        file_key._base = True
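
    To check how file_path() names files: each file is stored under a per-gallery directory, named by the SHA-1 of its URL plus the original extension. A minimal standalone sketch (the URL and gallery name are made up):

    import hashlib
    import os

    from scrapy.utils.python import to_bytes

    url = 'http://angelimg.spbeen.com/some/pic.jpg'   # hypothetical URL
    media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
    media_ext = os.path.splitext(url)[1]              # '.jpg'
    # -> something like 'full2/SomeGallery/3912d2...ab.jpg'
    print('full2/{}/{}{}'.format('SomeGallery', media_guid, media_ext))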
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for angelImg project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'angelImg'
    
    SPIDER_MODULES = ['angelImg.spiders']
    NEWSPIDER_MODULE = 'angelImg.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'angelImg (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
      # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
        "Referer":"http://angelimg.spbeen.com/"
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       #'angelImg.pipelines.AngelimgPipeline': 300,
       'angelImg.pipelines.DealFilePathPipeline': 200,
       #'scrapy.pipelines.files.FilesPipeline': 2
    }
    
    FILES_STORE = 'file_download'  # directory where downloaded files are saved
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
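
    With the pipeline enabled and FILES_STORE set, the spider can be run with "scrapy crawl angel" from the project root, or from a script. A minimal sketch, assuming it is executed from the project root so the project settings are picked up:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # get_project_settings() reads settings.py via scrapy.cfg, so
    # ITEM_PIPELINES and FILES_STORE take effect.
    process = CrawlerProcess(get_project_settings())
    process.crawl('angel')   # spider name defined on AngelSpider
    process.start()          # blocks until the crawl finishes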
  • Original article: https://www.cnblogs.com/brady-wang/p/12510024.html