  • Scraping Qutoutiao (趣头条) with Scrapy

    # -*- coding: utf-8 -*-
    # Spider: e.g. spiders/qutoutiao.py (filename assumed from the spider name)
    import scrapy
    from ..items import QutoutiaoItem
    import json
    import re
    from ..settings import CATEGORY_INFO, LIST_LIMIT
    
    
    
    class QutoutiaoSpider(scrapy.Spider):

        name = 'qutoutiao'
        #allowed_domains = ['qutoutiao.net']
        # List API endpoint
        BASE_API = 'http://api.1sapp.com/content/outList?'
        # Crawl start urls, one per category
        start_urls = []
        # Category list (from settings)
        categoryInfo = CATEGORY_INFO
        limit = LIST_LIMIT
        for value in categoryInfo:
            url = BASE_API + "cid=%s&tn=1&page=1&limit=%s" % (
                str(value['cid']), str(limit))
            start_urls.append(url)
    
    
        def parse(self, response):

            response_url = response.url
            # Pull the category id out of the request url
            searchObj = re.search(r'(.*)cid=(\d+)', response_url)
            cid = searchObj.group(2) if searchObj else 0
    
            data = json.loads(response.text)['data']['data']
    
            for value in data:
                # Build the item
                item = QutoutiaoItem()
                # Source name
                item['source_name'] = value['source_name']
                # Title
                item['title'] = value['title']
                # Detail-page url
                url = item['url'] = value['url']
                # url = url[0:url.find('?')]
                # Summary
                item['introduction'] = value['introduction']
                # Cover images
                item['cover'] = value['cover']
                # Publish time
                item['publish_time'] = value['publish_time']
                # Category id
                item['cid'] = cid

                # Crawl the detail page
                yield scrapy.Request(url=item['url'], meta={'meta_item': item},
                                     callback=self.detail_parse)
    
        # Detail page
        def detail_parse(self, response):
            # Retrieve the item carried in the request meta
            meta_item = response.meta['meta_item']
            # Article body
            content_selector = response.xpath('//div[@class="content"]')
            # Relative XPath (.//) so only images inside the content div are collected
            meta_item['content_images'] = content_selector.xpath(
                './/img/@src|.//img/@data-src').extract()
            meta_item['content'] = content_selector.extract_first()
            yield meta_item
    
    
    
    
    
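    parse() above assumes the list API returns JSON nested as data.data, holding at least the
    fields the spider copies into QutoutiaoItem. The sketch below is reconstructed from those
    field accesses only; the real schema of api.1sapp.com is not documented here and all values
    are placeholders:

    # Hypothetical shape of one list-API response (placeholder values)
    example_response = {
        "data": {
            "data": [
                {
                    "source_name": "example source",
                    "title": "example title",
                    "url": "http://example.com/detail/123.html",   # detail page crawled next
                    "introduction": "short summary",
                    "cover": ["http://example.com/cover1.jpg"],    # list of cover image urls
                    "publish_time": "2018-06-01 11:01",
                },
                # ... up to LIST_LIMIT entries per page
            ]
        }
    }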
    # -*- coding: utf-8 -*-
    # items.py

    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class QutoutiaoItem(scrapy.Item):
        # define the fields for your item here like:
    
        # Article id
        aid = scrapy.Field()
        # Source name
        source_name = scrapy.Field()
        # Title
        title = scrapy.Field()
        # Detail-page url
        url = scrapy.Field()
        # Summary
        introduction = scrapy.Field()
        # Cover images
        cover = scrapy.Field()
        # Publish time
        publish_time = scrapy.Field()
        # Category id
        cid = scrapy.Field()
        # Article content (HTML)
        content = scrapy.Field()
        # Image urls found inside the content
        content_images = scrapy.Field()
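    A QutoutiaoItem behaves like a dict restricted to the fields declared above, which is what
    the spider (item['title'] = ...) and the JSON pipeline (dict(item)) rely on. A minimal
    usage sketch, not part of the project files:

    # Quick demonstration of the Item/dict behaviour (hypothetical snippet)
    item = QutoutiaoItem()
    item['title'] = 'example'          # declared field: OK
    # item['foo'] = 1                  # undeclared field: raises KeyError
    print(dict(item))                  # {'title': 'example'} -- what the JSON pipeline serializes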
    # -*- coding: utf-8 -*-
    # middlewares.py
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    
    
    class QutoutiaoSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    from fake_useragent import UserAgent
    import logging
    
    
    class UserAgent_CookiesMiddleware(object):
        # Randomly rotate the User-Agent and set fixed request headers
        def __init__(self, crawler):
            super(UserAgent_CookiesMiddleware, self).__init__()
            self.ua = UserAgent()
            self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
            self.logger = logging.getLogger(__name__)
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)
    
        def process_request(self, request, spider):
            def get_ua():
                return getattr(self.ua, self.ua_type)
    
            random_agent = get_ua()
    
            if random_agent:
                # Attach the rotated UA plus the headers the target site expects
                request.headers['User-Agent'] = random_agent
                request.headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
                request.headers['Origin'] = 'http://home.qutoutiao.net'
                request.headers['Referer'] = 'http://home.qutoutiao.net/pages/home.html'
    
                self.logger.debug('Current UserAgent: ' + random_agent)
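    UserAgent_CookiesMiddleware reads an optional RANDOM_UA_TYPE setting (default 'random') and
    uses it as an attribute name on fake_useragent's UserAgent object, so values such as
    'chrome' or 'firefox' pin the browser family. A minimal settings.py addition, assuming the
    defaults shown above:

    # Optional: which fake_useragent attribute to use for rotation
    # ('random' picks any browser; 'chrome', 'firefox', 'ie', ... pin a family)
    RANDOM_UA_TYPE = 'random'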
    # -*- coding: utf-8 -*-
    # pipelines.py
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    import scrapy
    from scrapy.utils.project import get_project_settings
    from scrapy.pipelines.images import ImagesPipeline
    import os
    from .qttutils import QttUtils
    
    
    # Cover image download pipeline
    class CoverImagePipeline(ImagesPipeline):
        # Image store path taken from settings
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

        # Queue the cover images for download
        def get_media_requests(self, item, info):
            cover_images = item['cover']
            if cover_images:
                for image_url in cover_images:
                    yield scrapy.Request(url=image_url)
    
        # Called when all image downloads for this item have finished
        def item_completed(self, results, item, info):
            # print('*'*20,results,item,info)
            image_path = [x['path'] for ok, x in results if ok]
            # Get the custom dated storage path
            store_path = QttUtils.getStorePath()
            coverImages = []
            # Move the downloaded images to the new path
            print('------------------image_path-',image_path)
            print('-----------------type(image_path)--', type(image_path))
            if image_path:
                for image_url in image_path:
                    file_name = os.path.split(str(image_url))
                    print('------------------file_name-', file_name)
                    print('------------------file_name-', type(file_name))
                    new_image = store_path + '/' + file_name[1]
                    coverImages.append(new_image)
                    os.rename(self.IMAGES_STORE + '/' + image_url, new_image)
            item['cover'] = coverImages
            return item
    
    # Content image download pipeline
    class ContentImagePipeline(ImagesPipeline):
        # Image store path taken from settings
        IMAGE_STORE = get_project_settings().get('IMAGES_STORE')

        # Queue the in-content images for download
        def get_media_requests(self, item, info):
            content_images = item['content_images']
            if content_images:
                for image_url in content_images:
                    yield scrapy.Request(image_url)
    
        # Called when all image downloads for this item have finished
        def item_completed(self, results, item, info):
            image_path = [x['path'] for ok, x in results if ok]
            print('---------------------image_path', image_path)
            # Get the custom dated storage path
            store_path = QttUtils.getStorePath()
            contentImages = []
            # Move the downloaded images to the new path
            if image_path:
                for base_path in image_path:
                    print('----------------value', base_path)
                    file_name = os.path.split(str(base_path))
                    new_image = store_path + "/" + file_name[1]
                    contentImages.append(new_image)
                    os.rename(self.IMAGE_STORE + "/" + base_path, new_image)
            item['content_images'] = contentImages
            return item
    
        # # item_completed, variant 1: keep the original source url alongside the new local path
        # def item_completed(self, results, item, info):
        #     image_info = [(x['path'], x['url']) for ok, x in results if ok]
        #     print('---------------------image_info', image_info)
        #     # Get the custom dated storage path
        #     store_path = QttUtils.getStorePath()
        #     contentImages = []
        #     # Move the downloaded images to the new path
        #     if image_info:
        #         for value in image_info:
        #             print('----------------value', value)
        #             image_url = value[0]
        #             image_source = value[1]
        #
        #             file_name = os.path.split(str(image_url))
        #             new_image = store_path + "/" + file_name[1]
        #             contentImages.append((new_image, image_source))
        #             os.rename(self.IMAGE_STORE + "/" + image_url, new_image)
        #     item['content_images'] = contentImages
        #     return item
    
    import json
    from .qttutils import QttUtils
    
    
    class QutoutiaoPipeline(object):
    
        def __init__(self):
            # Get the custom dated storage path
            store_path = QttUtils.getStorePath()
            json_path = store_path + '/' + 'qutoutiao.json'
            self.filename = open(json_path, 'wb')
    
        def process_item(self, item, spider):
            text = json.dumps(dict(item), ensure_ascii=False) + '\n'
            self.filename.write(text.encode('utf-8'))
            return item
    
        def close_spider(self, spider):
            self.filename.close()
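    QutoutiaoPipeline writes one JSON object per line to qutoutiao.json inside the dated
    directory from QttUtils.getStorePath(). Note that with the ITEM_PIPELINES priorities in
    settings.py below (300 for this pipeline vs. 301/302 for the image pipelines) it runs
    first, so cover and content_images still hold the remote urls when the line is written;
    give it the largest number instead if the local paths are wanted in the JSON. A made-up
    example line (placeholder values, not real crawl output):

    # {"source_name": "example source", "title": "example title",
    #  "url": "http://example.com/detail/123.html", "introduction": "short summary",
    #  "cover": ["http://example.com/cover1.jpg"], "publish_time": "2018-06-01 11:01",
    #  "cid": "255", "content": "<div class=\"content\">...</div>",
    #  "content_images": ["http://example.com/img1.jpg"]}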
    # -*- coding: utf-8 -*-
    # @Time    : 2018-6-1 11:01
    # @Author  : Amir
    # @Site    : 
    # @File    : qttutils.py
    # @Software: PyCharm
    
    '''
    Utility helpers for the Qutoutiao (趣头条) crawler
    '''
    
    import time
    import os
    import shutil
    from .settings import DATA_PATH
    
    class QttUtils:
        # Build the dated storage path
        #
        # @param  [string] action ['remove' deletes the directory; default 'create']
        # @return [string] DATA_PATH/year/month/day
        @staticmethod
        def getStorePath(action='create'):
            localtimes = time.localtime()
            year = time.strftime("%Y", localtimes)
            month = time.strftime('%m', localtimes)
            day = time.strftime('%d', localtimes)
            store_path = DATA_PATH + "/%s/%s/%s" % (year, month, day)

            # Remove the directory if requested
            if os.path.exists(store_path) and action == 'remove':
                shutil.rmtree(store_path)

            # Create the nested directories
            if not os.path.exists(store_path) and action == 'create':
                os.makedirs(store_path)
    
            return store_path
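    A quick usage sketch: with DATA_PATH = r'./data' (see settings.py below), getStorePath()
    creates the dated directory on first call and returns its path; passing action='remove'
    deletes it instead. Hypothetical example:

    from QuTouTiao.qttutils import QttUtils

    path = QttUtils.getStorePath()            # e.g. './data/2018/06/01', created if missing
    QttUtils.getStorePath(action='remove')    # removes today's directory (and returns its path)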
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for QuTouTiao project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'QuTouTiao'
    
    SPIDER_MODULES = ['QuTouTiao.spiders']
    NEWSPIDER_MODULE = 'QuTouTiao.spiders'
    
    
    # Number of records requested per list-API page
    LIST_LIMIT = 10

    # Storage paths
    DATA_PATH = r'./data'
    IMAGES_STORE = r'./image'

    # Categories (cid -> name)
    CATEGORY_INFO = [
        {"cid":255,"name":"推荐"},
        {"cid":1,"name":"热点"},
        {"cid":6,"name":"娱乐"},
        {"cid":5,"name":"养生"},
        {"cid":2,"name":"搞笑"},
        {"cid":7,"name":"科技"},
        {"cid":8,"name":"生活"},
        {"cid":10,"name":"财经"},
        {"cid":9,"name":"汽车"},
    ]
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
    }
    
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
        'QuTouTiao.pipelines.ContentImagePipeline': 301,
        'QuTouTiao.pipelines.CoverImagePipeline': 302
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
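    With these files in place in a standard Scrapy project layout (scrapy.cfg at the root, the
    QuTouTiao package holding settings.py, items.py, middlewares.py, pipelines.py, qttutils.py
    and the spider under spiders/), the crawl is started from the project root. The output
    locations below follow the relative DATA_PATH/IMAGES_STORE settings above:

    # scrapy crawl qutoutiao
    #
    # Expected output locations:
    #   ./image/full/                             temporary downloads made by ImagesPipeline
    #   ./data/<yyyy>/<mm>/<dd>/qutoutiao.json    one JSON object per article
    #   ./data/<yyyy>/<mm>/<dd>/*.jpg             cover/content images moved by the pipelines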
  • Original article: https://www.cnblogs.com/hyxailj/p/9124659.html