zoukankan      html  css  js  c++  java
  • 用scrapy爬取搜狗Lofter图片

    用scrapy爬取搜狗Lofter图片

    # -*- coding: utf-8 -*-
    import json
    
    import scrapy
    from scrapy.http import Request
    from urllib import parse
    from scrapy.loader import ItemLoader
    
    from tutorial.items import LofterSpiderItem
    
    
    class LofterSpider(scrapy.Spider):
        """Spider that pages through the Sogou Pictures LOFTER recommendation feed.

        ``parse`` receives the landing page listed in ``start_urls`` and uses it
        only as a trigger to request the first page of the JSON feed.
        ``parse_url`` turns every entry of a feed page into a
        ``LofterSpiderItem`` and then requests the following page, stopping as
        soon as a page carries no entries.
        """

        name = "lofter"
        allowed_domains = ["pic.sogou.com"]
        start_urls = ['http://pic.sogou.com/']

        # Pagination step. Must stay equal to the ``len`` query parameter that is
        # hard-coded in ``start_answer_url`` below.
        page_size = 15

        # URL template of the JSON feed; ``{0}`` is the offset of the first
        # picture on the requested page.
        start_answer_url = "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=LOFTER&tag=%E5%85%A8%E9%83%A8&start={0}&len=15"

        # Headers sent with every feed request.
        headers = {
            "HOST": "pic.sogou.com",
            "Referer": "http://pic.sogou.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        def parse(self, response):
            """Request the first feed page (offset 0).

            Args:
                response: Response for the landing page; its content is not used.

            Yields:
                A single ``scrapy.Request`` whose callback is ``parse_url``.
            """
            yield scrapy.Request(
                self.start_answer_url.format(0),
                headers=self.headers,
                callback=self.parse_url,
            )

        def parse_url(self, response):
            """Emit one item per picture on a feed page, then request the next page.

            Args:
                response: Response whose body is the JSON document of one feed
                    page. The code reads the keys ``all_items`` (list of entries,
                    each with ``ori_pic_url``) and ``startIndex``.

            Yields:
                ``LofterSpiderItem`` objects with ``lofter_image_url`` populated,
                followed by at most one ``scrapy.Request`` for the next page.
            """
            ans_json = json.loads(response.text)

            entries = ans_json.get('all_items') or []
            if not entries:
                # An empty page means there is nothing further to fetch. Without
                # this guard the spider would keep scheduling follow-up pages.
                return

            for ans in entries:
                image_url = ans.get('ori_pic_url')
                if not image_url:
                    # Skip entries without a picture URL instead of aborting the
                    # whole page with a KeyError.
                    continue
                item_loader = ItemLoader(item=LofterSpiderItem(), response=response)
                item_loader.add_value("lofter_image_url", image_url)
                yield item_loader.load_item()

            start_index = ans_json.get('startIndex')
            if start_index is None:
                # The next offset cannot be computed without the current one.
                self.logger.warning(
                    "No 'startIndex' in response from %s; stopping pagination",
                    response.url,
                )
                return

            yield scrapy.Request(
                self.start_answer_url.format(start_index + self.page_size),
                headers=self.headers,
                callback=self.parse_url,
            )
    
    
    

    settings.py

    # Pipelines enabled for this project, mapped to their order values.
    ITEM_PIPELINES = {
        'tutorial.pipelines.TutorialPipeline': 300,
        'tutorial.pipelines.TutorialImagePipeline': 1,
    }

    # Left disabled in the original settings:
    # IMAGES_URLS_FIELD = "front_image_url"

    # Images are stored in an "image" directory located next to this file.
    _settings_location = os.path.dirname(__file__)
    project_dir = os.path.abspath(_settings_location)
    IMAGES_STORE = os.path.join(project_dir, 'image')
    

    items.py

    class LofterSpiderItem(scrapy.Item):
        """Item holding the picture URL collected by the LOFTER spider.

        The single field declares its own output processor in the field
        metadata: ``MapCompose(return_value)``.
        """

        # NOTE(review): ``return_value`` is defined outside this snippet;
        # presumably it returns its argument unchanged so the field keeps a
        # list of URLs -- confirm against its definition.
        lofter_image_url = scrapy.Field(output_processor=MapCompose(return_value))
    
  • 相关阅读:
    (转)HTTP、TCP和HTTPS
    使用覆盖索引优化like查询
    Laravel 源码解析(一)
    redis 缓存策略注意的问题总结
    laravel 监听mysql操作 生成时间
    CommonJS, AMD, CMD 笔记
    php json_encode小数精度丢失的问题
    Python通过pandas操作excel常用功能
    Vmware 虚拟化技术
    磁盘中的esp分区与msr分区
  • 原文地址:https://www.cnblogs.com/luozhiyun/p/8127259.html
Copyright © 2011-2022 走看看