zoukankan      html  css  js  c++  java
  • 用scrapy爬取搜狗Lofter图片

    用scrapy爬取搜狗Lofter图片

    # -*- coding: utf-8 -*-
    import json
    
    import scrapy
    from scrapy.http import Request
    from urllib import parse
    from scrapy.loader import ItemLoader
    
    from tutorial.items import LofterSpiderItem
    
    
    class LofterSpider(scrapy.Spider):
        """Crawl Sogou's LOFTER picture feed and emit one item per image URL.

        The spider starts at the Sogou picture home page, then walks the JSON
        listing endpoint page by page, yielding a ``LofterSpiderItem`` for each
        listed picture. Pagination stops at the first page with no entries.
        """

        name = "lofter"
        allowed_domains = ["pic.sogou.com"]
        start_urls = ['http://pic.sogou.com/']

        # Step between successive ``start`` offsets. Must stay equal to the
        # ``len`` query parameter hard-coded in ``start_answer_url`` below.
        page_size = 15

        # URL template of the JSON listing endpoint; ``{0}`` is the start offset.
        start_answer_url = "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=LOFTER&tag=%E5%85%A8%E9%83%A8&start={0}&len=15"

        # Headers sent with every listing request.
        headers = {
            "HOST": "pic.sogou.com",
            "Referer": "http://pic.sogou.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        def parse(self, response):
            """Start pagination by requesting the listing page at offset 0.

            The ``response`` for the start URL is not inspected; it only
            triggers the first listing request.
            """
            yield scrapy.Request(
                self.start_answer_url.format(0),
                headers=self.headers,
                callback=self.parse_url,
            )

        def parse_url(self, response):
            """Yield one item per listed picture, then request the next page.

            The response body is parsed as JSON. Each entry of ``all_items``
            contributes its ``ori_pic_url`` as the ``lofter_image_url`` value
            of a new item. The next offset is the page's ``startIndex`` plus
            ``page_size``.
            """
            ans_json = json.loads(response.text)
            entries = ans_json.get('all_items') or []
            if not entries:
                # Without this guard the spider keeps requesting ever-larger
                # offsets and never finishes.
                # NOTE(review): assumes an empty or missing ``all_items`` marks
                # the end of the feed -- confirm against the live endpoint.
                return

            for ans in entries:
                image_url = ans['ori_pic_url']
                item_loader = ItemLoader(item=LofterSpiderItem(), response=response)
                item_loader.add_value("lofter_image_url", image_url)
                lofter_item = item_loader.load_item()
                yield lofter_item

            next_start = ans_json['startIndex'] + self.page_size
            yield scrapy.Request(
                self.start_answer_url.format(next_start),
                headers=self.headers,
                callback=self.parse_url,
            )
    
    
    

    settings.py

    # Enabled item pipelines, keyed by import path and listed by ascending
    # order value.
    ITEM_PIPELINES = {
        'tutorial.pipelines.TutorialImagePipeline': 1,
        'tutorial.pipelines.TutorialPipeline': 300,
    }
    # IMAGES_URLS_FIELD = "front_image_url"
    # Absolute path of the directory that holds this settings module.
    project_dir = os.path.dirname(os.path.abspath(__file__))
    # Image storage path: an "image" directory next to this settings file.
    IMAGES_STORE = os.path.join(project_dir, 'image')
    

    items.py

    class LofterSpiderItem(scrapy.Item):
        """Item with a single field holding a LOFTER picture URL."""

        # The field's output processor is MapCompose(return_value).
        # NOTE(review): ``return_value`` is defined outside this fragment --
        # presumably an identity function; confirm before relying on it.
        lofter_image_url = scrapy.Field(output_processor=MapCompose(return_value))
    
  • 相关阅读:
    linux free
    uptime
    简述负载均衡&CDN技术(转)
    大胆地去做自己坚信的事情,去做不伤害国家和客户的事情 做企业一定要专注。为企业制定战略目标,绝对不能超过三个。超过三个,你就记不住了,员工也记不住
    同一路由器不同vlan之间的通信(一)
    计算机基础之计算机网络与安全
    LayoutInflater的使用
    插入排序
    Java NIO与IO
    高速排序算法
  • 原文地址:https://www.cnblogs.com/luozhiyun/p/8127259.html
Copyright © 2011-2022 走看看