zoukankan      html  css  js  c++  java
  • scrapy简单爬取图片

    #这里只爬取第一页
    
    items.py
    import scrapy
    # Container for the data collected by the spider.
    class InsistItem(scrapy.Item):
        """Scraped item carrying the image URLs to be downloaded."""

        # Populated by the spider with a list of image URLs.
        image_urls = scrapy.Field()
    
    tengxun.py
    import scrapy
    from insist.items import InsistItem
    import json
    
    class TengxunSpider(scrapy.Spider):
        """Spider that collects image URLs from the JSON room-list API.

        Only the single URL in ``start_urls`` is requested, so just the first
        page of results is crawled.
        """

        name = 'tengxun'
        allowed_domains = ['douyucdn.cn']
        start_urls = ['http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=']

        def parse(self, response):
            """Yield one ``InsistItem`` per entry in the response's ``data`` list.

            The response body is decoded as JSON; each entry's
            ``vertical_src`` value becomes the sole element of the item's
            ``image_urls`` list.
            """
            payload = json.loads(response.body)
            rooms = payload['data']
            # Log instead of print so the output follows Scrapy's log settings.
            self.logger.debug('parsed room data: %s', rooms)
            for room in rooms:
                # Build a fresh item per room: reusing one instance would make
                # every yielded item the same object, so a later iteration
                # could overwrite the URL before the pipeline handled it.
                item = InsistItem()
                # Important: the images pipeline reads a *list* of URLs from
                # this field, so even a single link must be wrapped in a list.
                item['image_urls'] = [room['vertical_src']]
                yield item
    
    settings.py
    # Enabled item pipelines mapped to their order value.
    ITEM_PIPELINES = {
        # 'insist.pipelines.InsistPipeline': 300,
        'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    # Directory where downloaded images are saved. A raw string is required:
    # in a normal string literal "\U" starts a unicode escape, which makes this
    # Windows path a SyntaxError on Python 3.
    IMAGES_STORE = r'C:\Users\lenovo\Desktop\data'
    # Name of the item field that holds the list of image URLs.
    IMAGES_URLS_FIELD = 'image_urls'
    
    pipelines.py
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline#导包
    class SDPipeline(ImagesPipeline):
        """Images pipeline that requests every URL listed in ``image_urls``."""

        def get_media_requests(self, item, info):
            """Yield one download request per URL in ``item['image_urls']``.

            ``info`` is accepted to match the pipeline hook signature and is
            not used here.
            """
            # The field holds a list of URLs; a Request takes a single URL
            # string, so the list must be iterated rather than passed whole.
            for image_link in item['image_urls']:
                yield scrapy.Request(image_link)
    
    最后在项目根目录下运行命令:scrapy crawl tengxun
    运行结束后,打开 IMAGES_STORE 所设置的图片保存目录,其中会生成一个名为 full 的文件夹,下载的图片都保存在里面。
  • 相关阅读:
    python中创建实例属性
    Python中if __name__ == "__main__": 的理解
    模块
    python函数式编程
    python-复杂生成式
    生成器的测试
    mysql乱码配置
    javascript
    Sql Server 2008 R2 下载地址
    Microsoft Data Access Components 2.8
  • 原文地址:https://www.cnblogs.com/persistence-ok/p/11560304.html
Copyright © 2011-2022 走看看