  • Scraping Lianjia rental listings: saving images with ImagesPipeline

    # Spider file
    
    # -*- coding: utf-8 -*-
    import scrapy
    import os
    from urllib import request
    from lianjia.items import LianjiaItem
    class LianjiaspiderSpider(scrapy.Spider):
        name = 'lianjiaSpider'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://bj.lianjia.com/zufang/l1rp5/#contentList']
    
        def parse(self, response):
            div_list = response.xpath('//div[@class="content__list"]/div[@class="content__list--item"]')
            # print(len(div_list))
            for div in div_list:
                title = div.xpath('.//div[@class="content__list--item--main"]/p[1]/a/text()').get()
                title = title.strip()
                detail_url = div.xpath('.//div[@class="content__list--item--main"]/p[1]/a/@href').get()
                detail_url = "https://bj.lianjia.com" + detail_url
                # print(detail_url)
                location = div.xpath('.//div[@class="content__list--item--main"]/p[2]//text()').getall()
            location = list(map(lambda x: x.replace("\n", "").replace("-", "").replace("/", "").strip(), location))
                location = "".join(location)
                # print(location)
                price = div.xpath('.//div[@class="content__list--item--main"]/span//text()').getall()
                price = price[0]+price[1]
                # print(price)
    
                yield scrapy.Request(url=detail_url, callback=self.parse_detail,meta={'info':(title,location,price,detail_url)})
    
        # URLs for pages 2-100
            for i in range(2,101):
                next_url = "https://bj.lianjia.com/zufang/pg%dl1rp5/#contentList" % i
                yield scrapy.Request(url=next_url, callback=self.parse)
    
    
        def parse_detail(self,response):
            title,location,price,detail_url = response.meta.get("info")
            # pic_src = response.xpath("//div[@class='content__thumb--box']/ul/li[2]/img/@src").get()
            pic_srcs = response.xpath("//div[@class='content__thumb--box']/ul//img/@src").getall()
            # print('Floor-plan image links:', pic_srcs)
            print('Listing URL:', detail_url)
    
            item = LianjiaItem()
            item["title"] = title
            item["location"] = location
            item["price"] = price
            item['detail_url']=detail_url
            # item['pic_srcs'] = pic_srcs  # uncomment for the plain LianjiaPipeline (with the matching item definition below)
            item['image_urls'] = pic_srcs  # field name required by ImagesPipeline
            yield item
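
    A usage note: assuming the standard Scrapy project layout, the spider above can be started from the directory containing scrapy.cfg:

        scrapy crawl lianjiaSpider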
    # Pipeline file
    # Saving images
    # Plain approach: download images with urllib
    
    import os
    from urllib import request
    
    class LianjiaPipeline(object):
        def __init__(self):
            # os.path.dirname(__file__) is the directory containing this pipeline file;
            # os.path.dirname(os.path.dirname(__file__)) is the project root.
            # Build the path of an images folder under the project root.
            self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')  # path of the images folder
            if not os.path.exists(self.path):
                print("images folder does not exist")
                os.mkdir(self.path)  # create the images folder
    
        def process_item(self, item, spider):
            location = item['location']
            urls = item['pic_srcs']
            per_house_pic_path = os.path.join(self.path,location)
            # path2 = self.path  # e.g. G:\Crawler and Data\21days_spider\lianjia\images

            # Normalize separators for Windows: replace forward slashes with
            # backslashes (a backslash in a string literal must be escaped as '\\')
            per_house_pic_path = per_house_pic_path.replace('/', '\\')
            print("Save path for this listing's floor-plan images:", per_house_pic_path)
    
            if not os.path.exists(per_house_pic_path):
                os.mkdir(per_house_pic_path)
            for url in urls:
                # each image URL: swap the thumbnail size for a larger rendition
                url = url.replace('126x86.jpg', '780x439.jpg')
                # Derive a filename from the URL so images don't overwrite each
                # other (otherwise only the last image downloaded would survive)
                pic_name = url.split('.')[2][-9:-1]

                # join the listing folder and the image name into the full file path
                request.urlretrieve(url=url, filename=os.path.join(per_house_pic_path, pic_name + '.png'))
            return item
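
    The URL-slicing trick above (url.split('.')[2][-9:-1]) depends on the exact shape of Lianjia's image URLs; hashing the whole URL is a sturdier way to get unique names. A minimal sketch (the helper name pic_name_from_url is made up for illustration):

    import hashlib

    def pic_name_from_url(url):
        # hash the full URL so every distinct image gets a distinct, stable name
        return hashlib.md5(url.encode('utf-8')).hexdigest() + '.png'

    # inside the loop above:
    # request.urlretrieve(url=url, filename=os.path.join(per_house_pic_path, pic_name_from_url(url)))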
    
        
    # Items file
    import scrapy

    class LianjiaItem(scrapy.Item):
        # define the fields for your item here like:
    
        # plain fields
        title = scrapy.Field()
        detail_url = scrapy.Field()
        location = scrapy.Field()
        price = scrapy.Field()
        pic_srcs = scrapy.Field()
        
        
    # In settings.py
    ITEM_PIPELINES = {
       'lianjia.pipelines.LianjiaPipeline': 300,
    }
    # Saving images with Scrapy's built-in ImagesPipeline
    import os
    from urllib import request
    from scrapy.pipelines.images import ImagesPipeline
    from lianjia import settings
    
    class LjImagesPipeline(ImagesPipeline):
        # Called when the download requests for the image URLs are generated
        def get_media_requests(self, item, info):
            request_objs = super(LjImagesPipeline, self).get_media_requests(item, info)
            for request_obj in request_objs:
                request_obj.item = item  # attach the item to the request so file_path() below can read it
            return request_objs
    
        def file_path(self, request, response=None, info=None):
            # Called when an image is stored, to decide its storage path
            path = super(LjImagesPipeline, self).file_path(request, response, info)
            location = request.item.get('location')
            # IMAGES_STORE is the path of the images folder (see settings below)
            images_store = settings.IMAGES_STORE
            # per-listing directory that holds this listing's images
            per_house_pic_path = os.path.join(images_store, location)
            if not os.path.exists(per_house_pic_path):
                os.mkdir(per_house_pic_path)
            image_name = path.replace('full/', '')  # strip the default 'full/' prefix
            # print('image_name:', image_name)  # e.g. c554f76249059833f3a454830ec2cc2067465968.jpg

            image_path = os.path.join(per_house_pic_path, image_name)
            return image_path
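
    Binding the item to the request in get_media_requests is a workaround for older Scrapy versions; since Scrapy 2.4, file_path() receives the item directly as a keyword argument. A hedged sketch of the same idea on a newer Scrapy (the class name LjImagesPipelineModern is made up; it returns a path relative to IMAGES_STORE and lets Scrapy create the folders):

    import os
    from scrapy.pipelines.images import ImagesPipeline

    class LjImagesPipelineModern(ImagesPipeline):
        def file_path(self, request, response=None, info=None, *, item=None):
            # default path looks like 'full/<sha1>.jpg'
            path = super().file_path(request, response=response, info=info, item=item)
            # group images into one subfolder per listing, relative to IMAGES_STORE
            return os.path.join(item.get('location'), os.path.basename(path))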
    
        
    # Corresponding items file
    import scrapy

    class LianjiaItem(scrapy.Item):
        # define the fields for your item here like:
    
        # plain fields
        title = scrapy.Field()
        detail_url = scrapy.Field()
        location = scrapy.Field()
        price = scrapy.Field()
        # pic_srcs = scrapy.Field()
    
        # fields required by ImagesPipeline
        image_urls=scrapy.Field()
        images = scrapy.Field()
        
      
    
    # settings.py
    ITEM_PIPELINES = {
       # 'lianjia.pipelines.LianjiaPipeline': 300,
       # 'scrapy.pipelines.images.ImagesPipeline': 1,  # stock pipeline; would skip the custom methods

       'lianjia.pipelines.LjImagesPipeline': 1,  # runs the two overridden methods in the pipeline file
    }
    
    # Image download location, used by the ImagesPipeline
    import os
    # image storage path: an images folder under the project root
    IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
    # Summary:
    1.  In process_item(), self.path is the path of the images folder; each listing's floor-plan images are saved in their own subfolder beneath it.

    2.  When joining paths on Windows, os.path.join() produces backslash separators. The printed path shows single backslashes, but to write that same backslash inside a Python string literal it must be escaped as '\\'; a bare '\' before the closing quote is a syntax error.
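
    Point 2 can be checked directly in the interpreter; a quick illustration of the escaping rule:

    p = 'G:/images/house1'.replace('/', '\\')
    print(p)          # G:\images\house1  -- printed with single backslashes
    print(len('\\'))  # 1  -- the escaped literal holds one character
    # replace('/', '\') would be a SyntaxError: the backslash escapes the
    # closing quote, so the string literal never terminates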
  • Original post: https://www.cnblogs.com/kenD/p/11143563.html