• 某家简单爬虫记录


    定义爬取数据

    import scrapy
    
    class LianjianItem(scrapy.Item):
        """Container for the listing data scraped from one Lianjia result page.

        The spider in this project assigns each field the result of an XPath
        ``.extract()`` call, i.e. a list of text nodes for the whole page
        rather than a single value.
        """
        # Link text inside div.resblock-name.
        name = scrapy.Field()
        # Link text inside div.resblock-location.
        address = scrapy.Field()
        # Span text inside a.resblock-room.
        type = scrapy.Field()
        # Span text inside div.resblock-area.
        size = scrapy.Field()
        # Text of div.second inside div.resblock-price.
        price = scrapy.Field()
    

    编写爬虫文件

    # -*- coding: utf-8 -*-
    import scrapy
    from lianjian.items import LianjianItem
    from scrapy.http import Request
    
    class LianjiaspiderSpider(scrapy.Spider):
        """Scrape the Lianjia Hangzhou listing pages pg1..pg51.

        One ``LianjianItem`` is yielded per page. Each field of the item holds
        the list of every matching text node on that page, not a single value.
        """

        name = 'lianjiaSpider'
        allowed_domains = ['lianjia.com']
        # All result pages are scheduled up front, so parse() does not have to
        # re-issue the same 51 page requests from every response and rely on
        # the duplicate filter to discard them.
        # NOTE(review): start requests normally bypass the duplicate filter, so
        # re-requesting pg1 from parse() would fetch it twice - confirm against
        # the Scrapy version in use.
        start_urls = [
            'https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg' + str(page) + '/'
            for page in range(1, 52)
        ]

        # Item field -> XPath selecting that field's text nodes on a page.
        _FIELD_XPATHS = {
            'name': '//div[@class="resblock-name"]/a/text()',
            'address': '//div[@class="resblock-location"]/a/text()',
            'type': '//a[@class="resblock-room"]/span/text()',
            'size': '//div[@class="resblock-area"]/span/text()',
            'price': '//div[@class="resblock-price"]/div[@class="second"]/text()',
        }

        def parse(self, response):
            """Yield one item holding the per-field text lists of ``response``."""
            item = LianjianItem()
            for field, xpath in self._FIELD_XPATHS.items():
                item[field] = response.xpath(xpath).extract()
            yield item
    

    定义管道

    编写管道文件

    # -*- coding: utf-8 -*-
    
    import xlwt
    import xlrd
    
    class LianjianPipeline(object):
        """Item pipeline that prints every scraped listing to stdout.

        The item is expected to map 'name', 'address', 'type', 'size' and
        'price' to lists. Rows are only printed; nothing is written to a file.
        """

        # Fields printed after the name, in output order.
        _DETAIL_FIELDS = ('address', 'type', 'size', 'price')

        def __init__(self):
            pass

        def process_item(self, item, spider):
            """Print one block of lines per entry in ``item['name']``.

            For each name, the values at the same index in the other field
            lists are printed, followed by a separator line.

            Args:
                item: Mapping of field name to list of strings.
                spider: The spider that produced the item (unused).

            Returns:
                The same ``item``, unchanged, so later pipelines can use it.
            """
            for row, name in enumerate(item['name']):
                print(name)
                for field in self._DETAIL_FIELDS:
                    values = item[field]
                    # The field lists are filled independently and can be
                    # shorter than the name list; print a blank value instead
                    # of raising IndexError.
                    print(values[row] if row < len(values) else '')
                print("-----------------------")
            return item
    

    settings.py开启管道

    # Enable the pipeline. The integer is its order value: Scrapy runs
    # enabled pipelines from lower to higher values (conventionally 0-1000).
    ITEM_PIPELINES = {
       'lianjian.pipelines.LianjianPipeline': 300,
    }
    

    启动爬虫文件

    在项目根目录下执行命令:scrapy crawl lianjiaSpider
    (原文此处为运行结果截图 image.png,未能显示)

  • 相关阅读:
    SIT/UAT测试
    Oracle密码过期设置和修改密码问题
    1、查询速度慢的原因很多,常见如下几种:
    dbs:apple-notes
    值不能为 null 或为空。参数名: linkText
    Visual Studio 2015创建WebApplication应用和运行赏析
    HTTP 错误 500.19
    Introduction to ASP.NET Web Programming Using the Razor Syntax (C#)
    vs2015-Azure Mobile Service
    6.1.1 验证注解的使用
  • 原文地址:https://www.cnblogs.com/yiweiblog/p/12652493.html
走看看 - 开发者的网上家园