  • Notes on a simple Lianjia scraper

    Define the data to scrape (the Item)

    import scrapy
    
    class LianjianItem(scrapy.Item):
        # One field per column scraped from each listing card on the list page
        name = scrapy.Field()     # development name
        address = scrapy.Field()  # location / district
        type = scrapy.Field()     # unit layout (rooms)
        size = scrapy.Field()     # floor area
        price = scrapy.Field()    # listed price text
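
    An Item behaves like a dict; a tiny illustrative snippet (the package name lianjian matches the import used in the spider below, the values are placeholders):

    from lianjian.items import LianjianItem

    # Fields are read and written like dictionary keys; the spider fills
    # each field with a list of strings extracted from one list page.
    item = LianjianItem()
    item['name'] = ['sample-name']
    item['price'] = ['sample-price']
    print(dict(item))  # {'name': ['sample-name'], 'price': ['sample-price']}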
    

    Write the spider

    # -*- coding: utf-8 -*-
    import scrapy
    from lianjian.items import LianjianItem
    from scrapy.http import Request
    
    class LianjiaspiderSpider(scrapy.Spider):
        name = 'lianjiaSpider'
        allowed_domains = ['lianjia.com']
        start_urls = ['https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg1/']
    
        def parse(self, response):
            item = LianjianItem()
            # Each XPath returns a list with one entry per listing on the page
            item['name'] = response.xpath('//div[@class="resblock-name"]/a/text()').extract()
            item['address'] = response.xpath('//div[@class="resblock-location"]/a/text()').extract()
            item['type'] = response.xpath('//a[@class="resblock-room"]/span/text()').extract()
            item['size'] = response.xpath('//div[@class="resblock-area"]/span/text()').extract()
            item['price'] = response.xpath('//div[@class="resblock-price"]/div[@class="second"]/text()').extract()
            yield item
            # Queue the remaining list pages; page 1 is already covered by start_urls,
            # and Scrapy's duplicate filter drops URLs that were requested before.
            for i in range(2, 52):
                url = 'https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg' + str(i) + '/'
                yield Request(url, callback=self.parse)
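
    Before running the full crawl, the XPath expressions can be sanity-checked interactively with scrapy shell (the selectors are the ones used above; the page layout may have changed since this was written):

    scrapy shell 'https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg1/'
    >>> response.xpath('//div[@class="resblock-name"]/a/text()').extract()
    >>> response.xpath('//div[@class="resblock-price"]/div[@class="second"]/text()').extract()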
    

    Define the pipeline

    Write the pipeline file

    # -*- coding: utf-8 -*-
    
    import xlwt  # only needed once the commented-out Excel writes below are enabled
    
    class LianjianPipeline(object):
        def __init__(self):
            pass
    
        def process_item(self, item, spider):
            # print("Received item: " + str(item))
            # The Excel writes are left commented out in the original post, so this
            # pipeline only prints each listing. Note that `line` restarts at 0 for
            # every item, so enabling the writes as-is would overwrite earlier rows;
            # see the sketch after this block for a working version.
            line = 0
            for i in range(len(item['name'])):
                name = item['name'][i]
                # self.sheet.write(line, 0, name)
                print(name)
                address = item['address'][i]
                print(address)
                # self.sheet.write(line, 1, address)
                type = item['type'][i]
                print(type)
                # self.sheet.write(line, 2, type)
                size = item['size'][i]
                print(size)
                # self.sheet.write(line, 3, size)
                price = item['price'][i]
                print(price)
                # self.sheet.write(line, 4, price)
                print("-----------------------")
                line += 1
            # self.book.save("lianjia.xls")
            return item
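
    The commented-out Excel code above can be wired up with xlwt roughly as follows; this is a sketch, not part of the original post (the class name LianjianExcelPipeline is hypothetical), with the workbook opened once per crawl and a row counter kept across items:

    import xlwt

    class LianjianExcelPipeline(object):
        def open_spider(self, spider):
            self.book = xlwt.Workbook(encoding='utf-8')
            self.sheet = self.book.add_sheet('lianjia')
            self.line = 0                           # row counter kept across items

        def process_item(self, item, spider):
            # Write one row per listing; the fields are parallel lists
            for i in range(len(item['name'])):
                self.sheet.write(self.line, 0, item['name'][i])
                self.sheet.write(self.line, 1, item['address'][i])
                self.sheet.write(self.line, 2, item['type'][i])
                self.sheet.write(self.line, 3, item['size'][i])
                self.sheet.write(self.line, 4, item['price'][i])
                self.line += 1
            return item

        def close_spider(self, spider):
            self.book.save('lianjia.xls')           # write the file once, at the end

    It would be registered in settings.py the same way as the pipeline above, e.g. 'lianjian.pipelines.LianjianExcelPipeline': 300.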
    

    Enable the pipeline in settings.py

    ITEM_PIPELINES = {
       'lianjian.pipelines.LianjianPipeline': 300,
    }
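
    The original post only shows the pipeline entry. Depending on the environment, a couple of extra settings (and possibly a browser-like USER_AGENT) are often needed for the site to respond; these are assumptions, not part of the original:

    # Assumed additions, not shown in the original post:
    ROBOTSTXT_OBEY = False   # the listing pages may be disallowed by robots.txt
    DOWNLOAD_DELAY = 1       # throttle requests to stay polite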
    

    Run the spider

    (The original post shows a screenshot of the crawler's console output here.)
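
    From the project root, the spider is started by the name defined on the spider class:

    scrapy crawl lianjiaSpider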

  • Original post: https://www.cnblogs.com/yiweiblog/p/12652493.html