zoukankan      html  css  js  c++  java
  • scrapy上海买房指南

    Spider:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy_zhaopin.items import ScrapyHouseItem
    from scrapy.http import Request
    
    
    class MySpider(scrapy.Spider):
        """Collect second-hand housing listings from sh.lianjia.com search pages.

        The crawl starts from the search page in ``start_urls``. Every parsed
        response yields one ``ScrapyHouseItem`` per listing row and then
        requests the search page of each keyword in ``address_list``, using
        ``parse`` as the callback again.

        Pagination is not followed: only the first result page of each
        search keyword is requested.
        """

        name = "spiderhouse"
        allowed_domains = ["sh.lianjia.com"]
        start_urls = ["https://sh.lianjia.com/ershoufang/rs徐泾北城/"]

        # Search keywords whose result pages are requested after each response.
        # Kept at class level so the list is built once rather than on every
        # ``parse`` call.
        address_list = ["徐盈路", "徐泾镇", "华新镇", "嘉定北", "中山公园", "汇金路", "青浦新城", "爱博家园", "九亭",
                        "佘山", "泗泾", "洞泾", "赵巷"]

        def parse(self, response):
            """Yield one item per listing row, then follow-up search requests.

            Item fields:
                title: page ``<title>`` text up to the first ``_``, with
                    "二手房房源" removed (``""`` when the page has no title).
                name, address, house_info, price: lists of the matching text
                    nodes of the listing row (possibly empty).
                unit_price: first unit-price text with the "单价" and
                    "元/平米" labels stripped (``""`` when missing).
            """
            # The page title is identical for every row, so read it once.
            # ``extract_first`` with a default avoids an IndexError on pages
            # that have no <title>.
            page_title = response.xpath('//title/text()').extract_first(default="")
            title = page_title.split("_")[0].replace("二手房房源", "")

            for line in response.xpath('//*[contains(@log-mod,"list")]//li[contains(@class,"clear")]'):
                item = ScrapyHouseItem()

                item['title'] = title
                item['name'] = line.xpath('.//*[@class="title"]/a/text()').extract()
                item['address'] = line.xpath('.//*[@class="positionInfo"]/a/text()').extract()
                item['house_info'] = line.xpath('.//*[@class="houseInfo"]/text()').extract()
                item['price'] = line.xpath('.//*[@class="totalPrice"]//span/text()').extract()

                # A row without a unit price must not abort the rest of the page.
                unit_price = line.xpath('.//*[@class="unitPrice"]//span/text()').extract_first(default="")
                item['unit_price'] = unit_price.replace("单价", "").replace("元/平米", "")
                yield item

            # These requests are emitted for every response; this relies on
            # Scrapy's duplicate request filter (default settings assumed) to
            # drop URLs that were already scheduled.
            for keyword in self.address_list:
                address_url = f'https://sh.lianjia.com/ershoufang/rs{keyword}/'
                yield Request(address_url, callback=self.parse)
  • 相关阅读:
    linux mint安装成功
    js 兼容性
    程序员的肚子有多大,水平就有多高
    财富通直连接口for rails3
    ubuntu live cd版本是没有recuse broken system功能
    生活百科
    省市县导入mysql代码,通过csv
    省市县导入mysql代码,通过csv
    休眠、挂起、待机三者之间的区别 收藏
    支付宝接口for rails3
  • 原文地址:https://www.cnblogs.com/nieliangcai/p/13322671.html
Copyright © 2011-2022 走看看