21-Day Distributed Crawler, Part 11: Fang.com (房天下) Listings for 658 Cities Nationwide

    Project: crawl the new-home and second-hand (resale) housing listings for every city on Fang.com (房天下).

    URL analysis

    1. Page listing all city URLs:

       http://www.fang.com/SoufunFamily.htm

        e.g. a city URL: http://cq.fang.com/

    2. New-home URL:

      http://newhouse.sh.fang.com/house/s/

    3. Second-hand home URL:

      http://esf.sh.fang.com/

    4. Beijing uses different URL rules for both new and second-hand homes:

       http://newhouse.fang.com/house/s/

       http://esf.fang.com/
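
    These rules amount to inserting a subdomain prefix after the scheme, with Beijing as the one special case. A minimal sketch of that logic in plain Python (the helper name build_listing_urls is ours, for illustration only):

    def build_listing_urls(city_url):
        # split 'http://cq.fang.com/' into 'http:' and 'cq.fang.com/'
        scheme, domain = city_url.split("//")
        if domain.startswith("bj."):
            # Beijing drops the city subdomain entirely
            return ("http://newhouse.fang.com/house/s/", "http://esf.fang.com/")
        newhouse_url = scheme + "//" + "newhouse." + domain + "house/s/"
        esf_url = scheme + "//" + "esf." + domain
        return newhouse_url, esf_url

    # build_listing_urls("http://cq.fang.com/")
    # -> ('http://newhouse.cq.fang.com/house/s/', 'http://esf.cq.fang.com/')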

    Create the project

    scrapy startproject fang
    
    scrapy genspider sfw_spider "fang.com"
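
    These two commands produce the standard Scrapy layout, roughly like this (__init__.py files omitted; start.py is added by hand at the end of this post):

    fang/
    ├── scrapy.cfg
    └── fang/
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders/
            └── sfw_spider.py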

    sfw_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from fang.items import NewHouseItem,ESFHouseItem
    
    class SfwSpiderSpider(scrapy.Spider):
        name = 'sfw_spider'
        allowed_domains = ['fang.com']
        start_urls = ['http://www.fang.com/SoufunFamily.htm']
    
        def parse(self, response):
            trs = response.xpath("//div[@class='outCont']//tr")
            provice = None
            for tr in trs:
                #keep only the <td> tags without a class attribute (the province and city cells)
                tds = tr.xpath(".//td[not(@class)]")
                provice_td = tds[0]
                provice_text = provice_td.xpath(".//text()").get()
                #if the province cell is blank, reuse the province from the previous row
                provice_text = re.sub(r"\s", "", provice_text)
                if provice_text:
                    provice = provice_text
                #skip overseas cities, which are grouped under '其它'
                if provice == '其它':
                    continue
    
                city_td = tds[1]
                city_links = city_td.xpath(".//a")
                for city_link in city_links:
                    city = city_link.xpath(".//text()").get()
                    city_url = city_link.xpath(".//@href").get()
                    # print("省份:",provice)
                    # print("城市:",city)
                    # print("城市链接:",city_url)
                    #下面通过获取的city_url拼接出新房和二手房的url链接
                    #城市url:http://cq.fang.com/
                    #新房url:http://newhouse.cq.fang.com/house/s/
                    #二手房:http://esf.cq.fang.com/
                    url_module = city_url.split("//")
                    scheme = url_module[0]     #'http:'
                    domain = url_module[1]     #'cq.fang.com/'
                    if domain.startswith('bj.'):
                        #Beijing is the special case from step 4 above
                        newhouse_url = 'http://newhouse.fang.com/house/s/'
                        esf_url = 'http://esf.fang.com/'
                    else:
                        #new-home url
                        newhouse_url = scheme + '//' + "newhouse." + domain + "house/s/"
                        #second-hand url (no 'house/s/' suffix, per the pattern above)
                        esf_url = scheme + '//' + "esf." + domain
                    # print("new-home link:", newhouse_url)
                    # print("second-hand link:", esf_url)
    
                    #meta carries extra data on the Request; the callback reads it back via response.meta
                    yield scrapy.Request(url=newhouse_url,
                                         callback=self.parse_newhouse,
                                         meta = {'info':(provice,city)}
                                         )
    
                    yield scrapy.Request(url=esf_url,
                                         callback=self.parse_esf,
                                         meta={'info': (provice, city)})
    
    
        def parse_newhouse(self,response):
            #new-home listings
            provice,city = response.meta.get('info')
            lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
            for li in lis:
                name = li.xpath(".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()").get()
                if name:
                    name = re.sub(r"\s", "", name)
                    #room layouts, e.g. '3居'
                    house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
                    house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
                    rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
                    #floor area
                    area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
                    area = re.sub(r"\s|-|/", "", area)
                    #address (strip the '请选择' placeholder characters)
                    address = li.xpath(".//div[@class='address']/a/@title").get()
                    address = re.sub(r"[请选择]", "", address)
                    #sale status
                    sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
                    price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
                    price = re.sub(r"\s|广告", "", price)
                    #detail-page url
                    origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
    
                    item = NewHouseItem(
                        name=name,
                        rooms=rooms,
                        area=area,
                        address=address,
                        sale=sale,
                        price=price,
                        origin_url=origin_url,
                        provice=provice,
                        city=city
                    )
                    yield item
    
            #next page
            next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
            if next_url:
                yield scrapy.Request(url=response.urljoin(next_url),
                                     callback=self.parse_newhouse,
                                     meta={'info': (provice, city)}
                                     )
    
    
        def parse_esf(self,response):
            #second-hand listings
            provice, city = response.meta.get('info')
            dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
            for dl in dls:
                item = ESFHouseItem(provice=provice,city=city)
                name = dl.xpath(".//span[@class='tit_shop']/text()").get()
                if name:
                    infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
                    infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
                    #each info fragment is recognized by a keyword character
                    for info in infos:
                        if "厅" in info:
                            item["rooms"] = info
                        elif '层' in info:
                            item["floor"] = info
                        elif '向' in info:
                            item['toward'] = info
                        elif '㎡' in info:
                            item['area'] = info
                        elif '年建' in info:
                            item['year'] = re.sub("年建", "", info)
                    item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
                    #total price
                    item['price'] = "".join(dl.xpath(".//span[@class='red']//text()").getall())
                    #unit price
                    item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
                    item['name'] = name
                    detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
                    item['origin_url'] = response.urljoin(detail)
                    yield item
            #next page
            next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
            if next_url:
                yield scrapy.Request(url=response.urljoin(next_url),
                                     callback=self.parse_esf,
                                     meta={'info': (provice, city)}
                                     )
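
    The XPath expressions above are easiest to verify interactively with Scrapy's shell before running a full crawl, for example:

    scrapy shell http://www.fang.com/SoufunFamily.htm
    >>> response.xpath("//div[@class='outCont']//tr")[:3]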

    items.py

    # -*- coding: utf-8 -*-
    
    import scrapy
    
    class NewHouseItem(scrapy.Item):
        # province
        provice = scrapy.Field()
        # city
        city = scrapy.Field()
        # development name
        name = scrapy.Field()
        # price
        price = scrapy.Field()
        # room layouts (a list)
        rooms = scrapy.Field()
        # floor area
        area = scrapy.Field()
        # address
        address = scrapy.Field()
        # sale status
        sale = scrapy.Field()
        # detail-page url on fang.com
        origin_url = scrapy.Field()

    class ESFHouseItem(scrapy.Item):
        # province
        provice = scrapy.Field()
        # city
        city = scrapy.Field()
        # development name
        name = scrapy.Field()
        # layout, e.g. 3室2厅
        rooms = scrapy.Field()
        # floor
        floor = scrapy.Field()
        # orientation
        toward = scrapy.Field()
        # year built
        year = scrapy.Field()
        # address
        address = scrapy.Field()
        # built-up area
        area = scrapy.Field()
        # total price
        price = scrapy.Field()
        # unit price
        unit = scrapy.Field()
        # detail-page url
        origin_url = scrapy.Field()

    pipelines.py

    # -*- coding: utf-8 -*-
    
    from scrapy.exporters import JsonLinesItemExporter
    from fang.items import NewHouseItem

    class FangPipeline(object):
        def __init__(self):
            self.newhouse_fp = open('newhouse.json', 'wb')
            self.esfhouse_fp = open('esfhouse.json', 'wb')
            self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp, ensure_ascii=False)
            self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp, ensure_ascii=False)

        def process_item(self, item, spider):
            # route each item to the file matching its type, so new-home and
            # second-hand records are not duplicated across both outputs
            if isinstance(item, NewHouseItem):
                self.newhouse_exporter.export_item(item)
            else:
                self.esfhouse_exporter.export_item(item)
            return item
    
        def close_spider(self,spider):
            self.newhouse_fp.close()
            self.esfhouse_fp.close()
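
    The files are opened in binary mode ('wb') because JsonLinesItemExporter writes encoded bytes, one JSON object per line. newhouse.json therefore ends up looking roughly like this (illustrative values):

    {"provice": "重庆", "city": "重庆", "name": "某小区", "rooms": ["3居", "4居"], "price": "12000元/平方米", ...}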

    middlewares.py (random User-Agent)

    # -*- coding: utf-8 -*-
    
    import random
    
    class UserAgentDownloadMiddleware(object):
        USER_AGENTS = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        ]
    
        def process_request(self,request,spider):
            user_agent = random.choice(self.USER_AGENTS)
            request.headers['User-Agent'] = user_agent
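
    Because process_request returns None, Scrapy continues processing the request through the remaining downloader middlewares and then downloads it with the substituted User-Agent header.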

    settings.py

    ROBOTSTXT_OBEY = False
    
    DOWNLOAD_DELAY = 1
    
    DOWNLOADER_MIDDLEWARES = {
       'fang.middlewares.UserAgentDownloadMiddleware': 543,
    }
    
    ITEM_PIPELINES = {
       'fang.pipelines.FangPipeline': 300,
    }
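
    If the site starts throttling requests, Scrapy's built-in AutoThrottle extension can adapt the delay dynamically on top of (or instead of) the fixed DOWNLOAD_DELAY; an optional sketch, not part of the original settings:

    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_START_DELAY = 1
    AUTOTHROTTLE_MAX_DELAY = 10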

    start.py

    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl sfw_spider".split())
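
    Running python start.py from the project root is equivalent to typing scrapy crawl sfw_spider on the command line; it just makes the spider easy to launch from an IDE.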