  • Scraping Maitian Xiamen residential community data

    When I first started scraping I didn't send any headers to make the requests look like a browser, so Maitian's Beijing and Fuzhou community pages banned my IP. Luckily I figured out the cause before I broke the Xiamen pages too. The code is below:

    #-*- coding:utf-8 -*-
    import requests
    from bs4 import BeautifulSoup
    page_url = "http://xm.maitian.cn/xqall"
    headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
               "Referer":"http://xm.maitian.cn/esfall",
               "Connection":"keep-alive",
               "Content-Type":"text/plain; charset=utf-8"}
    
    
    def get_communities_url():
        all_data = []
        try:
            response = requests.get(url=page_url, headers=headers)
        except Exception as e:
            print("Request failed")
            raise e
    
        soup = BeautifulSoup(response.text, "lxml")
        list_wrap = soup.find("div", class_="list_wrap")
        # Each <li> is one community; its <h1><a> links to the detail page.
        for tag_li in list_wrap.find_all("li"):
            href = tag_li.h1.a['href']
            new_url = page_url.replace("/xqall", href)
            dict_data = get_target_info(new_url)
            if dict_data:
                all_data.append(dict_data)
        return all_data
    
    def get_target_info(new_url):
        info = {}  # renamed from `dict`, which shadowed the built-in
    
        try:
            response = requests.get(url=new_url, headers=headers)
        except Exception as e:
            print("Request failed")
            raise e
    
        soup = BeautifulSoup(response.text, 'lxml')
        home_main = soup.find("section", class_="home_main")
        ps = home_main.find_all("p")
        # Average price of the community
        info["community_avg"] = ps[0].b.string.strip()
        # Homes for sale
        info["unsold_homes"] = ps[1].find_all("em")[0].a.string
        # Homes for rent
        info["rent_homes"] = ps[1].find_all("em")[1].a.string
        # Business district
        info["business_circle"] = ps[2].label.string
        # Developer
        info["developers"] = ps[2].em.string
    
        home_details = soup.find("ul", class_="home_details")
        for tag_li in home_details.find_all("li"):
            p = tag_li.find_all("p")
            if tag_li["class"] == ['li_left']:
                info["area"] = p[0].em.string              # floor area
                info["property_company"] = p[1].em.string  # property-management company
                info["industry_fee"] = p[2].em.string      # property-management fee
            elif tag_li["class"] == ['li_center']:
                info["built_year"] = p[0].em.string        # year built
                info["total_houses"] = p[1].em.string      # total number of homes
                info["green_rates"] = p[2].em.string       # greening rate
            elif tag_li["class"] == ['li_right']:
                info["cover_area"] = p[0].em.string        # land area
                info["total_built"] = p[1].em.string       # total number of buildings
                info["product_rates"] = p[2].em.string     # plot ratio
        return info
    
    
    
    if __name__ == '__main__':
        data_all = get_communities_url()
        print(data_all)
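
    Given the ban incident above, it also helps to pace the requests instead of hitting every detail page back to back. Below is a minimal sketch of a polite-fetch helper; the helper name fetch, the one-second delay, and the timeout are my own illustrative choices, not part of the original script:

    import time
    import requests

    def fetch(url, headers, session=None, delay=1.0):
        # Illustrative helper: reuse one session and sleep between requests.
        session = session or requests.Session()
        time.sleep(delay)  # crude throttle; lowers the risk of another IP ban
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # a 403/429 here would suggest the ban hit again
        return response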
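
    The script only prints the scraped list of dicts; to keep the results, they could be written to CSV. A minimal sketch, assuming the exact key names produced by get_target_info above (the output file name communities.csv is my own choice):

    import csv

    def save_to_csv(all_data, path="communities.csv"):
        # Field order mirrors the keys filled in by get_target_info.
        fields = ["community_avg", "unsold_homes", "rent_homes", "business_circle",
                  "developers", "area", "property_company", "industry_fee",
                  "built_year", "total_houses", "green_rates",
                  "cover_area", "total_built", "product_rates"]
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fields)
            writer.writeheader()
            writer.writerows(all_data)

    For example, save_to_csv(get_communities_url()) would produce one CSV row per community.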
  • Original post: https://www.cnblogs.com/venvive/p/11415472.html