zoukankan      html  css  js  c++  java
  • 上海租房信息分析-爬虫-数据分析-练习

    参考https://mp.weixin.qq.com/s/tQV9k8sEcBXUFuczdLxXUw

    一、爬虫取数

    import requests
    import time
    import re
    from scrapy.selector import Selector


    url = 'https://sh.lianjia.com/zufang/'

    def res_get(url):
        """GET *url* with browser-like headers and return a scrapy Selector.

        Returns None (after printing a notice) on any non-200 status so callers
        can treat a failed page as best-effort rather than crashing the crawl.
        """
        header = {
            'Cookie': 'TY_SESSION_ID=41bf824a-e5fb-4b8a-9dc8-e224e038e9a0; lianjia_uuid=f11f6224-f260-4bfe-b4d0-e1071db02482; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1535337110; _smt_uid=5b836297.3339dbe3; UM_distinctid=16579391f217-045c31e2ca4389-43480420-1fa400-16579391f225b; _jzqc=1; _jzqckmp=1; _ga=GA1.2.688060518.1535337113; _gid=GA1.2.2056944229.1535337113; lianjia_ssid=f41dfd57-40a9-4b89-9265-7728d84c3cd4; _jzqa=1.4175547485031953400.1535337111.1535345836.1535349807.3; all-lj=c32edd623b8a5a59c7de54c92107bb6c; CNZZDATA1253492439=1485168093-1535347395-%7C1535347395; CNZZDATA1254525948=1488534351-1535348570-%7C1535348570; CNZZDATA1255633284=1099602249-1535346343-%7C1535346343; _qzjc=1; select_city=310000; CNZZDATA1255604082=1120666595-1535345064-%7C1535346820; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1535351058; _qzja=1.260637364.1535349932545.1535349932545.1535349932545.1535351050620.1535351058619.0.0.0.25.1; _qzjb=1.1535349932545.25.0.0.0; _qzjto=25.1.0; _jzqb=1.28.10.1535349807.1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            # NOTE(review): Selector is fed the requests response object directly —
            # newer scrapy versions expect Selector(text=res.text); confirm the
            # installed scrapy accepts this.
            return Selector(res)
        else:
            print('res_get wrong!')
            return None





    def get_house_data(quyu, url):
        """Scrape every listing on one result page and persist each via save_to_csv.

        quyu: area label written as the first csv column.
        url:  full page url (area url + 'pgN').

        Each field is extracted independently; any missing/unparsable field
        degrades to '' so one bad listing never aborts the whole page.
        """
        res = res_get(url)
        houst_lists = res.xpath('//ul[@id="house-lst"]/li')
        for houst_list in houst_lists:
            try:
                title = houst_list.xpath('div[@class="info-panel"]/h2/a/@title').extract_first()
            except Exception:
                title = ''
            try:
                # split on the non-breaking space ('\xa0') that pads the text;
                # the blog copy had lost the backslash ('xa0'), breaking the split
                region = houst_list.xpath('div//span[@class="region"]/text()').extract_first().split('\xa0')[0]
            except Exception:
                region = ''
            try:
                zone = houst_list.xpath('div//span[@class="zone"]/span/text()').extract_first().split('\xa0')[0]
            except Exception:
                zone = ''
            try:
                # text looks like 'NN平米'; keep the integer part
                meters = int(houst_list.xpath('div//span[@class="meters"]/text()').extract_first().split('平米')[0])
            except Exception:
                meters = ''
            try:
                # orientation (朝向) is the sibling element right after the meters span
                direction = houst_list.xpath('div//span[@class="meters"]/following-sibling::*/text()').extract_first()
            except Exception:
                direction = ''
            try:
                place = houst_list.xpath('div//div[@class="other"]//a/text()').extract_first().split('租房')[0]
            except Exception:
                place = ''
            try:
                # 'con' text looks like '低楼层(共N层)...' — floor level before the full-width paren
                cur_floor = houst_list.xpath('div//div[@class="con"]/text()').extract()[0].split('(')[0]
            except Exception:
                cur_floor = ''
            try:
                # r'\d+' was mangled to 'd+' in the blog copy — without the backslash
                # no number is ever matched
                all_floor = int(re.search(r'\d+', houst_list.xpath('div//div[@class="con"]/text()').extract()[0]).group())
            except Exception:
                all_floor = ''
            try:
                build_year = int(re.search(r'\d+', houst_list.xpath('div//div[@class="con"]/text()').extract()[1]).group())
            except Exception:
                build_year = ''
            try:
                price = int(houst_list.xpath('div//div[@class="price"]/span/text()').extract_first())
            except Exception:
                price = ''
            save_to_csv(quyu, title, region, zone, meters, direction, place, cur_floor, all_floor, build_year, price)
            print(quyu, title, region, zone, meters, direction, place, cur_floor, all_floor, build_year, price)

    def save_to_csv(quyu, title, region, zone, meters, dir, place, cur_floor, all_floor, build_year, price):
        """Append one listing as a comma-separated line to 链家上海租房.txt.

        All values are stringified; the record is terminated with '\n' (the
        blog copy had lost the backslash, leaving ' ' — which ran every record
        together on a single unparseable line).
        """
        fields = (quyu, title, region, zone, meters, dir, place, cur_floor, all_floor, build_year, price)
        with open('链家上海租房.txt', 'a', encoding='utf-8') as f:
            f.write(','.join(str(x) for x in fields) + '\n')
        print('writing work has done!continue the next page')


    def get_page_url(quyu, url):
        """Discover the page count for one area url and scrape every page of it.

        quyu: area label forwarded to get_house_data.
        url:  area base url ending in '/' (page N is url + 'pgN').
        """
        res = res_get(url)
        # the page embeds '"totalPage":N,"curPage":...'; r'\d+' had been mangled
        # to 'd+' in the blog copy, so the capture never matched
        total_pages = int(res.re(r'"totalPage":(\d+),"curPage":')[0])
        for i in range(1, total_pages + 1):
            # use the url argument directly instead of the module-global
            # url_areas dict (the caller passes the identical value)
            url_page = url + 'pg' + str(i)
            print('get:', url_page)
            get_house_data(quyu, url_page)





    # Build the district -> url map from the first filter row of the landing
    # page ([0] is the "all" entry, hence the [1:] slice), then crawl each one.
    res = res_get(url)
    areas = res.xpath('//dd[@data-index="0"]/div/a/text()').extract()[1:]
    areas_links = res.xpath('//dd[@data-index="0"]/div/a/@href').extract()[1:]
    url_areas = {}
    for area, link in zip(areas, areas_links):
        base = 'https://sh.lianjia.com' + link
        if area == '浦东':
            # Pudong is split into three rent bands via brp..erp.. url filters —
            # presumably to stay under the site's listing-page cap (confirm)
            url_areas['浦东1'] = base + 'brp0erp5000/'
            url_areas['浦东2'] = base + 'brp5001erp15000/'
            url_areas['浦东3'] = base + 'brp15001erp55000/'
        else:
            url_areas[area] = base
    for quyu, area_url in url_areas.items():
        print(area_url)
        get_page_url(quyu, area_url)

    二、利用pandas进行数据分析及可视化

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Direct text parsing of the crawler output failed (np.loadtxt / pd.read_csv /
    # pd.read_table with named columns): a few rows spill into three extra
    # columns, so the data is loaded from the cleaned Excel copy instead.
    df = pd.read_excel(r'Desktop123.xlsx', header=None)
    # drop the three overflow columns produced by the malformed rows
    df.drop([11, 12, 13], axis=1, inplace=True)
    df.columns = ['quyu', 'title', 'region', 'zone', 'meters', 'dir', 'place',
                  'cur_floor', 'all_floor', 'build_year', 'price']
    # fold the 浦东1/浦东2/浦东3 rent-band labels back into a single 浦东 district
    is_pudong = df['quyu'].str.contains('浦东')
    df.loc[is_pudong, 'quyu'] = df.loc[is_pudong, 'quyu'].str[:2]

    df.isnull().sum()
    # discard rows missing area or title, and rows whose meters cell is not an int
    df.drop(df.loc[df.meters.isnull()].index, axis=0, inplace=True)
    df.drop(df.loc[df.title.isnull()].index, axis=0, inplace=True)
    df.drop(df.loc[df['meters'].map(type) != int].index, axis=0, inplace=True)
    df.isnull().sum()

    # normalise dtypes (build_year may still hold non-numeric leftovers,
    # hence errors='ignore')
    df.meters = df.meters.astype(int)
    df.all_floor = df.all_floor.astype(int)
    df.build_year = df.build_year.astype(int, errors='ignore')
    df.price = df.price.astype(int)
    df.info()
    df.describe()
    df.describe()
    
    
    # Bucket the floor area (㎡) into a categorical meters_grade column.
    bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700, 1000]
    level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200',
             '200-300', '300-400', '400-700', '700-1000']
    df['meters_grade'] = pd.cut(df['meters'], bins=bins, labels=level)

    # Bucket the monthly rent (元) into a categorical price_grade column.
    bins = [0, 2000, 3500, 5000, 7000, 10000, 15000, 20000, 200000]
    level = ['0-2000', '2000-3500', '3500-5000', '5000-7000', '7000-10000',
             '10000-15000', '15000-20000', '20000+']
    df['price_grade'] = pd.cut(df['price'], bins=bins, labels=level)

    # matplotlib defaults: a Chinese-capable font so labels render, and keep
    # the minus sign as '-' when figures are saved
    mpl.rcParams['font.sans-serif'] = ['FangSong']
    mpl.rcParams['axes.unicode_minus'] = False
    
    
    
    # 上海路段_房屋均价分布图 — per-district mean rent (line) over listing count (bar)
    df_quyu_grouped = df.groupby('quyu').price.agg(['mean', 'count']).sort_values(by='mean', ascending=False)
    df_quyu_grouped.reset_index(inplace=True)

    attr = df_quyu_grouped['quyu'].tolist()   # district names, ordered by mean rent
    v1 = df_quyu_grouped['count'].tolist()    # listings per district
    v2 = df_quyu_grouped['mean'].tolist()     # mean rent per district

    from pyecharts import Line, Bar, Overlap

    line = Line("上海区域房租均价")
    # fixed: the original passed the misspelled kwarg 'yaxix_min', which
    # pyecharts silently ignored, so the y-axis floor was never applied
    line.add("区域", attr, v2, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
             mark_point=['min', 'max'], xaxis_interval=0, line_color='lightblue',
             line_width=4, mark_point_textcolor='black', mark_point_color='lightblue',
             is_splitline_show=False)

    bar = Bar("上海区域房屋数量")
    bar.add("路段", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
            xaxis_interval=0, is_splitline_show=False)

    overlap = Overlap()
    overlap.add(bar)
    # the price line gets its own (second) y axis
    overlap.add(line, yaxis_index=1, is_add_yaxis=True)
    overlap.render('区域均价数量.html')
    
    
    
    # 房源价格区间分布图 — listing count per rent bracket
    # fixed: the original selected df[['area', 'price']], but no 'area' column
    # exists (the district column is 'quyu'), which raised KeyError here
    price_info = df[['quyu', 'price']]

    # 对价格分区: 14 edges -> 13 intervals. Labels rewritten to match the edges;
    # the original list drifted after 2000 (e.g. '2000-3000' for the 2000-2500
    # bin), contained the typo '8000-1000', and ended with a nonexistent
    # '200000+' bucket.
    df.describe()
    bins = [0, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 6000, 8000, 10000, 20000, 50000, 200000]
    level = ['0-1000', '1000-1500', '1500-2000', '2000-2500', '2500-3000',
             '3000-4000', '4000-5000', '5000-6000', '6000-8000', '8000-10000',
             '10000-20000', '20000-50000', '50000-200000']
    price_stage = pd.cut(df['price'], bins=bins, labels=level).value_counts().sort_index()

    attr = price_stage.index
    v1 = price_stage.values

    bar = Bar("价格区间&房源数量分布")
    # 'yaxix_min' typo fixed to 'yaxis_min' (unknown kwargs are silently ignored)
    bar.add("", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
            xaxis_interval=0, is_splitline_show=False)

    overlap = Overlap()
    overlap.add(bar)
    overlap.render('价格区间&房源数量分布.html')
    
    
    # 房屋面积分布 — share of listings per floor-area band, rendered as a donut
    df.describe()
    bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700, 1000]
    level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200',
             '200-300', '300-400', '400-700', '700-1000']
    house_meters = pd.cut(df['meters'], bins=bins, labels=level, right=True).value_counts().sort_index()

    attr = house_meters.index   # band labels
    v1 = house_meters.values    # listing counts per band

    from pyecharts import Pie

    pie = Pie("房屋面积分布", title_pos='center')
    pie.add("", attr, v1, radius=[40, 75], label_text_color=None, is_label_show=True,
            legend_orient="vertical", legend_pos="left")

    overlap = Overlap()
    overlap.add(pie)
    overlap.render('房屋面积分布.html')
    
    # df.columns
    # seaborn exploratory plots (run interactively; each call draws a figure)
    # rent distribution histogram + KDE (distplot is deprecated in newer seaborn)
    sns.distplot(df['price'])
    # rent vs. floor area scatter
    plt.scatter(df.meters,df.price,alpha=0.2,lw=0.5)
    # rent spread per floor-level label (低/中/高楼层)
    sns.boxplot(x='cur_floor', y="price", data=df)
    
    # rent spread per construction year; rotate the crowded year labels
    sns.boxplot(x='build_year', y="price", data=df)
    plt.xticks(rotation=90)
    
    # rent spread per district
    sns.boxplot(x='quyu', y="price", data=df)
    
    sns.pairplot(df[['quyu',  'price', 'meters_grade', 'price_grade']])
    
    
    
    # per-district rent histograms, capped at 15000 to keep the bins readable
    g = sns.FacetGrid(df.loc[df.price<15000], col="quyu")
    g.map(plt.hist, "price")
  • 相关阅读:
    使用 linux kernel +busybox 定制linux系统
    记一次golang的内存泄露
    关于Queries_per_sec 性能计数器
    NUMA导致的MySQL服务器SWAP问题分析
    Drop Table对MySQL的性能影响分析
    当MySQL数据库遇到Syn Flooding
    tcp_tw_recycle参数引发的数据库连接异常
    一例数据同步异常问题分析
    MySQL大量线程处于Opening tables的问题分析
    MySQL DeadLock故障排查过程
  • 原文地址:https://www.cnblogs.com/figo-studypath/p/9547070.html
Copyright © 2011-2022 走看看