zoukankan      html  css  js  c++  java
  • 上海租房信息分析-爬虫-数据分析-练习

    参考https://mp.weixin.qq.com/s/tQV9k8sEcBXUFuczdLxXUw

    一、爬虫取数

    import requests
    import time
    import re
    from scrapy.selector import Selector


    url = 'https://sh.lianjia.com/zufang/'


    def res_get(url):
        """Fetch *url* and return a scrapy Selector over the page HTML.

        Returns None (after printing a warning) when the server does not
        answer with HTTP 200. The hard-coded Cookie/User-Agent pair mimics a
        logged-in browser session so lianjia serves the full listing markup.
        """
        header = {
        'Cookie': 'TY_SESSION_ID=41bf824a-e5fb-4b8a-9dc8-e224e038e9a0; lianjia_uuid=f11f6224-f260-4bfe-b4d0-e1071db02482; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1535337110; _smt_uid=5b836297.3339dbe3; UM_distinctid=16579391f217-045c31e2ca4389-43480420-1fa400-16579391f225b; _jzqc=1; _jzqckmp=1; _ga=GA1.2.688060518.1535337113; _gid=GA1.2.2056944229.1535337113; lianjia_ssid=f41dfd57-40a9-4b89-9265-7728d84c3cd4; _jzqa=1.4175547485031953400.1535337111.1535345836.1535349807.3; all-lj=c32edd623b8a5a59c7de54c92107bb6c; CNZZDATA1253492439=1485168093-1535347395-%7C1535347395; CNZZDATA1254525948=1488534351-1535348570-%7C1535348570; CNZZDATA1255633284=1099602249-1535346343-%7C1535346343; _qzjc=1; select_city=310000; CNZZDATA1255604082=1120666595-1535345064-%7C1535346820; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1535351058; _qzja=1.260637364.1535349932545.1535349932545.1535349932545.1535351050620.1535351058619.0.0.0.25.1; _qzjb=1.1535349932545.25.0.0.0; _qzjto=25.1.0; _jzqb=1.28.10.1535349807.1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        # timeout so a stalled connection cannot hang the whole crawl
        res = requests.get(url, headers=header, timeout=30)
        if res.status_code == 200:
            # BUG FIX: scrapy's Selector cannot wrap a requests.Response
            # object directly; it must be given the HTML text.
            return Selector(text=res.text)
        print('res_get wrong!')
        return None





    def _extract_or_default(getter, default=''):
        """Evaluate *getter* and fall back to *default* when the node is
        missing or malformed (absent xpath hits raise AttributeError on
        .split(), IndexError on [0], or ValueError on int())."""
        try:
            return getter()
        except Exception:
            return default


    def get_house_data(quyu, url):
        """Scrape every listing on one result page and persist each record.

        quyu : str - district label, written as the first CSV column.
        url  : str - one paginated result-page URL (e.g. ...zufang/pg2/).
        """
        res = res_get(url)
        house_nodes = res.xpath('//ul[@id="house-lst"]/li')
        for node in house_nodes:
            title = _extract_or_default(
                lambda: node.xpath('div[@class="info-panel"]/h2/a/@title').extract_first())
            # BUG FIX: the separators below were 'xa0'/'d+' — the backslashes
            # were lost; '\xa0' (non-breaking space) and r'\d+' are intended.
            region = _extract_or_default(
                lambda: node.xpath('div//span[@class="region"]/text()').extract_first().split('\xa0')[0])
            zone = _extract_or_default(
                lambda: node.xpath('div//span[@class="zone"]/span/text()').extract_first().split('\xa0')[0])
            meters = _extract_or_default(
                lambda: int(node.xpath('div//span[@class="meters"]/text()').extract_first().split('平米')[0]))
            # orientation text is the sibling span right after the area span
            direction = _extract_or_default(
                lambda: node.xpath('div//span[@class="meters"]/following-sibling::*/text()').extract_first())
            place = _extract_or_default(
                lambda: node.xpath('div//div[@class="other"]//a/text()').extract_first().split('租房')[0])
            con_texts = node.xpath('div//div[@class="con"]/text()').extract()
            # '低层(共6层)' -> keep the part before '(' as the floor position
            cur_floor = _extract_or_default(lambda: con_texts[0].split('(')[0])
            all_floor = _extract_or_default(
                lambda: int(re.search(r'\d+', con_texts[0]).group()))
            build_year = _extract_or_default(
                lambda: int(re.search(r'\d+', con_texts[1]).group()))
            price = _extract_or_default(
                lambda: int(node.xpath('div//div[@class="price"]/span/text()').extract_first()))
            save_to_csv(quyu, title, region, zone, meters, direction, place,
                        cur_floor, all_floor, build_year, price)
            print(quyu, title, region, zone, meters, direction, place,
                  cur_floor, all_floor, build_year, price)

    def save_to_csv(quyu, title, region, zone, meters, dir, place, cur_floor,
                    all_floor, build_year, price):
        """Append one listing as a comma-separated line to the output file.

        BUG FIX: the record terminator was a single space, which merged every
        listing onto one ever-growing line; the later pandas step reads the
        file line-by-line (read_table with sep=','), so records must end with
        a newline.
        """
        fields = (quyu, title, region, zone, meters, dir, place, cur_floor,
                  all_floor, build_year, price)
        with open('链家上海租房.txt', 'a', encoding='utf-8') as f:
            f.write(','.join(str(v) for v in fields) + '\n')
        print('writing work has done!continue the next page')


    def get_page_url(quyu, url):
        """Walk every paginated result page for one district and scrape it.

        quyu : str - district label forwarded to get_house_data.
        url  : str - the district's base listing URL.
        """
        res = res_get(url)
        # BUG FIX: the original pattern contained 'd+' (backslash stripped),
        # which matches a literal 'd' and never finds the page count; \d+
        # captures the totalPage number embedded in the page's JSON blob.
        total_pages = int(res.re(r'"totalPage":(\d+),"curPage":')[0])
        for page in range(1, total_pages + 1):
            # Build from the url argument instead of re-reading the
            # module-level url_areas dict — same value, no hidden global.
            url_page = url + 'pg' + str(page)
            print('get:', url_page)
            get_house_data(quyu, url_page)





    # Driver: discover every district link on the landing page, then crawl
    # each district's paginated listings.
    res = res_get(url)
    areas = res.xpath('//dd[@data-index="0"]/div/a/text()').extract()[1:]
    areas_links = res.xpath('//dd[@data-index="0"]/div/a/@href').extract()[1:]
    url_areas = {}
    for area, link in zip(areas, areas_links):
        if area == '浦东':
            # Pudong has too many listings for lianjia's page cap, so it is
            # split into three rent bands (brpXerpY is the price filter).
            url_areas['浦东1'] = 'https://sh.lianjia.com' + link + 'brp0erp5000/'
            url_areas['浦东2'] = 'https://sh.lianjia.com' + link + 'brp5001erp15000/'
            url_areas['浦东3'] = 'https://sh.lianjia.com' + link + 'brp15001erp55000/'
        else:
            url_areas[area] = 'https://sh.lianjia.com' + link
    for quyu, area_url in url_areas.items():
        print(area_url)
        get_page_url(quyu, area_url)

    二、利用pandas进行数据分析及可视化

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # --- Load the scraped data --------------------------------------------
    # Earlier attempts to read the raw text dump directly (kept for reference):
    # data = np.loadtxt(r"Desktop12.txt",encoding='utf-8')
    # pd.read_csv(r'Desktop12.txt',encoding='utf-8')
    # f = open(r'Desktop12.txt','r',encoding='utf-8')
    # f.readlines()
    # df = pd.read_table(f,header=None,sep=',',names=['quyu','title','region','zone','meters','dir','place','cur_floor','all_floor','build_year','price'])
    # Some rows split into 3 extra columns (extra commas inside fields), so
    # the text file cannot be parsed directly; presumably it was converted to
    # Excel by hand instead — TODO confirm.
    df = pd.read_excel(r'Desktop123.xlsx',header=None)
    # Drop the 3 spill-over columns produced by the malformed rows.
    df.drop([11,12,13],axis=1,inplace=True)
    df.columns=['quyu','title','region','zone','meters','dir','place','cur_floor','all_floor','build_year','price']
    # Collapse '浦东1'/'浦东2'/'浦东3' (the scraper's price-band split of
    # Pudong) back into a single '浦东' district label.
    df.loc[df['quyu'].str.contains('浦东'),'quyu'] = df.loc[df['quyu'].str.contains('浦东'),'quyu'].str[:2]

    # --- Clean: drop unusable rows ----------------------------------------
    df.isnull().sum()
    df.drop(df.loc[df.meters.isnull()].index,axis=0,inplace=True)
    df.drop(df.loc[df.title.isnull()].index,axis=0,inplace=True)
    # Rows whose 'meters' cell is not an int are misaligned records; drop them.
    df.drop(df.loc[df['meters'].map(type)!=int].index,axis=0,inplace=True)
    df.isnull().sum()

    # --- Normalise dtypes --------------------------------------------------
    df.meters = df.meters.astype(int)
    df.all_floor = df.all_floor.astype(int)
    # build_year may still contain non-numeric leftovers; errors='ignore'
    # leaves the column untouched in that case.
    df.build_year = df.build_year.astype(int,errors='ignore')
    df.price = df.price.astype(int)
    df.info()
    df.describe()
    
    
    # Bucket floor area (square metres) into labelled grades.
    meter_bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700, 1000]
    meter_labels = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200',
                    '200-300', '300-400', '400-700', '700-1000']
    df['meters_grade'] = pd.cut(df['meters'], bins=meter_bins,
                                labels=meter_labels, right=True)

    # Bucket monthly rent (CNY) into labelled grades.
    rent_bins = [0, 2000, 3500, 5000, 7000, 10000, 15000, 20000, 200000]
    rent_labels = ['0-2000', '2000-3500', '3500-5000', '5000-7000',
                   '7000-10000', '10000-15000', '15000-20000', '20000+']
    df['price_grade'] = pd.cut(df['price'], bins=rent_bins,
                               labels=rent_labels, right=True)

    mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
    mpl.rcParams['axes.unicode_minus'] = False  # keep '-' rendering correctly in saved figures
    
    
    
    # District-level chart: average rent (line) over listing count (bar).
    df_quyu_grouped = df.groupby('quyu').price.agg(['mean','count']).sort_values(by='mean',ascending=False)
    df_quyu_grouped.reset_index(inplace=True)

    attr = df_quyu_grouped['quyu'].tolist()
    v1 = df_quyu_grouped['count'].tolist()
    v2 = df_quyu_grouped['mean'].tolist()

    from pyecharts import Line, Bar, Overlap

    line = Line("上海区域房租均价")
    # BUG FIX: the keyword was misspelled 'yaxix_min'; pyecharts 0.x passes
    # options through **kwargs, so the typo was silently ignored and the
    # y-axis minimum never took effect.
    line.add("区域", attr, v2, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
             mark_point=['min', 'max'], xaxis_interval=0, line_color='lightblue',
             line_width=4, mark_point_textcolor='black',
             mark_point_color='lightblue', is_splitline_show=False)

    bar = Bar("上海区域房屋数量")
    bar.add("路段", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
            xaxis_interval=0, is_splitline_show=False)

    overlap = Overlap()
    overlap.add(bar)
    # Attach the price line to a second y-axis so counts and prices share
    # one chart without distorting each other's scale.
    overlap.add(line, yaxis_index=1, is_add_yaxis=True)
    overlap.render('区域均价数量.html')
    
    
    
    # 房源价格区间分布图 — slice of district + rent columns.
    # BUG FIX: df has no 'area' column (columns were named 'quyu', 'meters',
    # etc. when the frame was loaded), so df[['area', 'price']] raised
    # KeyError; 'quyu' is the district column this meant.
    price_info = df[['quyu', 'price']]
    
    
    
    
    # Bucket listings by monthly rent and plot the distribution.
    df.describe()
    bins = [0,1000,1500,2000,2500,3000,4000,5000,6000,8000,10000,20000,50000,200000]
    # BUG FIX: the original labels did not line up with the bin edges —
    # '2000-2500' and '2500-3000' were missing, '8000-1000' was a typo for
    # '8000-10000', and a spurious '200000+' shifted everything — so every
    # bucket from 2000 upward was mislabelled on the chart.
    level = ['0-1000', '1000-1500', '1500-2000', '2000-2500', '2500-3000',
             '3000-4000', '4000-5000', '5000-6000', '6000-8000', '8000-10000',
             '10000-20000', '20000-50000', '50000-200000']

    price_stage = pd.cut(df['price'], bins=bins, labels=level).value_counts().sort_index()

    attr = price_stage.index
    v1 = price_stage.values

    bar = Bar("价格区间&房源数量分布")
    # BUG FIX: 'yaxix_min' was a typo for 'yaxis_min' (silently swallowed by
    # pyecharts 0.x **kwargs).
    bar.add("", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
            xaxis_interval=0, is_splitline_show=False)

    overlap = Overlap()
    overlap.add(bar)
    overlap.render('价格区间&房源数量分布.html')
    
    
    # Pie chart of listings per floor-area bucket.
    df.describe()
    area_bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700, 1000]
    area_labels = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200',
                   '200-300', '300-400', '400-700', '700-1000']

    house_meters = pd.cut(df['meters'], bins=area_bins, labels=area_labels,
                          right=True).value_counts().sort_index()

    from pyecharts import Pie

    pie = Pie("房屋面积分布", title_pos='center')
    pie.add("", house_meters.index, house_meters.values, radius=[40, 75],
            label_text_color=None, is_label_show=True,
            legend_orient="vertical", legend_pos="left")

    overlap = Overlap()
    overlap.add(pie)
    overlap.render('房屋面积分布.html')
    
    # df.columns
    # Quick exploratory plots with seaborn / matplotlib.
    # Rent distribution. NOTE(review): sns.distplot is deprecated in newer
    # seaborn releases (histplot/displot is the replacement) — confirm the
    # pinned seaborn version before upgrading.
    sns.distplot(df['price'])
    # Rent vs. floor area scatter.
    plt.scatter(df.meters,df.price,alpha=0.2,lw=0.5)
    # Rent by floor position.
    sns.boxplot(x='cur_floor', y="price", data=df)

    # Rent by build year; rotate tick labels so the years stay readable.
    sns.boxplot(x='build_year', y="price", data=df)
    plt.xticks(rotation=90)

    # Rent by district.
    sns.boxplot(x='quyu', y="price", data=df)

    # Pairwise relations between district, rent and the binned grades.
    sns.pairplot(df[['quyu',  'price', 'meters_grade', 'price_grade']])

    # Per-district rent histograms, restricted to rents below 15000 so the
    # facets share a comparable x-range.
    g = sns.FacetGrid(df.loc[df.price<15000], col="quyu")
    g.map(plt.hist, "price")
  • 相关阅读:
    6. Flask请求和响应
    5. Flask模板
    FW:Software Testing
    What is the difference between modified duration, effective duration and duration?
    How to push master to QA branch in GIT
    FTPS Firewall
    Query performance optimization of Vertica
    (Forward)5 Public Speaking Tips That'll Prepare You for Any Interview
    (转)The remote certificate is invalid according to the validation procedure
    Change
  • 原文地址:https://www.cnblogs.com/figo-studypath/p/9547070.html
Copyright © 2011-2022 走看看