参考https://mp.weixin.qq.com/s/tQV9k8sEcBXUFuczdLxXUw
一、爬虫取数
import requests
import time
import re
from scrapy.selector import Selector
# Landing page for Shanghai rentals; the per-district URLs are later
# derived from the filter links scraped off this page.
url = 'https://sh.lianjia.com/zufang/'
def res_get(url):
    """Fetch *url* with a browser-like header and return a scrapy Selector.

    Returns:
        Selector over the response HTML on HTTP 200, otherwise None
        (after printing a warning) so callers can decide how to proceed.
    """
    header = {
        'Cookie': 'TY_SESSION_ID=41bf824a-e5fb-4b8a-9dc8-e224e038e9a0; lianjia_uuid=f11f6224-f260-4bfe-b4d0-e1071db02482; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1535337110; _smt_uid=5b836297.3339dbe3; UM_distinctid=16579391f217-045c31e2ca4389-43480420-1fa400-16579391f225b; _jzqc=1; _jzqckmp=1; _ga=GA1.2.688060518.1535337113; _gid=GA1.2.2056944229.1535337113; lianjia_ssid=f41dfd57-40a9-4b89-9265-7728d84c3cd4; _jzqa=1.4175547485031953400.1535337111.1535345836.1535349807.3; all-lj=c32edd623b8a5a59c7de54c92107bb6c; CNZZDATA1253492439=1485168093-1535347395-%7C1535347395; CNZZDATA1254525948=1488534351-1535348570-%7C1535348570; CNZZDATA1255633284=1099602249-1535346343-%7C1535346343; _qzjc=1; select_city=310000; CNZZDATA1255604082=1120666595-1535345064-%7C1535346820; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1535351058; _qzja=1.260637364.1535349932545.1535349932545.1535349932545.1535351050620.1535351058619.0.0.0.25.1; _qzjb=1.1535349932545.25.0.0.0; _qzjto=25.1.0; _jzqb=1.28.10.1535349807.1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    res = requests.get(url, headers=header)
    if res.status_code == 200:
        # Build the Selector from the body text explicitly; the original
        # passed the requests.Response positionally, relying on it
        # duck-typing as a scrapy Response.
        return Selector(text=res.text)
    print('res_get wrong!')
    return None
def get_house_data(quyu, url):
    """Scrape one listing page and append every house found to the data file.

    Args:
        quyu: district label, written as the first output column.
        url:  full listing-page URL (already carrying the pg<i> suffix).

    Each field is extracted defensively: a missing/odd node yields ''
    rather than aborting the whole page.
    """
    res = res_get(url)
    house_nodes = res.xpath('//ul[@id="house-lst"]/li')
    for node in house_nodes:
        try:
            title = node.xpath('div[@class="info-panel"]/h2/a/@title').extract_first()
        except Exception:
            title = ''
        try:
            # Fields are separated by non-breaking spaces (U+00A0); the
            # original split on the literal string 'xa0' (backslash lost),
            # so the split never happened.
            region = node.xpath('div//span[@class="region"]/text()').extract_first().split('\xa0')[0]
        except Exception:
            region = ''
        try:
            zone = node.xpath('div//span[@class="zone"]/span/text()').extract_first().split('\xa0')[0]
        except Exception:
            zone = ''
        try:
            meters = int(node.xpath('div//span[@class="meters"]/text()').extract_first().split('平米')[0])
        except Exception:
            meters = ''
        try:
            # Orientation is the sibling element right after the area span.
            dir = node.xpath('div//span[@class="meters"]/following-sibling::*/text()').extract_first()
        except Exception:
            dir = ''
        try:
            place = node.xpath('div//div[@class="other"]//a/text()').extract_first().split('租房')[0]
        except Exception:
            place = ''
        # The "con" div holds both floor info (text #0) and build year
        # (text #1); extract it once instead of three times.
        con_texts = node.xpath('div//div[@class="con"]/text()').extract()
        try:
            cur_floor = con_texts[0].split('(')[0]
        except Exception:
            cur_floor = ''
        try:
            # Original pattern 'd+' (backslash lost) matched the letter
            # "d", not digits; r'\d+' is required.
            all_floor = int(re.search(r'\d+', con_texts[0]).group())
        except Exception:
            all_floor = ''
        try:
            build_year = int(re.search(r'\d+', con_texts[1]).group())
        except Exception:
            build_year = ''
        try:
            price = int(node.xpath('div//div[@class="price"]/span/text()').extract_first())
        except Exception:
            price = ''
        save_to_csv(quyu, title, region, zone, meters, dir, place,
                    cur_floor, all_floor, build_year, price)
        print(quyu, title, region, zone, meters, dir, place,
              cur_floor, all_floor, build_year, price)
def save_to_csv(quyu, title, region, zone, meters, dir, place,
                cur_floor, all_floor, build_year, price):
    """Append one listing as a comma-separated record to 链家上海租房.txt.

    All values are stringified as-is; '' marks a field that failed to
    parse. (`dir` shadows the builtin but the name is kept for interface
    compatibility with existing callers.)
    """
    fields = [quyu, title, region, zone, meters, dir, place,
              cur_floor, all_floor, build_year, price]
    with open('链家上海租房.txt', 'a', encoding='utf-8') as f:
        # Terminate each record with '\n'; the original wrote a plain
        # space, running every row together into one unparseable line.
        f.write(','.join(str(v) for v in fields) + '\n')
    print('writing work has done!continue the next page')
def get_page_url(quyu, url):
    """Walk every result page of one district and scrape each of them.

    Reads totalPage from the JSON blob embedded in the first page, then
    visits url + 'pg1' ... 'pg<totalPage>'.
    """
    res = res_get(url)
    # Original pattern used 'd+' (backslash lost in the paste) which
    # matches a literal "d", never digits; r'\d+' is required.
    total_pages = int(res.re(r'"totalPage":(\d+),"curPage":')[0])
    for page in range(1, total_pages + 1):
        # Derive page URLs from the url argument itself instead of the
        # module-level url_areas dict (same value, no hidden global).
        page_url = url + 'pg' + str(page)
        print('get:', page_url)
        get_house_data(quyu, page_url)
# ---- entry point: build the district URL map, then crawl each district ----
res = res_get(url)
# The first filter row (data-index="0") lists districts; slice [1:] skips
# the leading "不限" (no-filter) link.
areas = res.xpath('//dd[@data-index="0"]/div/a/text()').extract()[1:]
areas_links = res.xpath('//dd[@data-index="0"]/div/a/@href').extract()[1:]

url_areas = {}
for area, link in zip(areas, areas_links):
    if area == '浦东':
        # 浦东 has too many listings for the site's page cap, so split it
        # into three rent-price bands (brp/erp = begin/end rent price).
        url_areas['浦东1'] = 'https://sh.lianjia.com' + link + 'brp0erp5000/'
        url_areas['浦东2'] = 'https://sh.lianjia.com' + link + 'brp5001erp15000/'
        url_areas['浦东3'] = 'https://sh.lianjia.com' + link + 'brp15001erp55000/'
    else:
        url_areas[area] = 'https://sh.lianjia.com' + link

for quyu, area_url in url_areas.items():
    print(area_url)
    get_page_url(quyu, area_url)
二、利用pandas进行数据分析及可视化
# ---- pandas analysis & visualisation of the scraped rent data ----
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts import Line, Bar, Overlap, Pie

# Some raw txt rows spilled into extra columns, so the text file cannot be
# read directly with read_table; a cleaned Excel export is used instead,
# and the three spill-over columns (11-13) are dropped.
df = pd.read_excel(r'Desktop123.xlsx', header=None)
df.drop([11, 12, 13], axis=1, inplace=True)
df.columns = ['quyu', 'title', 'region', 'zone', 'meters', 'dir', 'place',
              'cur_floor', 'all_floor', 'build_year', 'price']

# Collapse the crawler's 浦东1/浦东2/浦东3 price-band splits back into 浦东.
pudong = df['quyu'].str.contains('浦东')
df.loc[pudong, 'quyu'] = df.loc[pudong, 'quyu'].str[:2]

# Drop unusable rows: missing area or title, and non-integer area values.
df.drop(df.loc[df.meters.isnull()].index, axis=0, inplace=True)
df.drop(df.loc[df.title.isnull()].index, axis=0, inplace=True)
df.drop(df.loc[df['meters'].map(type) != int].index, axis=0, inplace=True)

df.meters = df.meters.astype(int)
df.all_floor = df.all_floor.astype(int)
df.build_year = df.build_year.astype(int, errors='ignore')
df.price = df.price.astype(int)

# Bucket area and price for the categorical plots below.
meter_bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700, 1000]
meter_labels = ['0-30', '30-60', '60-90', '90-120', '120-150',
                '150-200', '200-300', '300-400', '400-700', '700-1000']
df['meters_grade'] = pd.cut(df['meters'], bins=meter_bins, labels=meter_labels, right=True)

grade_bins = [0, 2000, 3500, 5000, 7000, 10000, 15000, 20000, 200000]
grade_labels = ['0-2000', '2000-3500', '3500-5000', '5000-7000',
                '7000-10000', '10000-15000', '15000-20000', '20000+']
df['price_grade'] = pd.cut(df['price'], bins=grade_bins, labels=grade_labels, right=True)

mpl.rcParams['font.sans-serif'] = ['FangSong']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False      # keep '-' rendering when saving figures

# -- district mean price (line) over listing count (bar) --
df_quyu_grouped = df.groupby('quyu').price.agg(['mean', 'count']).sort_values(by='mean', ascending=False)
df_quyu_grouped.reset_index(inplace=True)
attr = df_quyu_grouped['quyu'].tolist()
v1 = df_quyu_grouped['count'].tolist()
v2 = df_quyu_grouped['mean'].tolist()
line = Line("上海区域房租均价")
# Note: fixed the original 'yaxix_min' typo -> 'yaxis_min' (the typo was
# silently ignored by pyecharts' **kwargs).
line.add("区域", attr, v2, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
         mark_point=['min', 'max'], xaxis_interval=0, line_color='lightblue',
         line_width=4, mark_point_textcolor='black', mark_point_color='lightblue',
         is_splitline_show=False)
bar = Bar("上海区域房屋数量")
bar.add("路段", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
        xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.add(line, yaxis_index=1, is_add_yaxis=True)
overlap.render('区域均价数量.html')

# -- price-range distribution --
# The original selected a nonexistent 'area' column (KeyError); the actual
# columns are quyu/price.
price_info = df[['quyu', 'price']]
bins = [0, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 6000,
        8000, 10000, 20000, 50000, 200000]
# Labels now match the 13 intervals above; the original list was shifted
# by one against the bins and contained an '8000-1000' typo.
level = ['0-1000', '1000-1500', '1500-2000', '2000-2500', '2500-3000',
         '3000-4000', '4000-5000', '5000-6000', '6000-8000', '8000-10000',
         '10000-20000', '20000-50000', '50000-200000']
price_stage = pd.cut(df['price'], bins=bins, labels=level).value_counts().sort_index()
attr = price_stage.index
v1 = price_stage.values
bar = Bar("价格区间&房源数量分布")
bar.add("", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
        xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.render('价格区间&房源数量分布.html')

# -- house-area distribution pie --
house_meters = pd.cut(df['meters'], bins=meter_bins, labels=meter_labels,
                      right=True).value_counts().sort_index()
attr = house_meters.index
v1 = house_meters.values
pie = Pie("房屋面积分布", title_pos='center')
pie.add("", attr, v1, radius=[40, 75], label_text_color=None, is_label_show=True,
        legend_orient="vertical", legend_pos="left")
overlap = Overlap()
overlap.add(pie)
overlap.render('房屋面积分布.html')

# -- seaborn exploratory plots --
sns.distplot(df['price'])
plt.scatter(df.meters, df.price, alpha=0.2, lw=0.5)
sns.boxplot(x='cur_floor', y="price", data=df)
sns.boxplot(x='build_year', y="price", data=df)
plt.xticks(rotation=90)
sns.boxplot(x='quyu', y="price", data=df)
sns.pairplot(df[['quyu', 'price', 'meters_grade', 'price_grade']])
g = sns.FacetGrid(df.loc[df.price < 15000], col="quyu")
g.map(plt.hist, "price")