zoukankan      html  css  js  c++  java
  • 使用python抓取并分析北京链家地产二手房信息

      1 import requests
      2 import time
      3 from bs4 import BeautifulSoup
      4 
      # Fixed part of the listing-page URL.
      url = 'http://bj.lianjia.com/ershoufang/'
      # Variable part of the URL: the prefix that precedes the page number.
      page = 'pg'

      # Request headers sent with every listing-page request.
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
          'Accept': 'text/html;q=0.9,*/*;q=0.8',
          'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
          'Accept-Encoding': 'gzip',
          'Connection': 'close',
          'Referer': 'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&eqid=c3435a7d00006bd600000003582bfd1f',
      }
     18 
     # Download listing pages 1-9 and concatenate their raw bytes into `html`.
     pages = []
     for i in range(1, 10):
         # Pause 0.5 seconds between consecutive requests (not before the first).
         if pages:
             time.sleep(0.5)
         a = url + page + str(i) + '/'
         # The timeout keeps a stalled connection from hanging the script forever.
         r = requests.get(url=a, headers=headers, timeout=30)
         # Fail loudly on an HTTP error instead of parsing an error page as listings.
         r.raise_for_status()
         pages.append(r.content)
     # Join once at the end rather than re-copying the growing buffer per page.
     html = b''.join(pages)
     34 
     # Parse the downloaded page content.
     lj = BeautifulSoup(html, 'html.parser')

     # Total price: the string of the first <span> inside each 'priceInfo' div.
     price = lj.find_all('div', attrs={'class': 'priceInfo'})
     tp = [a.span.string for a in price]

     # House description: the text of each 'houseInfo' div.
     # Built at top level so `hi` always exists, even when no price was found.
     houseInfo = lj.find_all('div', attrs={'class': 'houseInfo'})
     hi = [b.get_text() for b in houseInfo]

     # Follow/attention info: the text of each 'followInfo' div.
     followInfo = lj.find_all('div', attrs={'class': 'followInfo'})
     fi = [c.get_text() for c in followInfo]
     58 
     # Import pandas.
     import pandas as pd

     # Build the data table: one row per listing, one column per scraped field.
     house = pd.DataFrame({'totalprice': tp, 'houseinfo': hi, 'followinfo': fi})
     # Look at the first rows of the table.
     house.head()

     # Split the house description on '|' into six named columns.
     houseinfo_columns = ['xiaoqu', 'huxing', 'mianji', 'chaoxiang', 'zhuangxiu', 'dianti']
     houseinfo_rows = (text.split('|') for text in house.houseinfo)
     houseinfo_split = pd.DataFrame(houseinfo_rows, index=house.index, columns=houseinfo_columns)

     # Look at the split result.
     houseinfo_split.head()

     # Join the split columns back onto the table by row index, so it holds
     # both the original fields and the new ones.
     house = pd.merge(house, houseinfo_split, left_index=True, right_index=True)
     # Look at the joined table.
     house.head()

     # Split the follow info on '/' into three named columns.
     followinfo_columns = ['guanzhu', 'daikan', 'fabu']
     followinfo_rows = (text.split('/') for text in house.followinfo)
     followinfo_split = pd.DataFrame(followinfo_rows, index=house.index, columns=followinfo_columns)
     # Join the split follow-info columns back onto the table by row index.
     house = pd.merge(house, followinfo_split, left_index=True, right_index=True)
     82 
     # Count listings per layout ('huxing') value.
     huxing = house.groupby('huxing')['huxing'].agg(len)
     # Show the per-layout counts.
     huxing

     # Import the plotting library.
     import matplotlib.pyplot as plt
     # Import the numerical library.
     import numpy as np

     # Number of distinct layouts found in the data.
     l = len(huxing)
     # Bar positions 1..l, one per layout.
     hx = list(range(1, l + 1))

     # Horizontal bar chart of the layout distribution.
     plt.rc('font', family='STXihei', size=11)
     # Tick positions come from the data, so they always match the bars drawn.
     a = np.array(hx)
     plt.barh(hx, huxing, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
     plt.ylabel('户型')
     plt.xlabel('数量')
     plt.xlim(0, 1300)
     plt.ylim(0, l + 1)
     plt.title('房源户型分布情况')
     plt.legend(['数量'], loc='upper right')
     plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
     # Label each bar with its own group key. The former hard-coded list had
     # 19 labels for 20 tick positions and did not depend on the scraped data.
     plt.yticks(a, huxing.index)
     plt.show()
    114 
    # Second split: separate the number in the area field from its unit.
    # NOTE(review): the separator was lost in this copy of the script (it read
    # split(''), which raises "ValueError: empty separator"). '平' is presumably
    # the intended one, i.e. the first character of the unit '平米' - confirm
    # against the actual page text. maxsplit=1 guarantees at most two columns.
    mianji_num_split = pd.DataFrame((x.split('平', 1) for x in house.mianji), index=house.index, columns=['mianji_num', 'mi'])
    # Join the split area columns back onto the table by row index.
    house = pd.merge(house, mianji_num_split, right_index=True, left_index=True)

    # Strip surrounding whitespace, then convert the area number to float.
    house['mianji_num'] = house['mianji_num'].str.strip().astype(float)

    # Range of listing areas.
    house['mianji_num'].min(), house['mianji_num'].max()
    # Output pasted in the original article: (18.850000000000001, 332.63)
    129 
    130 
    # Bin the listing areas into seven labelled groups.
    bins = [0, 50, 100, 150, 200, 250, 300, 350]
    group_mianji = ['小于50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
    house['group_mianji'] = pd.cut(house['mianji_num'], bins, labels=group_mianji)

    # Count listings per area group. observed=False keeps every category, with
    # a count of 0 for empty ones, so there are always exactly seven values for
    # the seven bars drawn below.
    group_mianji = house.groupby('group_mianji', observed=False)['group_mianji'].size()

    # Horizontal bar chart of the area distribution.
    plt.rc('font', family='STXihei', size=15)
    a = np.array([1, 2, 3, 4, 5, 6, 7])
    plt.barh([1, 2, 3, 4, 5, 6, 7], group_mianji, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
    plt.ylabel('面积分组')
    plt.xlabel('数量')
    plt.title('房源面积分布')
    plt.legend(['数量'], loc='upper right')
    plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
    plt.yticks(a, ('小于50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350'))
    plt.show()
    150 
    # Second split: separate the follower count from its trailing text.
    # NOTE(review): the separator was lost in this copy of the script (it read
    # split(''), which raises "ValueError: empty separator"). '人' is presumably
    # the intended one, i.e. the character right after the number in
    # 'N人关注' - confirm against the actual page text. maxsplit=1 guarantees
    # at most two columns.
    guanzhu_num_split = pd.DataFrame((x.split('人', 1) for x in house.guanzhu), index=house.index, columns=['guanzhu_num', 'ren'])
    # Join the split follower columns back onto the table by row index.
    house = pd.merge(house, guanzhu_num_split, right_index=True, left_index=True)
    # Strip whitespace from both ends of the follower count.
    house['guanzhu_num'] = house['guanzhu_num'].map(str.strip)
    # Convert the follower count and the total price to float.
    house[['guanzhu_num', 'totalprice']] = house[['guanzhu_num', 'totalprice']].astype(float)

    # Range of follower counts.
    house['guanzhu_num'].min(), house['guanzhu_num'].max()
    # Output pasted in the original article: (0.0, 725.0)
    163 
    # Bin the follower counts into eight labelled groups.
    bins = [0, 100, 200, 300, 400, 500, 600, 700, 800]
    group_guanzhu = ['小于100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800']
    # include_lowest=True puts listings with 0 followers into the first bin.
    # With the default right-closed bins they would fall outside (0, 100],
    # become NaN, and vanish from the counts.
    house['group_guanzhu'] = pd.cut(house['guanzhu_num'], bins, labels=group_guanzhu, include_lowest=True)
    # Count listings per follower group. observed=False keeps every category,
    # with a count of 0 for empty ones, so there are always exactly eight
    # values for the eight bars drawn below.
    group_guanzhu = house.groupby('group_guanzhu', observed=False)['group_guanzhu'].size()

    # Horizontal bar chart of the follower-count distribution.
    plt.rc('font', family='STXihei', size=15)
    a = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    plt.barh([1, 2, 3, 4, 5, 6, 7, 8], group_guanzhu, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
    plt.ylabel('关注度分组')
    plt.xlabel('数量')
    plt.xlim(0, 3000)
    plt.title('房源关注度分布')
    plt.legend(['数量'], loc='upper right')
    plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
    plt.yticks(a, ('小于100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800'))
    plt.show()
    182 
    # Import KMeans from sklearn for the cluster analysis.
    from sklearn.cluster import KMeans
    # Cluster on three fields: total price, area and follower count.
    house_type = np.array(house[['totalprice', 'mianji_num', 'guanzhu_num']])
    # Use three centroids.
    clf = KMeans(n_clusters=3)
    # Fit the model to the data.
    clf = clf.fit(house_type)

    # Coordinates of the cluster centres.
    clf.cluster_centers_
    # Output pasted in the original article (it had been fused onto the
    # attribute name, yielding the nonexistent `cluster_centers_array`):
    # array([[ 772.97477064, 112.02389908, 58.96330275],
    #        [ 434.51073861, 84.92950236, 61.20115244],
    #        [ 1473.26719577, 170.65402116, 43.32275132]])

    # Record each listing's cluster label in the table.
    house['label'] = clf.labels_
  • 相关阅读:
    html5中设置文本单行显示,超出部分打省略号,鼠标移到文本时alt出全部文本内容
    sql 查出一张表中重复的所有记录数据
    JS实现关闭当前子窗口,刷新父窗口
    jstl <c:forEach> 介绍
    Oracle SQL: TO_CHAR and TO_NUMBER 笔记
    Python学习记录七---继承、多态和封装
    iOS动画和第三方插件学习网址
    Python学习记录(六)--函数 定义和使用
    python学习记录(五) --语句块和比较符
    Python学习记录(四)--字典
  • 原文地址:https://www.cnblogs.com/leonardchen/p/6479492.html
Copyright © 2011-2022 走看看