#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Print the listing titles found on the Danke room-listing page.

@author: wangzhu
@desc: fetch https://www.danke.com/room/hz and print each listing title
@contact: isaac.zhu@dbappsecurity.com.cn
@date: 2019/8/7
"""
import requests  # HTTP client used to download the page
import re  # regular expressions used to pick the titles out of the HTML
from random import randint

# Site address: https://www.danke.com/room/hz

# Some User-Agent headers; one is picked at random for each request.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'},
]


def _extract_titles(html):
    """Return the listing titles found in *html*, in document order.

    One entry is produced for every ``<div class="r_lbx_cena">`` fragment
    the pattern matches; an entry is the empty string when the fragment
    contains no ``</a>``-terminated text.
    """
    titles = []
    # The lazy match stops at the first closing </div> after the opening tag.
    for fragment in re.findall(r'<div\s+class="r_lbx_cena">(.*?)</div>', html, re.S):
        # Drop the numbered location marker so its '>' cannot be taken for
        # the end of the <a ...> opening tag by the pattern below. Matching
        # any digits avoids relying on the markers being numbered 1, 2, 3...
        fragment = re.sub(r'<span class="location">\d+</span>', '', fragment)
        # Text between the end of the opening <a ...> tag and </a>.
        anchor_text = re.findall(r'>(.*?)</a>', fragment, re.S)
        titles.append(''.join(anchor_text).strip())
    return titles


def CrawlerHouse():
    """Download the listing page and print one title per line."""
    url = "https://www.danke.com/room/hz"
    headers = hds[randint(0, len(hds) - 1)]
    # Without a timeout a stalled connection would block forever.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = "utf-8"  # decode the response body as UTF-8
    for title in _extract_titles(res.text):
        print(title)


# Run the program.
CrawlerHouse()
<div class="r_ls_box"> <div class="r_lbx"> <a href="javascript:void(0)" class="rimg" key='0' xiaoqu='万科北宸之光'> <span class="img-hint"> <span></span> <span></span> </span> <img src="https://public.danke.com.cn/public-20190123-isz_ljR3BG1JKKfa2lXEilpNXgN1NTRV?imageView2/1/w/380/h/285" width="260" height="173" title="" alt="图片"/> </a> <div class="r_lbx_cen"> <div class="r_lbx_cena"> <span class="location">1</span> <a href="https://www.danke.com/duanzu/1913140756.html" key='0' xiaoqu='万科北宸之光' target="_blank" title="万达广场 万科北宸之光 3室2厅"> 万达广场 万科北宸之光 3室2厅 </a> <div class="r_lbx_cena"> <div class="sub_img"></div> 距5号线大运河站2700米 </div> </div> <div class="r_lbx_cenb"> <div class="address_img"></div> 建筑面积约12㎡ | 21楼 | 3室1卫 | 朝南 <i>合</i> </div> <div class="r_lbx_cenc"> </div> </div> <div class="r_lbx_money"> <div class="r_lbx_moneya"> <span class="ty_b">1890</span> 元/月 </div> <a class="lk_more" key='0' xiaoqu='万科北宸之光' href="https://www.danke.com/duanzu/1913140756.html" target="_blank"> 查看详情 </a> </div> </div>
<div class="r_ls_box">
<div class="r_ls_box">