zoukankan      html  css  js  c++  java
  • python爬虫实战<一>

    #!/usr/bin/env python
    #-*- coding:utf-8-*-
    
    """
    @author:    wangzhu
    @desc:  fetch the danke.com Hangzhou room list and print each listing title
    @contact:   isaac.zhu@dbappsecurity.com.cn
    @date:  2019/8/7
    """
    
    import requests  #导入请求包
    import re  #导入正则包
    from random import randint
    
    
    """
    网站地址:https://www.danke.com/room/hz
    """
    
    # Pool of request headers, each holding one browser User-Agent string.
    # One entry is picked at random per request so that successive requests
    # do not all present the same client signature.
    hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
        {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
        {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
        {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
        {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
        {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
        {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
        # The closing parenthesis was missing here, leaving a malformed UA string.
        {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
        {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
        {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
        {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
        {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
        {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]
    
    def CrawlerHouse(url="https://www.danke.com/room/hz", timeout=10):
        """Download a room-listing page and print the title of each listing.

        Args:
            url: Listing page to fetch. Defaults to the Hangzhou room list,
                which is the page the original script was written against.
            timeout: Seconds passed to ``requests.get`` so that a stalled
                connection cannot block the script indefinitely.

        Returns:
            None. One listing title is printed to stdout per matched block.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                times out.
        """
        # Pick one header dict at random from the module-level User-Agent pool.
        headers = hds[randint(0, len(hds) - 1)]
        res = requests.get(url, headers=headers, timeout=timeout)
        res.encoding = "utf-8"  # decode the response body as UTF-8

        # Each listing header starts with <div class="r_lbx_cena">. The lazy
        # match stops at the first </div> that follows, which in the sample
        # markup is already past the title link.
        infolist = re.findall('<div class="r_lbx_cena">(.*?)</div>', res.text, re.S)

        for one in infolist:
            # Read the link text directly. The previous approach removed
            # <span class="location">N</span> using a running counter and then
            # matched from the first '>' onward; whenever the counter and the
            # page's numbering disagreed, span and tag markup leaked into the
            # printed title.
            title_parts = re.findall(r'<a\b[^>]*>(.*?)</a>', one, re.S)
            # A block without a link still prints an empty line, as before.
            print(''.join(title_parts).strip())
    
    # Run the crawler: fetch the listing page and print the titles.
    CrawlerHouse()

        <div class="r_ls_box">
                                
                <div class="r_lbx">
                    <a href="javascript:void(0)" class="rimg" key='0' xiaoqu='万科北宸之光'>
                        <span class="img-hint">
                            <span></span>
                            <span></span>
                        </span>
                        <img
                                src="https://public.danke.com.cn/public-20190123-isz_ljR3BG1JKKfa2lXEilpNXgN1NTRV?imageView2/1/w/380/h/285" width="260" height="173"
                                title=""
                                alt="图片"/>
    
                                        </a>
                    <div class="r_lbx_cen">
                        <div class="r_lbx_cena">
                            <span class="location">1</span>
                            <a href="https://www.danke.com/duanzu/1913140756.html" key='0' xiaoqu='万科北宸之光' target="_blank"
                               title="万达广场  万科北宸之光 3室2厅">
                                万达广场  万科北宸之光 3室2厅
                            </a>
                                                        <div class="r_lbx_cena">
                                    <div class="sub_img"></div>
                                    距5号线大运河站2700米
                                </div>
                                                </div>
                        <div class="r_lbx_cenb">
                            <div class="address_img"></div>
                            建筑面积约12㎡ | 21楼
                            | 3室1卫                          | 朝南
                                                        <i>合</i>
                                                </div>
                        <div class="r_lbx_cenc">
                                                                        </div>
                                        </div>
                    <div class="r_lbx_money">
                                                <div class="r_lbx_moneya">
                                                                <span class="ty_b">1890</span> 元/月
                                                        </div>
    
                                            <a class="lk_more" key='0' xiaoqu='万科北宸之光' href="https://www.danke.com/duanzu/1913140756.html"
                           target="_blank">
                            查看详情
                        </a>
                    </div>
                </div>
    
    
    <div class="r_ls_box">
    <div class="r_ls_box">
  • 相关阅读:
    一点创业想法
    【转】Java程序员常用工具类库
    向着高薪前进
    web开发可不可以是这样的?
    java 读取文本文件超简单的方法
    java操作xml超简单的方法
    Dijkstra算法
    ubuntu linux下如何配置ip地址以及DNS
    有关于string的一些用法
    Linux mint 17.2 系统下安装hust oj
  • 原文地址:https://www.cnblogs.com/gufengchen/p/12420798.html
Copyright © 2011-2022 走看看