zoukankan      html  css  js  c++  java
  • 链家深圳租房信息爬取练习 附加源码

    from urllib import request
    from time import sleep
    from  lxml import  etree
    import csv
    # import random    # sleep(random.random() * 2) would pause for a random 0-2 seconds
    # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/'
    #
    # header = {
    #
    # 'Referer': 'https://sz.lianjia.com/zufang/',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    # # 筛选部分
    # html = etree.HTML(result)
    # name_list = html.xpath('//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a')
    # with open('house.csv',"wb") as f:
    #     for name in name_list:
    #         title=name.attrib["title"]
    #         f.write(title.encode())
    #         f.write('\n'.encode())
    #         print(title)
    
    
    # --------------------------------------------------------------------------------------------------------------
    # # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/105101400296.html'
    #
    # header = {
    #
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    #
    # html = etree.HTML(result)
    # name_list = html.xpath('//div[@class="brokerName"]/a')
    #
    # for name in name_list:
    #     text = name.text
    #     print(text)
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@code  tree@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # 请求测试
    def getRequet(url,xpath,**headers):
        """Download ``url`` and return the nodes selected by ``xpath``.

        Args:
            url: Address of the page to fetch.
            xpath: XPath expression evaluated against the parsed HTML.
            **headers: Extra HTTP request headers. They are layered on top of
                the built-in defaults, so a caller-supplied value wins when
                the same header name appears in both.

        Returns:
            The result of evaluating ``xpath`` on the parsed page, or an
            empty list when the page could not be parsed into a document.

        Raises:
            urllib.error.URLError: If the request itself fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        default_headers = {
            'Connection': 'keep-alive',
            # "KHTML" was previously misspelled "KHTM" in this User-Agent.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        }
        # Defaults first, caller overrides last: the later mapping wins on
        # duplicate keys, so explicit headers are no longer clobbered.
        merged_headers = {**default_headers, **headers}

        req = request.Request(url, headers=merged_headers)
        # The context manager closes the connection even if read/decode raises.
        with request.urlopen(req) as response:
            result = response.read().decode()

        html = etree.HTML(result)
        # NOTE(review): etree.HTML may hand back None when nothing could be
        # parsed (e.g. an empty body); guard so callers always get a list.
        if html is None:
            return []
        return html.xpath(xpath)
    def main():
        """Scrape the Lianjia Shenzhen rental listing into ``house.csv``.

        Fetches the listing page, follows every listing link to its detail
        page to read the contact (broker) name, and writes the listing title
        followed by the contact name to ``house.csv`` as UTF-8, each value on
        its own line. One second is slept between detail requests.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        zf_url = 'https://sz.lianjia.com/zufang/'  # listing page to crawl
        zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing title links
        name_xpath = '//div[@class="brokerName"]/a'  # contact name on a detail page

        # newline='\n' disables platform newline translation, so the bytes on
        # disk match what the former binary-mode writes produced. The ``with``
        # block closes the file; no explicit close() is needed.
        with open('house.csv', 'w', encoding='utf-8', newline='\n') as f:
            house_list = getRequet(zf_url, zf_xpath)
            for house in house_list:
                attrib = house.attrib
                house_name = attrib['title']
                # urljoin leaves absolute links untouched and resolves
                # relative ones against the listing page.
                url = urljoin(zf_url, attrib['href'])

                # Report the detail page actually being fetched (previously
                # the listing URL was printed on every iteration).
                print('正在下载:', url)

                # A detail page may lack a contact node, or the node may have
                # no text; fall back to an empty name instead of crashing.
                brokers = getRequet(url, name_xpath)
                username = (brokers[0].text or '') if brokers else ''

                f.write(house_name + '\n')
                f.write(username + '\n')

                sleep(1)  # throttle requests to the site
            print('下载完成')
    
    # Start the scraper only when this file is run as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

    逆风的方向更适合飞翔,不怕千万人阻挡,只怕自己投降!
  • 相关阅读:
    有没有对象???new一个???
    原生数组的方法--翻转
    rclone的基本用法
    golang 文件操作
    记一次挖矿程序处理 firstpress
    python 第三方库 网络 requests
    python 第三方库 时间 arrow
    ansible playbook loop 翻译
    硬盘性能测试工具之bonnie++
    磁盘性能测试工具之fio
  • 原文地址:https://www.cnblogs.com/jackzz/p/9203245.html
Copyright © 2011-2022 走看看