zoukankan      html  css  js  c++  java
  • 链家深圳租房信息爬取练习 附加源码

    from urllib import request
    from time import sleep
    from  lxml import  etree
    import csv
    # import random    # sleep(random.random()*2) 随机秒数
    # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/'
    #
    # header = {
    #
    # 'Referer': 'https://sz.lianjia.com/zufang/',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    # # 筛选部分
    # html = etree.HTML(result)
    # name_list = html.xpath('//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a')
    # with open('house.csv',"wb") as f:
    #     for name in name_list:
    #         title=name.attrib["title"]
    #         f.write(title.encode())
    #         f.write('\n'.encode())
    #         print(title)
    
    
    # --------------------------------------------------------------------------------------------------------------
    # # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/105101400296.html'
    #
    # header = {
    #
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    #
    # html = etree.HTML(result)
    # name_list = html.xpath('//div[@class="brokerName"]/a')
    #
    # for name in name_list:
    #     text = name.text
    #     print(text)
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@code  tree@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # 请求测试
    def getRequet(url, xpath, **headers):
        """Fetch ``url`` and return the nodes of the page that match ``xpath``.

        Args:
            url: Address of the page to download.
            xpath: XPath expression evaluated against the parsed HTML.
            **headers: Extra HTTP request headers. They are merged with the
                built-in defaults and take precedence over them, so a caller
                can override ``User-Agent`` or ``Connection``.

        Returns:
            Whatever ``html.xpath(xpath)`` yields for the parsed document, or
            an empty list when the parser produced no document.

        Raises:
            urllib.error.URLError: Propagated from ``request.urlopen`` when
                the request fails.
        """
        default_headers = {
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        }
        # Caller-supplied headers are unpacked last so they win over the
        # defaults; an empty ``headers`` simply leaves the defaults untouched.
        merged_headers = {**default_headers, **headers}
        req = request.Request(url, headers=merged_headers)
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req) as response:
            # Honour the charset declared by the server, falling back to
            # UTF-8 (the previous hard-coded behaviour) when none is given.
            charset = response.headers.get_content_charset() or 'utf-8'
            result = response.read().decode(charset)
        html = etree.HTML(result)
        if html is None:
            # NOTE(review): guards against the parser returning no document
            # (e.g. an empty body) - confirm against the lxml version in use.
            return []
        return html.xpath(xpath)
    def main():
        """Scrape the Lianjia Shenzhen rental listing page into ``house.csv``.

        For every listing link found on the index page, the detail page is
        fetched to read the first broker name. The output file receives two
        UTF-8 lines per listing: the listing title, then the broker name
        (an empty line when no broker name is found).
        """
        # Local import: only this function needs it.
        from urllib.parse import urljoin

        zf_url = 'https://sz.lianjia.com/zufang/'  # listing index page
        zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing links
        name_xpath = '//div[@class="brokerName"]/a'  # broker name on a detail page

        # Text mode with an explicit encoding and no newline translation
        # produces the same bytes as the earlier per-write ``.encode()`` calls.
        with open('house.csv', 'w', encoding='utf-8', newline='\n') as f:
            house_list = getRequet(zf_url, zf_xpath)
            for house in house_list:
                attrib = house.attrib
                house_name = attrib['title']
                # Resolve against the index URL so a relative href still
                # yields a fetchable address; absolute hrefs pass through.
                url = urljoin(zf_url, attrib['href'])
                # Report the detail page actually being fetched.
                print('正在下载:', url)

                brokers = getRequet(url, name_xpath)
                # A detail page may have no broker node, or a node without
                # text; write an empty name instead of crashing mid-run.
                username = (brokers[0].text or '') if brokers else ''

                f.write(house_name + '\n')
                f.write(username + '\n')

                # Pause between detail requests to avoid hammering the site.
                sleep(1)
            print('下载完成')
    
    # Start the scraper only when this file is run as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

    逆风的方向更适合飞翔,不怕千万人阻挡,只怕自己投降!
  • 相关阅读:
    Node Sass version 5.0.0 is incompatible with^4.0.0
    解决vue-cli引入sass,报错:this.getResolve is not a function问题
    解决nuxt官方脚手架的一些坑:1、支持es6+语法 2、样式支持sass
    针对【create-nuxt-app新版本v3.2.0】构建项目时没有server配置以及运行后弹出收集匿名数据选项等问题的解决方法
    create-nuxt-app创建出来的目录没有server文件夹
    Redis安装(Windows环境下Redis安装)
    koa2中间件,路由,cookies
    用同步的写法来执行异步操作, async, await
    koa2 安装与启动
    练习:自己写一个容器ArrayList集合 一一数组综合练习
  • 原文地址:https://www.cnblogs.com/jackzz/p/9203245.html
Copyright © 2011-2022 走看看