zoukankan      html  css  js  c++  java
  • 链家深圳租房信息爬取练习 附加源码

    from urllib import request
    from time import sleep
    from  lxml import  etree
    import csv
    # import random    #sleep(random.random(1)*2) 随机秒数
    # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/'
    #
    # header = {
    #
    # 'Referer': 'https://sz.lianjia.com/zufang/',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    # # 筛选部分
    # html = etree.HTML(result)
    # name_list = html.xpath('//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a')
    # with open('house.csv',"wb") as f:
    #     for name in name_list:
    #         title=name.attrib["title"]
    #         f.write(title.encode())
    #         f.write('\n'.encode())
    #         print(title)
    
    
    # --------------------------------------------------------------------------------------------------------------
    # # 参数部分
    # sz_url = 'https://sz.lianjia.com/zufang/105101400296.html'
    #
    # header = {
    #
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    # }
    # # 请求部分
    # res = request.Request(sz_url,headers=header)
    #
    # response = request.urlopen(res)
    # result = response.read().decode()
    # # print(result)
    #
    # html = etree.HTML(result)
    # name_list = html.xpath('//div[@class="brokerName"]/a')
    #
    # for name in name_list:
    #     text = name.text
    #     print(text)
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@code  tree@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # 请求测试
    def getRequet(url,xpath,**headers):
        """Fetch ``url`` and return the nodes selected by ``xpath``.

        Args:
            url: Address of the page to download.
            xpath: XPath expression evaluated against the parsed HTML.
            **headers: Extra HTTP request headers. Names that are not valid
                Python identifiers (e.g. ``User-Agent``) can be supplied by
                unpacking a dict: ``getRequet(url, xp, **{'User-Agent': ua})``.

        Returns:
            Whatever ``html.xpath(xpath)`` yields for the downloaded page
            (a list of matching elements for element-selecting expressions).

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` when the
                request fails.
        """
        default_headers = {
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        }
        # Caller-supplied headers are unpacked last so they override the
        # defaults; the previous order silently discarded any override.
        merged_headers = {**default_headers, **headers}
        req = request.Request(url, headers=merged_headers)
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req) as response:
            # Honour the charset declared by the server; fall back to UTF-8,
            # which is what a bare ``decode()`` used before.
            charset = response.headers.get_content_charset() or 'utf-8'
            result = response.read().decode(charset)
        html = etree.HTML(result)
        return html.xpath(xpath)
    def main():
        """Scrape the Lianjia Shenzhen rental index page into ``house.csv``.

        For every listing link found on the index page, the listing title is
        taken from the link's ``title`` attribute and the contact name from the
        first ``brokerName`` link on the listing's detail page. The file
        receives the title and the contact name on two consecutive lines,
        UTF-8 encoded, one pair per listing.

        A listing whose detail page has no contact node is written with an
        empty contact line instead of aborting the whole run.
        """
        zf_url = 'https://sz.lianjia.com/zufang/'  # index page to crawl
        zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing links
        name_xpath = '//div[@class="brokerName"]/a'  # contact name on a detail page

        # Text mode with an explicit encoding replaces the per-string
        # ``.encode()`` calls; ``newline='\n'`` disables newline translation so
        # the bytes on disk match what binary mode produced.
        with open('house.csv', 'w', encoding='utf-8', newline='\n') as f:
            for house in getRequet(zf_url, zf_xpath):
                attrib = house.attrib
                house_name = attrib['title']
                url = attrib['href']
                # Report the detail page actually being fetched, not the index URL.
                print('正在下载:', url)

                brokers = getRequet(url, name_xpath)
                # Guard against a missing node or an element without text.
                username = (brokers[0].text or '') if brokers else ''

                f.write(house_name + '\n')
                f.write(username + '\n')

                # Pause between requests to avoid hammering the site.
                sleep(1)
            print('下载完成')
    
    # Start the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
        main()
    

      

    逆风的方向更适合飞翔,不怕千万人阻挡,只怕自己投降!
  • 相关阅读:
    java 微信公众号素材 新增其他类型永久素材
    @schedule
    idea 打包的jar运行报 “XXX中没有主清单属性”
    openjdk没有ssl支持的相关证书包,导致使用HTTPS调用第三方接口时候报错误 InvalidAlgorithmParameterException: the trustAnchors parameter must be non-empty
    查询端口是否开通
    企业微信-之无法修改成员手机号
    PostgreSQL使用distinct关键字给单独的几列去重
    centos全屏幕退出:Alt+Enter
    github上OC 和swift框架精选
    github每日精选---iOS版
  • 原文地址:https://www.cnblogs.com/jackzz/p/9203245.html
Copyright © 2011-2022 走看看