  • Scraping Lianjia rental listings with Python

    import requests as rq
    from bs4 import BeautifulSoup
    import json
    import time
    import pandas as pd
    
    home_url = 'https://bj.lianjia.com/zufang'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
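    # A browser-style User-Agent is sent with every request; the default
    # python-requests User-Agent is often blocked or served a different page.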
    
    # Fetch the home page
    home_rt = rq.get(home_url, headers=headers).text
    home_soup = BeautifulSoup(home_rt, 'lxml')
    
    # From the home page, collect the entry link for each district
    district_url_rt = home_soup.find_all('li', attrs={'class': 'filter__item--level2', 'data-type': 'district'}) 
    district_urls = []
    for i in range(1, len(district_url_rt)):  # start from 1 to skip the first, non-specific filter entry
        district_name = district_url_rt[i].a.string  # district name
        dis_url = district_url_rt[i].a.attrs['href']
        dis_url = 'https://bj.lianjia.com' + dis_url  # district URL
        district_urls.append([district_name, dis_url])
    
    print(district_urls)
    print('District entry links collected')
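    # district_urls now holds [name, url] pairs, e.g. ['东城', 'https://bj.lianjia.com/zufang/dongcheng/']
    # (the example pair is illustrative, not captured from a real run)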
    
    finally_house_result = []
    # Loop over the district links and scrape listing data from each entry point
    for dis_url in district_urls:
        time.sleep(5)
        district_name = dis_url[0]
        district_url = dis_url[1]
        district_rt = rq.get(district_url, headers=headers)
        district_rt = district_rt.text
        district_soup = BeautifulSoup(district_rt, 'lxml')
        page_num = int(district_soup.find('div', attrs={'class': 'content__pg'}).attrs['data-totalpage'])  # number of listing pages in this district
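        # Note (not part of the original script): .find() returns None when the pagination
        # block is absent, which would make the line above raise AttributeError; a guard such as
        #   pg = district_soup.find('div', attrs={'class': 'content__pg'})
        #   page_num = int(pg['data-totalpage']) if pg else 1
        # could be used instead.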
        
        # Walk every page and collect each listing's title and URL
        house_titurl = []
        for page in range(1, page_num+1):
            time.sleep(0.8)
            page_url = district_url + f'/pg{page}'  # URL of the current page
            page_results = rq.get(page_url, headers=headers).text
            page_soup = BeautifulSoup(page_results, 'lxml')
            current_page_rts = page_soup.find_all('div', attrs={'class': 'content__list--item'})  # listing cards on the current page
            
            # For each card on this page, record the title, address, and detail URL
            for houselist_rt in current_page_rts:  
                house_url = 'https://bj.lianjia.com' + houselist_rt.a['href']  # detail-page URL
                house_title = houselist_rt.a.img['alt']  # title
                address_list = houselist_rt.div.find('p', attrs={'class': 'content__list--item--des'}).find_all('a')
                address = address_list[1].string + '.' + address_list[2].string  # address
                house_titurl.append([house_title, address, house_url])
        district_num = len(house_titurl)
        print(f'{district_name}: titles & URLs collected, {district_num} rental listings in total')
        
        # Visit every listing collected for this district and extract the details
        for house_page in house_titurl:
            time.sleep(0.6)
            house_title = house_page[0]  # listing title
            address = house_page[1]  # address
            house_url = house_page[2]  # detail-page URL
            house_rt = rq.get(house_url, headers=headers).text
            house_soup = BeautifulSoup(house_rt, 'lxml')
            
            house_rt1 = house_soup.find_all('li', attrs={'class': 'table_col'})
            pay_method = house_rt1[5].string  # payment method
            rent = house_rt1[6].string + house_rt1[1].find('span').string  # rent
            deposit = house_rt1[7].string + house_rt1[2].find('span').string  # deposit
            service_fee = house_rt1[8].string + house_rt1[3].find('span').string  # service fee
            agency_fee = house_rt1[9].string + house_rt1[4].find('span').string  # agency fee
            
            house_rt2 = house_soup.find_all('li', attrs={'class': 'fl oneline'})
            size = house_rt2[1].string[3:]  # floor area ([3:] strips the 3-character label prefix such as '面积：')
            toward = house_rt2[2].string[3:]  # orientation
            in_time = house_rt2[5].string[3:]  # move-in date
            rent_term = house_rt2[7].string[3:]  # lease term
            storey = house_rt2[10].string[3:]  # floor
            elevator = house_rt2[11].string[3:]  # elevator
            gas = house_rt2[17].string[3:]  # gas
    
            # Supporting facilities
            supporting_facilities = []
            for faci in range(21, len(house_rt2)):
                supporting_facilities.append(house_rt2[faci].text.strip())
            supporting_facilities = json.dumps(supporting_facilities, ensure_ascii=False)
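            # json.dumps with ensure_ascii=False keeps the Chinese facility names readable and
            # serialises the whole list into one string, so it fits in a single DataFrame cell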
            
            # Agent contact information
            agency_names = house_soup.find_all('a', attrs={'class': 'name'})
            agency_phones = house_soup.find_all('div', attrs={'class': 'phone'})
            agency_scores = house_soup.find_all('div', attrs={'class': 'rate'})
            agency_list = []
            for name, phone, score in zip(agency_names, agency_phones, agency_scores):
                agency_list.append({'Agent name': name.string, 'Phone': phone.string, 'Rating': score.text.strip()})
            agency_list = json.dumps(agency_list, ensure_ascii=False)
            
            finally_house_result.append([district_name, address, house_title, size, toward, storey, elevator, gas, supporting_facilities, rent_term, in_time, rent, deposit, service_fee, agency_fee, agency_list])
        print(f'{district_name}: listing details collected, {district_num} in total')
    
    data_num = len(finally_house_result)
    columns = ['District', 'Address', 'Title', 'Area', 'Orientation', 'Floor', 'Elevator', 'Gas', 'Facilities', 'Lease term', 'Move-in date', 'Rent', 'Deposit', 'Service fee', 'Agency fee', 'Agent contacts']
    house_finally_dfdata = pd.DataFrame(finally_house_result, columns=columns)
    house_finally_dfdata.to_excel(r'd:\Desktop\20191124链家北京各城区租房信息.xlsx')  # raw string so the backslashes are not treated as escape sequences
    print(f'Done: {data_num} rental listings collected across all Beijing districts')
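
  • A possible hardening step (not part of the original script): every request above is a bare rq.get() with no timeout or error handling, so a single dropped connection aborts the whole run. A small wrapper with a timeout and simple retries could replace those calls; the helper name fetch_html and the retry/delay values below are illustrative assumptions, not something the original post defines.

    import time
    import requests as rq

    def fetch_html(url, headers, retries=3, delay=5, timeout=10):
        # Hypothetical helper (not in the original post): GET a page with a timeout,
        # retry a few times on network/HTTP errors, and return the response body as text.
        for attempt in range(retries):
            try:
                resp = rq.get(url, headers=headers, timeout=timeout)
                resp.raise_for_status()
                return resp.text
            except rq.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(delay)

    # Usage sketch: home_rt = fetch_html(home_url, headers)  instead of  rq.get(home_url, headers=headers).text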
  • Original post: https://www.cnblogs.com/jaysonteng/p/12702066.html