  • Python web scraper example: collecting second-hand housing listings from Lianjia (cc.lianjia.com)

    import re
    
    import requests
    from bs4 import BeautifulSoup
    
    
    # Main method
    def main():
        # Use a request header that mimics the Chrome browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
        page_max = 100
        for i in range(1, page_max + 1):
            if i == 1:
                house = 'https://cc.lianjia.com/ershoufang/erdaoqu/'
            else:
                house = 'https://cc.lianjia.com/ershoufang/erdaoqu/pg'+str(i)
            res = requests.get(house, headers=headers)
            soup = BeautifulSoup(res.text, 'html.parser')
            li_max = soup.find('ul', class_='sellListContent').find_all('li')
            for li in li_max:
                try:
                    house_param = {}
                    content = li.find('div', class_='houseInfo').text
                    content = content.split("|")
                    house_param['housing_estate'] = content[0]
                    # keep only the numeric part of the floor-area field
                    house_param['square_metre'] = re.findall(r'-?\d+\.?\d*e?-?\d*?', content[2])[0]
                    # --------------------------------------------------------#
                    position = li.find('div', class_='positionInfo').find('a').text
                    house_param['position'] = position
                    # --------------------------------------------------------#
                    totalprice = li.find('div', class_='totalPrice').text
                    # \D matches non-digit characters, so re.sub keeps only the digits
                    house_param['total_price'] = re.sub(r"\D", "", totalprice)
                    unitprice = li.find('div', class_='unitPrice').text
                    house_param['unit_price'] = re.sub(r"\D", "", unitprice)
                    # --------------------------------------------------------#
                    follow = li.find('div', class_='followInfo').text
                    follow = follow.split("/")
                    house_param['follow'] = re.sub(r"\D", "", follow[0])
                    house_param['take_look'] = re.sub(r"\D", "", follow[1])
                    # --------------------------------------------------------#
                    title_src = li.find('div', class_='title').find('a').attrs['href']
                    # keep only the digits of the detail-page URL
                    house_param['url'] = re.sub(r"\D", "", title_src)
                    res = requests.get(title_src, headers=headers)
                    soup = BeautifulSoup(res.text, 'html.parser')
                    # --------------------------------------------------------#                
                    pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].text
                    house_param['pub_date'] = pub_date
                    print(house_param)
                except Exception as e:
                    print(e)
    
    
    if __name__ == '__main__':
        main()
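
    The script above only prints each house_param dict. As a minimal sketch of persisting the results, the helper below writes the collected dicts to a CSV file; the save_to_csv name, the lianjia.csv filename, and the column order are assumptions and not part of the original script.

        import csv

        # Column order for the CSV header; mirrors the keys filled in by main() above.
        FIELDS = ['housing_estate', 'square_metre', 'position', 'total_price',
                  'unit_price', 'follow', 'take_look', 'url', 'pub_date']


        def save_to_csv(rows, path='lianjia.csv'):
            # rows is expected to be a list of house_param dicts
            with open(path, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=FIELDS)
                writer.writeheader()
                writer.writerows(rows)

    To use it, main() would append each house_param to a list instead of printing it and call save_to_csv(records) after the loop finishes.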
  • Original source: https://www.cnblogs.com/kgdxpr/p/10072285.html