zoukankan      html  css  js  c++  java
  • python爬虫实例

    import re
    
    import requests
    from bs4 import BeautifulSoup
    
    
    # Main entry point
    def main():
        """Crawl Lianjia second-hand listing pages and print one dict per house.

        Requests pages 1..page_max under
        https://cc.lianjia.com/ershoufang/erdaoqu/, parses every ``<li>`` inside
        the ``ul.sellListContent`` element, follows the listing's title link to
        its detail page to read a publication-date field, and prints the
        collected fields as a dict.

        Parsing is best-effort: any error while handling a single listing is
        printed and that listing is skipped. A listing page that cannot be
        fetched or has no ``ul.sellListContent`` element is skipped as well.
        """
        # Send a Chrome-like User-Agent header with every request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
        # Seconds to wait for a response; without it requests may block forever.
        timeout = 10
        page_max = 100
        base_url = 'https://cc.lianjia.com/ershoufang/erdaoqu/'

        # Compiled once, outside the loops. The backslashes are essential:
        # without them 'd'/'D' are literal letters and nothing numeric matches.
        number_re = re.compile(r'-?\d+\.?\d*e?-?\d*?')
        non_digit_re = re.compile(r'\D')

        for i in range(1, page_max + 1):
            # The first page has no "pgN" suffix.
            house = base_url if i == 1 else base_url + 'pg' + str(i)
            try:
                res = requests.get(house, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                print(e)
                continue
            soup = BeautifulSoup(res.text, 'html.parser')
            listing = soup.find('ul', class_='sellListContent')
            if listing is None:
                # find() returns None when the element is absent; skip the page
                # instead of crashing the whole crawl with AttributeError.
                print('sellListContent not found: ' + house)
                continue
            for li in listing.find_all('li'):
                try:
                    house_param = {}
                    # houseInfo is a "|"-separated string.
                    # NOTE(review): assumes field 0 is the estate name and
                    # field 2 contains the area -- confirm against live markup.
                    content = li.find('div', class_='houseInfo').text
                    content = content.split("|")
                    house_param['housing_estate'] = content[0]
                    house_param['square_metre'] = number_re.findall(content[2])[0]
                    # --------------------------------------------------------#
                    position = li.find('div', class_='positionInfo').find('a').text
                    house_param['position'] = position
                    # --------------------------------------------------------#
                    # Keep only the digits of the price texts.
                    totalprice = li.find('div', class_='totalPrice').text
                    house_param['total_price'] = non_digit_re.sub("", totalprice)
                    unitprice = li.find('div', class_='unitPrice').text
                    house_param['unit_price'] = non_digit_re.sub("", unitprice)
                    # --------------------------------------------------------#
                    # followInfo is "<followers> / <viewings> ..."; keep digits.
                    follow = li.find('div', class_='followInfo').text
                    follow = follow.split("/")
                    house_param['follow'] = non_digit_re.sub("", follow[0])
                    house_param['take_look'] = non_digit_re.sub("", follow[1])
                    # --------------------------------------------------------#
                    title_src = li.find('div', class_='title').find('a').attrs['href']
                    # Only the digits of the detail-page href are stored
                    # (presumably the listing id -- verify against real URLs).
                    house_param['url'] = non_digit_re.sub("", title_src)
                    # Separate names so the listing-page response and soup are
                    # not overwritten while iterating over its <li> elements.
                    detail_res = requests.get(title_src, headers=headers, timeout=timeout)
                    detail_soup = BeautifulSoup(detail_res.text, 'html.parser')
                    # --------------------------------------------------------#
                    # Second <span> of the first <li> in div.transaction.
                    pub_date = detail_soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].text
                    house_param['pub_date'] = pub_date
                    print(house_param)
                except Exception as e:
                    # Best-effort scraping: report the problem and move on to
                    # the next listing.
                    print(e)
    
    
    # Run the crawler only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    python使用代理访问服务器
    python请求服务器时如何隐藏User-Agent
    python利用有道翻译实现“语言翻译器”的功能
    python请求服务器图片并下载到本地磁盘
    python的搜索路径与包(package)
    Python生成器(yield)
    Python迭代器(斐波拉切数列实例)
    Python定制容器
    Python描述符:property()函数的小秘密
    Python类属性访问的魔法方法
  • 原文地址:https://www.cnblogs.com/kgdxpr/p/10072285.html
Copyright © 2011-2022 走看看