zoukankan      html  css  js  c++  java
  • python爬虫实例

    import re
    
    import requests
    from bs4 import BeautifulSoup
    
    
    # Entry point
    def main(page_max=100):
        """Scrape second-hand housing listings from Lianjia (Erdao district, Changchun).

        Walks listing pages 1..page_max, extracts per-listing fields (estate
        name, floor area, position, total/unit price, follower and viewing
        counts, listing id) plus the publish date from each listing's detail
        page, and prints every record as a dict.

        page_max -- number of listing pages to crawl (default 100).
        """
        # Spoof a Chrome User-Agent so the site serves the regular page.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
        # BUG FIX: the original patterns had lost their backslashes when the
        # code was pasted into HTML ('d'/'D' instead of '\d'/'\D'), so the
        # area extraction raised IndexError on every listing (silently
        # swallowed below) and the "strip non-digits" substitutions only
        # removed literal 'D' characters. Compile once, outside the loops.
        number_re = re.compile(r'-?\d+\.?\d*e?-?\d*')   # first (possibly float) number
        non_digit_re = re.compile(r'\D')                # anything that is not a digit

        for i in range(1, int(page_max) + 1):
            # Page 1 has no /pgN suffix; later pages do.
            if i == 1:
                house = 'https://cc.lianjia.com/ershoufang/erdaoqu/'
            else:
                house = 'https://cc.lianjia.com/ershoufang/erdaoqu/pg' + str(i)
            res = requests.get(house, headers=headers)
            soup = BeautifulSoup(res.text, 'html.parser')
            li_max = soup.find('ul', class_='sellListContent').find_all('li')
            for li in li_max:
                try:
                    house_param = {}
                    # "houseInfo" text looks like "estate | layout | area | ..."
                    # — presumably; verify against the live page markup.
                    content = li.find('div', class_='houseInfo').text.split("|")
                    house_param['housing_estate'] = content[0]
                    house_param['square_metre'] = number_re.findall(content[2])[0]
                    # --------------------------------------------------------#
                    position = li.find('div', class_='positionInfo').find('a').text
                    house_param['position'] = position
                    # --------------------------------------------------------#
                    # Keep only the digits of the price strings (drops the
                    # currency/unit suffix such as 万).
                    totalprice = li.find('div', class_='totalPrice').text
                    house_param['total_price'] = non_digit_re.sub("", totalprice)
                    unitprice = li.find('div', class_='unitPrice').text
                    house_param['unit_price'] = non_digit_re.sub("", unitprice)
                    # --------------------------------------------------------#
                    # "followInfo" is "/"-separated: follower count first,
                    # viewing count second.
                    follow = li.find('div', class_='followInfo').text.split("/")
                    house_param['follow'] = non_digit_re.sub("", follow[0])
                    house_param['take_look'] = non_digit_re.sub("", follow[1])
                    # --------------------------------------------------------#
                    title_src = li.find('div', class_='title').find('a').attrs['href']
                    # NOTE(review): stores only the digits of the detail URL
                    # under 'url' — looks like the numeric listing id, not a
                    # usable URL; confirm downstream consumers expect this.
                    house_param['url'] = non_digit_re.sub("", title_src)
                    res = requests.get(title_src, headers=headers)
                    soup = BeautifulSoup(res.text, 'html.parser')
                    # --------------------------------------------------------#
                    # Publish date sits in the second <span> of the first row
                    # of the detail page's "transaction" block.
                    pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].text
                    house_param['pub_date'] = pub_date
                    print(house_param)
                except Exception as e:
                    # Best-effort scraping: markup varies between listings, so
                    # log the failure for this one and keep going.
                    print(e)
    
    
    
    if __name__ == '__main__':
        # Run the scraper only when executed as a script, not on import.
        main()
  • 相关阅读:
    【转】Oracle学习系列
    昏昏昏昏昏昏,怎么变成这样了。:(
    SQLServer 2K 安装重复出现挂起问题解决办法
    ORM iBATIS 学习,没弄清楚。
    MagicLinux让我用我一个方便的方法引导你吧。
    SOA大赛初赛文档已经提交.心中大石掉下.
    继续一下目标。
    ORM已经理解了.
    Spot the Bug Episode 2 中BUG的修改
    MaglicLinux启动加入了BOO.INI搞定.这个方便了.
  • 原文地址:https://www.cnblogs.com/kgdxpr/p/10072285.html
Copyright © 2011-2022 走看看