  • Have a crawl

    Crawling housing listings

    import requests
    from bs4 import BeautifulSoup

    # Request headers for the page
    header = {
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    }

    def get_page(url):
      response = requests.get(url, headers=header)

      # Parse every listing item on the page with BeautifulSoup
      soup_index = BeautifulSoup(response.text, 'html.parser')
      result_li = soup_index.find_all('li', {'class': 'list-item'})

      # Iterate over each listing item
      for i in result_li:
        # BeautifulSoup expects a string, so convert the tag first
        page_url = str(i)
        soup = BeautifulSoup(page_url, 'html.parser')
        # find_all by class returns a list; only the first match is needed
        result_href = soup.find_all('a', {'class': 'houseListTitle'})[0]
        # Call the detail-page scraper for this listing
        get_page_detail(result_href.attrs['href'])


      # Crawl the next page
      result_next_page = soup_index.find_all('a', {'class': 'aNxt'})
      if len(result_next_page) != 0:
        # Recurse into the next page
        get_page(result_next_page[0].attrs['href'])
      else:
        print('No more pages')

    # Replace spaces, newlines and tabs in a string, then strip leading/trailing whitespace
    def my_strip(s):
      return str(s).replace(" ", "").replace("\n", "").replace("\t", "").strip()

    # BeautifulSoup is used repeatedly, so wrap the conversion in a small helper
    def my_Beautifulsoup(response):
      return BeautifulSoup(str(response), 'html.parser')

    # Scrape the detail page of a single listing
    def get_page_detail(url):
      response = requests.get(url, headers=header)
      if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Title, price, and the three detail columns of the listing
        result_title = soup.find_all('h3', {'class': 'long-title'})[0]
        result_price = soup.find_all('span', {'class': 'light info-tag'})[0]
        result_house_1 = soup.find_all('div', {'class': 'first-col detail-col'})
        result_house_2 = soup.find_all('div', {'class': 'second-col detail-col'})
        result_house_3 = soup.find_all('div', {'class': 'third-col detail-col'})
        # Re-parse each column and pull out its <dd> fields
        soup_1 = my_Beautifulsoup(result_house_1)
        soup_2 = my_Beautifulsoup(result_house_2)
        soup_3 = my_Beautifulsoup(result_house_3)
        result_house_tar_1 = soup_1.find_all('dd')
        result_house_tar_2 = soup_2.find_all('dd')
        result_house_tar_3 = soup_3.find_all('dd')

        # Print the title, the price, and the fields from the three detail columns
        print(my_strip(result_title.text), my_strip(result_price.text))
        print(my_strip(result_house_tar_1[0].text),
              my_strip(my_Beautifulsoup(result_house_tar_1[1]).find_all('p')[0].text),
              my_strip(result_house_tar_1[2].text), my_strip(result_house_tar_1[3].text))
        print(my_strip(result_house_tar_2[0].text), my_strip(result_house_tar_2[1].text),
              my_strip(result_house_tar_2[2].text), my_strip(result_house_tar_2[3].text))
        print(my_strip(result_house_tar_3[0].text), my_strip(result_house_tar_3[1].text),
              my_strip(result_house_tar_3[2].text))

    if __name__ == '__main__':
      # Start URL of the listing index
      url = 'https://zhengzhou.anjuke.com/sale/'
      # Kick off the crawl from the first page
      get_page(url)
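
    A note on the pagination: get_page calls itself for every "next page" link, so crawling a long
    listing index adds one stack frame per page and can eventually run into Python's recursion limit.
    Below is a minimal sketch of an iterative alternative, assuming the same Anjuke page structure
    and selectors as above; crawl_all_pages is a hypothetical name, and detail pages are still
    handled by the get_page_detail function defined earlier.

    def crawl_all_pages(start_url):
      # Iterative version of get_page: a while loop replaces the recursion,
      # so the call stack stays flat no matter how many result pages there are
      url = start_url
      while url:
        response = requests.get(url, headers=header)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Same selectors as the recursive version: list items, title links, "next page" anchor
        for item in soup.find_all('li', {'class': 'list-item'}):
          link = item.find('a', {'class': 'houseListTitle'})
          if link is not None:
            get_page_detail(link.attrs['href'])
        result_next_page = soup.find_all('a', {'class': 'aNxt'})
        url = result_next_page[0].attrs['href'] if result_next_page else None
      print('No more pages')

    Apart from the control flow, nothing changes: the 'list-item', 'houseListTitle' and 'aNxt'
    classes and the request header are reused as-is, so this function could be swapped in for
    get_page in the __main__ block.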

  • Original article: https://www.cnblogs.com/pyxiaomangshe/p/7690702.html