zoukankan      html  css  js  c++  java
  • python3 爬虫之爬取安居客二手房资讯(第一版)

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # Author: Tsukasa
    
    
    
    import requests
    from bs4 import BeautifulSoup
    import pandas
    import time
    
    
    # Listing-index page URLs to crawl; populated by the driver code below.
    url_all = []
    # City code that is spliced into the 'esf.<code>.fang.com' host name.
    # The prompt is one literal with explicit "\n" escapes: a single-quoted
    # string cannot span several source lines. Surrounding whitespace is
    # stripped so a stray space cannot end up inside the generated URLs.
    url_in = input('输入你所需要城市的字母简写:\n如:中山 zs , 广州 gz\n!!!不要乱输入,不然运行不了').strip()
    # +1 because this value is used as the exclusive upper bound of range().
    url_number = 1 + int(input('输入爬取页数:'))

    # Not referenced anywhere else in the visible code.
    okl = []
    def open(nobe):
        """Fetch one listing-index page and return the detail-page URLs on it.

        NOTE: this name shadows the builtin ``open``; it is kept because the
        driver code in this script calls the function by this name.

        Args:
            nobe: URL of a listing-index page.

        Returns:
            A list of absolute URLs, one for every ``.houseList dl`` entry
            that contains a title link with an ``href``. Entries without
            such a link are skipped.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(nobe, timeout=30)
        soup = BeautifulSoup(res.text, 'html5lib')
        # The hrefs are appended to this site root, built from the
        # module-level city code.
        url_start = 'http://esf.' + url_in + '.fang.com'
        http_start = []
        for title in soup.select('.houseList dl'):
            links = title.select('.title a ')
            # Skip entries that carry no title link instead of letting an
            # IndexError/KeyError abort the run.
            if not links or not links[0].get('href'):
                continue
            http_start.append(url_start + links[0]['href'])
        return http_start
    
    
    #获取详细信息
    def content(url):
        """Fetch one listing detail page and collect its fields into a dict.

        Args:
            url: URL of the detail page.

        Returns:
            A dict with the keys '网页' (the URL), '标题' (text of the first
            ``h1``), '总价' (text of the first ``.red20b`` element plus '万')
            and '联系电话' (text of ``#mobilecode``), plus one entry for every
            ``span`` mentioning '发布时间' and one entry for every ``dd``
            whose text contains ':'. For those entries the text before the
            first ':' is the key and the remainder is the value.

        Raises:
            requests.RequestException: if the request fails or times out.
            IndexError: if the page has no ``h1``, ``.red20b`` or
                ``#mobilecode`` element, or no ``#Time`` element when a
                '发布时间' span is present.
        """
        info = {'网页': url}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout=30)
        soup = BeautifulSoup(res.text, 'html5lib')
        info['标题'] = soup.select('h1')[0].text.strip()
        info['总价'] = soup.select('.red20b')[0].text + '万'
        info['联系电话'] = soup.select('#mobilecode')[0].text

        # Publication time: split on the FIRST colon only, so a value that
        # itself contains colons (e.g. a clock time) cannot break unpacking.
        for sl in soup.select('span'):
            if '发布时间' not in sl.text:
                continue
            key, sep, value = sl.text.strip().rstrip('(').partition(':')
            if not sep:
                continue  # label without a "key:value" shape
            info[key] = value + '*' + soup.select('#Time')[0].text

        # Detail fields: every <dd> of the form "key:value".
        for dd in soup.select('dd'):
            key, sep, value = dd.text.strip().partition(':')
            if sep:
                info[key] = value
        return info
    
    
    
    
    # Driver: build the index-page URLs, crawl every listing found on them,
    # and write the collected rows to an Excel workbook.
    print('----------正在运行,请不要关闭----------')
    url_home = 'http://esf.' + url_in + '.fang.com/house/i3{}/'
    # url_number already includes the +1, so pages 1..N are generated.
    for url_next in range(1, url_number):
        url_all.append(url_home.format(url_next))

    home = []
    for i in url_all:
        a = open(i)  # the scraper's own open() defined in this file, not the builtin
        print('正在获取 -----> ', i, ' <-----')
        time.sleep(1)  # pause between index-page requests
        for b in a:
            home.append(content(b))
            print('	正在获取详细信息 -> ', b, ' <-----')
            time.sleep(2)  # pause between detail-page requests

    # One row per listing; columns are the union of all collected keys.
    last = pandas.DataFrame(home)
    last.to_excel('temp.xlsx', sheet_name='房源信息')
    # One literal with explicit "\n" escapes: a single-quoted string cannot
    # span several source lines.
    print('----------运行结束----------\n\n----------查看根目录---------')

    # Wait for Enter before the script exits.
    abcdefg = input('完成运行')
    

      源码先奉上,以后再填坑

  • 相关阅读:
    java练习6
    java练习5
    java练习4
    java练习3
    java练习2
    java练习1
    用代码实现判断字符串的开头和结尾
    语句练习题2
    语句练习题1
    值类型和引用类型的区别
  • 原文地址:https://www.cnblogs.com/Tsukasa/p/6721026.html
Copyright © 2011-2022 走看看