zoukankan      html  css  js  c++  java
  • python 爬虫实例(四)

    环境:

    OS:Window10

    python:3.7

    爬取链家地产上面的数据,两个画面上的数据的爬取

    效果:将下面两个网页中的数据取出来

    代码

    import datetime
    import threading
    
    import requests
    from bs4 import BeautifulSoup
    
    class LianjiaHouseInfo:
        """Scraper for second-hand house listings on Lianjia (Dalian).

        Fetches the paginated listing index, then follows each listing's
        detail link and prints/collects price, unit price, layout,
        orientation and area.
        """

        def __init__(self):
            # Listing index URL template; {0} is the 1-based page number.
            self.url = "https://dl.lianjia.com/ershoufang/pg{0}"
            # NOTE(review): backslashes look lost in transcription —
            # presumably r"C:\pythonProject\Lianjia_House"; confirm before use.
            self.path = r"C:pythonProjectLianjia_House"
            # Browser-like User-Agent: without it the site may answer 403.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

        def request(self, param):
            """GET *param* with browser-like headers and return the body text."""
            req = requests.get(param, headers=self.headers)
            # req.raise_for_status()
            # req.encoding = req.apparent_encoding
            return req.text

        def all_pages(self, pageCn):
            """Scrape *pageCn* listing pages and return the collected data.

            Fix: the original accumulated ``dataListA`` but never returned
            it, so every scraped value was discarded. The ``pageCn == 1``
            test is also hoisted out of the loop (it is loop-invariant).
            """
            dataListA = []
            if pageCn == 1:
                # Single page: use the base URL without the "pg{n}" suffix.
                dataListA += self.getData(self.url[0:self.url.find("pg")])
            else:
                for i in range(1, pageCn + 1):
                    dataListA += self.getData(self.url.format(i))
            # self.dataOrganize(dataListA)
            return dataListA

        def getData(self, url):
            """Scrape one listing page and each listing's detail page.

            Returns a flat list of strings (total price+unit, unit price,
            layout, orientation, area) for every non-ad listing.

            Fix: the module-level semaphore is now released in ``finally``
            so a network/parse error cannot leak the permit.
            """
            dataList = []
            thread_lock.acquire()
            try:
                req = self.request(url)
                soup = BeautifulSoup(req, 'lxml')
                countHouse = soup.find(class_="total fl").find("span")
                print("共找到 ", countHouse.string, " 套大连二手房")

                sell_all = soup.find(class_="sellListContent").find_all("li")
                for sell in sell_all:
                    title = sell.find(class_="title")
                    if title is None:
                        # Listings without a title node are advertisements.
                        print("広告です")
                        continue
                    print("------------------------概要--------------------------------------------")
                    title = title.find("a")
                    print("title:", title.string)
                    housInfo = sell.find(class_="houseInfo").get_text()
                    print("houseInfo:", housInfo)
                    positionInfo = sell.find(class_="positionInfo").get_text()
                    print("positionInfo:", positionInfo)

                    followInfo = sell.find(class_="followInfo").get_text()
                    print("followInfo:", followInfo)

                    print("------------------------詳細信息--------------------------------------------")
                    # Follow the listing's detail page for price breakdowns.
                    url_detail = title["href"]
                    req_detail = self.request(url_detail)
                    soup_detail = BeautifulSoup(req_detail, "lxml")
                    total = soup_detail.find(class_="total")
                    unit = soup_detail.find(class_="unit").get_text()
                    dataList.append(total.string + unit)
                    print("总价:", total.string, unit)
                    unitPriceValue = soup_detail.find(class_="unitPriceValue").get_text()
                    dataList.append(unitPriceValue)
                    print("单价:", unitPriceValue)
                    room_mainInfo = soup_detail.find(class_="room").find(class_="mainInfo").get_text()
                    dataList.append(room_mainInfo)
                    print("户型:", room_mainInfo)
                    type_mainInfo = soup_detail.find(class_="type").find(class_="mainInfo").get_text()
                    dataList.append(type_mainInfo)
                    print("朝向:", type_mainInfo)
                    area_mainInfo = soup_detail.find(class_="area").find(class_="mainInfo").get_text()
                    dataList.append(area_mainInfo)
                    print("面积:", area_mainInfo)
            finally:
                thread_lock.release()
            return dataList
        #
        # def dataOrganize(self, dataList):
        #
        #     data2 = pd.DataFrame(dataList)
        #     data2.to_csv(r'C:UserspeiqiangDesktoplagoujob.csv', header=False, index=False, mode='a+')
        #     data3 = pd.read_csv(r'C:UserspeiqiangDesktoplagoujob.csv', encoding='gbk')
    
    
    
    # Module-level semaphore referenced by LianjiaHouseInfo.getData; must stay
    # at module scope so the class works even when this file is imported.
    thread_lock = threading.BoundedSemaphore(value=100)

    if __name__ == "__main__":
        # Guarding the scrape behind __main__ prevents network I/O on import.
        house_Info = LianjiaHouseInfo()
        startTime = datetime.datetime.now()
        house_Info.all_pages(1)
        endTime = datetime.datetime.now()
        # "実行時間" = elapsed time (seconds).
        print("実行時間:", (endTime - startTime).seconds)
    

      运行之后的效果

  • 相关阅读:
    火狐插件火狐黑客插件将Firefox变成黑客工具的七个插件
    memcache安装环境:WINDOWS 7
    PHP正则表达式
    968. 监控二叉树 力扣(困难) dfs 官方说DP
    375. 猜数字大小 II 力扣(中等) 区间动态规划、记忆化搜索
    629. K个逆序对数组 力扣(困难) 区间动态规划
    剑指 Offer 51. 数组中的逆序对 力扣(困难) 巧用归并排序算法
    488. 祖玛游戏 力扣(困难) dfs
    16. 最接近的三数之和 力扣(中等) 双指针
    319. 灯泡开关 力扣(中等) 数论
  • 原文地址:https://www.cnblogs.com/killclock048/p/10144576.html
Copyright © 2011-2022 走看看