zoukankan      html  css  js  c++  java
  • 爬虫

    爬虫 - 斗鱼房间信息

    
    '''Scrape the information of every room on the Douyu live-streaming site.

    The same data can also be fetched directly as JSON from
    https://www.douyu.com/gapi/rkc/directory/0_0/1
    '''
    import time
    
    from selenium import webdriver
    from fake_useragent import UserAgent
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    
    class DouYuRoom(object):
        """Scrape room information from the Douyu "all rooms" directory.

        A Chrome browser is driven through every page of the listing; for each
        room the game zone and the room URL are collected and handed to
        ``save_content`` one page at a time.
        """

        # XPath matching one room entry (<li>) in the directory listing.
        ROOM_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li'
        # XPath of the pager element that is clicked to reach the next page.
        NEXT_XPATH = '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span'

        def __init__(self):
            """Start a Chrome session and set the directory URL to scrape."""
            self.url = 'https://www.douyu.com/directory/all'
            self.driver = webdriver.Chrome()
            self.content_list = []

        def get_content_list(self):
            """Extract the rooms shown on the page currently loaded.

            Returns:
                A ``(next_url, content_list)`` tuple. ``next_url`` is the
                clickable "next page" element, or ``None`` when there is no
                further page; ``content_list`` is a list of dicts with the
                keys ``'game'`` and ``'room_url'``.
            """
            content_list = []
            # ``find_elements(By.XPATH, ...)`` replaces the ``find_*_by_xpath``
            # helpers, which were removed in Selenium 4.
            for li in self.driver.find_elements(By.XPATH, self.ROOM_XPATH):
                game = li.find_element(
                    By.XPATH, './/span[@class="DyListCover-zone"]').text
                room_url = li.find_element(By.XPATH, './/a').get_attribute('href')
                # Further per-room fields can be extracted here.
                content_list.append({'game': game, 'room_url': room_url})
            next_url = self._find_next_button()
            print('next_url', next_url)
            return next_url, content_list

        def _find_next_button(self):
            """Return the "next page" element, or None if it is absent or disabled."""
            candidates = self.driver.find_elements(By.XPATH, self.NEXT_XPATH)
            if not candidates:
                return None
            button = candidates[0]
            # NOTE(review): presumably the pager keeps the "next" item in the
            # DOM on the last page and only marks its <li> as disabled; without
            # this check the crawl would click it forever. Confirm the exact
            # marker against the live page.
            pager_item = button.find_element(By.XPATH, '..')
            css_classes = pager_item.get_attribute('class') or ''
            if ('disabled' in css_classes
                    or pager_item.get_attribute('aria-disabled') == 'true'):
                return None
            return button

        def _wait_for_rooms(self, timeout):
            """Block until at least one room entry is present, up to *timeout* seconds."""
            # Imported locally so the file's import block needs no change.
            from selenium.common.exceptions import TimeoutException
            try:
                WebDriverWait(self.driver, timeout).until(
                    EC.presence_of_element_located((By.XPATH, self.ROOM_XPATH)))
            except TimeoutException:
                # Deliberately tolerated: extraction then yields an empty page
                # and the crawl ends normally instead of crashing.
                print('timed out waiting for the room list')

        def save_content(self, content):
            """Persist one page of room dicts (placeholder, not implemented yet)."""
            pass

        def run(self):
            """Crawl every page of the directory, saving each page as it is read."""
            try:
                self.driver.get(self.url)
                # Explicit wait: continue as soon as the list is rendered
                # rather than always sleeping for a fixed time.
                self._wait_for_rooms(10)
                while True:
                    next_url, content_dic = self.get_content_list()
                    print(content_dic)
                    # Each page is saved before moving on to the next one.
                    self.save_content(content_dic)
                    if next_url is None:
                        break
                    next_url.click()
                    # The list is re-rendered in place after the click and the
                    # code has no reliable "page changed" signal, so a fixed
                    # pause is kept here.
                    time.sleep(5)
            finally:
                # quit() ends the whole browser session (close() only closes
                # the current window) and runs even when scraping fails.
                self.driver.quit()
    
    
    if __name__ == '__main__':
        # Script entry point: build the scraper and crawl all directory pages.
        DouYuRoom().run()
    
    
    
  • 相关阅读:
    HTTP的三次握手
    HTTP协议的发展历史
    二分图 (最大匹配 + 最小点覆盖 + 最少路径覆盖 + 最大独立集)
    昂贵的聘礼 POJ
    Cow Contest POJ 3660 (Floyed ) (最短路专题)
    B-number HDU
    You Are the One HDU
    0 or 1 HDU
    Trie树模板 + 例题
    The Shortest Path in Nya Graph HDU
  • 原文地址:https://www.cnblogs.com/Afrafre/p/11731601.html
Copyright © 2011-2022 走看看