zoukankan      html  css  js  c++  java
  • 爬虫

    爬虫 - 斗鱼房间信息

    
    '''爬取斗鱼直播所有的房间信息,
    https://www.douyu.com/gapi/rkc/directory/0_0/1 还可直接获取json数据
    '''
    import time
    
    from selenium import webdriver
    from fake_useragent import UserAgent
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    
    class DouYuRoom(object):
        """Crawl the room listing of Douyu's "all rooms" directory page.

        A Chrome browser is driven through Selenium: each page of the listing
        is loaded, the game zone and link of every room card are read, and the
        pager's "next" control is clicked until the pager XPath matches nothing.
        """

        # One <li> per room card in the listing.
        _ROOM_ITEMS_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li'
        # Positional XPath of the pager control that is clicked to advance.
        _NEXT_PAGE_XPATH = '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span'

        def __init__(self):
            """Start a Chrome session and remember the directory URL."""
            self.url = 'https://www.douyu.com/directory/all'
            self.driver = webdriver.Chrome()
            # Kept for callers that read it; no method of this class fills it.
            self.content_list = []

        def get_content_list(self):
            """Extract the rooms shown on the page currently loaded in the driver.

            Returns:
                A tuple ``(next_url, content_list)``.  ``next_url`` is the first
                element matched by the pager XPath, or ``None`` when nothing
                matches; despite its name it is an element to click, not a URL.
                ``content_list`` is a list of dicts with the keys ``'game'``
                and ``'room_url'``, one per room card.
            """
            # The find_element(s)_by_* helpers no longer exist in Selenium 4.3+;
            # the By-based locator API works on old and new versions alike.
            li_list = self.driver.find_elements(By.XPATH, self._ROOM_ITEMS_XPATH)
            content_list = []
            for li in li_list:
                game = li.find_element(
                    By.XPATH, './/span[@class="DyListCover-zone"]').text
                room_url = li.find_element(By.XPATH, './/a').get_attribute('href')
                # Further per-room fields can be extracted here.
                content_list.append({'game': game, 'room_url': room_url})

            next_candidates = self.driver.find_elements(
                By.XPATH, self._NEXT_PAGE_XPATH)
            next_url = next_candidates[0] if next_candidates else None
            print('next_url', next_url)
            return next_url, content_list

        def save_content(self, content):
            """Persist one page of extracted rooms (placeholder, does nothing yet)."""
            pass

        def run(self):
            """Crawl every page of the listing, then shut the browser down."""
            try:
                self.driver.get(self.url)
                # Fixed pause to let the page finish loading; an explicit or
                # implicit wait could be used instead.
                time.sleep(10)
                # NOTE(review): the loop stops only when the pager XPath matches
                # nothing.  If the site keeps a disabled "next" control on the
                # last page this never terminates -- confirm against the site.
                while True:
                    next_url, content_dic = self.get_content_list()
                    print(content_dic)
                    # Each page is saved as soon as it has been extracted.
                    self.save_content(content_dic)
                    if next_url is None:
                        break
                    next_url.click()
                    time.sleep(5)
            finally:
                # quit() ends the whole session (browser and driver process),
                # and the finally clause makes that happen on errors as well.
                self.driver.quit()
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and walk the whole listing.
        crawler = DouYuRoom()
        crawler.run()
    
    
    
  • 相关阅读:
    php笔记小结
    php知识总结(二)
    php知识点总结(一)
    js冒泡排序及计算其运行时间
    空中飘动的云动画
    网站建设教程:WordPress如何在虚拟主机上安装
    网站建设教程之PageAdmin建站系统的安装
    免费CMS建站系统哪个比较好?如何选择?
    网站建设之自助建站系统的选择?
    企业外贸网站制作的要求及注意事项
  • 原文地址:https://www.cnblogs.com/Afrafre/p/11731601.html
Copyright © 2011-2022 走看看