# 爬虫 - 斗鱼房间信息 (Crawler: Douyu live-room information)
'''爬取斗鱼直播所有的房间信息,
https://www.douyu.com/gapi/rkc/directory/0_0/1 还可直接获取json数据
'''
import time
from selenium import webdriver
from fake_useragent import UserAgent
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
class DouYuRoom(object):
    """Scrape category and URL for every live room listed on Douyu.

    Walks https://www.douyu.com/directory/all page by page with a Selenium
    Chrome driver, collecting one dict per room. Note (from the module
    docstring): https://www.douyu.com/gapi/rkc/directory/0_0/1 serves the
    same data as JSON, which would avoid browser automation entirely.
    """

    def __init__(self):
        # Landing page that lists all live rooms.
        self.url = 'https://www.douyu.com/directory/all'
        # PhantomJS support was dropped from Selenium; use Chrome.
        # self.driver = webdriver.PhantomJS(r'F:/tools/phantomjs-2.1.1-windows/bin/phantomjs.exe')
        self.driver = webdriver.Chrome()
        self.content_list = []

    def get_content_list(self):
        """Extract room data from the page currently loaded in the driver.

        Returns:
            tuple: ``(next_url, content_list)`` where ``next_url`` is the
            clickable "next page" element (or ``None`` on the last page)
            and ``content_list`` is a list of
            ``{'game': ..., 'room_url': ...}`` dicts, one per room.
        """
        # Selenium 4 API: find_elements_by_xpath was removed in 4.3.
        li_list = self.driver.find_elements(
            By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li')
        content_list = []
        for li in li_list:
            content_dic = {}
            game = li.find_element(
                By.XPATH, './/span[@class="DyListCover-zone"]').text
            room_url = li.find_element(By.XPATH, './/a').get_attribute('href')
            # ..... (more fields could be extracted here)
            content_dic['game'] = game
            content_dic['room_url'] = room_url
            content_list.append(content_dic)
        # The 9th pagination item is the "next page" button; it is absent
        # on the last page, hence find_elements + emptiness check.
        next_url = self.driver.find_elements(
            By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span')
        next_url = next_url[0] if next_url else None
        print('next_url', next_url)
        return next_url, content_list

    def save_content(self, content):
        """Persist one page of room dicts (placeholder — not implemented)."""
        pass

    def run(self):
        """Drive the scrape: load the listing, extract, paginate until done."""
        # Fetch the first page.
        self.driver.get(self.url)
        # Explicit wait instead of a fixed sleep: proceed as soon as the
        # room list is present, waiting at most 10 seconds.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li')))
        # Extract the first page of data.
        next_url, content_dic = self.get_content_list()
        print(content_dic)
        # Save each page's data as we go.
        self.save_content(content_dic)
        # Keep clicking through to the next page while one exists.
        while next_url:
            next_url.click()
            time.sleep(5)  # let the next page render before re-parsing
            next_url, content_dic = self.get_content_list()
            print(content_dic)
            self.save_content(content_dic)
        # quit() tears down the whole session; close() would only close
        # the current window and leave the driver process running.
        self.driver.quit()
if __name__ == '__main__':
    # Script entry point: build the scraper and kick off the crawl.
    scraper = DouYuRoom()
    scraper.run()