zoukankan      html  css  js  c++  java
  • 使用Selenium模拟浏览器抓取斗鱼直播间信息

    获取斗鱼直播间每个房间的名称、观看人数、tag、主播名字

    代码:

    import time
    from multiprocessing import Pool
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    from pymongo.errors import PyMongoError
    
    
    # MongoDB connection settings.
    MONGO_HOST = "localhost"
    MONGO_DATABASE = "douyu"
    MONGO_TABLE = "zhibo"

    # PhantomJS command-line options: keep the disk cache on, skip image loading.
    # See http://phantomjs.org/api/command-line.html
    SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']

    # Timeout, in seconds, handed to the shared explicit wait below.
    delay = 10

    # Shared MongoDB client and database handle.
    client = MongoClient(host=MONGO_HOST)
    db = client[MONGO_DATABASE]

    # Browser without a visible window, shared by every function in this script.
    # (webdriver.Chrome() can be used instead to get a visible window.)
    driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    driver.maximize_window()
    wait = WebDriverWait(driver, delay)
    
    
    def get_total_pages():
        """Open the room directory, store page 1, and return the page count.

        Loads the "all rooms" directory in the shared driver, reads the page
        count from the pager, then parses the rooms of the page that is
        currently displayed and saves them to MongoDB.

        Returns:
            int: the number shown in the pager item that directly follows the
            ``.shark-pager-dot`` element.

        Raises:
            TimeoutException: if that pager item does not appear before the
                shared wait times out.
            ValueError: if the pager item's text is not an integer.
        """
        url = 'https://www.douyu.com/directory/all'
        driver.get(url)
        # Use the shared explicit wait, like the other functions here, instead
        # of querying the DOM right after get(): the pager may not be in the
        # document yet at that point.
        last_page_item = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.shark-pager-dot + .shark-pager-item')))
        pages = int(last_page_item.text)
        print("正在获取第1页数据")
        room_list = get_rooms_by_beautifulsoup()
        save_to_monogodb(room_list)
        return pages
    
    
    # Fetch the directory page with the given number and save its rooms.
    def parse_page(page_num, max_retries=3):
        """Jump to page ``page_num`` through the pager and store its rooms.

        Types the page number into the pager's jump box, clicks the submit
        button, waits until the pager marks that page as current, then parses
        the rooms and saves them to MongoDB.

        Args:
            page_num (int): page number to load.
            max_retries (int): how many extra attempts to make after a
                timeout. The original retried without limit through
                recursion, which could only end in a RecursionError for a
                page that never loads.

        Returns:
            None. A page that still times out after ``max_retries`` retries is
            reported and skipped.
        """
        retries_left = max_retries
        while True:
            print("正在获取第%d页数据" % page_num)
            try:
                page_num_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "input.jumptxt")))
                go_btn = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'a.shark-pager-submit')))
                page_num_box.clear()
                page_num_box.send_keys(str(page_num))
                go_btn.click()
                # A CSS selector is used because By.CLASS_NAME rejects
                # compound class names ("invalid selector: Compound class
                # names not permitted").
                # NOTE: this condition is a substring match on the element's
                # text, so e.g. "2" is also satisfied by a current page "12".
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR,
                         '.shark-pager-item.current'),
                        str(page_num)))
                room_list = get_rooms_by_beautifulsoup()
                save_to_monogodb(room_list)
                return
            except TimeoutException:
                print("请求第%d页失败" % page_num)
                if retries_left <= 0:
                    # Give up on this page so the caller can move on.
                    print("第%d页重试%d次后仍然失败,已跳过"
                          % (page_num, max_retries))
                    return
                retries_left -= 1
                print("尝试重新获取第%d页" % page_num)
    
    
    # Parse the page currently loaded in the driver with bs4.
    def get_rooms_by_beautifulsoup():
        """
        Generator: parse the driver's current page with bs4 and yield one dict
        per live room holding its name, viewer count, tag and host name.
        Nothing runs until the generator is first iterated.
        """
        room_selector = "ul#live-list-contentbox > li"
        # Block until at least one room entry is present in the DOM.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, room_selector)))

        document = BeautifulSoup(driver.page_source, 'lxml')
        for item in document.select(room_selector):
            title_node = item.find('h3', class_='ellipsis')
            viewers_node = item.find('span', class_='dy-num fr')
            tag_node = item.find('span', class_='tag ellipsis')
            host_node = item.find('span', class_='dy-name ellipsis fl')
            yield {
                'room_name': title_node.get_text(strip=True),
                'view_count': viewers_node.text,
                'tag': tag_node.text,
                'hostname': host_node.text,
            }
    
    
    def save_to_monogodb(room_list):
        """Insert each room document from ``room_list`` into MongoDB.

        Args:
            room_list: iterable of dicts (for example the generator returned
                by get_rooms_by_beautifulsoup); each dict is written as one
                document to the ``MONGO_TABLE`` collection of ``db``.

        A failed insert is reported and the remaining rooms are still
        processed; PyMongoError is not propagated.
        """
        collection = db[MONGO_TABLE]
        for room in room_list:
            try:
                # insert_one replaces Collection.insert, which is deprecated
                # in PyMongo 3 and removed in PyMongo 4.
                collection.insert_one(room)
                print("mongodb插入数据成功:", room)
            except PyMongoError as e:
                print("mongodb插入数据失败:", room, e)
    
    
    if __name__ == '__main__':
        # Bound before the try so the summary in ``finally`` cannot raise
        # NameError (and skip the browser shutdown) when the very first call
        # fails.
        total_pages = 0
        try:
            # Page 1 is stored by get_total_pages(); continue from page 2.
            total_pages = get_total_pages()
            for page_num in range(2, total_pages + 1):
                parse_page(page_num)
        except Exception as e:
            print("出错了", e)
        finally:
            print("共有%d页" % total_pages)
            # quit() ends the whole driver session and its browser process;
            # close() would only close the current window.
            driver.quit()
  • 相关阅读:
    jQuery的实用技巧
    jQuery中的Ajax
    jQuery中的动画
    jQuery中的事件
    jQuery中的DOM操作
    详解jQuery的选择器
    微信小程序开发教程(九)视图层——.wxss详解
    微信小程序开发教程(八)视图层——.wxml详解
    14 组合查询
    13 创建高级联结
  • 原文地址:https://www.cnblogs.com/hupeng1234/p/7138551.html
Copyright © 2011-2022 走看看