  • Beautiful Soup: multithreaded crawl of all Douyu streamer info (improved version)

    I spent a little time improving the code. Here it is:

    import requests
    from bs4 import BeautifulSoup
    import pymongo
    import lxml
    import time, datetime
    
    class douyu_host_info():
        def __init__(self):
            self.url_host = 'https://www.douyu.com'
            self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
            self.url_list = []
            self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            }
            # Category URL suffixes; since they never change, the list was saved verbatim after the first crawl
            self.categorys_list =[
                 '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF', '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA',
                 '/g_WOW', '/g_nsh', '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs', '/g_TVgame',
                 '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF', '/g_DG', '/directory/category/djry', '/g_wzry',
                 '/g_jdqscjzc', '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ', '/g_phone', '/g_LRSZQ',
                 '/g_mhmnz', '/g_CFSY', '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy', '/g_yqk', '/g_HW',
                 '/g_ms', '/g_music', '/g_ip', '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance', '/g_kepu',
                 '/g_js', '/g_car', '/g_jlp', '/g_tkx', '/directory/sport/cate', '/g_FM233', '/g_yydt', '/g_lianmaihudong',
                 '/g_qinggan', '/directory/category/voice', '/g_znl'
            ]
    
        def Mongodb_set(self, sheet_name, r_data):
            # insert one record into the given collection of the local 'douyu' database
            client = pymongo.MongoClient('localhost', 27017)
            douyu = client['douyu']
            sheet_name = douyu[sheet_name]
            print(r_data)
            sheet_name.insert_one(r_data)
    
        def get_url_list(self):
            for category in self.categorys_list:
                category_url = self.url_host + category
                self.url_list.append(category_url)
                self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
            return self.url_list
    
        def get_host_info(self, url):
            time.sleep(0.2)
            print('Now start open {}'.format(url))
            wb_data = None
            for i in range(3):
                try:
                    wb_data = requests.get(url, headers=self.headers)
                    break
                except requests.RequestException:
                    # transient network error: retry up to 3 times
                    print('network error, retry {} of 3'.format(i + 1))
            if wb_data is None:
                print('giving up on {} after 3 failed attempts'.format(url))
                return

            soup = BeautifulSoup(wb_data.text, 'lxml')
            print('start analyzing url')
            try:
                category = soup.select('h1')[0].get_text()
            except IndexError:
                # no <h1> on the page: fall back to a placeholder category
                category = '未定義類別'
            names = soup.select('.ellipsis.fl')
            nums = soup.select('.dy-num.fr')
            titles = soup.select('.mes h3')
            hrefs = soup.select('#live-list-contentbox li a')
            for name, num, href, title in zip(names, nums, hrefs, titles):
                data = {
                    '類別': category,
                    '主播': name.get_text(),
                    '标题': title.get_text().split('\n')[-1].strip(),
                    '链接': self.url_host + href.get('href'),
                    # popularity in units of 万 (10,000): strip the 万 suffix,
                    # otherwise divide a raw count by 10000
                    '人氣指數': float(num.get_text()[:-1]) if '万' in num.get_text() else float(num.get_text()) / 10000,
                    '當前時間': self.date_time
                }
                # only store streamers whose popularity exceeds 2万
                if data['人氣指數'] > 2:
                    self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time), r_data=data)
    
        def db_check(self, sheetname, key_word):
            # print every record in the collection matching the pymongo filter dict
            client = pymongo.MongoClient('localhost', 27017)
            douyu = client['douyu']
            sheetname = douyu[sheetname]
            for data in sheetname.find(key_word):
                print(data)
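
    The 人氣指數 (popularity) expression in the dict above packs the normalization into one line: Douyu displays viewer counts either as a raw number or with a 万 (10,000) suffix, and both forms are converted to units of 万 before the > 2 threshold is applied. A minimal standalone sketch of the same idea (the helper name is mine, not part of the original script):

    def popularity_in_wan(text):
        # '3.5万' -> 3.5, '5000' -> 0.5 (both in units of 万, i.e. 10,000)
        if '万' in text:
            return float(text.rstrip('万'))
        return float(text) / 10000

    assert popularity_in_wan('3.5万') == 3.5
    assert popularity_in_wan('5000') == 0.5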
    
    The runner script below imports the class, builds the category URL list, and fans the requests out across a process pool:

    from multiprocessing import Pool
    from douyu_host_2 import douyu_host_info
    
    douyu = douyu_host_info()
    
    def data_check():
        # example filter with a range query: {u'當前時間': '20180901 10:58', u'人氣指數': {'$gte': 30}}
        # sheetname = input('Which sheet do you want to check? ')
        sheetname = 'host_info_20180901_1530'
        # key_word = input('What do you want to check with? ')
        key_word = {'類別': 'DOTA2'}
        douyu.db_check(sheetname=sheetname, key_word=key_word)
    
    
    def w_to_db():
        pool = Pool()  # one worker process per CPU core by default
        url_list = douyu.get_url_list()
        pool.map(douyu.get_host_info, url_list)
        pool.close()
        pool.join()
    
    
    if __name__ == '__main__':
        w_to_db()
        data_check()
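
    Since db_check passes key_word straight to pymongo's find(), MongoDB query operators work unchanged; the commented-out filter in data_check hints at this. For example, against the sheet name hard-coded above:

    # streamers in that snapshot with a popularity of at least 30万
    douyu.db_check(sheetname='host_info_20180901_1530',
                   key_word={'人氣指數': {'$gte': 30}})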
    

      This crawler does not handle pagination; it only scrapes the front page of each category. To page through every streamer, refer to this script:

      https://www.cnblogs.com/lkd8477604/p/9848958.html
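
      The usual shape of such a pagination loop is to bump a page counter until a page comes back empty. A hypothetical sketch as an extra method on douyu_host_info, assuming the category pages honor a ?page=N query parameter (the real Douyu paging scheme may differ; see the linked script):

      def get_category_all_pages(self, category_url, max_pages=50):
          # walk pages until one has no streamer entries; the ?page=N scheme
          # is an assumption, not verified against the live site
          for page in range(1, max_pages + 1):
              page_url = '{}?page={}'.format(category_url, page)
              wb_data = requests.get(page_url, headers=self.headers)
              soup = BeautifulSoup(wb_data.text, 'lxml')
              if not soup.select('.ellipsis.fl'):
                  break  # no streamer names found: past the last page
              self.get_host_info(page_url)  # re-fetches the page, kept simple for the sketch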
