zoukankan      html  css  js  c++  java
  • 爬取梨视频 (crawl videos from pearvideo.com)

    
    import re
    import redis
    import requests,time
    
    from setting import PAGE,CATEGORY_ID,START,MAIN_URL,DETAIL_URL
    
    from concurrent.futures import ThreadPoolExecutor
    
    from myredis import POOL
    class CrawlVideo():
        """Crawl short videos from pearvideo.com.

        Listing pages are fetched synchronously to collect video ids; the
        detail-page requests and the actual video downloads are dispatched
        to a shared thread pool. Every downloaded video URL is recorded in
        Redis so the same video is never downloaded twice.
        """

        # One shared pool serves both detail-page requests and downloads.
        pools = ThreadPoolExecutor(100)

        def __init__(self, page=PAGE):
            self.page = page                   # videos per listing page
            self.video_info_dic_list = []      # filled by get_detail callbacks
            self.conn = redis.Redis(connection_pool=POOL)

        def async_download(self, video_dic):
            """Download one video to disk unless its URL is already in Redis.

            video_dic: dict with keys "title" and "video_link" (built by
            get_detail).
            """
            video_link = video_dic["video_link"]
            if self.conn.get(video_link):
                return  # already downloaded on a previous run
            video_name = video_dic["title"][:3]
            response = requests.get(video_link)
            if response.status_code == 200:
                with open("%s.mp4" % video_name, "wb")as f:
                    f.write(response.content)
                # Mark as done only after the file was fully written, so a
                # failed download is retried next run.
                self.conn.set(video_link, video_link)

        def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
            """Crawl ``num`` videos of ``category_id`` beginning at offset ``start``."""
            crawl_ids_list = self.crawl_videolist(category_id, start, num)
            print(len(crawl_ids_list))
            self.get_video_info(crawl_ids_list)
            i = 0
            # get_detail callbacks append dicts asynchronously; poll until one
            # download per crawled id has been dispatched to the pool.
            while i < len(crawl_ids_list):
                try:
                    video_dic = self.video_info_dic_list.pop()
                    self.pools.submit(self.async_download, video_dic)
                    i += 1
                except IndexError:
                    # No detail result ready yet -- wait briefly and retry.
                    # (Catching only IndexError so real bugs are not hidden.)
                    time.sleep(0.2)

        def get_video_ids(self, category_id, start):
            """Return the video ids found on one listing page ([] on failure)."""
            main_url = MAIN_URL.format(category_id, start)
            try:
                response = requests.get(main_url)
                # BUG FIX: the pattern must be a raw string with \d+;
                # the original 'video_d+' matched the literal text
                # "video_d" and never a real id like "video_1234567".
                return re.findall(r'<a href="(video_\d+)"', response.text)
            except Exception:
                # Best effort: a failed page simply contributes no ids.
                # (Was `pass`, which returned None and crashed the caller's
                # list.extend with a TypeError.)
                return []

        # Crawl the list of individual video ids; per-id requests follow.
        def crawl_videolist(self, category_id, start, num):
            """Collect at most ``num`` video ids across consecutive listing pages."""
            crawl_ids_list = []
            page_num = self.get_page_num(num)
            for i in range(page_num):
                video_id_list = self.get_video_ids(category_id, start)
                crawl_ids_list.extend(video_id_list)
                start += self.page
            # Drop any surplus ids picked up from the last page.
            while len(crawl_ids_list) > num:
                crawl_ids_list.pop()
            return crawl_ids_list

        def get_detail(self, obj):
            """Done-callback for async_request: parse title and video URL
            out of the detail page and queue them for download."""
            response = obj.result()
            dic = {}
            title = re.search('<title>(.*?)</title>', response.text).group(1)
            video_link = re.search('srcUrl="(.*?)"', response.text).group(1)
            dic["title"] = title
            dic["video_link"] = video_link
            self.video_info_dic_list.append(dic)

        def async_request(self, url, video_addr):
            """Fetch one detail page; runs inside the thread pool."""
            response = requests.get(url.format(video_addr))
            return response

        def get_video_info(self, video_id_list):
            """Dispatch a detail-page request for every crawled video id."""
            url = DETAIL_URL
            try:
                for video_addr in video_id_list:
                    obj = self.pools.submit(self.async_request, url, video_addr)
                    obj.add_done_callback(self.get_detail)
            except Exception as e:
                print(e)

        def get_page_num(self, num):
            """Return how many listing pages are needed to cover ``num`` videos."""
            if num % self.page == 0:
                page_num = num / self.page
            elif num <= self.page:
                page_num = 1
            else:
                page_num = num // self.page + 1
            return int(page_num)
    
    
    
    # Run the crawler only when executed as a script, not on import.
    if __name__ == "__main__":
        crawl = CrawlVideo()
        crawl.download_video(start=1, num=2)
    
    
  • 相关阅读:
    设计模式系列
    Python3 系列之 可变参数和关键字参数
    设计模式系列
    【HANA系列】SAP HANA ODBC error due to mismatch of version
    【FICO系列】SAP FICO FS00修改科目为未清项目管理
    【FIORI系列】SAP OpenUI5 (SAPUI5) js框架简单介绍
    【HANA系列】SAP HANA SQL获取当前日期加若干天后的日期
    【HANA系列】SAP HANA SQL获取本周的周一
    【HANA系列】SAP HANA SQL获取当前日期
    【HANA系列】SAP HANA SQL获取当前日期最后一天
  • 原文地址:https://www.cnblogs.com/robert-zhou/p/10685764.html
Copyright © 2011-2022 走看看