zoukankan      html  css  js  c++  java
  • Crawling Pear Video (pearvideo.com)

    
    import re
    import redis
    import requests,time
    
    from setting import PAGE,CATEGORY_ID,START,MAIN_URL,DETAIL_URL
    
    from concurrent.futures import ThreadPoolExecutor
    
    from myredis import POOL
    class CrawlVideo():
        """Crawler for pearvideo.com.

        Workflow: collect video ids from listing pages, resolve each detail
        page concurrently for the title and mp4 link, then download every
        video through a shared thread pool. A Redis connection (via POOL)
        records already-downloaded links so reruns skip duplicates.
        """

        # Shared pool used for both detail-page requests and downloads.
        pools = ThreadPoolExecutor(100)

        def __init__(self, page=PAGE):
            # page: number of video ids per listing page (pagination step).
            self.page = page
            # Buffer of {"title": ..., "video_link": ...} dicts appended by
            # the get_detail callback; drained by download_video.
            self.video_info_dic_list = []
            self.conn = redis.Redis(connection_pool=POOL)

        def async_download(self, video_dic):
            """Download one video to <title[:3]>.mp4 unless already in Redis."""
            video_link = video_dic["video_link"]
            if self.conn.get(video_link):
                return  # already downloaded on a previous run
            video_name = video_dic["title"][:3]
            response = requests.get(video_link)
            if response.status_code == 200:
                with open("%s.mp4" % video_name, "wb") as f:
                    f.write(response.content)
                # Record the link only after a successful write so a failed
                # download is retried next run.
                self.conn.set(video_link, video_link)

        def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
            """Crawl `num` video ids from `category_id` starting at offset
            `start`, then download each resolved video via the pool."""
            crawl_ids_list = self.crawl_videolist(category_id, start, num)
            print(len(crawl_ids_list))
            self.get_video_info(crawl_ids_list)
            i = 0
            while i < len(crawl_ids_list):
                try:
                    video_dic = self.video_info_dic_list.pop()
                except IndexError:
                    # BUG FIX: was a bare `except Exception`, which also
                    # swallowed real submit errors. Only the empty-list pop
                    # is expected here: detail callbacks haven't produced a
                    # result yet, so wait briefly and retry.
                    time.sleep(0.2)
                else:
                    self.pools.submit(self.async_download, video_dic)
                    i += 1

        def get_video_ids(self, category_id, start):
            """Fetch one listing page and return the video ids found on it.

            Returns an empty list on network failure so callers can safely
            extend() the result (the original returned None and crashed).
            """
            main_url = MAIN_URL.format(category_id, start)
            try:
                response = requests.get(main_url)
            except Exception:
                # Best-effort: a failed page simply contributes no ids.
                return []
            # BUG FIX: original pattern was 'video_d+' (missing backslash)
            # and could never match; detail links look like href="video_1234567".
            return re.findall(r'<a href="(video_\d+)"', response.text)

        # Crawl the list of single-video ids; detail requests are then issued per id.
        def crawl_videolist(self, category_id, start, num):
            """Collect exactly `num` video ids across as many listing pages
            as needed, advancing `start` by one page per request."""
            crawl_ids_list = []
            page_num = self.get_page_num(num)
            for i in range(page_num):
                video_id_list = self.get_video_ids(category_id, start)
                crawl_ids_list.extend(video_id_list)
                start += self.page
            # Trim overshoot from the final page down to exactly `num` ids.
            del crawl_ids_list[num:]
            return crawl_ids_list

        def get_detail(self, obj):
            """Done-callback: parse the title and mp4 URL out of a finished
            detail-page future and queue them for download."""
            response = obj.result()
            dic = {}
            dic["title"] = re.search('<title>(.*?)</title>', response.text).group(1)
            dic["video_link"] = re.search('srcUrl="(.*?)"', response.text).group(1)
            self.video_info_dic_list.append(dic)

        def async_request(self, url, video_addr):
            """Fetch one detail page (runs inside the thread pool)."""
            response = requests.get(url.format(video_addr))
            return response

        def get_video_info(self, video_id_list):
            """Submit a detail-page request for every id; get_detail runs as
            each future completes and fills video_info_dic_list."""
            url = DETAIL_URL
            try:
                for video_addr in video_id_list:
                    obj = self.pools.submit(self.async_request, url, video_addr)
                    obj.add_done_callback(self.get_detail)
            except Exception as e:
                print(e)

        def get_page_num(self, num):
            """Return how many listing pages are needed to cover `num` ids.

            Ceiling division; matches the original three-branch logic for all
            non-negative inputs (including num=0 -> 0 pages).
            """
            return -(-num // self.page)
    
    
    
    # Guard the driver so importing this module doesn't kick off network
    # requests and Redis writes as a side effect.
    if __name__ == "__main__":
        crawl = CrawlVideo()
        crawl.download_video(start=1, num=2)
    
    
  • 相关阅读:
    第一个Polymer应用
    Android-注解处理器
    怎样启动JDBC Debug模式,打印JDBC诊断日志
    OA项目之权限设计②
    Redis安装教程
    Skyscrapers Aren’t Scalable
    codeforces 183B
    iOS-NSMutableParagraphStyle段落风格
    【Java集合源代码剖析】Java集合框架
    经常使用的Hql语句
  • 原文地址:https://www.cnblogs.com/robert-zhou/p/10685764.html
Copyright © 2011-2022 走看看