zoukankan      html  css  js  c++  java
  • 爬取梨视频主页所有视频

    import requests
    import re
    import uuid
    from concurrent.futures import ThreadPoolExecutor
    
    # Shared thread pool (at most 50 worker threads). The script submits both
    # the detail-page requests and the video downloads to this one executor.
    pool = ThreadPoolExecutor(50)
    
    
    # 爬虫三部曲
    # 1.发送请求
    def get_html(url, timeout=10):
        """Send a GET request to ``url`` and return the response object.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the connection and for each read
                before ``requests`` gives up and raises a timeout error.

        Returns:
            The response returned by ``requests.get``.
        """
        print(f'start: {url}...')
        # requests applies no timeout by default, so a server that stops
        # answering would block the calling thread indefinitely.
        return requests.get(url, timeout=timeout)
    
    # 2.解析数据
    # 解析主页,获取视频详情页url
    def parse_index(response):
        """Collect the distinct video ids linked from the index page.

        Scans ``response.text`` for anchors shaped like
        ``<a href="video_1637397"`` and captures the part that follows
        ``video_``.

        Returns:
            A list of unique id strings; the order is not defined because
            duplicates are removed through a set.
        """
        id_pattern = re.compile('<a href="video_(.*?)"', re.S)
        unique_ids = set(id_pattern.findall(response.text))
        return list(unique_ids)
    
    
    # from concurrent.futures._base import Future
    # 解析视频详情页,获取真实视频url
    def parse_detail(res):
        """Extract the real video URL from a detail page and queue its download.

        Args:
            res: A finished future whose result is the detail-page response
                (this is what ``add_done_callback`` hands to its callback).

        Returns:
            None. When a video URL is found, a ``save_movie`` task is
            submitted to ``pool``; otherwise the page is skipped with a
            message.
        """
        # A failed request is stored on the future; result() would re-raise it
        # here, so check for it first and skip the page instead.
        error = res.exception()
        if error is not None:
            print(f'detail request failed: {error!r}')
            return

        response = res.result()
        # The first srcUrl="..." occurrence holds the address of the video file.
        match = re.search('srcUrl="(.*?)"', response.text, re.S)
        if match is None:
            # Indexing an empty findall() result used to raise IndexError here.
            print('no srcUrl found in detail page, skipping')
            return
        movie_url = match.group(1)

        print('是否到此处了')
        # Download and save the actual video data asynchronously.
        pool.submit(save_movie, movie_url)
    
    
    # 3.保存数据
    def save_movie(movie_url):
        """Download the video at ``movie_url`` and write it to a new ``.mp4`` file.

        The file is created in the current working directory and named with
        a random UUID4, so every call produces a distinct file.

        Args:
            movie_url: Direct URL of the video file.
        """
        print('start')
        movie_response = get_html(movie_url)

        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            # iter_content() yields 1-byte chunks by default, which means one
            # write call per byte; use 64 KiB chunks instead.
            for chunk in movie_response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

        print('end...')
    
    if __name__ == '__main__':
        import time
        from concurrent.futures import as_completed

        index_url = 'https://www.pearvideo.com/'
        response = get_html(index_url)
        # 1. Parse the home page and extract the id of every video detail page.
        movie_id_list = parse_index(response)

        # 2. Fetch all detail pages concurrently.
        detail_futures = []
        for movie_id in movie_id_list:
            detail_url = 'https://www.pearvideo.com/video_' + movie_id
            # Short pause between submissions to throttle the request rate.
            time.sleep(0.1)
            detail_futures.append(pool.submit(get_html, detail_url))

        # 3. Handle each detail page in the main thread as soon as it finishes.
        # When parse_detail ran as a done-callback on a worker thread, the main
        # thread could reach the end of the script first; the executor refuses
        # new submissions once interpreter shutdown has begun, so the download
        # tasks for the last pages were never scheduled.
        for finished in as_completed(detail_futures):
            try:
                parse_detail(finished)
            except Exception as exc:
                # One bad page must not abort the remaining downloads.
                print(f'skipping detail page: {exc!r}')

        # 4. Wait for every queued download to complete before exiting.
        pool.shutdown(wait=True)
    
  • 相关阅读:
    取石子(二)巴仕博弈+尼姆博弈
    hdu2430Beans(单调队列)
    LCD: 2D-3D匹配算法
    如何选择视觉CV光源颜色
    gpgpu-sim卡分配程序设计实例分析
    PointRCNN: 点云的3D目标生成与检测
    3D点云重建原理及Pytorch实现
    GPU加速计算
    红外传感器技术
    Linux架构思维导图
  • 原文地址:https://www.cnblogs.com/chanyuli/p/12135616.html
Copyright © 2011-2022 走看看