zoukankan      html  css  js  c++  java
  • 爬取梨视频主页所有视频

    import requests
    import re
    import uuid
    from concurrent.futures import ThreadPoolExecutor
    
    pool = ThreadPoolExecutor(50)
    
    
    # 爬虫三部曲
    # 1.发送请求
    def get_html(url, timeout=10):
        """Send a GET request to ``url`` and return the response.

        Args:
            url: Address to fetch.
            timeout: Seconds to wait on the connection and on each socket
                read before requests gives up. This is not a limit on the
                total download time. Without it a stalled server would block
                the calling worker thread forever.

        Returns:
            The response object returned by ``requests.get``.
        """
        print(f'start: {url}...')
        return requests.get(url, timeout=timeout)
    
    # 2.解析数据
    # 解析主页,获取视频详情页url
    def parse_index(response):
        """Extract the unique video ids linked from the home page.

        Looks for anchors of the form ``<a href="video_1637397" ...>`` in
        ``response.text`` and captures the part after ``video_``.

        Args:
            response: Object exposing the page HTML as a ``text`` attribute.

        Returns:
            A list of id strings without duplicates, in the order in which
            they first appear in the page.
        """
        movie_ids = re.findall(
            r'<a href="video_(.*?)"',  # e.g. video_1637397 -> "1637397"
            response.text,
            re.S,
        )
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would return the ids in an order that changes from run to run.
        return list(dict.fromkeys(movie_ids))
    
    
    # from concurrent.futures._base import Future
    # 解析视频详情页,获取真实视频url
    def parse_detail(res):
        """Find the real video URL in a detail page and schedule its download.

        Args:
            res: A completed future whose ``result()`` is the detail-page
                response (an object with ``text`` and ``url`` attributes).
                When used with ``add_done_callback`` the future itself, not
                the response, is what gets passed in.

        Returns:
            None. When a ``srcUrl="..."`` entry is found, a ``save_movie``
            task for that URL is submitted to the module-level ``pool``.
        """
        response = res.result()

        # The page is expected to carry the media address as srcUrl="...".
        # Use search() so a page without it is skipped with a clear message
        # instead of raising IndexError from an empty findall() result.
        match = re.search('srcUrl="(.*?)"', response.text, re.S)
        if match is None:
            print(f'no srcUrl found: {response.url}')
            return
        movie_url = match.group(1)

        print('是否到此处了')
        # Download and save the video asynchronously.
        pool.submit(save_movie, movie_url)
    
    
    # 3.保存数据
    def save_movie(movie_url, chunk_size=64 * 1024):
        """Download ``movie_url`` and write it to a new ``<uuid4>.mp4`` file.

        The file is created in the current working directory under a random
        UUID name.

        Args:
            movie_url: Direct URL of the video file.
            chunk_size: Number of bytes written per iteration. The default of
                ``iter_content()`` is 1 byte, which makes large downloads
                extremely slow, so a bigger block is used here.
        """
        print('start')
        movie_response = get_html(movie_url)

        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            for chunk in movie_response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

        print('end...')
    
    if __name__ == '__main__':
        import time
        from concurrent.futures import as_completed

        index_url = 'https://www.pearvideo.com/'
        response = get_html(index_url)
        # 1. Parse the home page and collect the id of every linked video.
        movie_id_list = parse_index(response)

        # 2. Fetch all detail pages concurrently, pausing briefly between
        #    submissions.
        detail_futures = []
        for movie_id in movie_id_list:
            detail_url = 'https://www.pearvideo.com/video_' + movie_id
            time.sleep(0.1)
            detail_futures.append(pool.submit(get_html, detail_url))

        # 3. Hand each finished detail page to parse_detail from the main
        #    thread. If parse_detail ran as a done-callback it could call
        #    pool.submit() after the main thread has ended, and the executor
        #    rejects new work during interpreter shutdown, so those videos
        #    would never be downloaded.
        for future in as_completed(detail_futures):
            try:
                parse_detail(future)
            except Exception as exc:  # one bad page must not stop the crawl
                print(f'skip detail page: {exc!r}')

        # 4. Wait for every queued download to finish before exiting.
        pool.shutdown(wait=True)
    
  • 相关阅读:
    预习非数值数据的编码方式
    预习原码补码反码
    C语言||作业01
    C语言寒假大作战04
    关于数据库及druid连接池版本,还有相关配置异常。。。
    关于idea部署web项目出现中文乱码
    spring与mybatis整合
    mybatis使用
    今日异常(7.8):关于maven项目复制问题
    今日异常(7.6):Mybatis错误:There is no getter for property named 'xxx' in 'class java.lang.String'
  • 原文地址:https://www.cnblogs.com/chanyuli/p/12135616.html
Copyright © 2011-2022 走看看