zoukankan      html  css  js  c++  java
  • 爬取校花网的视频

    from requests_html import HTMLSession
    import os
    session = HTMLSession()
    
    # http://www.xiaohuar.com/list-3-0.html
    #获取索引页url
    def get_index_page():
        """Yield the URL of each listing (index) page, for page numbers 0 through 5."""
        template = 'http://www.xiaohuar.com/list-3-%s.html'
        yield from (template % page_no for page_no in range(6))
    
    #获取
    # url= "http://www.xiaohuar.com/list-3-0.html"
    # r = session.get(url=url)
    # for element in r.html.find('#images a[class="imglink"]'):
    #     print(element.attrs.get('href'))
    
    
    #解析索引页获取详情页url
    def get_detail_page(url):
        """Fetch one index page and yield the ``href`` of each detail-page link.

        The links are the anchors matched by the CSS selector
        ``#images a[class="imglink"]``; the value yielded for each anchor is
        whatever ``attrs.get('href')`` returns for it.
        """
        response = session.get(url=url)
        anchors = response.html.find('#images a[class="imglink"]')
        yield from (anchor.attrs.get('href') for anchor in anchors)
    
    #测试解析详情页获取视频url,名字
    # url = 'http://www.xiaohuar.com/p-3-136.html'
    # r = session.get(url=url)
    # r.html.encoding = "gbk"
    # file_name = r.html.find('title',first=True).text.replace('\\','')
    # print(file_name)
    #
    # element = r.html.find('#media source',first=True)
    # if element:
    #     mp4_url = element.attrs.get('src')
    # else:
    #     m3u8_url = r.html.search('var vHLSurl    = "{}";')[0]
    #     print(m3u8_url)
    
    
    #解析详情页获取视频url,名字
    def get_url_name(url):
        """Fetch a detail page and extract the video's name, URL and type.

        The page is decoded as GBK. The name is the page ``<title>`` text with
        backslashes removed (it is later used as a file / directory name).

        Returns:
            A tuple ``(file_name, vurl, vtype)``. ``vtype`` is ``'mp4'`` when
            the page has a ``#media source`` element (``vurl`` is its ``src``),
            otherwise ``'m3u8'`` (``vurl`` is taken from the inline
            ``var vHLSurl    = "...";`` script variable).

        Raises:
            ValueError: if the page has neither a ``#media source`` element
                nor a ``vHLSurl`` variable.
        """
        r = session.get(url=url)
        r.html.encoding = "gbk"
        # NOTE: the backslash must be escaped; a lone '\' escapes the closing
        # quote and makes the whole file a SyntaxError.
        file_name = r.html.find('title', first=True).text.replace('\\', '')
        print(file_name)
        element = r.html.find('#media source', first=True)
        if element:
            vurl = element.attrs.get('src')
            vtype = 'mp4'
        else:
            # search() gives None when the template does not match; fail with
            # a clear message instead of an opaque "NoneType is not subscriptable".
            match = r.html.search('var vHLSurl    = "{}";')
            if match is None:
                raise ValueError('no video URL found on page: %s' % url)
            vurl = match[0]
            vtype = 'm3u8'
        return file_name, vurl, vtype
    
    #保存文件
    def save(file_name,vurl,vtype):
        """Store the video at ``vurl`` on disk according to its ``vtype``.

        ``"mp4"``: the response body is written to ``<file_name>.mp4``.
        ``"m3u8"``: the work is delegated to ``save_m3u8``.
        Any other ``vtype`` is ignored.
        """
        if vtype == "m3u8":
            save_m3u8(file_name, vurl)
            return
        if vtype != "mp4":
            return
        target = file_name + ".mp4"
        response = session.get(url=vurl)
        with open(target, 'wb') as out:
            out.write(response.content)
    
    #处理m3u8
    def save_m3u8(file_name,vurl):
        """Download an m3u8 playlist and its ``.ts`` segments into a directory.

        A directory named ``file_name`` is created if needed. The playlist
        itself is saved there as ``playlist.m3u8``, and every playlist entry
        ending in ``ts`` is downloaded next to it.

        Args:
            file_name: directory to store the playlist and segments in.
            vurl: URL of the m3u8 playlist.
        """
        # Local import so this block does not depend on the file's import header.
        from urllib.parse import urljoin

        os.makedirs(file_name, exist_ok=True)
        r = session.get(url=vurl)
        m3u8_path = os.path.join(file_name, 'playlist.m3u8')
        with open(m3u8_path, 'wb') as f:
            f.write(r.content)
        # Iterate over LINES of the playlist; iterating the string directly
        # yields single characters, so no segment would ever match.
        for line in r.text.splitlines():
            line = line.strip()
            # Lines starting with '#' are playlist tags/comments, not segments.
            if not line or line.startswith('#') or not line.endswith('ts'):
                continue
            # Resolve relative to the playlist URL; this works whatever the
            # playlist file is called (playlist.m3u8, index.m3u8, ...).
            ts_url = urljoin(vurl, line)
            # Keep only the final path component so entries containing a
            # path or a full URL still land inside the target directory.
            ts_path = os.path.join(file_name, os.path.basename(line))
            r0 = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(r0.content)
    
    
    if __name__ == '__main__':
        # Lazily walk every index page and every detail link found on it,
        # then resolve and download the video each detail page describes.
        detail_urls = (
            link
            for index_page in get_index_page()
            for link in get_detail_page(index_page)
        )
        for detail_url in detail_urls:
            file_name, vurl, vtype = get_url_name(detail_url)
            save(file_name, vurl, vtype)
    
    #  上述的for循环,是由于yield导致的!建议使用,看起来大气
    知识点补充:
    
    # print(str('电影'.encode('utf-8')).strip("b'").upper().replace('X','%'))
    
    #    前端页面对中文的参数的编码原理
    
    
    
    视频以m3u8结尾的,需要我们再进一步处理!拿到里面片段的ts文件!

    因为存在视频链接是以index结尾的,所以需要进一步完善!

  • 相关阅读:
    C# 了解当前使用的语言版本
    Swagger api 接口管理 使用总结
    Git 常见错误操作
    NestJs 详解
    推荐一个NodeJS 框架 AdonisJS
    Lavavel8.x
    @typegoose/typegoose. 使用技巧
    Typeorm
    Cocos2Dx(4)——动画
    Java程序员的C#学习笔记(1) C#和.NET Framework概览
  • 原文地址:https://www.cnblogs.com/changwenjun-666/p/11324412.html
Copyright © 2011-2022 走看看