zoukankan      html  css  js  c++  java
  • 爬取校花网的视频

    from requests_html import HTMLSession
    import os
    # Single HTMLSession shared by every session.get() call in the functions below.
    session = HTMLSession()
    
    # http://www.xiaohuar.com/list-3-0.html
    def get_index_page():
        """Yield the URL of each index (listing) page, for page numbers 0 through 5."""
        # Each URL has the form http://www.xiaohuar.com/list-3-<n>.html
        yield from (
            'http://www.xiaohuar.com/list-3-%s.html' % page_no
            for page_no in range(6)
        )
    
    #获取
    # url= "http://www.xiaohuar.com/list-3-0.html"
    # r = session.get(url=url)
    # for element in r.html.find('#images a[class="imglink"]'):
    #     print(element.attrs.get('href'))
    
    
    def get_detail_page(url):
        """Fetch one index page and yield the href of every detail-page link on it.

        Links are the anchors matched by '#images a[class="imglink"]'. An anchor
        with no href attribute yields None.
        """
        response = session.get(url=url)
        anchors = response.html.find('#images a[class="imglink"]')
        yield from (anchor.attrs.get('href') for anchor in anchors)
    
    #测试解析详情页获取视频url,名字
    # url = 'http://www.xiaohuar.com/p-3-136.html'
    # r = session.get(url=url)
    # r.html.encoding = "gbk"
    # file_name = r.html.find('title',first=True).text.replace('\\','')
    # print(file_name)
    #
    # element = r.html.find('#media source',first=True)
    # if element:
    #     mp4_url = element.attrs.get('src')
    # else:
    #     m3u8_url = r.html.search('var vHLSurl    = "{}";')[0]
    #     print(m3u8_url)
    
    
    def get_url_name(url):
        """Fetch a detail page and return ``(file_name, vurl, vtype)``.

        Args:
            url: URL of the detail page to fetch.

        Returns:
            A 3-tuple:
            file_name -- text of the page's <title>, with backslashes removed;
            vurl      -- the video URL found on the page;
            vtype     -- 'mp4' when the page has a '#media source' element
                         (its src attribute is used), otherwise 'm3u8' (the URL
                         is taken from the page's ``var vHLSurl`` assignment).

        Raises:
            ValueError: if the page has neither a '#media source' element nor a
                ``var vHLSurl`` assignment.
        """
        r = session.get(url=url)
        # Force GBK decoding before any text is read from the page.
        r.html.encoding = "gbk"
        # '\\' is one literal backslash. The earlier '\' escaped the closing
        # quote and made the whole file a syntax error.
        file_name = r.html.find('title', first=True).text.replace('\\', '')
        print(file_name)
        element = r.html.find('#media source', first=True)
        if element:
            vurl = element.attrs.get('src')
            vtype = 'mp4'
        else:
            found = r.html.search('var vHLSurl    = "{}";')
            if found is None:
                # Fail with a readable message instead of "None is not subscriptable".
                raise ValueError('no video URL found on page: %s' % url)
            vurl = found[0]
            vtype = 'm3u8'
        return file_name, vurl, vtype
    
    def save(file_name,vurl,vtype):
        """Save the video at vurl under file_name, according to vtype.

        'mp4'  -- download vurl and write it to '<file_name>.mp4'.
        'm3u8' -- delegate to save_m3u8(file_name, vurl).
        Any other vtype does nothing.
        """
        if vtype == "m3u8":
            save_m3u8(file_name, vurl)
            return
        if vtype != "mp4":
            return
        target = file_name + ".mp4"
        response = session.get(url=vurl)
        with open(target, 'wb') as out:
            out.write(response.content)
    
    def save_m3u8(file_name,vurl):
        """Download an m3u8 playlist and the .ts segments it lists.

        Creates the directory file_name if needed, stores the playlist there as
        'playlist.m3u8', then downloads every playlist entry ending in 'ts' into
        the same directory.

        Args:
            file_name: directory that receives the playlist and segments.
            vurl: URL of the m3u8 playlist.
        """
        # Local import so this function does not rely on a new file-level import.
        from urllib.parse import urljoin

        os.makedirs(file_name, exist_ok=True)
        r = session.get(url=vurl)
        m3u8_path = os.path.join(file_name, 'playlist.m3u8')
        with open(m3u8_path, 'wb') as f:
            f.write(r.content)
        # Iterate over LINES of the playlist. Iterating r.text directly yields
        # single characters, so no entry could ever end with 'ts'.
        for line in r.text.splitlines():
            line = line.strip()
            # Lines starting with '#' are playlist tags/comments, not segments.
            if not line or line.startswith('#') or not line.endswith('ts'):
                continue
            # Resolve the entry against the playlist URL, so this also works
            # when the playlist is not literally named 'playlist.m3u8'.
            ts_url = urljoin(vurl, line)
            # Use only the final path component so entries with sub-paths or
            # absolute URLs still land inside file_name.
            ts_path = os.path.join(file_name, os.path.basename(line))
            r0 = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(r0.content)
    
    
    if __name__ == '__main__':
        # Walk every index page, follow each detail link, and save its video.
        for index_url in get_index_page():
            for detail_link in get_detail_page(index_url):
                name, video_url, video_type = get_url_name(detail_link)
                save(name, video_url, video_type)
    
    #  上述的for循环之所以可以直接嵌套遍历,是因为get_index_page和get_detail_page都使用了yield(即生成器)!建议使用,看起来大气
    知识点补充:
    
    # print(str('电影'.encode('utf-8')).strip("b'").upper().replace('X','%'))
    
    #    前端页面对中文的参数的编码原理
    
    
    
    视频以m3u8结尾的,需要我们再进一步处理!拿到里面片段的ts文件!

    因为存在视频链接是以index结尾的(而不是playlist.m3u8),所以需要进一步完善!

  • 相关阅读:
    [整] Android Fragment 生命周期图
    [原]Android Fragment 入门介绍
    [原]Android开发优化-Adapter优化
    [整] Android ListView 去除边缘阴影、选中色、拖动背景色等
    [整]Android开发优化-布局优化
    [原]Android 初遇Http错误 httpClient.execute
    [转]Android ANR 分析解决方法
    [整]Android SlidingMenu Demo 环境搭建
    RabbitMQ教程
    【centos7】添加开机启动服务/脚本
  • 原文地址:https://www.cnblogs.com/changwenjun-666/p/11324412.html
Copyright © 2011-2022 走看看