zoukankan      html  css  js  c++  java
  • 8.梨视频数据的爬取1.py

    import re
    import requests
    from lxml import etree

    # Script: download every video listed on a Pear Video category page.
    #
    # Flow: fetch the category page -> collect each <li> entry -> open its
    # detail page -> pull the mp4 address out of the inline JavaScript with a
    # regular expression -> save the bytes to "<title>.mp4" in the current
    # working directory.

    # Browser-like User-Agent sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }

    url = "https://www.pearvideo.com/category_1"

    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the script forever.
    REQUEST_TIMEOUT = 30

    # Fetch the HTML source of the category page.
    page_text = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text

    # Parse the HTML so it can be queried with XPath.
    tree = etree.HTML(page_text)

    # Locate the <li> elements that hold the individual videos.
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    # The detail page embeds the video address in inline JavaScript, e.g.:
    #   var contId="1653941",...,ldUrl="",
    #   srcUrl="https://video.pearvideo.com/mp4/adshort/20200220/cont-1653941-14928648_adpkg-ad_hd.mp4",
    #   vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",...
    # The non-greedy group captures everything between `srcUrl="` and
    # `",vdoUrl`.
    ex = 'srcUrl="(.*?)",vdoUrl'

    # Visit each entry, resolve its video URL and download the file.
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        if not hrefs or not titles:
            # Entry without a link or a title: skip it instead of raising
            # IndexError and aborting the whole run.
            continue

        detail_url = "https://www.pearvideo.com/" + hrefs[0]
        # Replace characters that are not allowed in file names (path
        # separators, quotes, wildcards, line breaks) so open() cannot fail
        # or write outside the current directory.
        title = re.sub(r'[\\/:*?"<>|\r\n]', "_", titles[0]).strip() + '.mp4'

        # Fetch the detail page of this video.
        detail_page_text = requests.get(
            detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        ).text

        # re.S lets `.` match newlines, so the whole page is searched as a
        # single string.
        matches = re.findall(ex, detail_page_text, re.S)
        if not matches:
            # The page does not expose srcUrl in the expected form.
            continue
        video_url = matches[0]

        video_data = requests.get(
            video_url, headers=headers, timeout=REQUEST_TIMEOUT
        ).content
        with open(title, "wb") as fp:
            fp.write(video_data)
  • 相关阅读:
    BF3,MW3,CF,高端?亲民
    关于#ifdef __cplusplus extern
    lua源码阅读顺序
    (ZZ)如何实现游戏主循环(Game Loop)的详细解析
    D3D学习总结基础篇(二)从古墓丽影的画面设置了解基础概念
    比较两个json是否相等
    IPAD点滴 WebIM
    Remoting与Font对象 WebIM
    使用android隐藏api实现亮度调节
    symbian的HTTP引擎中对302、301事件的处理
  • 原文地址:https://www.cnblogs.com/zhang-da/p/12345896.html
Copyright © 2011-2022 走看看