zoukankan      html  css  js  c++  java
  • 爬取梨视频“生活”频道——对动态加载数据的爬取

    import requests
    import os
    from lxml import etree
    # Download every video shown on the first page of pearvideo.com's
    # category_5 listing into ./images.
    #
    # Steps:
    #   1. Fetch the listing page and collect each video's numeric id and title.
    #   2. For each id, query videoStatus.jsp, which returns JSON holding a
    #      "srcUrl" for the video file.
    #   3. Rewrite that srcUrl (see below) and stream the file to disk.
    url = 'https://www.pearvideo.com/category_5'

    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
    }
    # Seconds to wait on connect/read; without it a stalled server would hang
    # the script indefinitely.
    timeout = 30
    # Characters that are not allowed in file names on common filesystems;
    # video titles come straight from the page and may contain any of them.
    forbidden_chars = '\\/:*?"<>|\r\n\t'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./images', exist_ok=True)

    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
    video_list1 = []
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        # Skip list items that do not have the expected link/title structure
        # instead of crashing with IndexError.
        if not hrefs or not titles:
            continue
        # The href has the form "video_<id>"; keep only the id.
        url_num = hrefs[0].replace('video_', "")
        name = titles[0] + '.mp4'
        video_list1.append({
            "url_num": url_num,
            "name": name,
        })

    print(video_list1)

    target = "https://www.pearvideo.com/videoStatus.jsp?contId="
    for dic_data in video_list1:
        new_url = target + dic_data['url_num']
        # The status endpoint and the video download are requested with a
        # Referer pointing at the video's own page.
        video_headers = {
            **headers,
            'Referer': 'https://www.pearvideo.com/video_' + dic_data['url_num'],
        }
        try:
            status = requests.get(url=new_url, headers=video_headers, timeout=timeout)
            status.raise_for_status()
            srcUrl = status.json()['videoInfo']['videos']['srcUrl']
        except (requests.RequestException, KeyError, TypeError, ValueError) as err:
            # One bad entry should not abort the remaining downloads.
            print('skipping ' + dic_data['url_num'] + ': ' + repr(err))
            continue

        # Replace the last path segment that precedes the first "-" in srcUrl
        # with "cont-<id>".
        # NOTE(review): presumably the returned srcUrl carries a placeholder
        # (timestamp-like) segment and this substitution yields the real file
        # URL -- confirm against the site's current behaviour.
        cont = 'cont-' + dic_data['url_num']
        new1_url = srcUrl.replace(srcUrl.split("-")[0].split("/")[-1], cont)
        print(new1_url)

        # The page title is used as the file name (an alternative would be the
        # last path segment of srcUrl). Replace characters that cannot appear
        # in a file name so open() does not fail on titles containing them.
        filename = "".join(
            "_" if ch in forbidden_chars else ch for ch in dic_data["name"]
        )

        # Save to disk, streaming in chunks so the whole video is never held
        # in memory at once.
        try:
            with requests.get(new1_url, headers=video_headers, stream=True,
                              timeout=timeout) as video:
                video.raise_for_status()
                with open("./images/" + filename, "wb") as f:
                    for chunk in video.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except (requests.RequestException, OSError) as err:
            print('failed to download ' + new1_url + ': ' + repr(err))
    人生苦短,我用python
  • 相关阅读:
    Java 分支结构
    Java 循环结构
    Java 运算符
    Java 修饰符
    Alpha冲刺——Day 6
    Alpha冲刺——Day 5
    Alpha冲刺——Day 4
    Alpha冲刺——Day 3
    Alpha冲刺——Day 2
    Alpha冲刺——Day 1
  • 原文地址:https://www.cnblogs.com/niucunguo/p/14438153.html
Copyright © 2011-2022 走看看