zoukankan      html  css  js  c++  java
  • 简单爬虫一部美剧(二:下载进度条)

    上篇获取到了每一集的下载url,这篇就用获取到的url下载视频

    在下载文件时希望可以给出进度条之类的提示,在网上搜索了一波,发现有一个库可以实现:tqdm库,具体用法参考这篇博客:https://www.jianshu.com/p/1ed2a8b2c77b

    在原来的类下面,再加一个方法,用来下载文件,如下

        def download_file(url, name):
            """下载文件"""
            try:
                response = requests.get(url=url, stream=True)
                content_size = int(response.headers['Content-Length']) / 1024  # 文件大小,从响应头中获取
    
                if content_size:
                    with open(name, "wb") as f:
                        print("total: ", content_size, 'k')
                        for data in tqdm(iterable=response.iter_content(1024), total=content_size, unit='k'):  # 实现下载进度条
                            f.write(data)
                            # f.close()
                        print("
     done " + name)
    
            except RequestException as e:
                print("接口错误信息为  %r", e)

    原有基础上加了一个主函数

        def main(self):
            """主函数"""
            root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
            print(root_dir)
            download_url = self.get_tv_url("血疫第一季")  # 调用get_tv_url(),获取剧集url
            for t in download_url:
                name = t.split('/')[-1] # 获取下载链接url中/后的一段文本,用作文件名
                file_path = root_dir + "/movies/"  # 设置文件保存路径
                if not os.path.exists(file_path):
                    os.makedirs(file_path)
                print("正在下载【{}】".format(name))
                self.download_file(t, file_path+name)  # 调用download_file(),根据下载链接开始下载文件

    最后完整的代码如下

      1 # coding: utf-8
      2 """
      3 author: 我是冰霜
      4 describe: 爬虫80s电影网
      5 create_time: 2020/01/18
      6 """
      7 
      8 import re
      9 
     10 import os
     11 from bs4 import BeautifulSoup
     12 from requests.exceptions import RequestException
     13 import requests
     14 from tqdm import tqdm
     15 
     16 class DownloadTV:
     17     @staticmethod
     18     def get_html(url, data=None, header=None, method=None):
     19         """获取一个url的html格式文本内容"""
     20 
     21         if method == "get":
     22             response = requests.get(url, params=data, headers=header)
     23         else:
     24             response = requests.post(url, data=data, headers=header)
     25         try:
     26             if response.status_code == 200:
     27                 response.encoding = response.apparent_encoding
     28                 # print(response.status_code)
     29                 # print(response.text)
     30                 return response.text
     31             return None
     32         except RequestException:
     33             print("请求失败")
     34             return None
     35 
     36     def get_tv_id(self, tv_name):
     37         """获取所查询电视剧对应的id"""
     38         headers = {
     39             "Content-Type": "application/x-www-form-urlencoded"
     40         }
     41 
     42         data = {
     43             "search_typeid": "1",
     44             "skey": tv_name,  # 用一个变量来表示要搜索的剧名
     45             "Input": "搜索"
     46         }
     47 
     48         url = "http://www.y80s.com/movie/search/"  # 请求url
     49 
     50         response = self.get_html(url, data, headers, "post")
     51 
     52         html = response
     53         # print(html)
     54 
     55         soup = BeautifulSoup(html, "html.parser")
     56         name_label = soup.find_all("a", title=tv_name)  # 获取所有title属性为影视剧名称的<a>标签,用一个变量来动态表示剧名
     57         # print(soup.prettify())
     58         # print(name_label)
     59         # print(name_label[0].get('href'))
     60 
     61         ju_id = re.compile(r'(d+)', re.S)  #  定义一个正则表达式,提取标签内容中的数字
     62         if name_label:
     63             href_value = ju_id.search(name_label[0].get('href'))
     64             if href_value:
     65                 tv_id = href_value.group()
     66                 print("查询影视剧对应的id为:{}".format(tv_id))
     67                 # print(type(tv_id))  # 查看获取到的tv_id的数据类型,如果是int的话,在后续拼接时需要使用str()转成字符串
     68                 return tv_id
     69 
     70     def get_tv_url(self, tv_name):
     71         """获取电视剧的下载url"""
     72         tv_id = self.get_tv_id(tv_name) # 调用get_tv_id()方法,获取tv_id
     73         url = "http://www.y80s.com/ju/" + tv_id  # 利用tv_id拼接url
     74 
     75         r = self.get_html(url,method="get")
     76         html = r
     77         soup = BeautifulSoup(html, "html.parser")
     78         a_tv_url = soup.find_all("a", title="本地下载")  # 提取title属性为"本地下载"的a标签,返回一个包含所有a标签的列表
     79         # print(a_tv_url)
     80         tv_url = []
     81         for t in a_tv_url:
     82             tv_url.append(t.get('href'))  # 用get方法获取每个a标签中的href属性值
     83         print(tv_url)
     84         return tv_url
     85 
     86     @staticmethod
     87     def download_file(url, name):
     88         """下载文件"""
     89         try:
     90             response = requests.get(url=url, stream=True)
     91             content_size = int(response.headers['Content-Length']) / 1024  # 文件大小,从响应头中获取
     92 
     93             if content_size:
     94                 with open(name, "wb") as f:
     95                     print("total: ", content_size, 'k')
     96                     for data in tqdm(iterable=response.iter_content(1024), total=content_size, unit='k'):
     97                         f.write(data)
     98                         # f.close()
     99                     print("
     done " + name)
    100 
    101         except RequestException as e:
    102             print("接口错误信息为  %r", e)
    103 
    104     def main(self):
    105         """主函数"""
    106         root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
    107         print(root_dir)
    108         download_url = self.get_tv_url("血疫第一季")  # 调用get_tv_url(),获取剧集url
    109         for t in download_url:
    110             name = t.split('/')[-1] # 获取下载链接url中/后的一段文本,用作文件名
    111             file_path = root_dir + "/movies/"  # 设置文件保存路径
    112             if not os.path.exists(file_path):
    113                 os.makedirs(file_path)
    114             print("正在下载【{}】".format(name))
    115             self.download_file(t, file_path+name)  # 调用download_file(),根据下载链接开始下载文件
    116 
    117 if __name__ == '__main__':
    118     test = DownloadTV()
    119     test.main()

    运行一下

  • 相关阅读:
    C#中WinForm程序退出方法技巧(转载)
    webbrowser访问网站禁止弹窗
    (转载)ASP.NET三大核心对象及基础功能解析
    webBrowser调用外部js文件和js函数(转载)
    java IO流
    java线程状态,优先级
    java线程
    Collection接口,Map接口
    序列化和反序列化
    JsonUtil自定义
  • 原文地址:https://www.cnblogs.com/hanmk/p/12323678.html
Copyright © 2011-2022 走看看