zoukankan      html  css  js  c++  java
  • Python爬取视频(其实是一篇福利)

    到上面去看了看,地址都是明文的,得,赶紧开始吧。

    下载流式文件,requests库中请求的stream设为True就可以啦,文档在此

    先找一个视频地址试验一下:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    # -*- coding: utf-8 -*-
    import requests
     
    def download_file(url, path):
        """Stream the file at *url* into *path* in 1 KiB chunks (Python 2).

        NOTE(review): with the requests version used in the article,
        Response does not implement __exit__, so this `with` raises
        AttributeError -- exactly the failure the article describes next.
        """
        with requests.get(url, stream=True) as r:
            chunk_size = 1024
            # Total size announced by the server; raises KeyError if the
            # Content-Length header is missing.  (Unused in this version.)
            content_size = int(r.headers['content-length'])
            print '下载开始'
            with open(path, "wb") as f:
                # stream=True defers the body; iter_content pulls it lazily.
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
     
     
    if __name__ == '__main__':
        # Placeholder values carried over from the original post.
        url = '就在原帖...'
        path = '想存哪都行'
        download_file(url, path)

    遭遇当头一棒:

    1
    AttributeError: __exit__

    这文档也会骗人的么!

    看样子是没有实现上下文需要的__exit__方法。既然只是为了保证要让r最后close以释放连接池,那就使用contextlib的closing特性好了:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    # -*- coding: utf-8 -*-
    import requests
    from contextlib import closing
     
    def download_file(url, path):
        """Stream *url* into *path* in 1 KiB chunks (Python 2).

        contextlib.closing() guarantees r.close() is called on exit, which
        is the workaround for Response lacking __exit__ in this requests
        version.
        """
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024
            # Announced total size; not used by this version of the loop.
            content_size = int(r.headers['content-length'])
            print '下载开始'
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)

    程序正常运行了,不过我盯着这文件,怎么大小不见变啊,到底是完成了多少了呢?还是要让下好的内容及时存进硬盘,还能省点内存是不是:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    # -*- coding: utf-8 -*-
    import requests
    from contextlib import closing
    import os
     
    def download_file(url, path):
        """Stream *url* into *path*, forcing every chunk onto disk (Python 2).

        flush() + os.fsync() push each 1 KiB chunk through the OS cache to
        the physical disk, so the file size grows visibly -- at the cost of
        heavy write amplification (the article drops this in the next step).
        """
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024
            # Announced total size; not used by this version of the loop.
            content_size = int(r.headers['content-length'])
            print '下载开始'
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    f.flush()
                    # Write-through: sync the file descriptor after each chunk.
                    os.fsync(f.fileno())

    文件以肉眼可见的速度在增大,真心疼我的硬盘,还是最后一次写入硬盘吧,程序中记个数就好了:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    def download_file(url, path):
        """Stream *url* into *path*, printing cumulative progress (Python 2).

        Fix vs. the original: progress was computed as n*1024.0 with a
        hard-coded chunk size, counting every chunk as full-size -- the
        final short chunk made the figure drift past 100%.  Summing
        len(chunk) tracks the bytes actually written.
        """
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024
            # Total size announced by the server (denominator for progress).
            content_size = int(r.headers['content-length'])
            print('下载开始')
            with open(path, "wb") as f:
                loaded_bytes = 0
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # Use the real chunk length, not the requested size.
                    loaded_bytes += len(chunk)
                    print('已下载{0:%}'.format(loaded_bytes * 1.0 / content_size))

    结果就很直观了:

    1
    2
    3
    4
    已下载2.579129%
    已下载2.581255%
    已下载2.583382%
    已下载2.585508%

    心怀远大理想的我怎么会只满足于这一个呢,写个类一起使用吧:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    # -*- coding: utf-8 -*-
    import requests
    from contextlib import closing
    import time
     
    def download_file(url, path):
        """Stream *url* into *path* in 10 KiB chunks (Python 2), reporting
        progress through a ProgressData instance after every chunk."""
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024*10
            # Total size announced by the server; fed to the progress tracker.
            content_size = int(r.headers['content-length'])
            print '下载开始'
            with open(path, "wb") as f:
                p = ProgressData(size = content_size, unit='Kb', block=chunk_size)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    p.output()
     
     
    class ProgressData(object):
        """Counts downloaded chunks and prints progress and per-chunk speed.

        block / size are converted to kilo-units on construction so printed
        figures match *unit* (e.g. 'Kb').  output() is called once per chunk.

        Fixes vs. the original: (1) the print statement broke the line after
        '.', which is a SyntaxError -- the call is now a single expression;
        (2) the format fields put the progress percent under the '下载速度'
        (speed) label, contradicting the sample output shown below in the
        article -- field order now matches it.
        """

        def __init__(self, block, size, unit, file_name=''):
            self.file_name = file_name
            # Kilo-units so the printed numbers line up with *unit*.
            self.block = block/1000.0
            self.size = size/1000.0
            self.unit = unit
            self.count = 0
            self.start = time.time()

        def output(self):
            self.end = time.time()
            self.count += 1
            # Per-chunk speed; guard against a zero time delta.
            speed = self.block/(self.end-self.start) if (self.end-self.start) > 0 else 0
            self.start = time.time()
            loaded = self.count*self.block
            progress = round(loaded/self.size, 4)
            if loaded >= self.size:
                print(u'%s下载完成 ' % self.file_name)
            else:
                print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                      self.file_name, loaded, self.unit,
                      self.size, self.unit, progress, speed, self.unit))
                # ASCII progress bar: shrinks as progress approaches 100%.
                print('%50s' % ('/'*int((1-progress)*50)))

    运行:

    1
    2
    3
    4
    5
    下载开始
    下载进度10.24Kb/120174.05Kb 0.01% 下载速度4.75Kb/s
    /////////////////////////////////////////////////
    下载进度20.48Kb/120174.05Kb 0.02% 下载速度32.93Kb/s
    /////////////////////////////////////////////////

    看上去舒服多了。

    下面要做的就是多线程同时下载了,主线程生产url放入队列,下载线程获取url:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    # -*- coding: utf-8 -*-
    import requests
    from contextlib import closing
    import time
    import Queue
    import hashlib
    import threading
    import os
     
     
    def download_file(url, path):
        """Stream *url* into *path* in 10 KiB chunks (Python 2), skipping
        files that already exist on disk at full size, and reporting
        progress through a ProgressData instance after every chunk."""
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024*10
            content_size = int(r.headers['content-length'])
            # Duplicate-download guard: an existing file at (or above) the
            # announced size is assumed complete.
            if os.path.exists(path) and os.path.getsize(path)>=content_size:
                print '已下载'
                return
            print '下载开始'
            with open(path, "wb") as f:
                p = ProgressData(size = content_size, unit='Kb', block=chunk_size, file_name=path)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    p.output()
     
     
    class ProgressData(object):
        """Counts downloaded chunks and prints progress and per-chunk speed.

        block / size are converted to kilo-units on construction so printed
        figures match *unit* (e.g. 'Kb').  output() is called once per chunk.

        Fix vs. the original: the print statement broke the line after '.',
        which is a SyntaxError -- the format() call is now kept inside a
        single expression.
        """

        def __init__(self, block, size, unit, file_name=''):
            self.file_name = file_name
            # Kilo-units so the printed numbers line up with *unit*.
            self.block = block/1000.0
            self.size = size/1000.0
            self.unit = unit
            self.count = 0
            self.start = time.time()

        def output(self):
            self.end = time.time()
            self.count += 1
            # Per-chunk speed; guard against a zero time delta.
            speed = self.block/(self.end-self.start) if (self.end-self.start) > 0 else 0
            self.start = time.time()
            loaded = self.count*self.block
            progress = round(loaded/self.size, 4)
            if loaded >= self.size:
                print(u'%s下载完成 ' % self.file_name)
            else:
                print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                      self.file_name, loaded, self.unit,
                      self.size, self.unit, progress, speed, self.unit))
                # ASCII progress bar: shrinks as progress approaches 100%.
                print('%50s' % ('/'*int((1-progress)*50)))
     
     
    queue = Queue.Queue()
     
     
    def run():
        """Worker loop: pull URLs off the shared queue and download each one
        to e:/download/<md5(url)>.mp4, until the None sentinel arrives.

        Fix vs. the original: the sentinel is put back before breaking.
        With four workers and a single None in the queue, only one thread
        ever saw it -- the other three blocked in queue.get() forever.
        """
        while True:
            url = queue.get(timeout=100)
            if url is None:
                # Re-queue the sentinel so sibling workers also shut down.
                queue.put(None)
                print(u'全下完啦')
                break
            # Name the file after the URL's md5 so the same URL always maps
            # to the same path (enables the skip-if-downloaded check).
            h = hashlib.md5()
            h.update(url)
            name = h.hexdigest()
            path = 'e:/download/' + name + '.mp4'
            download_file(url, path)
     
     
    def get_url():
        # Stub producer: only enqueues the None shutdown sentinel.  Real URL
        # production is deliberately left to the reader (per the article's
        # closing remark).
        queue.put(None)
     
     
    if __name__ == '__main__':
        get_url()
        # Fix vs. the original: daemon threads die the instant the main
        # thread exits, and nothing was joined -- the process quit before a
        # single download could finish.  Keep them as daemons (so Ctrl-C
        # still kills the program) but join each one.
        threads = []
        for i in xrange(4):
            t = threading.Thread(target=run)
            t.daemon = True
            t.start()
            threads.append(t)
        for t in threads:
            t.join()

    加了重复下载的判断,至于怎么源源不断的生产url,诸位摸索吧,保重身体!

  • 相关阅读:
    HDU 2433 Travel (最短路,BFS,变形)
    HDU 2544 最短路 (最短路,spfa)
    HDU 2063 过山车 (最大匹配,匈牙利算法)
    HDU 1150 Machine Schedule (最小覆盖,匈牙利算法)
    290 Word Pattern 单词模式
    289 Game of Life 生命的游戏
    287 Find the Duplicate Number 寻找重复数
    283 Move Zeroes 移动零
    282 Expression Add Operators 给表达式添加运算符
    279 Perfect Squares 完美平方数
  • 原文地址:https://www.cnblogs.com/wanghuaijun/p/12806024.html
Copyright © 2011-2022 走看看