zoukankan      html  css  js  c++  java
  • python多线程下载文件

    从文件中读取图片url和名称,将url中的文件下载下来。文件中每一行包含一个url和文件名,用制表符隔开。

    1、使用requests请求url并下载文件

    def download(img_url, img_name):
        """Stream img_url to a file named img_name inside out_dir."""
        target_path = os.path.join(out_dir, img_name)
        # The response is opened first and closed last, exactly as with the
        # nested form; the body is read in 1024-byte chunks.
        with closing(requests.get(img_url, stream=True)) as response, \
                open(target_path, 'wb') as out_file:
            for chunk in response.iter_content(1024):
                out_file.write(chunk)

    2、从文件中读取url,考虑文件较大,使用生成器的方式读取。

    def get_imgurl_generate():
        """Lazily yield [url, name] pairs read from ./example.txt.

        Each line of the file holds a URL and a file name separated by a tab.
        The file is read line by line so large inputs are never loaded whole.
        """
        with open('./example.txt', 'r') as f:
            for line in f:
                line = line.strip()
                # Split the tab-separated line into its url and name fields.
                yield line.split('\t')

    3、使用多线程进行下载

    # Guards the shared generator: only one thread may advance it at a time.
    lock = threading.Lock()
    def loop(imgs):
        """Worker body: take (url, name) pairs from imgs until it is exhausted.

        imgs is an iterator shared by all worker threads; the lock is held
        only while fetching the next pair, not while downloading.
        """
        while True:
            try:
                with lock:
                    img_url, img_name = next(imgs)
            except StopIteration:
                break
            # Call the download function defined in step 1.
            download(img_url, img_name)
    
    # One generator instance is shared by every worker thread.
    img_gen = get_imgurl_generate()

    for i in range(0, thread_num):
        t = threading.Thread(target=loop, args=(img_gen,))
        t.start()

    完整代码,加入异常处理

     1 # -*- coding: utf-8 -*-
     2 import os
     3 from contextlib import closing
     4 import threading
     5 import requests
     6 import time
     7 
     8 
     # HTTP headers sent with every request (a desktop Chrome User-Agent string).
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
     }

     # Output directory for downloaded files.
     out_dir = './output'
     # Number of worker threads.
     thread_num = 20
     # HTTP request timeout, passed to requests.get.
     timeout = 5

     # Create the output directory (including parents). Attempting the creation
     # and then checking avoids the race in "exists? then mkdir".
     try:
         os.makedirs(out_dir)
     except OSError:
         if not os.path.isdir(out_dir):
             raise
    22 
    23 
    24 
    def download(img_url, img_name):
        """Download img_url into out_dir/img_name unless that file already exists.

        Problems are reported on stdout as a tag and the URL separated by a
        tab: 'returnCode<status>' for a non-2xx response, 'size0' when the
        Content-Length header is missing or zero, and 'savefail' when writing
        the file fails. Errors raised by the request itself propagate to the
        caller.
        """
        target = os.path.join(out_dir, img_name)
        if os.path.isfile(target):
            return
        with closing(requests.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
            rc = r.status_code
            if not 200 <= rc <= 299:
                print('returnCode%s\t%s' % (rc, img_url))
                return
            content_length = int(r.headers.get('content-length', '0'))
            if content_length == 0:
                print('size0\t%s' % img_url)
                return
            # Write to a side file and rename only after the body was fully
            # written; otherwise a truncated file at the final path would make
            # the isfile() check above skip this image on every later run.
            partial = target + '.part'
            try:
                with open(partial, 'wb') as f:
                    for data in r.iter_content(1024):
                        f.write(data)
                os.rename(partial, target)
            except Exception:
                print('savefail\t%s' % img_url)
                # Best-effort cleanup of the incomplete side file.
                try:
                    os.remove(partial)
                except OSError:
                    pass
    43 
    def get_imgurl_generate(path='./final.scp'):
        """Lazily yield [url, name] lists parsed from a tab-separated file.

        path is the input file; it defaults to './final.scp'. Each accepted
        line, once stripped, contains exactly one tab separating a URL from a
        file name. Rejected lines are reported on stdout with their 1-based
        line number and skipped. A progress line is printed every 500 lines.
        """
        with open(path, 'r') as f:
            for index, line in enumerate(f, 1):
                if index % 500 == 0:
                    print('execute %s line at %s' % (index, time.time()))
                # Strip first: a raw line still carries its newline, so a
                # blank line only becomes empty after stripping.
                line = line.strip()
                if not line:
                    print(u'line %s is empty "\t"' % index)
                    continue
                imgs = line.split('\t')
                if len(imgs) != 2:
                    print(u'line %s splite error' % index)
                    continue
                if not imgs[0] or not imgs[1]:
                    print(u'line %s img is empty' % index)
                    continue
                # No try/except around the yield: a bare except here would
                # swallow GeneratorExit and break generator.close().
                yield imgs
    66 
    67 
    # Guards the shared generator: only one thread may advance it at a time.
    lock = threading.Lock()


    def loop(imgs):
        """Worker body: download every (url, name) pair taken from imgs.

        imgs is an iterator shared by all worker threads. The lock is held
        only while fetching the next pair, so downloads run concurrently.
        A failing download is reported on stdout as 'exceptfail' and the URL,
        separated by a tab, and the worker moves on to the next pair.
        """
        name = threading.current_thread().name
        print('thread %s is running...' % name)

        while True:
            try:
                with lock:
                    img_url, img_name = next(imgs)
            except StopIteration:
                break
            try:
                download(img_url, img_name)
            except Exception:
                # Exception rather than a bare except, so SystemExit and
                # KeyboardInterrupt are not swallowed.
                print('exceptfail\t%s' % img_url)
        print('thread %s is end...' % name)
    83 
    # One generator instance is shared by every worker thread.
    img_gen = get_imgurl_generate()

    # Start thread_num workers, named 'LoopThread 0' .. 'LoopThread N-1'.
    for i in range(thread_num):
        t = threading.Thread(
            target=loop,
            args=(img_gen,),
            name='LoopThread %s' % i,
        )
        t.start()
    View Code
  • 相关阅读:
    《python基础教程 》第二章 读书笔记
    hdu 4462 Scaring the Birds 解题报告
    hud 4454 Stealing a Cake 解题报告
    uva 532 Dungeon Master
    《python基础教程 》第一章 读书笔记
    开源项目资源站点
    syslog() 函数简单解析
    ftruncate()函数
    Mysql数据库函数
    int mysql_options() mysql_real_connect() mysql_real_query()/mysql_real_escape_string
  • 原文地址:https://www.cnblogs.com/lilinwei340/p/6793796.html
Copyright © 2011-2022 走看看