  • Automatically downloading VOA MP3s with Python

    Because I am studying English, I often download VOA MP3s from the web; a site I visit frequently is http://www.51voa.com/

    To download an MP3 from that site you have to pick the article by hand, open it, and then choose the MP3 file to download. That is fine for a single file, but downloading all the MP3s from some period becomes tedious, repeating the same dull operations over and over. Could Python be used to build a VOA MP3 download tool?

    The design is as follows:

    1. Open the http://www.51voa.com/ home page, parse the HTML, extract the recently updated VOA listening file list from it, and build a dictionary of <file name, download URL> pairs (a sketch of its shape follows this list).

    2. Filter the dictionary by the current date to get the VOA MP3s published today.

    3. Iterate over the filtered dictionary, downloading each file.
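
    As a rough sketch, the dictionary from step one maps each article title (which on 51voa ends with its publish date) to the article's page URL; the entry below is made up for illustration:

        files2download = {
            'Learn A Word (2010-8-18)':
                'http://www.51voa.com/Voa_English_Learning/Learn_A_Word_38412.html',
        }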

    The techniques used:

    1. HTML parsing: the standard library's HTMLParser or SGMLParser would do, as would a third-party parser such as BeautifulSoup (which supports both HTML and XML well); this article uses BeautifulSoup.

    2. Downloading the MP3s: urllib. To improve throughput, the download runs in multiple threads, and the Range parameter in the HTTP request header lets each thread download its own slice of the file, so the parts proceed in parallel (a sketch follows this list).
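
    For instance, a minimal sketch of a ranged request with urllib2, assuming the server honors the Range header (the URL below is a made-up placeholder):

        import urllib2

        # hypothetical URL, for illustration only
        req = urllib2.Request('http://www.51voa.com/path.asp?url=/201008/example.mp3')
        req.add_header('Range', 'bytes=0-1023')    # request only the first 1 KB
        resp = urllib2.urlopen(req)
        print resp.getcode()     # 206 (Partial Content) when the range is honored
        print len(resp.read())   # 1024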

    The code is as follows:

    1. The multi-threaded download code

    #!/usr/bin/env python
    
    # -*- coding: utf-8 -*-
    """
     It is a multi-thread downloading tool
    """
    
    import sys
    import os
    import time
    import urllib2
    import urllib
    from threading import Thread
    
    class MyWorkThread(Thread, urllib.FancyURLopener):
        """
        Multi-thread downloading class.
        run() is a virtual method of Thread
        """
        def __init__(self, threadname, url, filename, ranges = 0):
            Thread.__init__(self, name = threadname)
            urllib.FancyURLopener.__init__(self)
            self.name = threadname
            self.url = url
            self.filename = filename
            self.ranges = ranges
            self.downloaded = 0
        def run(self):
            """
            virtual function in Thread
            """
            try:
                self.downloaded = os.path.getsize(self.filename)
            except OSError:
                self.downloaded = 0
            #rebuild start point
            self.startpoint = self.ranges[0] + self.downloaded
            
            #if this part is completed
            if self.startpoint >= self.ranges[1]:
                print 'Part %s has already been downloaded.' % self.filename
                return
            self.oneTimeSize = 8 * 1024 #8K bytes / time
            print 'task %s will download from %d to %d' %(self.name, self.startpoint, self.ranges[1])
            self.addheader('Range', 'bytes=%d-%d' %(self.startpoint, self.ranges[1]))
            self.urlhandle = self.open(self.url)
            data = self.urlhandle.read(self.oneTimeSize)
            while data:
                filehandle = open(self.filename, 'ab+')
                filehandle.write(data)
                filehandle.close()
                self.downloaded += len(data)
                data = self.urlhandle.read(self.oneTimeSize)
                
    def GetUrlFileSize(url):
        urlHandler = urllib.urlopen(url)
        headers = urlHandler.info().headers
        length = 0
        for header in headers:
            if header.find('Length') != -1:
                length = header.split(':')[-1].strip()
                length = int(length)
        return length
    def SplitBlocks(totalsize, blocknumber):
        """split totalsize bytes into blocknumber inclusive (start, end) ranges"""
        blocksize = totalsize / blocknumber
        ranges = []
        for i in range(0, blocknumber -1):
            ranges.append((i * blocksize, i * blocksize + blocksize -1))
        ranges.append((blocksize * (blocknumber -1), totalsize -1))
        return ranges
    def isLive(tasks):
        for task in tasks:
            if task.isAlive():
                return True
        return False
    def downLoadFile(url, output, blocks = 6):
        sys.stdout.write('Begin to download from %s\n' %url )
        sys.stdout.flush()
        size = GetUrlFileSize(url)
        ranges = SplitBlocks(size, blocks)
        
        threadname = ["thread_%d" %i for i in range(0, blocks)]
        filename = ["tmpfile_%d" %i for i in range(0, blocks)]
        tasks = []
        for i in range(0, blocks):
            task = MyWorkThread(threadname[i], url, filename[i], ranges[i])
            task.setDaemon(True)
            task.start()
            tasks.append(task)
        time.sleep(2)
        while isLive(tasks):
            downloaded = sum([task.downloaded for task in tasks])
            process = downloaded / float(size) * 100
            show = u'\rFilesize: %d Downloaded:%d Completed: %.2f%%' %(size, downloaded, process)
            sys.stdout.write(show)
            sys.stdout.flush()
            time.sleep(1)
            
        output = formatFileName(output)
        filehandle = open(output, 'wb+')
        for i in filename:
            f = open(i, 'rb')
            filehandle.write(f.read())
            f.close()
            os.remove(i)
        filehandle.close()
        sys.stdout.write("Completed!\n")
        sys.stdout.flush()
            
    def formatFileName(filename):
        if isinstance(filename, str):
            header, tail = os.path.split(filename)
            if tail != '':
                invalid_chars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|')
                for char in invalid_chars:
                    if tail.find(char) != -1:
                        tail = tail.replace(char, '')
            filename = os.path.join(header, tail)
            #print filename
            return filename
        else:
            return 'None'
        
    if __name__ == '__main__':
        url = r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3'
        output = r"D:\Voa\Study:'Shoot to Kill' Policy in Africa's Parks Abuses Human Rights.mp3"
        downLoadFile(url, output, blocks = 4)
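
    For instance, splitting a 100-byte file into 4 blocks:

        >>> SplitBlocks(100, 4)
        [(0, 24), (25, 49), (50, 74), (75, 99)]

    Each tuple is an inclusive byte range handed to one MyWorkThread, which fetches just that slice through the Range header into its own temp file; downLoadFile() then concatenates tmpfile_0 ... tmpfile_3 in order and removes them.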
    

    2. The VOA page parsing code

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import urllib2
    import os
    import time
    import re
    import sys
    from BeautifulSoup import BeautifulSoup
    import multiThreadDownloadTool    # the multi-thread downloader from part 1


    VOA_URL = r'http://www.51voa.com'
    DOWNLOAD_DIR = r'D:/Voa'

    """
    File downloading from the web.
    """

    def getURLContent(url):
        """
        get the content of the url, keeping only the data from the <html>
        tag onward so that the DOCTYPE declaration is skipped
        """
        file = urllib2.urlopen(url)
        data = file.read()
        file.close()
        index = data.find('<html')
        data = data[index:]
        return data

    def getVOAURLs(content):
        """
        find the VOA article urls in the content of the home page
        """
        urls = {}
        soup = BeautifulSoup(content)
        divs = soup.findAll('div', {'id': 'rightContainer'})
        neededDiv = None
        if len(divs) >= 1:
            neededDiv = divs[0]
        if neededDiv != None:
            # the recently updated list lives in <span id="list">
            neededSpan = neededDiv.find('span', {'id': 'list'})
            lis = neededSpan.findAll('li')
            for li in lis:
                # the last <a> of each <li> links to the article page
                needAs = li.findAll('a')
                urls[needAs[-1].string] = VOA_URL + needAs[-1]['href']
        print 'getVOAURLs() urls count is', len(urls)
        return urls

    def filterbyDate(urls, date):
        """
        filter the urls by date (default: today)
        """
        neededURLs = {}
        currentDate = time.localtime(time.time())
        currentDateStr = '%s-%s-%s' % (currentDate.tm_year, currentDate.tm_mon, currentDate.tm_mday)
        if date != None:
            currentDateStr = date
        for url in urls.keys():
            name = url.strip()
            # the title ends with '(yyyy-m-d)'; compare the text inside the parentheses
            publishDate = name[-len(currentDateStr) - 1 : -1]
            if publishDate == currentDateStr:
                neededURLs[name] = urls[url]
                print 'find', name

        print 'After filter, the count is', len(neededURLs)
        return neededURLs

    def findMP3FileInURL(url):
        """
        find the MP3 download links in an article page
        """
        print 'parse the content of', url
        urls = []
        # the download links have the form /path.asp?url=/201008/xxx.mp3
        p = re.compile(r'/path.asp\?url=[-\w/]*\.mp3')
        content = getURLContent(url)
        matchLinks = p.findall(content)
        for link in matchLinks:
            tmp = VOA_URL + link
            if tmp not in urls:    # skip duplicates
                urls.append(tmp)
        print 'Current count of mp3 files is', len(urls)
        return urls

    def getHTMLFile(url, file_name):
        ifile = urllib2.urlopen(url)
        content = ifile.read()
        local_file = open(file_name, 'w')
        local_file.write(content)
        local_file.close()

    def downloadFile(url, fileName2Store):
        """
        download the file from url and store it in DOWNLOAD_DIR as
        fileName2Store (single-threaded alternative to the tool above)
        """
        try:
            full_path = os.path.join(DOWNLOAD_DIR, fileName2Store)
            print 'begin to download url to', full_path
            if os.path.isfile(full_path):
                print 'the file', full_path, 'already exists, so just skip it!'
            else:
                print '\tDownloading the mp3 file...',
                data = urllib2.urlopen(url).read()
                print 'Done'
                print '\tWriting data to file...',
                f = open(full_path, 'wb')
                f.write(data)
                f.close()
                print 'Done'
        except Exception, ex:
            print 'some exceptions occur when downloading', ex

    if __name__ == '__main__':
        try:
            context = getURLContent(VOA_URL)
            print 'Begin to get download information, it may cost some minutes, please wait...'
            files2download = getVOAURLs(context)
            neededDownload = filterbyDate(files2download, None)
            neededDownloadMp3s = {}
            for name in neededDownload.keys():
                fullURL = neededDownload[name]
                # strip the trailing ' (yyyy-m-d)' date from the title
                formatedName = name[:-11].strip()
                mp3Names = findMP3FileInURL(fullURL)
                if len(mp3Names) == 1:
                    # only one mp3 on this page, so the title can name it
                    neededDownloadMp3s[formatedName] = mp3Names[0]
                else:
                    # several mp3s on one page: name each after its file name in the url
                    for mp3Name in mp3Names:
                        index_begin = mp3Name.rfind('/')
                        index_end = mp3Name.rfind('.')
                        tmpName = mp3Name[index_begin + 1 : index_end]
                        neededDownloadMp3s[tmpName] = mp3Name
            print 'Now, the mp3 files are:'
            print neededDownloadMp3s
            # download every file found
            for filename in neededDownloadMp3s.keys():
                try:
                    full_path = os.path.join(DOWNLOAD_DIR, filename) + r'.mp3'
                    multiThreadDownloadTool.downLoadFile(neededDownloadMp3s[filename], full_path)
                except Exception, ex:
                    print 'Some exceptions occur when downloading file from %s, exception messages are %s' % (neededDownloadMp3s[filename], ex)
        except Exception, ex:
            print 'Exception caught, tracebacks are:', sys.exc_info(), ex
        print 'download all completed!'
        raw_input('Press any key to continue...')
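
    Run the script once a day and it downloads everything published that day into DOWNLOAD_DIR; passing a date string such as '2010-8-18' as the second argument of filterbyDate() (instead of None) fetches another day's files.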

    Things to note:

    While parsing HTML with BeautifulSoup I found that it does not handle

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

    very well and often fails to parse the page, so for convenience the source is trimmed first and only the data from the <html> tag onward is handed to BeautifulSoup. I have not yet worked out exactly why BeautifulSoup fails on the DOCTYPE; if any reader knows, please let me know.
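
    A minimal illustration of the workaround used in getURLContent() (the markup is made up):

        from BeautifulSoup import BeautifulSoup

        raw = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
               '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
               '<html><body><p>hello</p></body></html>')
        trimmed = raw[raw.find('<html'):]    # drop everything before the <html> tag
        soup = BeautifulSoup(trimmed)
        print soup.find('p').string          # prints: hello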

