zoukankan      html  css  js  c++  java
  • 采集百度top500歌曲,python2.7.2

    http://blog.b999.net/post/141/


    #-*- coding: UTF-8 -*-
    '''
    Created on 2012-3-8

    @author: tiantian

    Modify: 2012-4-15
    Fix: save files correctly on Windows
    '''
    import urllib
    import re
    import platform
    import os

    # Chart page that main() downloads and scrapes.
    top500 = 'http://list.mp3.baidu.com/top/top500.html'
    # Alternative list page, kept for reference:
    #top500 = 'http://list.mp3.baidu.com/list/shaoergequ.html'

    # Create the output directory up front so downloads have somewhere to go.
    if not os.path.exists('songs'):
        os.mkdir('songs')

    # Accumulates one [name, author] pair per song processed by main().
    songs = []

    def main():
        '''Scrape the chart page at ``top500`` and download each listed song.

        The page is fetched, decoded as GBK, and runs of spaces are collapsed.
        Nested regex searches (div -> ul -> li) then locate each entry; for
        every entry the song name and author are extracted, appended to the
        module-level ``songs`` list, resolved to a URL with ``getSongUrl`` and
        saved under ``songs/<name>-<author>.mp3`` with ``urllib.urlretrieve``.

        NOTE(review): the regex literals below span several lines and contain
        only wildcards; the HTML tag text they matched appears to have been
        lost when this listing was published. As written the function does not
        parse - restore the patterns from the original source before use.
        '''

        # NOTE(review): pattern for one chart <div>; tag text missing (see docstring).
        divr = '
    .*?.*?
    '
        mf = urllib.urlopen(top500)
        content = mf.read()
        content = content.decode('gbk')

        # Collapse runs of spaces so the patterns do not depend on page indentation.
        content = re.sub(' +',' ',content)
        alldiv = re.findall(divr,content)
        # Running count of list items seen across every div/ul.
        i =0
        for div in alldiv:
            # NOTE(review): empty pattern - presumably matched a <ul> block; confirm.
            ulr = ''
            allul = re.findall(ulr,div)

            for ul in allul:
                # NOTE(review): empty pattern - presumably matched a <li> item; confirm.
                lir = ''
                allli = re.findall(lir,ul)

                for li in allli:
                    # Skip the first 245 entries (they are counted but not
                    # downloaded) - presumably to resume an earlier run; confirm.
                    if i<245:
                        i = i+1
                        continue
                    i = i+1
                    # NOTE(review): song-name pattern; tag text missing.
                    songName = '
    .*?(.*?).*?
    '
                    name = re.findall(songName,li)
                    # NOTE(review): author pattern; tag text missing.
                    songAuthor = '
    .*?(.*?).*?
    '
                    author = re.findall(songAuthor,li)

                    # name[0] / author[0] raise IndexError when a pattern found nothing.
                    songs.append([name[0],author[0]])

                    songUrl = getSongUrl(name[0],author[0])

                    # Build the output path; on Windows the name is encoded to GBK bytes.
                    sysstr = platform.system()
                    if(sysstr =="Windows"):
                     filename = ('songs/'+name[0]+'-'+author[0]+'.mp3').encode('gbk')
                    elif(sysstr == "Linux"):
                     filename = 'songs/'+name[0]+'-'+author[0]+'.mp3'
                    else:
                     # NOTE(review): filename is not assigned on this branch, so the
                     # print below fails or reuses the previous iteration's value.
                     print ("Other System tasks")
                    print filename

                    try:
                        urllib.urlretrieve(songUrl,filename)
                        # The absence of an exception does not prove the download
                        # succeeded; an additional check is needed.
                        print i,name[0],author[0],'下载成功'

                    except Exception :
                        print i,name[0],author[0],'没下载成功'


    def getSongUrl(songName,authorName):
        '''Ask the Baidu box service for a download URL for one song.

        Because the song title or author name may be incomplete, the lookup
        may fail to yield a usable URL.

        songName, authorName: text strings; each is encoded to GBK and
        URL-quoted before being placed in the query.

        Returns the resolved URL string. When the response contains no
        encoded URL, the placeholder
        'http://box.zhangmen.baidu.com/unknow.mp3' is returned instead.
        '''
        songUrl = 'http://box.zhangmen.baidu.com/x?op=12&count=1&mtype=1&title=%s$$%s$$$$&url=&listenreelect=0&.r=0.1696378872729838' % (urllib.quote(songName.encode('gbk')),urllib.quote(authorName.encode('gbk')))
        f = urllib.urlopen(songUrl)
        try:
            c = f.read()
        finally:
            # Release the connection even if reading the response fails.
            f.close()

        # The brackets of the CDATA section must be escaped: unescaped,
        # "[(.*?)]" is a character class, so nothing is captured and real
        # responses never match.
        # NOTE(review): the <encode>/<decode> element names are missing from
        # the published listing (both patterns were identical there); they
        # are restored here on the assumption that the service wraps the
        # base URL in <encode> and the file name in <decode> - confirm
        # against a real response.
        url1 = re.findall(r'<encode>.*?CDATA\[(.*?)\]\].*?</encode>',c)
        url2 = re.findall(r'<decode>.*?CDATA\[(.*?)\]\].*?</decode>',c)
        if len(url1) <1:
            return 'http://box.zhangmen.baidu.com/unknow.mp3'

        try:
            # Keep the directory part of the first URL and append the second value.
            return url1[0][:url1[0].rindex('/')+1] + url2[0]
        except (ValueError, IndexError):
            # No '/' in the first URL, or no second value: fall back to the first URL.
            return url1[0]

    # Start the scraper only when this file is executed directly as a script.
    if __name__ == "__main__":
        main()

    采集的mp3文件保存在新建的目录 songs下


    <script>window._bd_share_config={"common":{"bdSnsKey":{},"bdText":"","bdMini":"2","bdMiniList":false,"bdPic":"","bdStyle":"0","bdSize":"16"},"share":{}};with(document)0[(getElementsByTagName('head')[0]||body).appendChild(createElement('script')).src='http://bdimg.share.baidu.com/static/api/js/share.js?v=89860593.js?cdnversion='+~(-new Date()/36e5)];</script>
    阅读(554) | 评论(0) | 转发(1) |
    给主人留下些什么吧!~~
    评论热议
  • 相关阅读:
    201571030114/201571030143《小学四则运算练习软件》结对项目报告
    201571030114随机四则运算
    略读构建之法
    使用staruml学习画类图
    了解面向对象方法学的优点
    做项目的感受和心得
    项目:学生查看自己的作业情况和分数(php)
    php操作数据库的简单示例
    学习使用html与css,并尝试写php
    html和css的联系
  • 原文地址:https://www.cnblogs.com/ztguang/p/12648340.html
Copyright © 2011-2022 走看看