zoukankan html css js c++ java

网页爬虫---音乐

import requests
import time
import re
import os

"""歌手字典"""
# Shared registry: singer name (link text) -> absolute URL of the singer's
# page. Filled by song_static() and song_List(), looked up by song_search().
song_dict = dict()

def song_static():
    """Collect singer names and page URLs from the static singer index.

    Fetches ``http://www.9ku.com/music/T_Singer.htm``, extracts every
    ``<a href="..." class="t-t">`` link and records
    ``link text -> 'http://www.9ku.com' + href`` in the module-level
    ``song_dict``.

    Any ``requests`` failure (timeout, connection error, non-2xx status, ...)
    is printed rather than raised, so the caller always gets the dictionary
    back, possibly unchanged.

    Returns:
        dict: the shared ``song_dict`` mapping.
    """
    reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    try:
        response = requests.get('http://www.9ku.com/music/T_Singer.htm', timeout=30)
        # requests only raises HTTPError when asked to; without this call an
        # error page would be parsed as if it were the singer index.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException is the common base of Timeout, HTTPError and
        # ConnectionError, so no network failure escapes as a traceback.
        print(e)
        return song_dict

    for ul, title in re.findall(reg, response.text):
        song_dict[title] = 'http://www.9ku.com' + ul
    return song_dict


#动态歌手地址采集
def song_List():
    """Collect singer names and page URLs from the paginated singer listing.

    Walks ``http://www.9ku.com/geshou/all-all-all/<page>.htm`` page by page,
    extracting every ``<a href="..." class="t-t">`` link and recording
    ``link text -> 'http://www.9ku.com' + href`` in the module-level
    ``song_dict``. Crawling stops at the first page that yields no links or
    at the first network/HTTP failure (which is printed, not raised).

    Progress messages are printed to stdout.

    Returns:
        dict: the shared ``song_dict`` mapping.
    """
    reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    # NOTE(review): numbering starts at page 2 as in the original code;
    # presumably page 1 is covered elsewhere - confirm against the site.
    i = 2
    print('数据采集中......')
    while True:
        print('正在采集第{}页数据'.format(i))
        try:
            response = requests.get(
                "http://www.9ku.com/geshou/all-all-all/{}.htm".format(i),
                timeout=30,
            )
            # Make HTTP error statuses visible; requests does not raise
            # HTTPError unless raise_for_status() is called.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Base class of Timeout / HTTPError / ConnectionError: keep what
            # has been collected so far instead of crashing mid-crawl.
            print(e)
            break

        data = re.findall(reg, response.text)
        if not data:
            # A page without singer links marks the end of the listing.
            break
        for ul, title in data:
            song_dict[title] = 'http://www.9ku.com' + ul
        i += 1
    print('数据采集完成')
    return song_dict


def _strip_affix(text, prefix='', suffix=''):
    """Remove an exact leading ``prefix`` and/or trailing ``suffix`` from ``text``."""
    # str.strip(chars) treats its argument as a set of characters, which
    # would also eat legitimate leading/trailing characters of the value.
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


def _download_song(play_href, path):
    """Download every hidden mp3 link found on the download page of one song."""
    song_id = _strip_affix(play_href, prefix='/play/')
    page = requests.get('http://www.9ku.com/down/' + song_id, timeout=30)
    page.raise_for_status()
    link_reg = r'<a href="(.*?)" style="display:none">(.*?)</a>'
    for src, title in re.findall(link_reg, page.text):
        song_name = _strip_affix(title, suffix='Mp3下载').strip()
        # A path separator inside the title would point into another directory.
        file_name = re.sub(r'[\\/]', '_', song_name) + '.mp3'
        audio = requests.get(src, timeout=30)
        # Do not save an HTTP error page as if it were audio data.
        audio.raise_for_status()
        time.sleep(1)  # pause between downloads, as in the original code
        # os.path.join('', name) yields a path relative to the current
        # directory, whereas '%s/%s' with an empty path pointed at '/'.
        with open(os.path.join(path, file_name), 'wb') as f:
            f.write(audio.content)
        print('{}：下载成功'.format(song_name))


def song_search():
    """Interactively download the songs of singers found in ``song_dict``.

    Repeatedly prompts for a singer name. When the name is a key of the
    module-level ``song_dict``, the singer page is fetched, every
    ``songNameA`` link on it is followed to its ``/down/`` page, and each
    hidden mp3 link there is saved as ``<title>.mp3`` in ``path`` (the
    current directory while ``path`` is empty). Unknown names print a
    "not found" message. Network and file errors are printed and the loop
    carries on with the next song or prompt.

    The prompt loop never terminates on its own and the function returns
    nothing.
    """
    path = ""  # directory downloads are saved to; empty means current directory
    song_reg = r'<div class="songName"><a target="_1" href="(.*?)" class="songNameA">'
    while True:
        name = input("请输入歌手名称：")
        if name not in song_dict:
            print("未找到歌手")
            continue

        try:
            response = requests.get(song_dict[name], timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        for play_href in re.findall(song_reg, response.text):
            try:
                _download_song(play_href, path)
            except (requests.exceptions.RequestException, OSError) as e:
                # One failed song must not abort the remaining downloads.
                print(e)

if __name__ == '__main__':
    # Script entry point, run in this order:
    #   1. song_static - collect singers from the static index page
    #   2. song_List   - collect singers from the paginated listing
    #   3. song_search - interactive download loop
    for step in (song_static, song_List, song_search):
        step()

查看全文

相关阅读:
STL hash_map使用
 STL的 string 类赋值
 STL map使用详解
 下面我使用vector容器为基础来构成一棵树
 MFC中CString.Format的详细用法
 error LNK2001: 无法解析的外部符号 "public: static class stdext::hash_map
!!! STL的string类如何实现CString的Format功能这是一个经典问题，记住
 STL map和STL set(转载)
为什么提示此错误？Run-Time Check Failure #2 - Stack around the variable 'tch1' was corrupted.
STL源码剖析

原文地址：https://www.cnblogs.com/sheshidu/p/13282811.html