zoukankan      html  css  js  c++  java
  • 网页爬虫---音乐

    import requests
    import time
    import re
    import os

    # Singer dictionary shared by the whole script: maps a singer's name to the
    # absolute URL of that singer's page.
    song_dict = dict()

    def song_static():
        """Collect singer names and page URLs from the static singer index.

        Fetches ``http://www.9ku.com/music/T_Singer.htm``, extracts every
        ``<a href="..." class="t-t">name</a>`` link and stores
        ``name -> 'http://www.9ku.com' + href`` in the module-level
        ``song_dict``.

        Any request failure (timeout, connection error, 4xx/5xx status) is
        printed and otherwise ignored, so the function always returns.

        Returns:
            dict: the shared ``song_dict`` (unchanged if the request failed).
        """
        reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
        try:
            response = requests.get('http://www.9ku.com/music/T_Singer.htm', timeout=30)
            # requests only raises HTTPError when asked to; without this call
            # an error page would be parsed as if it were the singer index.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # RequestException is the base class of Timeout, HTTPError and
            # ConnectionError, so every request failure is reported here.
            print(e)
            return song_dict
        for ul, title in re.findall(reg, response.text):
            song_dict[title] = 'http://www.9ku.com' + ul
        return song_dict


    # Collect singer URLs from the paginated ("dynamic") singer listing
    def song_List():
        """Collect singer names and page URLs from the paginated singer listing.

        Requests ``http://www.9ku.com/geshou/all-all-all/{page}.htm`` for
        page = 2, 3, ... and stores every ``class="t-t"`` link found as
        ``name -> 'http://www.9ku.com' + href`` in the module-level
        ``song_dict``. Crawling stops at the first page that yields no links
        or at the first request failure (which is printed).

        NOTE(review): numbering starts at page 2; presumably page 1 is already
        covered by ``song_static`` -- confirm against the site.

        Returns:
            dict: the shared ``song_dict`` with whatever was collected.
        """
        reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
        i = 2
        print('数据采集中......')
        try:
            while True:
                print('正在采集第{}页数据'.format(i))
                response = requests.get("http://www.9ku.com/geshou/all-all-all/{}.htm".format(i), timeout=30)
                try:
                    # Needed for HTTPError to ever be raised; an error page
                    # must not be mistaken for a listing page.
                    response.raise_for_status()
                    data = re.findall(reg, response.text)
                finally:
                    # Release the connection after every page, not only the last.
                    response.close()
                if not data:
                    # A page without singer links marks the end of the listing.
                    break
                for ul, title in data:
                    song_dict[title] = 'http://www.9ku.com' + ul
                i += 1
        except requests.exceptions.RequestException as e:
            # Covers Timeout, HTTPError, ConnectionError and other request errors.
            print(e)
        print('数据采集完成')
        return song_dict


    def _remove_prefix(text, prefix):
        """Return ``text`` without a leading ``prefix`` (exact substring match)."""
        if prefix and text.startswith(prefix):
            return text[len(prefix):]
        return text


    def _remove_suffix(text, suffix):
        """Return ``text`` without a trailing ``suffix`` (exact substring match)."""
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text


    def _download_songs(singer_url, path):
        """Download every song linked from the singer page into directory ``path``."""
        play_reg = r'<div class="songName"><a target="_1" href="(.*?)" class="songNameA">'
        link_reg = r'<a href="(.*?)" style="display:none">(.*?)</a>'
        singer_html = requests.get(singer_url, timeout=30).text
        for href in re.findall(play_reg, singer_html):
            # str.strip('/play/') would strip a *set of characters*; remove the
            # literal prefix instead, then trim any surrounding slashes.
            song_id = _remove_prefix(href, '/play/').strip('/')
            try:
                down_html = requests.get('http://www.9ku.com/down/' + song_id, timeout=30).text
                for src, title in re.findall(link_reg, down_html):
                    # Same character-set pitfall: only drop the literal marker so
                    # titles starting/ending with 'M', 'p', '3', ... stay intact.
                    song_name = _remove_suffix(_remove_prefix(title, 'Mp3下载'), 'Mp3下载')
                    audio = requests.get(src, timeout=30)
                    # Do not save an HTML error page under an .mp3 name.
                    audio.raise_for_status()
                    time.sleep(1)  # pause between downloads, as the original did
                    # The title comes from scraped HTML: neutralise characters
                    # that are path separators or invalid in file names.
                    file_name = re.sub(r'[\\/:*?"<>|]', '_', song_name) + '.mp3'
                    # os.path.join keeps the file in the current directory when
                    # ``path`` is empty instead of resolving to '/<name>.mp3'.
                    with open(os.path.join(path, file_name), 'wb') as f:
                        f.write(audio.content)
                    print('{}:下载成功'.format(song_name))
            except (requests.exceptions.RequestException, OSError) as e:
                # One failing song must not abort the remaining downloads.
                print(e)


    def song_search():
        """Interactively download all songs of a singer chosen by the user.

        Repeatedly prompts for a singer name. If the name is a key of the
        module-level ``song_dict``, the singer page is fetched, each song's
        download page is resolved and every linked file is saved as
        ``<title>.mp3`` in ``path``. Unknown names print a not-found message.
        Network and file errors are printed and the prompt continues.

        The prompt loop has no exit condition; it ends only when ``input``
        raises (for example on end-of-input) or the process is interrupted.
        """
        path = ""  # target directory; empty means the current working directory
        while True:
            name = input("请输入歌手名称:")
            if name not in song_dict:
                print("未找到歌手")
                continue
            try:
                _download_songs(song_dict[name], path)
            except requests.exceptions.RequestException as e:
                # Failure while fetching the singer page itself.
                print(e)


    if __name__ == '__main__':
        # Run the pipeline in order: collect singers from the static index,
        # then from the paginated listing, then start the download prompt.
        for step in (song_static, song_List, song_search):
            step()
  • 相关阅读:
    Invalid bound statement (not found)
    小程序
    maven启动项目时报错
    创建Maven项目出错
    小程序的tab标签实现效果
    C# 异步
    C#中计算时间差
    linq筛选唯一
    GMap.net控件学习记录
    notepad++ 正则 替换
  • 原文地址:https://www.cnblogs.com/sheshidu/p/13282811.html
Copyright © 2011-2022 走看看