zoukankan      html  css  js  c++  java
  • Python爬虫-爬取音乐资源

    爬取音乐资源

    实现

    # Crawl the "hot" chart pages of htqyy.com, collect every song's title and
    # id, then download each song's mp3 into E:\music\.

    # Standard-library regular expressions, used to extract titles and ids.
    import re
    # Third-party HTTP client.
    import requests
    import time

    # URL patterns observed on the site.
    # Chart pages:
    # http://www.htqyy.com/top/hot
    # http://www.htqyy.com/top/musicList/hot?pageIndex=1&pageSize=20
    # http://www.htqyy.com/top/musicList/hot?pageIndex=2&pageSize=20

    # Song page:
    # http://www.htqyy.com/play/33
    # 33 is the song's id; it can be found in the chart page HTML.
    # Audio resource:
    # http://f2.htqyy.com/play8/33/mp3/6

    # Sample of the chart markup that the pattern below targets:
    #class="num">41</span><span class="title"><a href="/play/46" target="play" title="琵琶语" sid="46">琵琶语</a></span>

    songName=[]
    songID=[]

    # The User-Agent must be ONE string; adjacent literals inside the
    # parentheses are concatenated by the compiler.
    headers={
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/81.0.4044.138 Safari/537.36"
        )
    }

    # Number of chart pages to crawl (pageIndex runs from 0 to page-1).
    # To ask the user instead, read it with int(input(...)).
    page=2

    # Capture title and sid from the same tag in one match, so a name can never
    # be paired with the id of a different element.
    song_pattern=re.compile(r'title="(.*?)" sid="(.*?)"')

    # Characters that are not allowed in Windows file names (also blocks path
    # separators coming from scraped, untrusted titles).
    unsafe_chars=re.compile(r'[\\/:*?"<>|]')

    # Seconds to wait for the server before giving up on a request.
    request_timeout=30

    for i in range(0,page):
        url="http://www.htqyy.com/top/musicList/hot?pageIndex="+str(i)+"&pageSize=20"

        # Fetch the chart page.
        r=requests.get(url,headers=headers,timeout=request_timeout)
        # The script decodes the page as GBK to avoid mojibake.
        r.encoding='GBK'
        html_text=r.text
        # Debug output: dumps the whole page to stdout.
        print(html_text)

        # findall with two groups returns (title, sid) tuples.
        for title,sid in song_pattern.findall(html_text):
            songName.append(title)
            songID.append(sid)

    for sid,songname in zip(songID,songName):
        songurl="http://f2.htqyy.com/play8/"+str(sid)+"/mp3/6"

        print("正在下载...")
        # Send the same headers as for the chart pages; the bare request may be
        # refused by the server.
        resp=requests.get(songurl,headers=headers,timeout=request_timeout)

        if resp.ok:
            safe_name=unsafe_chars.sub("_",songname)
            # Raw string: the backslashes are path separators, not escapes.
            with open(r"E:\music\{0}.mp3".format(safe_name),"wb") as f:
                # resp.content is the raw binary body of the response.
                f.write(resp.content)
        else:
            # Do not save an HTTP error page under an .mp3 name.
            print("下载失败: {0} (HTTP {1})".format(songurl,resp.status_code))

        # Pause between downloads to avoid hammering the server.
        time.sleep(5)

    当无法访问时,可以试试下面的代码(下载请求也带上 headers):

    # Fallback: download one known song, sending a browser User-Agent with the
    # download request itself. Relies on `requests` being imported already.

    # The User-Agent must be ONE string; adjacent literals inside the
    # parentheses are concatenated by the compiler.
    headers={
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/81.0.4044.138 Safari/537.36"
        )
    }

    songurl="http://f2.htqyy.com/play8/33/mp3/6"
    songname="清风"

    print("正在下载...")
    # The timeout keeps a stalled connection from hanging the script forever.
    resp=requests.get(songurl,headers=headers,timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as an mp3.
    resp.raise_for_status()
    # Raw binary body of the response.
    data=resp.content

    # Raw string: the backslashes are path separators, not escapes.
    with open(r"D:\Python\{0}.mp3".format(songname),"wb") as f:
        f.write(data)
    

    总结

      当得到的网页信息是乱码:

      print(requests.get(url).encoding)  打印 requests 对该响应所采用的编码

      r = requests.get(url)

      r.encoding = 'GBK'

      print(r.text)           将编码设置为网页实际使用的 'GBK' 后,就不会出现乱码

      字符串拼接:

      +或者format()

  • 相关阅读:
    CF750D New Year and Fireworks
    raw,qcow2虚拟磁盘挂载
    虚拟机嵌套kvm/vmware
    CentOS Linux 7硬盘安装
    文本界面听歌神器--moc
    Ubuntu14.04升级内核3.14.25
    (转)MySQL初识-架构-安装-初始化-连接-管理工具-数据文件
    Ubuntu14.04使用国内163源或sohu源
    虚拟磁盘格式转换(raw,qcow2,vmdk等)--qemu-img
    phpMyAdmin4.2.12安装配置
  • 原文地址:https://www.cnblogs.com/Just-a-calm-programmer/p/12958040.html
Copyright © 2011-2022 走看看