zoukankan      html  css  js  c++  java
  • Python实例---爬取酷狗音乐

    项目一:获取酷狗TOP 100

    http://www.kugou.com/yy/rank/home/1-8888.html

    排名

    image

    文件&&歌手

    image

    时长

    image

    效果:

    image

    附源码:

    import time
    import json
    from bs4 import BeautifulSoup
    import requests
    
    
    class Kugou(object):
        """Scraper for Kugou ranking pages.

        A page is fetched with a browser-like User-Agent and parsed with
        BeautifulSoup; every song row found is printed and appended to
        ``hhh.txt`` in the current working directory.
        """

        def __init__(self):
            # Header sent with every request (a desktop Firefox User-Agent).
            self.header = {
                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
            }

        @staticmethod
        def _split_singer_title(text):
            """Split link text of the form ``"singer - title"``.

            The spaced separator ``" - "`` is tried first so that hyphens inside
            a name (for example ``"A-Lin - ..."``) are not treated as the
            separator; a bare ``"-"`` is the fallback.

            Args:
                text: Text of the song link.

            Returns:
                A ``(singer, title)`` tuple. When no dash is present the singer
                is ``''`` and the whole text is returned as the title.
            """
            text = text.strip()
            singer, sep, title = text.partition(' - ')
            if not sep:
                singer, sep, title = text.partition('-')
            if not sep:
                return '', text
            return singer.strip(), title.strip()

        def getInfo(self, url):
            """Fetch one ranking page, print its songs and append them to hhh.txt.

            Args:
                url: Address of the ranking page to scrape.

            Returns:
                None. Each row is printed and written as ``str(dict)`` on its
                own line of ``hhh.txt``.
            """
            # A timeout keeps one stalled connection from hanging the whole crawl.
            html = requests.get(url, headers=self.header, timeout=10)
            soup = BeautifulSoup(html.text, 'html.parser')
            ranks = soup.select('.pc_temp_num')
            titles = soup.select('.pc_temp_songlist > ul > li > a')  # walk down the nested tags
            times = soup.select('.pc_temp_time')

            lines = []
            for rank, title, songTime in zip(ranks, titles, times):
                singer, song = self._split_singer_title(title.get_text())
                data = {
                    # get_text() drops the HTML tags that printing the tag would show
                    'rank': rank.get_text().strip(),
                    'title': song,
                    'singer': singer,
                    'songTime': songTime.get_text().strip()
                }
                print('rank:%2s\t' % data['rank'], 'title:%2s\t' % data['title'], 'singer:%2s\t' % data['singer'], 'songTime:%2s\t' % data['songTime'])
                lines.append(str(data) + '\n')

            # Open the output file once per page, and only when there is something to write.
            if lines:
                with open('hhh.txt', 'a', encoding='utf8') as f:
                    f.writelines(lines)
    
    if __name__ == '__main__':
        # Ranking pages are addressed as "<page>-8888.html"; indices 0..29 are crawled.
        # NOTE(review): the sample URLs elsewhere start at page 1 - confirm page 0 is wanted.
        url_template = 'http://www.kugou.com/yy/rank/home/{}-8888.html'
        kugou = Kugou()
        for page in range(30):
            kugou.getInfo(url_template.format(page))
            # Wait one second between page requests.
            time.sleep(1)

    部分代码解析

    --------------------------------------------------------------------
    # Build the page URLs by substituting the page number into the template.
    url_template = 'http://www.kugou.com/yy/rank/home/{}-8888.html'
    urls = [url_template.format(page) for page in range(1, 5)]
    # Show each generated URL on its own line.
    for url in urls:
        print(url)
    
    结果打印:
    	http://www.kugou.com/yy/rank/home/1-8888.html
    	http://www.kugou.com/yy/rank/home/2-8888.html
    	http://www.kugou.com/yy/rank/home/3-8888.html
    	http://www.kugou.com/yy/rank/home/4-8888.html
    --------------------------------------------------------------------
    for rank, title, songTime in zip(ranks, titles, times):
        # The link text reads "singer - title": the singer is the part before
        # the dash and the song title is the part after it.
        parts = title.get_text().split('-')
        data = {
            # get_text() drops the HTML tags that printing the whole tag would show
            'rank': rank.get_text().strip(),
            'title': parts[1].strip(),
            'singer': parts[0].strip(),
            'songTime': songTime.get_text()
        }
        print(data['rank'])
        print(data['title'])
        print(data['singer'])
        print(data['songTime'])
    	
    结果打印:
        1
        飞驰于你
        许嵩
        4: 04
    --------------------------------------------------------------------   
    for rank_tag, title_tag, time_tag in zip(ranks, titles, times):
        # Printing the selected tags themselves (without get_text()) shows the
        # surrounding HTML markup as well.
        for tag in (rank_tag, title_tag, time_tag):
            print(tag)
    结果打印:
    <span class="pc_temp_num">
    	<strong>1</strong>
    </span>
    <a class="pc_temp_songname" data-active="playDwn" data-index="0" hidefocus="true" href="http://www.kugou.com/song/pjn5xaa.html" title="许嵩 - 飞驰于你">许嵩 - 飞驰于你</a>
    <span class="pc_temp_time">	4:04 </span>

    项目二:搜索曲目获取URL

    根据关键字搜索后的结果:

    http://songsearch.kugou.com/song_search_v2?callback=jQuery191034642999175022426_1489023388639&keyword=%E5%9B%AD%E6%B8%B8%E4%BC%9A&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1489023388641

    image

    # encoding=utf-8
    # Time    : 2018/4/27
    # Email   : z2615@163.com
    # Software: PyCharm
    # Language: Python 3
    import requests
    import json
    
    
    class KgDownLoader(object):
        """Search Kugou for songs and look up their mp3 URLs.

        ``song_info`` holds the most recently processed song. The generator
        methods yield independent copies of it, so results collected by a
        caller do not overwrite one another.
        """

        def __init__(self):
            # Search endpoint answering in JSONP; "{}" receives the URL-encoded keyword.
            self.search_url = 'http://songsearch.kugou.com/song_search_v2?callback=jQuery191034642999175022426_1489023388639&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1489023388641'

            # Endpoint returning play data for a file hash; "{}" receives the hash.
            self.play_url = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={}'
            self.song_info = {
                '歌名': None,
                '演唱者': None,
                '专辑': None,
                'filehash': None,
                'mp3url': None
            }

        @staticmethod
        def _strip_jsonp(text):
            """Return the JSON payload of a JSONP response ``callback(<json>)``.

            Only the outermost parentheses are removed, so parentheses inside
            the payload (for example in ``"Song (Live)"``) are preserved. Text
            that already starts with ``{`` or ``[`` is returned unchanged.
            """
            text = text.strip()
            if text[:1] in ('{', '['):
                return text
            start = text.find('(')
            end = text.rfind(')')
            if start == -1 or end < start:
                return text
            return text[start + 1:end]

        @staticmethod
        def _strip_em(text):
            """Remove the ``<em>``/``</em>`` markers from a result field."""
            return text.replace('<em>', '').replace('</em>', '')

        def get_search_data(self, keys):
            """Search for ``keys`` and yield one info dict per result.

            Args:
                keys: Search keyword (plain text; it is URL-encoded here).

            Yields:
                A copy of ``song_info`` with '歌名', '演唱者', '专辑', 'filehash'
                filled in and 'mp3url' set to None.
            """
            from urllib.parse import quote  # local import keeps this block self-contained

            # Encode the keyword so characters such as "&" or "#" cannot alter the query.
            search_file = requests.get(self.search_url.format(quote(keys)), timeout=10)
            views = json.loads(self._strip_jsonp(search_file.content.decode()))
            for view in views['data']['lists']:
                album_name = self._strip_em(view['AlbumName'])
                new_info = {
                    '歌名': self._strip_em(view['SongName']),
                    '演唱者': self._strip_em(view['SingerName']),
                    # An empty album name is reported as None.
                    '专辑': album_name if album_name else None,
                    'filehash': view['FileHash'],
                    'mp3url': None
                }
                self.song_info.update(new_info)
                yield dict(self.song_info)

        def get_mp3_url(self, filehash):
            """Look up the play URL for ``filehash`` and yield the updated info once.

            Args:
                filehash: File hash taken from a search result.

            Yields:
                A copy of ``song_info`` whose 'mp3url' is the value of
                ``data.play_url`` in the response.
            """
            mp3_file = requests.get(self.play_url.format(filehash), timeout=10).content.decode()
            mp3_json = json.loads(mp3_file)
            self.song_info['mp3url'] = mp3_json['data']['play_url']
            yield dict(self.song_info)

        def save_mp3(self, song_name, real_url):
            """Download ``real_url`` and write it to ``<song_name>.mp3``.

            The download happens before the file is opened so that a failed
            request does not leave an empty file behind.
            """
            content = requests.get(real_url, timeout=30).content
            with open(song_name + ".mp3", "wb") as fp:
                fp.write(content)
    
    
    if __name__ == '__main__':
        # Ask for a song name, then print the info dict (including the mp3 URL)
        # of every search hit.
        downloader = KgDownLoader()
        keyword = input('请输入歌名:')
        for song in downloader.get_search_data(keyword):
            for detail in downloader.get_mp3_url(song['filehash']):
                print(detail)

    image

    项目三:搜索下载歌曲

    代码仅供学习参考

    from selenium import webdriver
    
    from bs4 import BeautifulSoup
    
    import urllib.request
    
    from selenium.webdriver.common.action_chains import ActionChains
    
    # Imported here so this section of the script carries its own dependency.
    from selenium.webdriver.common.by import By

    # Absolute XPaths of the search box, the search button and the n-th result link.
    SEARCH_INPUT_XPATH = '/html/body/div[1]/div[1]/div[1]/div[1]/input'
    SEARCH_BUTTON_XPATH = '/html/body/div[1]/div[1]/div[1]/div[1]/div/i'
    RESULT_LINK_XPATH = '/html/body/div[4]/div[1]/div[2]/ul[2]/li[%s]/div[1]/a'

    input_string = input('>>>please input the search key:')

    driver = webdriver.Chrome()
    driver.get('http://www.kugou.com/')

    # Type the keyword into the search box and click the search button.
    # find_element(By.XPATH, ...) is used because the find_element_by_* helpers
    # no longer exist in current Selenium releases.
    a = driver.find_element(By.XPATH, SEARCH_INPUT_XPATH)
    a.send_keys(input_string)
    driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH).click()

    # Make the last window handle the active window.
    driver.switch_to.window(driver.window_handles[-1])

    # Count the result rows, then list the title attribute of every result link.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    PageAll = len(soup.select('ul.list_content.clearfix > li'))
    print(PageAll)

    for i in range(1, PageAll + 1):
        j = driver.find_element(By.XPATH, RESULT_LINK_XPATH % i).get_attribute('title')
        print('%d.' % i + j)

    # The chosen index is substituted into the XPath as typed (surrounding blanks removed).
    choice = input("请输入你要下载的歌曲(输入序号):").strip()

    # Locate the chosen result once; its title attribute is kept in ``b``.
    a = driver.find_element(By.XPATH, RESULT_LINK_XPATH % choice)
    b = a.get_attribute('title')

    # Move the mouse onto the chosen link and click it.
    actions = ActionChains(driver)
    actions.move_to_element(a)
    actions.click(a)
    actions.perform()

    # Again make the last window handle the active window.
    driver.switch_to.window(driver.window_handles[-1])

    # The audio element's src attribute is the address that gets downloaded.
    Local = driver.find_element(By.XPATH, '//*[@id="myAudio"]').get_attribute('src')
    print(Local)
    
    def cbk(a, b, c):
        """Progress hook for ``urllib.request.urlretrieve``.

        Args:
            a: Number of blocks transferred so far.
            b: Size of one block in bytes.
            c: Total size of the file in bytes; zero or negative when the
                size is not known.

        Prints the completed percentage, capped at 100. When the total size
        is not known, the number of bytes requested so far is printed instead
        of dividing by a non-positive total.
        """
        if c <= 0:
            # No usable total: a percentage cannot be computed.
            print('%d bytes' % (a * b))
            return

        # The last block may overshoot the total, so cap at 100.
        per = min(100.0 * a * b / c, 100)
        print('%.2f%%' % per)
    
    # ``b`` is the title attribute of the chosen result; parsing it and taking
    # get_text() keeps only the plain text. The parser is named explicitly so
    # BeautifulSoup does not have to guess one.
    soup = BeautifulSoup(b, 'html.parser')
    name = soup.get_text()

    # Replace characters that Windows does not allow in file names.
    name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name).strip()

    # Raw string: the backslash after the drive letter is a literal backslash.
    path = r'D:\%s.mp3' % name

    # Download the audio file, reporting progress through cbk.
    urllib.request.urlretrieve(Local, path, cbk)

    print('finish downloading %s.mp3' % name + '\n\n')

    【更多参考】https://blog.csdn.net/abc_123456___/article/details/81101845

  • 相关阅读:
    2.Magicodes.NET框架之路——策略管理
    1.Magicodes.NET框架之路——起航
    为什么LINQ to XML的性能要优于XmlDocument?
    SharePoint如何配置Ipad跳转等问题
    写给自己
    ERP,SCM,CRM,BRP,OMS,WMS 企业管理的6大核心系统
    spring 发送邮件问题
    spring各种邮件发送
    css background-position结合disaply:inline-block使用
    .Net 两大利器Newtonsoft.NET和Dapper
  • 原文地址:https://www.cnblogs.com/ftl1012/p/9614146.html
Copyright © 2011-2022 走看看