zoukankan html css js c++ java

爬虫实现qq音乐歌单无vip批量下载

分享歌单链接
电脑网页无法获取歌单的完整信息，所以需要借助手机下载网页文件
利用下载网站实现批量下载

music.py

import requests
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


# Read the saved QQ Music share page from disk.
def get_html_file():
    """Prompt for the saved share-page HTML file and return its contents.

    The path is read from standard input; the file is opened in text mode
    and decoded as UTF-8.

    Returns:
        The full text of the file.
    """
    path = input('请输入html文件地址（项目目录下则直接输入文件名包括后缀名）：')
    with open(path, 'r', encoding='utf-8') as source:
        return source.read()


# Extract the song entries (title and artist text) from the share-page HTML.
def get_music_name_and_singer(html_, xpath='//span[@class="song_list__txt"]/text()'):
    """Return the text nodes that describe each song on the playlist page.

    Args:
        html_: Markup of the saved QQ Music share page.
        xpath: XPath expression selecting the song text nodes. The default is
            the selector the original author used for playlists by a single
            artist; their notes list '//p[@class="song_list__desc"]/text()'
            as the selector for playlists that mix different songs.

    Returns:
        The list produced by evaluating ``xpath`` against the parsed page,
        or an empty list when the parser yields no document.
    """
    document = html.etree.HTML(html_)
    # Guard against a parser result of None (e.g. blank input) so callers
    # always get a list back instead of an AttributeError.
    if document is None:
        return []
    return document.xpath(xpath)


# Look up a song's download address on the third-party download site.
def get_download_url(music_info):
    """Return the download URL the site offers for ``music_info``, or None.

    The search page is opened in headless Chrome, given a fixed delay to
    finish loading, and the ``href`` of the ``a#j-src-btn`` element is read
    from the rendered page source.

    Args:
        music_info: Search text for the song (converted with ``str``).

    Returns:
        The link target as a string, or None when the page contains no
        usable ``a#j-src-btn`` link.
    """
    from urllib.parse import quote

    # Percent-encode the search text so characters such as '&', '#' or '/'
    # in a title cannot truncate or alter the ``name`` query parameter.
    url = 'https://www.musictool.top/?name={}&type=qq'.format(
        quote(str(music_info), safe=''))

    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    # Chrome takes its user agent from a command-line switch; the
    # 'chrome.page.settings.userAgent' capability used previously is not a
    # capability ChromeDriver documents, so the UA was never applied.
    option.add_argument('user-agent={}'.format(UserAgent().chrome))
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(url)
        # Fixed pause so the page can finish loading before it is read.
        sleep(6)
        page = driver.page_source
    finally:
        # Always shut the browser down; otherwise every song leaves a
        # Chrome process running.
        driver.quit()

    document = html.etree.HTML(page)
    hrefs = document.xpath('//a[@id="j-src-btn"]/@href') if document is not None else []
    # xpath() returns a list (never None); an empty list or an empty href
    # both mean that no download link is available.
    download_url = hrefs[0] if hrefs else ''
    if not download_url:
        return None
    print(download_url)
    return download_url


# Download one song and save it as an mp3 file.
def download_music(download_url, name):
    """Fetch ``download_url`` and store the response body as ``music/<name>.mp3``.

    Success and failure are reported on standard output; nothing is raised
    for a missing URL, a failed request, or a non-200 response.

    Args:
        download_url: Address of the audio file. None or an empty string is
            treated as "no link found".
        name: File name (without extension) used for the saved song.

    Returns:
        None.
    """
    # Stop before any network access when there is nothing to download;
    # previously the failure was printed and the request was attempted anyway.
    if not download_url:
        print(name + '---下载失败')
        return

    import os

    headers = {'User-Agent': UserAgent().random}
    try:
        # A timeout keeps one stalled server from hanging the whole batch.
        response = requests.get(download_url, headers=headers, timeout=30)
    except requests.RequestException:
        print(name + '---下载失败')
        return
    if response.status_code != 200:
        print(name + '---下载失败')
        return

    # Create the output directory on first use instead of assuming it exists.
    os.makedirs('music', exist_ok=True)
    with open(os.path.join('music', name + '.mp3'), 'wb') as f:
        f.write(response.content)
    print('下载完成---' + name)
    print('--------------------')


# Turn a song entry into a string that is safe to use as a file name.
def _safe_filename(music_info):
    """Return ``music_info`` with characters that break file names removed.

    ' · ' becomes a space, '?' is dropped, and '/' plus the other characters
    Windows rejects in file names are replaced by '_'. For example
    '马良/孙茜茹 往后余生' becomes '马良_孙茜茹 往后余生'.
    """
    replacements = {ch: '_' for ch in '/\\:*"<>|'}
    replacements['?'] = None  # dropped rather than replaced
    cleaned = music_info.replace(' · ', ' ')
    return cleaned.translate(str.maketrans(replacements)).strip()


# Entry point: walk the song entries and download each one.
def main():
    """Read the share page, list its songs, and download them one by one."""
    html_ = get_html_file()
    music_infos = get_music_name_and_singer(html_)
    print(music_infos)
    for music_info in music_infos:
        # Whitespace-only text nodes carry no song; skip them instead of
        # sending an empty search to the download site.
        if not music_info.strip():
            continue
        print('开始下载---' + music_info)
        download_url = get_download_url(music_info)
        download_music(download_url, _safe_filename(music_info))


# Run the batch download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

说明：

1.歌曲下载网站使用的是Ajax异步请求，所以不能通过常规方法爬取歌曲
 2.采用selenium来获取网页完整的代码，从而获取歌曲的下载地址
 3.爬取过程中ua很重要的，一定要设置
 4.访问速度一定不要太快，所以采用了sleep()方法来减慢爬取速度，防止被检测到电脑操作，从而报错

查看全文

相关阅读:
C++ Websites
C++ smart pointer
Use of ‘const’ in Functions Return Values
Meaning of “const” last in a C++ method declaration?
为什么不要使用"using namespace XXX"
android.os.Handler
Windows下Anaconda的安装和简单使用
 matlab GPU 操作
 matlab 在柱状图上显示数字
 How to fix apt-get GPG error NO_PUBKEY Ubuntu 14

原文地址：https://www.cnblogs.com/zq98/p/15028019.html