zoukankan      html  css  js  c++  java
  • 爬取网易云评论数超过十万的歌曲

    代码如下

    import json
    from urllib.parse import urlencode
    
    import requests
    from lxml import etree
    from requests import RequestException
    
    from selenium import webdriver
    import time
    import csv
    
    
    
    
    
    # 获取歌手id和歌手姓名
    def read_csv(path="files/music_163_artists.csv"):
        """Yield ``(artist_name, artist_id)`` pairs from the artists CSV file.

        The header row (the one whose second column is the literal text
        ``artist_id``) and completely blank rows are skipped.

        Args:
            path: Location of the CSV file. Defaults to the path the script
                has always used, so existing callers need no change.

        Yields:
            Tuples of two strings: the artist name and the artist id.

        Raises:
            ValueError: If a non-blank row does not have exactly two columns.
        """
        # newline="" lets the csv module handle line endings itself.
        with open(path, "r", encoding="utf-8", newline="") as csvfile:
            for row in csv.reader(csvfile):
                if not row:
                    # csv.reader returns [] for blank lines; nothing to unpack.
                    continue
                artist_name, artist_id = row
                # Compare by value: an identity test ("is") against a string
                # literal never matches text that was read from a file.
                if artist_id == "artist_id":
                    continue
                yield artist_name, artist_id
        # The file is closed automatically once the with block is left.
    
    def get_toal(music_id, song_name, driver):
        """Return the comment count shown on a song page if it exceeds 100000.

        Args:
            music_id: Id of the song, appended to the song page URL.
            song_name: Song title, used only in the progress messages.
            driver: Selenium WebDriver used to load and render the page.

        Returns:
            The comment count text taken from the page when it is a number
            greater than 100000; otherwise ``None`` (count too low, missing,
            or not numeric).
        """
        url = "https://music.163.com/#/song?id=" + str(music_id)
        driver.get(url)
        # The page content lives inside an iframe; use the same
        # switch_to.frame API that main() uses.
        driver.switch_to.frame("g_iframe")
        # Sleep three seconds to give the page time to finish loading.
        time.sleep(3)
        html = etree.HTML(driver.page_source)
        comments = html.xpath("//span[@class='j-flag']/text()")

        # The counter element may be absent or hold non-numeric text; treat
        # both as "no usable count" instead of raising.
        raw_total = comments[0] if comments else None
        try:
            total = int(raw_total)
        except (TypeError, ValueError):
            total = None

        if total is not None and total > 100000:
            print("获取 %s 的评论 %s 存储" % (song_name, raw_total))
            return raw_total

        print("获取 %s 的评论 %s 废弃" % (song_name, raw_total))
        return None
    
    # 将获得的歌手的热门歌曲id和名字写入csv文件
    def write_to_csv(song_name, song_url, artist_name, driver):
        """Append an artist's songs with more than 100000 comments to the CSV.

        Each written row is ``[name, url, comment_count, artist_name]``; no
        header row is written. Rows are appended to ``./songs/hotsongs.csv``,
        and the ``./songs`` directory is created when it does not exist.

        Args:
            song_name: Iterable of song titles.
            song_url: Iterable of relative song links, paired with
                ``song_name`` by position; the song id is the text after the
                last ``=`` in each link.
            artist_name: Artist the songs belong to.
            driver: Selenium WebDriver handed through to ``get_toal``.
        """
        import os  # local import: only needed to create the output directory

        output_path = './songs/hotsongs.csv'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # The with block guarantees the file is flushed and closed, even if
        # the loop is interrupted.
        with open(output_path, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)

            for name, url in zip(song_name, song_url):
                # Validate before touching the values; checking after the
                # string operations below would be too late.
                if name is None or url is None:
                    continue
                music_id = url.split('=')[-1]
                url = "https://music.163.com/#" + url
                try:
                    song_comments = get_toal(music_id, name, driver)
                    if song_comments is not None:
                        writer.writerow([name, url, song_comments, artist_name])
                except Exception as msg:
                    # Best effort: one failing song must not stop the rest of
                    # the crawl, so report the problem and carry on.
                    print(msg)
    
    
    def main(driver):
        """Crawl the hot songs of every artist returned by ``read_csv``.

        For each artist the artist page is loaded in ``driver``, the song
        titles and links are extracted from the rendered page, and both lists
        are handed to ``write_to_csv``.

        Args:
            driver: Selenium WebDriver shared by all page loads, so the
                browser is opened only once.
        """
        for artist_name, artist_id in read_csv():
            # Skip the CSV header row.
            if artist_id == 'artist_id':
                continue

            artist_page = "https://music.163.com/#/artist?id=" + str(artist_id)
            print("正在获取{}的热门歌曲...".format(artist_name))
            driver.get(artist_page)
            # The song list is rendered inside an iframe.
            driver.switch_to.frame("g_iframe")
            # Pause two seconds so the page can finish loading.
            time.sleep(2)

            tree = etree.HTML(driver.page_source)
            titles = tree.xpath("//span[@class='txt']/a/b/@title")
            links = tree.xpath("//span[@class='txt']/a/@href")

            # Persist the songs that pass the comment-count filter.
            write_to_csv(titles, links, artist_name, driver)
            print("{}的热门歌曲写入到本地成功!".format(artist_name))
    
    
    if __name__ == "__main__":
        # Script entry point: start one Chrome session and reuse it for the
        # whole crawl. The chromedriver location is hard-coded here.
        driver = webdriver.Chrome(executable_path="/www/spider-music163/songs/chromedriver.exe")
        try:
            main(driver)
        finally:
            # Always shut the browser down, even when the crawl fails, so no
            # Chrome or chromedriver process is left running.
            driver.quit()
    

    源码地址

    https://github.com/brady-wang/spider-music163  

  • 相关阅读:
    ASP.NET Web API 框架研究 Self Host模式下的消息处理管道
    ASP.NET Web API 框架研究 Web Host模式下的消息处理管道
    ASP.NET Web API 框架研究 核心的消息处理管道
    ASP.NET Web API 框架研究 Web Host模式路由及将请求转出到消息处理管道
    ASP.NET Web API 框架研究 ASP.NET Web API 路由
    ASP.NET Web API 框架研究 ASP.NET 路由
    ASP.NET Web API 入门 (API接口、寄宿方式、HttpClient调用)
    MVVM模式
    RESTful Web API 理解
    C# 函数式编程及Monads.net库
  • 原文地址:https://www.cnblogs.com/brady-wang/p/11864510.html
Copyright © 2011-2022 走看看