zoukankan      html  css  js  c++  java
  • 爬取网易云音乐评论数超过十万的歌曲

    代码如下

    import json
    from urllib.parse import urlencode
    
    import requests
    from lxml import etree
    from requests import RequestException
    
    from selenium import webdriver
    import time
    import csv
    
    
    
    
    
    # 获取歌手id和歌手姓名
    def read_csv():
        with open("files/music_163_artists.csv", "r", encoding="utf-8") as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                artist_name, artist_id = row
                if str(artist_id) is "artist_id":
                    continue
                else:
                    yield artist_name, artist_id
        # 当程序的控制流程离开with语句块后, 文件将自动关闭
    
    def get_toal(music_id,song_name,driver):
        url = "https://music.163.com/#/song?id="+music_id
        driver.get(url)
        # 切换成frame
        driver.switch_to_frame("g_iframe")
        # 休眠3秒,等待加载完成!
        time.sleep(3)
        response = driver.page_source
        html = etree.HTML(response)
        comments = html.xpath("//span[@class='j-flag']/text()")
    
        if len(comments) > 0  and int(comments[0]) > 100000:
            print("获取 %s 的评论 %s 存储" % (song_name, comments[0]))
            return comments[0]
        else:
            print("获取 %s 的评论 %s 废弃" % (song_name, comments[0]))
            return None
    
    # 将获得的歌手的热门歌曲id和名字写入csv文件
    def write_to_csv(song_name,song_url,artist_name,driver):
        csvfile = open('./songs/hotsongs.csv', 'a', encoding='utf-8', newline='')  # 文件存储的位置
        writer = csv.writer(csvfile)
        #writer.writerow(('歌曲名称', '歌曲url','评论总数','歌手'))
    
        for name, url in zip(song_name, song_url):
            music_id = url.split('=')[-1]
            url = "https://music.163.com/#" + url
            try:
                if name is not None and url is not None:
                    song_comments = get_toal(music_id,name,driver)
                    if song_comments is not None:
                        writer.writerow([name, url,song_comments,artist_name])
            except Exception as msg:
                print(msg)
                # 当程序的控制流程离开with语句块后, 文件将自动关闭
    
    
    def main(driver):
    
        for item in read_csv():
            artist_name, artist_id = item
            # 可以任意选择浏览器,前提是要配置好相关环境,更多请参考selenium官方文档
            # 避免多次打开浏览器
            if artist_id != 'artist_id':
                url = "https://music.163.com/#/artist?id=" + str(artist_id)
                print("正在获取{}的热门歌曲...".format(artist_name))
                driver.get(url)
                # 切换成frame
                driver.switch_to.frame("g_iframe")
                # 休眠3秒,等待加载完成!
                time.sleep(2)
                response = driver.page_source
    
                html = etree.HTML(response)
                song_name = html.xpath("//span[@class='txt']/a/b/@title")
                song_url = html.xpath("//span[@class='txt']/a/@href")
    
                # 写入到csv文件里面
                write_to_csv(song_name, song_url,artist_name,driver)
                print("{}的热门歌曲写入到本地成功!".format(artist_name))
    
    
    if __name__ == "__main__":
        driver = webdriver.Chrome(executable_path="/www/spider-music163/songs/chromedriver.exe")
        main(driver)
    

    源码地址

    https://github.com/brady-wang/spider-music163  

  • 相关阅读:
    1202诗人基本介绍&诗人画像
    1205人物关系优化&诗人轨迹
    把SQL Server 2000 表中的自动编号Id重新开始排列
    一个一直都不明白的东西今天知道了。关于sqlserver2000自动执行。
    服装打版界的扛把子ET自定义操作
    手把手教你搭建集中式版本控制系统SVN服务器
    分享一次实用的爬虫经验
    盘点CSV文件在Excel中打开后乱码问题的两种处理方法
    盘点服装设计所经常性使用的软件ET(下篇)
    sql 每个企业选择一条产品
  • 原文地址:https://www.cnblogs.com/brady-wang/p/11864510.html
Copyright © 2011-2022 走看看