zoukankan      html  css  js  c++  java
  • 爬虫学习笔记:酷狗音乐榜单TOP500

    一、背景

    酷狗音乐热门榜单-酷狗TOP500(网页版)链接为:

    # 链接
    https://www.kugou.com/yy/rank/home/1-8888.html?from=rank
    # 网页版没有“下一页”按钮,只能自行构造链接来翻页
    # 经观察,把链接中的 1-8888 依次替换为 2-8888、3-8888 …… 即可
    

    二、实操

    1.加载模块

    import pandas as pd
    import numpy as np
    import time
    import requests
    from bs4 import BeautifulSoup
    import matplotlib.pyplot as plt
    from PIL import Image
    from wordcloud import WordCloud
    

    2.测试单独爬取

    # Page to scrape
    url = r'https://www.kugou.com/yy/rank/home/1-8888.html?from=rank'
    # Request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Connection': 'close',
    }
    # Send the request; the timeout stops a stalled connection from blocking forever
    r = requests.get(url, headers=headers, timeout=10)
    r.status_code  # 200 means the page was returned normally
    

    3.解析

    # Parse the response with bs4 (lxml parser)
    soup = BeautifulSoup(r.text, 'lxml')
    # The '.pc_temp_songname' elements supply both the title text and the
    # 'href' link, so a single selection is enough.
    title_tags = soup.select('.pc_temp_songname')
    time_tags = soup.select('.pc_temp_time')

    # Collected rows, one dict per song
    data_all = []
    # Distinct loop names keep the tag lists from being overwritten by their
    # own elements.
    for title_tag, time_tag in zip(title_tags, time_tags):
        text = title_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
        # Prefer the spaced separator so a hyphen inside a name is not treated
        # as the split point; fall back to a bare '-' otherwise. partition()
        # yields an empty second part instead of raising when no separator
        # exists.
        sep = ' - ' if ' - ' in text else '-'
        first, _, second = text.partition(sep)
        data = {
            '歌名': first.strip(),
            '歌手': second.strip(),
            '时长': time_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '').strip(),
            '链接': title_tag.get('href'),
        }
        print(data)
        data_all.append(data)

    df = pd.DataFrame(data_all)
    '''
          歌名               歌手    时长                                           链接
    0    孤勇者              陈奕迅  4:16  https://www.kugou.com/mixsong/5rcb3re6.html
    1   一路生花              温奕心  4:16  https://www.kugou.com/mixsong/592l9gb7.html
    2      叹  黄龄、Tăng Duy Tân  4:11  https://www.kugou.com/mixsong/5w42mq78.html
    3  好想抱住你          程jiajia  3:42  https://www.kugou.com/mixsong/5uhaec79.html
    4     下潜      川青、Morerare  3:37  https://www.kugou.com/mixsong/5sewos85.html
    '''
    

    三、函数封装

    def get_data(max_page=23):
        """Scrape the chart pages and count how many rows each singer has.

        Requests pages 1..max_page using the ``{page}-8888.html`` URL pattern,
        parses each response with BeautifulSoup and builds one dict per song.
        Every row is printed as it is collected, and the function sleeps for
        two seconds after each page.

        Args:
            max_page: Last page number to request (inclusive). Defaults to 23,
                the number of pages the original loop covered.

        Returns:
            tuple: ``(data_all, dic)``. ``data_all`` is a list of dicts with the
            keys '歌名', '歌手', '时长' and '链接'; ``dic`` maps each '歌手' value
            to the number of rows it appeared in.
        """
        # The headers are the same for every page, so build them once.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
            'Connection': 'close',
        }
        dic = {}
        data_all = []
        for i in range(1, max_page + 1):
            url = f'https://www.kugou.com/yy/rank/home/{i}-8888.html?from=rank'
            # The timeout keeps one stalled connection from hanging the whole run.
            r = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(r.text, 'lxml')
            # The '.pc_temp_songname' elements supply both the title text and
            # the 'href' link, so a single selection is enough.
            title_tags = soup.select('.pc_temp_songname')
            time_tags = soup.select('.pc_temp_time')
            for title_tag, time_tag in zip(title_tags, time_tags):
                text = title_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
                # Prefer the spaced separator so a hyphen inside a name is not
                # treated as the split point; partition() yields an empty
                # second part instead of raising when no separator exists.
                sep = ' - ' if ' - ' in text else '-'
                first, _, second = text.partition(sep)
                data = {
                    '歌名': first.strip(),
                    '歌手': second.strip(),
                    '时长': time_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '').strip(),
                    '链接': title_tag.get('href'),
                }
                print(data)
                data_all.append(data)
                dic[data['歌手']] = dic.get(data['歌手'], 0) + 1
            # Pause between page requests.
            time.sleep(2)
        return data_all, dic
    
    # Call get_data() and load the scraped rows into a DataFrame
    data_all, dic = get_data()
    df = pd.DataFrame(data_all)
    

    四、完整版

    import pandas as pd
    import numpy as np
    import time
    import requests
    from bs4 import BeautifulSoup
    import matplotlib.pyplot as plt
    from PIL import Image
    from wordcloud import WordCloud
    
    def cnt_songer(songer, dic):
        """Add one to the tally kept for *songer* in *dic*, modifying it in place.

        A singer that is not yet a key starts at 1. Nothing is returned.
        """
        dic[songer] = dic.get(songer, 0) + 1
    
    def get_data(max_page=23):
        """Scrape the chart pages and count how many rows each singer has.

        Requests pages 1..max_page using the ``{page}-8888.html`` URL pattern,
        parses each response with BeautifulSoup and builds one dict per song.
        Every row is printed as it is collected, singer tallies are updated
        through cnt_songer(), and the function sleeps for two seconds after
        each page.

        Args:
            max_page: Last page number to request (inclusive). Defaults to 23,
                the number of pages the original loop covered.

        Returns:
            tuple: ``(data_all, dic)``. ``data_all`` is a list of dicts with the
            keys '歌名', '歌手', '时长' and '链接'; ``dic`` maps each '歌手' value
            to the number of rows it appeared in.
        """
        # The headers are the same for every page, so build them once.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
            'Connection': 'close',
        }
        dic = {}
        data_all = []
        for i in range(1, max_page + 1):
            url = f'https://www.kugou.com/yy/rank/home/{i}-8888.html?from=rank'
            # The timeout keeps one stalled connection from hanging the whole run.
            r = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(r.text, 'lxml')
            # The '.pc_temp_songname' elements supply both the title text and
            # the 'href' link, so a single selection is enough.
            title_tags = soup.select('.pc_temp_songname')
            time_tags = soup.select('.pc_temp_time')
            for title_tag, time_tag in zip(title_tags, time_tags):
                text = title_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
                # Prefer the spaced separator so a hyphen inside a name is not
                # treated as the split point; partition() yields an empty
                # second part instead of raising when no separator exists.
                sep = ' - ' if ' - ' in text else '-'
                first, _, second = text.partition(sep)
                data = {
                    '歌名': first.strip(),
                    '歌手': second.strip(),
                    '时长': time_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '').strip(),
                    '链接': title_tag.get('href'),
                }
                print(data)
                data_all.append(data)
                cnt_songer(data['歌手'], dic)
            # Pause between page requests.
            time.sleep(2)
        return data_all, dic
    
    def process_data(dic):
        """Return the entries of *dic* whose count exceeds 1, highest count first.

        Entries with equal counts keep the relative order they had in *dic*.
        The input mapping is not modified; a new dict is returned.
        """
        repeated = [(name, count) for name, count in dic.items() if count > 1]
        # list.sort is stable (also with reverse=True), so ties stay in input order.
        repeated.sort(key=lambda pair: pair[1], reverse=True)
        return dict(repeated)
    
    def main():
        """Scrape the chart, print how many singers pass the filter, and return the results.

        Returns:
            tuple: the pandas DataFrame built from the rows returned by
            get_data(), and the dict produced by process_data() from the
            singer counts.
        """
        rows, singer_counts = get_data()
        filtered = process_data(singer_counts)
        frame = pd.DataFrame(rows)
        print(len(filtered))
        return frame, filtered
    
    # Run the scrape only when the file is executed as a script; main() returns
    # the DataFrame and the filtered singer counts.
    if __name__ == '__main__':
        data, dic_result = main()
    
    

    五、词云图

    有待继续学习!

    To be continued...

    参考链接:华语乐坛到底姓什么?------酷狗篇

  • 相关阅读:
    C#开发微信门户及应用(18)-微信企业号的通讯录管理开发之成员管理
    C#开发微信门户及应用(17)-微信企业号的通讯录管理开发之部门管理
    C#开发微信门户及应用(16)-微信企业号的配置和使用
    C#开发微信门户及应用(15)-微信菜单增加扫一扫、发图片、发地理位置功能
    会员管理系统的设计和开发(3)--主界面的设计思路分享
    会员管理系统的设计和开发(2)-- RDLC报表的设计及动态加载
    Winform开发中常见界面的DevExpress处理操作
    在WCF数据访问中使用缓存提高Winform字段中文显示速度
    双指针算法模板和一些题目
    尾递归 递归函数中,递归调用是整个函数体中最后的语句,且它的返回值不属于表达式的一部分时,这个递归调用就是尾递归,空间复杂度是O(1)
  • 原文地址:https://www.cnblogs.com/hider/p/15808606.html
Copyright © 2011-2022 走看看