zoukankan      html  css  js  c++  java
  • 爬取b站周杰伦新歌mv弹幕 绘制词云

    1. 爬虫代码

    # -*- coding: utf-8 -*-
    """
    Created on Sat Jun 13 20:15:03 2020
    
    @author: Administrator
    """
    
    
    import requests
    import json
    import chardet
    import re
    from pprint import pprint
    # 1.根据bvid请求得到cid
    def get_cid(bvid='BV1PK4y1b7dt'):
        """Return the cid of the first page of a Bilibili video.

        Queries the player ``pagelist`` endpoint for ``bvid`` and reads the
        ``cid`` field of the first entry in the ``data`` list of the JSON reply.

        Args:
            bvid: Id of the video to look up. Defaults to the id that used to
                be hard-coded in the URL, so ``get_cid()`` behaves as before.

        Returns:
            The ``cid`` value of the first entry in ``data``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection failure or timeout.
            KeyError, IndexError: If the reply lacks the expected fields.
        """
        url = 'https://api.bilibili.com/x/player/pagelist'
        # A timeout keeps the script from hanging forever on a stalled connection.
        res = requests.get(url, params={'bvid': bvid, 'jsonp': 'jsonp'}, timeout=10)
        # Fail with a clear HTTP error instead of an obscure JSON/KeyError later.
        res.raise_for_status()
        json_dict = json.loads(res.text)

        return json_dict["data"][0]["cid"]
    
    # 2.根据cid请求弹幕,解析弹幕得到最终的数据
    # 接口
    def get_data(cid):
        """Download the danmaku list for ``cid`` and return the comment texts.

        Requests the ``dm/list.so`` endpoint, decodes the body using the
        encoding detected by ``chardet`` and extracts the text of every
        ``<d ...>...</d>`` element with a regular expression.

        Args:
            cid: Value appended to the ``oid`` query parameter (converted with
                ``str``).

        Returns:
            A list of strings, one per matched ``<d>`` element, with XML
            character references such as ``&amp;`` decoded to plain characters.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection failure or timeout.
        """
        import html  # local import so this function stands on its own

        final_url = "https://api.bilibili.com/x/v1/dm/list.so?oid=" + str(cid)
        final_res = requests.get(final_url, timeout=10)
        final_res.raise_for_status()
        # chardet reports None when it cannot decide; fall back to UTF-8 then.
        detected = chardet.detect(final_res.content)['encoding']
        final_res.encoding = detected or 'utf-8'
        xml_text = final_res.text
        pattern = re.compile('<d.*?>(.*?)</d>')
        # The payload is XML, so '&', '<' and '>' inside comments arrive escaped;
        # decode them so the saved text contains the characters users typed.
        return [html.unescape(item) for item in pattern.findall(xml_text)]
    
    # 3.保存弹幕列表
    def save_to_file(data):
        """Write every string in ``data`` to ``dan_mu.txt``, one per line.

        The file is created in the current working directory, encoded as
        UTF-8, and overwritten if it already exists.

        Args:
            data: Iterable of strings; each one is written followed by a
                newline.
        """
        with open("dan_mu.txt", mode="w", encoding="utf-8") as f:
            # The original newline literal was split across two source lines,
            # which is a syntax error; "\n" restores the one-item-per-line output.
            f.writelines(i + "\n" for i in data)
    
    # Crawler entry point: look up the cid, fetch the danmaku texts for it,
    # then write them to dan_mu.txt.
    cid = get_cid()
    data = get_data(cid)
    save_to_file(data)
    

    2. 绘制词云

    #!/usr/bin/env python
    """
    Image-colored wordcloud
    =======================
    
    You can color a word-cloud by using an image-based coloring strategy
    implemented in ImageColorGenerator. It uses the average color of the region
    occupied by the word in a source image. You can combine this with masking -
    pure-white will be interpreted as 'don't occupy' by the WordCloud object when
    passed as mask.
    If you want white as a legal color, you can just pass a different image to
    "mask", but make sure the image shapes line up.
    """
    #导入必要的库
    from os import path
    from PIL import Image
    import numpy as np
    import matplotlib.pyplot as plt
    
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    
    
    # Read the whole danmaku text.
    with open(r'./dan_mu.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # Read the mask / color image. The context manager closes the image file
    # once its pixels have been copied into the array.
    with Image.open(r"./wordcloud/princess.jpg") as mask_image:
        alice_coloring = np.array(mask_image)

    # Set up the stop words.
    stopwords = set(STOPWORDS)
    stopwords.add("said")

    # The mask argument sets the shape of the word cloud.
    # Windows-specific font file; the path needs its backslash separators,
    # otherwise it names a non-existent file "C:WindowsFontssimfang.ttf".
    font = r'C:\Windows\Fonts\simfang.ttf'
    wc = WordCloud(font_path=font, background_color="black", max_words=2000, mask=alice_coloring,
                   stopwords=stopwords, max_font_size=40, random_state=42)
    # Generate the word cloud.
    wc.generate(text)

    # Create a coloring from the image.
    image_colors = ImageColorGenerator(alice_coloring)

    # Show the result. With only the mask set, the word cloud takes the shape
    # of the image.
    plt.figure(figsize=(8, 9))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")

    # Alternative: pass wc.recolor(color_func=image_colors) to plt.imshow (or
    # give color_func=image_colors to the WordCloud constructor) so the font
    # colors follow the colors of the image.

    plt.show()
    

    背景图:

  • 相关阅读:
    php-Zip打包文件
    PHP命令行类库 climate
    vim 添加块注释
    冒泡排序|插入排序
    PHP-SeasLog安装和使用
    链表
    多线程上下文切换
    竞态与线程安全
    线程的生命周期
    线程创建的两种方法
  • 原文地址:https://www.cnblogs.com/douzujun/p/13122229.html
Copyright © 2011-2022 走看看