zoukankan      html  css  js  c++  java
  • [python基础] python生成wordcloud并保存

    1.核心包

    #jieba、pandas用来处理数据,数据源以xls格式存储的,这里用pandas进行处理
    import
    jieba from jieba import analyse import pandas as pd
    #scipy、wordcloud创建词云
    from scipy.misc import imread from wordcloud import WordCloud
    from wordcloud import ImageColorGenerator
    #matplotlib展示、保存生成的词云图
    import matplotlib.pyplot as plt

    2.过程

    import io

    import jieba
    from jieba import analyse
    import matplotlib.pyplot as plt
    import pandas as pd
    from wordcloud import ImageColorGenerator
    from wordcloud import WordCloud


    # 1. Stop words
    def stop_words():
        """Load the stop-word list used to filter segmented text.

        Reads ``./百度停用词列表.txt`` as UTF-8, one stop word per line.

        Returns:
            set: The stripped, non-empty lines of the file as text strings.
        """
        # io.open with an explicit encoding yields text on both Python 2 and
        # Python 3, so no sys.setdefaultencoding hack or manual .decode() is
        # needed.
        with io.open(u'./百度停用词列表.txt', 'r', encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}


    # 2. Segment the text and drop stop words
    def remove_stopwords(stop_words):
        """Segment every row of the Excel source and write the kept words out.

        Reads the ``内容`` column of ``./11.xls``, segments each cell with
        ``jieba.cut``, discards tokens found in ``stop_words`` as well as
        tokens already seen in the same row, and writes the remaining tokens
        to ``./weibo.txt`` separated by single spaces.

        NOTE(review): the original comment said "save chinese only, remove
        english words, emoji", but only stop-word filtering is implemented.

        Args:
            stop_words: Container of words to discard; only membership tests
                are performed on it.
        """
        source_data = pd.read_excel('./11.xls')
        # Empty spreadsheet cells are read as NaN, which jieba cannot segment.
        content = source_data[u'内容'].dropna()

        # The context manager closes the output even if segmentation raises.
        with io.open('./weibo.txt', 'w', encoding='utf-8') as out:
            for line in content:
                seen = set()
                kept = []
                # Keep each word once per row, in first-occurrence order, so
                # the output is deterministic rather than set-ordered.
                for token in jieba.cut(line):
                    if token in stop_words or token in seen:
                        continue
                    seen.add(token)
                    kept.append(token)
                # Separate tokens with spaces so the later TextRank pass does
                # not see adjacent words glued together.
                out.write(u' '.join(kept) + u' ')


    # 3. Keyword weights
    def get_frequency_words(file):
        """Extract the top keywords of a text file with their weights.

        Args:
            file: Path of a UTF-8 text file to analyse.

        Returns:
            dict: Maps each of the (at most 400) keywords returned by
            ``jieba.analyse.textrank`` to its weight. These are TextRank
            scores, not raw occurrence counts.
        """
        with io.open(file, 'r', encoding='utf-8') as f:
            texts = f.read()

        top_words = analyse.textrank(texts, topK=400, withWeight=True)
        return {word: weight for word, weight in top_words}


    # 4. Render the word cloud and save it
    def generate_word_cloud(dict):
        """Render a word cloud shaped and coloured by ``./background.jpg``.

        Writes the raw cloud to ``word_cloud.jpg`` and a version recoloured
        from the background image to ``./result2.png``.

        Args:
            dict: Mapping of word -> weight; larger weights are drawn larger.
                The name shadows the builtin and is kept only so existing
                callers keep working.
        """
        # scipy.misc.imread has been removed from SciPy; matplotlib's reader
        # returns the same kind of pixel array for a JPEG.
        color_mask = plt.imread('./background.jpg')

        cloud = WordCloud(
            # A font with CJK glyphs is required, otherwise Chinese text is
            # rendered as garbage. Per the original author's note, the font
            # file name itself must not contain Chinese characters.
            font_path="./static/chinese.msyh.ttf",
            # Background colour (the library default is black).
            background_color='white',
            # Shape of the cloud. Canvas width/height are not set because
            # they have no effect when a mask is given.
            mask=color_mask,
            # Maximum number of words drawn.
            max_words=400,
            # Largest font size; defaults to the image height when omitted.
            max_font_size=150,
            # Share of words placed horizontally (library default is 0.9).
            prefer_horizontal=0.8
        )
        cloud.generate_from_frequencies(frequencies=dict)
        cloud.to_file('word_cloud.jpg')

        # Hide the axes so only the image appears in the saved figure.
        plt.axis('off')
        # Recolour the words using the colours of the background image.
        image_colors = ImageColorGenerator(color_mask)
        plt.imshow(cloud.recolor(color_func=image_colors))
        plt.savefig('./result2.png')


    if __name__ == '__main__':
        # Use a distinct local name so the stop_words() function is not
        # rebound to its own result.
        stop_word_set = stop_words()
        remove_stopwords(stop_words=stop_word_set)
        words_frequency = get_frequency_words('./weibo.txt')
        generate_word_cloud(words_frequency)

    [注]:(1).wordcloud.generate_from_text(text=text)可以直接由文本生成词云,但它按空白和标点切词,适用于英文等以空格分词的文本;中文文本需先分词(如用jieba)并以空格连接后再传入。

      (2).wordcloud.generate_from_frequencies(frequencies=dict)由词频字典生成词云,词频越大则显示该词size越大
    [结果]:

  • 相关阅读:
    【原创】贴片电容的测量方法。。。这是我从自己QQ空间转过来的,本人实操!
    CentOS6.4安装Apache+MySQL+PHP
    第一次在博客园写博客。。。新人
    C# 简单生成双色球代码
    从客户端中检测到有潜在危险的 Request.Form 值 方法
    经典实例
    js鼠标键禁用功能
    逻辑思维题
    C#运算符笔记
    C#基础
  • 原文地址:https://www.cnblogs.com/halleluyah/p/9792348.html
Copyright © 2011-2022 走看看