  • jieba text segmentation: removing stopwords and adding user-defined words

    import jieba
    from collections import Counter
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    from PIL import Image
    import numpy as np
    import jieba.analyse
    from pyquery import PyQuery
    
    with open('./santi.txt', 'r', encoding='utf-8') as f:  # read the local text file
        santi_text = f.read()
    
    jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of worker processes (not supported on Windows)
    
    jieba.load_userdict('./userdict.txt')  # load an external user dictionary
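    # The user dictionary holds one entry per line in the form "word [frequency] [POS tag]",
    # where frequency and POS tag are optional. Words can also be added inline without a
    # dictionary file, e.g. (an illustrative entry, not from the original post):
    # jieba.add_word('面壁者', freq=5, tag='n')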
    
    # Build the stopword list (one stopword per line in the file)
    def stopwordslist(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords
    
    # Remove stopwords (and single-character tokens) from a token sequence
    def movestopwords(sentence):
        stopwords = set(stopwordslist('./stop_words.txt'))  # path to the stopword file
        santi_words = [x for x in sentence if len(x) > 1 and x not in stopwords]
        return santi_words
    
    def main():
        words = jieba.cut(PyQuery(santi_text).text())  # strip any HTML tags, then segment
        word_list = movestopwords(words)  # remove stopwords
        words_split = " ".join(word_list)  # join the token list into a single string
    
        print('Keywords by TF-IDF -------------------------------------------------')
        keywords_tf = jieba.analyse.extract_tags(words_split, topK=100, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))  # TF-IDF keyword extraction
        for item in keywords_tf:
            print(item[0], item[1])
    
        print('Keywords by TextRank -------------------------------------------------')
        keywords_rank = jieba.analyse.textrank(words_split, topK=100, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))  # TextRank keyword extraction
        for item in keywords_rank:
            print(item[0], item[1])
    
        print('Raw word-frequency counts -------------------------------------------------')
        mycount = Counter(word_list)  # count token frequencies
        for key, val in mycount.most_common(100):  # the 100 most frequent tokens, in descending order
            print(key, val)
    
        # alice_mask = np.array(Image.open("./zhihu.png"))  # mask image
        wc = WordCloud(
            # width=800,
            # height=600,
            background_color="#000000",  # background color
            max_words=50,  # maximum number of words shown (default 200)
            max_font_size=400,  # largest font size
            min_font_size=10,  # smallest font size (default 4)
            # colormap='bone',  # string or matplotlib colormap, default="viridis"
            random_state=42,  # seed the RNG so layout and colors are reproducible
            # mask=plt.imread("./zhihu.png"),  # read the mask image
            # mask=alice_mask,  # apply the mask
            font_path='./SimHei.ttf'  # a font with Chinese glyphs is required to render Chinese text
        )
    
    
        my_wordcloud = wc.generate(words_split)  # generate the word cloud from term frequencies
        plt.imshow(my_wordcloud)  # display the word cloud
        plt.axis("off")  # hide the axes
        plt.show()
        wc.to_file('zzz.png')  # save the image to a file
    
    if __name__ == '__main__':
        main()
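The script expects two auxiliary files alongside it: userdict.txt and stop_words.txt. A minimal sketch that creates placeholder versions of both (the entries below are illustrative assumptions, not the files from the original post):

    # Write minimal sample versions of the two auxiliary files the script reads.
    # userdict.txt: one custom word per line, optionally followed by a frequency and a POS tag.
    with open('./userdict.txt', 'w', encoding='utf-8') as f:
        f.write('三体\n面壁者 5 n\n智子 3 n\n')

    # stop_words.txt: one stopword per line.
    with open('./stop_words.txt', 'w', encoding='utf-8') as f:
        f.write('的\n了\n是\n我们\n自己\n')

Note that the two keyword extractors work differently: extract_tags ranks terms by TF-IDF against jieba's built-in IDF statistics, while textrank runs a graph-based ranking over word co-occurrences; both re-segment the joined string internally.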
  • Original post: https://www.cnblogs.com/Erick-L/p/9395621.html