  • Python visualization: word clouds

    My code:

    # -*- coding: utf-8 -*-
    from pandas import read_csv
    import numpy as np
    from sklearn.datasets.base import Bunch
    import pickle  # used to persist (save/load) the Bunch objects
    from sklearn.feature_extraction.text import TfidfVectorizer
    import jieba
    import operator  # for sorting
    from sklearn import metrics
    from sklearn.externals import joblib
    import xlwt
    # import the wordcloud and matplotlib modules
    import wordcloud
    import matplotlib.pyplot as plt
    from scipy.misc import imread
    
            
    '''Read a file (used here for the stop-word list)'''
    def _readfile(path):
        with open(path, "rb") as fp:
            content = fp.read()
        return content
    
    
    '''Read a Bunch object from disk'''
    
    
    def _readbunchobj(path):
        with open(path, "rb") as file_obj:
            bunch = pickle.load(file_obj)
        return bunch
    
    
    '''Write a Bunch object to disk'''
    
    
    def _writebunchobj(path, bunchobj):
        with open(path, "wb") as file_obj:
            pickle.dump(bunchobj, file_obj)
    
    def buildtestbunch(bunch_path, art_test):
        bunch = Bunch(contents=[])#label=[],
    # =============================================================================
    #     for item1 in testlabel:
    #         bunch.label.append(item1)
    # =============================================================================
    
        # testContentdatasave=[]  # holds the segmented words of all training and test data
        for item2 in art_test:
            item2 = str(item2)
            item2 = item2.replace("
    ", "")
            item2 = item2.replace(" ", "")
            content_seg = jieba.cut(item2)
            save2 = ''
            for item3 in content_seg:
                if len(item3) > 1 and item3 != '\n':
                    # testContentdatasave.append(item3)
                    save2 = save2 + "," + item3
            bunch.contents.append(save2)
        with open(bunch_path, "wb") as file_obj:
            pickle.dump(bunch, file_obj)
        print("构建测试数据文本对象结束!!!")
    
    
    def vector_space(stopword_path, bunch_path, space_path):
        stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop-word list
        bunch = _readbunchobj(bunch_path)  # load the segmented-text Bunch object
        # build the TF-IDF vector-space object
        tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    
        # tdm is the weight matrix: a 2-D matrix in which tdm[i][j] is the TF-IDF weight of the j-th word (its index in the vocabulary) in the i-th text
    
        # initialize the vector-space model with TfidfVectorizer
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001, use_idf=True,
                                     max_features=15000)
        # print(vectorizer)
        # convert the texts into a term-weight matrix; the vocabulary is kept separately
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
        # persist the bag-of-words / TF-IDF space
        _writebunchobj(space_path, tfidfspace)
        print("if-idf词向量空间实例创建成功!!!")
    
    
    def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
        stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
        bunch = _readbunchobj(bunch_path)
        tfidfspace = Bunch(tdm=[], vocabulary={})  # label=bunch.label,
        # load the training set's TF-IDF vector space (its vocabulary is reused below)  ★★
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
    
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary,
                                     min_df=0.001)

        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        _writebunchobj(space_path, tfidfspace)
        print("if-idf词向量空间实例创建成功!!!")
    
    if __name__ == "__main__":
        
        
        Sdata = []
        art = []
        '''============================ Load the data first =================================='''
        file_test = 'F:/goverment/text analyse/type_in.csv'
    
        dataset = read_csv(file_test)
        Sdata = dataset.values[:, :]
        Sdata=Sdata.tolist()
        for line in Sdata:
            art.append(line[1])  # line[1] is the text column
        print(len(Sdata))
        
        '''============================ TF-IDF text feature extraction ============================'''
        # file paths: the segmented-text Bunch, the TF-IDF space, and the stop-word list
        test_bunch_path = "F:/goverment/text analyse/trainbunch.bat"
        test_space_path = "F:/goverment/text analyse/traintfdifspace.dat"
        stopword_path = "F:/goverment/text analyse/hlt_stop_words.txt"
    
        '''============================ Build the Bunch and extract TF-IDF features from the article texts ============================'''
    
        buildtestbunch(test_bunch_path, art)
    
        testvector_space(stopword_path, test_bunch_path, test_space_path, test_space_path)
    
        test_set = _readbunchobj(test_space_path)
    
        '''Test data'''
    
        
        # map a known word id back to its text
        txtcut=[]  # holds every word in the vocabulary
        dic={}
        for i in test_set.vocabulary.keys():
            txtcut.append(i)
            dic[test_set.vocabulary[i]]=i
            
    
        #print(dic)
        
        #print(test_set.tdm)
        #print(test_set.tdm[0])
        #print(dir(test_set))
        #print(test_set.vocabulary)
        #print(dir(test_set.tdm))
        
        #print(Sdata)
        
        #print(nonzero[1])
      
        '''final holds at most 15 weighted words for each text'''
        #print(Sdata)
        final=[]
        for k in range(len(Sdata)):  # iterate over every text
            nonzero=test_set.tdm[k].nonzero()
            ls=[]
            ls.append(Sdata[k][0])
            num=0
            for i in range(len(nonzero[1])):
                num=num+1
                b=test_set.tdm[k, nonzero[1][i]]*100  # test_set.tdm[k, nonzero[1][i]] is the weight of the i-th nonzero-weight word in text k
                a= dic[nonzero[1][i]] +"  "+str(round(b,2))+"%"
                ls.append(a)
                if num==15:
                    break
            final.append(ls)
        
        '''Draw the word cloud'''
        fig = plt.figure(figsize = (15,15))
        cloud = wordcloud.WordCloud(font_path='STXINGKA.TTF',mask=imread('water3.png'),mode='RGBA',
                                    background_color=None).generate(' '.join(txtcut))
        img = imread('water3.png')
        cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
        cloud.recolor(color_func=cloud_colors)
        plt.imshow(cloud)
        plt.axis('off')
        plt.savefig('watercloud3.png',dpi=400)
        plt.show()
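        # Note (not from the original post): with mode='RGBA' and background_color=None,
        # plt.savefig() fills the transparent areas with the white figure background
        # unless transparent=True is passed. To keep the transparency, one option is to
        # let wordcloud write the image itself, e.g.:
        #   cloud.to_file('watercloud3_rgba.png')  # hypothetical output filename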
        
        myexcel = xlwt.Workbook()
        sheet = myexcel.add_sheet("sheet1")
        si=-1
        sj=-1
        for line in final:
            si=si+1
            sj=-1
            for i in line:
                sj=sj+1
                sheet.write(si,sj,str(i))
        
        myexcel.save("各条分词.xls")
        
        
        # save the vocabulary ids
        myexcel = xlwt.Workbook()
        sheet = myexcel.add_sheet("sheet2")
        p=0
        for i in test_set.vocabulary.keys():
            sheet.write(p,0,i)
            print(i)
            sheet.write(p,1,str(test_set.vocabulary[i]))
            p=p+1
                
        myexcel.save("词汇id.xls")

     Per-text segmentation output (各条分词.xls):

     Vocabulary id output (词汇id.xls):
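
    A note on library versions: the script above targets older releases of scikit-learn and SciPy. In newer releases, sklearn.datasets.base.Bunch, sklearn.externals.joblib, and scipy.misc.imread have been removed. Below is a minimal sketch of replacement imports, assuming recent scikit-learn, joblib, and imageio packages are installed; the rest of the script can stay as it is.

    # Replacement imports for newer library versions (sketch, not from the original post)
    from sklearn.utils import Bunch   # replaces sklearn.datasets.base.Bunch
    import joblib                     # replaces sklearn.externals.joblib
    from imageio import imread        # replaces scipy.misc.imread
    # matplotlib.pyplot.imread would also work for reading the PNG mask image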

  • Original post: https://www.cnblogs.com/caiyishuai/p/9590861.html