zoukankan      html  css  js  c++  java
  • 文本分析

    # pip install snownlp  -- installs SnowNLP, a natural-language (text) processing library
    import numpy as np
    import pandas as pd
    from snownlp import SnowNLP
    import matplotlib.pyplot as plt
    # Load the hotel review data; the CSV is read with the GB18030 encoding.
    hotel = pd.read_csv('hotel.csv', encoding='gb18030')
    # Preview the first rows of the table.
    hotel.head()

    5 rows × 41 columns

    # Keep only the non-missing entries of the 'comment' column,
    # then look at the first five reviews.
    x = hotel['comment'].dropna()
    x.head()

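    注意:这里的 x.index 是 pandas Series 的属性(不是方法调用),返回 dropna() 之后保留下来的行索引标签;它与列表的 index() 方法不同。
    列表方法 L.index(obj[, start=0[, stop=len(L)]]) 返回 obj 在列表中第一个匹配项的索引位置,若 obj 不在列表中则抛出 ValueError 异常;obj 是要查找的对象,start 为可选参数,表示开始索引,默认为 0,可以单独指定;stop 为可选参数,表示结束索引,默认为列表长度,不能单独指定(必须同时给出 start)。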
    # Inspect the Series index, i.e. the row labels that remain after dropna().
    x.index

    Series 的长度小于索引的最大值,说明原始数据中可能存在缺失的评价(已被 dropna() 去掉)。

    # Show the first review.
    # Use positional access: after dropna() the label 0 may no longer exist,
    # in which case the label-based lookup x[0] would raise KeyError.
    text = x.iloc[0]
    text

    # Wrap the review under test in a SnowNLP object and print it
    # one sentence at a time.
    s = SnowNLP(text)
    for sentence in s.sentences:
        print(sentence)

    # Take the second sentence of the review and read its `sentiments`
    # attribute, which the original author describes as a probability.
    second_sentence = s.sentences[1]
    s1 = SnowNLP(second_sentence)
    s1.sentiments
    0.6955113989735859
    # Score every review and collect the sentiment values.
    sentimentslist = []
    for i in x.index:
        s1 = x[i]
        s = SnowNLP(s1)
        print(s.sentences)
        # Read the score once so the printed value is exactly the stored one.
        score = s.sentiments
        print(score)
        sentimentslist.append(score)

    # List every collected sentiment value.
    sentimentslist

    # Mean of all collected sentiment values.
    pd.Series(sentimentslist).mean()
    0.5515285981366501
    # Draw a 20-bin histogram of the sentiment values.
    plt.hist(sentimentslist, bins=20, facecolor='blue')
    plt.xlabel('Sentiments Probability')
    plt.ylabel('Quantity')
    plt.title('Analysis of Sentiments')
    plt.show()

    #pip install jieba
    #conda install wordcloud
    import matplotlib.pyplot as plt #词云
    from scipy.misc import imread  #图像处理
    #from matplotlib.pyplot import imread
    from wordcloud import WordCloud
    import jieba, codecs
    from collections import Counter
    from wordcloud import WordCloud, ImageColorGenerator
    import csv
    # Always pass the encoding explicitly when reading the file; otherwise
    # titles and other text fields show up garbled in the plots.
    # newline='' is what the csv module expects so that line breaks inside
    # quoted fields are parsed correctly. The file is opened exactly once,
    # inside a context manager, so the handle is always closed.
    with open('hotel.csv', 'r', encoding='gb18030', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Take the 7th field from the end of every row (header row included).
        column = [row[-7] for row in reader]
    print(column)

    # Drop the first entry, which comes from the header row.
    text = column[1:]
    text

    # Write the reviews to a UTF-8 text file, one review per line.
    # Joining the strings (instead of writing str(text)) keeps the list's
    # brackets, quotes and escape sequences out of the corpus.
    with open('data.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(text))
    # Read the corpus back; the context manager closes the handle.
    with open('data.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    # Segment the text with jieba (default mode) and join the tokens with spaces.
    ' '.join(jieba.cut(text))

    # Text data: read the corpus and segment it into space-separated tokens.
    with open('data.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    cut_text = ' '.join(jieba.cut(text))
    print(cut_text)
    #color_mask = imread("tupian.png")
    cloud = WordCloud(
        # The font must be set; ideally keep the font file next to the script.
        # Raw string so the Windows path separators are kept literally
        # (the original path had lost its backslashes).
        font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
        background_color='white',
        #mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )

    # Generate the word cloud from the segmented text and display it
    # without axes.
    plt.figure(figsize=[20, 8])
    word_cloud = cloud.generate(cut_text)
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

  • 相关阅读:
    玩聚SD:感谢曹增辉的博客点评
    Social Dialogue征集IT意见领袖和优秀博客的RSS地址
    微软+Powerset>GoogleAdSense还是>GoogleSearch?
    1989旧金山地震:动物预测成功的非经典案例
    随手小记·危机来了与贪婪恐惧
    玩聚SD:感谢风言疯语之IT罗盘对玩聚SD的推荐
    独立思考之慎用孤例
    08软件技术英雄会:一次比一次接近完美
    独立思考之手动check
    MyBatisSpring MapperScannerConfigurer
  • 原文地址:https://www.cnblogs.com/RR-99/p/10410038.html
Copyright © 2011-2022 走看看