zoukankan      html  css  js  c++  java
  • 中文手机评论情感分析系列(二)

    第二部分是基于属性词典的评论分类,分类原则是:只要评论中包含某个属性词典中的分词,就把该评论分到对应的属性类中去。

    '''基于词典的评论文本按属性分类'''
    
    import pandas as pd
    import re,time
    import jieba
    from sklearn.feature_extraction.text import  CountVectorizer, TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    import numpy as np
    import pyltp,os
    
    class TextClass():
        """Group review texts by product attribute using attribute dictionaries.

        A review is assigned to an attribute class as soon as one of its feature
        words occurs in that attribute's dictionary file; a review may therefore
        land in several classes. Reviews that match no dictionary at all are
        collected under the ``'none'`` key.

        The file locations default to the paths hard-coded by the original
        script and can be overridden through the constructor.
        """

        # Default locations (identical to the values the script used before).
        DEFAULT_STOPWORD_PATH = 'D:/论文文件/学习文件/情感分析/dict词典/哈工大stopword .txt'
        # Raw strings without a trailing backslash: a literal ending in "\'"
        # escapes its own closing quote and does not parse.
        DEFAULT_LTP_DIR = r'D:\LTP\ltp_data_v3.4.0'
        DEFAULT_DICT_DIR = r'D:\论文文件\阅读论文\写论文准备\字典构建\手机属性词典\dictionary_0_3'

        # (result key, dictionary file name), in the order of the result dict.
        CATEGORY_FILES = (
            ('camera', '相机.txt'),
            ('processor', '处理器.txt'),
            ('price', '价格.txt'),
            ('performance', '性能.txt'),
            ('endurance', '续航.txt'),
            ('appearance', '外观.txt'),
            ('serve', '售后.txt'),
        )

        # POS tags whose words are kept as feature words.
        # NOTE(review): presumably LTP's noun / other-proper-noun / abbreviation
        # tags -- confirm against the LTP tag set.
        FEATURE_TAGS = ('n', 'nz', 'j')

        # Marker used for reviews without any feature word, and result key for
        # reviews that belong to no attribute class.
        NO_FEATURE = 'none'

        def __init__(self, stopword_path=None, ltp_dir=None, dict_dir=None):
            """Create a classifier; no file is opened until it is first needed.

            Args:
                stopword_path: UTF-8 stop-word file, one word per line.
                ltp_dir: directory containing ``cws.model`` and ``pos.model``.
                dict_dir: directory containing the attribute dictionary files.
            """
            self.stopWord = []
            self.stopword_path = stopword_path or self.DEFAULT_STOPWORD_PATH
            self.ltp_dir = ltp_dir or self.DEFAULT_LTP_DIR
            self.dict_dir = dict_dir or self.DEFAULT_DICT_DIR
            self._stop_set = None    # cached set view of self.stopWord
            self._segmentor = None   # lazily loaded pyltp.Segmentor
            self._postagger = None   # lazily loaded pyltp.Postagger

        def _load_stop_words(self):
            """Read the stop-word file once and return the words as a set."""
            if self._stop_set is None:
                # utf-8-sig also accepts plain UTF-8 and drops a leading BOM,
                # which would otherwise stick to the first word.
                with open(self.stopword_path, 'r', encoding='utf-8-sig') as fr:
                    self.stopWord.extend(word.strip() for word in fr)
                self._stop_set = set(self.stopWord)
            return self._stop_set

        def _load_models(self):
            """Load the segmentation and POS models once and return both."""
            if self._segmentor is None:
                segmentor = pyltp.Segmentor()
                segmentor.load(os.path.join(self.ltp_dir, 'cws.model'))
                self._segmentor = segmentor
            if self._postagger is None:
                postagger = pyltp.Postagger()
                postagger.load(os.path.join(self.ltp_dir, 'pos.model'))
                self._postagger = postagger
            return self._segmentor, self._postagger

        def release(self):
            """Release any loaded pyltp models; they are reloaded on next use."""
            for attr in ('_segmentor', '_postagger'):
                model = getattr(self, attr, None)
                if model is not None:
                    model.release()
                    setattr(self, attr, None)

        def seg_pos(self, sent):
            """Segment a sentence and tag the remaining words with their POS.

            ASCII letters and digits are stripped first, then stop words are
            removed from the segmentation result before tagging.

            Args:
                sent: the review text.

            Returns:
                A list of ``(word, pos_tag)`` tuples; empty when nothing is left
                to tag.
            """
            stop_words = self._load_stop_words()
            line = re.sub(r'[a-zA-Z0-9]+', '', sent)
            if not line.strip():
                return []
            segmentor, postagger = self._load_models()
            words = [word for word in segmentor.segment(line)
                     if word not in stop_words]
            if not words:
                return []
            return list(zip(words, list(postagger.postag(words))))

        def feature_extraction(self, pos):
            """Return the words of ``pos`` whose tag is one of ``FEATURE_TAGS``.

            Args:
                pos: iterable of ``(word, pos_tag)`` pairs as built by ``seg_pos``.

            Returns:
                The selected words, in their original order.
            """
            return [item[0] for item in pos if item[1] in self.FEATURE_TAGS]

        def openFile(self, path):
            """Yield the stripped lines of the UTF-8 dictionary file at ``path``."""
            # utf-8-sig: tolerate a byte-order mark at the start of the file.
            with open(path, 'r', encoding='utf-8-sig') as f:
                for word in f:
                    yield word.strip()

        def text2feature(self, text_list):
            """Turn every review into its list of feature words.

            Args:
                text_list: iterable of review strings.

            Returns:
                One list per review; a review without feature words is
                represented by ``['none']`` so it can be handled later.
            """
            feature_list = []
            for line in text_list:
                features = self.feature_extraction(self.seg_pos(line))
                feature_list.append(features if features else [self.NO_FEATURE])
            return feature_list

        def classify(self, feature_list, dict_path=None):
            """Assign review indices to attribute classes.

            Args:
                feature_list: one list of feature words per review, as returned
                    by ``text2feature``.
                dict_path: directory holding the attribute dictionaries;
                    defaults to the directory configured on the instance.

            Returns:
                A dict mapping each attribute key (plus ``'none'``) to the set
                of positions in ``feature_list`` that belong to it. ``'none'``
                receives reviews marked as having no feature word and reviews
                whose feature words occur in none of the dictionaries.
            """
            base = self.dict_dir if dict_path is None else dict_path
            # Sets give constant-time membership tests; blank lines are skipped.
            lexicons = [
                (key, {word for word in self.openFile(os.path.join(base, name))
                       if word})
                for key, name in self.CATEGORY_FILES
            ]
            result = {key: set() for key, _ in self.CATEGORY_FILES}
            result[self.NO_FEATURE] = set()
            for index, features in enumerate(feature_list):
                matched = False
                for feature in features:
                    for key, lexicon in lexicons:
                        if feature in lexicon:
                            result[key].add(index)
                            matched = True
                    if feature == self.NO_FEATURE:
                        result[self.NO_FEATURE].add(index)
                # Reviews matching no dictionary used to vanish from the output.
                if not matched:
                    result[self.NO_FEATURE].add(index)
            return result

        def classify_save(self, index_dict, abs_path, ori_file):
            """Split the original reviews by class and write one CSV per class.

            Args:
                index_dict: classification result, key -> set of row positions.
                abs_path: directory that receives the ``<key>.csv`` files.
                ori_file: DataFrame holding the reviews that were classified.

            Returns:
                None. Files are written GBK-encoded and without the index column.
            """
            print(index_dict.keys())
            for key, indices in index_dict.items():
                # DataFrame.ix was removed from pandas; the indices are row
                # positions, so select positionally and in a stable order.
                rows = ori_file.iloc[sorted(indices)]
                rows.to_csv(os.path.join(abs_path, key + '.csv'),
                            encoding='gbk', index=False)

        def all(self, pre_data_path, save_abs_path):
            """Run the whole pipeline for one input file.

            Reads the GBK-encoded CSV at ``pre_data_path`` (it must have a
            ``comment`` column), classifies every comment, writes the per-class
            CSV files into ``save_abs_path`` and prints the share of reviews in
            each class.
            """
            comment_file = pd.read_csv(pre_data_path, sep=',', encoding='GBK')
            # Missing comments are read as NaN; treat them as empty text.
            comments = comment_file.comment.fillna('').astype(str).tolist()
            try:
                text_feature = self.text2feature(comments)
            finally:
                self.release()
            result = self.classify(text_feature)
            self.classify_save(result, save_abs_path, comment_file)
            total = len(comments)
            for key, indices in result.items():
                # An empty input file must not raise ZeroDivisionError.
                ratio = len(indices) / total if total else 0.0
                print(key+'的评论比例:', ratio)
    
    if __name__=='__main__'  :
        # Classify the prediction data by attribute and report the elapsed time.
        start = time.time()
        # Xiaomi prediction data
        path_xiaomi = 'D:/machinelearning data/crawlerData/xiaomi6X_pre_JD100.csv'
        # Huawei prediction data
        path_huawei = 'D:/machinelearning data/crawlerData/huaweiP20_pre_JD100.csv'
        # Output directories. Forward slashes are used because a literal ending
        # in "\'" escapes its closing quote, and "\f" inside a normal string is
        # a form feed. The trailing separator is kept on purpose.
        abs_path_xiaomi = 'D:/machinelearning data/crawlerData/cluster_data/feature_phone_xiaomi/'
        abs_path_huawei = 'D:/machinelearning data/crawlerData/cluster_data/feature_phone_huawei/'
        demo = TextClass()
        # Switch to the Xiaomi data set by enabling the next line instead:
        # demo.all(path_xiaomi, abs_path_xiaomi)
        demo.all(path_huawei, abs_path_huawei)
        print('耗时:', time.time() - start)
    

      

  • 相关阅读:
    intellij IDE 破解 简单 License server 法
    Unsupported major.minor version 52.0错误和 jdbc odbc
    MyEclipse优化攻略搜集
    感兴趣的WebGL ,来自微博的一个全景星空图~
    ie/chorme 清除缓存 刷新js,css
    PLSQL PL/SQL Developer Oracle 使用技巧 常用设置 卡顿问题 病毒防范( 附带:配置文件)
    MyEclipse eclipse console edit packageExplorer 颜色设置、个性化、常用设置
    java Map 四种遍历方法
    Eclipse MyEclipse 反编译.class文件 myeclipse source not found
    打印菱形
  • 原文地址:https://www.cnblogs.com/zz22--/p/9773971.html
Copyright © 2011-2022 走看看