  • Sentiment analysis

    import numpy as np
    import pandas as pd
    import jieba
    import logging
    import re
    import ast
    from gensim.models.word2vec import Word2Vec
    from gensim.corpora.dictionary import Dictionary
    from sklearn.model_selection import train_test_split
    from keras import Sequential
    from keras.preprocessing.sequence import pad_sequences
    from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout, Activation
    from keras.utils import np_utils



    def read_data(data_path):
        # Each line is expected to look like "<label><sep><text>",
        # where the first character is the label "0" (negative) or "1" (positive).
        senlist = []
        labellist = []
        with open(data_path, "r", encoding='utf-8', errors='ignore') as f:
            for data in f.readlines():
                data = data.strip()
                if len(data) < 3:   # skip blank or malformed lines
                    continue
                label = data[0]
                sen = data[2:]
                if sen != "" and (label == "0" or label == "1"):
                    senlist.append(sen)
                    labellist.append(label)
        assert len(senlist) == len(labellist)
        return senlist, labellist
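
    For reference, read_data assumes each line of weibo.csv starts with the label, one separator character, and then the text. Two hypothetical sample lines (the real file layout should be checked against the actual data):

        1,这家店的服务很周到,下次还会再来
        0,物流太慢了,体验很差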

    # Train a word2vec model on the sentences and save it to save_path.
    def train_word2vec(sentences, save_path):
        sentences_seg = []
        sen_str = "\n".join(sentences)       # join all sentences with "\n" as the separator
        res = jieba.lcut(sen_str)            # segment the whole corpus in one pass
        seg_str = " ".join(res)
        sen_list = seg_str.split("\n")       # split back into one segmented string per sentence
        for i in sen_list:
            sentences_seg.append(i.split())  # word list per sentence
        print("training word vectors...")
    #     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        model = Word2Vec(sentences_seg,
                    size=100,      # embedding dimension (vector_size= in gensim 4.x)
                    min_count=5,   # ignore words occurring fewer than 5 times
                    window=5)      # context window size
        model.save(save_path)
        return model
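
    A quick sanity check on the trained vectors, assuming the gensim 3.x API used above (the probe word is only an example and must occur at least min_count times in the corpus):

        model = Word2Vec.load('word2vec.model')
        print(model.wv['开心'].shape)                 # (100,), matching size=100
        print(model.wv.most_similar('开心', topn=5))  # nearest neighbours by cosine similarity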

    def generate_id2wec(model):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2id = {v: k + 1 for k, v in gensim_dict.items()}       # word -> index, numbered from 1
        w2vec = {word: model.wv[word] for word in w2id.keys()}  # word -> vector
        n_vocabs = len(w2id) + 1
        embedding_weights = np.zeros((n_vocabs, 100))
        for w, index in w2id.items():  # fill the matrix from index 1; row 0 stays zero
            embedding_weights[index, :] = w2vec[w]
        return w2id, embedding_weights
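
    The invariant this sets up: row w2id[w] of embedding_weights holds the word2vec vector of w, and row 0 stays all zeros so it can stand in for padding and unknown words. A minimal check (sketch):

        some_word = next(iter(w2id))
        assert np.allclose(embedding_weights[w2id[some_word]], model.wv[some_word])
        assert not embedding_weights[0].any()   # row 0 reserved for padding/OOV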

    def text_to_array(w2index, senlist):  # convert each sentence into a sequence of word indices
        sentences_array = []
        for sen in senlist:
            # segment first so lookups match the word-level vocabulary
            # (iterating the raw string would look up single characters);
            # out-of-vocabulary words map to index 0
            new_sen = [w2index.get(word, 0) for word in jieba.lcut(sen)]
            sentences_array.append(new_sen)
        return sentences_array  # ragged list; pad_sequences pads it to a fixed length

    def prepare_data(w2id, sentences, labels, max_len=200):
        X_train, X_val, y_train, y_val = train_test_split(sentences, labels, test_size=0.2)
        X_train = text_to_array(w2id, X_train)
        X_val = text_to_array(w2id, X_val)
        X_train = pad_sequences(X_train, maxlen=max_len)   # pad/truncate to max_len
        X_val = pad_sequences(X_val, maxlen=max_len)
        return np.array(X_train), np_utils.to_categorical(y_train), np.array(X_val), np_utils.to_categorical(y_val)
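
    Note that pad_sequences left-pads and left-truncates by default, which is why index 0 (the all-zero embedding row) must stay reserved. For example:

        # padding='pre' and truncating='pre' are the Keras defaults
        print(pad_sequences([[3, 7, 11]], maxlen=5))   # [[ 0  0  3  7 11]]
        print(pad_sequences([[3, 7, 11]], maxlen=2))   # [[ 7 11]]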


    # data = pd.read_csv("weibo.csv")   # read the csv file
    # sentences = data.review.values    # the weibo sentences, as a list
    # label = data.label.values         # the sentiment labels (used for training)
    # sentences = []
    # for text in sents:
    #      print(text)
    #      pre = re.compile('@(.*?):')
    #      s1 = ''.join(pre.findall(text))   # join the matched substrings
    #      sentences.append(s1)
    # print(sentences)

    #  xxr begin
    sentences,labels = read_data("weibo.csv")
    # print(sentences)
    # print(labels[-100:])
    # model =  train_word2vec(sentences,'word2vec.model')   
    # model.save("model")
    #xxr end 

    model = Word2Vec.load("model")
    w2id,embedding_weights = generate_id2wec(model)
    # # print the mapping
    # for key in w2id:
    #        print(str(key)+':'+str(w2id[key]))
    # print("-------------------------------------------------------------------")
    # print(embedding_weights)

    # save the generated embedding matrix to "embedding_weights.npy"
    # np.save("embedding_weights.npy", embedding_weights)
    embedding_weights = np.load("embedding_weights.npy")

    # print(embedding_weights)

    # # persist the w2id dict to w2id.txt
    # f = open("w2id.txt", 'w', encoding='utf-8')
    # f.write(str(w2id))
    # f.close()

    ## load the dict back (ast.literal_eval is a safer parse than eval for a str()-dumped dict)
    with open("w2id.txt", 'r', encoding='utf-8') as f:
        w2id = ast.literal_eval(f.read())
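
    As a sketch of a more robust round-trip than str()/ast.literal_eval, the vocabulary could be stored as JSON instead (w2id.json is a hypothetical file name; keys are plain strings, values ints):

        import json
        with open("w2id.json", 'w', encoding='utf-8') as f:
            json.dump(w2id, f, ensure_ascii=False)
        with open("w2id.json", 'r', encoding='utf-8') as f:
            w2id = json.load(f)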

    # for key in w2id:
    #        print(str(key)+':'+str(w2id[key]))

    x_train, y_train, x_val, y_val = prepare_data(w2id, sentences, labels, 200)


    class Sentiment:
        def __init__(self,w2id,embedding_weights,Embedding_dim,maxlen,labels_category):
            self.Embedding_dim = Embedding_dim
            self.embedding_weights = embedding_weights
            self.vocab = w2id
            self.labels_category = labels_category
            self.maxlen = maxlen
            self.model = self.build_model()
          
            
        def build_model(self):
            model = Sequential()
            # input shape per sample: (maxlen,) -> embedded to (maxlen, Embedding_dim)
            model.add(Embedding(output_dim=self.Embedding_dim,
                               input_dim=len(self.vocab)+1,
                               weights=[self.embedding_weights],
                               input_length=self.maxlen))
            model.add(Bidirectional(LSTM(50), merge_mode='concat'))
            model.add(Dropout(0.5))
            model.add(Dense(self.labels_category))
            model.add(Activation('softmax'))
            model.compile(loss='categorical_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])
            model.summary()
            return model
        
        def train(self, X_train, y_train, X_test, y_test, n_epoch=5):
            self.model.fit(X_train, y_train, batch_size=32, epochs=n_epoch,
                          validation_data=(X_test, y_test))
            self.model.save('sentiment.h5')
            
        def predict(self, model_path, new_sen):
            model = self.model
            model.load_weights(model_path)   # note: reloads the weights on every call
            new_sen_list = jieba.lcut(new_sen)
            sen2id = [self.vocab.get(word, 0) for word in new_sen_list]
            sen_input = pad_sequences([sen2id], maxlen=self.maxlen)
            res = model.predict(sen_input)[0]
            return np.argmax(res)
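
    One design choice worth noting: build_model leaves the pretrained embeddings trainable, so they get fine-tuned along with the BiLSTM during fit. To keep them frozen instead (a common option when the labelled set is small), the Keras Embedding layer accepts trainable=False; a sketch of the changed line:

        model.add(Embedding(output_dim=self.Embedding_dim,
                            input_dim=len(self.vocab) + 1,
                            weights=[self.embedding_weights],
                            input_length=self.maxlen,
                            trainable=False))   # freeze the word2vec weights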

    ## training
    senti = Sentiment(w2id, embedding_weights, 100, 200, 2)
    # senti.train(x_train, y_train, x_val, y_val, 1)
    # prediction
    label_dic = {0: "negative", 1: "positive"}
    sen_new = "现如今的公司能够做成这样已经很不错了,微订点单网站的信息更新很及时,内容来源很真实"
    pre = senti.predict("./sentiment.h5", sen_new)
    print("The sentiment of '{}' is: {}".format(sen_new, label_dic.get(pre)))

    Modified: the script above stays the same except for the prediction section, which now batch-classifies every line of source.csv and tallies the positive/negative counts:

    # prediction
    sumP = 0
    sumN = 0
    label_dic = {0: "negative", 1: "positive"}
    with open("source.csv", "r", encoding='utf-8', errors='ignore') as f:
        for data in f.readlines():
            data = data.strip()
            if not data:
                continue
            pre = senti.predict("./sentiment.h5", data)
            if pre == 0:
                sumN += 1
            else:
                sumP += 1
            print("The sentiment of '{}' is: {}".format(data, label_dic.get(pre)))
    print("positive: {}  negative: {}".format(sumP, sumN))   # final tally
    # sen_new = "现如今的公司能够做成这样已经很不错了,微订点单网站的信息更新很及时,内容来源很真实"
    # pre = senti.predict("./sentiment.h5", sen_new)
    # print("The sentiment of '{}' is: {}".format(sen_new, label_dic.get(pre)))
    
  • Original post: https://www.cnblogs.com/kekexxr/p/11938888.html