  • NLP: Sentiment Analysis of Movie Review Data

    1. Logistic regression sentiment classification with bag-of-words features

    Each cleaned review is encoded as a vector of word counts over the 5,000 most frequent terms, and a logistic regression classifier is trained on these sparse count features.

    # coding: utf-8
    import re
    import numpy as np
    import pandas as pd
    from bs4 import BeautifulSoup
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import confusion_matrix
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    import matplotlib.pyplot as plt
    import itertools
    
    ########################### Bag-of-words features ###########################
    def clean_text(text):
        """
        Strip HTML tags, remove punctuation, tokenize, drop stopwords,
        and rejoin the remaining words into one cleaned string.
        :param text: raw review text
        :return: cleaned review string
        """
        text = BeautifulSoup(text, 'html.parser').get_text()
        text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep letters only
        words = text.lower().split()
        # Load the stopword list (one word per line)
        with open('../stopwords/stopwords_english.txt') as f:
            eng_stopwords = set(line.rstrip() for line in f)
        words = [w for w in words if w not in eng_stopwords]
        return ' '.join(words)
    
    # Plot the confusion matrix
    def plot_confusion_matrix(cm, classes,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=0)
        plt.yticks(tick_marks, classes)
    
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
    
    if __name__ == '__main__':
        # Load the labeled training data (tab-separated)
        df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
        print(df.head(5))
        # Clean every review in the DataFrame
        df['clean_review'] = df.review.apply(clean_text)
        # Extract bag-of-words features with sklearn's CountVectorizer
        vectorizer = CountVectorizer(max_features=5000)
        train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
    
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment,
                                                            test_size=0.2, random_state=0)
        # Train the classifier
        LR_model = LogisticRegression()
        LR_model = LR_model.fit(X_train, y_train)
        y_pred = LR_model.predict(X_test)
        cnf_matrix = confusion_matrix(y_test, y_pred)
    
        print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    
        print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                    cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))
    
        # Plot non-normalized confusion matrix
        class_names = [0, 1]
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
        plt.show()
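
    As a quick sanity check, the fitted vectorizer and classifier can score unseen text. The sketch below is not part of the original script: it assumes the code above has already run, reuses vectorizer and LR_model, and the sample review is made up.

    # Minimal inference sketch (assumes the training script above has run;
    # the sample review is invented for illustration)
    new_reviews = ["This movie was a delight from start to finish"]
    cleaned = [clean_text(r) for r in new_reviews]       # same preprocessing as training
    features = vectorizer.transform(cleaned).toarray()   # reuse the fitted vocabulary
    print(LR_model.predict(features))                    # 1 = positive, 0 = negative
    print(LR_model.predict_proba(features))              # class probabilities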

    2. Logistic regression sentiment classification with word2vec embeddings

    Each review is now represented by averaging the 300-dimensional word2vec vectors of its in-vocabulary words, and the same logistic regression classifier is trained on these dense features.

    import re
    import numpy as np
    import pandas as pd
    from bs4 import BeautifulSoup
    from sklearn.metrics import confusion_matrix
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    import nltk
    import warnings
    from gensim.models.word2vec import Word2Vec
    from nltk.corpus import stopwords
    import matplotlib.pyplot as plt
    import itertools
    warnings.filterwarnings("ignore")
    
    def clean_text(text, remove_stopwords=False):
        """Strip HTML, keep letters only, lowercase, and optionally drop stopwords; return a word list."""
        text = BeautifulSoup(text, 'html.parser').get_text()
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        words = text.lower().split()
        if remove_stopwords:
            eng_stopwords = set(stopwords.words('english'))
            words = [w for w in words if w not in eng_stopwords]
        return words
    
    def split_sentences(review):
        """Split a raw review into sentences; each sentence becomes a cleaned word list."""
        # `tokenizer` is the punkt sentence tokenizer loaded in __main__
        raw_sentences = tokenizer.tokenize(str(review).strip())
        sentences = [clean_text(s) for s in raw_sentences if s]
        return sentences
    
    def to_review_vector(review):
        """Average the word2vec vectors of a review's words into one 300-d feature vector."""
        words = clean_text(review, remove_stopwords=True)
        word_vec = np.zeros(300)   # dimensionality must match num_features below
        n_words = 0
        for word in words:
            if word in model.wv:   # words below min_count are not in the vocabulary
                word_vec += model.wv[word]
                n_words += 1
        if n_words > 0:
            word_vec /= n_words    # mean over in-vocabulary words
        return pd.Series(word_vec)
    
    def plot_confusion_matrix(cm, classes,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=0)
        plt.yticks(tick_marks, classes)
    
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
    
    if __name__ == '__main__':
        # Load the labeled training data (tab-separated)
        df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
        # Punkt sentence tokenizer from NLTK (used by split_sentences)
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Split every review into sentences and clean each one into a word list;
        # Word2Vec trains directly on this list of token lists
        sentences_list = sum(df.review.apply(split_sentences), [])
    
        # word2vec hyperparameters
        num_features = 300    # word vector dimensionality
        min_word_count = 40   # ignore words occurring fewer times than this
        num_workers = 4       # number of parallel worker threads
        context = 10          # context window size
        model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
        # gensim >= 4.0 names this parameter vector_size (older releases used size)
        model = Word2Vec(sentences_list, workers=num_workers, vector_size=num_features,
                         min_count=min_word_count, window=context)
        model.save(model_name)
    
        train_data_features = df.review.apply(to_review_vector)
    
        X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size=0.2, random_state=0)
    
        LR_model = LogisticRegression()
        LR_model = LR_model.fit(X_train, y_train)
        y_pred = LR_model.predict(X_test)
        cnf_matrix = confusion_matrix(y_test, y_pred)
    
        print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
        print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (
                    cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))
    
        # Plot non-normalized confusion matrix
        class_names = [0, 1]
        plt.figure()
        plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
        plt.show()
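
    The same check works for the embedding pipeline: embed a new review with to_review_vector and feed it to the classifier. Again, this sketch is not in the original script; it assumes model, to_review_vector, and LR_model from above are available, and the sample review is invented.

    # Minimal inference sketch for the word2vec pipeline (assumes the training
    # script above has run; the sample review is invented for illustration)
    new_review = "A dull, predictable plot with wooden acting."
    features = pd.DataFrame([to_review_vector(new_review)])  # one 300-d feature row
    print(LR_model.predict(features))                        # 0 = negative, 1 = positive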
  • Original article: https://www.cnblogs.com/ywjfx/p/11119175.html