zoukankan      html  css  js  c++  java
  • 基于Word2Vec的影评挖掘

    0. 夜来幽梦忽还乡




        CSDN上有基于影评的二分类问题(消极和积极),学习后自动将用户的评价进行二分类,可自行参考。影评数据集来源:http://www.cs.cornell.edu/people/pabo/movie-review-data/。但这次本猫要描述的问题和这个不一样。我们将对影评数据进行分析,然后对电影的好坏进行简单的评判。情感分析其实很难,挖苦、嘲讽这些深层语义,如果不对上下文进行好好的挖掘,是没有办法知道的。平时你说great,也可能是嘲讽语气对吧,比如“灭霸真TMD great”。

    1. 小轩窗 正梳妆 相顾无言


    # 基于Word2Vec的影评挖掘
    # Author: Allen_ZQH
    # Date: 2018.3.20

    import tensorflow as tf
    import matplotlib.pyplot as plt
    import numpy as np
    import random
    import os
    import pickle
    import string
    import requests
    import collections
    import io
    import tarfile
    import urllib.request
    import text_helpers
    from nltk.corpus import stopwords
    from tensorflow.python.framework import ops


    # Start the computational graph session
    sess = tf.Session()

    # Model parameters. The embedding matrix created further down has shape
    # [vocabulary_size, embedding_size]; max_words is the fixed review length.
    embedding_size = 200
    vocabulary_size = 2000
    batch_size = 100
    max_words = 100

    # Load the English stop-word list from nltk
    stops = stopwords.words('english')

    # Load the data
    print('Loading Data')
    data_folder_name = 'temp'
    texts, target = text_helpers.load_movie_data()

    # Normalize the text with the text_helpers module
    print('Normalizing Text Data')
    texts = text_helpers.normalize_text(texts, stops)

    # Keep only reviews with at least 3 words; labels are filtered in lockstep
    long_enough = [len(x.split()) > 2 for x in texts]
    target = [t for t, keep in zip(target, long_enough) if keep]
    texts = [x for x, keep in zip(texts, long_enough) if keep]

    # Split the data 80% / 20% into a training set and a test set
    train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
    # A set gives O(1) membership tests; `ix in <ndarray>` rescans the whole
    # array for every element and makes the split quadratic.
    train_index_set = set(train_indices.tolist())
    test_indices = np.array(sorted(set(range(len(target))) - train_index_set))
    texts_train = [x for ix, x in enumerate(texts) if ix in train_index_set]
    texts_test = [x for ix, x in enumerate(texts) if ix not in train_index_set]
    target_train = np.array([x for ix, x in enumerate(target) if ix in train_index_set])
    target_test = np.array([x for ix, x in enumerate(target) if ix not in train_index_set])

    # Load the word -> index vocabulary
    dict_file = os.path.join('..', '05_Working_With_CBOW_Embeddings', 'temp', 'movie_vocab.pkl')
    # NOTE: unpickling can execute arbitrary code -- only load a vocabulary file
    # that you produced yourself.
    with open(dict_file, 'rb') as dict_handle:
        word_dictionary = pickle.load(dict_handle)

    # Convert every sentence to a list of word indices. These lists have
    # different lengths, so they are kept as plain lists until after padding
    # (np.array on ragged lists fails on current NumPy releases).
    text_data_train = text_helpers.text_to_numbers(texts_train, word_dictionary)
    text_data_test = text_helpers.text_to_numbers(texts_test, word_dictionary)

    # Reviews differ in length: right-pad with 0 and cut to exactly max_words
    text_data_train = np.array([(list(y) + [0]*max_words)[:max_words] for y in text_data_train])
    text_data_test = np.array([(list(y) + [0]*max_words)[:max_words] for y in text_data_test])

    print('Creating Model')

    # Embedding layer: one embedding_size-dimensional vector per vocabulary word,
    # randomly initialised here
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Logistic-regression variables and input placeholders
    A = tf.Variable(tf.random_normal(shape=[embedding_size,1]))
    b = tf.Variable(tf.random_normal(shape=[1,1]))
    x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
    y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    # Look up the embedding of every word index in a review and average them
    # over the word axis, giving one vector per review
    embed = tf.nn.embedding_lookup(embeddings, x_data)
    embed_avg = tf.reduce_mean(embed, 1)

    # Model output (logits) and loss
    model_output = tf.add(tf.matmul(embed_avg, A), b)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

    # Prediction and accuracy
    prediction = tf.round(tf.sigmoid(model_output))
    predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
    accuracy = tf.reduce_mean(predictions_correct)
    my_opt = tf.train.AdagradOptimizer(0.005)
    train_step = my_opt.minimize(loss)

    # Initialise the variables. The init op must actually be run, otherwise A, b
    # and the optimizer state are uninitialised when training starts. It runs
    # here, before the checkpoint restore that follows, so that restored values
    # are not overwritten by the random initial values.
    init = tf.global_variables_initializer()
    sess.run(init)

    # Replace the randomly initialised word embeddings with the ones saved by the
    # CBOW model.
    # NOTE(review): the original path expression was cut off after the directory
    # name. The 'temp' sub-folder mirrors dict_file above; the checkpoint file
    # name is an assumption -- confirm it against the CBOW training script.
    model_checkpoint_path = os.path.join('..', '05_Working_With_CBOW_Embeddings',
                                         'temp', 'cbow_movie_embeddings.ckpt')
    saver = tf.train.Saver({"embeddings": embeddings})
    saver.restore(sess, model_checkpoint_path)

    # Train the model. Every 100 iterations the train/test loss and accuracy are
    # recorded for plotting; every 500 iterations the model status is printed.
    print('Starting Model Training')
    train_loss = []
    test_loss = []
    train_acc = []
    test_acc = []
    i_data = []
    for i in range(10000):
        rand_index = np.random.choice(text_data_train.shape[0], size=batch_size)
        rand_x = text_data_train[rand_index]
        rand_y = np.transpose([target_train[rand_index]])
        sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
        # Only record loss and accuracy every 100 generations
        if (i+1)%100==0:
            train_feed = {x_data: rand_x, y_target: rand_y}
            test_feed = {x_data: text_data_test, y_target: np.transpose([target_test])}
            # Fetch loss and accuracy together: one forward pass per data set
            train_loss_temp, train_acc_temp = sess.run([loss, accuracy], feed_dict=train_feed)
            test_loss_temp, test_acc_temp = sess.run([loss, accuracy], feed_dict=test_feed)
            # Store the measurements; the plotting code reads these lists
            i_data.append(i+1)
            train_loss.append(train_loss_temp)
            test_loss.append(test_loss_temp)
            train_acc.append(train_acc_temp)
            test_acc.append(test_acc_temp)
        if (i+1)%500==0:
            acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
            acc_and_loss = [np.round(x,2) for x in acc_and_loss]
            print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))

    # Plot the loss over the training generations.
    # Each chart gets its own figure and is shown explicitly; without this both
    # charts are drawn into the same axes and nothing is ever displayed.
    plt.figure()
    plt.plot(i_data, train_loss, 'k-', label='Train Loss')
    plt.plot(i_data, test_loss, 'r--', label='Test Loss', linewidth=4)
    plt.title('Cross Entropy Loss per Generation')
    plt.xlabel('Generation')
    plt.ylabel('Cross Entropy Loss')
    plt.legend(loc='upper right')
    plt.show()

    # Plot the training and test accuracy
    plt.figure()
    plt.plot(i_data, train_acc, 'k-', label='Train Set Accuracy')
    plt.plot(i_data, test_acc, 'r--', label='Test Set Accuracy', linewidth=4)
    plt.title('Train and Test Accuracy')
    plt.xlabel('Generation')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()

    # Text Helper Functions

    import string
    import os
    import urllib.request
    import io
    import tarfile
    import collections
    import numpy as np
    import requests
    import gzip

    # Normalize text
    def normalize_text(texts, stops):
        """Clean a list of text strings for word-level processing.

        Each string is lower-cased, has ASCII punctuation and the digits 0-9
        removed, has every word contained in ``stops`` dropped, and has its
        whitespace collapsed to single spaces.

        Args:
            texts: iterable of strings to clean.
            stops: collection of stop words; they are compared against the
                words as they look after lower-casing and character removal.

        Returns:
            A list with one cleaned string per input string.
        """
        # Build the lookup structures once instead of per character / per word
        drop_chars = str.maketrans('', '', string.punctuation + '0123456789')
        stop_words = set(stops)

        cleaned = []
        for text in texts:
            words = text.lower().translate(drop_chars).split()
            # Re-joining the split words also trims and collapses whitespace
            cleaned.append(' '.join(w for w in words if w not in stop_words))
        return cleaned

    # Build dictionary of words
    def build_dictionary(sentences, vocabulary_size):
        """Build a word -> integer index dictionary from a list of sentences.

        Index 0 is reserved for the placeholder token 'RARE'; the remaining
        ``vocabulary_size - 1`` slots go to the most frequent words, in
        descending order of frequency.

        Args:
            sentences: iterable of strings; words are separated by whitespace.
            vocabulary_size: maximum number of dictionary entries, including
                the 'RARE' entry.

        Returns:
            dict mapping each kept word to its index.
        """
        # Turn sentences (list of strings) into one flat list of words
        words = [word for sentence in sentences for word in sentence.split()]

        # Most frequent words, limited so that 'RARE' plus these words
        # make up at most vocabulary_size entries
        most_frequent = collections.Counter(words).most_common(max(vocabulary_size - 1, 0))

        word_dict = {'RARE': 0}
        for word, _ in most_frequent:
            # The index of a new word is the dictionary length before adding it.
            # setdefault keeps 'RARE' at 0 even if that token occurs in the text.
            word_dict.setdefault(word, len(word_dict))
        return word_dict

    # Turn text data into lists of integers from dictionary
    def text_to_numbers(sentences, word_dict):
        """Convert sentences into lists of integer word indices.

        Args:
            sentences: iterable of strings; words are separated by whitespace.
            word_dict: dict mapping words to integer indices.

        Returns:
            A list with one list of indices per sentence. Words that are not in
            ``word_dict`` get index 0, the rare-word index.
        """
        return [[word_dict.get(word, 0) for word in sentence.split()]
                for sentence in sentences]

    # Generate data randomly (N words behind, target, N words ahead)
    def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
        """Sample a batch of (input, label) pairs from randomly chosen sentences.

        Sentences are drawn at random (with replacement) until ``batch_size``
        pairs have been collected.

        Args:
            sentences: list of sentences, each a list of word indices.
            batch_size: number of pairs to return.
            window_size: number of words taken on each side of the centre word
                ('skip_gram', 'cbow') or before the target word ('doc2vec').
            method: 'skip_gram' -> (centre word, one surrounding word);
                'cbow' -> (surrounding words, centre word), only for windows
                with exactly 2*window_size surrounding words;
                'doc2vec' -> (window_size preceding words + [sentence index],
                next word).

        Returns:
            (batch_data, label_data) as numpy arrays; label_data is a column
            with one row per pair.

        Raises:
            ValueError: if ``method`` is unknown, or if no sentence is long
                enough to produce a pair for the requested method.
        """
        batch_data = []
        label_data = []
        # Sentences that produced no pair; once every sentence is in here the
        # batch can never be filled, so fail instead of looping forever.
        unusable = set()
        while len(batch_data) < batch_size:
            # Select a random sentence to start from
            rand_sentence_ix = int(np.random.choice(len(sentences)))
            rand_sentence = sentences[rand_sentence_ix]
            # Consecutive windows, one centred on every word of the sentence
            window_sequences = [rand_sentence[max(ix - window_size, 0):ix + window_size + 1]
                                for ix in range(len(rand_sentence))]
            # Position of the centre word inside each window (windows at the
            # start of the sentence are cut short on the left)
            label_indices = [min(ix, window_size) for ix in range(len(window_sequences))]

            if method == 'skip_gram':
                center_and_context = [(x[y], x[:y] + x[(y+1):])
                                      for x, y in zip(window_sequences, label_indices)]
                # Flatten into (centre word, surrounding word) tuples
                pairs = [(center, neighbour)
                         for center, context in center_and_context
                         for neighbour in context]
            elif method == 'cbow':
                context_and_center = [(x[:y] + x[(y+1):], x[y])
                                      for x, y in zip(window_sequences, label_indices)]
                # Only keep windows with a consistent 2*window_size context
                pairs = [(context, center) for context, center in context_and_center
                         if len(context) == 2*window_size]
            elif method == 'doc2vec':
                # Keep the left-hand window only, to predict the next word. The
                # sentence index is appended as the last element of each input.
                pairs = [(rand_sentence[i:i+window_size] + [rand_sentence_ix],
                          rand_sentence[i+window_size])
                         for i in range(0, len(rand_sentence) - window_size)]
            else:
                raise ValueError('Method {} not implemented yet.'.format(method))

            if not pairs:
                unusable.add(rand_sentence_ix)
                if len(unusable) == len(sentences):
                    raise ValueError('No sentence is long enough for method {} '
                                     'with window_size {}.'.format(method, window_size))
                continue

            # Extract batch and labels and add them to the running batch
            batch, labels = [list(x) for x in zip(*pairs)]
            batch_data.extend(batch[:batch_size])
            label_data.extend(labels[:batch_size])

        # Trim batch and label at the end
        batch_data = batch_data[:batch_size]
        label_data = label_data[:batch_size]
        # Convert to numpy arrays; labels become a column
        batch_data = np.array(batch_data)
        label_data = np.transpose(np.array([label_data]))
        return(batch_data, label_data)
    # Load the movie review data
    # Check if data was downloaded, otherwise download it and save for future use
    def load_movie_data():
        """Load the rt-polarity movie-review sentences, downloading them if needed.

        The data is expected in ``temp/rt-polaritydata``. If that folder does
        not exist, the archive is downloaded to ``temp_movie_review_temp.tar.gz``
        in the working directory and extracted into ``temp``.

        Returns:
            (texts, target): ``texts`` is the list of review lines with trailing
            whitespace removed, positive reviews first; ``target`` is the
            matching list of labels, 1 for positive and 0 for negative.

        Raises:
            requests.HTTPError: if the download is answered with an error status.
        """
        save_folder_name = 'temp'
        data_folder = os.path.join(save_folder_name, 'rt-polaritydata')
        pos_file = os.path.join(data_folder, 'rt-polarity.pos')
        neg_file = os.path.join(data_folder, 'rt-polarity.neg')

        # Check if files are already downloaded
        if not os.path.exists(data_folder):
            movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
            archive_name = 'temp_movie_review_temp.tar.gz'

            # Save tar.gz file
            req = requests.get(movie_data_url, stream=True)
            # Do not store an error page as if it were the archive
            req.raise_for_status()
            with open(archive_name, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

            # Extract tar.gz file into temp folder. The archive comes from the
            # network, so use the safe 'data' extraction filter where this
            # Python version provides it.
            extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
            with tarfile.open(archive_name, "r:gz") as tar:
                tar.extractall(path=save_folder_name, **extract_kwargs)

        with open(pos_file, 'r', encoding='latin-1') as f:
            pos_data = [line.rstrip() for line in f]

        with open(neg_file, 'r', encoding='latin-1') as f:
            neg_data = [line.rstrip() for line in f]

        texts = pos_data + neg_data
        target = [1]*len(pos_data) + [0]*len(neg_data)
        return(texts, target)



    2. 相顾无言 惟有泪千行


  • 相关阅读:
    lsattr, chattr
  • 原文地址:https://www.cnblogs.com/catallen/p/9111168.html
Copyright © 2011-2022 走看看