zoukankan      html  css  js  c++  java
  • tensorflow在文本处理中的使用——Word2Vec预测

    代码来源于:tensorflow机器学习实战指南(曾益强 译,2017年9月)——第七章:自然语言处理

    代码地址:https://github.com/nfmcclure/tensorflow-cookbook

    数据:http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz

    问题:加载和使用预训练的嵌套,并使用这些单词嵌套进行情感分析,通过训练线性逻辑回归模型来预测电影的好坏

    步骤如下:

    • 必要包
    • 声明模型参数
    • 读取并转换文本数据集,划分训练集和测试集
    • 构建图
    • 训练

    step1:必要包

    import tensorflow as tf
    import matplotlib.pyplot as plt
    import numpy as np
    import random
    import os
    import pickle
    import string
    import requests
    import collections
    import io
    import tarfile
    import urllib.request
    import text_helpers
    from nltk.corpus import stopwords
    from tensorflow.python.framework import ops
    # Discard any previously built default graph before defining this model
    ops.reset_default_graph()

    # Work from the directory that contains this script; the data folder
    # used later is given as a relative path
    script_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_dir)

    # Open the TensorFlow session used by the rest of the script
    sess = tf.Session()

    step2:声明模型参数

    # Model parameters
    embedding_size = 200     # width of each word-embedding vector
    vocabulary_size = 2000   # number of rows in the embedding matrix
    batch_size = 100         # reviews sampled per training step
    max_words = 100          # every review is padded/cropped to this many indices

    # English stop words from NLTK, handed to the text normalizer
    # (the original line was missing its closing parenthesis)
    stops = stopwords.words('english')

    step3:读取并转换文本数据集,划分训练集和测试集

    参考:tensorflow在文本处理中的使用——辅助函数

    # Read the movie reviews and their labels from the data folder
    print('Loading Data')
    data_folder_name = 'temp'
    texts, target = text_helpers.load_movie_data(data_folder_name)

    # Clean the raw review text (the stop-word list is passed to the helper)
    print('Normalizing Text Data')
    texts = text_helpers.normalize_text(texts, stops)

    # Keep only reviews with at least 3 words, together with their labels.
    # The keep/drop decision is computed once and applied to both lists so
    # they stay aligned.
    long_enough = [len(review.split()) > 2 for review in texts]
    target = [target[ix] for ix, keep in enumerate(long_enough) if keep]
    texts = [review for review, keep in zip(texts, long_enough) if keep]
    
    # Split the data set into train (80%) and test (remaining) subsets.
    # Training indices are sampled without replacement.
    train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
    # Membership is tested against Python sets: `ix in ndarray` rescans the
    # whole array for every element, which made the split quadratic.
    train_index_set = set(train_indices.tolist())
    test_index_set = set(range(len(target))) - train_index_set
    test_indices = np.array(list(test_index_set))
    # enumerate() keeps the original review order within each subset
    texts_train = [x for ix, x in enumerate(texts) if ix in train_index_set]
    texts_test = [x for ix, x in enumerate(texts) if ix in test_index_set]
    target_train = np.array([x for ix, x in enumerate(target) if ix in train_index_set])
    target_test = np.array([x for ix, x in enumerate(target) if ix in test_index_set])
    
    # Load the word dictionary that was saved alongside the CBOW embeddings
    dict_file = os.path.join(data_folder_name, 'movie_vocab.pkl')
    # NOTE(review): unpickling runs arbitrary code; only load a trusted file.
    # The context manager closes the file handle the original left open.
    with open(dict_file, 'rb') as dict_handle:
        word_dictionary = pickle.load(dict_handle)

    # Convert each review to a sequence of word indices via the dictionary.
    # The result is kept as returned (not wrapped in np.array): reviews have
    # different lengths, and building an array from ragged sequences is an
    # error in recent NumPy releases.
    # presumably text_to_numbers returns one list of ints per review - verify
    text_data_train = text_helpers.text_to_numbers(texts_train, word_dictionary)
    text_data_test = text_helpers.text_to_numbers(texts_test, word_dictionary)

    # Reviews differ in length: append zeros, then keep the first max_words
    # entries, so every row ends up exactly max_words long
    text_data_train = np.array([(list(y) + [0]*max_words)[:max_words] for y in text_data_train])
    text_data_test = np.array([(list(y) + [0]*max_words)[:max_words] for y in text_data_test])

    step4:构建图

    print('Creating Model')
    # Embedding matrix variable; randomly initialized here, it is the variable
    # into which the trained CBOW embeddings are restored later
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Logistic regression variables: weights A and bias b
    A = tf.Variable(tf.random_normal(shape=[embedding_size,1]))
    b = tf.Variable(tf.random_normal(shape=[1,1]))

    # Placeholders: a batch of padded word-index rows and their 0/1 targets
    x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
    y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    # Look up the embedding vector for every word index
    embed = tf.nn.embedding_lookup(embeddings, x_data)
    # Average the word embeddings of each review (mean over axis 1)
    embed_avg = tf.reduce_mean(embed, 1)

    # Linear model output (logits); the sigmoid is applied inside the loss
    model_output = tf.add(tf.matmul(embed_avg, A), b)

    # Cross-entropy loss. TensorFlow 1.x requires named arguments here;
    # the positional form raises an error.
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

    # Predicted class (sigmoid rounded to 0/1) and accuracy against the targets
    prediction = tf.round(tf.sigmoid(model_output))
    predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
    accuracy = tf.reduce_mean(predictions_correct)

    # Adagrad optimizer minimizing the loss
    my_opt = tf.train.AdagradOptimizer(0.005)
    train_step = my_opt.minimize(loss)

    step5:训练

    # Initialize variables (tf.initialize_all_variables is deprecated in
    # favour of tf.global_variables_initializer)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Load the embedding matrix trained by the CBOW model. The restore runs
    # after initialization, so the restored values replace the random ones.
    model_checkpoint_path = os.path.join(data_folder_name,'cbow_movie_embeddings.ckpt')
    # Only the variable stored under the checkpoint name "embeddings" is restored
    saver = tf.train.Saver({"embeddings": embeddings})
    saver.restore(sess, model_checkpoint_path)
    
    
    # Train the logistic regression on random mini-batches
    print('Starting Model Training')
    train_loss = []
    test_loss = []
    train_acc = []
    test_acc = []
    i_data = []
    # The test feed never changes, so build it once outside the loop
    test_feed = {x_data: text_data_test, y_target: np.transpose([target_test])}
    for i in range(10000):
        # Sample a batch of training rows (indices may repeat within a batch)
        rand_index = np.random.choice(text_data_train.shape[0], size=batch_size)
        rand_x = text_data_train[rand_index]
        # Turn the label vector into a column to match the y_target placeholder
        rand_y = np.transpose([target_train[rand_index]])
        train_feed = {x_data: rand_x, y_target: rand_y}
        sess.run(train_step, feed_dict=train_feed)

        # Only record loss and accuracy every 100 generations
        if (i+1)%100==0:
            i_data.append(i+1)
            # Fetch loss and accuracy in one run per data set instead of two
            train_loss_temp, train_acc_temp = sess.run([loss, accuracy], feed_dict=train_feed)
            test_loss_temp, test_acc_temp = sess.run([loss, accuracy], feed_dict=test_feed)
            train_loss.append(train_loss_temp)
            test_loss.append(test_loss_temp)
            train_acc.append(train_acc_temp)
            test_acc.append(test_acc_temp)

            # Report every 500 generations. Nested here because it prints the
            # values just computed (500 is a multiple of 100).
            if (i+1)%500==0:
                acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
                acc_and_loss = [np.round(x,2) for x in acc_and_loss]
                print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))

    可视化结果展示:

    # Draw two figures in turn: loss curves first, then accuracy curves.
    # Each spec: (title, y-axis label, legend location,
    #             train series, train label, test series, test label)
    figure_specs = [
        ('Cross Entropy Loss per Generation', 'Cross Entropy Loss', 'upper right',
         train_loss, 'Train Loss', test_loss, 'Test Loss'),
        ('Train and Test Accuracy', 'Accuracy', 'lower right',
         train_acc, 'Train Set Accuracy', test_acc, 'Test Set Accuracy'),
    ]
    for (fig_title, y_label, legend_loc,
         train_series, train_label, test_series, test_label) in figure_specs:
        # Train series as a solid black line, test series as a thick dashed red line
        plt.plot(i_data, train_series, 'k-', label=train_label)
        plt.plot(i_data, test_series, 'r--', label=test_label, linewidth=4)
        plt.title(fig_title)
        plt.xlabel('Generation')
        plt.ylabel(y_label)
        plt.legend(loc=legend_loc)
        plt.show()

  • 相关阅读:
    Js时间处理
    Css 显示删除条目效果
    vue权限控制菜单显示的简单实现
    js取对象的属性值循环
    标题的内容响应式设计
    将一个字符串当做一个方法名或对象的key
    JSON序列化和解析
    import 与export详解
    使用flask-mail扩展发送邮件
    正则表达式练习-计算器实现
  • 原文地址:https://www.cnblogs.com/helloworld0604/p/9009871.html
Copyright © 2011-2022 走看看