  • Wu Yuxiong - Neural Networks and Deep Learning in Practice with Python + Keras + TensorFlow: Building an advanced natural language processing system with TensorFlow and Keras: how LSTM networks work, and implementing a question-answering (chatbot) system with an LSTM

    !mkdir '/content/gdrive/My Drive/conversation'
    '''
    Split the text into sentences and words, and build the vocabulary.
    '''
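    '''
    build_dataset is called below but is not defined in this excerpt. The following
    is a minimal sketch consistent with how it is used here (an assumption: IDs 0-3
    are reserved for the control tokens GO/PAD/EOS/UNK, so real words start at 4).
    '''
    import collections
    def build_dataset(words, vocabulary_size):
      #reserve the four control tokens at IDs 0-3
      count = [['GO', 0], ['PAD', 0], ['EOS', 0], ['UNK', 0]]
      #then the corpus words, most frequent first
      count.extend(collections.Counter(words).most_common(vocabulary_size))
      dictionary = {word: idx for idx, (word, _) in enumerate(count)}
      #map every word of the corpus to its ID (unknown words map to UNK)
      data = [dictionary.get(word, dictionary['UNK']) for word in words]
      rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
      return data, count, dictionary, rev_dictionary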
    
    path = '/content/gdrive/My Drive/conversation/'
    with open(path + 'question.txt', 'r') as fopen:
      text_question = fopen.read().lower().split('\n')
    with open(path + 'answer.txt', 'r') as fopen:
      text_answer = fopen.read().lower().split('\n')
      
    concat_question = ' '.join(text_question).split()
    vocabulary_size_question = len(list(set(concat_question)))
    data_question, count_question, dictionary_question, rev_dictionary_question = build_dataset(concat_question, 
                                                                                             vocabulary_size_question)
    print('vocab question size: %d' %(vocabulary_size_question))
    #the first 4 IDs are control tokens; meaningful words start at ID 4
    print('Most common words:', count_question[4:10])
    print('Sample data: ', data_question[:10], [rev_dictionary_question[i] for i in data_question[:10]])
    
    concat_answer = ' '.join(text_answer).split()
    vocabulary_size_answer = len(list(set(concat_answer)))
    data_answer, count_answer, dictionary_answer, rev_dictionary_answer = build_dataset(concat_answer, 
                                                                                             vocabulary_size_answer)
    print('vocab answer size: %d' %(vocabulary_size_answer))
    #the first 4 IDs are control tokens; meaningful words start at ID 4
    print('Most common words:', count_answer[4:10])
    print('Sample data: ', data_answer[:10], [rev_dictionary_answer[i] for i in data_answer[:10]])
    
    #sanity check: look up the IDs of a few words in each dictionary
    l = [dictionary_question['how'], dictionary_question['are'], dictionary_question['you']]
    print(l)
    l = [dictionary_answer['i'],dictionary_answer['am'], dictionary_answer['fine']]
    print(l)
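    #The reverse dictionary maps IDs back to words, so the lookup round-trips
    #(a quick sketch; the exact IDs depend on the corpus):
    print(' '.join(rev_dictionary_answer[i] for i in l))  #'i am fine'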

    #IDs of the four control tokens (with the build_dataset above these are 0, 1, 2, 3)
    GO = dictionary_question['GO']
    PAD = dictionary_question['PAD']
    EOS = dictionary_question['EOS']
    UNK = dictionary_question['UNK']
    
    import tensorflow as tf
    '''
    tf.strided_slice(input, begin, end, strides) slices a tensor: begin is the
    first index included in the slice, and end - 1 is the last.
    '''
    data = [1,2,3,4,5,6,7,8]
    #take the elements from index 0 up to index 5, one element at a time (stride 1)
    x = tf.strided_slice(data, [0], [6], [1])
    with tf.Session() as sess:
      print('begin: 0, end: 6, stride: 1: ', sess.run(x))
    #within indices 0 to 5, take every second element (stride 2)
    x = tf.strided_slice(data, [0], [6], [2])
    with tf.Session() as sess:
      print('begin: 0, end: 6, stride: 2: ', sess.run(x))
    #end may be negative, counting from the end: -1 is the last element, -2 the second to last
    x = tf.strided_slice(data, [2], [-2], [1])
    with tf.Session() as sess:
      print('begin: 2, end: -2, stride: 1: ', sess.run(x))

    '''
    When slicing a multi-dimensional tensor with tf.strided_slice, begin, end and
    strides must each have one entry per dimension. To drop the last column of a
    2-D tensor we can run the following:
    '''
    data = [[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9]]
    
    #batch_size here is the number of rows (sentences) in the batch
    batch_size = len(data)
    print('batch_size: ', batch_size)
    x = tf.strided_slice(data, [0,0], [batch_size, -1], [1, 1])
    with tf.Session() as sess:
      #keeps every row, drops the last column: [[1, 2], [4, 5], [7, 8]]
      print('begin: [0, 0], end: [batch_size, -1], stride: [1,1]: ', sess.run(x))

    #prepend two columns filled with the GO token to every row of data
    x = tf.concat([tf.fill([batch_size, 2], GO), data], 1)
    with tf.Session() as sess:
      print('concat and fill', sess.run(x))

    #sequence_mask turns the lengths [1,3,2] into a [3, max_len] matrix of 0/1 flags
    max_value = tf.reduce_max([1,3,2])
    mask = tf.sequence_mask([1,3,2], max_value, dtype=tf.float32)
    with tf.Session() as sess:
      #expected: [[1,0,0],[1,1,1],[1,1,0]]
      print('mask: ', sess.run(mask))

    class sequence2sequence:
      def  __init__(self, size_layer, num_layers, embedded_size, question_dict_size, 
                   answer_dict_size, learning_rate, batch_size):
        def  cells(reuse = False):
          return tf.nn.rnn_cell.LSTMCell(size_layer, initializer=tf.orthogonal_initializer(),
                                        reuse = reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.placeholder(tf.int32, [None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        
        #build the embedding layer shown in Figure 12-22
        with tf.variable_scope('encoder_embeddings'):
          #the embedding layer is a 2-D matrix: one row per word in the question vocabulary, with the number of columns chosen by the program
          encoder_embeddings = tf.Variable(tf.random_uniform([question_dict_size,
                                                             embedded_size], -1, 1))
          #self.X holds the ID of every word in the sentence; the lookup picks the matching rows of the matrix
          encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
          #drop the last column: the final token there is the EOS control symbol, not a meaningful word
          main = tf.strided_slice(self.X, [0,0], [batch_size, -1], [1, 1])
        
        with tf.variable_scope('decoder_embeddings'):
          '''
          Prepend a column filled with the GO token to main. For example, if
          main = [ [1,2,3],
                   [4,5,6],
                   [7,8,9]]
          then after the next line (with GO = 0) decoder_input becomes
          [ [0,1,2,3],
            [0,4,5,6],
            [0,7,8,9]]
          '''
          decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
          decoder_embeddings = tf.Variable(tf.random_uniform([answer_dict_size, embedded_size], -1, 1))
          decoder_embedded = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
          
        with tf.variable_scope('encoder'):
          #stack num_layers LSTM cells (two here) into one multi-layer cell
          rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
          _, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype = tf.float32)
        with tf.variable_scope('decoder'):
          rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
          #use the final state of the encoder LSTM as the initial state of the decoder LSTM
          outputs, _ = tf.nn.dynamic_rnn(rnn_cells_dec, decoder_embedded,
                                        initial_state = last_state,
                                        dtype = tf.float32)
          
        
        '''
          Note: suppose the first sentence in self.X is 'how are you' and the first
          sentence in self.Y is the answer 'i am fine', which maps to the ID vector
          [4, 10, 106]. Every sentence vector has 50 components, so sentences with
          fewer than 50 words are padded with the PAD token, giving
          [4, 10, 106, 1, 1, ..., 1], i.e. the remaining 47 components are 1.

          Calling tf.sequence_mask produces a flag vector [True, True, ..., True]
          (all 50 positions, since pad_sentence_batch reports every length as 50).
          sequence_loss then looks only at the components of the targets vector
          selected by the mask and ignores everything else; since the answer text
          may contain many words, restricting the update to these 50 positions keeps
          the amount of computation manageable.

          Internally, for each position it builds a vector whose length equals
          answer_dict_size, all zeros; since the answer vocabulary here contains 504
          entries, that vector has length 504. For the padded target
          [4, 10, 106, 1, 1, ..., 1] it sets the components at indices 4, 10, 106
          and 1 to 1.

          It then compares these one-hot vectors with the network's output layer,
          backpropagates the difference, and gradient descent adjusts the parameters
          so that the output nodes at indices 4, 10, 106 and 1 move as close to 1 as
          possible. In other words, we are teaching the network to answer
          'i am fine' when it sees the input 'how are you'.
        '''
        with tf.variable_scope('logits'):
          self.logits = tf.layers.dense(outputs, answer_dict_size)
          print(self.logits)
          masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype = tf.float32)
          
        with tf.variable_scope('cost'):
          self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.logits,
                                                       targets = self.Y,
                                                       weights = masks)
        with tf.variable_scope('optimizer'):
          self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
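    '''
    To make the masking described above concrete, here is a tiny self-contained
    sketch (toy numbers, not taken from the model): sequence_loss averages the
    per-position cross-entropy, and positions whose mask weight is 0 contribute
    nothing to the loss.
    '''
    import numpy as np
    import tensorflow as tf
    logits_demo = tf.constant(np.random.randn(1, 3, 5), dtype = tf.float32)  #[batch, time, vocab]
    targets_demo = tf.constant([[4, 1, 2]], dtype = tf.int32)                #[batch, time]
    weights_demo = tf.constant([[1., 1., 0.]])                               #last position masked out
    loss_demo = tf.contrib.seq2seq.sequence_loss(logits = logits_demo,
                                                 targets = targets_demo,
                                                 weights = weights_demo)
    with tf.Session() as sess:
      print('masked sequence loss: ', sess.run(loss_demo))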
    #convert the words of each sentence into IDs
    def  str_idx(corpus, dic):
      X = []
      for i in corpus:
        ints = []
        for k in i.split():
          try:
            ints.append(dic[k])
          except Exception as e:
            print(e)
            #ID 2 is the EOS control token (end of sentence); unknown words fall back to it
            ints.append(2)
            
        X.append(ints)
      return X
    
    '''
    Pad the sentence vectors. When we convert a sentence's words into an ID vector
    we fix the maximum length at 50; sentences with fewer than 50 words are padded
    with the PAD control token. For example 'how are you' converts to 11, 10, 4 and
    after padding becomes 11, 10, 4, 1, 1, ..., 1 with 50 elements in total.
    '''
    def pad_sentence_batch(sentence_batch, pad_int):
      padded_seqs = []
      seq_lens = []
      max_sentence_len = 50
      for sentence in sentence_batch:
        #truncate sentences longer than 50 words, then pad the rest with PAD
        sentence = sentence[:max_sentence_len]
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(max_sentence_len)
      return padded_seqs, seq_lens
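
    '''
    A quick sanity check of the two helpers above (a sketch; it reuses the
    dictionaries built earlier, and the exact IDs depend on the corpus):
    '''
    ids = str_idx(['how are you'], dictionary_question)
    padded, lens = pad_sentence_batch(ids, PAD)
    print(ids)            #e.g. [[11, 10, 4]]
    print(padded[0][:6])  #e.g. [11, 10, 4, 1, 1, 1] -- PAD fills up to 50 elements
    print(lens)           #[50]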
    
    def  check_accuracy(logits, Y):
      '''
      logits is the network's prediction and Y is the ground truth; we score the
      prediction by the fraction of positions where the two agree.
      '''
      acc = 0
      for i in range(logits.shape[0]):
        internal_acc = 0
        for k in range(len(Y[i])):
          if Y[i][k] == logits[i][k]:
            internal_acc += 1
        acc += (internal_acc / len(Y[i]))
        
      return acc / logits.shape[0]
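
    '''
    A toy check of check_accuracy (a sketch with made-up arrays): one row fully
    correct and one row half correct should average to 0.75.
    '''
    import numpy as np
    demo_pred = np.array([[4, 10, 106, 1],
                          [4, 10, 106, 1]])
    demo_truth = [[4, 10, 106, 1],
                  [4, 10, 9, 9]]
    print(check_accuracy(demo_pred, demo_truth))  #0.75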
    import tensorflow as tf
    import os
    #length of each LSTM cell's output vector
    size_layer = 128
    #number of LSTM layers stacked in each MultiRNNCell
    num_layers = 2
    #length of each word's embedding vector
    embedded_size = 128
    learning_rate = 0.001
    batch_size = 32
    epoch = 500
    
    tf.reset_default_graph()
    sess = tf.InteractiveSession()
    #the +4 accounts for the four control tokens GO/PAD/EOS/UNK
    model = sequence2sequence(size_layer, num_layers, embedded_size, vocabulary_size_question + 4,
                              vocabulary_size_answer + 4, learning_rate, batch_size)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables(), max_to_keep = 2)
    checkpoint_dir = os.path.abspath(os.path.join('/content/gdrive/My Drive/', 'checkpoints_chatbot'))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    X = str_idx(text_question, dictionary_question)
    Y = str_idx(text_answer, dictionary_answer)
    
    
    #build the prediction op once, outside the loop, so the graph does not keep growing
    predict_op = tf.argmax(model.logits, 2)
    for i in range(epoch):
      total_loss, total_accuracy = 0, 0
      for k in range(0, (len(text_question) // batch_size) * batch_size, batch_size):
        batch_x, seq_x = pad_sentence_batch(X[k : k + batch_size], PAD)
        batch_y, seq_y = pad_sentence_batch(Y[k : k + batch_size], PAD)
        predicted, loss, _ = sess.run([predict_op, model.cost, model.optimizer],
                                      feed_dict = {
            model.X : batch_x,
            model.Y : batch_y,
            model.X_seq_len : seq_x,
            model.Y_seq_len : seq_y
        })
        total_loss += loss
        total_accuracy += check_accuracy(predicted, batch_y)
       
      total_loss /= (len(text_question) // batch_size)
      total_accuracy /= (len(text_question) // batch_size)
      print("epoch: %d, avg loss : %f, avg accuracy: %f" %(i+1, total_loss, total_accuracy))
      path = saver.save(sess, checkpoint_prefix, global_step = i + 1)
    import numpy as np
    import tensorflow as tf
    
    def predict(sentence, sess):
      X_in = []
      for word in sentence.split():
        try:
          X_in.append(dictionary_question[word])
        except KeyError:
          #words missing from the question vocabulary fall back to PAD
          X_in.append(PAD)

      test, seq_x = pad_sentence_batch([X_in], PAD)
      #the model expects a full batch, so put the single sentence in row 0
      input_batch = np.zeros([batch_size, seq_x[0]], dtype = np.int32)
      input_batch[0] = test[0]

      log = sess.run(predict_op, feed_dict = {
          model.X : input_batch,
          model.X_seq_len : seq_x,
          model.Y_seq_len : seq_x
      })

      result = ' '.join(rev_dictionary_answer[i] for i in log[0])
      return result
        
    
    with tf.Session() as sess:
      path = '/content/gdrive/My Drive/checkpoints_chatbot/model-498'
      saver = tf.train.import_meta_graph(path + '.meta')
      #reload the trained network parameters that were saved to disk
      saver.restore(sess, path)
      print(predict('how are you ?', sess)) 
      print(predict('where do you live?', sess))
      print(predict('what is your name', sess))
