  • Wu Yuxiong -- Neural Networks and Deep Learning in Practice with Python + Keras + TensorFlow: Building an Advanced Natural Language Processing System with TensorFlow and Keras -- RNN Principles and Implementation

    !pip install utils

    !pip install scikit-learn

    # create the class folders; the review files must be placed under good/ and bad/ before loading
    !mkdir '/content/gdrive/My Drive/movie review'
    !mkdir '/content/gdrive/My Drive/movie review/good/'
    !mkdir '/content/gdrive/My Drive/movie review/bad/'
    from utils import *  # helper module installed above; nothing below depends on it directly
    import tensorflow as tf
    import sklearn
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    import time
    import re
    import numpy as np
    
    def clearstring(string):
      # keep only letters and digits
      string = re.sub('[^A-Za-z0-9]+', ' ', string)
      # split the sentence into a list of words
      string = string.split(' ')
      string = filter(None, string)
      # strip leading/trailing whitespace from each word
      string = [y.strip() for y in string] 
      string = ' '.join(string)
    
      return string.lower()
    
    def separate_dataset(trainset, ratio=0.5):
      '''
      Split each text file into individual sentences and attach a label to each one.
      '''
      text = []
      label = []
      for i in range(int(len(trainset.data) * ratio)):
        # split the file's text into sentences, one per line
        data_ = trainset.data[i].split('\n')
        # drop empty lines
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
          # normalize each sentence, dropping ill-formed tokens
          data_[n] = clearstring(data_[n])
        text += data_
        for n in range(len(data_)):
          # attach the label; the directory has only two subfolders, so there are only two labels
          label.append(trainset.target[i])
      return text, label
    s = ' this is 98 !@# *q'
    s = clearstring(s)
    print(s)  # this is 98 q
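
    To see what separate_dataset produces, here is a minimal sketch on a tiny hand-made corpus; the SimpleNamespace below is a stand-in for the Bunch object that load_files returns:

    from types import SimpleNamespace
    fake = SimpleNamespace(data=['a good movie!\nloved it.', 'terrible plot...'],
                           target=[1, 0])
    t, l = separate_dataset(fake, 1.0)
    print(t)  # ['a good movie', 'loved it', 'terrible plot']
    print(l)  # [1, 1, 0]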

    path = '/content/gdrive/My Drive/movie review'
    trainset = sklearn.datasets.load_files(container_path = path, encoding = 'UTF-8')
    
    '''
    Pull the individual sentences out of the text files to form the data and label
    lists; since the directory holds only two subfolders, there are exactly two labels.
    '''
    trainset.data, trainset.target = separate_dataset(trainset, 1.0)
    print(trainset.target_names)
    print('training data has {0} items'.format(len(trainset.data)))

    # attach a two-component one-hot label to every record
    onehot = np.zeros((len(trainset.data), len(trainset.target_names)))
    # sentences under the bad folder get label [1,0]; sentences under good get [0,1]
    onehot[np.arange(len(trainset.data)), trainset.target] = 1.0
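
    The indexing line above fills the whole one-hot matrix in one step: row i gets a 1 in column trainset.target[i]. A minimal illustration with made-up targets:

    t = np.array([0, 1, 1, 0])
    oh = np.zeros((len(t), 2))
    oh[np.arange(len(t)), t] = 1.0
    print(oh)  # [[1. 0.] [0. 1.] [0. 1.] [1. 0.]]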
    '''
    Split trainset.data, trainset.target, and onehot 80/20:
    80% for training, 20% held out for testing.
    '''
    train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, trainset.target, onehot, test_size = 0.2)
    
    concat = ' '.join(trainset.data).split()
    vocabulary_size = len(list(set(concat)))
    print(vocabulary_size)
    print(train_onehot[0])

    import collections
    
    def build_dataset(words, n_words):
      '''
      Build a word -> id dictionary from the corpus.
      '''
      count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
      # count word frequencies; keep only the n_words most common ones
      count.extend(collections.Counter(words).most_common(n_words - 1))
      dictionary = dict()
      for word, _ in count:
        # assign each word an id; real words start at 4, after the special tokens
        dictionary[word] = len(dictionary)
        
      data = list()
      unk_count = 0
      for word in words:
        # words that did not make the cut map to the UNK token (id 3)
        index = dictionary.get(word, 3)
        if index == 3:
          unk_count += 1
        data.append(index)
      
      count[3][1] = unk_count
      # invert the mapping so ids can be turned back into words
      reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
      return data, count, dictionary, reversed_dictionary
          
    data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
    
    print('Most common words', count[4:10])
    print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
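
    A quick round-trip through the two mappings (the word 'film' is assumed to occur in the corpus; any corpus word works):

    idx = dictionary.get('film', dictionary['UNK'])
    print('film ->', idx, '->', rev_dictionary[idx])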
    '''
    The network stacks two recurrent layers of 128 units each.
    '''
    size_layer = 128
    num_layers = 2
    embedded_size = 128
    dimension_output = len(trainset.target_names)
    learning_rate = 1e-3
    maxlen = 50
    batch_size = 128
    
    class RNN:
      def __init__(self, size_layer, num_layers, embedded_size, dict_size,
                  dimension_output, learning_rate):
        def cells(reuse=False):
          '''
          TensorFlow ships a prebuilt RNN cell; like the node described earlier, it
          records information about the current input internally and passes that
          state on to the next step.
          '''
          return tf.nn.rnn_cell.BasicRNNCell(size_layer, reuse=reuse)
        # placeholder for the input word ids
        self.X = tf.placeholder(tf.int32, [None, None])
        # placeholder for the one-hot labels
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        '''
        The embedding layer, just like the first layer of the word-vector network
        discussed earlier: a 2-D matrix whose rows are word vectors. self.X acts as
        a one-hot vector that selects rows of the matrix.
        '''
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        # stack two RNN cells to strengthen the model
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        # unroll the cells over the input sequence, forming the RNN layer
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype=tf.float32)
        # project the 128-component RNN output down to a 2-component vector
        W = tf.get_variable('w', shape=(size_layer, dimension_output),
                           initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        # of the two components, the larger one decides the predicted class
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                    labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        
        
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
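
    Before training on real data, a quick shape check confirms that outputs[:, -1] keeps only the last time step and that the logits come out as (batch, 2); a throwaway sketch with made-up word ids, assuming TensorFlow 1.x:

    tf.reset_default_graph()
    demo = RNN(size_layer, num_layers, embedded_size, vocabulary_size + 4,
               dimension_output, learning_rate)
    with tf.Session() as s:
      s.run(tf.global_variables_initializer())
      fake_ids = np.random.randint(4, 100, size=(3, 5))  # batch of 3 sentences, 5 ids each
      print(s.run(demo.logits, feed_dict={demo.X: fake_ids}).shape)  # (3, 2)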
    
    The second model is identical except that the basic RNN cell is swapped for an LSTM cell, whose gated internal state carries information across long sequences far better:

    class RNN:
      def __init__(self, size_layer, num_layers, embedded_size, dict_size,
                  dimension_output, learning_rate):
        def cells(reuse=False):
          # an LSTM cell in place of the BasicRNNCell used above
          return tf.nn.rnn_cell.LSTMCell(size_layer, reuse=reuse)
        # placeholder for the input word ids
        self.X = tf.placeholder(tf.int32, [None, None])
        # placeholder for the one-hot labels
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        # the embedding layer, exactly as in the first model
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        # stack two LSTM cells
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype=tf.float32)
        # project the 128-component output down to 2 components
        W = tf.get_variable('w', shape=(size_layer, dimension_output),
                           initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                    labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        
        
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    '''
    Convert the words of each input sentence into their ids.
    '''
    def word_to_index(corpus, dic, maxlen, UNK=3):
      X = np.zeros((len(corpus), maxlen))
      for i in range(len(corpus)):
        '''
        A sentence may hold at most maxlen words; longer ones are truncated. Walking
        from the last word back to the first, each word is replaced by its id, so
        sentences end up right-aligned in the zero-padded rows.
        '''
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
          try:
            X[i, -1 - no] = dic[k]
          except KeyError:
            # words missing from the dictionary map to the UNK token
            X[i, -1 - no] = UNK
          
      return X
    
    s = []
    s.append("the rock is destined to be")
    x = word_to_index(s, dictionary, maxlen)
    print(x)

    import time
    
    tf.reset_default_graph()
    sess = tf.InteractiveSession()
    # dict_size is vocabulary_size + 4 to make room for the GO/PAD/EOS/UNK tokens
    rnn = RNN(size_layer, num_layers, embedded_size, vocabulary_size+4, dimension_output,
             learning_rate)
    sess.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
    checkpoint_dir = '/content/gdrive/My Drive/dataset/checkpoints_basic_rnn'
    
    # stop when test accuracy has not improved for 30 consecutive epochs
    EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 30, 0, 0, 0
    while True:
      lasttime = time.time()
      if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch: %d\n' % (EPOCH))
        break
        
      train_acc, train_loss, test_acc,test_loss = 0,0,0,0
      # train the network
      for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(train_X[i : i + batch_size], dictionary, maxlen)
        acc, loss, _ = sess.run([rnn.accuracy, rnn.cost, rnn.optimizer], 
                                feed_dict = {rnn.X: batch_x,
                                             rnn.Y: train_onehot[i : i + batch_size]})
        train_loss += loss
        train_acc += acc
      
      # evaluate on the held-out test set
      for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(test_X[i : i + batch_size], dictionary, maxlen)
        acc, loss = sess.run([rnn.accuracy, rnn.cost],
                            feed_dict = {rnn.X : batch_x,
                                        rnn.Y : test_onehot[i : i + batch_size]})
        test_loss += loss
        test_acc += acc
      
      train_loss /= (len(train_X) // batch_size)
      train_acc /= (len(train_X) // batch_size)
      test_loss /= (len(test_X) // batch_size)
      test_acc /= (len(test_X) // batch_size)
      
      if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc %f' % (EPOCH, CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
      else:
        CURRENT_CHECKPOINT += 1
      
      print('time taken: ', time.time() - lasttime)
      print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n' %
            (EPOCH, train_loss, train_acc, test_loss, test_acc))
      path = saver.save(sess, checkpoint_dir, global_step = EPOCH)
      EPOCH += 1
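
    With the session still open, the trained graph can score a fresh sentence end to end; a minimal sketch (the review text below is made up):

    review = 'the movie is a remarkable piece of work'
    batch = word_to_index([clearstring(review)], dictionary, maxlen)
    probs = sess.run(tf.nn.softmax(rnn.logits), feed_dict={rnn.X: batch})
    # probs[0] holds one probability per class, ordered as in trainset.target_names
    print(trainset.target_names[int(np.argmax(probs[0]))], probs[0])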
      
      
