!mkdir '/content/gdrive/My Drive/movie review'
!mkdir '/content/gdrive/My Drive/movie review/good/'
!mkdir '/content/gdrive/My Drive/movie review/bad/'
from utils import *
import tensorflow as tf
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
import time
import re
import numpy as np
def clearstring(string):
    # Keep only letters and digits; everything else becomes a space
    string = re.sub('[^A-Za-z0-9]+', ' ', string)
    # Split the sentence into a list of words
    string = string.split(' ')
    string = filter(None, string)
    # Strip leading and trailing whitespace from each word
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()
def seperate_dataset(trainset, ratio=0.5):
    '''
    Split each document into individual sentences and attach a label to every sentence
    '''
    text = []
    label = []
    for i in range(int(len(trainset.data) * ratio)):
        # Split the document into sentences, one per line
        data_ = trainset.data[i].split('\n')
        # Drop empty lines
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            # Remove characters that do not match the cleaning rule
            data_[n] = clearstring(data_[n])
        text += data_
        for n in range(len(data_)):
            # Attach the label; the directory has only two subfolders, so there are only two labels
            label.append(trainset.target[i])
    return text, label
s = ' this is 98 !@# *q'
s = clearstring(s)
print(s)
path = '/content/gdrive/My Drive/movie review'
trainset = sklearn.datasets.load_files(container_path = path, encoding = 'UTF-8')
'''
Extract the sentences from the documents to form two collections. Since the directory
contains only two subfolders, the sentences carry one of two labels.
'''
trainset.data, trainset.target = seperate_dataset(trainset, 1.0)
print(trainset.target_names)
print('training data has {0} items'.format(len(trainset.data)))
# Attach a two-element one-hot vector to every record
onehot = np.zeros((len(trainset.data), len(trainset.target_names)))
# Sentences from the bad folder get label [1,0]; sentences from the good folder get label [0,1]
onehot[np.arange(len(trainset.data)), trainset.target] = 1.0
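'''
A small illustrative sketch (not part of the pipeline): with three records and labels
[0, 1, 1], the fancy-indexing line above produces the one-hot rows [[1,0],[0,1],[0,1]].
'''
demo_onehot = np.zeros((3, 2))
demo_onehot[np.arange(3), [0, 1, 1]] = 1.0
print(demo_onehot)  # [[1. 0.] [0. 1.] [0. 1.]]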
'''
Split the three arrays trainset.data, trainset.target and onehot 80/20,
keeping the smaller part for testing.
'''
train_X, test_X, train_Y, test_Y, train_onehot,test_onehot = train_test_split(trainset.data, trainset.target, onehot, test_size = 0.2)
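# Quick check (illustration only): with test_size=0.2 the split is roughly 80/20
print(len(train_X), len(test_X))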
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
print(vocabulary_size)
import collections
def build_dataset(words, n_words):
    '''
    Build a dictionary that maps every word in the corpus to an integer id
    '''
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    # Count word frequencies and keep only the n_words most common words
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        # Assign an id to every word; ordinary words start at id 4, after the special tokens
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        # Words outside the selected range fall back to the UNK id (3)
        index = dictionary.get(word, 3)
        if index == 3:
            unk_count += 1
        data.append(index)
    count[3][1] = unk_count
    # Build the reverse mapping from id back to word
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
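'''
A quick illustrative check (not in the original flow): the four special tokens occupy
the first four ids, and every id from 4 onward maps back to a corpus word.
'''
print(dictionary['GO'], dictionary['PAD'], dictionary['EOS'], dictionary['UNK'])  # 0 1 2 3
print(rev_dictionary[4])  # the most frequent word in the corpus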
'''
The network has two layers with 128 nodes each.
'''
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
class RNN:
    def __init__(self, size_layer, num_layers, embedded_size, dict_size,
                 dimension_output, learning_rate):
        def cells(reuse = False):
            '''
            TensorFlow provides a ready-made RNN cell. Like the recurrent node described
            earlier, it keeps an internal state for the current input and passes that
            state on to the next step.
            '''
            return tf.nn.rnn_cell.BasicRNNCell(size_layer, reuse=reuse)
        # Placeholder for the input data
        self.X = tf.placeholder(tf.int32, [None, None])
        # Placeholder for the output labels
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        '''
        The embedding layer is the same as the first layer of the word-vector network
        discussed earlier: a 2-D matrix whose rows are word vectors. self.X holds word
        ids (conceptually one-hot vectors) that select rows from this matrix.
        '''
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        # Stack two RNN cells to increase the model's capacity
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        # Unroll the cells over the input sequence to form the RNN layer
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype=tf.float32)
        # Project the 128-dimensional RNN output down to a 2-dimensional vector
        W = tf.get_variable('w', shape=(size_layer, dimension_output),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        # Of the two components, the larger one is taken as the predicted class
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                           labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
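'''
A small numpy sketch (illustration only, not part of the model): embedding_lookup is
simply row selection from the embedding matrix, so a batch of word ids becomes a batch
of word vectors.
'''
demo_emb = np.random.uniform(-1, 1, (5, 4))   # 5 words, 4-dimensional vectors
demo_ids = np.array([2, 0, 2])
print(demo_emb[demo_ids].shape)               # (3, 4): one row per looked-up id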
'''
The same two-layer network with 128 nodes per layer, this time built with LSTM cells.
'''
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128
class RNN:
    def __init__(self, size_layer, num_layers, embedded_size, dict_size,
                 dimension_output, learning_rate):
        def cells(reuse = False):
            '''
            TensorFlow also provides an LSTM cell. Like the recurrent node described
            earlier, it keeps an internal state for the current input and passes that
            state on to the next step.
            '''
            return tf.nn.rnn_cell.LSTMCell(size_layer, reuse=reuse)
        # Placeholder for the input data
        self.X = tf.placeholder(tf.int32, [None, None])
        # Placeholder for the output labels
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        '''
        The embedding layer is the same as the first layer of the word-vector network
        discussed earlier: a 2-D matrix whose rows are word vectors. self.X holds word
        ids (conceptually one-hot vectors) that select rows from this matrix.
        '''
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        # Stack two LSTM cells to increase the model's capacity
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        # Unroll the cells over the input sequence to form the RNN layer
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype=tf.float32)
        # Project the 128-dimensional RNN output down to a 2-dimensional vector
        W = tf.get_variable('w', shape=(size_layer, dimension_output),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output), initializer=tf.zeros_initializer())
        # Of the two components, the larger one is taken as the predicted class
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                           labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
'''
Convert the words of each input sentence into their dictionary ids
'''
def word_to_index(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        '''
        A sentence may contain at most maxlen words; longer sentences are truncated.
        The words are then converted to ids from the last word back to the first,
        so each row is right-aligned and left-padded with zeros.
        '''
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i, -1 - no] = dic[k]
            except:
                X[i, -1 - no] = UNK
    return X
s = []
s.append("the rock is destined to be")
x = word_to_index(s, dictionary, maxlen)
print(x)
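'''
Sketch (illustration only): an out-of-vocabulary token falls back to the UNK id (3),
and short sentences are left-padded with zeros so every row has length maxlen.
The token 'qwertyuiop' is assumed not to appear in the training vocabulary.
'''
x_unk = word_to_index(['qwertyuiop movie'], dictionary, maxlen)
print(x_unk[0, -2:])  # e.g. [3., <id of "movie">]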
import time
tf.reset_default_graph()
sess = tf.InteractiveSession()
rnn = RNN(size_layer, num_layers, embedded_size, vocabulary_size+4, dimension_output,
          learning_rate)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
checkpoint_dir = '/content/gdrive/My Drive/dataset/checkpoints_basci_rnn'
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 30, 0, 0, 0
while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch: %d\n' % (EPOCH))
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    # Train the network
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(train_X[i : i + batch_size], dictionary, maxlen)
        acc, loss, _ = sess.run([rnn.accuracy, rnn.cost, rnn.optimizer],
                                feed_dict = {rnn.X: batch_x,
                                             rnn.Y: train_onehot[i : i + batch_size]})
        train_loss += loss
        train_acc += acc
    # Evaluate on the held-out data
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(test_X[i : i + batch_size], dictionary, maxlen)
        acc, loss = sess.run([rnn.accuracy, rnn.cost],
                             feed_dict = {rnn.X : batch_x,
                                          rnn.Y : test_onehot[i : i + batch_size]})
        test_loss += loss
        test_acc += acc
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc %f' % (EPOCH, CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    print('time taken: ', time.time() - lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n' % (EPOCH,
                                                                                               train_loss,
                                                                                               train_acc, test_loss,
                                                                                               test_acc))
    path = saver.save(sess, checkpoint_dir, global_step = EPOCH)
    EPOCH += 1
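'''
A minimal inference sketch (assumed usage, not from the original notebook): clean a new
sentence, convert it to ids, and take the argmax of the logits to read off the predicted
folder name. The example sentence is made up for illustration.
'''
sample = [clearstring('this movie was a wonderful surprise')]
sample_x = word_to_index(sample, dictionary, maxlen)
pred = sess.run(tf.argmax(rnn.logits, 1), feed_dict={rnn.X: sample_x})
print(trainset.target_names[pred[0]])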