  • TensorFlow Tutorial: Generating Tang Poetry with an RNN

    This tutorial is reposted from: TensorFlow练习7: 基于RNN生成古诗词 (TensorFlow Exercise 7: generating classical Chinese poetry with an RNN).

    The dataset is the Complete Tang Poems (全唐诗). Download link: https://pan.baidu.com/s/13pNWfffr5HSN79WNb3Y0_w (extraction code: koss)

    Unlike a traditional neural network, whose inputs and outputs have fixed sizes, an RNN lets us feed in and produce sequences of vectors; RNNs were created to model sequence data. The code in this post is ported from char-rnn, a Torch-based model for English text that works on Chinese with only minor changes. char-rnn takes a text file as input, trains an RNN model on it, and then generates text similar to the training data.

    The code below has been modified to run on TensorFlow 1.4 and the GPU platform.

        #coding=utf-8
        import collections
        import numpy as np
        import tensorflow as tf
        import io
        import sys
        import os

        # Python 2: make utf-8 the default string encoding so Chinese text decodes cleanly
        reload(sys)
        sys.setdefaultencoding('utf-8')

        #-------------------------------Data preprocessing---------------------------#

        poetry_file = 'poetry.txt'

        # Collected poems
        poetrys = []
        with io.open(poetry_file, "r", encoding='utf-8') as f:
            for line in f:
                try:
                    title, content = line.strip().split(':')
                    content = content.replace(' ', '')
                    # Skip poems containing annotations or bracket markers
                    if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                        continue
                    # Skip poems that are too short or too long
                    if len(content) < 5 or len(content) > 79:
                        continue
                    # '[' and ']' mark the start and end of a poem
                    content = '[' + content + ']'
                    poetrys.append(content)
                except Exception as e:
                    pass

        # Sort the poems by length
        poetrys = sorted(poetrys, key=lambda line: len(line))
        print(u"Total number of Tang poems:")
        print(len(poetrys))

        # Count how often each character occurs
        all_words = []
        for poetry in poetrys:
            all_words += [word for word in poetry]
        counter = collections.Counter(all_words)
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        words, _ = zip(*count_pairs)

        # Keep the most frequent characters (here: all of them) and append a space for padding
        words = words[:len(words)] + (' ',)
        # Map each character to an integer ID
        word_num_map = dict(zip(words, range(len(words))))
        # Convert each poem into a vector of IDs; unknown characters map to len(words)
        to_num = lambda word: word_num_map.get(word, len(words))
        poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
        #[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
        #[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
        #....]

        # Train on 64 poems per batch
        batch_size = 64
        n_chunk = len(poetrys_vector) // batch_size
        x_batches = []
        y_batches = []
        for i in range(n_chunk):
            start_index = i * batch_size
            end_index = start_index + batch_size

            batches = poetrys_vector[start_index:end_index]
            length = max(map(len, batches))
            # Pad every poem in the batch to the same length with the space character
            xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
            for row in range(batch_size):
                xdata[row, :len(batches[row])] = batches[row]
            # The target sequence is the input shifted left by one character
            ydata = np.copy(xdata)
            ydata[:, :-1] = xdata[:, 1:]
            """
            xdata             ydata
            [6,2,4,6,9]       [2,4,6,9,9]
            [1,4,2,8,5]       [4,2,8,5,5]
            """
            x_batches.append(xdata)
            y_batches.append(ydata)

        #---------------------------------------RNN--------------------------------------#

        input_data = tf.placeholder(tf.int32, [batch_size, None])
        output_targets = tf.placeholder(tf.int32, [batch_size, None])

        # Define the RNN
        def neural_network(model='lstm', rnn_size=128, num_layers=2):
            if model == 'rnn':
                cell_fun = tf.nn.rnn_cell.BasicRNNCell
            elif model == 'gru':
                cell_fun = tf.nn.rnn_cell.GRUCell
            elif model == 'lstm':
                cell_fun = tf.nn.rnn_cell.BasicLSTMCell

            cell = cell_fun(rnn_size, state_is_tuple=True)
            cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

            initial_state = cell.zero_state(batch_size, tf.float32)

            with tf.variable_scope('rnnlm'):
                softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
                softmax_b = tf.get_variable("softmax_b", [len(words)+1])
                with tf.device("/gpu:0"):
                    embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
                    inputs = tf.nn.embedding_lookup(embedding, input_data)

            outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
            output = tf.reshape(outputs, [-1, rnn_size])

            logits = tf.matmul(output, softmax_w) + softmax_b
            probs = tf.nn.softmax(logits)
            return logits, last_state, probs, cell, initial_state

        # saver.save requires the parent directory to exist, so create it up front
        ckpt_dir = "./ckpt_dir"
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

        # Training
        def train_neural_network():
            logits, last_state, _, _, _ = neural_network()
            targets = tf.reshape(output_targets, [-1])
            loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(words))
            cost = tf.reduce_mean(loss)
            learning_rate = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            # Clip gradients to a global norm of 5 to avoid exploding gradients
            grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.apply_gradients(zip(grads, tvars))

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                saver = tf.train.Saver(tf.global_variables())

                for epoch in range(295):
                    # Decay the learning rate exponentially per epoch
                    sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
                    n = 0
                    for batche in range(n_chunk):
                        train_loss, _, _ = sess.run([cost, last_state, train_op], feed_dict={input_data: x_batches[n], output_targets: y_batches[n]})
                        n += 1
                        print(epoch, batche, train_loss)
                    if epoch % 7 == 0:
                        saver.save(sess, ckpt_dir + '/poetry.module', global_step=epoch)

        train_neural_network()

    Here I only discuss my own notes on debugging and tuning; for a detailed understanding of the code, please refer to the original author.

    First, the #coding=utf-8 issue: this line tells the Python environment that the script's source encoding is utf-8. Without it, the default ASCII environment will very likely raise encoding errors.

    Next is the utf-8 encoding of the dataset itself. The file is opened with encoding='utf-8', but the Python environment is never told that its default string encoding should be utf-8, so every parsed content and title raises an error and the processed dataset ends up with size 0. Setting sys's default encoding resolves this.

    At the same time, the builtin open function (in Python 2) has no encoding parameter; that option belongs to io.open, so this call has to be changed as well.
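
    A minimal sketch of both fixes together, assuming Python 2 and a utf-8 encoded poetry.txt (this just isolates what the full script above already does):

        # -*- coding: utf-8 -*-
        # Python 2 sketch: read a utf-8 file and split each line at the title separator.
        import io
        import sys

        # Without these two lines, implicit str/unicode conversions fall back to
        # ASCII and raise UnicodeDecodeError on Chinese text.
        reload(sys)
        sys.setdefaultencoding('utf-8')

        # The Python 2 builtin open() has no encoding parameter; io.open() does.
        with io.open('poetry.txt', 'r', encoding='utf-8') as f:
            for line in f:
                title, content = line.strip().split(':')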

    One more point concerns API usage: for example, saver.save now requires the parent directory of the checkpoint path to already exist.
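
    Isolated as a standalone sketch (the dummy variable v is mine, just so Saver has something to save):

        import os
        import tensorflow as tf

        ckpt_dir = "./ckpt_dir"
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)  # saver.save does not create missing parent directories

        v = tf.Variable(0, name='v')  # dummy variable so Saver has something to save
        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # The directory part of the save path (./ckpt_dir) must already exist.
            saver.save(sess, ckpt_dir + '/poetry.module', global_step=0)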

    Next comes the inference code:

        #coding=utf-8
        import collections
        import numpy as np
        import tensorflow as tf
        import io
        import sys
        import os
        import time

        # Python 2: make utf-8 the default string encoding so Chinese text decodes cleanly
        reload(sys)
        sys.setdefaultencoding('utf-8')

        #-------------------------------Data preprocessing---------------------------#

        poetry_file = 'poetry.txt'

        # Collected poems
        poetrys = []
        with io.open(poetry_file, "r", encoding='utf-8') as f:
            for line in f:
                try:
                    title, content = line.strip().split(':')
                    content = content.replace(' ', '')
                    # Skip poems containing annotations or bracket markers
                    if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                        continue
                    if len(content) < 5 or len(content) > 79:
                        continue
                    # '[' and ']' mark the start and end of a poem
                    content = '[' + content + ']'
                    poetrys.append(content)
                except Exception as e:
                    pass

        # Sort the poems by length
        poetrys = sorted(poetrys, key=lambda line: len(line))
        print(u'Total number of Tang poems: ', len(poetrys))

        # Count how often each character occurs
        all_words = []
        for poetry in poetrys:
            all_words += [word for word in poetry]
        counter = collections.Counter(all_words)
        count_pairs = sorted(counter.items(), key=lambda x: -x[1])
        words, _ = zip(*count_pairs)

        # Keep the most frequent characters (here: all of them) and append a space for padding
        words = words[:len(words)] + (' ',)
        # Map each character to an integer ID
        word_num_map = dict(zip(words, range(len(words))))
        # Convert each poem into a vector of IDs
        to_num = lambda word: word_num_map.get(word, len(words))
        poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
        #[[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
        #[339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1]
        #....]

        # Batch size 1: generate one poem at a time
        batch_size = 1
        n_chunk = len(poetrys_vector) // batch_size
        x_batches = []
        y_batches = []
        for i in range(n_chunk):
            start_index = i * batch_size
            end_index = start_index + batch_size

            batches = poetrys_vector[start_index:end_index]
            length = max(map(len, batches))
            xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
            for row in range(batch_size):
                xdata[row, :len(batches[row])] = batches[row]
            ydata = np.copy(xdata)
            ydata[:, :-1] = xdata[:, 1:]
            """
            xdata             ydata
            [6,2,4,6,9]       [2,4,6,9,9]
            [1,4,2,8,5]       [4,2,8,5,5]
            """
            x_batches.append(xdata)
            y_batches.append(ydata)

        #---------------------------------------RNN--------------------------------------#

        input_data = tf.placeholder(tf.int32, [batch_size, None])
        output_targets = tf.placeholder(tf.int32, [batch_size, None])

        # Define the RNN (must match the architecture used during training)
        def neural_network(model='lstm', rnn_size=128, num_layers=2):
            if model == 'rnn':
                cell_fun = tf.nn.rnn_cell.BasicRNNCell
            elif model == 'gru':
                cell_fun = tf.nn.rnn_cell.GRUCell
            elif model == 'lstm':
                cell_fun = tf.nn.rnn_cell.BasicLSTMCell

            cell = cell_fun(rnn_size, state_is_tuple=True)
            cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

            initial_state = cell.zero_state(batch_size, tf.float32)

            with tf.variable_scope('rnnlm'):
                softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
                softmax_b = tf.get_variable("softmax_b", [len(words)+1])
                with tf.device("/gpu:0"):
                    embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
                    inputs = tf.nn.embedding_lookup(embedding, input_data)

            outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
            output = tf.reshape(outputs, [-1, rnn_size])

            logits = tf.matmul(output, softmax_w) + softmax_b
            probs = tf.nn.softmax(logits)
            return logits, last_state, probs, cell, initial_state

        #-------------------------------Generating poems---------------------------------#
        # Use the trained model

        def gen_poetry():
            def to_word(weights):
                # Sample a character index from the output distribution
                t = np.cumsum(weights)
                s = np.sum(weights)
                sample = int(np.searchsorted(t, np.random.rand(1)*s))
                return words[sample]

            _, last_state, probs, cell, initial_state = neural_network()

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                saver = tf.train.Saver(tf.global_variables())
                saver.restore(sess, './ckpt_dir/poetry.module-294')

                state_ = sess.run(cell.zero_state(1, tf.float32))

                # Seed generation with the start-of-poem marker '['
                x = np.array([list(map(word_num_map.get, '['))])
                [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
                #word = words[np.argmax(probs_)]  # greedy alternative
                poem = ''
                # Keep sampling until the end-of-poem marker ']'
                while word != ']':
                    poem += word
                    x = np.zeros((1, 1))
                    x[0, 0] = word_num_map[word]
                    [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                    word = to_word(probs_)
                    #word = words[np.argmax(probs_)]  # greedy alternative
                return poem


        def gen_poetry_with_head(head):
            def to_word(weights):
                t = np.cumsum(weights)
                s = np.sum(weights)
                sample = int(np.searchsorted(t, np.random.rand(1)*s))
                return words[sample]

            _, last_state, probs, cell, initial_state = neural_network()

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                saver = tf.train.Saver(tf.global_variables())
                saver.restore(sess, './ckpt_dir/poetry.module-294')

                state_ = sess.run(cell.zero_state(1, tf.float32))
                poem = ''
                i = 0
                for word in head:
                    # Generate until the model emits a comma or period, then start the next line
                    while word != ',' and word != '。':
                        poem += word
                        x = np.array([list(map(word_num_map.get, word))])
                        [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                        word = to_word(probs_)
                        time.sleep(1)
                    # Alternate the hardcoded line-ending punctuation
                    if i % 2 == 0:
                        poem += ','
                    else:
                        poem += '。'
                    i += 1
                return poem

        print(gen_poetry())
        # print(gen_poetry_with_head(u'一二三四'))

    The usage of this acrostic-poem (gen_poetry_with_head) code is problematic, and I don't recommend using it as-is; it took me a long time to get it working. For now I'm listing the original author's code; I'll cover the fixes and tuning for this part separately next time.
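
    In the meantime, here is a minimal sketch of how the loop could be restructured (my own adaptation, not the original author's fix; the name gen_head_poem is mine). It primes the state with the start marker '[', feeds each head character once, then feeds back each sampled character until the model emits a comma or period, and it reuses neural_network, words, word_num_map, input_data, and the checkpoint path from the script above:

        def gen_head_poem(head):
            # Sketch only: assumes the graph and vocabulary defined in the script above.
            def to_word(weights):
                t = np.cumsum(weights)
                return words[int(np.searchsorted(t, np.random.rand(1) * np.sum(weights)))]

            _, last_state, probs, cell, initial_state = neural_network()
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                saver = tf.train.Saver(tf.global_variables())
                saver.restore(sess, './ckpt_dir/poetry.module-294')

                # Prime the RNN state with the start-of-poem marker
                state_ = sess.run(cell.zero_state(1, tf.float32))
                x = np.array([[word_num_map['[']]])
                probs_, state_ = sess.run([probs, last_state],
                                          feed_dict={input_data: x, initial_state: state_})

                poem = ''
                for i, head_char in enumerate(head):
                    poem += head_char
                    word = head_char
                    while True:
                        # Feed the character just emitted: the head character first,
                        # then each sampled character in turn
                        x = np.array([[word_num_map[word]]])
                        probs_, state_ = sess.run([probs, last_state],
                                                  feed_dict={input_data: x, initial_state: state_})
                        word = to_word(probs_)
                        if word in (',', '。', ']'):
                            break
                        poem += word
                    # Replace the sampled punctuation with alternating ',' and '。'
                    poem += ',' if i % 2 == 0 else '。'
                return poem

    Usage would mirror the original, e.g. print(gen_head_poem(u'一二三四')). The key differences are that the state sees the '[' marker first and each line stops cleanly at the first punctuation the model produces; whether this matches the author's eventual fix, I can't confirm.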

    Result:

    The output is vaguely poem-like, but a closer look shows the problems are still serious: much of it is gibberish, and the tuning of the model is far from adequate.
