Andrej Karpathy's char-rnn, Python 3 version

The listing below is Karpathy's minimal character-level vanilla RNN (min-char-rnn), adapted to run under Python 3.

    """
    Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
    BSD License
    """
    import numpy as np

    # data I/O
    data = open('input.txt', 'r', encoding='utf-8').read() # should be simple plain text file
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print('data has %d characters, %d unique.' % (data_size, vocab_size))
    char_to_ix = { ch:i for i,ch in enumerate(chars) }
    ix_to_char = { i:ch for i,ch in enumerate(chars) }

    # hyperparameters
    hidden_size = 100 # size of hidden layer of neurons
    seq_length = 25 # number of steps to unroll the RNN for
    learning_rate = 1e-1

    # model parameters
    Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
    Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
    Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
    bh = np.zeros((hidden_size, 1)) # hidden bias
    by = np.zeros((vocab_size, 1)) # output bias

    def lossFun(inputs, targets, hprev):
        """
        inputs,targets are both list of integers.
        hprev is Hx1 array of initial hidden state
        returns the loss, gradients on model parameters, and last hidden state
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        loss = 0
        # forward pass
        for t in range(len(inputs)):
            xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
            xs[t][inputs[t]] = 1
            hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
            ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
            loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
        # backward pass: compute gradients going backwards
        dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(inputs))):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
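            # (softmax + cross-entropy: dL/dy = ps[t] - onehot(target), so subtracting 1 at the
            #  target index turns the copied probabilities into the full gradient on the logits)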
            dWhy += np.dot(dy, hs[t].T)
            dby += dy
            dh = np.dot(Why.T, dy) + dhnext # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(Whh.T, dhraw)
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

    def sample(h, seed_ix, n):
        """
        sample a sequence of integers from the model
        h is memory state, seed_ix is seed letter for first time step
        """
        x = np.zeros((vocab_size, 1))
        x[seed_ix] = 1
        ixes = []
        for t in range(n):
            h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
            y = np.dot(Why, h) + by
            p = np.exp(y) / np.sum(np.exp(y))
            ix = np.random.choice(list(range(vocab_size)), p=p.ravel())
            x = np.zeros((vocab_size, 1))
            x[ix] = 1
            ixes.append(ix)
        return ixes

    n, p = 0, 0
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
    smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
    while True:
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+seq_length+1 >= len(data) or n == 0:
            hprev = np.zeros((hidden_size,1)) # reset RNN memory
            p = 0 # go from start of data
        inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
        targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

        # sample from the model now and then
        if n % 100 == 0:
            sample_ix = sample(hprev, inputs[0], 200)
            txt = ''.join(ix_to_char[ix] for ix in sample_ix)
            print('----\n %s \n----' % (txt, ))

        # forward seq_length characters through the net and fetch gradient
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001
        if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

        # perform parameter update with Adagrad
        for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                      [dWxh, dWhh, dWhy, dbh, dby],
                                      [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

        p += seq_length # move data pointer
        n += 1 # iteration counter
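
The script expects a plain-text corpus named input.txt in the working directory and then trains indefinitely, printing a 200-character sample and the smoothed loss every 100 iterations. A minimal smoke test might look like the sketch below (the filename min_char_rnn.py is an assumption, not part of the original; save the listing under any name):

    # create a tiny toy corpus so the training loop has something to read;
    # any plain-text file works, and longer, more varied text gives better samples
    with open('input.txt', 'w', encoding='utf-8') as f:
        f.write('hello world\n' * 500)

    # then run the listing itself, e.g. `python min_char_rnn.py`, and stop it with
    # Ctrl+C once the printed samples look reasonable -- the while True loop never exits
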
Original article: https://www.cnblogs.com/yanhuihang/p/char-rnn-py3.html