  • An RNN example illustrating forward propagation and backpropagation through time (BPTT)
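
    For reference, these are the recurrences the code below implements (note the
    tanh nonlinearity, whose derivative 1 - h_t^2 shows up in bptt):

        h_t     = tanh(U x_t + W h_{t-1})
        o_t     = V h_t
        y_hat_t = softmax(o_t)

    With a cross-entropy loss, the backward pass uses the recursion

        delta_y_t = y_hat_t - y_t
        delta_t   = diag(1 - h_t^2) (V^T delta_y_t + W^T delta_{t+1})
        dL/dV     = sum_t delta_y_t h_t^T
        dL/dW     = sum_t delta_t h_{t-1}^T
        dL/dU     = sum_t delta_t x_t^T

    where the W^T delta_{t+1} term is absent at the last time step.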

    import numpy as np
    from datetime import datetime
    class RNN:
        def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
            self.word_dim = word_dim
            self.hidden_dim = hidden_dim
            self.bptt_truncate = bptt_truncate
            # Randomly initialize the network parameters, np.random.uniform(low, high, size=(m, n)) -> matrix: m * n
            self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
            self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
            self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

        def softmax(self, x):
            # Shift by the max for numerical stability; the result is unchanged
            exp_x = np.exp(x - np.max(x))
            softmax_x = exp_x / np.sum(exp_x)
            return softmax_x

        def forward_propagation(self, x):
            # hidden states are h, predictions are y_hat
            T = len(x)
            # h holds one extra row so that h[-1] can serve as the initial hidden state
            h = np.zeros((T + 1, self.hidden_dim))
            h[-1] = np.zeros(self.hidden_dim)
            y_hat = np.zeros((T, self.word_dim))
            # For each time step...
            for t in np.arange(T):
                x_t = np.array(x[t]).reshape(-1, 1)
                # h_t = tanh(U x_t + W h_{t-1}); tanh matches the 1 - h^2 derivative used in bptt
                h[t] = np.tanh(self.U.dot(x_t) + self.W.dot(h[t - 1].reshape(-1, 1))).reshape(-1)
                o_t = self.V.dot(h[t])
                y_hat[t] = self.softmax(o_t)
            return y_hat, h

        def predict(self, x):
            # Perform forward propagation and return the index of the highest score at each step
            y, h = self.forward_propagation(x)
            return np.argmax(y, axis=1)

        def calculate_total_loss(self, x, labels):
            total_L = 0
            # For each sentence...
            for i in np.arange(len(labels)):
                y_hat, h = self.forward_propagation(x[i])
                # Cross-entropy: -log of the probability assigned to the correct word at each step
                total_L += -sum(np.log(y_pred.dot(y_true)) for y_pred, y_true in zip(y_hat, np.array(labels[i])))
            return total_L

        def calculate_loss(self, x, labels):
            # Divide the total loss by the number of predicted words
            N = np.sum([len(label_i) for label_i in labels])
            return self.calculate_total_loss(x, labels) / N

        def bptt(self, x, label):
            T = len(label)
            # Perform forward propagation
            y_hat, h = self.forward_propagation(x)
            # We accumulate the gradients in these variables
            dLdU = np.zeros(self.U.shape)
            dLdV = np.zeros(self.V.shape)
            dLdW = np.zeros(self.W.shape)
            # delta_y -> dL/do_t = y_hat_t - y_t (softmax combined with cross-entropy)
            delta_y = np.zeros(y_hat.shape)
            # Walk backwards through the outputs...
            for t in np.arange(T - 1, -1, -1):
                delta_y[t] = y_hat[t] - np.array(label[t])
                dLdV += delta_y[t].reshape(-1, 1) @ h[t].reshape(1, -1)
                # delta_t = diag(1 - h_t^2) (V^T delta_y_t + W^T delta_{t+1});
                # at the last time step there is no delta_{t+1} term
                if t == T - 1:
                    delta_t = np.diag(1 - np.power(h[t], 2)) @ self.V.T @ delta_y[t].reshape(-1, 1)
                else:
                    delta_t = np.diag(1 - np.power(h[t], 2)) @ (self.V.T @ delta_y[t].reshape(-1, 1) + self.W.T @ delta_t)
                dLdW += delta_t @ h[t - 1].reshape(1, -1)
                dLdU += delta_t @ np.array(x[t]).reshape(1, -1)
            return dLdU, dLdV, dLdW
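
        # Note: self.bptt_truncate (set in __init__) is never used above, so this
        # is full, untruncated BPTT. A truncated variant (my reading of the
        # parameter's intent, not something the original post implements) would
        # stop the delta_t recursion after bptt_truncate steps back in time.
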
        # Performs one step of SGD.
        def numpy_sgd_step(self, x, label, learning_rate):
            # Calculate the gradients
            dLdU, dLdV, dLdW = self.bptt(x, label)
            # Change parameters according to gradients and learning rate
            self.U -= learning_rate * dLdU
            self.V -= learning_rate * dLdV
            self.W -= learning_rate * dLdW

    # - model: The RNN model instance
    # - X_train: The training data set
    # - y_train: The training data labels
    # - learning_rate: Initial learning rate for SGD
    # - nepoch: Number of times to iterate through the complete dataset
    # - evaluate_loss_after: Evaluate the loss after this many epochs
    def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = model.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(f'{time} Loss after num_examples_seen {num_examples_seen} epoch {epoch}, current loss is {loss}')
                # Halve the learning rate if the loss increased
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
                    print(f'Setting learning rate to {learning_rate}')

            # For each training example...
            for i in range(len(y_train)):
                # One SGD step
                model.numpy_sgd_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1

    if __name__ == '__main__':
        # Two space-separated sentences; each character is one "word"
        s1 = '你 好 李 焕 英'
        s2 = '夏 洛 特 烦 恼'
        vocab_size = len(s1.split(' ')) + len(s2.split(' '))
        # One-hot encode the vocabulary: vocab[i] is the one-hot vector for word i
        vocab = [[0] * vocab_size for _ in range(vocab_size)]
        for i in range(vocab_size):
            vocab[i][i] = 1
        # Inputs are the two sentences; labels are the same sequences shifted
        # by one word (the second sequence wraps around to vocab[0])
        x_sample = [vocab[:5]] + [vocab[5:]]
        labels = [vocab[1:6]] + [vocab[6:] + [vocab[0]]]

        rnn = RNN(vocab_size)
        train_with_sgd(rnn, x_sample, labels)
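
    A quick way to validate bptt is a numerical gradient check. Below is a
    minimal sketch (the helper and its name gradient_check, the eps step, and
    the tolerance are my own additions, not from the original post): it perturbs
    each parameter entry by +/-eps, recomputes the total loss, and compares the
    centered difference against the analytic gradient. On the toy data above it
    can be called as gradient_check(rnn, x_sample[0], labels[0]).

    def gradient_check(model, x, label, eps=1e-4, tol=1e-3):
        # Analytic gradients from BPTT for one sequence
        dLdU, dLdV, dLdW = model.bptt(x, label)
        for name, param, grad in (('U', model.U, dLdU),
                                  ('V', model.V, dLdV),
                                  ('W', model.W, dLdW)):
            it = np.nditer(param, flags=['multi_index'])
            while not it.finished:
                idx = it.multi_index
                saved = param[idx]
                # Centered difference: (L(p + eps) - L(p - eps)) / (2 * eps)
                param[idx] = saved + eps
                loss_plus = model.calculate_total_loss([x], [label])
                param[idx] = saved - eps
                loss_minus = model.calculate_total_loss([x], [label])
                param[idx] = saved  # restore the parameter
                numeric = (loss_plus - loss_minus) / (2 * eps)
                analytic = grad[idx]
                denom = max(abs(numeric) + abs(analytic), 1e-8)
                if abs(numeric - analytic) / denom > tol:
                    print(f'{name}{idx}: numeric {numeric:.6f}, analytic {analytic:.6f}')
                it.iternext()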

    Reference: https://zhuanlan.zhihu.com/p/371849556
