  • Improving Deep Neural Networks - Week 1 Programming Assignment (Initialization, Regularization, Gradient Checking)

    Initialization

    Parameters are initialized in three ways: all zeros, random values, and He initialization (which keeps gradients from exploding or vanishing). Comparing the three shows that He initialization reaches the highest accuracy (sketches of the zero and random initializers appear after the He results below).

    Setup and data loading:

    import numpy as np
    import matplotlib.pyplot as plt
    import sklearn
    import sklearn.datasets
    from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
    from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec
    from math import sqrt
    
    # %matplotlib inline
    plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'
    
    # load image dataset: blue/red dots in circles
    train_X, train_Y, test_X, test_Y = load_dataset()

    The code using He initialization is shown below.
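    In formula form (matching initialize_parameters_he in the code below), each weight matrix is drawn from a standard normal and scaled by the square root of 2 over the previous layer's width, while biases start at zero:

    $$W^{[l]} = \mathrm{randn}(n^{[l]}, n^{[l-1]}) \cdot \sqrt{\frac{2}{n^{[l-1]}}}, \qquad b^{[l]} = 0$$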

    # three layers
    def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he"):
        """
        Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

        Arguments:
        X -- input data, of shape (2, number of examples)
        Y -- true "label" vector (containing 0 for red dots; 1 for blue dots), of shape (1, number of examples)
        learning_rate -- learning rate for gradient descent
        num_iterations -- number of iterations to run gradient descent
        print_cost -- if True, print the cost every 1000 iterations
        initialization -- flag to choose which initialization to use ("zeros","random" or "he")

        Returns:
        parameters -- parameters learnt by the model
        """

        grads = {}
        costs = []     # to keep track of the loss
        m = X.shape[1] # number of examples
        layers_dims = [X.shape[0], 10, 5, 1]

        # Initialize parameters dictionary.
        parameters = initialize_parameters_he(layers_dims)

        # Loop (gradient descent)
        for i in range(0, num_iterations):

            # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
            a3, cache = forward_propagation(X, parameters)

            # Loss
            cost = compute_loss(a3, Y)

            # Backward propagation.
            grads = backward_propagation(X, Y, cache)

            # Update parameters.
            parameters = update_parameters(parameters, grads, learning_rate)

            # Print the loss every 1000 iterations
            if print_cost and i % 1000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
                costs.append(cost)

        # plot the loss
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (per thousands)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        return parameters


    # GRADED FUNCTION: initialize_parameters_he
    def initialize_parameters_he(layers_dims):
        """
        Arguments:
        layer_dims -- python array (list) containing the size of each layer.

        Returns:
        parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                        W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                        b1 -- bias vector of shape (layers_dims[1], 1)
                        ...
                        WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                        bL -- bias vector of shape (layers_dims[L], 1)
        """

        np.random.seed(3)
        parameters = {}
        L = len(layers_dims) - 1 # integer representing the number of layers

        for l in range(1, L + 1):
            ### START CODE HERE ### (≈ 2 lines of code)
            parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * sqrt(2. / layers_dims[l-1])
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
            ### END CODE HERE ###
        return parameters

    parameters = initialize_parameters_he([2, 4, 1])
    print("W1 = " + str(parameters["W1"]))
    print("b1 = " + str(parameters["b1"]))
    print("W2 = " + str(parameters["W2"]))
    print("b2 = " + str(parameters["b2"]))


    parameters = model(train_X, train_Y, initialization = "he")
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with He initialization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 1.5])
    axes.set_ylim([-1.5, 1.5])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy: 0.96
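    For comparison, here are minimal sketches (not from the original post) of the zero and random initializers mentioned at the start of this section, following the same layers_dims convention as initialize_parameters_he; the ×10 scale in the random version is an illustrative choice to show the effect of overly large starting weights:

    def initialize_parameters_zeros(layers_dims):
        # All weights and biases set to zero: every unit computes the same output,
        # so the network cannot break symmetry and effectively cannot learn.
        parameters = {}
        L = len(layers_dims) - 1
        for l in range(1, L + 1):
            parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l-1]))
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
        return parameters

    def initialize_parameters_random(layers_dims):
        # Large random weights break symmetry but start the network in a poor
        # region of the cost surface, which slows convergence.
        np.random.seed(3)
        parameters = {}
        L = len(layers_dims) - 1
        for l in range(1, L + 1):
            parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * 10
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
        return parameters

    Zero initialization fails to break symmetry and overly large random weights converge slowly, which is why He initialization wins the comparison above.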

    L2 Regularization

    Data loading:

    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.io
    from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
    from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
    from testCases_v3 import *

    train_X, train_Y, test_X, test_Y = load_2D_dataset()

    Without regularization:

    def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
        """
        Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

        Arguments:
        X -- input data, of shape (input size, number of examples)
        Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
        learning_rate -- learning rate of the optimization
        num_iterations -- number of iterations of the optimization loop
        print_cost -- If True, print the cost every 10000 iterations
        lambd -- regularization hyperparameter, scalar
        keep_prob - probability of keeping a neuron active during drop-out, scalar.

        Returns:
        parameters -- parameters learned by the model. They can then be used to predict.
        """

        grads = {}
        costs = []                            # to keep track of the cost
        m = X.shape[1]                        # number of examples
        layers_dims = [X.shape[0], 20, 3, 1]

        # Initialize parameters dictionary.
        parameters = initialize_parameters(layers_dims)

        # Loop (gradient descent)
        for i in range(0, num_iterations):

            # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
            if keep_prob == 1:
                a3, cache = forward_propagation(X, parameters)
            elif keep_prob < 1:
                a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

            # Cost function
            if lambd == 0:
                cost = compute_cost(a3, Y)
            else:
                cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

            # Backward propagation.
            assert(lambd == 0 or keep_prob == 1)    # it is possible to use both L2 regularization and dropout,
                                                    # but this assignment will only explore one at a time
            if lambd == 0 and keep_prob == 1:
                grads = backward_propagation(X, Y, cache)
            elif lambd != 0:
                grads = backward_propagation_with_regularization(X, Y, cache, lambd)
            elif keep_prob < 1:
                grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

            # Update parameters.
            parameters = update_parameters(parameters, grads, learning_rate)

            # Print the loss every 10000 iterations
            if print_cost and i % 10000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
            if print_cost and i % 1000 == 0:
                costs.append(cost)

        # plot the cost
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        return parameters


    parameters = model(train_X, train_Y)
    print("On the training set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model without regularization")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy: 0.915

    With L2 regularization:
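    The "formula (2)" referenced in the docstring below is the L2-regularized cost: the usual cross-entropy term plus a penalty on the squared Frobenius norms of the weight matrices. The corresponding backward pass simply adds $\frac{\lambda}{m}W^{[l]}$ to each $dW^{[l]}$, as the code does:

    $$J_{regularized} = \underbrace{-\frac{1}{m}\sum_{i=1}^{m}\Big(y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\big(1-a^{[3](i)}\big)\Big)}_{\text{cross-entropy cost}} \;+\; \underbrace{\frac{\lambda}{2m}\sum_{l}\sum_{k}\sum_{j}\big(W^{[l]}_{k,j}\big)^{2}}_{\text{L2 regularization cost}}$$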

    # GRADED FUNCTION: compute_cost_with_regularization
    def compute_cost_with_regularization(A3, Y, parameters, lambd):
        """
        Implement the cost function with L2 regularization. See formula (2) above.

        Arguments:
        A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        parameters -- python dictionary containing parameters of the model

        Returns:
        cost - value of the regularized loss function (formula (2))
        """
        m = Y.shape[1]
        W1 = parameters["W1"]
        W2 = parameters["W2"]
        W3 = parameters["W3"]

        cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost

        ### START CODE HERE ### (approx. 1 line)
        L2_regularization_cost = lambd / (2 * m) * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
        ### END CODE HERE ###

        cost = cross_entropy_cost + L2_regularization_cost
        return cost

    A3, Y_assess, parameters = compute_cost_with_regularization_test_case()
    print("cost = " + str(compute_cost_with_regularization(A3, Y_assess, parameters, lambd = 0.1)))


    # GRADED FUNCTION: backward_propagation_with_regularization
    def backward_propagation_with_regularization(X, Y, cache, lambd):
        """
        Implements the backward propagation of our baseline model to which we added an L2 regularization.

        Arguments:
        X -- input dataset, of shape (input size, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        cache -- cache output from forward_propagation()
        lambd -- regularization hyperparameter, scalar

        Returns:
        gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
        """

        m = X.shape[1]
        (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y

        ### START CODE HERE ### (approx. 1 line)
        dW3 = np.dot(dZ3, A2.T) / m + lambd * W3 / m
        ### END CODE HERE ###
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        ### START CODE HERE ### (approx. 1 line)
        dW2 = np.dot(dZ2, A1.T) / m + lambd * W2 / m
        ### END CODE HERE ###
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        ### START CODE HERE ### (approx. 1 line)
        dW1 = np.dot(dZ1, X.T) / m + lambd * W1 / m
        ### END CODE HERE ###
        db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                     "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                     "dZ1": dZ1, "dW1": dW1, "db1": db1}
        return gradients


    X_assess, Y_assess, cache = backward_propagation_with_regularization_test_case()
    grads = backward_propagation_with_regularization(X_assess, Y_assess, cache, lambd=0.7)
    print ("dW1 = " + str(grads["dW1"]))
    print ("dW2 = " + str(grads["dW2"]))
    print ("dW3 = " + str(grads["dW3"]))


    parameters = model(train_X, train_Y, lambd=0.7)
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with L2-regularization")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy: 0.93

    Dropout Regularization
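    A brief note on the "Step 4" scaling performed in the code below (inverted dropout): each kept activation is divided by keep_prob, so the expected value of the layer's output is unchanged by the random mask:

    $$\mathbb{E}\left[\frac{d_{ij}\,a_{ij}}{keep\_prob}\right] = \frac{keep\_prob \cdot a_{ij}}{keep\_prob} = a_{ij}, \qquad d_{ij} \sim \mathrm{Bernoulli}(keep\_prob)$$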

    # GRADED FUNCTION: forward_propagation_with_dropout
    def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
        """
        Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

        Arguments:
        X -- input dataset, of shape (2, number of examples)
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                        W1 -- weight matrix of shape (20, 2)
                        b1 -- bias vector of shape (20, 1)
                        W2 -- weight matrix of shape (3, 20)
                        b2 -- bias vector of shape (3, 1)
                        W3 -- weight matrix of shape (1, 3)
                        b3 -- bias vector of shape (1, 1)
        keep_prob - probability of keeping a neuron active during drop-out, scalar

        Returns:
        A3 -- last activation value, output of the forward propagation, of shape (1,1)
        cache -- tuple, information stored for computing the backward propagation
        """
        np.random.seed(1)

        # retrieve parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        W3 = parameters["W3"]
        b3 = parameters["b3"]

        # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
        Z1 = np.dot(W1, X) + b1
        A1 = relu(Z1)
        ### START CODE HERE ### (approx. 4 lines)      # Steps 1-4 below correspond to the Steps 1-4 described above.
        D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize matrix D1 = np.random.rand(..., ...)
        D1 = D1 < keep_prob                            # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
        A1 = np.multiply(A1, D1)                       # Step 3: shut down some neurons of A1
        A1 /= keep_prob                                # Step 4: scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        Z2 = np.dot(W2, A1) + b2
        A2 = relu(Z2)
        ### START CODE HERE ### (approx. 4 lines)
        D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize matrix D2 = np.random.rand(..., ...)
        D2 = D2 < keep_prob                            # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
        A2 = np.multiply(A2, D2)                       # Step 3: shut down some neurons of A2
        A2 /= keep_prob                                # Step 4: scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        Z3 = np.dot(W3, A2) + b3
        A3 = sigmoid(Z3)

        cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
        return A3, cache

    X_assess, parameters = forward_propagation_with_dropout_test_case()
    A3, cache = forward_propagation_with_dropout(X_assess, parameters, keep_prob=0.7)
    print ("A3 = " + str(A3))


    # GRADED FUNCTION: backward_propagation_with_dropout
    def backward_propagation_with_dropout(X, Y, cache, keep_prob):
        """
        Implements the backward propagation of our baseline model to which we added dropout.

        Arguments:
        X -- input dataset, of shape (2, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        cache -- cache output from forward_propagation_with_dropout()
        keep_prob - probability of keeping a neuron active during drop-out, scalar

        Returns:
        gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
        """

        m = X.shape[1]
        (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y
        dW3 = 1. / m * np.dot(dZ3, A2.T)
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        ### START CODE HERE ### (≈ 2 lines of code)
        dA2 = dA2 * D2             # Step 1: Apply mask D2 to shut down the same neurons as during the forward propagation
        dA2 = dA2 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        dW2 = 1. / m * np.dot(dZ2, A1.T)
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        ### START CODE HERE ### (≈ 2 lines of code)
        dA1 = dA1 * D1             # Step 1: Apply mask D1 to shut down the same neurons as during the forward propagation
        dA1 = dA1 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        dW1 = 1. / m * np.dot(dZ1, X.T)
        db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                     "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                     "dZ1": dZ1, "dW1": dW1, "db1": db1}

        return gradients


    X_assess, Y_assess, cache = backward_propagation_with_dropout_test_case()
    gradients = backward_propagation_with_dropout(X_assess, Y_assess, cache, keep_prob=0.8)
    print ("dA1 = " + str(gradients["dA1"]))
    print ("dA2 = " + str(gradients["dA2"]))


    parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3)
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with dropout")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy: 0.95

    Gradient Checking

    1-dimensional gradient checking:
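    The "formula (1)" and "formula (2)" referenced in the docstrings below are the two-sided difference approximation of the derivative and the relative difference used to compare it with the analytic gradient:

    $$\frac{\partial J}{\partial \theta} \approx \frac{J(\theta+\varepsilon) - J(\theta-\varepsilon)}{2\varepsilon} \qquad (1)$$

    $$difference = \frac{\lVert grad - gradapprox \rVert_2}{\lVert grad \rVert_2 + \lVert gradapprox \rVert_2} \qquad (2)$$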

    import numpy as np
    from testCases_v3 import gradient_check_n_test_case
    from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector

    # 1-dimensional gradient check
    # GRADED FUNCTION: forward_propagation
    def forward_propagation(x, theta):
        """
        Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well

        Returns:
        J -- the value of function J, computed using the formula J(theta) = theta * x
        """

        ### START CODE HERE ### (approx. 1 line)
        J = np.dot(theta, x)
        ### END CODE HERE ###
        return J

    x, theta = 2, 4
    J = forward_propagation(x, theta)
    print ("J = " + str(J))


    # GRADED FUNCTION: backward_propagation
    def backward_propagation(x, theta):
        """
        Computes the derivative of J with respect to theta (see Figure 1).

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well

        Returns:
        dtheta -- the gradient of the cost with respect to theta
        """
        ### START CODE HERE ### (approx. 1 line)
        dtheta = x
        ### END CODE HERE ###
        return dtheta

    x, theta = 2, 4
    dtheta = backward_propagation(x, theta)
    print ("dtheta = " + str(dtheta))


    # GRADED FUNCTION: gradient_check
    def gradient_check(x, theta, epsilon=1e-7):
        """
        Implement the backward propagation presented in Figure 1.

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well
        epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

        Returns:
        difference -- difference (2) between the approximated gradient and the backward propagation gradient
        """

        # Compute gradapprox using left side of formula (1). epsilon is small enough, you don't need to worry about the limit.
        ### START CODE HERE ### (approx. 5 lines)
        theta1 = theta + epsilon                          # Step 1
        theta2 = theta - epsilon                          # Step 2
        J1 = forward_propagation(x, theta1)               # Step 3
        J2 = forward_propagation(x, theta2)               # Step 4
        gradapprox = (J1 - J2) / (2 * epsilon)            # Step 5
        ### END CODE HERE ###

        # Check if gradapprox is close enough to the output of backward_propagation()
        ### START CODE HERE ### (approx. 1 line)
        grad = backward_propagation(x, theta)
        ### END CODE HERE ###

        ### START CODE HERE ### (approx. 1 line)
        numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
        denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
        difference = numerator / denominator                               # Step 3'
        ### END CODE HERE ###

        if difference < 1e-7:
            print("The gradient is correct!")
        else:
            print("The gradient is wrong!")

        return difference

    x, theta = 2, 4
    difference = gradient_check(x, theta)
    print("difference = " + str(difference))

    Output:

    The gradient is correct!
    difference = 2.919335883291695e-10

    N-dimensional gradient checking:
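    gradient_check_n below applies the same two-sided difference to one parameter component at a time: the i-th entry of the flattened parameter vector is perturbed by ±ε while all other entries are held fixed, and the resulting vector of approximations is compared with the backprop gradient using formula (2) above:

    $$gradapprox_i = \frac{J(\theta_1,\ldots,\theta_i+\varepsilon,\ldots,\theta_n) - J(\theta_1,\ldots,\theta_i-\varepsilon,\ldots,\theta_n)}{2\varepsilon}$$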

    # N-dimensional gradient check
    def forward_propagation_n(X, Y, parameters):
        """
        Implements the forward propagation (and computes the cost) presented in Figure 3.

        Arguments:
        X -- training set for m examples
        Y -- labels for m examples
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                        W1 -- weight matrix of shape (5, 4)
                        b1 -- bias vector of shape (5, 1)
                        W2 -- weight matrix of shape (3, 5)
                        b2 -- bias vector of shape (3, 1)
                        W3 -- weight matrix of shape (1, 3)
                        b3 -- bias vector of shape (1, 1)

        Returns:
        cost -- the cost function (logistic cost for one example)
        """

        # retrieve parameters
        m = X.shape[1]
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        W3 = parameters["W3"]
        b3 = parameters["b3"]

        # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
        Z1 = np.dot(W1, X) + b1
        A1 = relu(Z1)
        Z2 = np.dot(W2, A1) + b2
        A2 = relu(Z2)
        Z3 = np.dot(W3, A2) + b3
        A3 = sigmoid(Z3)

        # Cost
        logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
        cost = 1. / m * np.sum(logprobs)

        cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

        return cost, cache


    def backward_propagation_n(X, Y, cache):
        """
        Implement the backward propagation presented in figure 2.

        Arguments:
        X -- input datapoint, of shape (input size, 1)
        Y -- true "label"
        cache -- cache output from forward_propagation_n()

        Returns:
        gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.
        """

        m = X.shape[1]
        (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y
        dW3 = 1. / m * np.dot(dZ3, A2.T)
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        dW2 = 1. / m * np.dot(dZ2, A1.T) * 2  # Should not multiply by 2
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        dW1 = 1. / m * np.dot(dZ1, X.T)
        db1 = 4. / m * np.sum(dZ1, axis=1, keepdims=True) # Should not multiply by 4

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                     "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                     "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

        return gradients


    # GRADED FUNCTION: gradient_check_n
    def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
        """
        Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

        Arguments:
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
        grad -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters.
        x -- input datapoint, of shape (input size, 1)
        y -- true "label"
        epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

        Returns:
        difference -- difference (2) between the approximated gradient and the backward propagation gradient
        """

        # Set-up variables
        parameters_values, _ = dictionary_to_vector(parameters)
        grad = gradients_to_vector(gradients)
        num_parameters = parameters_values.shape[0]
        J_plus = np.zeros((num_parameters, 1))
        J_minus = np.zeros((num_parameters, 1))
        gradapprox = np.zeros((num_parameters, 1))

        # Compute gradapprox
        for i in range(num_parameters):

            # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
            # "_" is used because forward_propagation_n outputs two values but we only care about the first one
            ### START CODE HERE ### (approx. 3 lines)
            theta1 = np.copy(parameters_values)                                       # Step 1
            theta1[i][0] += epsilon                                                   # Step 2
            J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta1))  # Step 3
            ### END CODE HERE ###

            # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
            ### START CODE HERE ### (approx. 3 lines)
            theta2 = np.copy(parameters_values)                                       # Step 1
            theta2[i][0] -= epsilon                                                   # Step 2
            J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta2)) # Step 3
            ### END CODE HERE ###

            # Compute gradapprox[i]
            ### START CODE HERE ### (approx. 1 line)
            gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
            ### END CODE HERE ###

        # Compare gradapprox to backward propagation gradients by computing difference.
        ### START CODE HERE ### (approx. 1 line)
        numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
        denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
        difference = numerator / denominator                               # Step 3'
        ### END CODE HERE ###

        if difference > 1e-7:
            print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
        else:
            print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

        return difference


    X, Y, parameters = gradient_check_n_test_case()

    cost, cache = forward_propagation_n(X, Y, parameters)
    gradients = backward_propagation_n(X, Y, cache)
    difference = gradient_check_n(parameters, gradients, X, Y)

    Output (the large difference is expected here: backward_propagation_n above contains two deliberate errors, flagged by its "Should not multiply by 2" and "Should not multiply by 4" comments):

    There is a mistake in the backward propagation! difference = 0.2850931566540251
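    A minimal sketch of the fix, replacing the two deliberately wrong lines flagged in backward_propagation_n above; with these corrections the reported difference drops to the order of 1e-7:

    dW2 = 1. / m * np.dot(dZ2, A1.T)                   # drop the extra "* 2"
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)  # use 1./m instead of 4./m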

  • Original article: https://www.cnblogs.com/cxq1126/p/13093231.html