  • Notes on Improving Gradient Descent Algorithms

     Assignment:

     Implement momentum gradient descent and the Adam algorithm yourself; the update rules are summarized below.
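
     For reference, these are the standard update rules being implemented, with learning rate \alpha, layer index l, and time step t (the code below places \epsilon inside the square root, which makes no practical difference):

        Momentum:
            v_{dW^{[l]}} = \beta\, v_{dW^{[l]}} + (1-\beta)\, dW^{[l]}, \qquad
            W^{[l]} := W^{[l]} - \alpha\, v_{dW^{[l]}}

        Adam (with bias-corrected moments):
            v_{dW^{[l]}} = \beta_1\, v_{dW^{[l]}} + (1-\beta_1)\, dW^{[l]}, \qquad
            \hat{v}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1-\beta_1^{t}}
            s_{dW^{[l]}} = \beta_2\, s_{dW^{[l]}} + (1-\beta_2)\, (dW^{[l]})^2, \qquad
            \hat{s}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1-\beta_2^{t}}
            W^{[l]} := W^{[l]} - \alpha\, \frac{\hat{v}_{dW^{[l]}}}{\sqrt{\hat{s}_{dW^{[l]}}} + \epsilon}

     The same updates are applied to every bias b^{[l]}.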

    Optimization algorithms:

    import numpy as np
    import matplotlib.pyplot as plt
    import math
    import sklearn
    import sklearn.datasets
    
    # forward propagation, cost computation and backward propagation helpers
    from utils import initialize_parameters, forward_propagation, compute_cost, backward_propagation
    from utils import load_dataset, predict
    
    
    def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
        """
        创建每批次固定数量特征值和目标值
        """
    
        np.random.seed(seed)
        # number of training examples
        m = X.shape[1]
        mini_batches = []
    
        # shuffle all examples with a single random permutation
        permutation = list(np.random.permutation(m))
    
        shuffled_X = X[:, permutation]
        shuffled_Y = Y[:, permutation].reshape((1, m))
    
        # number of complete mini-batches of size mini_batch_size
        num_complete_minibatches = math.floor(m / mini_batch_size)

        # pack each complete (X, Y) mini-batch into the list
        for k in range(0, num_complete_minibatches):
            mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
            mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
            mini_batch = (mini_batch_X, mini_batch_Y)
            mini_batches.append(mini_batch)
    
        # handle the final mini-batch (fewer than mini_batch_size examples)
        if m % mini_batch_size != 0:
            mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size:]
            mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size:]
            mini_batch = (mini_batch_X, mini_batch_Y)
            mini_batches.append(mini_batch)
    
        return mini_batches
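
    # Note (illustration): with the 300-example moons dataset loaded in __main__ and
    # mini_batch_size = 64, this returns 5 mini-batches: four complete batches of 64
    # columns each and a final batch of 44 columns.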
    
    
    def initialize_momentum(parameters):
        """
        Initialize the velocity (exponentially weighted average of the gradients) for every layer of the network.
        parameters['W' + str(l)] = Wl
        parameters['b' + str(l)] = bl
        return:
        v['dW' + str(l)] = velocity of dWl
        v['db' + str(l)] = velocity of dbl
        """
        # number of layers in the network
        L = len(parameters) // 2
        v = {}

        # initialize the velocity of each layer to zeros
        for l in range(L):
            v["dW" + str(l + 1)] = np.zeros(parameters['W' + str(l + 1)].shape)
            v["db" + str(l + 1)] = np.zeros(parameters['b' + str(l + 1)].shape)
    
        return v
    
    
    def update_parameters_with_momentum(parameters, gradients, v, beta, learning_rate):
        """
        Update the parameters with one step of momentum gradient descent.
        """
        # number of layers in the network
        L = len(parameters) // 2

        # momentum update for each layer
        for l in range(L):

            # 1. compute the exponentially weighted average of the gradients
            v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * gradients["dW" + str(l + 1)]
            v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * gradients["db" + str(l + 1)]
            # 2. update the parameters with the velocity
            parameters['W' + str(l + 1)] -= learning_rate * v["dW" + str(l + 1)]
            parameters['b' + str(l + 1)] -= learning_rate * v["db" + str(l + 1)]
    
        return parameters, v
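
    # Note: with beta = 0 the velocity equals the current gradient, so this reduces to
    # plain mini-batch gradient descent; with the usual beta = 0.9 each step averages
    # roughly the last 1 / (1 - beta) = 10 gradients, which damps oscillations.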
    
    
    def initialize_adam(parameters):
        """
        初始化Adam算法中的参数
        parameters['W' + str(l)] = Wl
        parameters['b' + str(l)] = bl
        return:
        v['dW' + str(l)] = velocity of v_dWl
        v['db' + str(l)] = velocity of v_dbl
        s['dw' + str(l)] = velocity of s_dwl
        s['db' + str(l)] = velocity of s_dbl
        """
        # number of layers in the network
        L = len(parameters) // 2
        v = {}
        s = {}

        # initialize v and s to zeros with the same shapes as the parameters
        for l in range(L):
    
            v["dW" + str(l + 1)] = np.zeros(parameters['W' + str(l + 1)].shape)
            v["db" + str(l + 1)] = np.zeros(parameters['b' + str(l + 1)].shape)
            s["dW" + str(l + 1)] = np.zeros(parameters['W' + str(l + 1)].shape)
            s["db" + str(l + 1)] = np.zeros(parameters['b' + str(l + 1)].shape)
    
        return v, s
    
    
    def update_parameters_with_adam(parameters, gradients, v, s, t, learning_rate=0.01,
                                    beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Update the network's parameters with one step of Adam.
        """
        # number of layers in the network
        L = len(parameters) // 2
        v_corrected = {}
        s_corrected = {}

        # update every layer's parameters
        for l in range(L):
            # moving average of the gradients. Input: "v, gradients, beta1". Output: "v".
            v["dW" + str(l + 1)] = beta1 * v['dW' + str(l + 1)] + (1 - beta1) * gradients['dW' + str(l + 1)]
            v["db" + str(l + 1)] = beta1 * v['db' + str(l + 1)] + (1 - beta1) * gradients['db' + str(l + 1)]

            # bias-corrected first moment. Input: "v, beta1, t". Output: "v_corrected".
            v_corrected["dW" + str(l + 1)] = v['dW' + str(l + 1)] / (1 - np.power(beta1, t))
            v_corrected["db" + str(l + 1)] = v['db' + str(l + 1)] / (1 - np.power(beta1, t))

            # moving average of the squared gradients. Input: "s, gradients, beta2". Output: "s".
            s["dW" + str(l + 1)] = beta2 * s['dW' + str(l + 1)] + (1 - beta2) * np.power(gradients['dW' + str(l + 1)], 2)
            s["db" + str(l + 1)] = beta2 * s['db' + str(l + 1)] + (1 - beta2) * np.power(gradients['db' + str(l + 1)], 2)

            # bias-corrected second moment. Input: "s, beta2, t". Output: "s_corrected".
            s_corrected["dW" + str(l + 1)] = s['dW' + str(l + 1)] / (1 - np.power(beta2, t))
            s_corrected["db" + str(l + 1)] = s['db' + str(l + 1)] / (1 - np.power(beta2, t))

            # update the parameters. Input: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
            parameters["W" + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * v_corrected[
                'dW' + str(l + 1)] / np.sqrt(s_corrected['dW' + str(l + 1)] + epsilon)
            parameters["b" + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * v_corrected[
                'db' + str(l + 1)] / np.sqrt(s_corrected['db' + str(l + 1)] + epsilon)
    
        return parameters, v, s
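
    # Quick hand check of one Adam step on a toy 1x1 weight (assumed values, defaults above):
    # W1 = [[1.0]], dW1 = [[0.5]], t = 1, learning_rate = 0.01 gives
    #   v_dW1 = 0.1 * 0.5 = 0.05       -> v_corrected = 0.05 / (1 - 0.9)       = 0.5
    #   s_dW1 = 0.001 * 0.25 = 0.00025 -> s_corrected = 0.00025 / (1 - 0.999)  = 0.25
    #   W1 <- 1.0 - 0.01 * 0.5 / sqrt(0.25 + 1e-8) ≈ 0.99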
    
    
    def model(X, Y, optimizer, learning_rate=0.0007, mini_batch_size=64, beta=0.9,
              beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
        """
        模型逻辑
        定义一个三层网络(不包括输入层)
        第一个隐层:5个神经元
        第二个隐层:2个神经元
        输出层:1个神经元
        """
        # 计算网络的层数
        layers_dims = [train_X.shape[0], 5, 2, 1]
    
        L = len(layers_dims)
        costs = []
        t = 0
        seed = 10
    
        # initialize the network parameters
        parameters = initialize_parameters(layers_dims)

        # initialize the optimizer state
        if optimizer == "momentum":
            v = initialize_momentum(parameters)
        elif optimizer == "adam":
            v, s = initialize_adam(parameters)
    
        # optimization loop
        for i in range(num_epochs):
    
            # reshuffle the examples differently on every epoch
            seed = seed + 1
            # split the shuffled data into mini-batches
            minibatches = random_mini_batches(X, Y, mini_batch_size, seed)

            for minibatch in minibatches:

                # 1. select one mini-batch
                mini_batch_X, mini_batch_Y = minibatch
    
                # 2. forward propagation
                a3, cache = forward_propagation(mini_batch_X, parameters)

                # 3. compute the cost
                cost = compute_cost(a3, mini_batch_Y)

                # 4. backward propagation: return the gradients
                gradients = backward_propagation(mini_batch_X, mini_batch_Y, cache)

                # 5. update the parameters with the gradients
                if optimizer == "momentum":
                    parameters, v = update_parameters_with_momentum(parameters, gradients, v, beta, learning_rate)
                elif optimizer == "adam":
                    # Adam needs the time step t for bias correction
                    t = t + 1
                    parameters, v, s = update_parameters_with_adam(parameters, gradients, v, s, t, learning_rate,
                                                                   beta1, beta2, epsilon)
    
            # print the cost every 1000 epochs
            if print_cost and i % 1000 == 0:
                print("Cost after epoch %i: %f" % (i, cost))
            if print_cost and i % 100 == 0:
                costs.append(cost)
    
        # plot the cost curve
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('epochs (per 100)')
        plt.title("Cost")
        plt.show()
    
        return parameters
    
    
    if __name__ == '__main__':
    
        train_X, train_Y = load_dataset()
    
        parameters = model(train_X, train_Y, optimizer="adam")
    
        predictions = predict(train_X, train_Y, parameters)
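
     To compare the two optimizers on the same data, the training call in __main__ can be repeated with optimizer="momentum". A minimal sketch, assuming the functions above are defined in the same session (parameters_momentum and predictions_momentum are just illustrative names):

        train_X, train_Y = load_dataset()
        # same network and data, momentum instead of Adam
        parameters_momentum = model(train_X, train_Y, optimizer="momentum")
        predictions_momentum = predict(train_X, train_Y, parameters_momentum)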

    Helper code (parameter initialization + forward propagation + cost computation + backward propagation (computing gradients) + prediction + dataset loading ....):

    import numpy as np
    import matplotlib.pyplot as plt
    import h5py
    import scipy.io
    import sklearn
    import sklearn.datasets
    
    
    def sigmoid(x):
        """
        Compute the sigmoid of x
    
        Arguments:
        x -- A scalar or numpy array of any size.
    
        Return:
        s -- sigmoid(x)
        """
        s = 1/(1+np.exp(-x))
        return s
    
    
    def relu(x):
        """
        Compute the relu of x
    
        Arguments:
        x -- A scalar or numpy array of any size.
    
        Return:
        s -- relu(x)
        """
        s = np.maximum(0,x)
        
        return s
    
    
    def initialize_parameters(layer_dims):
        """
        Arguments:
        layer_dims -- python array (list) containing the dimensions of each layer in our network
        
        Returns:
        parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                        W1 -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                        b1 -- bias vector of shape (layer_dims[l], 1)
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
                        
        Tips:
        - For example: the layer_dims for the "Planar Data classification model" would have been [2,2,1]. 
    This means W1's shape was (2,2), b1 was (2,1), W2 was (1,2) and b2 was (1,1). Now you have to generalize it!
        - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
        """
        
        np.random.seed(3)
        parameters = {}
        L = len(layer_dims) # number of layers in the network
    
        for l in range(1, L):
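            # He initialization: scale by sqrt(2 / fan_in), which suits the ReLU activations used in forward_propagation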
            parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
            parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

            assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
            assert parameters['b' + str(l)].shape == (layer_dims[l], 1)
            
        return parameters
    
    
    def compute_cost(a3, Y):
        
        """
        Implement the cost function
        
        Arguments:
        a3 -- post-activation, output of forward propagation
        Y -- "true" labels vector, same shape as a3
        
        Returns:
        cost - value of the cost function
        """
        m = Y.shape[1]
        
        logprobs = np.multiply(-np.log(a3),Y) + np.multiply(-np.log(1 - a3), 1 - Y)
        cost = 1./m * np.sum(logprobs)
        
        return cost
    
    
    def forward_propagation(X, parameters):
        """
        Implements the forward propagation for the LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID model.
        
        Arguments:
        X -- input dataset, of shape (input size, number of examples)
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                        W1 -- weight matrix of shape (5, input size)
                        b1 -- bias vector of shape (5, 1)
                        W2 -- weight matrix of shape (2, 5)
                        b2 -- bias vector of shape (2, 1)
                        W3 -- weight matrix of shape (1, 2)
                        b3 -- bias vector of shape (1, 1)
        
        Returns:
        a3 -- output of the final sigmoid activation, shape (1, number of examples)
        cache -- tuple of intermediate values used by backward_propagation
        """
        
        # retrieve parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        W3 = parameters["W3"]
        b3 = parameters["b3"]
        
        # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
        z1 = np.dot(W1, X) + b1
        a1 = relu(z1)
        z2 = np.dot(W2, a1) + b2
        a2 = relu(z2)
        z3 = np.dot(W3, a2) + b3
        a3 = sigmoid(z3)
        
        cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
        
        return a3, cache
    
    
    def backward_propagation(X, Y, cache):
        """
        Implement the backward propagation for the three-layer model above.
        
        Arguments:
        X -- input dataset, of shape (input size, number of examples)
        Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
        cache -- cache output from forward_propagation()
        
        Returns:
        gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
        """
        m = X.shape[1]
        (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache
        
        dz3 = 1./m * (a3 - Y)
        dW3 = np.dot(dz3, a2.T)
        db3 = np.sum(dz3, axis=1, keepdims = True)
        
        da2 = np.dot(W3.T, dz3)
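        # np.int64(a2 > 0) is the ReLU derivative mask: 1 where the activation is positive, 0 elsewhere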
        dz2 = np.multiply(da2, np.int64(a2 > 0))
        dW2 = np.dot(dz2, a1.T)
        db2 = np.sum(dz2, axis=1, keepdims = True)
        
        da1 = np.dot(W2.T, dz2)
        dz1 = np.multiply(da1, np.int64(a1 > 0))
        dW1 = np.dot(dz1, X.T)
        db1 = np.sum(dz1, axis=1, keepdims = True)
        
        gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                     "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                     "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
        
        return gradients
    
    
    def predict(X, y, parameters):
        """
        This function is used to predict the results of an n-layer neural network.
        
        Arguments:
        X -- data set of examples you would like to label
        y -- true labels, used only to report the accuracy
        parameters -- parameters of the trained model
        
        Returns:
        p -- predictions for the given dataset X
        """
        
        m = X.shape[1]
        p = np.zeros((1, m), dtype=int)
        
        # Forward propagation
        a3, caches = forward_propagation(X, parameters)
        
        # convert probas to 0/1 predictions
        for i in range(0, a3.shape[1]):
            if a3[0,i] > 0.5:
                p[0,i] = 1
            else:
                p[0,i] = 0
    
        print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))
        
        return p
    
    
    def load_dataset():
        np.random.seed(3)
        train_X, train_Y = sklearn.datasets.make_moons(n_samples=300, noise=.2)
        train_X = train_X.T
        train_Y = train_Y.reshape((1, train_Y.shape[0]))
        return train_X, train_Y
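
     Before training, the helpers can be sanity-checked against the expected shapes. A small sketch, assuming the same [input, 5, 2, 1] architecture as in model above (params is just an illustrative name):

        train_X, train_Y = load_dataset()
        print(train_X.shape, train_Y.shape)   # (2, 300) and (1, 300) for make_moons(n_samples=300)
        params = initialize_parameters([train_X.shape[0], 5, 2, 1])
        a3, _ = forward_propagation(train_X, params)
        print(a3.shape)                       # (1, 300): one sigmoid output per example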