  • TensorFlow Usage Notes (8): Gradient Clipping and Max-Norm Regularization

    Gradient Clipping

    Gradient clipping is mainly used to guard against exploding gradients during training. In general, if you already use Batch Normalization you rarely need gradient clipping as well, but it is still worth understanding how it is implemented.

    In TensorFlow, the optimizer’s minimize() function takes care of both computing the gradients and applying them, so you must instead call the optimizer’s compute_gradients() method first, then create an operation to clip the gradients using the clip_by_value() function, and finally create an operation to apply the clipped gradients using the optimizer’s apply_gradients() method:

    threshold = 1.0
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars]
    training_op = optimizer.apply_gradients(capped_gvs)

    Example:

    import tensorflow as tf
    
    def Swish(features):
        return features*tf.nn.sigmoid(features)
    
    # 1. create data
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)
    
    X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
    y = tf.placeholder(tf.int32, shape=(None), name='y')
    is_training = tf.placeholder(tf.bool, None, name='is_training')
    
    # 2. define network
    he_init = tf.contrib.layers.variance_scaling_initializer()
    with tf.name_scope('dnn'):
        hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init, name='hidden1')
        # hidden1 = tf.layers.batch_normalization(hidden1, momentum=0.9)
        hidden1 = tf.nn.relu(hidden1)
        hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init, name='hidden2')
        # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
        hidden2 = tf.nn.relu(hidden2)
        logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')
        # prob = tf.layers.dense(hidden2, 10, tf.nn.softmax, name='prob')
    
    # 3. define loss
    with tf.name_scope('loss'):
        # tf.losses.sparse_softmax_cross_entropy() label is not one_hot and dtype is int*
        # xentropy = tf.losses.sparse_softmax_cross_entropy(labels=tf.argmax(y, axis=1), logits=logits)
        # tf.nn.sparse_softmax_cross_entropy_with_logits() label is not one_hot and dtype is int*
        # xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y, axis=1), logits=logits)
        # loss = tf.reduce_mean(xentropy)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits) # label is one_hot
    
    # 4. define optimizer
    learning_rate = 0.01
    with tf.name_scope('train'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
        with tf.control_dependencies(update_ops):
            # optimizer_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
            threshold = 1.0
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(loss)
            capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                          for grad, var in grads_and_vars]
            optimizer_op = optimizer.apply_gradients(capped_gvs)
    
    
    
    with tf.name_scope('eval'):
        correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1) # whether the target is among the top-K predictions; label's dtype is int*
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    # 5. initialize
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    # =================
    print([v.name for v in tf.trainable_variables()])
    print([v.name for v in tf.global_variables()])
    # =================
    # 6. train & test
    n_epochs = 20
    n_batches = 50
    batch_size = 50
    
    with tf.Session() as sess:
        sess.run(init_op)
        for epoch in range(n_epochs):
            for iteration in range(mnist.train.num_examples // batch_size):
                X_batch, y_batch = mnist.train.next_batch(batch_size)
                sess.run(optimizer_op, feed_dict={X: X_batch, y: y_batch, is_training:True})
                # =================
                # for grad, var in grads_and_vars:
                #     grad = grad.eval(feed_dict={X: X_batch, y: y_batch, is_training:True})
                #     var = var.eval()
                # =================
            acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training:False}) # accuracy on the last batch
            acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Test loss:", loss_test)
        save_path = saver.save(sess, "./my_model_final.ckpt")
    
    with tf.Session() as sess:
        sess.run(init_op)
        saver.restore(sess, "./my_model_final.ckpt")
        acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        print("Test accuracy:", acc_test, ", Test loss:", loss_test)

    Now let's take a closer look at the pieces used in the example above.

    compute_gradients

    compute_gradients is a method that every optimizer provides:

    compute_gradients(
        loss,
        var_list=None,
        gate_gradients=GATE_OP,
        aggregation_method=None,
        colocate_gradients_with_ops=False,
        grad_loss=None
    )

    It computes the gradients of loss with respect to the variables in var_list (which defaults to the trainable variables).
    This is the first half of minimize(); it returns a list of (gradient, variable) pairs.

    Once we have the gradients we can clip them by hand. The line below limits each gradient element to the range [-threshold, threshold]:

    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars]
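
    Note that tf.clip_by_value clips each gradient element independently. If you would rather clip by the overall norm of all the gradients together, TensorFlow also provides tf.clip_by_global_norm; here is a minimal sketch of that alternative (variable names are my own, not from the original example):

    # Sketch: clip by the global norm of all gradients instead of element-wise.
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, clip_norm=threshold)
    training_op = optimizer.apply_gradients(list(zip(clipped_gradients, variables)))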

    apply_gradients

    apply_gradients is likewise a method that every optimizer provides:

    apply_gradients(
        grads_and_vars,
        global_step=None,
        name=None
    )

    This is the second half of minimize(): it returns an op that applies the gradient updates.
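
    If you use a decaying learning rate, remember to pass global_step to apply_gradients so that the step counter (and therefore the schedule) keeps advancing; a minimal sketch reusing capped_gvs from above:

    # Sketch: let apply_gradients increment global_step so lr schedules advance.
    global_step = tf.Variable(0, trainable=False, name='global_step')
    training_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)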

    Max-Norm Regularization

    For each unit, max-norm regularization constrains the weights $\mathbf{w}$ of the incoming connections so that $\lVert \mathbf{w} \rVert_2 \le r$, typically by rescaling $\mathbf{w}$ after each training step:

    \begin{equation}
    \label{a}
    \mathbf{w} \gets \mathbf{w} \frac{r}{\lVert \mathbf{w} \rVert_2}
    \end{equation}
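
    This projection is exactly what tf.clip_by_norm computes (it leaves the tensor unchanged when its norm is already at most r). A minimal toy sketch, with made-up values, showing per-row clipping with axes=1:

    import tensorflow as tf

    w = tf.constant([[3.0, 4.0],   # row norm 5.0 -> rescaled to norm 1.0
                     [0.3, 0.4]])  # row norm 0.5 -> left unchanged
    clipped = tf.clip_by_norm(w, clip_norm=1.0, axes=1)
    with tf.Session() as sess:
        print(sess.run(clipped))   # [[0.6, 0.8], [0.3, 0.4]]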

    Example code:

    import tensorflow as tf
    
    # =================
    def max_norm_regularizer(threshold=1.0, axes=1, name="max_norm",
                             collection="max_norm"):
        def max_norm(weights):
            clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
            clip_weights = tf.assign(weights, clipped, name=name)
            tf.add_to_collection(collection, clip_weights)
            return None # there is no regularization loss term
        return max_norm
    max_norm_reg = max_norm_regularizer(threshold=1.0)
    # =================
    
    # 1. create data
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)
    
    X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
    y = tf.placeholder(tf.int32, shape=(None), name='y')
    is_training = tf.placeholder(tf.bool, None, name='is_training')
    
    # 2. define network
    he_init = tf.contrib.layers.variance_scaling_initializer()
    with tf.name_scope('dnn'):
        hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init,
                                  kernel_regularizer=max_norm_reg, name='hidden1')
        # hidden1 = tf.layers.batch_normalization(hidden1, momentum=0.9)
        hidden1 = tf.nn.relu(hidden1)
        hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init,
                                  kernel_regularizer=max_norm_reg, name='hidden2')
        # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
        hidden2 = tf.nn.relu(hidden2)
        logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')
    
    # 3. define loss
    with tf.name_scope('loss'):
        loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits) # label is one_hot
    
    # 4. define optimizer
    learning_rate_init = 0.01
    global_step = tf.Variable(0, trainable=False)
    with tf.name_scope('train'):
        learning_rate = tf.train.polynomial_decay(  # polynomial decay
            learning_rate=learning_rate_init,  # initial learning rate
            global_step=global_step,  # current training step
            decay_steps=22000,  # number of steps over which the rate decays to end_learning_rate
            end_learning_rate=learning_rate_init / 10,  # final (minimum) learning rate
            power=0.9,
            cycle=False
        )
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
        with tf.control_dependencies(update_ops):
            optimizer_op = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9).minimize(
                loss=loss,
                var_list=tf.trainable_variables(),
                global_step=global_step  # without this, global_step never advances and the learning rate stays at its initial value
            )
            # ================= clip gradient
            # threshold = 1.0
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            # grads_and_vars = optimizer.compute_gradients(loss)
            # capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
            #               for grad, var in grads_and_vars]
            # optimizer_op = optimizer.apply_gradients(capped_gvs)
            # =================
    
    with tf.name_scope('eval'):
        correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1) # whether the target is among the top-K predictions; label's dtype is int*
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    
    # 5. initialize
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    
    # =================
    clip_all_weights = tf.get_collection("max_norm")
    # =================
    
    # 6. train & test
    n_epochs = 20
    batch_size = 50
    
    with tf.Session() as sess:
        sess.run(init_op)
        # saver.restore(sess, './my_model_final.ckpt')
        for epoch in range(n_epochs):
            for iteration in range(mnist.train.num_examples // batch_size):
                X_batch, y_batch = mnist.train.next_batch(batch_size)
                sess.run([optimizer_op, learning_rate], feed_dict={X: X_batch, y: y_batch, is_training:True})
                sess.run(clip_all_weights)
                # ================= check gradient
                # for grad, var in grads_and_vars:
                #     grad = grad.eval(feed_dict={X: X_batch, y: y_batch, is_training:True})
                #     var = var.eval()
                # =================
            learning_rate_cur = learning_rate.eval()
            acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training:False}) # accuracy on the last batch
            acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            print(epoch, "Current learning rate:", learning_rate_cur, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Test loss:", loss_test)
        save_path = saver.save(sess, "./my_model_final.ckpt")
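
    As an aside, in tf.keras the same max-norm effect can be obtained with a kernel constraint, which Keras applies automatically after every weight update, so no separate clipping op has to be run by hand. A minimal sketch (the layer sizes simply mirror the example above):

    from tensorflow import keras

    # Sketch: max-norm via kernel_constraint; Keras clips each kernel after every update.
    model = keras.models.Sequential([
        keras.layers.Dense(300, activation='relu', input_shape=(784,),
                           kernel_constraint=keras.constraints.max_norm(1.0)),
        keras.layers.Dense(100, activation='relu',
                           kernel_constraint=keras.constraints.max_norm(1.0)),
        keras.layers.Dense(10, activation='softmax'),
    ])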
  • Original post: https://www.cnblogs.com/xuanyuyt/p/11652556.html