  • TensorFlow Usage Notes (8): Gradient Clipping and Max-Norm Regularization

    Gradient Clipping

    Gradient clipping is mainly used to avoid exploding gradients during training. Generally speaking, once Batch Normalization is in place you rarely need gradient clipping as well, but it is still worth understanding how it is implemented.

    In TensorFlow, the optimizer’s minimize() function takes care of both computing the gradients and applying them, so you must instead call the optimizer’s compute_gradients() method first, then create an operation to clip the gradients using the clip_by_value() function, and finally create an operation to apply the clipped gradients using the optimizer’s apply_gradients() method:

    threshold = 1.0
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars]
    training_op = optimizer.apply_gradients(capped_gvs)
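
    One caveat not covered in the snippet above: compute_gradients() returns a (None, var) pair for any variable the loss does not depend on, and tf.clip_by_value() cannot handle a None gradient. A minimal defensive variant simply skips those entries:

    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars
                  if grad is not None]  # skip variables with no gradient
    training_op = optimizer.apply_gradients(capped_gvs)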

    Example:

    import tensorflow as tf
    
    def Swish(features):
        return features*tf.nn.sigmoid(features)
    
    # 1. create data
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)
    
    X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
    y = tf.placeholder(tf.int32, shape=(None), name='y')
    is_training = tf.placeholder(tf.bool, None, name='is_training')
    
    # 2. define network
    he_init = tf.contrib.layers.variance_scaling_initializer()
    with tf.name_scope('dnn'):
        hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init, name='hidden1')
        # hidden1 = tf.layers.batch_normalization(hidden1, momentum=0.9)
        hidden1 = tf.nn.relu(hidden1)
        hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init, name='hidden2')
        # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
        hidden2 = tf.nn.relu(hidden2)
        logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')
        # prob = tf.layers.dense(hidden2, 10, tf.nn.softmax, name='prob')
    
    # 3. define loss
    with tf.name_scope('loss'):
        # tf.losses.sparse_softmax_cross_entropy() label is not one_hot and dtype is int*
        # xentropy = tf.losses.sparse_softmax_cross_entropy(labels=tf.argmax(y, axis=1), logits=logits)
        # tf.nn.sparse_softmax_cross_entropy_with_logits() label is not one_hot and dtype is int*
        # xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y, axis=1), logits=logits)
        # loss = tf.reduce_mean(xentropy)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits) # label is one_hot
    
    # 4. define optimizer
    learning_rate = 0.01
    with tf.name_scope('train'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
        with tf.control_dependencies(update_ops):
            # optimizer_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
            threshold = 1.0
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(loss)
            capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                          for grad, var in grads_and_vars]
            optimizer_op = optimizer.apply_gradients(capped_gvs)
    
    
    
    with tf.name_scope('eval'):
        correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1) # is the target among the top-k predictions; label's dtype is int*
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    # 5. initialize
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    # =================
    print([v.name for v in tf.trainable_variables()])
    print([v.name for v in tf.global_variables()])
    # =================
    # 6. train & test
    n_epochs = 20
    n_batches = 50
    batch_size = 50
    
    with tf.Session() as sess:
        sess.run(init_op)
        for epoch in range(n_epochs):
            for iteration in range(mnist.train.num_examples // batch_size):
                X_batch, y_batch = mnist.train.next_batch(batch_size)
                sess.run(optimizer_op, feed_dict={X: X_batch, y: y_batch, is_training:True})
                # =================
                # for grad, var in grads_and_vars:
                #     grad = grad.eval(feed_dict={X: X_batch, y: y_batch, is_training:True})
                #     var = var.eval()
                # =================
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training:False}) # accuracy on the last batch of the epoch
            acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Test loss:", loss_test)
        save_path = saver.save(sess, "./my_model_final.ckpt")
    
    with tf.Session() as sess:
        sess.run(init_op)
        saver.restore(sess, "./my_model_final.ckpt")
        acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        print("Test accuracy:", acc_test, ", Test loss:", loss_test)

    Let's now walk through the pieces used in the example above.

    compute_gradients

    compute_gradients is a method available on every optimizer:

    compute_gradients(
        loss,
        var_list=None,
        gate_gradients=GATE_OP,
        aggregation_method=None,
        colocate_gradients_with_ops=False,
        grad_loss=None
    )

    Computes the gradients of loss with respect to the variables in var_list (the trainable variables by default).
    It is the first half of minimize() and returns a list of (gradient, variable) pairs.
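
    To see what the returned list looks like, you can iterate over it before building the training op (a minimal sketch; optimizer and loss are the ones defined in the example above):

    grads_and_vars = optimizer.compute_gradients(loss)
    for grad, var in grads_and_vars:
        # each entry pairs a gradient tensor (or None) with the variable it updates
        print(var.name, grad)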

    Once we have the gradients we can clip them by hand; the statement below limits each gradient to the range [-threshold, threshold]:

    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars]
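
    An alternative worth knowing (not used in this post's example) is tf.clip_by_global_norm(), which rescales the whole set of gradients jointly whenever their combined L2 norm exceeds the threshold, so the direction of the overall update is preserved:

    threshold = 1.0
    grads, variables = zip(*optimizer.compute_gradients(loss))
    # returns the clipped list plus the pre-clipping global norm
    clipped_grads, global_norm = tf.clip_by_global_norm(grads, clip_norm=threshold)
    training_op = optimizer.apply_gradients(list(zip(clipped_grads, variables)))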

    apply_gradients

    apply_gradients is likewise a method available on every optimizer:

    apply_gradients(
        grads_and_vars,
        global_step=None,
        name=None
    )

    It is the second half of minimize() and returns an op that applies the gradient updates.
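
    If you want a step counter (for example, one driving a learning-rate schedule) to advance on every update, pass global_step to apply_gradients() just as you would to minimize(). A minimal sketch:

    global_step = tf.Variable(0, trainable=False, name='global_step')
    training_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)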

    Max-Norm Regularization

    For each unit, max-norm regularization constrains the incoming weight vector $\mathbf{w}$ so that $\lVert \mathbf{w} \rVert_2 \le r$. After each training step, if the norm exceeds $r$, the weights are rescaled:

    \begin{equation}
    \label{a}
    \mathbf{w} \gets \mathbf{w} \frac{r}{\lVert \mathbf{w} \rVert_2}
    \end{equation}
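
    In TensorFlow this rescaling maps directly onto tf.clip_by_norm(), which leaves a tensor unchanged when its norm along the given axes is at most clip_norm and rescales it to exactly clip_norm otherwise. A tiny standalone sketch (the values are illustrative):

    import tensorflow as tf

    w = tf.constant([[3.0, 4.0]])  # row norm is 5.0
    clipped = tf.clip_by_norm(w, clip_norm=1.0, axes=1)
    with tf.Session() as sess:
        print(sess.run(clipped))   # [[0.6 0.8]], rescaled to unit norm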

    Example code:

    import tensorflow as tf
    
    # =================
    def max_norm_regularizer(threshold=1.0, axes=1, name="max_norm",
                             collection="max_norm"):
        def max_norm(weights):
            clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
            clip_weights = tf.assign(weights, clipped, name=name)
            tf.add_to_collection(collection, clip_weights)
            return None # there is no regularization loss term
        return max_norm
    max_norm_reg = max_norm_regularizer(threshold=1.0)
    # =================
    
    # 1. create data
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)
    
    X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
    y = tf.placeholder(tf.int32, shape=(None), name='y')
    is_training = tf.placeholder(tf.bool, None, name='is_training')
    
    # 2. define network
    he_init = tf.contrib.layers.variance_scaling_initializer()
    with tf.name_scope('dnn'):
        hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init,
                                  kernel_regularizer=max_norm_reg, name='hidden1')
        # hidden1 = tf.layers.batch_normalization(hidden1, momentum=0.9)
        hidden1 = tf.nn.relu(hidden1)
        hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init,
                                  kernel_regularizer=max_norm_reg, name='hidden2')
        # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
        hidden2 = tf.nn.relu(hidden2)
        logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')
    
    # 3. define loss
    with tf.name_scope('loss'):
        loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits) # label is one_hot
    
    # 4. define optimizer
    learning_rate_init = 0.01
    global_step = tf.Variable(0, trainable=False)
    with tf.name_scope('train'):
        learning_rate = tf.train.polynomial_decay(  # polynomial decay
            learning_rate=learning_rate_init,  # initial learning rate
            global_step=global_step,  # current step count
            decay_steps=22000,  # number of steps over which the rate decays to end_learning_rate
            end_learning_rate=learning_rate_init / 10,  # final (minimum) learning rate
            power=0.9,
            cycle=False
        )
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
        with tf.control_dependencies(update_ops):
            optimizer_op = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9).minimize(
                loss=loss,
                var_list=tf.trainable_variables(),
                global_step=global_step # without this, global_step never advances and the learning rate never decays
            )
            # ================= clip gradient
            # threshold = 1.0
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            # grads_and_vars = optimizer.compute_gradients(loss)
            # capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
            #               for grad, var in grads_and_vars]
            # optimizer_op = optimizer.apply_gradients(capped_gvs)
            # =================
    
    with tf.name_scope('eval'):
        correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1) # is the target among the top-k predictions; label's dtype is int*
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    
    # 5. initialize
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    saver = tf.train.Saver()
    
    # =================
    clip_all_weights = tf.get_collection("max_norm")
    # =================
    
    # 6. train & test
    n_epochs = 20
    batch_size = 50
    
    with tf.Session() as sess:
        sess.run(init_op)
        # saver.restore(sess, './my_model_final.ckpt')
        for epoch in range(n_epochs):
            for iteration in range(mnist.train.num_examples // batch_size):
                X_batch, y_batch = mnist.train.next_batch(batch_size)
                sess.run([optimizer_op, learning_rate], feed_dict={X: X_batch, y: y_batch, is_training:True})
                sess.run(clip_all_weights)
                # ================= check gradient
                # for grad, var in grads_and_vars:
                #     grad = grad.eval(feed_dict={X: X_batch, y: y_batch, is_training:True})
                #     var = var.eval()
                # =================
            learning_rate_cur = learning_rate.eval()
            acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training:False}) # accuracy on the last batch of the epoch
            acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
            print(epoch, "Current learning rate:", learning_rate_cur, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Test loss:", loss_test)
        save_path = saver.save(sess, "./my_model_final.ckpt")
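    For reference, with cycle=False the schedule produced by tf.train.polynomial_decay above follows its documented formula; restated as plain Python:

    def polynomial_decay(lr_init, step, decay_steps, end_lr, power):
        # the step count is capped at decay_steps, after which the rate stays at end_lr
        step = min(step, decay_steps)
        return (lr_init - end_lr) * (1 - step / decay_steps) ** power + end_lr

    print(polynomial_decay(0.01, 0, 22000, 0.001, 0.9))      # 0.01 at step 0
    print(polynomial_decay(0.01, 22000, 22000, 0.001, 0.9))  # 0.001 at the end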
  • Original post: https://www.cnblogs.com/xuanyuyt/p/11652556.html