  • 2.5 Batch Normalization

    Batch normalization (BN) tackles a problem of deep networks with many layers: the signal can no longer propagate forward effectively, because every layer's outputs have a different mean and variance, so the distribution of the data keeps shifting from layer to layer.

    For example, if a pre-activation X*W is already so large that tanh squashes it to 1, passing it through the next layer's neurons barely changes anything, because the value sits deep in the saturated region of the activation (the saturation problem: the activation no longer responds to changes in that value).
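
    A quick NumPy check makes the saturation visible (a standalone sketch with made-up values, not part of the script below):

    import numpy as np

    x = np.array([0.5, 2.0, 10.0, 100.0])   # pre-activations of growing magnitude
    print(np.tanh(x))                        # ~ [0.46, 0.96, 1.0, 1.0]
    # once a pre-activation is large, tanh is pinned at 1 and its gradient
    # (1 - tanh(x)**2) is essentially 0, so that unit stops learning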

    To deal with this, we can normalize each batch of pre-activations before they reach the activation function, i.e. apply batch normalization.
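
    The transform itself is simple; here is a minimal NumPy sketch of what tf.nn.batch_normalization computes later in the code, where gamma and beta stand for the learnable scale and shift:

    import numpy as np

    def batch_norm(wx_plus_b, gamma=1.0, beta=0.0, eps=0.001):
        # normalize over the batch dimension, then rescale and shift
        mean = wx_plus_b.mean(axis=0)
        var = wx_plus_b.var(axis=0)
        return gamma * (wx_plus_b - mean) / np.sqrt(var + eps) + beta

    batch = np.random.normal(5.0, 3.0, size=(64, 30))         # badly scaled pre-activations
    print(batch.mean(), batch.std())                          # roughly 5 and 3
    print(batch_norm(batch).mean(), batch_norm(batch).std())  # roughly 0 and 1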

    [Figure: histograms of each layer's inputs, without BN and with BN]

    From the histograms you can see that without BN the values in every layer quickly collapse to 0; in other words, nearly all of the neurons die off very quickly. With BN before the ReLU, every layer keeps a reasonably well-spread distribution of values and most of the neurons stay alive.
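
    The "dying" is the usual dead-ReLU effect: once a unit's pre-activation is negative for every sample, its output and its gradient are both zero, so gradient descent can never bring it back. A made-up illustration (not taken from the script):

    import numpy as np

    relu = lambda z: np.maximum(z, 0.0)

    z = np.random.normal(-20.0, 1.0, size=(64,))   # pre-activations pushed far negative
    print(relu(z).sum())                           # 0.0, the unit outputs nothing
    # relu's gradient is also 0 over this whole batch, so the unit stays dead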

    Comparing the training cost under ReLU, the no-BN curve barely shows up at all: it vanishes almost immediately.

    If the activation is switched to tanh, the cost curves change accordingly. The full script:

    """
    Build two networks.
    1. Without batch normalization
    2. With batch normalization
    
    Run tests on these two networks.
    """
    
    #  Batch Normalization
    
    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt
    
    
    ACTIVATION = tf.nn.relu  # switch to tf.nn.tanh to compare activations
    N_LAYERS = 7
    N_HIDDEN_UNITS = 30
    
    
    def fix_seed(seed=1):
        # reproducible
        np.random.seed(seed)
        tf.set_random_seed(seed)
    
    
    def plot_his(inputs, inputs_norm):
        # plot histogram for the inputs of every layer
        for j, all_inputs in enumerate([inputs, inputs_norm]):
            for i, input in enumerate(all_inputs):
                plt.subplot(2, len(all_inputs), j*len(all_inputs)+(i+1))
                plt.cla()
                if i == 0:
                    the_range = (-7, 10)
                else:
                    the_range = (-1, 1)
                plt.hist(input.ravel(), bins=15, range=the_range, color='#FF5733')
                plt.yticks(())
                if j == 1:
                    plt.xticks(the_range)
                else:
                    plt.xticks(())
                ax = plt.gca()
                ax.spines['right'].set_color('none')
                ax.spines['top'].set_color('none')
            plt.title("%s normalizing" % ("Without" if j == 0 else "With"))
        plt.draw()
        plt.pause(0.01)
    
    
    def built_net(xs, ys, norm):
        def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
            # weights and biases (bad initialization for this case)
            Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
            biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    
            # fully connected product
            Wx_plus_b = tf.matmul(inputs, Weights) + biases
    
            # normalize fully connected product
            if norm:
                # Batch Normalize
                fc_mean, fc_var = tf.nn.moments(
                    Wx_plus_b,
                    axes=[0],   # the dimensions to normalize over; [0] is the batch dimension
                                # for images use [0, 1, 2] (batch, height, width), but not the channel axis
                )
                scale = tf.Variable(tf.ones([out_size]))
                shift = tf.Variable(tf.zeros([out_size]))
                epsilon = 0.001
    
                # apply moving average for mean and var when train on batch
                ema = tf.train.ExponentialMovingAverage(decay=0.5)
                def mean_var_with_update():
                    ema_apply_op = ema.apply([fc_mean, fc_var])
                    with tf.control_dependencies([ema_apply_op]):
                        return tf.identity(fc_mean), tf.identity(fc_var)
                mean, var = mean_var_with_update()
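                # note: training always uses the current batch statistics; the
                # ExponentialMovingAverage only tracks them in the background.
                # at inference time one would typically switch to the tracked
                # averages instead, roughly:
                #   mean, var = ema.average(fc_mean), ema.average(fc_var)
                # (a sketch only; this script never runs a separate test phase)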
                # normalize, then rescale with the learnable scale and shift (gamma, beta)
                Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
                # similar with this two steps:
                # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
                # Wx_plus_b = Wx_plus_b * scale + shift
    
            # activation
            if activation_function is None:
                outputs = Wx_plus_b
            else:
                outputs = activation_function(Wx_plus_b)
    
            return outputs
    
        
        
        fix_seed(1)
    
        if norm:
            # BN for the first input
            fc_mean, fc_var = tf.nn.moments(
                xs,
                axes=[0],
            )
            scale = tf.Variable(tf.ones([1]))
            shift = tf.Variable(tf.zeros([1]))
            epsilon = 0.001
            # apply moving average for mean and var when train on batch
            ema = tf.train.ExponentialMovingAverage(decay=0.5)
            def mean_var_with_update():
                ema_apply_op = ema.apply([fc_mean, fc_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(fc_mean), tf.identity(fc_var)
            mean, var = mean_var_with_update()
            xs = tf.nn.batch_normalization(xs, mean, var, shift, scale, epsilon)
    
        # record inputs for every layer
        layers_inputs = [xs]
    
        # build hidden layers
        for l_n in range(N_LAYERS):
            layer_input = layers_inputs[l_n]
            in_size = layers_inputs[l_n].get_shape()[1].value
    
            output = add_layer(
                layer_input,    # input
                in_size,        # input size
                N_HIDDEN_UNITS, # output size
                ACTIVATION,     # activation function
                norm,           # normalize before activation
            )
            layers_inputs.append(output)    # add output for next run
    
        # build output layer
        prediction = add_layer(layers_inputs[-1], N_HIDDEN_UNITS, 1, activation_function=None)
    
        cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
        train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
        return [train_op, cost, layers_inputs]
    
    
    
    # make up data
    fix_seed(1)
    x_data = np.linspace(-7, 10, 2500)[:, np.newaxis]
    np.random.shuffle(x_data)
    noise = np.random.normal(0, 8, x_data.shape)
    y_data = np.square(x_data) - 5 + noise
    
    # plot input data
    plt.scatter(x_data, y_data)
    plt.show()
    
    xs = tf.placeholder(tf.float32, [None, 1])  # [num_samples, num_features]
    ys = tf.placeholder(tf.float32, [None, 1])
    # build the two networks: one without BN, one with BN
    train_op, cost, layers_inputs = built_net(xs, ys, norm=False)   # without BN
    train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True) # with BN
    
    
    
    
    sess = tf.Session()
    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    sess.run(init)
    
    # record cost
    cost_his = []
    cost_his_norm = []
    record_step = 5
    
    plt.ion()
    plt.figure(figsize=(7, 3))
    for i in range(250):
        if i % 50 == 0:
            # plot histogram
            all_inputs, all_inputs_norm = sess.run([layers_inputs, layers_inputs_norm], feed_dict={xs: x_data, ys: y_data})
            plot_his(all_inputs, all_inputs_norm)
    
        # train on batch
        sess.run([train_op, train_op_norm], feed_dict={xs: x_data[i*10:i*10+10], ys: y_data[i*10:i*10+10]})
    
        if i % record_step == 0:
            # record cost
            cost_his.append(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
            cost_his_norm.append(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))
    
    plt.ioff()
    plt.figure()
    plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his), label='no BN')     # no norm
    plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his_norm), label='BN')   # norm
    plt.legend()
    plt.show()

    Illustration of the dataset (the scatter plot of x_data and y_data produced by the script).
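
    For reference, the per-layer normalization hand-coded in add_layer is essentially what TensorFlow 1.x provides as tf.layers.batch_normalization. A rough sketch of how the manual moments/EMA block could be replaced (the is_training placeholder is an assumption, not part of the script above):

    # sketch only: inside add_layer, swap the manual moments/EMA block for the built-in layer
    is_training = tf.placeholder(tf.bool)   # hypothetical train/test switch
    Wx_plus_b = tf.layers.batch_normalization(Wx_plus_b, training=is_training)
    # the layer keeps its own moving averages; their update ops must be run,
    # so the training op should depend on them:
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)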

  • Original post: https://www.cnblogs.com/jackchen-Net/p/8126739.html