  • 89. TensorFlow: training a model in parallel on multiple GPUs

    '''
    Created on May 25, 2017
    
    @author: p0079482
    '''
    # Distributed deep learning training patterns:
    # train a deep learning model in parallel on multiple GPUs of a single machine.
    from datetime import datetime
    import os
    import time
    
    import tensorflow as tf
    import mnist_inference
    
    # Configuration for training the network.
    BATCH_SIZE = 100
    LEARNING_RATE_BASE = 0.001
    LEARNING_RATE_DECAY = 0.99
    REGULARIZATION_RATE = 0.0001
    TRAINING_STEPS = 1000
    MOVING_AVERAGE_DECAY = 0.99
    N_GPU = 4
    
    # Paths for the logs and the model checkpoints.
    MODEL_SAVE_PATH = "/path/to/logs_and_models/"
    MODEL_NAME = "model.ckpt"
    
    # Path of the training data. Because each GPU needs to be fed its own training data, feeding
    # through placeholders would require preparing several copies of the data by hand. To simplify
    # this, the input-queue approach introduced in Chapter 7 is used to read the data from a
    # TFRecords file, so DATA_PATH points to the MNIST training data after conversion to TFRecord
    # format. The conversion itself is covered in detail in Chapter 7 (a minimal conversion sketch
    # is also given after the listing below).
    DATA_PATH = "/path/to/output.tfrecords"
    
    
    # Build the input queue that provides the training data; see Chapter 7 for details.
    def get_input():
        filename_queue = tf.train.string_input_producer([DATA_PATH])
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        # Parsing schema for the serialized examples.
        features = tf.parse_single_example(serialized_example,
                                           features={
                                               'image_raw': tf.FixedLenFeature([], tf.string),
                                               'pixels': tf.FixedLenFeature([], tf.int64),
                                               'label': tf.FixedLenFeature([], tf.int64),
                                           })
        # Decode the image and the label.
        decoded_image = tf.decode_raw(features['image_raw'], tf.uint8)
        reshaped_image = tf.reshape(decoded_image, [784])
        retyped_image = tf.cast(reshaped_image, tf.float32)
        label = tf.cast(features['label'], tf.int32)
        
        # Build and return the shuffled input batch queue.
        min_after_dequeue = 10000
        capacity = min_after_dequeue + 3 * BATCH_SIZE
        return tf.train.shuffle_batch([retyped_image, label],
                                      batch_size=BATCH_SIZE,
                                      capacity=capacity,
                                      min_after_dequeue=min_after_dequeue)
        
    # Loss function. Given a batch of training data, a regularizer and a name scope, compute the
    # total loss inside that scope. The scope is needed because every GPU adds its regularization
    # losses to the collection named 'losses'; without filtering by the scope, the regularization
    # losses computed on all the other GPUs would be included as well.
    def get_loss(x, y_, regularizer, scope, reuse_variables=None):
        # Reuse the function defined in Section 5.5 to compute the forward pass of the network.
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
            y = mnist_inference.inference(x, regularizer)
        # Cross-entropy loss.
        cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
        # Regularization losses computed on the current GPU only.
        regularization_loss = tf.add_n(tf.get_collection('losses', scope))
        # Total loss on this GPU.
        loss = cross_entropy + regularization_loss
        return loss
    
    def average_gradients(tower_grads):
        average_grads = []
        # Enumerate every variable together with the gradients computed for it on the different GPUs.
        for grad_and_vars in zip(*tower_grads):
            # Average this variable's gradient over all GPUs.
            grads = []
            for g, _ in grad_and_vars:
                expanded_g = tf.expand_dims(g, 0)
                grads.append(expanded_g)
            grad = tf.concat(grads, 0)
            grad = tf.reduce_mean(grad, 0)
            
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            # Pair the variable with its averaged gradient.
            average_grads.append(grad_and_var)
        # Return the averaged gradients of all variables; they are used to update the variables.
        return average_grads
    
    # Main training procedure.
    def main(argv=None):
        # Place the simple operations on the CPU; only the network training runs on the GPUs.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            # Get one training batch.
            x, y_ = get_input()
            regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
            
            # Training-step counter and exponentially decaying learning rate.
            global_step = tf.get_variable('global_step',
                                          [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE,
                                                       global_step,
                                                       60000 / BATCH_SIZE,
                                                       LEARNING_RATE_DECAY)
            # Optimizer.
            opt = tf.train.GradientDescentOptimizer(learning_rate)
            
            tower_grads = []
            reuse_variables = False
            # Run the optimization of the network on the different GPUs.
            for i in range(N_GPU):
                # Pin this tower's computation to a single GPU.
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('GPU_%d' % i) as scope:
                        cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                        # After the variables have been created the first time, set the reuse
                        # flag to True so that all GPUs update the same set of parameters. Note
                        # that tf.name_scope does not affect the namespace used by
                        # tf.get_variable.
                        reuse_variables = True
                        # Compute the gradients of all variables on the current GPU.
                        grads = opt.compute_gradients(cur_loss)
                        tower_grads.append(grads)
            
            # Average the gradients across GPUs and write them to the TensorBoard log.
            grads = average_gradients(tower_grads)
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)
            
            # Update the parameters with the averaged gradients.
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)
            for var in tf.trainable_variables():
                tf.summary.histogram(var.op.name, var)
            
            # Maintain moving averages of the variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                                                                  MOVING_AVERAGE_DECAY,
                                                                  global_step)
            variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
            variable_averages_op = variable_averages.apply(variables_to_average)
            
            # Each training step updates both the variables and their moving averages.
            train_op = tf.group(apply_gradient_op, variable_averages_op)
            saver = tf.train.Saver(tf.global_variables())
            summary_op = tf.summary.merge_all()
            init = tf.global_variables_initializer()
            
            # Training loop.
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                  log_device_placement=True)) as sess:
                # Initialize all variables and start the input queue threads.
                init.run()
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                summary_writer = tf.summary.FileWriter(MODEL_SAVE_PATH, sess.graph)
                for step in range(TRAINING_STEPS):
                    # Run one training step and time it.
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, cur_loss])
                    duration = time.time() - start_time
                    
                    # Report the training progress and speed every few steps.
                    if step != 0 and step % 10 == 0 and duration != 0:
                        # Number of examples consumed in this step: every GPU processes one
                        # batch per step, so the total is batch size * number of GPUs.
                        num_examples_per_step = BATCH_SIZE * N_GPU
                        
                        # duration is the wall time of this step, so the number of training
                        # examples processed per second is num_examples_per_step / duration.
                        examples_per_sec = num_examples_per_step / duration
                        
                        # Since every GPU consumes one batch during this step, the time spent
                        # on a single batch is duration / number of GPUs.
                        sec_per_batch = duration / N_GPU
                        
                        # Print training statistics.
                        format_str = ('step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                        print(format_str % (step, loss_value, examples_per_sec, sec_per_batch))
                        
                        # Write a summary for TensorBoard.
                        summary = sess.run(summary_op)
                        summary_writer.add_summary(summary, step)
                        
                    # Save the model periodically. (This block belongs at the loop level;
                    # otherwise the final checkpoint would never be written.)
                    if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                        checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                        saver.save(sess, checkpoint_path, global_step=step)
                coord.request_stop()
                coord.join(threads)
    
    if __name__ == '__main__':
        tf.app.run()
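
    The script imports mnist_inference, the forward-propagation module from Chapter 5. For readers who do not have that file at hand, a minimal sketch of such a module is shown below; the layer sizes and the helper function are assumptions in the spirit of Chapter 5, not the book's exact code. The detail that matters here is that the per-layer L2 regularization terms are added to the 'losses' collection, which get_loss() later sums per GPU.

    # mnist_inference.py -- a minimal sketch (assumed structure, not the book's exact code)
    import tensorflow as tf

    INPUT_NODE = 784
    OUTPUT_NODE = 10
    LAYER1_NODE = 500   # assumed hidden-layer size

    def get_weight_variable(shape, regularizer):
        weights = tf.get_variable(
            "weights", shape,
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            # Collected here, summed later by get_loss() via tf.get_collection('losses', scope).
            tf.add_to_collection('losses', regularizer(weights))
        return weights

    def inference(input_tensor, regularizer):
        # A 784 -> 500 -> 10 fully connected network.
        with tf.variable_scope('layer1'):
            weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
            biases = tf.get_variable(
                "biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
            layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
        with tf.variable_scope('layer2'):
            weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
            biases = tf.get_variable(
                "biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
            return tf.matmul(layer1, weights) + biases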
                        
            

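    get_input() assumes that DATA_PATH already contains the MNIST training set serialized as TFRecords with the three features image_raw, pixels and label, as described in Chapter 7. The following is a minimal conversion sketch; the tensorflow.examples.tutorials.mnist helper and the data paths are placeholder assumptions.

    # convert_mnist_to_tfrecords.py -- a minimal sketch (assumed paths and helpers)
    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    # Read the raw (uint8) images and integer labels.
    mnist = input_data.read_data_sets(
        "/path/to/mnist_data", dtype=tf.uint8, one_hot=False)
    images = mnist.train.images          # shape [num_examples, 784]
    labels = mnist.train.labels          # shape [num_examples]
    pixels = int(images.shape[1])

    with tf.python_io.TFRecordWriter("/path/to/output.tfrecords") as writer:
        for index in range(images.shape[0]):
            image_raw = images[index].tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'pixels': _int64_feature(pixels),
                'label': _int64_feature(int(labels[index])),
                'image_raw': _bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
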
    Below is the output from a training run:

    step 20, loss = 29.53 ( 10362.6 examples/ sec; 0.010 sec/batch)
    step 30, loss = 9.62 ( 12022.4 examples/ sec; 0.008 sec/batch)
    step 40, loss = 16.63 ( 10689.3 examples/ sec; 0.009 sec/batch)
    step 50, loss = 10.68 ( 11293.4 examples/ sec; 0.009 sec/batch)
    step 60, loss = 14.73 ( 10895.0 examples/ sec; 0.009 sec/batch)
    step 70, loss = 17.17 ( 11192.9 examples/ sec; 0.009 sec/batch)
    step 80, loss = 12.43 ( 11236.8 examples/ sec; 0.009 sec/batch)
    step 90, loss = 5.16 ( 11398.3 examples/ sec; 0.009 sec/batch)
    step 100, loss = 8.06 ( 12466.7 examples/ sec; 0.008 sec/batch)
    step 110, loss = 13.57 ( 11081.5 examples/ sec; 0.009 sec/batch)
    step 120, loss = 9.43 ( 11396.2 examples/ sec; 0.009 sec/batch)
    step 130, loss = 12.21 ( 13296.7 examples/ sec; 0.008 sec/batch)
    step 140, loss = 6.15 ( 11868.9 examples/ sec; 0.008 sec/batch)
    step 150, loss = 9.93 ( 12089.1 examples/ sec; 0.008 sec/batch)
    step 160, loss = 10.42 ( 11733.5 examples/ sec; 0.009 sec/batch)
    step 170, loss = 23.47 ( 11859.4 examples/ sec; 0.008 sec/batch)
    step 180, loss = 2.97 ( 11358.0 examples/ sec; 0.009 sec/batch)
    step 190, loss = 5.44 ( 11085.0 examples/ sec; 0.009 sec/batch)
    step 200, loss = 3.98 ( 13347.3 examples/ sec; 0.007 sec/batch)
    step 210, loss = 11.98 ( 10551.4 examples/ sec; 0.009 sec/batch)
    step 220, loss = 9.17 ( 11115.3 examples/ sec; 0.009 sec/batch)
    step 230, loss = 15.31 ( 12450.5 examples/ sec; 0.008 sec/batch)
    step 240, loss = 5.92 ( 11729.5 examples/ sec; 0.009 sec/batch)
    step 250, loss = 9.94 ( 10497.2 examples/ sec; 0.010 sec/batch)
    step 260, loss = 2.94 ( 11398.1 examples/ sec; 0.009 sec/batch)
    step 270, loss = 7.30 ( 10497.4 examples/ sec; 0.010 sec/batch)
    step 280, loss = 3.98 ( 11946.0 examples/ sec; 0.008 sec/batch)
    step 290, loss = 7.66 ( 11307.2 examples/ sec; 0.009 sec/batch)
    step 300, loss = 2.03 ( 11968.7 examples/ sec; 0.008 sec/batch)
    step 310, loss = 2.39 ( 8672.0 examples/ sec; 0.012 sec/batch)
    step 320, loss = 2.07 ( 3835.6 examples/ sec; 0.026 sec/batch)
    step 330, loss = 2.71 ( 12087.7 examples/ sec; 0.008 sec/batch)
    step 340, loss = 2.70 ( 11907.3 examples/ sec; 0.008 sec/batch)
    step 350, loss = 7.17 ( 7671.2 examples/ sec; 0.013 sec/batch)
    step 360, loss = 8.36 ( 11863.6 examples/ sec; 0.008 sec/batch)
    step 370, loss = 2.48 ( 11782.7 examples/ sec; 0.008 sec/batch)
    step 380, loss = 2.27 ( 11081.5 examples/ sec; 0.009 sec/batch)
    step 390, loss = 2.85 ( 11562.4 examples/ sec; 0.009 sec/batch)
    step 400, loss = 2.99 ( 12088.9 examples/ sec; 0.008 sec/batch)
    step 410, loss = 5.08 ( 12465.6 examples/ sec; 0.008 sec/batch)
    step 420, loss = 2.12 ( 12869.1 examples/ sec; 0.008 sec/batch)
    step 430, loss = 2.83 ( 13756.3 examples/ sec; 0.007 sec/batch)
    step 440, loss = 7.56 ( 13297.8 examples/ sec; 0.008 sec/batch)
    step 450, loss = 3.51 ( 12634.6 examples/ sec; 0.008 sec/batch)
    step 460, loss = 2.23 ( 13297.8 examples/ sec; 0.008 sec/batch)
    step 470, loss = 1.80 ( 12869.2 examples/ sec; 0.008 sec/batch)
    step 480, loss = 5.92 ( 9730.3 examples/ sec; 0.010 sec/batch)
    step 490, loss = 4.01 ( 12647.0 examples/ sec; 0.008 sec/batch)
    step 500, loss = 2.29 ( 12466.9 examples/ sec; 0.008 sec/batch)
    step 510, loss = 2.20 ( 13078.4 examples/ sec; 0.008 sec/batch)
    step 520, loss = 3.70 ( 13296.5 examples/ sec; 0.008 sec/batch)
    step 530, loss = 2.11 ( 13298.3 examples/ sec; 0.008 sec/batch)
    step 540, loss = 1.73 ( 13296.6 examples/ sec; 0.008 sec/batch)
    step 550, loss = 1.20 ( 12868.9 examples/ sec; 0.008 sec/batch)
    step 560, loss = 3.44 ( 13078.6 examples/ sec; 0.008 sec/batch)
    step 570, loss = 1.35 ( 11562.0 examples/ sec; 0.009 sec/batch)
    step 580, loss = 3.51 ( 13205.2 examples/ sec; 0.008 sec/batch)
    step 590, loss = 3.11 ( 12868.8 examples/ sec; 0.008 sec/batch)
    step 600, loss = 3.40 ( 12869.1 examples/ sec; 0.008 sec/batch)
    step 610, loss = 2.49 ( 13297.7 examples/ sec; 0.008 sec/batch)
    step 620, loss = 2.68 ( 12620.3 examples/ sec; 0.008 sec/batch)
    step 630, loss = 2.09 ( 11907.3 examples/ sec; 0.008 sec/batch)
    step 640, loss = 3.82 ( 8487.3 examples/ sec; 0.012 sec/batch)
    step 650, loss = 2.77 ( 11081.5 examples/ sec; 0.009 sec/batch)
    step 660, loss = 2.55 ( 12089.1 examples/ sec; 0.008 sec/batch)
    step 670, loss = 2.53 ( 10228.3 examples/ sec; 0.010 sec/batch)
    step 680, loss = 5.17 ( 9498.5 examples/ sec; 0.011 sec/batch)
    step 690, loss = 2.02 ( 10498.4 examples/ sec; 0.010 sec/batch)
    step 700, loss = 0.21 ( 12088.9 examples/ sec; 0.008 sec/batch)
    step 710, loss = 1.95 ( 12868.7 examples/ sec; 0.008 sec/batch)
    step 720, loss = 3.90 ( 13296.2 examples/ sec; 0.008 sec/batch)
    step 730, loss = 2.17 ( 9277.6 examples/ sec; 0.011 sec/batch)
    step 740, loss = 1.09 ( 9730.1 examples/ sec; 0.010 sec/batch)
    step 750, loss = 1.33 ( 12466.8 examples/ sec; 0.008 sec/batch)
    step 760, loss = 3.17 ( 9797.9 examples/ sec; 0.010 sec/batch)
    step 770, loss = 3.20 ( 13297.9 examples/ sec; 0.008 sec/batch)
    step 780, loss = 4.28 ( 13756.4 examples/ sec; 0.007 sec/batch)
    step 790, loss = 1.23 ( 12465.4 examples/ sec; 0.008 sec/batch)
    step 800, loss = 1.78 ( 12868.8 examples/ sec; 0.008 sec/batch)
    step 810, loss = 1.12 ( 12924.2 examples/ sec; 0.008 sec/batch)
    step 820, loss = 2.09 ( 13297.1 examples/ sec; 0.008 sec/batch)
    step 830, loss = 0.71 ( 11967.1 examples/ sec; 0.008 sec/batch)
    step 840, loss = 3.03 ( 12088.8 examples/ sec; 0.008 sec/batch)
    step 850, loss = 2.76 ( 12868.8 examples/ sec; 0.008 sec/batch)
    step 860, loss = 1.64 ( 12087.1 examples/ sec; 0.008 sec/batch)
    step 870, loss = 2.43 ( 9066.8 examples/ sec; 0.011 sec/batch)
    step 880, loss = 1.73 ( 11398.2 examples/ sec; 0.009 sec/batch)
    step 890, loss = 0.61 ( 12980.4 examples/ sec; 0.008 sec/batch)
    step 900, loss = 3.44 ( 12868.8 examples/ sec; 0.008 sec/batch)
    step 910, loss = 0.96 ( 11445.9 examples/ sec; 0.009 sec/batch)
    step 920, loss = 2.95 ( 13756.3 examples/ sec; 0.007 sec/batch)
    step 930, loss = 2.99 ( 12868.5 examples/ sec; 0.008 sec/batch)
    step 940, loss = 0.34 ( 13752.5 examples/ sec; 0.007 sec/batch)
    step 950, loss = 1.05 ( 13297.8 examples/ sec; 0.008 sec/batch)
    step 960, loss = 2.34 ( 13295.7 examples/ sec; 0.008 sec/batch)
    step 970, loss = 1.32 ( 13297.6 examples/ sec; 0.008 sec/batch)
    step 980, loss = 2.46 ( 12466.6 examples/ sec; 0.008 sec/batch)
    step 990, loss = 1.02 ( 13297.7 examples/ sec; 0.008 sec/batch)
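
    To see that average_gradients really produces the element-wise mean across towers, here is a minimal standalone check; it is a sketch that assumes the average_gradients function from the listing above is available in the same file. Two dummy "towers" each report a gradient for the same variable, and the averaged gradient is printed.

    import tensorflow as tf

    # average_gradients(...) as defined in the listing above is assumed to be in scope here.
    v = tf.get_variable('dummy_w', shape=[2], initializer=tf.zeros_initializer())
    tower_grads = [
        [(tf.constant([1.0, 3.0]), v)],   # gradient for v reported by "GPU 0"
        [(tf.constant([3.0, 5.0]), v)],   # gradient for v reported by "GPU 1"
    ]
    averaged = average_gradients(tower_grads)
    with tf.Session() as sess:
        print(sess.run(averaged[0][0]))   # expected output: [2. 4.]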

  • Original article: https://www.cnblogs.com/weizhen/p/6911261.html