  • TensorFlow Notes 02: TensorRT implementation of mnist, loading parameters from a .npz file for inference

    ● Code: the tf convolutional neural network, saving the trained parameters to a .npz file for tensorRT to use

    # tf model construction and training are the same as in the previous post
    tfArg = {}
    for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):      # iterate over the global variable collection and put every variable into tfArg
        tfArg[i.name] = sess.run(i)
    tfArg['testX'] = mnist[2].images                                # append the mnist test data
    tfArg['testY'] = mnist[2].labels

    np.savez(pbFilePath + 'tfArg.npz', **tfArg)                     # save tfArg as a .npz file
    sess.close()
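
    A quick way to sanity-check the saved file is to reload it and list the stored arrays. A minimal sketch, assuming pbFilePath is "tempFile/" as in the inference script below; note that the tf variable names keep their ':0' suffix as keys:

    import numpy as np

    para = np.load("tempFile/tfArg.npz")        # same path as pbFilePath + 'tfArg.npz' above
    for name in para.files:                     # e.g. 'w1:0', 'b1:0', ..., 'testX', 'testY'
        print(name, para[name].shape, para[name].dtype)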

    ● Code: rebuild the exact same network in trt and load the trained parameters for inference

    import numpy as np
    import tensorrt as trt
    import pycuda.autoinit
    import pycuda.driver as cuda
    from datetime import datetime as dt

    pbFilePath = "tempFile/"

    # basic infrastructure
    iGpu = 0
    print("GPU in use:", cuda.Device(iGpu).name())
    cuda.Device(iGpu).make_context()
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network()
    builder.max_batch_size = 64
    builder.max_workspace_size = 1 << 20

    # load the parameters
    para = np.load(pbFilePath + 'tfArg.npz')
    w1 = para['w1:0'].transpose((3, 2, 0, 1)).reshape(-1)                           # NHWC -> NCHW; every weight must be flattened to 1-D with reshape(-1)
    b1 = para['b1:0']
    w2 = para['w2:0'].transpose((3, 2, 0, 1)).reshape(-1)
    b2 = para['b2:0']
    w3 = para['w3:0'].reshape(7,7,64,1024).transpose((3, 2, 0, 1)).reshape(-1)      # restore the (H, W, Cin, Cout) shape before transposing
    b3 = para['b3:0']
    w4 = para['w4:0'].reshape(1024,10).transpose((1,0)).reshape(-1)
    b4 = para['b4:0']
    testX = para['testX']                                                           # test data
    testY = para['testY']

    # build the network
    batchSize = 64
    data = network.add_input("data", trt.DataType.FLOAT, (batchSize, 1, 28, 28))    # input layer: batchSize images of 1 channel, 28 rows, 28 columns

    h1 = network.add_convolution(data, 32, (5, 5), w1, b1)                          # convolution 1: number of output feature maps, kernel height/width, weights and bias (implicitly converted to trt.Weights)
    h1.stride = (1, 1)                                                              # stride and padding are set on the layer afterwards
    h1.padding = (2, 2)
    h1Act = network.add_activation(h1.get_output(0), trt.ActivationType.RELU)       # activation layer with the activation type

    h1Pool = network.add_pooling(h1Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling layer: pooling type and window height/width
    h1Pool.stride = (2, 2)
    h1Pool.padding = (0, 0)

    h2 = network.add_convolution(h1Pool.get_output(0), 64, (5, 5), w2, b2)          # convolution 2
    h2.stride = (1, 1)
    h2.padding = (2, 2)
    h2Act = network.add_activation(h2.get_output(0), trt.ActivationType.RELU)

    h2Pool = network.add_pooling(h2Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # pooling 2 (takes h2Act, not h1Act)
    h2Pool.stride = (2, 2)
    h2Pool.padding = (0, 0)

    h3 = network.add_fully_connected(h2Pool.get_output(0), 1024, w3, b3)            # fully connected layer: number of outputs, weights and bias
    h3Act = network.add_activation(h3.get_output(0), trt.ActivationType.RELU)

    h4 = network.add_fully_connected(h3Act.get_output(0), 10, w4, b4)               # fully connected layer 2
    y = network.add_softmax(h4.get_output(0))                                       # softmax layer

    network.mark_output(y.get_output(0))                                            # mark the output tensor
    engine = builder.build_cuda_engine(network)                                     # build the engine

    # allocate memory
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # stream and execution context
    stream = cuda.Stream()
    context = engine.create_execution_context()

    # test
    print("%s, start!" % dt.now())
    acc = 0
    nTest = len(testX)
    for i in range(nTest // batchSize):                                             # floor division: the last partial batch is skipped
        h_input[:] = testX[i*batchSize:(i+1)*batchSize].reshape(-1)                 # fill the page-locked buffer in place rather than rebinding h_input

        cuda.memcpy_htod_async(d_input, h_input, stream)                            # asynchronous copy host -> device

        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)  # launch the kernels; the batch is folded into the input shape

        cuda.memcpy_dtoh_async(h_output, d_output, stream)                          # asynchronous copy device -> host

        stream.synchronize()                                                        # synchronize; otherwise yy would still be all zeros

        yy = np.argmax(h_output.reshape(engine.get_binding_shape(1)), 1).reshape(-1)
        label = np.argmax(testY[i*batchSize:(i+1)*batchSize], 1)
        acc += np.sum((yy == label).astype(int))

    cuda.Context.pop()                                                              # pop the device context
    print("%s, acc = %f" % (dt.now(), acc / (nTest // batchSize * batchSize)))      # divide by the number of samples actually tested
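
    The transpose((3, 2, 0, 1)) calls above convert tf's convolution weight layout (H, W, Cin, Cout) into the (Cout, Cin, H, W) order that tensorRT expects. A minimal numpy sketch using the shape of w1:

    import numpy as np

    wTf  = np.arange(5 * 5 * 1 * 32, dtype=np.float32).reshape(5, 5, 1, 32)  # tf layout (H, W, Cin, Cout)
    wTrt = wTf.transpose((3, 2, 0, 1))                                       # trt layout (Cout, Cin, H, W)
    print(wTrt.shape)                                                        # (32, 1, 5, 5)
    assert wTf[2, 3, 0, 7] == wTrt[7, 0, 2, 3]                               # same element, reordered axes
    print(wTrt.reshape(-1).shape)                                            # flattened to 1-D, as add_convolution expects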

    ▶ Summary of the key tensorRT steps (including reading/writing the engine, so it does not have to be rebuilt every time)

    import os
    import numpy as np
    import tensorrt as trt
    import pycuda.autoinit
    import pycuda.driver as cuda

    iGpu = 0
    cuda.Device(iGpu).make_context()                # device context
    logger = trt.Logger(trt.Logger.WARNING)         # create the logger

    trtFilePath = "./densenetEngine.trt"            # load an existing serialized engine file; otherwise build one on the spot, serialize it and save it to a file
    if os.path.isfile(trtFilePath) and not DEBUG:   # DEBUG is the caller's flag to force a rebuild
        with open(trtFilePath, 'rb') as f:
            engineStr = f.read()
    else:
        builder = trt.Builder(logger)               # create the builder
        builder.max_batch_size     = 64
        builder.max_workspace_size = 200 << 20
        builder.fp16_mode          = True           # whether to use float16

        network = builder.create_network()          # create the network

        h0 = network.add_input("h0", ...)           # start building the network

        ...

        y = network.add_...

        network.mark_output(y.get_output(0))        # mark the output node

        engine = builder.build_cuda_engine(network) # build the engine, the step most likely to fail

        if engine is None:
            print("build engine failed!")
            exit()

        engineStr = engine.serialize()              # serialize the engine and write it to a file for direct reuse next time
        with open(trtFilePath, 'wb') as f:
            f.write(engineStr)

    runtime = trt.Runtime(logger)                                               # use the runtime to deserialize the engine (skip this if the engine was just built)
    engine  = runtime.deserialize_cuda_engine(engineStr)
    context = engine.create_execution_context()                                 # create the kernel execution context (distinct from the device context)
    stream  = cuda.Stream()                                                     # create a stream (optional)
    batchSize = engine.max_batch_size                                           # batch size used at execution time

    hIn  = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)  # uninitialized page-locked memory of a given size and dtype; np.empty etc. also works for ordinary memory
    hOut = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)  # binding shapes (0) and (1) equal the network's input and output node shapes
    dIn  = cuda.mem_alloc(hIn.nbytes)                                           # allocate device memory matching the host buffer sizes
    dOut = cuda.mem_alloc(hOut.nbytes)

    cuda.memcpy_htod_async(dIn, hIn, stream)                                    # asynchronous copy
    #cuda.memcpy_htod(dIn, hIn)                                                 # synchronous copy
    context.execute_async(batchSize, bindings=[int(dIn), int(dOut)], stream_handle=stream.handle)  # asynchronous kernel execution
    #context.execute(batchSize, bindings=[int(dIn), int(dOut)])                 # synchronous kernel execution
    cuda.memcpy_dtoh_async(hOut, dOut, stream)

    stream.synchronize()                                                        # synchronize

    context = None                                                              # release the kernel context and the engine
    engine  = None
    cuda.Context.pop()                                                          # pop the device context
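
    The point of the serialized file is that deserializing is much cheaper than calling build_cuda_engine again. A minimal timing sketch, assuming the densenetEngine.trt written by the template above already exists:

    from datetime import datetime as dt
    import tensorrt as trt

    logger = trt.Logger(trt.Logger.WARNING)
    t0 = dt.now()
    with open("./densenetEngine.trt", 'rb') as f:                   # path from the template above
        engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
    print("deserialize took %s" % (dt.now() - t0))                  # typically far faster than rebuilding the engine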

    ▶ TODO: use convert_to_uff.py to convert the saved .pb model into a .uff model so that tensorRT can load and use it directly, without rebuilding the network in tensorRT. Ran into some problems along the way; not yet successful. A sketch of the usual conversion path follows below.
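
    For reference, this conversion usually goes through the uff package that ships with tensorRT. A minimal sketch, assuming a frozen graph frozen.pb with an output node named 'y' (both placeholders, not from this post):

    import uff

    # 'frozen.pb' and the node name 'y' are hypothetical; substitute the real frozen graph and output node
    uff.from_tensorflow_frozen_model("frozen.pb",
                                     output_nodes=["y"],
                                     output_filename="model.uff")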

  • Original post: https://www.cnblogs.com/cuancuancuanhao/p/11725715.html