在MXNET源码的example/numpy-ops/下有官方提供的使用python编写新操作符的实例。分别跑ndarray_softmax.py、numpy_softmax.py和custom_softmax.py 发现ndarray_softmax.py中训练速度将近其他两种方法的3倍,分析发现ndarray_softmax.py中调用cuda核,而其他两种方法都是numpy在cpu上的运行。
这里总结一下我对ndarray_softmax.py的理解。
分析一下ndarray_softmax.py源码,重写了父类的一些基本方法,其中最重要的是前向和后向操作:
def forward(self, in_data, out_data):
    """Numerically-stable softmax over each row of a 2-D input.

    One CUDA thread processes one sample (one row): it finds the row
    maximum, exponentiates the shifted values, and normalizes by their
    sum.  The kernel is compiled lazily via mx.rtc on first use and
    cached on the instance.
    """
    inp = in_data[0]
    out = out_data[0]
    if self.fwd_kernel is None:
        # 'x' / 'y' are the names the kernel body uses for the bound arrays;
        # mx.rtc also injects x_dims / y_dims shape constants.
        self.fwd_kernel = mx.rtc('softmax', [('x', inp)], [('y', out)], """
        int i = threadIdx.x + blockIdx.x*blockDim.x;
        float max_x = x[i*x_dims[1]];
        for (int j = 1; j < x_dims[1]; ++j) {
            if (max_x < x[i*x_dims[1]+j]) {
                max_x = x[i*x_dims[1]+j];
            }
        }
        float sum = 0.0f;
        for (int j = 0; j < x_dims[1]; ++j) {
            sum += expf(x[i*x_dims[1]+j]-max_x);
        }
        for (int j = 0; j < x_dims[1]; ++j) {
            y[i*x_dims[1]+j] = expf(x[i*x_dims[1]+j]-max_x)/sum;
        }
        """)
    # grid (1,1,1), block (batch,1,1): one thread per sample/row.
    self.fwd_kernel.push([inp], [out], (1, 1, 1), (inp.shape[0], 1, 1))

def backward(self, out_grad, in_data, out_data, in_grad):
    """Softmax + cross-entropy gradient: dx = y - one_hot(label).

    Launched with one block per sample and one thread per class, so
    thread (blockIdx.x, threadIdx.x) writes exactly one gradient element.
    out_grad is unused: the kernel hard-codes the combined loss gradient.
    """
    labels = in_data[1]
    probs = out_data[0]
    grad = in_grad[0]
    if self.bwd_kernel is None:
        self.bwd_kernel = mx.rtc('softmax_grad', [('y', probs), ('l', labels)], [('dx', grad)], """
        int i = blockIdx.x;
        int j = threadIdx.x;
        int k = static_cast<int>(l[i]);
        if (j == k) {
            dx[i*dx_dims[1]+j] = y[i*dx_dims[1]+j] - 1.0f;
        } else {
            dx[i*dx_dims[1]+j] = y[i*dx_dims[1]+j];
        }
        """)
    self.bwd_kernel.push([probs, labels], [grad], (probs.shape[0], 1, 1), (probs.shape[1], 1, 1))
使用mx.rtc(...)定义的就是cuda 编译相关内容了,查看/python/mxnet/rtc.py中Rtc类的定义,其参数部分描述如下:
1 """MXRtc object in mxnet. 2 This class allow you to write CUDA kernels in Python 3 and call them with NDArray. 4 5 Parameters 6 ---------- 7 name : str 8 Name of the kernel. 9 inputs : tuple of (str, mxnet.ndarray) 10 List of input names and ndarray. 11 outputs : tuple of (str, mxnet.ndarray) 12 List of output names and ndarray. 13 kernel : str 14 The actual kernel code. 15 Note that this is only the body of the kernel, i.e. 16 after { and before }. Rtc will decorate the kernel. 17 For example, if ``name = "mykernel"`` and 18 inputs = [('x', mx.nd.zeros((10,)))] 19 outputs = [('y', mx.nd.zeros((10,)))] 20 kernel = "y[threadIdx.x] = x[threadIdx.x];", 21 then the compiled kernel will be: 22 extern "C" __global__ mykernel(float *x, float *y) { 23 const int x_ndim = 1; 24 const int x_dims = { 10 }; 25 const int y_ndim = 1; 26 const int y_dims = { 10 }; 27 28 y[threadIdx.x] = x[threadIdx.x]; 29 } 30 """
以ndarray_softmax.py为例, softmax层输入数据shape=(100,10),输出数据shape=(100,10),那么forward方法里的x_dims=(100,10), mx.rtc的第四个参数即cuda编译后要执行的kernel语句。 在forward方法中看到最后一句push方法,gridDim={'x':1,'y':1,'z':1}, blockDim={'x':100,'y':1,'z':1} (cuda存储参见cudaMemcpy与kernel),于是每一个线程操作一个sample的10个elements,threadIdx.x表示线程在block块中的索引,那么threadIdx.x+blockIdx.x*blockDim.x就是对应线程总的索引,blockDim对应的是block中threads的个数,然后后面softmax前向计算就容易理解了。
再看backward方法,这个kernel将gridDim划分成(100,1,1), blockDim划分成(10,1,1),即每一个element对应着一个线程,然后在每一个线程中计算该element对应的梯度。
example:
实现一个reorganize层,也就是yolo中特征重组层,具体功能YOLO v2 reorg 当然,最清楚的方式是看darknet中源码如何实现。
这个例子只是想继承mx.operator.NDArrayOp实现新的操作层,该操作层中没有权重参数,对于有权重的层要在forward和backward中操作对应的值。
# -*- coding: utf-8 -*-
import mxnet as mx
import numpy as np
import logging


class NDArrayReorg(mx.operator.NDArrayOp):
    """YOLOv2-style "reorg" layer backed by a hand-written CUDA kernel.

    Rearranges a (N, C, H, W) input into (N, C*4, H/stride, W/stride) by
    scattering each 2x2 spatial patch into four channel groups.  The layer
    has no learnable parameters, so forward/backward only move data.

    NOTE(review): both kernels hard-code stride=2 (the ``j += 2`` /
    ``k += 2`` steps and the C*4 channel factor); confirm before using any
    other ``stride`` value.
    """

    def __init__(self, stride=2):
        # True -> operator needs the top gradient in backward.
        super(NDArrayReorg, self).__init__(True)
        self.stride = stride
        # CUDA kernels are compiled lazily on first forward/backward.
        self.fwd_kernel = None
        self.bwd_kernel = None

    def list_arguments(self):
        return ['data']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shape):
        """Output: channels x4, spatial dims divided by the stride."""
        data_shape = in_shape[0]
        # BUGFIX: use floor division -- under Python 3 "/" yields floats,
        # which are not valid shape entries.
        output_shape = [in_shape[0][0],
                        in_shape[0][1] * 4,
                        in_shape[0][2] // self.stride,
                        in_shape[0][3] // self.stride]
        return [data_shape], [output_shape]

    def forward(self, in_data, out_data):
        x = in_data[0]
        y = out_data[0]
        if self.fwd_kernel is None:
            # One thread handles one (sample, channel) plane of len_block
            # elements and writes the four stride=2 sub-grids at channel
            # offsets 0, N, 2N, 3N of the output.
            self.fwd_kernel = mx.rtc('reorg', [('x', x)], [('y', y)], """
            int i = threadIdx.x + blockIdx.x*blockDim.x ;
            int yw = y_dims[3];
            int yh = y_dims[2];
            int N = yw*yh;
            int xw = x_dims[3];
            int xh = x_dims[2];
            int len_block = x_dims[2]*x_dims[3];
            for (int j = 0; j < xh; j += 2)
                for (int k = 0; k < xw; k += 2) {
                    int t = j/2;
                    y[i*len_block+t*yw+k/2] = x[i*len_block+j*xw+k];
                    y[i*len_block+t*yw+k/2+N] = x[i*len_block + j*xw+k+1];
                    y[i*len_block+t*yw+k/2+2*N] = x[i*len_block +(j+1)*xw+k];
                    y[i*len_block+t*yw+k/2+3*N] = x[i*len_block +(j+1)*xw+k+1];
                }
            """)
        # Grid = N*C blocks of one thread each: one plane per thread.
        self.fwd_kernel.push([x], [y], (x.shape[0] * x.shape[1], 1, 1), (1, 1, 1))

    def backward(self, out_grad, in_data, out_data, in_grad):
        y = out_grad[0]
        dx = in_grad[0]
        if self.bwd_kernel is None:
            # Scatter the top gradient back to input positions.
            # NOTE(review): the "+k" linear offset looks wrong for a true
            # reorg inverse -- the four groups (k=0..3) should map to
            # spatial offsets (k/2)*x_width + (k%2), mirroring forward.
            # Verify against the darknet reorg implementation.
            self.bwd_kernel = mx.rtc('reorg_grad', [('y', y)], [('dx', dx)], """
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            int yh = y_dims[2];
            int yw = y_dims[3];
            int N = yw*yh;
            int old_block = dx_dims[2]*dx_dims[3];
            for (int k = 0; k < 4; ++k)
                for (int j = 0; j < yw; ++j)
                    for (int t = 0; t < yh; ++t) {
                        dx[i*old_block+2*j*yw+t*2+k] = y[i*old_block+k*N+j*yw+t];
                    }
            """)
        # One thread per input plane: output has 4x the channels, hence /4.
        self.bwd_kernel.push([y], [dx], (y.shape[0] * y.shape[1] // 4, 1, 1), (1, 1, 1))


# --- MNIST demo: verify the custom layer trains end-to-end on the GPU ---
mnist = mx.test_utils.get_mnist()
batch_size = 100
train_iter = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True)
val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size)


data = mx.sym.var('data')
conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
# pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2,2), stride=(2,2))

reorg = NDArrayReorg(stride=2)
reg = reorg(data=tanh1, name='reorg')
conv2 = mx.sym.Convolution(data=reg, kernel=(5, 5), num_filter=20)
tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")  # 80x8x8

conv2 = mx.sym.Convolution(data=tanh2, kernel=(5, 5), num_filter=50)
tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
# pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2,2), stride=(2,2))

flatten = mx.sym.flatten(data=tanh2)
fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=500)
tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")

fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)

mynet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')

print(mynet.infer_shape(data=(100, 1, 28, 28)))
mynet_model = mx.mod.Module(symbol=mynet, context=mx.gpu())

mynet_model.fit(train_iter,
                eval_data=val_iter,
                optimizer='sgd',
                optimizer_params={'learning_rate': 0.1},
                eval_metric='acc',
                batch_end_callback=mx.callback.Speedometer(100, 100),
                num_epoch=10)

test_iter = mx.io.NDArrayIter(mnist['test_data'], None, batch_size)
prob = mynet_model.predict(test_iter)
test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size)
# predict accuracy for lenet
acc = mx.metric.Accuracy()
mynet_model.score(test_iter, acc)
# The network and hyper-parameters are arbitrary, so the accuracy itself is
# not meaningful -- the run only verifies that the mx.rtc CUDA kernels
# compile and execute.
print(acc)
因此,对于定制的层,可以使用类似的方法定义,该方法显然比使用numpy实现要快得多。