1. swig实现
在Python 调用 C/C++实现卷积中,尝试了python通过swig调用c提高性能的方法。
以下为pool2d im2col col2im三个函数在swig下的配置。
%module t %include <stdint.i> %typemap(in,numinputs=0,noblock=1) size_t *l1 { size_t templen1; $1 = &templen1; } %typemap(in,numinputs=0,noblock=1) size_t *l2 { size_t templen2; $1 = &templen2; } %typemap(in,numinputs=0,noblock=1) size_t *l3 { size_t templen3; $1 = &templen3; } %typemap(in,numinputs=0,noblock=1) size_t *l4 { size_t templen4; $1 = &templen4; } %typemap(in) double* { int i,j,k,l,index=0; if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError,"Expecting a sequence"); return NULL; } int l1=PyObject_Length($input); PyObject *t,*t1,*t2,*t3,*t4; t= PySequence_GetItem($input,0); int l2=PyObject_Length(t); t = PySequence_GetItem(t,0); int l3=PyObject_Length(t); t = PySequence_GetItem(t,0); int l4=PyObject_Length(t); $1 = (double *) malloc(l1*l2*l3*l4*sizeof(double)); for(i=0;i<l1;i++) { t1 = PySequence_GetItem($input,i); for(j=0;j<l2;j++) { t2 = PySequence_GetItem(t1,j); for(k=0;k<l3;k++) { t3 = PySequence_GetItem(t2,k); for(l=0;l<l4;l++) { t4 = PySequence_GetItem(t3,l); if (!PyNumber_Check(t4)) { PyErr_SetString(PyExc_ValueError,"Expecting a 4 dim sequence of doubles"); free($1); return NULL; } *($1+index)= (double)PyFloat_AsDouble(t4); ++index; } } } } } %typemap(out) double* pool2d { int i,j,k,l,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t2= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *t1= PyList_New(templen3); for (k = 0; k < templen3; k++) { PyObject *t= PyList_New(templen4); for (l = 0; l < templen4; l++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,l,o); ++index; } PyList_SetItem(t1,k,t); } PyList_SetItem(t2,j,t1); } PyList_SetItem($result,i,t2); } delete [] $1; } %typemap(out) double* im2col { int i,j,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,j,o); ++index; } PyList_SetItem($result,i,t); } delete [] $1; } %typemap(out) double* col2im { int i,j,k,l,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t2= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *t1= PyList_New(templen3); for (k = 0; k < templen3; k++) { PyObject *t= PyList_New(templen4); for (l = 0; l < templen4; l++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,l,o); ++index; } PyList_SetItem(t1,k,t); } PyList_SetItem(t2,j,t1); } PyList_SetItem($result,i,t2); } delete [] $1; } %typemap(freearg) double *a { if ($1) free($1); } %inline %{ double* pool2d(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height,size_t *l1,size_t *l2,size_t *l3,size_t *l4) { const int w = width; const int h = height; const int m = channels; const int n = num; const int d = pooled_height; const int zh = h / d + h % d; const int zw = w / d + w % d; double* top_data; top_data=new double [n*m*zh*zw*sizeof(double)]; int i,j,k,o,u,v,index,index2=0; double s; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < zh; ++i) for (j = 0; j < zw; ++j) { index=k*m*h*w+o*h*w+d*i*w+d*j; s=-10000.0; for (u = 0; u < d&&(u+d*i)<h; ++u) for (v = 0; v < d&&(v+d*j)<w; ++v) if (*(bottom_data+index+u*w+v)>s) s=*(bottom_data+index+u*w+v); *(top_data+index2)=s; ++index2; } *l1=n; *l2=m; *l3=zh; *l4=zw; return top_data; } double* im2col(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int filter_height, const int filter_width,size_t *l1,size_t *l2) { const int w = width; const int h = height; const int m = channels; const int n = num; const int vh = filter_height; const int vw = filter_width; const int t = h - vh + 1; const int kh = t *t; const int kw = vh * vh; double* top_data; top_data=new double [n*m*kh*kw*sizeof(double)]; int k,o,i,j,u,v,index,index2; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < t; ++i) for (j = 0; j < t; ++j) { index=k*kh*kw*m+(i*t+j)*kw*m+o*kw; index2=k*m*h*w+o*h*w; for (u = 0; u < vh; ++u) for (v = 0; v < vw; ++v) { *(top_data+index+u*vh+v)= *(bottom_data+index2+(i+u)*w+j+v); } } *l1=n*kh; *l2=kw*m; return top_data; }
double* col2im(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int filter_height, const int filter_width, size_t *l1,size_t *l2,size_t *l3,size_t *l4) { const int w = width; const int h = height; const int m = channels; const int n = num; const int vh = filter_height; const int vw = filter_width; const int t = h - vh + 1; const int kh = t *t; const int kw = vh * vh; int k,o,i,j,u,v,index,rh,b; double* top_data; top_data=new double [n*m*h*w*sizeof(double)]; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < h; ++i) { rh=vh * (i<(vh - 1)?i:(vh-1)); b = ((i + 1 - vh)> 0?(i + 1 - vh):0) * t; index=k*m*h*h+o*h*h+i*h; for (u = 0; u < vh; ++u) { *(top_data+index+u)= *(bottom_data+k*kh*kw*m+b*kw*m+rh+u); } for (v = 0; v < t-1; ++v) { *(top_data+index+vh+v)= *(bottom_data+k*kh*kw*m+(b+1+v)*kw*m+rh+vh-1); } } *l1=n; *l2=m; *l3=h; *l4=w; return top_data; } %}
2.使用效果
只有pool2d提升很大,使用另两个函数速度不仅没有提升,反而有所下降。
# coding:utf8 import cPickle import numpy as np import time import t """ cost per 500 data: default 22s +pool2d 10s +im2col 11s +col2im 12s """ class ConvPoolLayer(object): def __init__(self, image_shape,filter_shape,poolsize=(2,2)): self.filter_shape = filter_shape self.image_shape = image_shape self.weights = np.random.normal(loc=0, scale=np.sqrt(1.0/np.prod(filter_shape[1:])), size=(np.prod(filter_shape[1:]),filter_shape[0])) self.b = np.random.normal(loc=0, scale=0.2, size=(1,filter_shape[0],1,1)) self.poolsize = poolsize self.samp_shape=(image_shape[-2] - filter_shape[-2] + 1,image_shape[-1] - filter_shape[-1] + 1) self.out_shape=(self.samp_shape[0]/poolsize[0],self.samp_shape[1]/poolsize[1]) self.v=0 def im2col(self,a): n, m, h, w = self.image_shape vn, vm, vh, vw = self.filter_shape z=t.im2col(a,n,m,h,w,vh,vw) z = np.array(z) return z def col2im(self,a): n, m, h, w = self.image_shape vn, vm, vh, vw = self.filter_shape z=t.col2im(a,n,m,h,w,vh,vw) z = np.array(z) return z def fw_shape(self,a): res = np.reshape(a, (self.image_shape[0], -1, self.filter_shape[0])) res = np.rollaxis(res, 2, 1) res = np.reshape(res, (self.image_shape[0], -1, self.samp_shape[0], self.samp_shape[1])) return res def bp_shape(self,a): res = np.reshape(a, (self.image_shape[0], self.filter_shape[0], -1)) res = np.rollaxis(res, 1, 3) res = np.reshape(res, (-1, self.filter_shape[0])) return res def feedforward(self, a): z = self.im2col(a) res = np.dot(z, self.weights) res = self.fw_shape(res) self.out = self.relu(res+self.b) return np.array(self.pool2d(self.out)) def backprop(self, x, dnext,eta=0.001,weight_decay=0,momentum=0.9): if dnext.ndim<3: dnext = np.reshape(dnext,(self.image_shape[0],self.filter_shape[0], self.out_shape[0], -1)) u = self.relu_prime(self.out) dnext = np.multiply(u,self.up(dnext,self.poolsize[0])) delta = self.bp_shape(dnext)/self.image_shape[0] x=self.im2col(x) out_delta = np.dot(delta, self.weights.T) out_delta=self.col2im(out_delta) w = np.dot(x.T, delta) w = eta * w+weight_decay*self.weights**2 self.v = momentum * self.v - w self.weights += self.v self.b -= eta * np.reshape(np.sum(delta,0),(self.b.shape)) return out_delta def pool2d(self,input, ds=(2, 2)): n, m, h, w = np.shape(input) return t.pool2d(input,n,m,h,w,self.poolsize[0]) def up(self,a,n): b=np.ones((n,n)) return np.kron(a,b) def relu(self,z): return np.maximum(z, 0.0) def relu_prime(self,z): z[z>0]=1 return z class SoftmaxLayer(object): def __init__(self, in_num=100,out_num=10): self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num) self.v=0 def feedforward(self, input): self.out=self.softmax(np.dot(input, self.weights)) return self.out def backprop(self, input, y,eta=0.001,weight_decay=0,momentum=0.9): o=self.out delta =o-y out_delta=np.dot(delta,self.weights.T) w = np.dot(input.T,delta) w=eta*w+weight_decay*self.weights**2 self.v = momentum * self.v - w self.weights += self.v return out_delta def softmax(self,a): m = np.exp(a) return m / (np.sum(m,axis=1)[:,np.newaxis]) class FullLayer(object): def __init__(self, in_num=720,out_num=100): self.in_num=in_num self.out_num=out_num self.biases = np.random.randn(out_num) self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num) self.v=0 def feedforward(self, x): if x.ndim>2: x = np.reshape(x, (len(x), self.in_num)) self.out = self.sigmoid(np.dot(x, self.weights)+self.biases) return self.out def backprop(self, x,delta,eta=0.001,weight_decay=0,momentum=0.9): if x.ndim>2: x = np.reshape(x, (len(x), self.in_num)) sp=self.sigmoid_prime(self.out) delta = delta * sp out_delta = np.dot(delta, self.weights.T) w = np.dot( x.T,delta) w=eta*w+weight_decay*self.weights**2 self.v=momentum*self.v-w self.weights +=self.v self.biases -= eta*np.sum(delta,0) return out_delta def sigmoid(self,z): return 1.0/(1.0+np.exp(-z)) def sigmoid_prime(self,z): return z*(1-z) class Network(object): def __init__(self, layers): self.layers=layers self.num_layers = len(layers) self.a=[] def feedforward(self, x): self.a.append(x) for layer in self.layers: x=layer.feedforward(x) self.a.append(x) return x def SGD(self, training_data, test_data,epochs, mini_batch_size, lr=0.001,weight_decay=0.0005,momentum=0.9): self.n = len(training_data[0]) self.mini_batch_size=mini_batch_size self.weight_decay = weight_decay self.momentum=momentum self.lr = lr rate=np.exp(np.log(0.9)/2000)**mini_batch_size cx=range(epochs) for j in cx: for k in xrange(0, self.n , mini_batch_size): self.lr=self.lr*rate batch_x = np.array(training_data[0][k:k + mini_batch_size]) batch_y = training_data[1][k:k + mini_batch_size] self.backprop(batch_x,batch_y) if k%500==0: print "Epoch {0}:{1},test:{2},cost={3},lr={4}".format(j,k, self.evaluate([test_data[0],test_data[1]]),self.cost,self.lr) print time.time() def backprop(self, x_in, y): self.feedforward(x_in) for i in range(self.num_layers): delta=self.layers[-i-1].backprop(self.a[-i-2],y,eta=self.lr, weight_decay=self.weight_decay,momentum=self.momentum) y=delta def evaluate(self, test_data): x,y=test_data num=len(x) x=[self.feedforward(np.array(x[size*i:size*i+size])) for i in range((num/size))] x=np.reshape(x,(num,np.shape(x)[-1])) xp = np.argmax(x, axis=1) yp= np.argmax(y, axis=1) if y[0].ndim else y self.cost = -np.mean(np.log(x)[np.arange(num),yp]) return np.mean(yp == xp)*100 if __name__ == '__main__': def get_data(data): return [np.reshape(x, (1,28,28)) for x in data[0]] def get_label(i): c = np.zeros((10)) c[i] = 1 return c f = open('data/mnist.pkl', 'rb') training_data, validation_data, test_data = cPickle.load(f) training_inputs = get_data(training_data) training_label=[get_label(y_) for y_ in training_data[1]] test_inputs = get_data(test_data) test = zip(test_inputs,test_data[1]) size=50 net = Network([ConvPoolLayer(image_shape=[size,1,28,28],filter_shape=[20,1,5,5],poolsize=(2,2)), ConvPoolLayer(image_shape=[size,20,12,12],filter_shape=[40,20,5,5], poolsize=(2,2)), FullLayer(in_num=40*4*4,out_num=100), SoftmaxLayer(in_num=100,out_num=10)]) net.SGD([training_inputs,training_label],[test_inputs[:500],test_data[1][:500]], epochs=3,mini_batch_size=size, lr=0.005,weight_decay=0,momentum=0.9)
3.内存
上面的swig配置内存泄露严重,以下经过修改有所降低,但仍然不断增长。
可能是由于返回时创建了太多PyObject。
%module t %include <stdint.i> %typemap(in,numinputs=0,noblock=1) size_t *l1 { size_t templen1; $1 = &templen1; } %typemap(in,numinputs=0,noblock=1) size_t *l2 { size_t templen2; $1 = &templen2; } %typemap(in,numinputs=0,noblock=1) size_t *l3 { size_t templen3; $1 = &templen3; } %typemap(in,numinputs=0,noblock=1) size_t *l4 { size_t templen4; $1 = &templen4; } %typemap(in) double* { int i,j,k,l,index=0; if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError,"Expecting a sequence"); return NULL; } int len1=PyObject_Length($input); PyObject *t; t = PySequence_GetItem($input,0); int len2=PyObject_Length(t); t = PySequence_GetItem(t,0); int len3=PyObject_Length(t); t = PySequence_GetItem(t,0); int len4=PyObject_Length(t); $1 = (double *) malloc(len1*len2*len3*len4*sizeof(double)); for(i=0;i<len1;i++) { PyObject *t1 = PySequence_GetItem($input,i); for(j=0;j<len2;j++) { PyObject *t2 = PySequence_GetItem(t1,j); for(k=0;k<len3;k++) { PyObject *t3 = PySequence_GetItem(t2,k); for(l=0;l<len4;l++) { PyObject *t4 = PySequence_GetItem(t3,l); if (!PyNumber_Check(t4)) { PyErr_SetString(PyExc_ValueError,"Expecting a 4 dim sequence of doubles"); free($1); Py_DECREF(t1); Py_DECREF(t2); Py_DECREF(t3); Py_XDECREF(t4); Py_DECREF($input); return NULL; } *($1+index)= (double)PyFloat_AsDouble(t4); ++index; Py_DECREF(t4); } Py_DECREF(t3); } Py_DECREF(t2); } Py_DECREF(t1); } Py_DECREF($input); //Py_DECREF(t); } %typemap(out) double* pool2d { int i,j,k,l,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t2= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *t1= PyList_New(templen3); for (k = 0; k < templen3; k++) { PyObject *t= PyList_New(templen4); for (l = 0; l < templen4; l++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,l,o); ++index; } PyList_SetItem(t1,k,t); } PyList_SetItem(t2,j,t1); } PyList_SetItem($result,i,t2); } delete [] $1; } %typemap(out) double* im2col { int i,j,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,j,o); ++index; } PyList_SetItem($result,i,t); } delete [] $1; } %typemap(out) double* col2im { int i,j,k,l,index=0; $result = PyList_New(templen1); for (i = 0; i < templen1; i++) { PyObject *t2= PyList_New(templen2); for (j = 0; j < templen2; j++) { PyObject *t1= PyList_New(templen3); for (k = 0; k < templen3; k++) { PyObject *t= PyList_New(templen4); for (l = 0; l < templen4; l++) { PyObject *o = PyFloat_FromDouble((double)$1[index]); PyList_SetItem(t,l,o); ++index; } PyList_SetItem(t1,k,t); } PyList_SetItem(t2,j,t1); } PyList_SetItem($result,i,t2); } delete [] $1; } %typemap(freearg) double *a { if ($1) free($1); } %inline %{ double* pool2d(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height,size_t *l1,size_t *l2,size_t *l3,size_t *l4) { const int w = width; const int h = height; const int m = channels; const int n = num; const int d = pooled_height; const int zh = h / d + h % d; const int zw = w / d + w % d; double* top_data; top_data=new double [n*m*zh*zw*sizeof(double)]; int i,j,k,o,u,v,index,index2=0; double s; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < zh; ++i) for (j = 0; j < zw; ++j) { index=k*m*h*w+o*h*w+d*i*w+d*j; s=-10000.0; for (u = 0; u < d&&(u+d*i)<h; ++u) for (v = 0; v < d&&(v+d*j)<w; ++v) if (*(bottom_data+index+u*w+v)>s) s=*(bottom_data+index+u*w+v); *(top_data+index2)=s; ++index2; } *l1=n; *l2=m; *l3=zh; *l4=zw; return top_data; } double* im2col(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int filter_height, const int filter_width,size_t *l1,size_t *l2) { const int w = width; const int h = height; const int m = channels; const int n = num; const int vh = filter_height; const int vw = filter_width; const int t = h - vh + 1; const int kh = t *t; const int kw = vh * vh; double* top_data; top_data=new double [n*m*kh*kw*sizeof(double)]; int k,o,i,j,u,v,index,index2; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < t; ++i) for (j = 0; j < t; ++j) { index=k*kh*kw*m+(i*t+j)*kw*m+o*kw; index2=k*m*h*w+o*h*w; for (u = 0; u < vh; ++u) for (v = 0; v < vw; ++v) { *(top_data+index+u*vh+v)= *(bottom_data+index2+(i+u)*w+j+v); } } *l1=n*kh; *l2=kw*m; return top_data; } double* col2im(const double* const bottom_data, const int num, const int channels, const int height, const int width, const int filter_height, const int filter_width, size_t *l1,size_t *l2,size_t *l3,size_t *l4) { const int w = width; const int h = height; const int m = channels; const int n = num; const int vh = filter_height; const int vw = filter_width; const int t = h - vh + 1; const int kh = t *t; const int kw = vh * vh; int k,o,i,j,u,v,index,rh,b; double* top_data; top_data=new double [n*m*h*w*sizeof(double)]; for (k = 0; k < n; ++k) for (o = 0; o < m; ++o) for (i = 0; i < h; ++i) { rh=vh * (i<(vh - 1)?i:(vh-1)); b = ((i + 1 - vh)> 0?(i + 1 - vh):0) * t; index=k*m*h*h+o*h*h+i*h; for (u = 0; u < vh; ++u) { *(top_data+index+u)= *(bottom_data+k*kh*kw*m+b*kw*m+rh+u); } for (v = 0; v < t-1; ++v) { *(top_data+index+vh+v)= *(bottom_data+k*kh*kw*m+(b+1+v)*kw*m+rh+vh-1); } } *l1=n; *l2=m; *l3=h; *l4=w; return top_data; } %}