https://code.google.com/p/deep-learning-faces/source/browse/trunk/cuda_ut/include/bsxfun.h?r=7&spec=svn7
/* Copyright (C) 2013 Yichuan Tang. contact: tang at cs.toronto.edu http://www.cs.toronto.edu/~tang This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef _BSXFUN_H_ #define _BSXFUN_H_ #include "cu_util.h" #include "cu_clmatrix.h" /*********************************************************************************************************** * @brief: this function performs a matrix + col. vector operation * * @param[in]: pA and pOut: nI by nJ matrix * pB is a column vector nI by 1 * nInJ is the total dimensionality of the matrix pA * * @param[out]: * @topology: assumes a 1D block layout in x direction and covers the entire matrix pA * @note: assume column-major * @change: * @tested: * @to_do: *********************************************************************************************************** */ template<class O, typename T> __global__ void bsxfun_colvec_1dkernel( const T* pA, const T* pVec, T* pOut, int nI, int nJ, int nInJ, O op) { const unsigned int ind = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int totalThreads = blockDim.x*gridDim.x; for (int i = ind; i < nInJ; i += totalThreads) pOut[i] = op(pA[i], pVec[i % nI]); } /*********************************************************************************************************** * @brief: this function performs a matrix + row. vector operation * @param[in]: pA and pOut: nI by nJ matrix * pVec is a row vector 1 by nJ * nInJ is the total dimensionality of the matrix pA * * @param[out]: * @topology: assumes a 1D block layout in x direction and covers the entire matrix pA * @note: assume column-major * @change: * @tested: * @to_do: *********************************************************************************************************** */ template<class O, typename T> __global__ void bsxfun_rowvec_1dkernel( const T* pA, const T* pVec, T* pOut, int nI, int nJ, int nInJ, O op) { const unsigned int ind = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int totalThreads = blockDim.x*gridDim.x; for (int i = ind; i < nInJ; i += totalThreads) pOut[i] = op(pA[i], pVec[i / nI]); } //alpha beta version template<class O, typename T> __global__ void bsxfun_colvec_1dkernel( T alpha, const T* pA, T beta, const T* pVec, T* pOut, int nI, int nJ, int nInJ, O op) { const unsigned int ind = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int totalThreads = blockDim.x*gridDim.x; for (int i = ind; i < nInJ; i += totalThreads) pOut[i] = op(pA[i], alpha, pVec[i % nI], beta); } template<class O, typename T> __global__ void bsxfun_rowvec_1dkernel( T alpha, const T * pA, T beta, const T* pVec, T* pOut, int nI, int nJ, int nInJ, O op) { const unsigned int ind = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int totalThreads = blockDim.x*gridDim.x; for (int i = ind; i < nInJ; i += totalThreads) pOut[i] = op(pA[i], alpha, pVec[i / nI], beta); } /*********************************************************************************************************** * @brief: function similar to bsxfun of matlab * A op B ---> Out * @param[in]: op - type of operation * A - first matrix * B - col/row vector, one dimension must be 1 * @param[out]: if Out is set to A, the operation is inplace, overwrites A * * @topology: * @note: * @change: * @tested: * @to_do: switch to shared memory operators to see if we can achieve speedup?! *********************************************************************************************************** */ template<class O, typename T> int Bsxfun( const clMatrix<T>& A, O op, const clMatrix<T>& B, clMatrix<T>& Out){ if (! (B.nI == 1 || B.nJ == 1) ) return -1; if ( ( B.nI == 1 && B.nJ != A.nJ) || ( B.nJ == 1 && B.nI != A.nI) ){ if (!(B.nI == 1 && B.nJ == 1)) //special case return -2; } if ( A.nI != Out.nI || A.nJ != Out.nJ) return -3; const unsigned int datadim = A.nJ*A.nI; dim3 dim_block( MEDIUM_NUM_THREADS ); dim3 dim_grid( MIN( MAX_GRIDS, (datadim + dim_block.x-1)/dim_block.x) ); if (B.nJ == 1 && B.nI != 1){ bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData, A.nI, A.nJ, datadim, op); }else if (B.nJ != 1 && B.nI == 1){ bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData, A.nI, A.nJ, datadim, op ); }else{ // when B is 1x1 if (A.nI == 1){ bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData, A.nI, A.nJ, datadim, op); }else if (A.nJ == 1){ bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData, A.nI, A.nJ, datadim, op ); }else{ return -4; //invalid case } } return 0; } //alpha beta version template<class O, typename T> int Bsxfun(T alpha, const clMatrix<T>& A, O op, T beta, const clMatrix<T>& B, clMatrix<T>& Out){ if (! (B.nI == 1 || B.nJ == 1) ) return -1; if ( ( B.nI == 1 && B.nJ != A.nJ) || ( B.nJ == 1 && B.nI != A.nI) ){ if (!(B.nI == 1 && B.nJ == 1)) //special case return -2; } if ( A.nI != Out.nI || A.nJ != Out.nJ) return -3; const uint64_t datadim = A.nJ*A.nI; dim3 dim_block( MEDIUM_NUM_THREADS ); dim3 dim_grid( MIN( MAX_GRIDS, (datadim + dim_block.x-1)/dim_block.x) ); if (B.nJ == 1 && B.nI != 1){ bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData, A.nI, A.nJ, datadim, op); }else if (B.nJ != 1 && B.nI == 1){ bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData, A.nI, A.nJ, datadim, op ); }else{ if (A.nI == 1){ bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>(alpha, A.pData, beta, B.pData, Out.pData, A.nI, A.nJ, datadim, op); }else if (A.nJ == 1){ bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>(alpha, A.pData, beta, B.pData, Out.pData, A.nI, A.nJ, datadim, op ); }else{ return -4; //invalid case } } return 0; } #endif