这里对上一篇《CUDA 实践 3》中的代码进行修改:在每个 block 内对 shared memory 中的数据进行迭代。
/**
 * For every image pixel, finds a UV triangle that contains the pixel and then
 * records which panoramas are not blocked ("in shelter") for the corresponding
 * 3D point.
 *
 * Launch layout: the indexing below (Row/Col and the flat in-block id) assumes
 * a 2D block of exactly BLOCK_SIZE x BLOCK_SIZE threads and a 2D grid that
 * covers the image. The grid does not have to divide the image evenly; threads
 * that fall outside the image only help with staging tiles and write nothing.
 *
 * Shared memory: 6 * BLOCK_SIZE * BLOCK_SIZE floats (static).
 *
 * @param uv_triangulation_0      UV coordinates read as 6 planes of
 *                                triangule_num_d floats each
 *                                (index = component * triangule_num_d + triangle).
 * @param uv_triangulation        UV coordinates read as 6 consecutive floats
 *                                per triangle (index = 6 * triangle + component).
 * @param w_triangulation         9 consecutive floats per triangle, passed on to
 *                                caculateMappingTriangle2dTo3d and
 *                                angleOfNormalOfTriangleAndSunline.
 * @param w_triangulation_center  3 consecutive floats per triangle.
 * @param position_panorama_vect  3 consecutive floats per panorama.
 * @param triangule_num_d         Number of triangles.
 * @param panorama_num_d          Number of panoramas.
 * @param imgHeight_d             Image height in pixels.
 * @param imgWidth_d              Image width in pixels.
 * @param dis_threshold           Panoramas whose distance3d() to the 3D point
 *                                exceeds this value are skipped.
 * @param inside_ptr_d            Output, 2 ints per pixel:
 *                                [2*p] = 0 if a triangle was found, else -1;
 *                                [2*p+1] = index of that triangle, else -1.
 * @param result                  Output, PanoramaNUM ints per pixel; slot k
 *                                receives (m + 1) for the k-th panorama m that
 *                                passed all checks. Slots that are not written
 *                                keep whatever the caller stored there.
 *
 * Changes relative to the previous revision:
 *  - A partially filled last tile is scanned completely (out-of-range slots
 *    are skipped instead of terminating the scan).
 *  - Threads outside the image no longer write to inside_ptr_d / result.
 *  - The shared tile layout no longer hard-codes a stride of 32, so it stays
 *    in bounds for any BLOCK_SIZE.
 *  - dis_threshold is compared as a float (it used to be truncated to int).
 *  - Threshold comparisons use single-precision constants.
 */
template <int BLOCK_SIZE>
__global__ void caculateShelter_cuda(
    float *uv_triangulation_0,
    float *uv_triangulation,
    float *w_triangulation,
    float *w_triangulation_center,
    float *position_panorama_vect,
    int triangule_num_d,
    int panorama_num_d,
    int imgHeight_d,
    int imgWidth_d,
    float dis_threshold,
    int *inside_ptr_d,
    int *result)
{
    const int times = triangule_num_d;
    const int step = BLOCK_SIZE * BLOCK_SIZE;  // triangles staged per tile

    const int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    // Threads past the image edge must still take part in tile loads and
    // barriers, so they are flagged here instead of returning early.
    const bool in_image = (Row < imgHeight_d) && (Col < imgWidth_d);
    const int threadId = Row * imgWidth_d + Col;            // flat pixel index
    const int tid = BLOCK_SIZE * threadIdx.y + threadIdx.x; // flat id in block

    // Query point handed to pointInTriangle_cuda; the second component is
    // measured from the bottom row of the image.
    float searchPoint[2];
    searchPoint[0] = static_cast<float>(Col) / imgWidth_d;
    searchPoint[1] = static_cast<float>(imgHeight_d - Row) / imgHeight_d;

    int inside = -1;
    int index_temp = -1;

    // Component c of the triangle in slot s lives at tile_uv[c * step + s].
    // Consecutive threads touch consecutive addresses in both the store and
    // the rotated read below.
    __shared__ float tile_uv[6 * BLOCK_SIZE * BLOCK_SIZE];

    const int tile_count = (times + step - 1) / step;
    for (int t = 0; t < tile_count; t++)
    {
        const int tile_base = t * step;

        // ---- stage one tile of UV triangles into shared memory ----
        const int load_index = tile_base + tid;
        if (load_index < times)  // beware of running past the triangle array
        {
#pragma unroll
            for (int c = 0; c < 6; c++)
            {
                tile_uv[c * step + tid] = uv_triangulation_0[c * times + load_index];
            }
        }
        __syncthreads();

        // ---- search the tile; threads that already have a hit skip this ----
        if (in_image && inside == -1)
        {
            const int remaining = times - tile_base;
            const int valid = (remaining < step) ? remaining : step;

            for (int v = 0; v < step; v++)
            {
                // Each thread starts at its own slot and wraps around.
                int slot = tid + v;
                if (slot >= step)
                {
                    slot -= step;
                }
                // Slots beyond the last triangle hold no data. Skip them but
                // keep going: valid slots may still follow after the wrap.
                if (slot >= valid)
                {
                    continue;
                }

                float A[2], B[2], C[2];
                A[0] = tile_uv[0 * step + slot];
                A[1] = tile_uv[1 * step + slot];
                B[0] = tile_uv[2 * step + slot];
                B[1] = tile_uv[3 * step + slot];
                C[0] = tile_uv[4 * step + slot];
                C[1] = tile_uv[5 * step + slot];

                if (pointInTriangle_cuda(A, B, C, searchPoint))  // inside / on
                {
                    index_temp = tile_base + slot;
                    inside = 0;
                    break;
                }
            }
        }
        // Every thread of the block reaches this barrier before the tile is
        // overwritten by the next iteration.
        __syncthreads();
    }

    // No barriers follow, so threads without a pixel can leave now.
    if (!in_image)
    {
        return;
    }

    inside_ptr_d[2 * threadId] = inside;
    inside_ptr_d[2 * threadId + 1] = index_temp;
    if (inside != 0)
    {
        return;
    }

    const int tr_index = index_temp;

    float uv_triangulation_temp[6];
#pragma unroll
    for (int k = 0; k < 6; k++)
    {
        uv_triangulation_temp[k] = uv_triangulation[6 * tr_index + k];
    }

    float w_triangulation_temp[9];
#pragma unroll
    for (int k = 0; k < 9; k++)
    {
        w_triangulation_temp[k] = w_triangulation[9 * tr_index + k];
    }

    float pt3d[3];
    caculateMappingTriangle2dTo3d(uv_triangulation_temp, searchPoint,
                                  w_triangulation_temp, pt3d);

    const float kPi = 3.1415926f;
    int not_in_shelter_num = 0;

    // Shelter check: one pass per panorama.
    for (int m = 0; m < panorama_num_d; m++)
    {
        // result has room for only PanoramaNUM entries per pixel.
        if (not_in_shelter_num >= PanoramaNUM)
        {
            break;
        }

        float optical_center[3];
        optical_center[0] = position_panorama_vect[m * 3];
        optical_center[1] = position_panorama_vect[m * 3 + 1];
        optical_center[2] = position_panorama_vect[m * 3 + 2];

        const float redius_ = distance3d(optical_center, pt3d);

        float rayoptical[3];
        rayoptical[0] = optical_center[0] - pt3d[0];
        rayoptical[1] = optical_center[1] - pt3d[1];
        rayoptical[2] = optical_center[2] - pt3d[2];

        const float angle =
            angleOfNormalOfTriangleAndSunline(w_triangulation_temp, rayoptical);
        const float off_half_pi = fabsf(angle - kPi / 2.0f);

        // Skip panoramas that are too far away, or whose returned angle lies
        // within pi/10 of pi/2.
        if (redius_ > dis_threshold || off_half_pi < kPi / 10.0f)
        {
            continue;
        }

        float mid_pt[3];
        mid_pt[0] = (optical_center[0] + pt3d[0]) * 0.5f;
        mid_pt[1] = (optical_center[1] + pt3d[1]) * 0.5f;
        mid_pt[2] = (optical_center[2] + pt3d[2]) * 0.5f;

        // Only triangles whose centre is within this distance of the segment
        // midpoint are tested; the limit does not change inside the loop.
        const float near_limit = 0.36f * redius_;

        bool in_shelter = false;
        for (int i = 0; i < times; i++)
        {
            if (i == tr_index)  // never test the triangle the point lies on
            {
                continue;
            }

            float pt_temp[3];
            pt_temp[0] = w_triangulation_center[3 * i];
            pt_temp[1] = w_triangulation_center[3 * i + 1];
            pt_temp[2] = w_triangulation_center[3 * i + 2];

            if (distance3d(pt_temp, mid_pt) > near_limit)
            {
                continue;
            }

            // The triangle centre is tested against the line through
            // optical_center and pt3d. The earlier per-triangle ray test
            // (rayTracingShelterCaculate_cuda2) is disabled, so the triangle
            // vertices are not loaded here.
            if (pt3dToLine3d(pt_temp, optical_center, pt3d) < 0.10f)
            {
                in_shelter = true;
                break;
            }
        }

        if (!in_shelter)
        {
            // Panorama ids are stored 1-based.
            result[threadId * PanoramaNUM + not_in_shelter_num] = m + 1;
            not_in_shelter_num++;
        }
    }
}