可移动固定内存测试,项目打包下载
#include "../common/book.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>

// Arguments and the whole expression are parenthesized so the macro stays
// correct when it is handed compound expressions.
#define imin(a,b) (((a) < (b)) ? (a) : (b))

#define N (33*1024*1024)
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


/*
 * Partial dot product of a[0..size) and b[0..size).
 *
 * Every thread accumulates products in a grid-stride loop, the block then
 * reduces its per-thread sums in shared memory, and thread 0 of each block
 * writes the block's sum to c[blockIdx.x]. The caller adds up the
 * gridDim.x partial sums.
 *
 * Preconditions:
 *   - blockDim.x is a power of two (required by the halving reduction) and
 *     is at most threadsPerBlock (the size of the shared cache);
 *   - c has room for gridDim.x floats.
 */
__global__ void dot(int size, const float *a, const float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    float temp = 0.0f;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize the threads of this block before any of them reads cache[]
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    for (int i = blockDim.x / 2; i != 0; i /= 2) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // kept outside the branch so every thread of the block reaches it
        __syncthreads();
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


// Work description handed to one worker (one per GPU).
struct DataStruct {
    int     deviceID;     // CUDA device that processes this slice
    int     size;         // number of elements in the slice
    int     offset;       // index of the slice's first element within a / b
    float   *a;           // host pointer to the first input vector
    float   *b;           // host pointer to the second input vector
    float   returnValue;  // out: dot product of the slice, written by routine()
};

/*
 * Thread entry point: computes the dot product of one slice of a and b on
 * the GPU named by data->deviceID and stores it in data->returnValue.
 *
 * pvoidData must point to a DataStruct whose a / b pointers were obtained
 * from cudaHostAlloc with the mapped flag, because they are translated to
 * device pointers with cudaHostGetDevicePointer rather than copied.
 *
 * Always returns 0; CUDA errors are handled by HANDLE_ERROR.
 */
unsigned WINAPI routine(void *pvoidData)
// Alternative signature left commented out in the original
// (presumably for non-Windows thread APIs -- confirm against book.h):
//void* routine(void *pvoidData)
{
    DataStruct *data = (DataStruct*)pvoidData;
    // main() has already selected and configured device 0, so this is only
    // done for the non-zero devices.
    if (data->deviceID != 0) {
        HANDLE_ERROR(cudaSetDevice(data->deviceID));
        // Tell the runtime that this device will use zero-copy (mapped) host
        // memory. Portability does not have to be requested again: it was
        // set when the buffers were allocated under device 0.
        HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    }

    int     size = data->size;
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // host-side buffers: the inputs are shared, the partial sums are local
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    if (partial_c == NULL) {
        fprintf(stderr, "Host memory allocation failed in %s at line %d\n",
                __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    // get device pointers for the mapped host buffers and allocate the
    // per-block partial sums on the GPU
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
                            blocksPerGrid * sizeof(float)));

    // offset 'a' and 'b' to where this GPU gets its data
    dev_a += data->offset;
    dev_b += data->offset;

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
                                            dev_partial_c);
    // A kernel launch returns no status itself; pick up launch errors here.
    HANDLE_ERROR(cudaGetLastError());

    // copy the array of partial sums back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
                            blocksPerGrid * sizeof(float),
                            cudaMemcpyDeviceToHost));

    // finish up on the CPU side
    c = 0.0f;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(partial_c);

    data->returnValue = c;
    return 0;
}


/*
 * Splits an N-element dot product across GPU 0 and GPU 1. Both GPUs read the
 * inputs directly from portable, mapped pinned host memory; GPU 1 is driven
 * from a second host thread while GPU 0 is driven from this one.
 */
int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
               "devices, but only found %d ", deviceCount);
        return 0;
    }

    // both devices that will be used must be able to map host memory
    cudaDeviceProp prop;
    for (int i = 0; i < 2; i++) {
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));
        if (prop.canMapHostMemory != 1) {
            printf("Device %d can not map memory. \n", i);
            return 0;
        }
    }

    float *a, *b;
    HANDLE_ERROR(cudaSetDevice(0));
    HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    /*
     * With device 0 selected, the buffers are allocated with
     * cudaHostAllocPortable; otherwise only device 0 would treat them as
     * pinned memory. Portability is requested only here, at allocation time.
     */
    HANDLE_ERROR(cudaHostAlloc((void**)&a, N * sizeof(float),
                               cudaHostAllocWriteCombined |
                               cudaHostAllocPortable |
                               cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&b, N * sizeof(float),
                               cudaHostAllocWriteCombined |
                               cudaHostAllocPortable |
                               cudaHostAllocMapped));

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    // prepare for multithread: first half to device 0, the rest to device 1
    DataStruct data[2];
    data[0].deviceID = 0;
    data[0].offset = 0;
    data[0].size = N / 2;
    data[0].a = a;
    data[0].b = b;
    data[0].returnValue = 0.0f;

    data[1].deviceID = 1;
    data[1].offset = N / 2;
    // N - N / 2 rather than N / 2, so no element is dropped if N is odd
    data[1].size = N - N / 2;
    data[1].a = a;
    data[1].b = b;
    data[1].returnValue = 0.0f;

    // device 1 runs on a helper thread while device 0 runs on this thread
    CUTThread thread = start_thread(routine, &(data[1]));
    routine(&(data[0]));
    end_thread(thread);


    // free memory on the CPU side
    HANDLE_ERROR(cudaFreeHost(a));
    HANDLE_ERROR(cudaFreeHost(b));

    printf("Value calculated: %f ",
           data[0].returnValue + data[1].returnValue);

    return 0;
}