基于共享内存的位图,项目打包下载
1 /*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15 #include <GLglut.h>
16 #include "cuda.h"
17 #include "cuda_runtime.h"
18 #include "device_launch_parameters.h"
19 #include "cuda.h"
20 #include "../common/book.h"
21 #include "../common/cpu_bitmap.h"
22
23
24 #define DIM 1024
25 #define PI 3.1415926535897932f
26
/*
 * Writes one 4-byte pixel per thread, routing the pixel intensity through
 * block-shared memory.
 *
 * Every thread stores the intensity of its own pixel in a 16x16 __shared__
 * tile, waits at a block-wide barrier, and then outputs the intensity found
 * at the mirrored tile position (15 - threadIdx.x, 15 - threadIdx.y), i.e. a
 * value that was written by a different thread of the same block.
 *
 * The tile is indexed directly with threadIdx, so it only covers blocks of at
 * most 16x16 threads. ptr is written at bytes offset*4 .. offset*4+3, so it
 * must provide 4 bytes for every thread in the grid.
 */
__global__ void kernel(unsigned char *ptr) {
    // scratch tile shared by all threads of the block, one float per thread
    __shared__ float shared[16][16];

    // wavelength of the sine pattern, measured in pixels
    const float period = 128.0f;

    // global pixel coordinates of this thread
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // linear pixel index: blockDim.x * gridDim.x is the number of threads
    // (pixels) in one row of the grid, so this is row-major y * width + x
    int offset = x + y * blockDim.x * gridDim.x;

    // each factor lies in [0, 2]; the scaled product therefore lies in [0, 255]
    const float waveX = sinf(x*2.0f*PI / period) + 1.0f;
    const float waveY = sinf(y*2.0f*PI / period) + 1.0f;
    shared[threadIdx.x][threadIdx.y] = 255 * waveX * waveY / 4.0f;

    // Barrier: every thread of the block must have stored its value before
    // any thread reads a tile element written by another thread. Without it
    // the read below may observe an element that has not been written yet.
    __syncthreads();

    // 4 bytes per pixel; only byte 1 carries the pattern (the green channel,
    // assuming the RGBA layout of cpu_bitmap.h -- confirm against that header)
    ptr[offset * 4 + 0] = 0;
    ptr[offset * 4 + 1] = shared[15 - threadIdx.x][15 - threadIdx.y];
    ptr[offset * 4 + 2] = 0;
    ptr[offset * 4 + 3] = 255;
}
51
// State bundle whose address main() hands to the CPUBitmap constructor;
// main() stores the pointer returned by cudaMalloc in it.
struct DataBlock {
    unsigned char* dev_bitmap;  // device-side pixel buffer
};
56
/*
 * Renders the DIM x DIM pattern on the GPU and shows it in a window.
 *
 * Steps: allocate a device buffer of bitmap.image_size() bytes, launch
 * `kernel` with one thread per pixel, copy the result into the host bitmap,
 * release the device buffer, then hand control to the bitmap's display loop.
 * Every CUDA runtime call, including the kernel launch, is checked through
 * HANDLE_ERROR.
 */
int main(void) {
    DataBlock data;
    CPUBitmap bitmap(DIM, DIM, &data);
    unsigned char *dev_bitmap;

    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,
                            bitmap.image_size()));
    data.dev_bitmap = dev_bitmap;

    // 16x16 threads per block to match the kernel's 16x16 shared tile;
    // DIM is a multiple of 16, so the grid covers the image exactly.
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap);
    // A <<<...>>> launch returns no status; an invalid launch configuration
    // is only reported through cudaGetLastError(), so query it explicitly.
    HANDLE_ERROR(cudaGetLastError());

    // Blocking copy: it completes only after the kernel has finished, and
    // reports any error raised while the kernel was executing.
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap,
                            bitmap.image_size(),
                            cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaFree(dev_bitmap));
    // bitmap was given &data; do not leave a freed address behind in it.
    data.dev_bitmap = NULL;

    bitmap.display_and_exit();
}
kernel 函数中的 __syncthreads()(原文中以加粗红色标出)在去掉和加上时,得到的效果图是不一样的。
取消时:
加上时:
这也说明了加上同步的重要性:每个线程读取的是同一线程块内另一个线程写入的 shared 元素,必须等块内所有线程都写完之后才能读取。
抛砖引玉
这个
int offset = x + y * blockDim.x * gridDim.x;
以及
1 ptr[offset * 4 + 0] = 0;
2 ptr[offset * 4 + 1] = shared[15 - threadIdx.x][15 - threadIdx.y];
3 ptr[offset * 4 + 2] = 0;
4 ptr[offset * 4 + 3] = 255;
如何理解?!