光线跟踪通过全局内存和常量内存实现,项目打包下载
常量内存(本段代码使用 __constant__ 数组和 cudaMemcpyToSymbol)
1 /*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15
16 #include <GLglut.h>
17 #include "cuda.h"
18 #include "../common/book.h"
19 #include "../common/cpu_bitmap.h"
20 #include "cuda_runtime.h"
21 #include "device_launch_parameters.h"
22 #include <math.h>
23 #define DIM 512
24
25 #define rnd( x ) (x * rand() / RAND_MAX)
26 #define INF 2e10f
27
// A sphere described by a color, a radius and a center position.
struct Sphere {
    float r, b, g;   // color channels (declared in r, b, g order)
    float radius;    // sphere radius
    float x, y, z;   // center of the sphere

    // Intersects the sphere with a ray through (ox, oy) running along the z axis.
    //
    // ox, oy : ray position in the image plane.
    // n      : out-parameter; on a hit it receives dz / sqrtf(radius * radius),
    //          i.e. the hit point's z offset from the center relative to the
    //          radius. It is NOT written on a miss.
    //
    // Returns the z coordinate of the hit point (dz + z) when (ox, oy) lies
    // strictly inside the sphere's projected circle, otherwise -INF.
    __device__ float hit(float ox, float oy, float *n) {
        float offX = ox - x;
        float offY = oy - y;

        // Miss: the ray falls on or outside the projected circle.
        if (!(offX * offX + offY * offY < radius * radius)) {
            return -INF;
        }

        // z distance from the sphere's center plane to the surface at this (x, y).
        float depth = sqrtf(radius * radius - offX * offX - offY * offY);
        *n = depth / sqrtf(radius * radius);
        return depth + z;
    }
};
43 #define SPHERES 100
44
45 __constant__ Sphere s[SPHERES];
46
// Renders one pixel per thread from the spheres stored in the __constant__
// array `s`.
//
// ptr : device output buffer; 4 bytes (R, G, B, 255) are written per pixel at
//       byte offset 4 * (x + y * DIM). The caller must supply at least
//       DIM * DIM * 4 bytes.
//
// Expects a 2D launch whose grid covers the DIM x DIM image; threads that fall
// outside the image return without writing.
__global__ void kernel(unsigned char *ptr) {
    // map from threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against launches that overshoot the image so no thread writes
    // past the end of the buffer.
    if (x >= DIM || y >= DIM) {
        return;
    }

    // Row stride is the image width; this equals blockDim.x * gridDim.x for
    // the (DIM/16 x DIM/16) grid of 16x16 blocks used by main().
    int offset = x + y * DIM;

    // Shift coordinates so that (0, 0) is the image center.
    float ox = (x - DIM / 2);
    float oy = (y - DIM / 2);

    float r = 0, g = 0, b = 0;
    float maxz = -INF;
    // Keep the color of the hit with the largest z seen so far.
    for (int i = 0; i < SPHERES; i++) {
        float n;  // only written by hit() on an actual hit
        float t = s[i].hit(ox, oy, &n);
        if (t > maxz) {
            // t > maxz can only hold after a hit (a miss returns -INF),
            // so n is initialized whenever it is read here.
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (int)(r * 255);
    ptr[offset * 4 + 1] = (int)(g * 255);
    ptr[offset * 4 + 2] = (int)(b * 255);
    ptr[offset * 4 + 3] = 255;
}
74
// State bundle whose address main() hands to the CPUBitmap constructor.
struct DataBlock {
    unsigned char* dev_bitmap;  // pointer slot for a bitmap buffer
};
79
// Builds a random scene of SPHERES spheres, uploads it to the __constant__
// array `s`, renders it into a DIM x DIM bitmap on the GPU, prints the elapsed
// time and displays the image.
//
// Note on the timing: the start event is recorded before any allocation, so
// the reported time covers device/host allocation, host-side scene generation,
// the host-to-device copy, the kernel and the device-to-host copy -- not the
// kernel alone.
int main(void) {
    DataBlock data;
    // capture the start time
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    CPUBitmap bitmap(DIM, DIM, &data);
    unsigned char *dev_bitmap;

    // allocate memory on the GPU for the output bitmap
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));

    // allocate temp memory, initialize it, copy to constant
    // memory on the GPU, then free our temp memory
    Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
    if (temp_s == NULL) {
        fprintf(stderr, "Host allocation for %d spheres failed\n", SPHERES);
        return 1;
    }
    for (int i = 0; i < SPHERES; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }
    // Place the SPHERES sphere objects in constant memory; a __constant__
    // symbol is filled with cudaMemcpyToSymbol.
    HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere) * SPHERES));
    free(temp_s);

    // generate a bitmap from our sphere data
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap);
    // A kernel launch returns no status; pick up launch errors explicitly.
    HANDLE_ERROR(cudaGetLastError());

    // copy our bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));

    // get stop time, and display the timing results
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    // The original literal contained a raw line break instead of "\n".
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    HANDLE_ERROR(cudaFree(dev_bitmap));

    // display
    bitmap.display_and_exit();
}
全局内存(本段代码使用 cudaMalloc 分配球体数组并作为核函数参数传入):
1 /*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15 #include <GLglut.h>
16 #include "cuda.h"
17 #include "cuda_runtime.h"
18 #include "device_launch_parameters.h"
19 #include "../common/book.h"
20 #include "../common/cpu_bitmap.h"
21 #include "device_functions.h"
22
23 #define DIM 512
24
25 #define rnd( x ) (x * rand() / RAND_MAX)
26 #define INF 2e10f
27
// A sphere described by a color, a radius and a center position.
struct Sphere {
    float r, b, g;   // color channels (declared in r, b, g order)
    float radius;    // sphere radius
    float x, y, z;   // center of the sphere

    // Intersects the sphere with a ray through (ox, oy) running along the z axis.
    //
    // ox, oy : ray position in the image plane.
    // n      : out-parameter, written only on a hit (see below).
    //
    // Returns the z coordinate of the hit point when (ox, oy) lies strictly
    // inside the sphere's projected circle, otherwise -INF.
    __device__ float hit(float ox, float oy, float *n) {
        // Offset of the ray from the sphere center within the image plane.
        float offX = ox - x;
        float offY = oy - y;

        // Only rays that land inside the projected circle are handled.
        if (!(offX * offX + offY * offY < radius * radius)) {
            return -INF;
        }

        // z distance from the sphere's center plane to its surface at this
        // (x, y) position.
        float depth = sqrtf(radius * radius - offX * offX - offY * offY);

        // Ratio of that distance to the radius (float division). The kernel in
        // this file multiplies the sphere color by this value, which yields
        // the brightness gradient across the sphere.
        *n = depth / sqrtf(radius * radius);

        // The image is the xy plane and z is perpendicular to it; adding the
        // center's z gives the z coordinate of the hit point.
        return depth + z;
    }
};
58 #define SPHERES 100
59
60
// Renders one pixel per thread from an array of spheres passed by pointer.
//
// s   : device pointer to SPHERES Sphere objects.
// ptr : device output buffer; 4 bytes (R, G, B, 255) are written per pixel at
//       byte offset 4 * (x + y * DIM). The caller must supply at least
//       DIM * DIM * 4 bytes.
//
// Expects a 2D launch whose grid covers the DIM x DIM image; threads that fall
// outside the image return without writing.
__global__ void kernel(Sphere *s, unsigned char *ptr) {
    // Map the thread to its pixel position in the image.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against launches that overshoot the image so no thread writes
    // past the end of the buffer.
    if (x >= DIM || y >= DIM) {
        return;
    }

    // Row stride is the image width; this equals blockDim.x * gridDim.x for
    // the (DIM/16 x DIM/16) grid of 16x16 blocks used by main().
    int offset = x + y * DIM;

    // Shift coordinates so that the z axis passes through the image center.
    float ox = (x - DIM / 2);
    float oy = (y - DIM / 2);

    float r = 0, g = 0, b = 0;
    float maxz = -INF;
    // For this pixel, test every one of the SPHERES spheres in turn.
    for (int i = 0; i < SPHERES; i++) {
        float n;  // only written by hit() on an actual hit
        float t = s[i].hit(ox, oy, &n);
        // The hit with the largest z is the one shown; hits with a smaller z
        // are treated as hidden behind it.
        if (t > maxz) {
            // hit() stored the shading factor through &n; t > maxz can only
            // hold after a hit (a miss returns -INF), so n is initialized here.
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (int)(r * 255);
    ptr[offset * 4 + 1] = (int)(g * 255);
    ptr[offset * 4 + 2] = (int)(b * 255);
    ptr[offset * 4 + 3] = 255;
}
92
93
// State bundle whose address main() hands to the CPUBitmap constructor.
struct DataBlock {
    unsigned char* dev_bitmap;  // pointer slot for a bitmap buffer
    Sphere*        s;           // pointer slot for a sphere array
};
99
// Builds a random scene of SPHERES spheres, copies it into a cudaMalloc'ed
// device array, renders it into a DIM x DIM bitmap on the GPU, prints the
// elapsed time and displays the image.
//
// Note on the timing: the start event is recorded before any allocation, so
// the reported time covers device/host allocation, host-side scene generation,
// the host-to-device copy, the kernel and the device-to-host copy -- not the
// kernel alone.
int main(void) {
    DataBlock data;
    // capture the start time
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    CPUBitmap bitmap(DIM, DIM, &data);
    unsigned char *dev_bitmap;
    Sphere *s;

    // allocate memory on the GPU for the output bitmap
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));
    // allocate the sphere array on the device with cudaMalloc
    HANDLE_ERROR(cudaMalloc((void**)&s,
        sizeof(Sphere) * SPHERES));

    // allocate host staging memory for the spheres
    Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
    if (temp_s == NULL) {
        fprintf(stderr, "Host allocation for %d spheres failed\n", SPHERES);
        return 1;
    }
    for (int i = 0; i < SPHERES; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }
    HANDLE_ERROR(cudaMemcpy(s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice));
    free(temp_s);

    // generate a bitmap from our sphere data
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(s, dev_bitmap);
    // A kernel launch returns no status; pick up launch errors explicitly.
    HANDLE_ERROR(cudaGetLastError());

    // copy our bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));

    // get stop time, and display the timing results
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));
    // The original literal contained a raw line break instead of "\n".
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    HANDLE_ERROR(cudaFree(dev_bitmap));
    HANDLE_ERROR(cudaFree(s));

    // display
    bitmap.display_and_exit();
}
两者的结果为
全局内存
常量内存
但是问题是我的性能怎么就没有提升呢?请大侠看到了,给小弟指导下。E-mail:lianglianghelloworld@gmail.com。期待大牛的回复。