zoukankan html css js c++ java

光线跟踪=全局内存+常量内存

光线跟踪通过全局内存和常量内存实现，项目打包下载

常量内存

  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 
 16 #include <GL/glut.h>
 17 #include "cuda.h"
 18 #include "../common/book.h"
 19 #include "../common/cpu_bitmap.h"
 20 #include "cuda_runtime.h"
 21 #include "device_launch_parameters.h"
 22 #include <math.h>
 23 #define DIM 512
 24 
 25 #define rnd( x ) (x * rand() / RAND_MAX)
 26 #define INF     2e10f
 27 
 28 struct Sphere {
 29     float   r, b, g;
 30     float   radius;
 31     float   x, y, z;
 32     __device__ float hit(float ox, float oy, float *n) {
 33         float dx = ox - x;
 34         float dy = oy - y;
 35         if (dx*dx + dy*dy < radius*radius) {
 36             float dz = sqrtf(radius*radius - dx*dx - dy*dy);
 37             *n = dz / sqrtf(radius * radius);
 38             return dz + z;
 39         }
 40         return -INF;
 41     }
 42 };
 43 #define SPHERES 100
 44 
 45 __constant__ Sphere s[SPHERES];
 46 
 47 __global__ void kernel(unsigned char *ptr) {
 48     // map from threadIdx/BlockIdx to pixel position
 49     int x = threadIdx.x + blockIdx.x * blockDim.x;
 50     int y = threadIdx.y + blockIdx.y * blockDim.y;
 51     int offset = x + y * blockDim.x * gridDim.x;
 52     float   ox = (x - DIM / 2);
 53     float   oy = (y - DIM / 2);
 54 
 55     float   r = 0, g = 0, b = 0;
 56     float   maxz = -INF;
 57     for (int i = 0; i<SPHERES; i++) {
 58         float   n;
 59         float   t = s[i].hit(ox, oy, &n);
 60         if (t > maxz) {
 61             float fscale = n;
 62             r = s[i].r * fscale;
 63             g = s[i].g * fscale;
 64             b = s[i].b * fscale;
 65             maxz = t;
 66         }
 67     }
 68 
 69     ptr[offset * 4 + 0] = (int)(r * 255);
 70     ptr[offset * 4 + 1] = (int)(g * 255);
 71     ptr[offset * 4 + 2] = (int)(b * 255);
 72     ptr[offset * 4 + 3] = 255;
 73 }
 74 
 75 // globals needed by the update routine
 76 struct DataBlock {
 77     unsigned char   *dev_bitmap;
 78 };
 79 
 80 int main(void) {
 81     DataBlock   data;
 82     // capture the start time
 83     cudaEvent_t     start, stop;
 84     HANDLE_ERROR(cudaEventCreate(&start));
 85     HANDLE_ERROR(cudaEventCreate(&stop));
 86     HANDLE_ERROR(cudaEventRecord(start, 0));
 87 
 88     CPUBitmap bitmap(DIM, DIM, &data);
 89     unsigned char   *dev_bitmap;
 90 
 91     // allocate memory on the GPU for the output bitmap
 92     HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));
 93 
 94     // allocate temp memory, initialize it, copy to constant
 95     // memory on the GPU, then free our temp memory
 96     Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere)* SPHERES);
 97     for (int i = 0; i<SPHERES; i++) {
 98         temp_s[i].r = rnd(1.0f);
 99         temp_s[i].g = rnd(1.0f);
100         temp_s[i].b = rnd(1.0f);
101         temp_s[i].x = rnd(1000.0f) - 500;
102         temp_s[i].y = rnd(1000.0f) - 500;
103         temp_s[i].z = rnd(1000.0f) - 500;
104         temp_s[i].radius = rnd(100.0f) + 20;
105     }
106     /*
107     将SPHERES个球面对象存放在常量内存中
108     通过cudaMemcpyToSymbol来操作
109     */
110     HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)* SPHERES));
111     free(temp_s);
112 
113     // generate a bitmap from our sphere data
114     dim3    grids(DIM / 16, DIM / 16);
115     dim3    threads(16, 16);
116     kernel <<<grids, threads >>>(dev_bitmap);
117 
118     // copy our bitmap back from the GPU for display
119     HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));
120 
121     // get stop time, and display the timing results
122     HANDLE_ERROR(cudaEventRecord(stop, 0));
123     HANDLE_ERROR(cudaEventSynchronize(stop));
124     float   elapsedTime;
125     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
126     printf("Time to generate:  %3.1f ms
", elapsedTime);
127 
128     HANDLE_ERROR(cudaEventDestroy(start));
129     HANDLE_ERROR(cudaEventDestroy(stop));
130 
131     HANDLE_ERROR(cudaFree(dev_bitmap));
132 
133     // display
134     bitmap.display_and_exit();
135 }

全局内存：

  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 #include <GL/glut.h>
 16 #include "cuda.h"
 17 #include "cuda_runtime.h"
 18 #include "device_launch_parameters.h"
 19 #include "../common/book.h"
 20 #include "../common/cpu_bitmap.h"
 21 #include "device_functions.h"
 22 
 23 #define DIM 512
 24 
 25 #define rnd( x ) (x * rand() / RAND_MAX)
 26 #define INF     2e10f
 27 
 28 struct Sphere {
 29     float   r, b, g;
 30     float   radius;
 31     float   x, y, z;
 32     __device__ float hit(float ox, float oy, float *n) {
 33         //将中心坐标移动到图像中间，dx和dy就是相对于新的中心坐标ox和oy的新坐标
 34         float dx = ox - x;
 35         float dy = oy - y;
 36         //只处理点在圆内的Sphere对象
 37         if (dx*dx + dy*dy < radius*radius) {
 38             /*
 39             计算的dz为离圆心轴的距离
 40             */
 41             float dz = sqrtf(radius*radius - dx*dx - dy*dy);
 42             /*
 43             和半径相除，由于dz是float类型，所以结果也为float类型
 44             也就是说结果为0.xxx这样的数字，
 45             n为一个指针，*n为解析指针n，存放的也就是值0.xxx这样的值
 46             呈现在最后的结果就是图像颜色的渐变效果
 47             */
 48             *n = dz / sqrtf(radius * radius);
 49             /*
 50             在三维空间中，已经将xoy投影到为图上，z轴垂直于位图
 51             距离圆心的距离dz再加上原来的Z轴坐标就是当前坐标对应于xoy面的Z轴方向距离
 52             */
 53             return dz + z;
 54         }
 55         return -INF;
 56     }
 57 };
 58 #define SPHERES 100
 59 
 60 
 61 __global__ void kernel(Sphere *s, unsigned char *ptr) {
 62     // 映射到图像像素上的位置
 63     int x = threadIdx.x + blockIdx.x * blockDim.x;
 64     int y = threadIdx.y + blockIdx.y * blockDim.y;
 65     int offset = x + y * blockDim.x * gridDim.x;//步长
 66     //移动使得Z轴在图像中心
 67     float   ox = (x - DIM / 2);
 68     float   oy = (y - DIM / 2);
 69 
 70     float   r = 0, g = 0, b = 0;
 71     float   maxz = -INF;
 72     //每个像素递归判断SPHERES个对象在这个像素点上的值
 73     for (int i = 0; i<SPHERES; i++) {
 74         float   n;
 75         float   t = s[i].hit(ox, oy, &n);
 76         //这里垂直于屏幕的坐标朝里为负，朝外为正，因此选择最大的那个距离显示颜色，距离小的我们认为看不见
 77         if (t > maxz) {
 78             //这里取n的地址，hit函数将结果存放在&n地址所指的空间，不同的n对应不同的颜色及深度
 79             float fscale = n;
 80             r = s[i].r * fscale;
 81             g = s[i].g * fscale;
 82             b = s[i].b * fscale;
 83             maxz = t;
 84         }
 85     }
 86 
 87     ptr[offset * 4 + 0] = (int)(r * 255);
 88     ptr[offset * 4 + 1] = (int)(g * 255);
 89     ptr[offset * 4 + 2] = (int)(b * 255);
 90     ptr[offset * 4 + 3] = 255;
 91 }
 92 
 93 
 94 // globals needed by the update routine
 95 struct DataBlock {
 96     unsigned char   *dev_bitmap;
 97     Sphere          *s;
 98 };
 99 
/*
 * Builds SPHERES random spheres, uploads them to a cudaMalloc'ed (global
 * memory) array, renders them with `kernel` into a DIM x DIM bitmap,
 * prints the elapsed time and displays the image.
 *
 * Note on the timing: the start event is recorded before the bitmap is
 * constructed and before any allocation or copy, and the stop event after
 * the device-to-host copy. The printed figure therefore covers allocation,
 * host-side scene generation and both transfers, not only the kernel.
 */
int main(void) {
    DataBlock   data;
    // capture the start time
    cudaEvent_t     start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    CPUBitmap bitmap(DIM, DIM, &data);
    unsigned char   *dev_bitmap;
    Sphere          *s;

    // allocate memory on the GPU for the output bitmap
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));
    // allocate the sphere array in global memory
    HANDLE_ERROR(cudaMalloc((void**)&s,
        sizeof(Sphere) * SPHERES));

    // allocate staging storage on the host
    Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
    if (temp_s == NULL) {
        // Without this check the initialisation loop below would write
        // through a null pointer.
        printf("Failed to allocate host memory for %d spheres\n", SPHERES);
        return 1;
    }
    for (int i = 0; i < SPHERES; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }
    HANDLE_ERROR(cudaMemcpy(s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice));
    free(temp_s);

    // generate a bitmap from our sphere data
    dim3    grids(DIM / 16, DIM / 16);
    dim3    threads(16, 16);
    kernel<<<grids, threads>>>(s, dev_bitmap);
    // A kernel launch returns no status itself; pick up launch errors here.
    HANDLE_ERROR(cudaGetLastError());

    // copy our bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));

    // get stop time, and display the timing results
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float   elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));
    printf("Time to generate:  %3.1f ms\n", elapsedTime);

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    HANDLE_ERROR(cudaFree(dev_bitmap));
    HANDLE_ERROR(cudaFree(s));

    // display
    bitmap.display_and_exit();
}

两者的结果为

全局内存

常量内存

但是问题是我的性能怎么就没有提升呢？请大侠看到了，给小弟指导下。E-mail:lianglianghelloworld@gmail.com。期待大牛的回复。

查看全文

相关阅读:
Netty实现Http客户端
 Netty实现Http服务端
 Netty实现Tcp客户端
 Netty实现Tcp服务端
 MySQL进阶系列：一文详解explain
spring boot 获取运行时的yml里的active配置
 eureka 注册中心添加认证
 zuul 负载
 jenkins spring cloud
秒杀系统如何设计？

原文地址：https://www.cnblogs.com/liangliangdetianxia/p/3988626.html