zoukankan      html  css  js  c++  java
  • 0_Simple__simpleLayeredTexture

    二维分层纹理

    ▶ 源代码。用纹理方法把元素按原顺序从 CUDA 3D 数组中取出来,取相反数后加上层号,写入全局内存并输出。

      1 #include <stdio.h>
      2 #include "cuda_runtime.h"
      3 #include "device_launch_parameters.h"
      4 #include <helper_functions.h>
      5 #include <helper_cuda.h>
      6 
      7 #define MIN_EPSILON_ERROR 5e-3f  // tolerance handed to compareData() when main() checks the kernel output
      8 #define OUTPUT 5  // main() prints the first OUTPUT rows x OUTPUT columns of every layer
      9 
     10 texture<float, cudaTextureType2DLayered> tex;  // file-scope 2D layered float texture reference; configured and bound in main(), sampled in transformKernel
     11 
     /**
      * Samples one layer of the file-scope layered texture `tex`, negates the
      * sampled value, adds the layer index and stores the result in g_odata.
      *
      * Launch layout: 2D grid of 2D blocks, one thread per (x, y) output element.
      * Threads that fall outside width x height return without touching memory,
      * so the grid may over-cover the image.
      *
      * @param g_odata  output buffer; element (layer, y, x) is written at
      *                 layer*width*height + y*width + x
      * @param width    number of columns in one layer
      * @param height   number of rows in one layer
      * @param layer    index of the texture layer to sample and of the output slice to fill
      */
     __global__ void transformKernel(float *g_odata, int width, int height, int layer)
     {
         unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
         unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

         // Guard against grids that are larger than the image (e.g. ceil-div launches).
         if (x >= (unsigned int)width || y >= (unsigned int)height)
             return;

         // Coordinates are divided by the image size, i.e. expressed in [0, 1);
         // the +0.5f shifts the sample point to the centre of texel (x, y).
         float u = (x + 0.5f) / (float)width;
         float v = (y + 0.5f) / (float)height;

         // Compute the flat index in size_t so large layer stacks cannot wrap 32-bit arithmetic.
         size_t idx = (size_t)layer * width * height + (size_t)y * width + x;
         g_odata[idx] = - tex2DLayered(tex, u, v, layer) + layer;
     }
     22 
     /**
      * Prints `title` followed by the first OUTPUT rows x OUTPUT columns of every
      * layer of `data`. `data` is indexed as layer*width*height + row*width + col.
      */
     static void printLayerCorners(const char *title, const float *data,
                                   unsigned int width, unsigned int height, unsigned int num_layers)
     {
         printf("\n\t%s\n\t", title);
         for (unsigned int layer = 0; layer < num_layers; layer++)
         {
             for (unsigned int row = 0; row < OUTPUT; row++)
             {
                 for (unsigned int col = 0; col < OUTPUT; col++)
                     printf("%2.1f ", data[layer*width*height + row*width + col]);
                 printf("\n\t");
             }
             printf("\n\t");
         }
     }

     /**
      * Host driver for the layered-texture demo.
      *
      * Fills a width x height x num_layers host buffer, builds the expected result
      * (-value + layer) on the CPU, copies the input into a layered CUDA array,
      * binds it to the texture reference `tex`, launches transformKernel once per
      * layer, then copies the result back and either writes a regression file
      * (when the "regression" command-line flag is present) or compares it with
      * the CPU reference.
      *
      * Every CUDA runtime call is wrapped in checkCudaErrors (helper_cuda.h) so a
      * failure is reported at the call that caused it instead of surfacing later.
      *
      * NOTE(review): the texture-reference API used here (texture<>,
      * cudaBindTextureToArray) is deprecated and was removed in CUDA 12; moving to
      * texture objects would also require changing the kernel signature.
      *
      * @return 0 on success, 1 if host memory could not be allocated.
      */
     int main(int argc, char **argv)
     {
         const unsigned int width = 512, height = 512, num_layers = 5;
         const unsigned int num_elems = width * height * num_layers;
         const size_t size = (size_t)num_elems * sizeof(float);

         float *h_data = (float *)malloc(size);
         float *h_data_ref = (float *)malloc(size);
         if (h_data == NULL || h_data_ref == NULL)
         {
             fprintf(stderr, "Host memory allocation failed\n");
             free(h_data);
             free(h_data_ref);
             return 1;
         }
         float *d_data = NULL;
         checkCudaErrors(cudaMalloc((void **)&d_data, size));

         // Input: every layer holds 0 .. width*height-1; reference: -input + layer.
         for (unsigned int layer = 0; layer < num_layers; layer++)
         {
             for (unsigned int i = 0; i < width * height; i++)
             {
                 const unsigned int idx = layer*width*height + i;
                 h_data[idx] = (float)i;
                 h_data_ref[idx] = - h_data[idx] + layer;
             }
         }

         printLayerCorners("Input data", h_data, width, height, num_layers);
         printLayerCorners("Ideal output data", h_data_ref, width, height, num_layers);

         // Allocate the layered CUDA array and copy the host data into it.
         cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
         cudaArray *cu_3darray = NULL;
         checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
         cudaMemcpy3DParms myparms = { 0 };
         myparms.srcPos = make_cudaPos(0, 0, 0);
         myparms.dstPos = make_cudaPos(0, 0, 0);
         myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
         myparms.dstArray = cu_3darray;
         myparms.extent = make_cudaExtent(width, height, num_layers);
         myparms.kind = cudaMemcpyHostToDevice;
         checkCudaErrors(cudaMemcpy3D(&myparms));

         // Configure the texture reference and bind it to the array.
         tex.addressMode[0] = cudaAddressModeWrap;
         tex.addressMode[1] = cudaAddressModeWrap;
         tex.filterMode = cudaFilterModeLinear;
         tex.normalized = true;
         checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

         // width and height are exact multiples of the 8 x 8 block, so plain
         // division covers the whole image.
         dim3 dimBlock(8, 8, 1);
         dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
         printf("Covering 2D data of %u * %u * %u: Grid size is %u x %u, each block has 8 x 8 threads\n", width, height, num_layers, dimGrid.x, dimGrid.y);

         // Warm-up launch so one-time start-up cost is not included in the timing.
         transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0);
         checkCudaErrors(cudaGetLastError());
         checkCudaErrors(cudaDeviceSynchronize());

         StopWatchInterface *timer = NULL;
         sdkCreateTimer(&timer);
         sdkStartTimer(&timer);

         // One kernel launch per layer.
         for (unsigned int layer = 0; layer < num_layers; layer++)
         {
             transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer);
             checkCudaErrors(cudaGetLastError());   // report bad launch configurations immediately
         }
         checkCudaErrors(cudaDeviceSynchronize());  // wait for completion and surface execution errors

         sdkStopTimer(&timer);
         const float elapsed_ms = sdkGetTimerValue(&timer);  // read once so both printed figures agree
         printf("\nTime: %.3f msec, %.2f Mtexlookups/sec\n", elapsed_ms, (num_elems / (elapsed_ms / 1000.0f) / 1e6));
         sdkDeleteTimer(&timer);

         // Copy the result back and verify it.
         memset(h_data, 0, size);
         checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
         if (checkCmdLineFlag(argc, (const char **)argv, "regression"))
             // One layer's worth of elements; the original passed width * width,
             // which is only correct because the image is square.
             sdkWriteFile<float>("./data/regression.dat", h_data, width * height, 0.0f, false);
         else
             printf("Comparing kernel output to expected data return %d\n", compareData(h_data, h_data_ref, num_elems, MIN_EPSILON_ERROR, 0.0f));

         printLayerCorners("Actual output data", h_data, width, height, num_layers);

         free(h_data);
         free(h_data_ref);
         checkCudaErrors(cudaFree(d_data));
         checkCudaErrors(cudaFreeArray(cu_3darray));

         getchar();  // keep the console window open until a key is pressed
         return 0;
     }

    ▶ 输出结果

        Input data
        0.0 1.0 2.0 3.0 4.0
        512.0 513.0 514.0 515.0 516.0
        1024.0 1025.0 1026.0 1027.0 1028.0
        1536.0 1537.0 1538.0 1539.0 1540.0
        2048.0 2049.0 2050.0 2051.0 2052.0
    
        0.0 1.0 2.0 3.0 4.0
        512.0 513.0 514.0 515.0 516.0
        1024.0 1025.0 1026.0 1027.0 1028.0
        1536.0 1537.0 1538.0 1539.0 1540.0
        2048.0 2049.0 2050.0 2051.0 2052.0
    
        0.0 1.0 2.0 3.0 4.0
        512.0 513.0 514.0 515.0 516.0
        1024.0 1025.0 1026.0 1027.0 1028.0
        1536.0 1537.0 1538.0 1539.0 1540.0
        2048.0 2049.0 2050.0 2051.0 2052.0
    
        0.0 1.0 2.0 3.0 4.0
        512.0 513.0 514.0 515.0 516.0
        1024.0 1025.0 1026.0 1027.0 1028.0
        1536.0 1537.0 1538.0 1539.0 1540.0
        2048.0 2049.0 2050.0 2051.0 2052.0
    
        0.0 1.0 2.0 3.0 4.0
        512.0 513.0 514.0 515.0 516.0
        1024.0 1025.0 1026.0 1027.0 1028.0
        1536.0 1537.0 1538.0 1539.0 1540.0
        2048.0 2049.0 2050.0 2051.0 2052.0
    
    
        Ideal output data
        0.0 -1.0 -2.0 -3.0 -4.0
        -512.0 -513.0 -514.0 -515.0 -516.0
        -1024.0 -1025.0 -1026.0 -1027.0 -1028.0
        -1536.0 -1537.0 -1538.0 -1539.0 -1540.0
        -2048.0 -2049.0 -2050.0 -2051.0 -2052.0
    
        1.0 0.0 -1.0 -2.0 -3.0
        -511.0 -512.0 -513.0 -514.0 -515.0
        -1023.0 -1024.0 -1025.0 -1026.0 -1027.0
        -1535.0 -1536.0 -1537.0 -1538.0 -1539.0
        -2047.0 -2048.0 -2049.0 -2050.0 -2051.0
    
        2.0 1.0 0.0 -1.0 -2.0
        -510.0 -511.0 -512.0 -513.0 -514.0
        -1022.0 -1023.0 -1024.0 -1025.0 -1026.0
        -1534.0 -1535.0 -1536.0 -1537.0 -1538.0
        -2046.0 -2047.0 -2048.0 -2049.0 -2050.0
    
        3.0 2.0 1.0 0.0 -1.0
        -509.0 -510.0 -511.0 -512.0 -513.0
        -1021.0 -1022.0 -1023.0 -1024.0 -1025.0
        -1533.0 -1534.0 -1535.0 -1536.0 -1537.0
        -2045.0 -2046.0 -2047.0 -2048.0 -2049.0
    
        4.0 3.0 2.0 1.0 0.0
        -508.0 -509.0 -510.0 -511.0 -512.0
        -1020.0 -1021.0 -1022.0 -1023.0 -1024.0
        -1532.0 -1533.0 -1534.0 -1535.0 -1536.0
        -2044.0 -2045.0 -2046.0 -2047.0 -2048.0
    
        Covering 2D data of 512 * 512 * 5: Grid size is 64 x 64, each block has 8 x 8 threads
    
    Time: 0.995 msec, 1317.00 Mtexlookups/sec
    Comparing kernel output to expected data return 1
    
        Actual output data
        0.0 -1.0 -2.0 -3.0 -4.0
        -512.0 -513.0 -514.0 -515.0 -516.0
        -1024.0 -1025.0 -1026.0 -1027.0 -1028.0
        -1536.0 -1537.0 -1538.0 -1539.0 -1540.0
        -2048.0 -2049.0 -2050.0 -2051.0 -2052.0
    
        1.0 0.0 -1.0 -2.0 -3.0
        -511.0 -512.0 -513.0 -514.0 -515.0
        -1023.0 -1024.0 -1025.0 -1026.0 -1027.0
        -1535.0 -1536.0 -1537.0 -1538.0 -1539.0
        -2047.0 -2048.0 -2049.0 -2050.0 -2051.0
    
        2.0 1.0 0.0 -1.0 -2.0
        -510.0 -511.0 -512.0 -513.0 -514.0
        -1022.0 -1023.0 -1024.0 -1025.0 -1026.0
        -1534.0 -1535.0 -1536.0 -1537.0 -1538.0
        -2046.0 -2047.0 -2048.0 -2049.0 -2050.0
    
        3.0 2.0 1.0 0.0 -1.0
        -509.0 -510.0 -511.0 -512.0 -513.0
        -1021.0 -1022.0 -1023.0 -1024.0 -1025.0
        -1533.0 -1534.0 -1535.0 -1536.0 -1537.0
        -2045.0 -2046.0 -2047.0 -2048.0 -2049.0
    
        4.0 3.0 2.0 1.0 0.0
        -508.0 -509.0 -510.0 -511.0 -512.0
        -1020.0 -1021.0 -1022.0 -1023.0 -1024.0
        -1532.0 -1533.0 -1534.0 -1535.0 -1536.0
        -2044.0 -2045.0 -2046.0 -2047.0 -2048.0

    ▶ 涨姿势

    ● 与前面立方体贴图纹理不同的地方:申请 CUDA 3D 数组的时候使用标志 cudaArrayLayered 而不是 cudaArrayCubemap,并注意调整相关的维度参数。

  • 相关阅读:
    jvm性能监控(4)–JVM的监控工具Jconsole
    jvm性能监控(3)-jdk自带工具 jps jstack jmap
    jvm性能监控(2)–JVM的监控工具jstat
    jvm(1)性能监控-linux相关命令
    jvm学习(5) 对象的创建与结构
    jvm(4) 对象创建
    jvm学习(3)方法区、堆、对象存储位置
    jvm学习(2)JVM内存说明
    java泛型
    windows下部署spring boot 的jar
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7881977.html
Copyright © 2011-2022 走看看