zoukankan      html  css  js  c++  java
  • 零内存拷贝和普通拷贝对比

    下载链接

      1 /*
      2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
      3 *
      4 * NVIDIA Corporation and its licensors retain all intellectual property and
      5 * proprietary rights in and to this software and related documentation.
      6 * Any use, reproduction, disclosure, or distribution of this software
      7 * and related documentation without an express license agreement from
      8 * NVIDIA Corporation is strictly prohibited.
      9 *
     10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
     11 * associated with this source code for terms and conditions that govern
     12 * your use of this NVIDIA software.
     13 *
     14 */
     15 
     16 
     17 #include "../common/book.h"
     18 #include "cuda.h"
     19 #include "cuda_runtime.h"
     20 #include "device_launch_parameters.h"
     21 #include "device_functions.h"
     22 #define imin(a,b) (a<b?a:b)
     23 
     24 const int N = 33 * 1024 * 1024;
     25 const int threadsPerBlock = 256;
     26 const int blocksPerGrid =
     27 imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
     28 
     29 
     30 __global__ void dot(int size, float *a, float *b, float *c) {
     31     __shared__ float cache[threadsPerBlock];
     32     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     33     int cacheIndex = threadIdx.x;
     34 
     35     float   temp = 0;
     36     while (tid < size) {
     37         temp += a[tid] * b[tid];
     38         tid += blockDim.x * gridDim.x;
     39     }
     40 
     41     // set the cache values
     42     cache[cacheIndex] = temp;
     43 
     44     // synchronize threads in this block
     45     __syncthreads();
     46 
     47     // for reductions, threadsPerBlock must be a power of 2
     48     // because of the following code
     49     int i = blockDim.x / 2;
     50     while (i != 0) {
     51         if (cacheIndex < i)
     52             cache[cacheIndex] += cache[cacheIndex + i];
     53         __syncthreads();
     54         i /= 2;
     55     }
     56 
     57     if (cacheIndex == 0)
     58         c[blockIdx.x] = cache[0];
     59 }
     60 
     61 
     62 float malloc_test(int size) {
     63     cudaEvent_t     start, stop;
     64     float           *a, *b, c, *partial_c;
     65     float           *dev_a, *dev_b, *dev_partial_c;
     66     float           elapsedTime;
     67 
     68     HANDLE_ERROR(cudaEventCreate(&start));
     69     HANDLE_ERROR(cudaEventCreate(&stop));
     70 
     71     // allocate memory on the CPU side
     72     a = (float*)malloc(size*sizeof(float));
     73     b = (float*)malloc(size*sizeof(float));
     74     partial_c = (float*)malloc(blocksPerGrid*sizeof(float));
     75 
     76     // allocate the memory on the GPU
     77     HANDLE_ERROR(cudaMalloc((void**)&dev_a,
     78         size*sizeof(float)));
     79     HANDLE_ERROR(cudaMalloc((void**)&dev_b,
     80         size*sizeof(float)));
     81     HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
     82         blocksPerGrid*sizeof(float)));
     83 
     84     // fill in the host memory with data
     85     for (int i = 0; i<size; i++) {
     86         a[i] = i;
     87         b[i] = i * 2;
     88     }
     89 
     90     HANDLE_ERROR(cudaEventRecord(start, 0));
     91     // copy the arrays 'a' and 'b' to the GPU
     92     HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float),
     93         cudaMemcpyHostToDevice));
     94     HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float),
     95         cudaMemcpyHostToDevice));
     96 
     97     dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b,
     98         dev_partial_c);
     99     // copy the array 'c' back from the GPU to the CPU
    100     HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
    101         blocksPerGrid*sizeof(float),
    102         cudaMemcpyDeviceToHost));
    103 
    104     HANDLE_ERROR(cudaEventRecord(stop, 0));
    105     HANDLE_ERROR(cudaEventSynchronize(stop));
    106     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
    107         start, stop));
    108 
    109     // finish up on the CPU side
    110     c = 0;
    111     for (int i = 0; i<blocksPerGrid; i++) {
    112         c += partial_c[i];
    113     }
    114 
    115     HANDLE_ERROR(cudaFree(dev_a));
    116     HANDLE_ERROR(cudaFree(dev_b));
    117     HANDLE_ERROR(cudaFree(dev_partial_c));
    118 
    119     // free memory on the CPU side
    120     free(a);
    121     free(b);
    122     free(partial_c);
    123 
    124     // free events
    125     HANDLE_ERROR(cudaEventDestroy(start));
    126     HANDLE_ERROR(cudaEventDestroy(stop));
    127 
    128     printf("Value calculated:  %f
    ", c);
    129 
    130     return elapsedTime;
    131 }
    132 
    133 
    134 float cuda_host_alloc_test(int size) {
    135     cudaEvent_t     start, stop;
    136     float           *a, *b, c, *partial_c;
    137     float           *dev_a, *dev_b, *dev_partial_c;
    138     float           elapsedTime;
    139 
    140     HANDLE_ERROR(cudaEventCreate(&start));
    141     HANDLE_ERROR(cudaEventCreate(&stop));
    142 
    143     // allocate the memory on the CPU
    144     HANDLE_ERROR(cudaHostAlloc((void**)&a,
    145         size*sizeof(float),
    146         cudaHostAllocWriteCombined |
    147         cudaHostAllocMapped));
    148     HANDLE_ERROR(cudaHostAlloc((void**)&b,
    149         size*sizeof(float),
    150         cudaHostAllocWriteCombined |
    151         cudaHostAllocMapped));
    152     HANDLE_ERROR(cudaHostAlloc((void**)&partial_c,
    153         blocksPerGrid*sizeof(float),
    154         cudaHostAllocMapped));
    155 
    156     // find out the GPU pointers
    157     HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    158     HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    159     HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c,
    160         partial_c, 0));
    161 
    162     // fill in the host memory with data
    163     for (int i = 0; i<size; i++) {
    164         a[i] = i;
    165         b[i] = i * 2;
    166     }
    167 
    168     HANDLE_ERROR(cudaEventRecord(start, 0));
    169 
    170     dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b,
    171         dev_partial_c);
    172 
    173     HANDLE_ERROR(cudaThreadSynchronize());
    174     HANDLE_ERROR(cudaEventRecord(stop, 0));
    175     HANDLE_ERROR(cudaEventSynchronize(stop));
    176     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
    177         start, stop));
    178 
    179     // finish up on the CPU side
    180     c = 0;
    181     for (int i = 0; i<blocksPerGrid; i++) {
    182         c += partial_c[i];
    183     }
    184 
    185     HANDLE_ERROR(cudaFreeHost(a));
    186     HANDLE_ERROR(cudaFreeHost(b));
    187     HANDLE_ERROR(cudaFreeHost(partial_c));
    188 
    189     // free events
    190     HANDLE_ERROR(cudaEventDestroy(start));
    191     HANDLE_ERROR(cudaEventDestroy(stop));
    192 
    193     printf("Value calculated:  %f
    ", c);
    194 
    195     return elapsedTime;
    196 }
    197 
    198 
    199 int main(void) {
    200     cudaDeviceProp  prop;
    201     int whichDevice;
    202     HANDLE_ERROR(cudaGetDevice(&whichDevice));
    203     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    204     if (prop.canMapHostMemory != 1) {
    205         printf("Device can not map memory.
    ");
    206         return 0;
    207     }
    208 
    209     float           elapsedTime;
    210 
    211     HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    212 
    213     // try it with malloc
    214     elapsedTime = malloc_test(N);
    215     printf("Time using cudaMalloc:  %3.1f ms
    ",
    216         elapsedTime);
    217 
    218     // now try it with cudaHostAlloc
    219     elapsedTime = cuda_host_alloc_test(N);
    220     printf("Time using cudaHostAlloc:  %3.1f ms
    ",
    221         elapsedTime);
    222 }
  • 相关阅读:
    UrlRewriter
    PortalBasic Web 应用开发框架
    首页静态化和定时执行
    细谈在C#中读写Excel系列文章之四
    Ubuntu安装SVN及SvnManager
    Select语句导致瓶颈
    策略模式
    抽象工厂
    递归优化之快速排序
    从安装git开始,一步一步把程序发布到AppHarbor
  • 原文地址:https://www.cnblogs.com/liangliangdetianxia/p/3996389.html
Copyright © 2011-2022 走看看