  • Demonstration of operations using two streams

    Project download link

    /*
    * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
    *
    * NVIDIA Corporation and its licensors retain all intellectual property and
    * proprietary rights in and to this software and related documentation.
    * Any use, reproduction, disclosure, or distribution of this software
    * and related documentation without an express license agreement from
    * NVIDIA Corporation is strictly prohibited.
    *
    * Please refer to the applicable NVIDIA end user license agreement (EULA)
    * associated with this source code for terms and conditions that govern
    * your use of this NVIDIA software.
    *
    */


    #include "../common/book.h"
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #define N   (1024*1024)
    #define FULL_DATA_SIZE   (N*20)


    __global__ void kernel(int *a, int *b, int *c) {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx < N) {
            int idx1 = (idx + 1) % 256;
            int idx2 = (idx + 2) % 256;
            float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
            float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
            c[idx] = (as + bs) / 2;
        }
    }


    int main(void) {
        cudaDeviceProp  prop;
        int whichDevice;
        HANDLE_ERROR(cudaGetDevice(&whichDevice));
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
        if (!prop.deviceOverlap) {
            printf("Device will not handle overlaps, so no speed up from streams\n");
            return 0;
        }

        cudaEvent_t     start, stop;
        float           elapsedTime;

        cudaStream_t    stream0, stream1;
        int *host_a, *host_b, *host_c;
        int *dev_a0, *dev_b0, *dev_c0;
        int *dev_a1, *dev_b1, *dev_c1;

        // start the timers
        HANDLE_ERROR(cudaEventCreate(&start));
        HANDLE_ERROR(cudaEventCreate(&stop));

        // initialize the two streams
        HANDLE_ERROR(cudaStreamCreate(&stream0));
        HANDLE_ERROR(cudaStreamCreate(&stream1));

        // allocate the memory on the GPU
        HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
            N * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
            N * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
            N * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
            N * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
            N * sizeof(int)));
        HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
            N * sizeof(int)));

        // allocate page-locked (pinned) memory on the host,
        // which cudaMemcpyAsync requires
        HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
            FULL_DATA_SIZE * sizeof(int),
            cudaHostAllocDefault));
        HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
            FULL_DATA_SIZE * sizeof(int),
            cudaHostAllocDefault));
        HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
            FULL_DATA_SIZE * sizeof(int),
            cudaHostAllocDefault));

        for (int i = 0; i < FULL_DATA_SIZE; i++) {
            host_a[i] = rand();
            host_b[i] = rand();
        }

        HANDLE_ERROR(cudaEventRecord(start, 0));
        // now loop over full data, in bite-sized chunks
        for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
            // enqueue copies of a in stream0 and stream1
            HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                N * sizeof(int),
                cudaMemcpyHostToDevice,
                stream0));
            HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                N * sizeof(int),
                cudaMemcpyHostToDevice,
                stream1));
            // enqueue copies of b in stream0 and stream1
            HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                N * sizeof(int),
                cudaMemcpyHostToDevice,
                stream0));
            HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                N * sizeof(int),
                cudaMemcpyHostToDevice,
                stream1));

            // enqueue kernels in stream0 and stream1
            kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
            kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);

            // copy the results from the device back to pinned host memory
            HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                N * sizeof(int),
                cudaMemcpyDeviceToHost,
                stream0));
            HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                N * sizeof(int),
                cudaMemcpyDeviceToHost,
                stream1));
        }
        // wait until both streams have finished all of their queued work
        HANDLE_ERROR(cudaStreamSynchronize(stream0));
        HANDLE_ERROR(cudaStreamSynchronize(stream1));

        HANDLE_ERROR(cudaEventRecord(stop, 0));

        HANDLE_ERROR(cudaEventSynchronize(stop));
        HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
            start, stop));
        printf("Time taken:  %3.1f ms\n", elapsedTime);

        // cleanup the streams and memory
        HANDLE_ERROR(cudaFreeHost(host_a));
        HANDLE_ERROR(cudaFreeHost(host_b));
        HANDLE_ERROR(cudaFreeHost(host_c));
        HANDLE_ERROR(cudaFree(dev_a0));
        HANDLE_ERROR(cudaFree(dev_b0));
        HANDLE_ERROR(cudaFree(dev_c0));
        HANDLE_ERROR(cudaFree(dev_a1));
        HANDLE_ERROR(cudaFree(dev_b1));
        HANDLE_ERROR(cudaFree(dev_c1));
        HANDLE_ERROR(cudaStreamDestroy(stream0));
        HANDLE_ERROR(cudaStreamDestroy(stream1));

        return 0;
    }
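    The listing interleaves work across the two streams inside the chunk loop: both host-to-device copies of a, then both copies of b, then both kernel launches, then both device-to-host copies. Together with the deviceOverlap check and the page-locked buffers that cudaMemcpyAsync needs, this ordering gives the hardware a chance to overlap one stream's transfers with the other stream's kernel execution.

    As a sanity check, one might recompute the result on the CPU after the two cudaStreamSynchronize() calls and compare it with host_c. The sketch below is illustrative and not part of the original program; the helper name results_match() is made up, and it assumes host and device produce identical float results for this data (otherwise compare with a small tolerance):

    /*
     * Host-side sanity check (illustrative; not in the original listing).
     * Recomputes every N-element chunk on the CPU with the same wrap-around
     * indexing the kernel uses and compares it against the results in host_c.
     */
    static int results_match(const int *a, const int *b, const int *c) {
        for (int chunk = 0; chunk < FULL_DATA_SIZE; chunk += N) {
            for (int idx = 0; idx < N; idx++) {
                int idx1 = (idx + 1) % 256;
                int idx2 = (idx + 2) % 256;
                float as = (a[chunk + idx] + a[chunk + idx1] + a[chunk + idx2]) / 3.0f;
                float bs = (b[chunk + idx] + b[chunk + idx1] + b[chunk + idx2]) / 3.0f;
                int expected = (int)((as + bs) / 2);
                if (c[chunk + idx] != expected)
                    return 0;               // mismatch found
            }
        }
        return 1;                           // every element agrees
    }

    /* Usage, placed after the two cudaStreamSynchronize() calls:
     *     printf("results %s\n",
     *            results_match(host_a, host_b, host_c) ? "match" : "differ");
     */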
  • Original article: https://www.cnblogs.com/liangliangdetianxia/p/3996353.html