  • Multi-GPU dot-product example

     A dot-product example that splits the work across two GPU devices, driving each device from its own host thread; the packaged project is available for download.

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include "../common/book.h"
#include "cuda.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "cuda_runtime.h"

#define imin(a,b) (a<b?a:b)

#define     N    (33*1024*1024)
const int threadsPerBlock = 256;
const int blocksPerGrid =
imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // grid-stride loop: each thread accumulates its own partial sum
    float   temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // in-block tree reduction (requires threadsPerBlock to be a power of 2)
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


struct DataStruct {
    int     deviceID;
    int     size;
    float   *a;
    float   *b;
    float   returnValue;
};

unsigned WINAPI routine(void *pvoidData)
//void* routine(void *pvoidData)   // pthread-style signature used by book.h on non-Windows platforms
{
    DataStruct  *data = (DataStruct*)pvoidData;
    HANDLE_ERROR(cudaSetDevice(data->deviceID));

    int     size = data->size;
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid*sizeof(float));

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
        size*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
        size*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
        blocksPerGrid*sizeof(float)));

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float),
        cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float),
        cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);
    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
        blocksPerGrid*sizeof(float),
        cudaMemcpyDeviceToHost));

    // finish up on the CPU side
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(partial_c);

    data->returnValue = c;
    return 0;
}


int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    // require at least two devices
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
            "devices, but only found %d\n", deviceCount);
        return 0;
    }

    float   *a = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(a);
    float   *b = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(b);

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    /*
     * Prepare for multithreading:
     * each DataStruct covers one half of the data set.
     */
    DataStruct  data[2];
    data[0].deviceID = 0;
    data[0].size = N / 2;
    data[0].a = a;
    data[0].b = b;

    data[1].deviceID = 1;
    data[1].size = N / 2;
    data[1].a = a + N / 2;
    data[1].b = b + N / 2;

    // run one half on a helper thread, the other half on this thread
    CUTThread   thread = start_thread(routine, &(data[0]));
    routine(&(data[1]));
    end_thread(thread);


    // free memory on the CPU side
    free(a);
    free(b);

    printf("Value calculated:  %f\n",
        data[0].returnValue + data[1].returnValue);

    return 0;
}
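
The listing leans on helpers from the book's `../common/book.h`: `HANDLE_ERROR`, `HANDLE_NULL`, and the `CUTThread`/`start_thread`/`end_thread` wrappers. For readers building without the book's sources, here is a minimal sketch of the two error-checking helpers under the same names (an approximation of what book.h provides, not its exact code):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Minimal stand-ins for book.h's error helpers (a sketch): abort with a
// readable message when a CUDA call fails or a host allocation returns NULL.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n",
               cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))

#define HANDLE_NULL(a) do {                                   \
    if ((a) == NULL) {                                        \
        printf("Host memory failed in %s at line %d\n",       \
               __FILE__, __LINE__);                           \
        exit(EXIT_FAILURE);                                   \
    }                                                         \
} while (0)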
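The `unsigned WINAPI` signature (and the commented-out `void*` variant below it) exists because `start_thread`/`end_thread` wrap Win32 threads on Windows and pthreads elsewhere. If the project is built as C++11 or later, the same launch pattern can be written with `std::thread` and the platform split disappears. `launchBoth` below is a hypothetical helper, assuming the `DataStruct` and `routine` from the listing:

#include <thread>

// Sketch: launch the first half on a helper thread, run the second half
// here, then join -- the same pattern as start_thread()/end_thread().
void launchBoth(DataStruct data[2]) {
    std::thread worker(routine, static_cast<void*>(&data[0]));  // GPU 0 half
    routine(&data[1]);                                          // GPU 1 half on this thread
    worker.join();                                              // like end_thread(thread)
}

Note that `std::thread` silently discards `routine`'s return value, which this program never uses anyway; each partial result still travels back through `data->returnValue`.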
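Since the host fills the inputs with `a[i] = i` and `b[i] = 2*i`, the exact answer has a closed form: 2 * sum(i^2) for i = 0..N-1, i.e. (N-1)*N*(2N-1)/3. A small, hypothetical sanity check (not part of the original program) computes it in `double`; at this N, the GPU's single-precision accumulation should only agree in the leading digits:

#include <stdio.h>

#define N (33*1024*1024)

// Closed-form reference for the dot product of a[i] = i and b[i] = 2*i:
//   sum(i * 2i) = 2 * sum(i^2, i = 0..N-1) = (N-1)*N*(2N-1)/3
int main(void) {
    double n = (double)N;
    double expected = (n - 1.0) * n * (2.0 * n - 1.0) / 3.0;
    printf("Expected dot product: %.6g\n", expected);
    return 0;
}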
  • Original article: https://www.cnblogs.com/liangliangdetianxia/p/3998729.html