zoukankan      html  css  js  c++  java
  • cuda多线程间通信

      1 #include "cuda_runtime.h"
      2 #include "device_launch_parameters.h"
      3 
      4 #include <stdio.h>
      5 #include <time.h>
      6 #include <stdlib.h>
      7 
      8 #define MAX 120  // inclusive upper bound of the random input values generated in main()
      9 #define MIN 0  // inclusive lower bound of the random input values generated in main()
     10 cudaError_t addWithCuda(int *c, const int *a, size_t size);  // host wrapper that launches addKernel; defined after main()
     11 
     /*
      * Single-block demo of threads communicating through shared memory.
      *
      * Every thread stages one element of `a` into shared memory; after the
      * barrier, threads 0..3 each reduce all staged elements with a different
      * operator and publish the result:
      *
      *   c[0] = sum of squares
      *   c[1] = sum
      *   c[2] = product (wraps modulo 2^32 if it does not fit in an int)
      *   c[3] = bitwise XOR
      *
      * Launch contract: exactly one block of n threads, where n is the number of
      * elements in `a`, with n * sizeof(int) bytes of dynamic shared memory.
      * Only threadIdx.x is used for indexing. An output whose thread was not
      * launched (n < 4) is left untouched.
      */
     __global__ void addKernel(int *c, const int *a) {
         extern __shared__ int smem[];

         const int i = threadIdx.x;
         const int n = blockDim.x;  // one staged element per launched thread

         smem[i] = a[i];
         // Every thread must have written its element before anyone reads smem.
         __syncthreads();

         if (i == 0) {
             // Thread 0: sum of squares.
             int acc = 0;
             for (int d = 0; d < n; d++) {
                 acc += smem[d] * smem[d];
             }
             c[0] = acc;
         } else if (i == 1) {
             // Thread 1: plain sum.
             int acc = 0;
             for (int d = 0; d < n; d++) {
                 acc += smem[d];
             }
             c[1] = acc;
         } else if (i == 2) {
             // Thread 2: product. Multiply in unsigned arithmetic so that an
             // overflowing product wraps instead of being undefined behaviour.
             unsigned int acc = 1u;
             for (int d = 0; d < n; d++) {
                 acc *= (unsigned int) smem[d];
             }
             c[2] = (int) acc;
         } else if (i == 3) {
             // Thread 3: bitwise XOR.
             int acc = 0;
             for (int d = 0; d < n; d++) {
                 acc ^= smem[d];
             }
             c[3] = acc;
         }
     }
     49 
     /*
      * Fills a 5-element array with random values in [MIN, MAX], runs the
      * reductions on the GPU through addWithCuda(), and prints each expression
      * together with its result.
      *
      * Returns 0 on success and 1 if any CUDA call fails.
      */
     int main() {
         const int arraySize = 5;
         int a[arraySize];
         int c[arraySize] = { 0 };

         srand((unsigned) time(NULL));
         for (int k = 0; k < arraySize; k++) {
             a[k] = rand() % (MAX + 1 - MIN) + MIN;
         }

         // Run the reductions on the device.
         cudaError_t cudaStatus = addWithCuda(c, a, arraySize);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "addWithCuda failed!");
             return 1;
         }

         // Result layout: c[0] = sum of squares, c[1] = sum, c[2] = product,
         // c[3] = XOR. The sum is printed first, hence c[1] before c[0].
         printf("\t%d+%d+%d+%d+%d = %d\n"
                "\t%d^2+%d^2+%d^2+%d^2+%d^2 = %d\n"
                "\t%d*%d*%d*%d*%d = %d\n"
                "\t%d^%d^%d^%d^%d = %d\n"
                "\n\n\n\n",
                 a[0], a[1], a[2], a[3], a[4], c[1],
                 a[0], a[1], a[2], a[3], a[4], c[0],
                 a[0], a[1], a[2], a[3], a[4], c[2],
                 a[0], a[1], a[2], a[3], a[4], c[3]);

         // The device must be reset before exiting so that profiling and tracing
         // tools such as Nsight and Visual Profiler show complete traces.
         // cudaDeviceReset() replaces the deprecated cudaThreadExit().
         cudaStatus = cudaDeviceReset();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaDeviceReset failed!");
             return 1;
         }
         return 0;
     }
     76 
     /*
      * Host-side driver for addKernel.
      *
      * Copies `size` ints from `a` to the device, launches addKernel as a single
      * block of `size` threads with `size` ints of dynamic shared memory, and
      * copies `size` ints of results back into `c`.
      *
      *   c    - host output buffer holding at least `size` ints
      *   a    - host input buffer holding at least `size` ints
      *   size - element count; it is also used as the thread-block size, so it
      *          must be a valid block dimension for the device
      *
      * Returns cudaSuccess, or the first CUDA error encountered. The device
      * buffers are released on every path.
      */
     cudaError_t addWithCuda(int *c, const int *a, size_t size) {
         int *dev_a = 0;
         int *dev_c = 0;
         const size_t bytes = size * sizeof(int);
         cudaError_t cudaStatus;

         // Choose which GPU to run on, change this on a multi-GPU system.
         cudaStatus = cudaSetDevice(0);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr,
                     "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
             goto Error;
         }

         // Allocate the two device buffers: one output, one input.
         cudaStatus = cudaMalloc((void**) &dev_c, bytes);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMalloc failed!");
             goto Error;
         }

         cudaStatus = cudaMalloc((void**) &dev_a, bytes);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMalloc failed!");
             goto Error;
         }

         // The kernel only writes the first few outputs; zero the rest so the
         // copy-back below never returns uninitialised device memory.
         cudaStatus = cudaMemset(dev_c, 0, bytes);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMemset failed!");
             goto Error;
         }

         // Copy the input vector from host memory to its GPU buffer.
         cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMemcpy failed!");
             goto Error;
         }

         // Launch one block with one thread per element; the third launch
         // argument sizes the kernel's dynamic shared-memory array.
         addKernel<<<1, (unsigned int) size, bytes, 0>>>(dev_c, dev_a);

         // A bad launch configuration is only reported through cudaGetLastError().
         cudaStatus = cudaGetLastError();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "addKernel launch failed: %s\n",
                     cudaGetErrorString(cudaStatus));
             goto Error;
         }

         // cudaDeviceSynchronize waits for the kernel to finish and returns any
         // error raised while it was executing.
         cudaStatus = cudaDeviceSynchronize();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr,
                     "cudaDeviceSynchronize returned error code %d after launching addKernel!\n",
                     (int) cudaStatus);
             goto Error;
         }

         // Copy the output vector from its GPU buffer to host memory.
         cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaMemcpy failed!");
             goto Error;
         }

     Error:
         // Reached on success as well; cudaFree on a null pointer is a no-op.
         cudaFree(dev_c);
         cudaFree(dev_a);
         return cudaStatus;
     }

    22+103+61+63+17 = 266
    22^2+103^2+61^2+63^2+17^2 = 19072
    22*103*61*63*17 = 148040046
    22^103^61^63^17 = 98

    OPTIMISM, PASSION & HARDWORK
  • 相关阅读:
    IDEA使用Git传放项目
    前端自适应知识点
    数据可视化
    vue调用Moment显示时间
    java判断通常的逻辑
    java常用的逻辑
    java匿名内部类练习
    java 匿名内部类
    java 内部类定义在局部时需要注意的情况
    java内部类的定义原则
  • 原文地址:https://www.cnblogs.com/hiramlee0534/p/5935098.html
Copyright © 2011-2022 走看看