zoukankan      html  css  js  c++  java
  • Cuda learn record two

    这是一个cuda 自带的算例,包含cuda 计算的一般流程。

    这个地址有比较清楚的cuda的介绍。感谢作者分享(http://blog.csdn.net/hjimce/article/details/51506207)

    一般来说,cuda 计算的流程是:

    1. 设置显卡编号:cudaSetDevice; 这个主要是在有多个GPU的机器上使用,其编号是从0号开始。

    2. 为显卡开辟内存变量: cudaMalloc;使用方法:cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));  

    这里的指针是指向设备端的内存地址,无法在主机端使用。

    3.把主机端的数据拷贝到设备端:cudaMemcpy; 使用方法:

    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);  

    这里注意需要指明数据传输的地址,

    4. 调用内核函数__global__ 类型函数;

    cudaAdd<<<blocksPerGrid, threadsPerBlock>>> (     )

    这里 blocksPerGrid, threadsPerBlock 都是 dim3 类型的数据,

    5. 把计算结果拷贝到主机端。

    6. 释放显存空间。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
      5 
      6 static void HandleError(cudaError_t err,
      7     const char *file,
      8     int line) {
      9     if (err != cudaSuccess) {
     10         printf("%s in %s at line %d
    ", cudaGetErrorString(err),
     11             file, line);
     12         exit(EXIT_FAILURE);
     13     }
     14 }
     15 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
     16 
     17 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
     18 void printCudaInformation();
     19 
     20 __global__ void addKernel(int *c, const int *a, const int *b)
     21 {
     22     int i = threadIdx.x;
     23     c[i] = a[i] + b[i];
     24 }
     25 
     26 int main()
     27 {
     28     const int arraySize = 5;
     29     const int a[arraySize] = { 1, 2, 3, 4, 5 };
     30     const int b[arraySize] = { 10, 20, 30, 40, 50 };
     31     int c[arraySize] = { 0 };
     32 
     33     // Add vectors in parallel.
     34     HANDLE_ERROR( addWithCuda(c, a, b, arraySize) );
     35 
     36     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}
    ",
     37         c[0], c[1], c[2], c[3], c[4]);
     38 
     39     // cudaDeviceReset must be called before exiting in order for profiling and
     40     // tracing tools such as Nsight and Visual Profiler to show complete traces.
     41     HANDLE_ERROR( cudaDeviceReset() );
     42 
     43     system("pause");
     44     printCudaInformation();
     45     system("pause");
     46     return 0;
     47 }
     48 
     49 // Helper function for using CUDA to add vectors in parallel.
     50 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
     51 {
     52     int *dev_a = 0;
     53     int *dev_b = 0;
     54     int *dev_c = 0;
     55     cudaError_t cudaStatus=cudaSuccess;
     56 
     57     // Choose which GPU to run on, change this on a multi-GPU system.
     58     HANDLE_ERROR(cudaSetDevice(0));
     59 
     60     // Allocate GPU buffers for three vectors (two input, one output)   
     61     HANDLE_ERROR(cudaMalloc((void**)&dev_c, size * sizeof(int)));
     62     HANDLE_ERROR(cudaMalloc((void**)&dev_a, size * sizeof(int)));
     63     HANDLE_ERROR(cudaMalloc((void**)&dev_b, size * sizeof(int)));
     64 
     65     // Copy input vectors from host memory to GPU buffers.
     66     HANDLE_ERROR(cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice));
     67     HANDLE_ERROR(cudaMemcpy(dev_b, a, size * sizeof(int), cudaMemcpyHostToDevice));
     68 
     69 
     70     // Launch a kernel on the GPU with one thread for each element.
     71     addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
     72 
     73     // Check for any errors launching the kernel
     74     HANDLE_ERROR(cudaGetLastError());
     75     
     76     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     77     // any errors encountered during the launch.
     78     HANDLE_ERROR(cudaDeviceSynchronize());
     79 
     80     // Copy output vector from GPU buffer to host memory.
     81     HANDLE_ERROR(cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost));
     82     
     83     return cudaStatus;
     84 }
     85 
     86 void printCudaInformation()
     87 {
     88     int count;
     89     cudaGetDeviceCount(&count);
     90     printf("count=%d 
    ", count);
     91     cudaDeviceProp myProp;
     92     cudaGetDeviceProperties(&myProp, 0);
     93     printf(" --- General Information of My Cuda Device ---
    ");
     94     printf("     Device name: %s
    ", myProp.name);
     95     printf("     Computer capatibility : %d.%d
    ", myProp.major, myProp.minor);
     96     printf("     Clock rate: %d
    ", myProp.clockRate);
     97 
     98     printf(" --- Memory Information of My Cuda Device ---
    ");
     99     printf("    Total global memory: %ld =%d double 
    ", myProp.totalGlobalMem, myProp.totalGlobalMem / sizeof(double));
    100     printf("    Total const memory: %ld =%d int 
    ", myProp.totalConstMem, myProp.totalConstMem / sizeof(int));
    101     printf("    max memoory pitch: %ld 
    ", myProp.memPitch);
    102 
    103     printf(" --- Multiprocessor Information of My Cuda Device ---
    ");
    104     printf("    multprocessor count= %d
    ", myProp.multiProcessorCount);
    105     printf("    Shared mem per mp=%d
    ", myProp.sharedMemPerBlock);
    106     printf("    Registers per mp=%d
    ", myProp.regsPerBlock);
    107     printf("    Thread in wrap=%d
    ", myProp.warpSize);
    108     printf("    Max thread per block=%d
    ", myProp.maxThreadsPerBlock);
    109     printf("    Max threads dimensions= (%d, %d, %d) 
    ",
    110         myProp.maxThreadsDim[0], myProp.maxThreadsDim[1], myProp.maxThreadsDim[2]);
    111     printf("    Max Grid dimensions= (%d, %d, %d) 
    ",
    112         myProp.maxGridSize[0], myProp.maxGridSize[1], myProp.maxGridSize[2]);
    113     printf("
    ");
    114 }
  • 相关阅读:
    nginx详解
    keeplived高可用集群
    mysql主从同步
    elasticsearch基础
    redis集群管理--sentinel
    socket阻塞与非阻塞,同步与异步,select,pool,epool
    django+channels+dephne实现websockrt部署
    Django+Nginx+uWSGI生产环境部署
    进制转换
    对golang指针的理解
  • 原文地址:https://www.cnblogs.com/cofludy/p/6925642.html
Copyright © 2011-2022 走看看