zoukankan      html  css  js  c++  java
  • linux利用CMakeLists编译cuda程序

    文件目录:

    cudaTest

        |--utils.cu

        |--utils.h

        |--squaresum.cu

        |--squaresum.h

        |--test.cpp

        |--CMakeLists.txt

    编译命令:

    $cd /root/cudaTest

    $mkdir build

    $cd build

    $cmake ..

    $make

    调用关系:

    utils:提供常用工具,这里提供查询设备信息功能;

    squaresum:计算平方和功能,为cuda运行的核心函数实现

    test:调用平方和函数

    CMakeLists.txt:组织所有文件编译生成可执行文件

    注意:从 .cpp 文件调用 .cu 文件中定义的函数时,需要在头文件中把该函数声明为 extern "C"

    文件内容:

    CMakeLists.txt

    # CMakeLists.txt: builds the `squaresum` executable from test.cpp,
    # squaresum.cu and utils.cu using the FindCUDA module.
    cmake_minimum_required(VERSION 2.8)
    # A top-level CMakeLists.txt should call project() directly; without it
    # CMake falls back to an implicit project and warns about it.
    project(cudaTest)
    find_package(CUDA QUIET REQUIRED)
     
    # Specify the binary name and the source files to build it from.
    # cuda_add_executable compiles the .cu files with nvcc and the .cpp file
    # with the host compiler, then links everything into one executable.
    #add_library(utils utils.cpp)
    cuda_add_executable(
        squaresum
        test.cpp squaresum.cu utils.cu)
    #target_link_libraries(squaresum utils)

    test.cpp

    #include <iostream>
    #include "squaresum.h"
    
    //extern "C" int squaresum();
    
    // Program entry point: runs the square-sum demo and forwards the status
    // code returned by squaresum() as the process exit code instead of
    // discarding it.
    int main(){
      return squaresum();
    }

    squaresum.h

    // squaresum.h: declares the host-side entry point of the square-sum demo.
    // The include guard keeps repeated inclusion of this header harmless.
    #ifndef SQUARESUM_H
    #define SQUARESUM_H

    #include "utils.h"
    #include <cuda_runtime.h>
    
    extern "C" {
      // Defined in squaresum.cu. Computes the sum of squares of a random data
      // set on the GPU and on the CPU and prints both results.
      // Declared with C linkage so that the definition compiled by nvcc and
      // the call compiled by the host C++ compiler agree on the symbol name.
      int squaresum();
    }

    #endif // SQUARESUM_H

    squaresum.cu

    #include <stdio.h>
    #include <stdlib.h>
    //#include "utils.h"
    #include <iostream>
    #include "squaresum.h"
    // ======== define area ========
    #define DATA_SIZE 1048576 // 1M
    
    // ======== global area ========
    int data[DATA_SIZE];
    
    // Kernel that serially accumulates the squares of the first DATA_SIZE
    // elements of `data`.
    //   data : input array, read at indices [0, DATA_SIZE)
    //   sum  : receives the accumulated total
    //   time : receives the difference between clock() sampled after and
    //          before the loop
    // There is no thread indexing: every launched thread runs the whole loop
    // and writes the same outputs (the caller in this file launches <<<1, 1>>>).
    __global__ static void squaresSum(int *data, int *sum, clock_t *time)
    {
     int acc = 0;
     const clock_t begin = clock();
     int idx = 0;
     while (idx < DATA_SIZE) {
      const int value = data[idx];
      acc += value * value;
      ++idx;
     }
     *sum = acc;
     *time = clock() - begin;
    }
    
    // ======== used to generate rand datas ========
    // Fills data[0 .. size-1] with pseudo-random digits in the range 0..9,
    // drawn from rand(). Does nothing when size <= 0.
    void generateData(int *data, int size)
    {
     int idx = 0;
     while (idx < size) {
      data[idx] = rand() % 10;
      ++idx;
     }
    }
    
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `err` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t err, const char *what)
    {
     if (err == cudaSuccess) {
      return true;
     }
     fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
     return false;
    }
    
    // Computes the sum of squares of DATA_SIZE random digits once on the GPU
    // (squaresSum kernel) and once on the CPU, printing each result with its
    // timing.
    // Returns 0 on success and 1 when CUDA initialisation or any CUDA runtime
    // call fails; failures are reported on stderr.
    int squaresum()
    {
     // init CUDA device
     if (!InitCUDA()) {
      return 1;
     }
     printf("CUDA initialized.\n");
    
     // generate rand datas
     generateData(data, DATA_SIZE);
    
     // Device buffers start as NULL so the cleanup below is valid on every
     // path: cudaFree(NULL) is a no-op.
     int *gpuData = NULL;
     int *sum = NULL;
     clock_t *gpuTime = NULL;
     const size_t dataBytes = sizeof(int) * DATA_SIZE;
    
     int result = 0;
     clock_t time_used = 0;
    
     // malloc space for datas in GPU and upload the input
     bool ok = cudaOk(cudaMalloc((void**) &gpuData, dataBytes), "cudaMalloc(gpuData)");
     ok = ok && cudaOk(cudaMalloc((void**) &sum, sizeof(int)), "cudaMalloc(sum)");
     ok = ok && cudaOk(cudaMalloc((void**) &gpuTime, sizeof(clock_t)), "cudaMalloc(time)");
     ok = ok && cudaOk(cudaMemcpy(gpuData, data, dataBytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(host to device)");
    
     if (ok) {
      // calculate the squares's sum
      squaresSum<<<1, 1, 0>>>(gpuData, sum, gpuTime);
      // A kernel launch returns no status itself; launch errors are read
      // back with cudaGetLastError().
      ok = cudaOk(cudaGetLastError(), "squaresSum launch");
    
      // copy the result from GPU to HOST; these blocking copies also wait
      // for the kernel to finish and surface errors raised while it ran.
      ok = ok && cudaOk(cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost),
                        "cudaMemcpy(sum)");
      ok = ok && cudaOk(cudaMemcpy(&time_used, gpuTime, sizeof(clock_t), cudaMemcpyDeviceToHost),
                        "cudaMemcpy(time)");
     }
    
     // free GPU spaces (also on the failure path, so nothing leaks)
     cudaFree(gpuData);
     cudaFree(sum);
     cudaFree(gpuTime);
    
     if (!ok) {
      return 1;
     }
    
     // print result
     // NOTE: this value comes from clock() inside the kernel, the CPU value
     // below from the host clock(); the two are not in the same unit.
     printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);
    
     // CPU calculate
     result = 0;
     clock_t start = clock();
     for (int i = 0; i < DATA_SIZE; ++i) {
      result += data[i] * data[i];
     }
     time_used = clock() - start;
     printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);
    
     return 0;
    }

    utils.h

    // utils.h: declares the CUDA helper utilities implemented in utils.cu.
    // The include guard keeps repeated inclusion of this header harmless.
    #ifndef UTILS_H
    #define UTILS_H

    #include <stdio.h>
    #include <cuda_runtime.h>
    
    extern "C" {
      // Defined in utils.cu. Looks for a usable CUDA device, prints its
      // properties and selects it. Returns true when a device was selected,
      // false otherwise.
      // Declared with C linkage so that callers and the definition agree on
      // the symbol name.
      bool InitCUDA();
    }

    #endif // UTILS_H

    utils.cu

    #include "utils.h"
    #include <cuda_runtime.h>
    #include <iostream>
    
    // Prints a selection of fields of `prop` to stdout, one per line.
    // The size_t fields (totalGlobalMem, sharedMemPerBlock, memPitch,
    // totalConstMem, textureAlignment) are printed with %zu: passing a size_t
    // to %d is undefined behaviour and truncates values on 64-bit platforms.
    // NOTE(review): clockRate and deviceOverlap are marked deprecated in
    // recent CUDA toolkits; confirm they exist in the toolkit you target.
    void printDeviceProp(const cudaDeviceProp &prop)
    {
     printf("Device Name : %s.\n", prop.name);
     printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
     printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
     printf("regsPerBlock : %d.\n", prop.regsPerBlock);
     printf("warpSize : %d.\n", prop.warpSize);
     printf("memPitch : %zu.\n", prop.memPitch);
     printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
     printf("maxThreadsDim[0 - 2] : %d %d %d.\n",
            prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
     printf("maxGridSize[0 - 2] : %d %d %d.\n",
            prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
     printf("totalConstMem : %zu.\n", prop.totalConstMem);
     printf("major.minor : %d.%d.\n", prop.major, prop.minor);
     printf("clockRate : %d.\n", prop.clockRate);
     printf("textureAlignment : %zu.\n", prop.textureAlignment);
     printf("deviceOverlap : %d.\n", prop.deviceOverlap);
     printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
    }
    
    // Finds the first CUDA device with compute capability major >= 1, prints
    // its properties and makes it the current device.
    // Prints the device count to stdout. Returns true when a device was
    // selected; returns false (after writing a message to stderr) when the
    // device count cannot be queried, no device exists, no device qualifies,
    // or the device cannot be selected.
    bool InitCUDA()
    {
     // used to count the device numbers; initialised so it is never read as
     // garbage if the query below fails
     int count = 0;
    
     // get the cuda device count
     cudaError_t err = cudaGetDeviceCount(&count);
     if (err != cudaSuccess) {
      fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
      return false;
     }
     std::cout << count << std::endl;
     if (count == 0) {
      fprintf(stderr, "There is no device.\n");
      return false;
     }
    
     // find the device >= 1.X
     int i;
     for (i = 0; i < count; ++i) {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
       if (prop.major >= 1) {
        printDeviceProp(prop);
        break;
       }
      }
     }
    
     // if can't find the device
     if (i == count) {
      fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
      return false;
     }
    
     // set cuda device; report failure instead of claiming success
     err = cudaSetDevice(i);
     if (err != cudaSuccess) {
      fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
      return false;
     }
    
     return true;
    }
    
    //int main(){
    //  InitCUDA();
    //}
  • 相关阅读:
    mysql的备份与恢复(windows、Linux并拷贝至备机)
    eclipse导出可执行jar
    ORCLE中两张表对比更新合入(MERGE INTO)
    js中事件冒泡的问题
    Spring事务传播行为详解
    Java中的锁分类与使用
    用某浏览器全屏延时启动应用
    Springboot整合WebSocket的交互实例(点对点、点对面)
    Windows程序设计------字体不等宽引出的问题及其细节知识
    关于VS2013使用constexpr报错问题
  • 原文地址:https://www.cnblogs.com/haiyang21/p/7788063.html
Copyright © 2011-2022 走看看