zoukankan      html  css  js  c++  java
  • 0_Simple__simpleSeparateCompilation

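    把设备函数的实现(simpleDeviceLibrary.cu)与主程序分开编译,主程序通过头文件 simpleDeviceLibrary.cuh 引用这些设备函数。分离编译设备代码时,需要打开 nvcc 的可重定位设备代码选项(-rdc=true 或 -dc),否则跨文件的 __device__ 函数无法链接。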

    ▶ 源代码

    // simpleDeviceLibrary.cuh
    // Declarations of the device-side helper functions that are defined in
    // simpleDeviceLibrary.cu, so that other translation units can call them.
    #if !defined(SIMPLE_DEVICE_LIBRARY_CUH)
    #define SIMPLE_DEVICE_LIBRARY_CUH

    // Device function: returns `number` multiplied by two.
    extern __device__ float multiplyByTwo(float number);

    // Device function: returns half of `number`.
    extern __device__ float divideByTwo(float number);

    #endif // SIMPLE_DEVICE_LIBRARY_CUH
     1 // simpleDeviceLibrary.cu
     2 #include <cuda_runtime.h>
     3 #include "device_launch_parameters.h"
     // Device function: returns `number` multiplied by two.
     __device__ float multiplyByTwo(float number)
     {
         const float doubled = 2.0f * number;
         return doubled;
     }
     8 
     // Device function: returns half of `number`.
     // Implemented as a multiplication by 0.5f rather than a division.
     __device__ float divideByTwo(float number)
     {
         const float halved = 0.5f * number;
         return halved;
     }
     #include <stdio.h>
     #include <cmath>
     #include <cstdlib>
     #include <iostream>
     #include <vector>
     #include <cuda_runtime.h>
     #include "device_launch_parameters.h"
     #include "simpleDeviceLibrary.cuh"
     7 
     8 using std::cout;
     9 using std::endl;
    10 using std::vector;
    11 
    12 #define EPS 1e-5
    13 
    // Pointer to a function that takes one float and returns a float
    // (the signature of multiplyByTwo and divideByTwo).
    typedef float (*deviceFunc)(float);

    // Device-side variables holding the addresses of the two device functions.
    // The host reads these values back with cudaMemcpyFromSymbol before passing
    // them to a kernel.
    __device__ deviceFunc dMultiplyByTwoPtr = &multiplyByTwo;
    __device__ deviceFunc dDivideByTwoPtr = &divideByTwo;
    18 
    // Kernel: overwrites v[i] with f(v[i]) for every index i < size.
    //
    // Each thread handles the single element at its flat 1D global index
    // (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is not
    // below `size` do nothing.
    //
    //   v    - array of at least `size` floats, read and written in place
    //   f    - function pointer invoked inside the kernel for each element
    //   size - number of elements to transform
    __global__ void transformVector(float *v, deviceFunc f, unsigned int size)
    {
        const unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= size)
            return;

        const float input = v[idx];
        v[idx] = f(input);
    }
    25 
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaCallOk(cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
            return true;
        fprintf(stderr, "\n\tCUDA error in %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    // Runs the round-trip check: fills a vector with random values, applies
    // multiplyByTwo and then divideByTwo to it on the GPU through device function
    // pointers, and verifies that the result matches the input within EPS.
    //
    // Returns true when every CUDA call succeeds and all elements match;
    // returns false on the first CUDA error or the first mismatching element.
    bool test()
    {
        const unsigned int kVectorSize = 1000;
        const size_t kVectorBytes = kVectorSize * sizeof(float);

        // Ceil-div launch configuration so that every element is covered even
        // when kVectorSize is not a multiple of (or exceeds) the block size.
        const unsigned int kThreadsPerBlock = 256;
        const dim3 dimBlock(kThreadsPerBlock);
        const dim3 dimGrid((kVectorSize + kThreadsPerBlock - 1) / kThreadsPerBlock);

        vector<float> hVector(kVectorSize);
        for (unsigned int i = 0; i < kVectorSize; ++i)
            hVector[i] = rand() / static_cast<float>(RAND_MAX);
        vector<float> hResultVector(kVectorSize);

        float *dVector = NULL;
        deviceFunc hFunctionPtr = NULL;

        // `ok` short-circuits: once a call fails, no later CUDA work is issued.
        bool ok = cudaCallOk(cudaSetDevice(0), "cudaSetDevice");
        ok = ok && cudaCallOk(cudaMalloc(&dVector, kVectorBytes), "cudaMalloc");
        ok = ok && cudaCallOk(cudaMemcpy(dVector, &hVector[0], kVectorBytes, cudaMemcpyHostToDevice),
                              "cudaMemcpy (host to device)");

        // The address of a device function is only available on the device side.
        // It is stored in a __device__ variable; cudaMemcpyFromSymbol copies that
        // pointer VALUE from the device to the host so it can be passed to the
        // kernel as an ordinary argument.
        ok = ok && cudaCallOk(cudaMemcpyFromSymbol(&hFunctionPtr, dMultiplyByTwoPtr, sizeof(deviceFunc)),
                              "cudaMemcpyFromSymbol (dMultiplyByTwoPtr)");
        if (ok)
        {
            transformVector<<<dimGrid, dimBlock>>>(dVector, hFunctionPtr, kVectorSize);
            // A kernel launch returns nothing; launch errors surface here.
            ok = cudaCallOk(cudaGetLastError(), "transformVector launch (multiplyByTwo)");
        }

        ok = ok && cudaCallOk(cudaMemcpyFromSymbol(&hFunctionPtr, dDivideByTwoPtr, sizeof(deviceFunc)),
                              "cudaMemcpyFromSymbol (dDivideByTwoPtr)");
        if (ok)
        {
            transformVector<<<dimGrid, dimBlock>>>(dVector, hFunctionPtr, kVectorSize);
            ok = cudaCallOk(cudaGetLastError(), "transformVector launch (divideByTwo)");
        }

        // Blocking copy: waits for both kernels and reports any error raised
        // while they executed.
        ok = ok && cudaCallOk(cudaMemcpy(&hResultVector[0], dVector, kVectorBytes, cudaMemcpyDeviceToHost),
                              "cudaMemcpy (device to host)");

        // Release the device buffer on every path, including failures above.
        if (dVector != NULL)
            ok = cudaCallOk(cudaFree(dVector), "cudaFree") && ok;

        if (!ok)
            return false;

        // Multiplying by two and then halving must reproduce the input.
        for (unsigned int i = 0; i < kVectorSize; ++i)
        {
            if (fabs(hVector[i] - hResultVector[i]) > EPS)
            {
                printf("\n\tError at %u, gpu[i] = %f, cpu[i] = %f\n", i, hResultVector[i], hVector[i]);
                return false;
            }
        }
        return true;
    }
    67 
    // Program entry point: runs test(), prints whether it passed, waits for a
    // key press, and returns 0 on success or 1 on failure.
    // argc/argv are accepted but not used.
    int main(int argc, char **argv)
    {
        (void)argc;
        (void)argv;

        printf("\n\tStart\n");
        const bool passed = test();
        printf("\n\tFinish, %s\n", passed ? "Passed" : "Failed");

        // Keep the console window open until the user presses a key.
        getchar();

        // Report failure through the exit code so scripts can detect it.
        return passed ? 0 : 1;
    }

    ▶ 输出结果

      未测试

    ▶ 涨姿势

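    ● 设备函数的地址只在设备端有效,主机代码不能直接对 __device__ 函数取地址。要把(定义在其他 .cu 文件中的)设备函数作为参数传给核函数,需要先在设备端定义一个保存该函数地址的 __device__ 函数指针变量,再用 cudaMemcpyFromSymbol() 把这个指针的值从设备拷贝到主机变量中,最后把它作为核函数的实参传入。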

    1 typedef float(*deviceFunc)(float);
    2 deviceFunc hFunctionPtr;
    3 cudaMemcpyFromSymbol(&hFunctionPtr, dMultiplyByTwoPtr, sizeof(deviceFunc));
  • 相关阅读:
    程序为什么加载到内存中
    cortex-A cortex-R cortex-M处理器的性能比较
    makefile 中的赋值方式
    python(老男孩全栈观后感------文件处理)
    python------lambda(匿名函数)
    python------filter(过滤器)
    Express深入解读
    nodejs安装
    一道有意思的题目
    charAt获取数组,测试
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7895550.html
Copyright © 2011-2022 走看看