  • 0_Simple__fp16ScalarProduct

    ▶ Computes the inner product of two vectors using CUDA's built-in packed half-precision type (__half2, whose bits are stored in an unsigned int) and its inline-PTX intrinsics.

    ▶ Source code

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "cuda_fp16.h"
    #include "helper_cuda.h"

    // Binary-tree reduction (sum) over the shared array v; __forceinline__ forces inlining
    __forceinline__ __device__ void reduceInShared(half2 * const v)
    {
        if (threadIdx.x < 64)
            v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
        __syncthreads();
        for (int i = 32; i > 0; i /= 2)
        {
            if (threadIdx.x < 32)
                v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + i]);
            __syncthreads();
        }
    }

    // Multiply a and b elementwise and reduce; h_result receives one partial sum per block
    __global__ void scalarProductKernel(half2 const * const a, half2 const * const b, float * const h_result, size_t const size)
    {
        __shared__ half2 shArray[128];
        const int stride = gridDim.x * blockDim.x;

        shArray[threadIdx.x] = __float2half2_rn(0.f);                               // float-to-half2 conversion, used here to zero-initialize

        half2 value = __float2half2_rn(0.f);
        for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride)  // half-precision fused multiply-add: value = a[i] * b[i] + value
            value = __hfma2(a[i], b[i], value);
        shArray[threadIdx.x] = value;
        __syncthreads();

        reduceInShared(shArray);                                                    // reduce to the inner product of a and b; since the function is inlined, passing in the shared-memory pointer is fine

        if (threadIdx.x == 0)                                                       // thread 0 writes the block's result
        {
            half2 result = shArray[0];
            h_result[blockIdx.x] = (float)(__low2float(result) + __high2float(result));
        }
    }

    void generateInput(half2 * a, size_t size)                                      // fill an array with random half2 values
    {
        for (size_t i = 0; i < size; ++i)
        {
            unsigned temp = rand();
            temp &= 0x83FF83FF;                                                     // keep the sign and mantissa bits of each fp16 lane, clear the exponents
            temp |= 0x3C003C00;                                                     // force each exponent field to 01111 (biased 15), so every lane lies in ±[1, 2)
            a[i] = *(half2*)&temp;
        }
    }

    int main(int argc, char *argv[])
    {
        srand(time(NULL));
        const int blocks = 128, threads = 128;
        size_t size = blocks * threads * 16;

        int devID = 0;
        cudaDeviceProp devProp;
        cudaGetDeviceProperties(&devProp, devID);
        if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3))
        {
            printf("required GPU with compute SM 5.3 or higher.\n");
            return EXIT_WAIVED;
        }

        half2 *h_vec[2], *d_vec[2];
        float *h_result, *d_result;
        for (int i = 0; i < 2; ++i)
        {
            cudaMallocHost((void**)&h_vec[i], size * sizeof*h_vec[i]);
            cudaMalloc((void**)&d_vec[i], size * sizeof*d_vec[i]);
        }
        cudaMallocHost((void**)&h_result, blocks * sizeof*h_result);
        cudaMalloc((void**)&d_result, blocks * sizeof*d_result);
        for (int i = 0; i < 2; ++i)
        {
            generateInput(h_vec[i], size);
            cudaMemcpy(d_vec[i], h_vec[i], size * sizeof*h_vec[i], cudaMemcpyHostToDevice);
        }
        scalarProductKernel<<<blocks, threads>>>(d_vec[0], d_vec[1], d_result, size);
        cudaMemcpy(h_result, d_result, blocks * sizeof*h_result, cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();

        float result = 0;
        for (int i = 0; i < blocks; ++i)
            result += h_result[i];
        printf("Result: %f\n", result);

        for (int i = 0; i < 2; ++i)
        {
            cudaFree(d_vec[i]);
            cudaFreeHost(h_vec[i]);
        }
        cudaFree(d_result);
        cudaFreeHost(h_result);
        getchar();
        return EXIT_SUCCESS;
    }

    ● Output

    GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
    
    Result: 853856.000000
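
    To sanity-check the printed figure, the GPU result can be compared against a host-side reference computed from the same inputs. Below is a minimal sketch, not part of the sample: halfBitsToFloat is a hypothetical helper that decodes one fp16 lane in software (normal numbers only, which is all generateInput() produces). Put the helper at file scope and run the loop in main() after the device-to-host copy:

    #include <math.h>                                    // for ldexpf

    // Hypothetical helper: decode the bit pattern of one normal fp16 value.
    static float halfBitsToFloat(unsigned short h)
    {
        float sign = (h & 0x8000) ? -1.0f : 1.0f;
        int   e    = (h >> 10) & 0x1F;                   // biased exponent, bias = 15
        float frac = 1.0f + (h & 0x03FF) / 1024.0f;      // implicit leading 1
        return sign * ldexpf(frac, e - 15);
    }

    // Reference inner product over the raw 32-bit half2 words.
    float ref = 0.f;
    for (size_t i = 0; i < size; ++i)
    {
        unsigned wa = *(unsigned*)&h_vec[0][i], wb = *(unsigned*)&h_vec[1][i];
        ref += halfBitsToFloat(wa & 0xFFFF) * halfBitsToFloat(wb & 0xFFFF)   // low lanes
             + halfBitsToFloat(wa >> 16)    * halfBitsToFloat(wb >> 16);     // high lanes
    }
    printf("Reference: %f\n", ref);

    Expect the two numbers to agree only approximately: the kernel accumulates in fp16 through __hfma2, so a small relative difference is normal rather than a bug.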

    ▶ Takeaways

    ● CUDA's half-precision types are plain bit containers: __half wraps an unsigned short aligned to 2 bytes, and __half2 packs two fp16 values into an unsigned int aligned to 4 bytes

    typedef struct __align__(2) { unsigned short x; } __half;

    typedef struct __align__(4) { unsigned int x; } __half2;

    #ifndef CUDA_NO_HALF
    typedef __half half;
    typedef __half2 half2;
    #endif
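
    Since a __half2 is just 32 raw bits, a value can be forged directly from a bit pattern, which is exactly the trick generateInput() uses. A minimal standalone sketch (hypothetical .cu file, needs compute capability 5.3+ for the fp16 intrinsics): 0x3C00 is the fp16 encoding of 1.0, so 0x3C003C00 should unpack to (1.0, 1.0).

    #include <cstdio>
    #include "cuda_fp16.h"

    __global__ void showBits()
    {
        unsigned bits = 0x3C003C00;          // fp16 1.0 in both lanes
        half2 h = *(half2*)&bits;            // same reinterpretation as a[i] = *(half2*)&temp;
        printf("low = %f, high = %f\n", __low2float(h), __high2float(h));   // 1.000000 1.000000
    }

    int main()
    {
        showBits<<<1, 1>>>();
        cudaDeviceSynchronize();
        return 0;
    }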

    ● About __inline__ and __forceinline__

    See Stack Overflow: https://stackoverflow.com/questions/19897803/forceinline-effect-at-cuda-c-device-functions

    Like __forceinline in C/C++, it overrides the compiler's heuristics and forces the function to be inlined. If the function is called only once, the optimization gains nothing; if it is called many times (for example, an inlined function invoked inside a loop), it can bring a noticeable speedup. It is generally not applied to recursive functions.
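
    A minimal sketch of the two attributes side by side (illustrative names, not from the sample). Compiling with nvcc -Xptxas=-v and inspecting the generated PTX should show helper folded into its call site, while slowPath remains a separate function with call overhead:

    __forceinline__ __device__ float helper(float a, float b)   { return a * b + 1.0f; }
    __noinline__    __device__ float slowPath(float a, float b) { return a * b + 1.0f; }

    __global__ void kernel(float *out, const float *in, int n)
    {
        int i = threadIdx.x + blockDim.x * blockIdx.x;
        if (i < n)
            out[i] = helper(in[i], 2.0f) + slowPath(in[i], 3.0f);   // helper inlined, slowPath called
    }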

    ● About __CUDACC__ and __CUDA_ARCH__

    ■ See Stack Overflow: https://stackoverflow.com/questions/8796369/cuda-and-nvcc-using-the-preprocessor-to-choose-between-float-or-double

    ■ __CUDACC__ is defined whenever nvcc is doing the compiling.

    ■ __CUDA_ARCH__ is undefined while compiling host code (whether or not nvcc is used); while compiling device code it is defined, and its value reflects the compute capability given on the compile command line (e.g., 200 for -arch=sm_20).

    ■ Example code (indented for readability):

    #ifdef __CUDACC__
        #warning using nvcc

        template <typename T>                  // an ordinary kernel
        __global__ void add(T *x, T *y, T *z)
        {
            int idx = threadIdx.x + blockDim.x * blockIdx.x;
            z[idx] = x[idx] + y[idx];
        }

        #ifdef __CUDA_ARCH__
            #warning device code trajectory
            #if __CUDA_ARCH__ > 120
                #warning compiling with datatype double
                template void add<double>(double *, double *, double *);
            #else
                #warning compiling with datatype float
                template void add<float>(float *, float *, float *);
            #endif
        #else
            #warning nvcc host code trajectory
        #endif
    #else
        #warning non-nvcc code trajectory
    #endif

    ■ Compilation and output

    $ ln -s cudaarch.cu cudaarch.cc
    $ gcc -c cudaarch.cc -o cudaarch.o
    cudaarch.cc:26:2: warning: #warning non-nvcc code trajectory
    
    $ nvcc -arch=sm_11 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
    cudaarch.cu:3:2: warning: #warning using nvcc
    cudaarch.cu:14:2: warning: #warning device code trajectory
    cudaarch.cu:19:2: warning: #warning compiling with datatype float
    cudaarch.cu:3:2: warning: #warning using nvcc
    cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
    ptxas info    : Compiling entry function '_Z3addIfEvPT_S1_S1_' for 'sm_11'
    ptxas info    : Used 4 registers, 12+16 bytes smem
    
    $ nvcc -arch=sm_20 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
    cudaarch.cu:3:2: warning: #warning using nvcc
    cudaarch.cu:14:2: warning: #warning device code trajectory
    cudaarch.cu:16:2: warning: #warning compiling with datatype double
    cudaarch.cu:3:2: warning: #warning using nvcc
    cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
    ptxas info    : Compiling entry function '_Z3addIdEvPT_S1_S1_' for 'sm_20'
    ptxas info    : Used 8 registers, 44 bytes cmem[0]

    ● The intrinsics used (from cuda_fp16.h, implemented with inline PTX)

    // device-only inline functions (inline PTX cannot execute on the host)
    #define __CUDA_FP16_DECL__ static __device__ __inline__

    // convert a float to a __half2 (round-to-nearest, same value in both lanes)
    __CUDA_FP16_DECL__ __half2 __float2half2_rn(const float f)
    {
        __half2 val;
        asm("{.reg .f16 low;\n"
            "  cvt.rn.f16.f32 low, %1;\n"
            "  mov.b32 %0, {low,low};}\n" : "=r"(val.x) : "f"(f));
        return val;
    }

    // per-lane half-precision a + b
    #define BINARY_OP_HALF2_MACRO(name) \
        do \
        { \
            __half2 val; \
            asm("{"#name".f16x2 %0,%1,%2;\n}" : "=r"(val.x) : "r"(a.x), "r"(b.x)); \
            return val; \
        } \
        while(0);

    __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
    {
        BINARY_OP_HALF2_MACRO(add);
    }

    // per-lane half-precision a * b + c
    #define TERNARY_OP_HALF2_MACRO(name) \
        do \
        { \
            __half2 val; \
            asm("{"#name".f16x2 %0,%1,%2,%3;\n}" : "=r"(val.x) : "r"(a.x), "r"(b.x), "r"(c.x)); \
            return val; \
        } \
        while(0);

    __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
    {
        TERNARY_OP_HALF2_MACRO(fma.rn);
    }

    // convert the low 2 bytes (low fp16 lane) of a __half2 to float
    __CUDA_FP16_DECL__ float __low2float(const __half2 l)
    {
        float val;
        asm("{.reg .f16 low,high;\n"
            "  mov.b32 {low,high},%1;\n"
            "  cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(l.x));
        return val;
    }

    // convert the high 2 bytes (high fp16 lane) of a __half2 to float
    __CUDA_FP16_DECL__ float __high2float(const __half2 l)
    {
        float val;
        asm("{.reg .f16 low,high;\n"
            "  mov.b32 {low,high},%1;\n"
            "  cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(l.x));
        return val;
    }
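
    A minimal usage sketch tying the four intrinsics together (hypothetical standalone kernel, launch as demoIntrinsics<<<1, 1>>>() on sm_53+; assumes <cstdio> is included for device printf), handy for confirming the per-lane semantics:

    __global__ void demoIntrinsics()
    {
        half2 one  = __float2half2_rn(1.0f);     // (1.0, 1.0)
        half2 two  = __hadd2(one, one);          // per-lane add -> (2.0, 2.0)
        half2 five = __hfma2(two, two, one);     // per-lane fma -> (2*2+1, 2*2+1) = (5.0, 5.0)
        printf("%f %f\n", __low2float(five), __high2float(five));   // 5.000000 5.000000
    }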
  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/7743578.html