zoukankan      html  css  js  c++  java
  • OpenCL 增强单work-item kernel性能策略

    1、基于反馈的Optimization Report解决单个Work-item的Kernel相关性

      在许多情况下,将OpenCL™应用程序设计为单个工作项内核就足以在不执行其他优化步骤的情况下最大化性能。

      建议采用以下优化单个work-item kernel的选项来按照实用性顺序解决单个work-item kernel循环携带的依赖性:

        removal,relaxation,simplification,transfer to local memory。

    (1) Removing Loop-Carried Dependency

      根据优化报告的反馈,可以通过实现更简单的内存访问的方式来消除循环携带的依赖关系。

    优化前:

     1 #define N 128
     2 
     3 __kernel void unoptimized (__global int * restrict A,
     4                            __global int * restrict B,
     5                            __global int* restrict result)
     6 {
     7   int sum = 0;
     8 
     9   for (unsigned i = 0; i < N; i++) {
    10     for (unsigned j = 0; j < N; j++) {
    11       sum += A[i*N+j];
    12     }
    13     sum += B[i];
    14   }
    15 
    16   * result = sum;
    17 }

    优化后:

     1 #define N 128
     2 
     3 __kernel void optimized (__global int * restrict A,
     4                          __global int * restrict B,
     5                          __global int * restrict result)
     6 {
     7   int sum = 0;
     8 
     9   for (unsigned i = 0; i < N; i++) {
    10     // Step 1: Definition
    11     int sum2 = 0;
    12 
    13     // Step 2: Accumulation of array A values for one outer loop iteration
    14     for (unsigned j = 0; j < N; j++) {
    15       sum2 += A[i*N+j];
    16     }
    17 
    18     // Step 3: Addition of array B value for an outer loop iteration
    19     sum += sum2;
    20     sum += B[i];
    21   }
    22 
    23   * result = sum;
    24 }

    (2) Relaxing Loop-Carried Dependency

      根据优化报告的反馈,可以通过增加依赖距离的方式来relax缓解循环携带的依赖关系。

    优化前:

     1 #define N 128
     2 
     3 __kernel void unoptimized (__global float * restrict A,
     4                            __global float * restrict result)
     5 {
     6   float mul = 1.0f;
     7 
     8   for (unsigned i = 0; i < N; i++)
     9     mul *= A[i];
    10 
    11   * result = mul;
    12 }

    优化后:

     1 #define N 128
     2 #define M 8
     3 
     4 __kernel void optimized (__global float * restrict A,
     5                          __global float * restrict result)
     6 {
     7   float mul = 1.0f;
     8 
     9   // Step 1: Declare multiple copies of variable mul
    10   float mul_copies[M];
    11 
    12   // Step 2: Initialize all copies
    13   for (unsigned i = 0; i < M; i++)
    14     mul_copies[i] = 1.0f;
    15 
    16   for (unsigned i = 0; i < N; i++) {
    17     // Step 3: Perform multiplication on the last copy
    18     float cur = mul_copies[M-1] * A[i];
    19 
    20     // Step 4a: Shift copies
    21     #pragma unroll 
    22     for (unsigned j = M-1; j > 0; j--)
    23       mul_copies[j] = mul_copies[j-1];
    24 
    25     // Step 4b: Insert updated copy at the beginning
    26     mul_copies[0] = cur;
    27   }
    28 
    29   // Step 5: Perform reduction on copies
    30   #pragma unroll 
    31   for (unsigned i = 0; i < M; i++)
    32     mul *= mul_copies[i];
    33 
    34   * result = mul;
    35 }

    (3) Transferring Loop-Carried Dependency to Local Memory

      对于不能消除的循环携带的依赖关系,可以通过将具有循环依赖的数组从全局内存global memory移动到local memory的方式来改进循环的启动间隔(initiation interval, II)。

    优化前:

    1 #define N 128
    2
    3 __kernel void unoptimized( __global int* restrict A )
    4 {
    5     for (unsigned i = 0; i < N; i++)
    6           A[N-i] = A[i];
    7 }     

    优化后:

     1 #define N 128
     2
     3 __kernel void optimized( __global int* restrict A )
     4 {
     5     int B[N];
     6
     7     for (unsigned i = 0; i < N; i++)
     8         B[i] = A[i];
     9
    10     for (unsigned i = 0; i < N; i++)
    11         B[N-i] = B[i];
    12
    13     for (unsigned i = 0; i < N; i++)
    14         A[i] = B[i];
    15 }

    (4) Relaxing Loop-Carried Dependency by Inferring Shift Registers

      为了使SDK能够有效地处理执行双精度浮点运算的单个work-item kernels,可以通过推断移位寄存器的方式移除循环携带的依赖性。

    优化前:

     1 __kernel void double_add_1 (__global double *arr,
     2                             int N,
     3                             __global double *result)
     4 {
     5   double temp_sum = 0;
     6
     7   for (int i = 0; i < N; ++i)
     8   {
     9       temp_sum += arr[i];
    10   }
    11 
    12   *result = temp_sum;
    13 }

    优化后:

     1 //Shift register size must be statically determinable
     2 #define II_CYCLES 12
     3
     4 __kernel void double_add_2 (__global double *arr,
     5                             int N,
     6                             __global double *result)
     7 {
     8     //Create shift register with II_CYCLE+1 elements
     9     double shift_reg[II_CYCLES+1];
    10    
    11     //Initialize all elements of the register to 0
    12     for (int i = 0; i < II_CYCLES + 1; i++)
    13     {
    14         shift_reg[i] = 0;
    15     }
    16    
    17     //Iterate through every element of input array
    18     for(int i = 0; i < N; ++i)
    19     {
    20         //Load ith element into end of shift register
    21         //if N > II_CYCLE, add to shift_reg[0] to preserve values
    22         shift_reg[II_CYCLES] = shift_reg[0] + arr[i];
    23 
    24         #pragma unroll
    25         //Shift every element of shift register
    26         for(int j = 0; j < II_CYCLES; ++j)
    27         {
    28             shift_reg[j] = shift_reg[j + 1];
    29         }
    30     }
    31 
    32     //Sum every element of shift register
    33     double temp_sum = 0;
    34     
    35     #pragma unroll 
    36     for(int i = 0; i < II_CYCLES; ++i)
    37     {
    38         temp_sum += shift_reg[i];
    39     }
    40
    41     *result = temp_sum;
    42 }

    (5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays

      在单个work-item的kernel中包含ivdep pragma以保证堆内存array的访问不会导致循环携带的依赖性。

      如果对循环中的内存阵列的所有访问都不会导致循环携带的依赖性,在内核代码中的循环之前添加#pragma ivdep。

    // no loop-carried dependencies for A and B array accesses
    #pragma ivdep
    for (int i = 0; i < N; i++) {
        A[i] = A[i - X[i]];
        B[i] = B[i - Y[i]];
    }

      要指定对循环内特定内存数组的访问不会导致循环携带的依赖关系,在内核代码中的循环之前添加#pragma ivdep arrayarray_name)。

       ivdep pragma指定的数组必须是local / private内存数组,或者是指向global/local/private内存存储的指针变量。 如果指定的数组是指针,则ivdep pragma也适用于所有可能带有指定指针别名的数组。

       ivdep pragma指示指定的数组也可以是struct的数组或指针成员。

    // No loop-carried dependencies for A array accesses
    // The offline compiler will insert hardware that reinforces dependency constraints for B
    #pragma ivdep array(A)
    for (int i = 0; i < N; i++) {
        A[i] = A[i - X[i]];
        B[i] = B[i - Y[i]];
    }
    
    // No loop-carried dependencies for array A inside struct
    #pragma ivdep array(S.A)
    for (int i = 0; i < N; i++) {
        S.A[i] = S.A[i - X[i]];
    }
    
    // No loop-carried dependencies for array A inside the struct pointed by S
    #pragma ivdep array(S->X[2][3].A)
    for (int i = 0; i < N; i++) {
        S->X[2][3].A[i] = S.A[i - X[i]];
    }
    
    // No loop-carried dependencies for A and B because ptr aliases
    // with both arrays
    int *ptr = select ? A : B;
    #pragma ivdep array(ptr)
    for (int i = 0; i < N; i++) {
        A[i] = A[i - X[i]];
        B[i] = B[i - Y[i]];
    }
    
    // No loop-carried dependencies for A because ptr only aliases with A
    int *ptr = &A[10];
    #pragma ivdep array(ptr)
    for (int i = 0; i < N; i++) {
        A[i] = A[i - X[i]];
        B[i] = B[i - Y[i]];
    }

    2、Single work-item kernel的设计技巧

      避免指针混淆。使用restrict关键词。

      创建格式正确的循环。退出条件与整数进行比较,且每次迭代的增量为1。格式正确的嵌套循环也对最大化kernel性能有帮助。

      最小化loop-carried循环携带依赖遵循以下原则:

        避免使用指针算数;

        声明简单的数组索引;

        尽可能在kernel中使用恒定边界的循环。

      避免复杂的循环退出条件;

      将嵌套循环转换为单循环;

      避免条件循环; 避免if else中包含循环,尽量转换为在循环中包含if else。

      在尽可能深的范围内声明变量。

    未经允许,请勿转载
  • 相关阅读:
    js控制弹出窗口
    精品JS代码大全
    (转)QQ在线客服代码
    打字效果的实现方法
    “/”应用程序中的服务器错误。 操作必须使用一个可更新的查询。
    jQuery中 trigger() & bind() 使用心得
    (转)asp.net 的中的TimeSpan 详解
    (转)VS部分验证控件的实现方法
    Repeater控件嵌套使用
    图解使用Win8Api进行Metro风格的程序开发十三加解密
  • 原文地址:https://www.cnblogs.com/zhuzhudong/p/13403737.html
Copyright © 2011-2022 走看看