zoukankan      html  css  js  c++  java
  • OpenACC 书上的范例代码(Jacobi 迭代),part 1

    ▶ 使用Jacobi 迭代求泊松方程的数值解

    ● 原始串行版本

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <math.h>
     4 
     5 #if defined(_WIN32) || defined(_WIN64)                                                      // 统一计时器
     6 #include <C:Program FilesPGIwin6419.4includewrapsys	imeb.h>    
     7 #define gettime(a)  _ftime(a)
     8 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))        // 单位 ms
     9 typedef struct _timeb timestruct;
    10 #else
    11 #include <sys/time.h>
    12 #define gettime(a)  gettimeofday(a, NULL)
    13 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   // 单位 us
    14 typedef struct timeval timestruct;
    15 #endif
    16 
    17 #define IMPROV                                          // 是否额外使用 “每次计算的修正量” 作为退出循环的条件
    18 
    19 inline float uval(float x, float y)                     // 求该点到原点距离的平方
    20 {
    21     return x * x + y * y;
    22 }
    23 
    24 int main()
    25 {
    26     const int row = 8191, col = 1023;                   // 网格行数和列数,
    27     const float height = 1.0, width = 2.0;              // 实际高度和宽度,与网格行列数不成比例说明是矩形网格
    28     const float hx = height / row, wy = width / col;    // 每个网格的高度和宽度
    29     const float fij = -4.0f;                            // 函数 f(x,y) = -4,此时方程的解为 z = x^2 + y^2
    30     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));// 其他用到的参数    
    31     const int maxIter = 100;                            // 最大迭代次数
    32     const int colPlus = col + 1;                        // 实际列数
    33 #ifdef IMPROV
    34     const float errControl = 0.0f;                      // 修正量控制,取 0 表示无用
    35     float err = 0.0f;                                   // 修正量
    36 #endif
    37 
    38     float *u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);       // 用来存放网格数据的两张表,行列数等于 row 和 col 各自加 1,
    39     float *u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    40     float *utemp = NULL;                                                // 用于交换 u1 和 u0 的临时指针    
    41     
    42     // 初始化边界为 g(x,y) = x^2+y^2
    43     for (int ix = 0; ix <= row; ix++)                                   // 左右边界
    44     {
    45         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);        
    46         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
    47     }
    48     for (int jy = 0; jy <= col; jy++)                                   // 上下边界
    49     {
    50         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
    51         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
    52     }
    53     for (int ix = 1; ix < row; ix++)                                    // 内部格点初始化为 0.0f 
    54     {
    55         for (int jy = 1; jy < col; jy++)
    56             u0[ix*colPlus + jy] = 0.0f;
    57     }
    58     
    59     // 计算
    60     timestruct t1, t2;    
    61     gettime(&t1);    
    62     for (int iter = 1; iter < maxIter; iter++)
    63     {
    64         for (int ix = 1; ix < row; ix++)
    65         {
    66             for (int jy = 1; jy < col; jy++)
    67             {
    68                 u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + 
    69                     hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
    70 #ifdef IMPROV
    71                 err = max(fabs(u0[ix*colPlus + jy] - u1[ix*colPlus + jy]), err);  // 记录整张表上的最大修正量
    72 #endif
    73             }
    74         }
    75 #ifdef IMPROV        
    76         //printf("
    iter = %d, err = %e
    ", iter, err);                 // 逐次输出
    77         if (err < errControl)                                           // 修正量小于指定量就可以退出
    78             break;        
    79 #endif
    80         utemp = u0, u0 = u1, u1 = utemp;                                // 交换指针
    81     }
    82     gettime(&t2);
    83 
    84     long long timeElapse = usec(t1, t2);
    85     printf("
    Elapsed time: %13ld ms.
    ", timeElapse);
    86     free(u0);
    87     free(u1);
    88     getchar();
    89     return 0;
    90 }

    ● 输出结果(使用 IMPROV),可以看到很多 not fused,这都是可以改进的地方

    D:CodeOpenACC>pgcc main.c -Minfo -o main.exe                      // 普通编译
    main:
         66, FMA (fused multiply-add) instruction(s) generated          // 使用乘加指令
    uval:
         21, FMA (fused multiply-add) instruction(s) generated
    
    D:CodeOpenACC>pgcc main.c -Minfo -o main-fast.exe -fast           // 添加 fast 选项
    main:
         45, uval inlined, size=3 (inline) file main.c (20)             // 4 个内联函数
              43, Loop not fused: different loop trip count             // 担心 for 中存在数据依赖,拒绝并行
                  Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed  // 担心 u0 和 u1是否重叠,拒绝并行
                  Loop not vectorized: data dependency
                  Loop unrolled 4 times  // 循环展开
                  Generated 4 prefetches in scalar loop
         46, uval inlined, size=3 (inline) file main.c (20)
         50, uval inlined, size=3 (inline) file main.c (20)
              48, Loop not vectorized: data dependency
                  Loop unrolled 4 times
         51, uval inlined, size=3 (inline) file main.c (20)
         55, Memory zero idiom, loop replaced by call to __c_mzero4     // 使用 memcpy 来赋零值
         62, Loop not vectorized/parallelized: potential early exits    // 有额外脱离循环的条件,拒绝并行
         66, Loop not vectorized: data dependency
             Loop unrolled 2 times
             FMA (fused multiply-add) instruction(s) generated
    
    D:CodeOpenACC>main.exe
    
    Elapsed time:          2754 ms.
    
    D:CodeOpenACC>main-fast.exe
    
    Elapsed time:          2804 ms.                                     // 加了 fast 反而更慢

    ● 输出结果(不用 IMPROV),发现变快了,可见提前跳出循环的 if 语句对并行化有很大影响。在本例中我们让 errControl = 0,每次循环多一个判断(实际绝对不会跳出),就严重干扰了编译

    D:CodeOpenACC>pgcc main.c -Minfo -o main.exe
    main:
         66, FMA (fused multiply-add) instruction(s) generated
    uval:
         21, FMA (fused multiply-add) instruction(s) generated
    
    D:CodeOpenACC>pgcc main.c -Minfo -o main-fast.exe -fast
    main:
         45, uval inlined, size=3 (inline) file main.c (20)
              43, Loop not fused: different loop trip count
                  Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed
                  Loop not vectorized: data dependency
                  Loop unrolled 4 times
                  Generated 4 prefetches in scalar loop
         46, uval inlined, size=3 (inline) file main.c (20)
         50, uval inlined, size=3 (inline) file main.c (20)
              48, Loop not vectorized: data dependency
                  Loop unrolled 4 times
         51, uval inlined, size=3 (inline) file main.c (20)
         55, Memory zero idiom, loop replaced by call to __c_mzero4
         62, Loop not fused : function call before adjacent loop    //
         66, Loop not vectorized : data dependency
          Loop unrolled 4 times                                     // 展开次数由 2 变成 4
             FMA (fused multiply-add) instruction(s) generated
    
    D:CodeOpenACC>main.exe
    
    Elapsed time:          1384 ms.                                 // 变快了 1 倍
    
    D:CodeOpenACC>main-fast.exe
    
    Elapsed time:           618 ms.                                 // 再变快 1 倍

    ● 使用 OpenMP 优化(就一句导语)

    1 // #include <math.h> 下面
    2 #include <omp.h>
    3 
    4 //for (int iter = 1; iter < maxIter; iter++){ 下面
    5 #ifdef IMPROV
    6 #pragma omp parallel for reduction(max:err) default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus) private(err)
    7 #else
    8 #pragma omp parallel for default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus)
    9 #endif

    ● 输出结果

    D:CodeOpenACC>set OMP_NUM_THREADS=4                           // 使用 4 个线程
    
    D:CodeOpenACC>pgcc main.c -Minfo -o main4I.exe -fast -mp      // 用 IMPROV
    main:
         46, uval inlined, size=3 (inline) file main.c (21)
              44, Loop not fused: different loop trip count
                  Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed
                  Loop not vectorized: data dependency
                  Loop unrolled 4 times
                  Generated 4 prefetches in scalar loop
         47, uval inlined, size=3 (inline) file main.c (21)
         51, uval inlined, size=3 (inline) file main.c (21)
              49, Loop not vectorized: data dependency
                  Loop unrolled 4 times
         52, uval inlined, size=3 (inline) file main.c (21)
         56, Memory zero idiom, loop replaced by call to __c_mzero4
         64, Loop not vectorized/parallelized: potential early exits
         71, Parallel region activated                              // OpenMP 并行区
             Parallel loop activated with static block schedule
         73, Loop not vectorized: data dependency
             Loop unrolled 2 times
             FMA (fused multiply-add) instruction(s) generated    
         84, Begin critical section                                 // 脱出循环的判断导致的串行区
             End critical section
             Barrier                                                // 栅栏
             Parallel region terminated                             
    
    D:CodeOpenACC>main4I.exe                                      
    
    Elapsed time:           723 ms.                                 // 还是快了 3.8 倍
    
    D:CodeOpenACC>pgcc main.c -Minfo -o main4.exe -fast -mp       // 不用 IMPROV
    main:
         46, uval inlined, size=3 (inline) file main.c (21)
              44, Loop not fused: different loop trip count
                  Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed
                  Loop not vectorized: data dependency
                  Loop unrolled 4 times
                  Generated 4 prefetches in scalar loop
         47, uval inlined, size=3 (inline) file main.c (21)
         51, uval inlined, size=3 (inline) file main.c (21)
              49, Loop not vectorized: data dependency
                  Loop unrolled 4 times
         52, uval inlined, size=3 (inline) file main.c (21)
         56, Memory zero idiom, loop replaced by call to __c_mzero4
         64, Loop not vectorized/parallelized: contains a parallel region   // 有 OpenMP的并行区,拒绝并行
         71, Parallel region activated
             Parallel loop activated with static block schedule
         73, Loop not vectorized: data dependency
             Loop unrolled 4 times
             FMA (fused multiply-add) instruction(s) generated
         87, Barrier                                                // 没有了串行区
             Parallel region terminated
    
    D:CodeOpenACC>main4.exe
    
    Elapsed time:           430 ms.                                 // 还能再快点,加速比 1.4
             
    D:CodeOpenACC>set OMP_NUM_THREADS=8                           // 使用 8 线程
    
    D:CodeOpenACC>pgcc main.c -Minfo -o main8.exe -fast -mp
    
    ...// 跟 4 线程时一模一样
    
    D:CodeOpenACC>main8.exe
    
    Elapsed time:           399 ms.                                 // 不宰线性加速,加速比 1.5 

    ▶ 在 Ubuntu 下跑的结果,加速前比 win10 慢很多,关闭 IMPROV 并开启 OpenMP 和 fast 选项后速度接近

    mainI.exe           3907651 us
    
    mainI-fast.exe      1269052 us  // 极速比 3.1
    
    main.exe            1895224 us  // 加速比 2.1
    
    main-fast.exe        616599 us  // 加速比 6.4
    
    cuan@CUAN:~$ pgcc mainI.c -Minfo -o main4I-fast.exe -fast -mp // 要求我将 row,col,fij 放入 OpenMP 的 shared 导语中,在 win10 下没有显式放入也行
    PGC-S-0155-row must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 72)
    PGC-S-0155-col must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 74)
    PGC-S-0155-fij must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 76)
    PGC/x86-64 Linux 19.4-0: compilation completed with severe errors
    
    main4I-fast.exe      584937 us  
    
    main4-fast.exe       445125 us  // 加速比 8.8 
    
    main8-fast.exe       435593 us  // 不能继续线性加速
  • 相关阅读:
    ZOJ 2702 Unrhymable Rhymes(DP)
    unique() 去重函数
    HDU 4712 Hamming Distance(随机算法)
    HDU 4708 Rotation Lock Puzzle(模拟)
    HBase源代码分析之MemStore的flush发起时机、推断条件等详情(二)
    Androidproject师进阶之路 :《Android开发进阶:从小工到专家》上市啦!
    POJ1062 昂贵的聘礼(最短路)
    easyui required 提交验证
    leetcode
    【翻译自mos文章】在Oracle单机数据库中定义database service
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9414715.html
Copyright © 2011-2022 走看看