▶ 使用Jacobi 迭代求泊松方程的数值解
● 原始串行版本
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 5 #if defined(_WIN32) || defined(_WIN64) // 统一计时器 6 #include <C:Program FilesPGIwin6419.4includewrapsys imeb.h> 7 #define gettime(a) _ftime(a) 8 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm)) // 单位 ms 9 typedef struct _timeb timestruct; 10 #else 11 #include <sys/time.h> 12 #define gettime(a) gettimeofday(a, NULL) 13 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec) // 单位 us 14 typedef struct timeval timestruct; 15 #endif 16 17 #define IMPROV // 是否额外使用 “每次计算的修正量” 作为退出循环的条件 18 19 inline float uval(float x, float y) // 求该点到原点距离的平方 20 { 21 return x * x + y * y; 22 } 23 24 int main() 25 { 26 const int row = 8191, col = 1023; // 网格行数和列数, 27 const float height = 1.0, width = 2.0; // 实际高度和宽度,与网格行列数不成比例说明是矩形网格 28 const float hx = height / row, wy = width / col; // 每个网格的高度和宽度 29 const float fij = -4.0f; // 函数 f(x,y) = -4,此时方程的解为 z = x^2 + y^2 30 const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));// 其他用到的参数 31 const int maxIter = 100; // 最大迭代次数 32 const int colPlus = col + 1; // 实际列数 33 #ifdef IMPROV 34 const float errControl = 0.0f; // 修正量控制,取 0 表示无用 35 float err = 0.0f; // 修正量 36 #endif 37 38 float *u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus); // 用来存放网格数据的两张表,行列数等于 row 和 col 各自加 1, 39 float *u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus); 40 float *utemp = NULL; // 用于交换 u1 和 u0 的临时指针 41 42 // 初始化边界为 g(x,y) = x^2+y^2 43 for (int ix = 0; ix <= row; ix++) // 左右边界 44 { 45 u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f); 46 u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy); 47 } 48 for (int jy = 0; jy <= col; jy++) // 上下边界 49 { 50 u0[jy] = u1[jy] = uval(0.0f, jy * wy); 51 u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy); 52 } 53 for (int ix = 1; ix < row; ix++) // 内部格点初始化为 0.0f 54 { 55 for (int jy = 1; jy < col; jy++) 56 u0[ix*colPlus + jy] = 0.0f; 57 } 58 59 // 计算 60 timestruct t1, t2; 61 gettime(&t1); 62 for (int iter = 1; iter < maxIter; iter++) 63 { 64 for (int ix = 1; ix < row; ix++) 65 { 66 for (int jy = 1; jy < col; jy++) 67 { 68 u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + 69 hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2; 70 #ifdef IMPROV 71 err = max(fabs(u0[ix*colPlus + jy] - u1[ix*colPlus + jy]), err); // 记录整张表上的最大修正量 72 #endif 73 } 74 } 75 #ifdef IMPROV 76 //printf(" iter = %d, err = %e ", iter, err); // 逐次输出 77 if (err < errControl) // 修正量小于指定量就可以退出 78 break; 79 #endif 80 utemp = u0, u0 = u1, u1 = utemp; // 交换指针 81 } 82 gettime(&t2); 83 84 long long timeElapse = usec(t1, t2); 85 printf(" Elapsed time: %13ld ms. ", timeElapse); 86 free(u0); 87 free(u1); 88 getchar(); 89 return 0; 90 }
● 输出结果(使用 IMPROV),可以看到很多 not fused,这都是可以改进的地方
D:CodeOpenACC>pgcc main.c -Minfo -o main.exe // 普通编译 main: 66, FMA (fused multiply-add) instruction(s) generated // 使用乘加指令 uval: 21, FMA (fused multiply-add) instruction(s) generated D:CodeOpenACC>pgcc main.c -Minfo -o main-fast.exe -fast // 添加 fast 选项 main: 45, uval inlined, size=3 (inline) file main.c (20) // 4 个内联函数 43, Loop not fused: different loop trip count // 担心 for 中存在数据依赖,拒绝并行 Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed // 担心 u0 和 u1是否重叠,拒绝并行 Loop not vectorized: data dependency Loop unrolled 4 times // 循环展开 Generated 4 prefetches in scalar loop 46, uval inlined, size=3 (inline) file main.c (20) 50, uval inlined, size=3 (inline) file main.c (20) 48, Loop not vectorized: data dependency Loop unrolled 4 times 51, uval inlined, size=3 (inline) file main.c (20) 55, Memory zero idiom, loop replaced by call to __c_mzero4 // 使用 memcpy 来赋零值 62, Loop not vectorized/parallelized: potential early exits // 有额外脱离循环的条件,拒绝并行 66, Loop not vectorized: data dependency Loop unrolled 2 times FMA (fused multiply-add) instruction(s) generated D:CodeOpenACC>main.exe Elapsed time: 2754 ms. D:CodeOpenACC>main-fast.exe Elapsed time: 2804 ms. // 加了 fast 反而更慢
● 输出结果(不用 IMPROV),发现变快了,可见提前跳出循环的 if 语句对并行化有很大影响。在本例中我们让 errControl = 0,每次循环多一个判断(实际绝对不会跳出),就严重干扰了编译
D:CodeOpenACC>pgcc main.c -Minfo -o main.exe main: 66, FMA (fused multiply-add) instruction(s) generated uval: 21, FMA (fused multiply-add) instruction(s) generated D:CodeOpenACC>pgcc main.c -Minfo -o main-fast.exe -fast main: 45, uval inlined, size=3 (inline) file main.c (20) 43, Loop not fused: different loop trip count Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed Loop not vectorized: data dependency Loop unrolled 4 times Generated 4 prefetches in scalar loop 46, uval inlined, size=3 (inline) file main.c (20) 50, uval inlined, size=3 (inline) file main.c (20) 48, Loop not vectorized: data dependency Loop unrolled 4 times 51, uval inlined, size=3 (inline) file main.c (20) 55, Memory zero idiom, loop replaced by call to __c_mzero4 62, Loop not fused : function call before adjacent loop // ? 66, Loop not vectorized : data dependency Loop unrolled 4 times // 展开次数由 2 变成 4 FMA (fused multiply-add) instruction(s) generated D:CodeOpenACC>main.exe Elapsed time: 1384 ms. // 变快了 1 倍 D:CodeOpenACC>main-fast.exe Elapsed time: 618 ms. // 再变快 1 倍
● 使用 OpenMP 优化(就一句导语)
1 // #include <math.h> 下面 2 #include <omp.h> 3 4 //for (int iter = 1; iter < maxIter; iter++){ 下面 5 #ifdef IMPROV 6 #pragma omp parallel for reduction(max:err) default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus) private(err) 7 #else 8 #pragma omp parallel for default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus) 9 #endif
● 输出结果
D:CodeOpenACC>set OMP_NUM_THREADS=4 // 使用 4 个线程 D:CodeOpenACC>pgcc main.c -Minfo -o main4I.exe -fast -mp // 用 IMPROV main: 46, uval inlined, size=3 (inline) file main.c (21) 44, Loop not fused: different loop trip count Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed Loop not vectorized: data dependency Loop unrolled 4 times Generated 4 prefetches in scalar loop 47, uval inlined, size=3 (inline) file main.c (21) 51, uval inlined, size=3 (inline) file main.c (21) 49, Loop not vectorized: data dependency Loop unrolled 4 times 52, uval inlined, size=3 (inline) file main.c (21) 56, Memory zero idiom, loop replaced by call to __c_mzero4 64, Loop not vectorized/parallelized: potential early exits 71, Parallel region activated // OpenMP 并行区 Parallel loop activated with static block schedule 73, Loop not vectorized: data dependency Loop unrolled 2 times FMA (fused multiply-add) instruction(s) generated 84, Begin critical section // 脱出循环的判断导致的串行区 End critical section Barrier // 栅栏 Parallel region terminated D:CodeOpenACC>main4I.exe Elapsed time: 723 ms. // 还是快了 3.8 倍 D:CodeOpenACC>pgcc main.c -Minfo -o main4.exe -fast -mp // 不用 IMPROV main: 46, uval inlined, size=3 (inline) file main.c (21) 44, Loop not fused: different loop trip count Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed Loop not vectorized: data dependency Loop unrolled 4 times Generated 4 prefetches in scalar loop 47, uval inlined, size=3 (inline) file main.c (21) 51, uval inlined, size=3 (inline) file main.c (21) 49, Loop not vectorized: data dependency Loop unrolled 4 times 52, uval inlined, size=3 (inline) file main.c (21) 56, Memory zero idiom, loop replaced by call to __c_mzero4 64, Loop not vectorized/parallelized: contains a parallel region // 有 OpenMP的并行区,拒绝并行 71, Parallel region activated Parallel loop activated with static block schedule 73, Loop not vectorized: data dependency Loop unrolled 4 times FMA (fused multiply-add) instruction(s) generated 87, Barrier // 没有了串行区 Parallel region terminated D:CodeOpenACC>main4.exe Elapsed time: 430 ms. // 还能再快点,加速比 1.4 D:CodeOpenACC>set OMP_NUM_THREADS=8 // 使用 8 线程 D:CodeOpenACC>pgcc main.c -Minfo -o main8.exe -fast -mp ...// 跟 4 线程时一模一样 D:CodeOpenACC>main8.exe Elapsed time: 399 ms. // 不宰线性加速,加速比 1.5
▶ 在 Ubuntu 下跑的结果,加速前比 win10 慢很多,关闭 IMPROV 并开启 OpenMP 和 fast 选项后速度接近
mainI.exe 3907651 us mainI-fast.exe 1269052 us // 极速比 3.1 main.exe 1895224 us // 加速比 2.1 main-fast.exe 616599 us // 加速比 6.4 cuan@CUAN:~$ pgcc mainI.c -Minfo -o main4I-fast.exe -fast -mp // 要求我将 row,col,fij 放入 OpenMP 的 shared 导语中,在 win10 下没有显式放入也行 PGC-S-0155-row must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 72) PGC-S-0155-col must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 74) PGC-S-0155-fij must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: 76) PGC/x86-64 Linux 19.4-0: compilation completed with severe errors main4I-fast.exe 584937 us main4-fast.exe 445125 us // 加速比 8.8 main8-fast.exe 435593 us // 不能继续线性加速