zoukankan      html  css  js  c++  java
  • Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2

    ▶ 第四章,逐步优化了一个三维卷积计算的过程

    ● 基准代码

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <string.h>
      4 #include <math.h>
      5 #include <time.h>
      6 #include <sys/time.h>
      7 #include <omp.h>
      8 #include <assert.h>
      9 #include <sys/mman.h>
     10 
     11 #define REAL float
     12 #define NX (64)
     13 
     14 #ifndef M_PI
     15     #define M_PI (3.1415926535897932384626)
     16 #endif
     17 
     18 // 初始化格点矩阵
     19 void init(REAL *buff, const int nx, const int ny, const int nz, const REAL kx, const REAL ky, const REAL kz,
     20     const REAL dx, const REAL dy, const REAL dz, const REAL kappa, const REAL time)
     21 {
     22     REAL ax = exp(-kappa * time*(kx*kx)), ay = exp(-kappa * time*(ky*ky)), az = exp(-kappa * time*(kz*kz));
     23     for (int jz = 0; jz < nz; jz++)
     24     {
     25         for (int jy = 0; jy < ny; jy++)
     26         {
     27             for (int jx = 0; jx < nx; jx++)
     28             {
     29                 int j = (jz * ny + jy) * NX + jx;
     30                 REAL x = dx * ((REAL)(jx + 0.5)), y = dy * ((REAL)(jy + 0.5)), z = dz * ((REAL)(jz + 0.5));
     31                 buff[j] = (REAL)0.125*(1.0 - ax * cos(kx * x))*(1.0 - ay * cos(ky * y))*(1.0 - az * cos(kz * z));;
     32             }
     33         }
     34     }
     35 }
     36 
     37 // 计算卷积
     38 void diffusion(REAL *f1, REAL *f2, int nx, int ny, int nz,
     39     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
     40 {
     41     for (int i = 0; i < count; ++i)
     42     {
     43         for (int z = 0; z < nz; z++)
     44         {
     45             for (int y = 0; y < ny; y++)
     46             {
     47                 for (int x = 0; x < nx; x++)
     48                 {
     49                     int c = (z * ny + y) * NX + x;
     50                     int w = (x == 0) ? c : c - 1;
     51                     int e = (x == NX - 1) ? c : c + 1;
     52                     int n = (y == 0) ? c : c - NX;
     53                     int s = (y == ny - 1) ? c : c + NX;
     54                     int b = (z == 0) ? c : c - NX * ny;
     55                     int t = (z == nz - 1) ? c : c + NX * ny;
     56                     f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t];
     57                 }
     58             }
     59         }
     60         REAL *t = f1;
     61         f1 = f2;
     62         f2 = t;
     63     }
     64     return;
     65 }
     66 
     67 static double cur_second(void)                                      // 计时器,返回一个秒数
     68 {
     69     struct timeval tv;
     70     gettimeofday(&tv, NULL);
     71     return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
     72 }
     73 
     74 REAL accuracy(const REAL *b1, REAL *b2, const int len)              //计算两个数组的差距
     75 {
     76     REAL err = 0.0;
     77     for (int i = 0; i < len; i++)
     78         err += (b1[i] - b2[i]) * (b1[i] - b2[i]);
     79     return (REAL)sqrt(err / len);
     80 }
     81 
     82 void dump_result(REAL *f, int nx, int ny, int nz, char *out_path)   // 将结果写到文件中
     83 {
     84     FILE *out = fopen(out_path, "w");
     85     assert(out);
     86     fwrite(f, sizeof(REAL), nx * ny * nz, out);
     87     fclose(out);
     88 }
     89 
     90 int main(int argc, char *argv[])
     91 {
     92     int nx = NX, ny = NX, nz = NX;
     93     REAL *f1 = (REAL *)malloc(sizeof(REAL) * NX * NX * NX);
     94     REAL *f2 = (REAL *)malloc(sizeof(REAL) * NX * NX * NX);
     95     REAL *f3 = (REAL *)malloc(sizeof(REAL) * NX * ny * nz);
     96     assert(f1 != MAP_FAILED);
     97     assert(f2 != MAP_FAILED);
     98     assert(f3 != MAP_FAILED);
     99 
    100     REAL dx, dy, dz, kx, ky, kz;
    101     dx = dy = dz = 1.0 / nx;                // 边长 1.0
    102     kx = ky = kz = 2.0 * M_PI;
    103     REAL kappa = 0.1;
    104     REAL dt = 0.1 * dx * dx / kappa;
    105     int count = 0.1 / dt;
    106 
    107     init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, 0.0);
    108 
    109     REAL ce, cw, cn, cs, ct, cb, cc;
    110     ce = cw = kappa * dt / (dx * dx);
    111     cn = cs = kappa * dt / (dy * dy);
    112     ct = cb = kappa * dt / (dz * dz);
    113     cc = 1.0 - (ce + cw + cn + cs + ct + cb);
    114 
    115     printf("Running diffusion kernel %d times
    ", count);
    116     fflush(stdout);
    117     struct timeval time_b, time_e;
    118     gettimeofday(&time_b, NULL);
    119     diffusion(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, dt, count);
    120     gettimeofday(&time_e, NULL);
    121     //dump_result((count % 2) ? f2 : f1, nx, ny, nz, "diffusion_result.dat");
    122 
    123     init(f3, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, count * dt);            // 对比基准结果
    124     REAL err = accuracy((count % 2) ? f2 : f1, f3, nx*ny*nz);
    125     double elapsed_time = (time_e.tv_sec - time_b.tv_sec) + (time_e.tv_usec - time_b.tv_usec) * 1.0e-6;
    126     REAL mflops = (nx*ny*nz)*13.0*count / elapsed_time * 1.0e-06;
    127     double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count / elapsed_time * 1.0e-09;
    128 
    129     printf("Elapsed time : %.3f (s)
    FLOPS        : %.3f (MFlops)
    ", elapsed_time, mflops);
    130     printf("Throughput   : %.3f (GB/s)
    Accuracy     : %e
    ", thput, err);
    131 
    132     free(f1);
    133     free(f2);
    134     return 0;
    135 }

    ■ 输出结果

    Running diffusion kernel 1638 times
    Elapsed time : 177.015 (s)
    FLOPS        : 252.276 (MFlops)
    Throughput   : 0.233 (GB/s)
    Accuracy     : 5.068947e-06

    ● 计算内核加入 OpenMP

     1 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     2     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)// 加了 restrict
     3 {
     4     #pragma omp parallel                    // openMP 并行域
     5     {
     6         REAL *f1_t = f1, *f2_t = f2;        // 使用局部的指针
     7         for (int i = 0; i < count; ++i)
     8         {
     9             #pragma omp for collapse(2)     // 展开外两层循环
    10             for (int z = 0; z < nz; z++)
    11             {
    12                 for (int y = 0; y < ny; y++)
    13                 {
    14                     for (int x = 0; x < nx; x++)
    15                     {
    16                         int c = (z * ny + y) * NX + x;
    17                         int w = (x == 0) ? c : c - 1;
    18                         int e = (x == NX - 1) ? c : c + 1;
    19                         int n = (y == 0) ? c : c - NX;
    20                         int s = (y == ny - 1) ? c : c + NX;
    21                         int b = (z == 0) ? c : c - NX * ny;
    22                         int t = (z == nz - 1) ? c : c + NX * ny;
    23                         f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    24                     }
    25                 }
    26             }
    27             REAL *t = f1_t;
    28             f1_t = f2_t;
    29             f2_t = t;
    30         }
    31     }
    32     return;
    33 }

    ■ 输出结果

    Running diffusion kernel 1638 times
    Elapsed time : 2.936 (s)
    FLOPS        : 15209.439 (MFlops)
    Throughput   : 14.039 (GB/s)
    Accuracy     : 4.789139e-06

    ● 保证向量化

     1 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     2     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
     3 {
     4 #pragma omp parallel
     5         {
     6             REAL *f1_t = f1, *f2_t = f2;
     7             for (int i = 0; i < count; ++i) 
     8             {
     9                 #pragma omp for collapse(2)
    10                 for (int z = 0; z < nz; z++) 
    11                 {
    12                     for (int y = 0; y < ny; y++) 
    13                     {
    14                         #pragma simd                        // 保证向量化,不考虑 f1_t 和 f2_t 之间的独立子性
    15                         for (int x = 0; x < nx; x++) 
    16                         {
    17                         int c = (z * ny + y) * NX + x;
    18                         int w = (x == 0) ? c : c - 1;
    19                         int e = (x == NX - 1) ? c : c + 1;
    20                         int n = (y == 0) ? c : c - NX;
    21                         int s = (y == ny - 1) ? c : c + NX;
    22                         int b = (z == 0) ? c : c - NX * ny;
    23                         int t = (z == nz - 1) ? c : c + NX * ny;
    24                         f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    25                         }
    26                     }
    27                 }
    28                 REAL *t = f1_t;
    29                 f1_t = f2_t;
    30                 f2_t = t;
    31             }
    32         }
    33         return;
    34 }

    ■ 输出结果

    Running diffusion kernel 1638 times
    Elapsed time : 0.865 (s)
    FLOPS        : 51651.863 (MFlops)
    Throughput   : 47.679 (GB/s)
    Accuracy     : 4.427611e-06

    ● 手动剥离边界

     1 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     2     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
     3 {
     4 #pragma omp parallel
     5     {
     6         REAL *f1_t = f1, *f2_t = f2;
     7         for (int i = 0; i < count; ++i)
     8         {
     9             #pragma omp for collapse(2)
    10             for (int z = 0; z < nz; z++)
    11             {
    12                 for (int y = 0; y < ny; y++)
    13                 {
    14                     int x = 0;                                  // 每行首次
    15                     int c = (z * ny + y) * NX + x;              // 注意 w 方向的下标是 c
    16                     int n = (y == 0) ? c : c - NX;
    17                     int s = (y == ny - 1) ? c : c + NX;
    18                     int b = (z == 0) ? c : c - NX * ny;
    19                     int t = (z == nz - 1) ? c : c + NX * ny;
    20                     f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    21                     #pragma simd
    22                     for (x = 1; x < nx - 1; x++)                // 中间部分,注意循环要按照 OpenMP 格式书写
    23                     {
    24                         c++;
    25                         n++;
    26                         s++;
    27                         b++;
    28                         t++;
    29                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    30                     }
    31                     c++;                                        // 每行末次
    32                     n++;                                        // 注意 e 方向的下标是 c                                        
    33                     s++;
    34                     b++;
    35                     t++;
    36                     f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];   
    37                 }
    38             }
    39             REAL *t = f1_t;
    40             f1_t = f2_t;
    41             f2_t = t;
    42         }
    43     }
    44     return;
    45 }

    ■ 输出结果

    Running diffusion kernel 1638 times
    Elapsed time : 0.565 (s)
    FLOPS        : 79071.250 (MFlops)
    Throughput   : 72.989 (GB/s)
    Accuracy     : 4.577150e-06

    ● 数据切片

     1 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     2     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
     3 {
     4 #pragma omp parallel
     5     {
     6         REAL *f1_t = f1, *f2_t = f2;
     7         for (int i = 0; i < count; ++i)
     8         {
     9             #define YBF 16                                          // 分块大小
    10             #pragma omp for collapse(2)
    11             for (int yy = 0; yy < ny; yy += YBF)                    // 在循环之外放入分块
    12             {
    13                 for (int z = 0; z < nz; z++)
    14                 {
    15                     int yyy = (yy + YBF) >= ny ? ny : (yy + YBF);   // 该分块的末端
    16                     for (int y = yy; y < yyy; y++)                  // y 限定在分块内循环
    17                     {
    18                         int x = 0;
    19                         int c = (z * ny + y) * NX + x;
    20                         int n = (y == 0) ? c : c - NX;
    21                         int s = (y == ny - 1) ? c : c + NX;
    22                         int b = (z == 0) ? c : c - NX * ny;
    23                         int t = (z == nz - 1) ? c : c + NX * ny;
    24                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    25                         #pragma simd  
    26                         for (x = 1; x < nx - 1; x++)
    27                         {
    28                             c++;
    29                             n++;
    30                             s++;
    31                             b++;
    32                             t++;
    33                             f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    34                         }
    35                         c++;
    36                         n++;
    37                         s++;
    38                         b++;
    39                         t++;
    40                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
    41                     }
    42                 }
    43             }
    44             REAL *t = f1_t;
    45             f1_t = f2_t;
    46             f2_t = t;
    47         }
    48     }
    49     return;
    50 }

    ■ 输出结果,没有明显优化

    Running diffusion kernel 1638 times
    Elapsed time : 0.594 (s)
    FLOPS        : 75224.680 (MFlops)
    Throughput   : 69.438 (GB/s)
    Accuracy     : 4.577150e-06
  • 相关阅读:
    etcd数据单机部署
    PostgreSQL INSERT ON CONFLICT不存在则插入,存在则更新
    ERROR 1709 (HY000): Index column size too large. The maximum column size is 767 bytes.
    Hbase 0.92.1集群数据迁移到新集群
    PostgreSQL创建只读账户
    Kafka技术内幕 读书笔记之(六) 存储层——服务端处理读写请求、分区与副本
    Kafka技术内幕 读书笔记之(六) 存储层——日志的读写
    Kafka技术内幕 读书笔记之(五) 协调者——消费组状态机
    Kafka技术内幕 读书笔记之(五) 协调者——延迟的加入组操作
    Kafka技术内幕 读书笔记之(五) 协调者——协调者处理请求
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/10323596.html
Copyright © 2011-2022 走看看