  • OpenACC sample code from the book (Jacobi iteration), part 3

    ▶ Solving the Poisson equation numerically with Jacobi iteration

    ● Use a data construct to force u0 to be copied into and out of the GPU only once each, and u1 to be copied into the GPU only once. A minimal sketch of the pattern is given first, followed by the full listing.
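
    The core pattern, stripped of the setup code, is sketched below: one data region spans all iterations, and each kernels region inside it only asserts that the arrays are already present, so no per-iteration transfers occur. This is only a minimal sketch (n is shorthand for (row + 1) * colPlus); the complete program follows.

    #pragma acc data copy(u0[0:n]) copyin(u1[0:n])          // transfers happen here, once per direction
    {
        for (int iter = 0; iter < maxIter; iter++)
        {
            #pragma acc kernels present(u0[0:n], u1[0:n])   // no per-iteration copies
            {
                // ... stencil update writing u1 from u0 ...
            }
            utemp = u0, u0 = u1, u1 = utemp;                // host-side pointer swap, outside the kernels region
        }
    }                                                       // u0 is copied back to the host here, once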

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <math.h>
     4 #include <time.h>
     5 #include <openacc.h>
     6 
     7 #if defined(_WIN32) || defined(_WIN64)
     8     #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
     9     #define timestruct clock_t
    10     #define gettime(a) (*(a) = clock())
    11     #define usec(t1,t2) (t2 - t1)
    12 #else
    13     #include <sys/time.h>
    14     #define gettime(a)  gettimeofday(a, NULL)
    15     #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   
    16     typedef struct timeval timestruct;
    17 #endif
    18 
    19 inline float uval(float x, float y)
    20 {
    21     return x * x + y * y;
    22 }
    23 
    24 int main()
    25 {
    26     const int row = 8191, col = 1023;
    27     const float height = 1.0, width = 2.0;
    28     const float hx = height / row, wy = width / col;
    29     const float fij = -4.0f;
    30     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
    31     const int maxIter = 100;
    32     const int colPlus = col + 1;
    33 
    34     float *restrict u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    35     float *restrict u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    36     float *utemp = NULL;
    37 
    38     // Initialization
    39     for (int ix = 0; ix <= row; ix++)
    40     {
    41         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
    42         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
    43     }
    44     for (int jy = 0; jy <= col; jy++)
    45     {
    46         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
    47         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
    48     }
    49     for (int ix = 1; ix < row; ix++)
    50     {
    51         for (int jy = 1; jy < col; jy++)
    52             u0[ix*colPlus + jy] = 0.0f;
    53     }
    54 
    55     // Computation
    56     timestruct t1, t2;
    57     acc_init(acc_device_nvidia);
    58     gettime(&t1);
    59 #pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])      // data construct outside the loop: builds a data region that persists across iterations (kernels)
    60     {
    61         for (int iter = 0; iter < maxIter; iter++)
    62         {
    63 #pragma acc kernels present(u0[0:((row + 1) * colPlus)], u1[0:((row + 1) * colPlus)])   // declare at each kernel launch that u0 and u1 are already on the device, so they are not copied again
    64             {
    65 #pragma acc loop independent
    66                 for (int ix = 1; ix < row; ix++)
    67                 {
    68 #pragma acc loop independent
    69                     for (int jy = 1; jy < col; jy++)
    70                     {
    71                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + 
    72                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
    73                     }
    74                 }
    75             }
    76             utemp = u0, u0 = u1, u1 = utemp;
    77         }
    78     }
    79     gettime(&t2);
    80 
    81     long long timeElapse = usec(t1, t2);
    82 #if defined(_WIN32) || defined(_WIN64)
    83     printf("\nElapsed time: %13ld ms.\n", timeElapse);
    84 #else    
    85     printf("\nElapsed time: %13ld us.\n", timeElapse);
    86 #endif
    87     free(u0);
    88     free(u1);
    89     acc_shutdown(acc_device_nvidia);
    90     //getchar();
    91     return 0;
    92 }

    ● Output, running on Win10; with PGI_ACC_NOTIFY turned off it can reach 67 ms

    D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
    main:
         51, Memory zero idiom, loop replaced by call to __c_mzero4
         59, Generating copy(u0[:colPlus*(row+1)])
             Generating copyin(u1[:colPlus*(row+1)])
         61, Generating present(u1[:colPlus*(row+1)],u0[:colPlus*(row+1)])
         66, Loop is parallelizable
         69, Loop is parallelizable
             Generating Tesla code
             66, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             69, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
         69, FMA (fused multiply-add) instruction(s) generated
    uval:
         21, FMA (fused multiply-add) instruction(s) generated
    
    D:\Code\OpenACC>main.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    
    ...
    
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    
    Elapsed time:           346 ms.

    ● nvvp result: most of the time goes into initializing the device; the compute time is already fairly small, and the copy time is even smaller, appearing only at the very beginning and the very end

    ● Output, running on Ubuntu, including the data produced by enabling PGI_ACC_TIME

    cuan@CUAN:~$ pgcc data.c -o data.exe -c99 -Minfo -acc
    main:
         51, Memory zero idiom, loop replaced by call to __c_mzero4
         59, Generating copy(u0[:colPlus*(row+1)])
             Generating copyin(u1[:colPlus*(row+1)])
         61, Generating present(utemp[:],u1[:colPlus*(row+1)],u0[:colPlus*(row+1)])
             FMA (fused multiply-add) instruction(s) generated
         66, Loop is parallelizable
         69, Loop is parallelizable
             Generating Tesla code
             66, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             69, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    uval:
         22, FMA (fused multiply-add) instruction(s) generated
    cuan@CUAN:~$ ./data.exe
    launch CUDA kernel  file=/home/cuan/data.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    
    ...
    
    launch CUDA kernel  file=/home/cuan/data.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    
    Elapsed time:         64828 us.
    
    Accelerator Kernel Timing data
    /home/cuan/data.c
      main  NVIDIA  devicenum=0
        time(us): 41,745
        59: data region reached 2 times
            59: data copyin transfers: 4
                 device time(us): total=5,188 max=1,308 min=1,287 avg=1,297
            79: data copyout transfers: 3
                 device time(us): total=2,598 max=1,308 min=11 avg=866
        61: data region reached 200 times
        63: compute region reached 100 times
            69: kernel launched 100 times
                grid: [32x1024]  block: [32x4]
                 device time(us): total=33,959 max=358 min=336 avg=339
                elapsed time(us): total=37,984 max=1,003 min=349 avg=379

    ● Moving the utemp pointer swap one loop level deeper (inside the kernels region) produces runtime error 715 or 719; see https://stackoverflow.com/questions/41366915/openacc-create-data-while-running-inside-a-kernels, which roughly attributes this kind of failure to a memory leak. A sketch of the presumed arrangement follows the logs below.

    D:\Code\OpenACC>main.exe
    launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
    line=74 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
    call to cuStreamSynchronize returned error 715: Illegal instruction

    call to cuMemFreeHost returned error 715: Illegal instruction

    D:\Code\OpenACC>main.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=74 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
    Failing in Thread:1
    call to cuStreamSynchronize returned error 719: Launch failed (often invalid pointer dereference)
    Failing in Thread:1
    call to cuMemFreeHost returned error 719: Launch failed (often invalid pointer dereference)
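
    The failing arrangement was presumably along the lines of the sketch below (reconstructed from the description and from the extra one-gang kernel launched at line 74 in the log; the exact code is not reproduced in the post; n again abbreviates (row + 1) * colPlus). A plausible reading, consistent with the Stack Overflow discussion linked above, is that the swap is then compiled into a serial device kernel operating on device copies of the pointers, so the host pointers and the runtime's present table fall out of sync and later accesses hit invalid addresses.

    #pragma acc kernels present(u0[0:n], u1[0:n])
    {
        #pragma acc loop independent
        for (int ix = 1; ix < row; ix++)
        {
            #pragma acc loop independent
            for (int jy = 1; jy < col; jy++)
                u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                    hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
        }
        utemp = u0, u0 = u1, u1 = utemp;   // swap now sits inside the kernels region: it becomes
                                           // a one-gang device kernel (line 74 in the log) and the
                                           // run fails with error 715/719
    }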

    ● Trying to add create(utemp) to the data construct, or to declare a temporary float *utemp right at the point where the pointers are swapped, both produce runtime error 700

    D:\Code\OpenACC>main.exe
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
    launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line=74 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
    Failing in Thread:1
    call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
    
    Failing in Thread:1
    call to cuMemFreeHost returned error 700: Illegal address during kernel execution
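
    For reference, the two attempted workarounds were presumably along these lines (a reconstruction, not code from the post; n abbreviates (row + 1) * colPlus), still with the swap inside the device region; both fail with error 700.

    // Variant 1: also name utemp in the data clause
    #pragma acc data copy(u0[0:n]) copyin(u1[0:n]) create(utemp)

    // Variant 2: declare the temporary right at the swap site
    { float *utemp = u0; u0 = u1; u1 = utemp; }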

    ▶ Restore the error control, adding a reduction clause to measure the per-iteration improvement

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <math.h>
     4 #include <time.h>
     5 #include <openacc.h>
     6 
     7 #if defined(_WIN32) || defined(_WIN64)
     8 #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
     9 #define timestruct clock_t
    10 #define gettime(a) (*(a) = clock())
    11 #define usec(t1,t2) (t2 - t1)
    12 #else
    13 #include <sys/time.h>
    14 #define gettime(a)  gettimeofday(a, NULL)
    15 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   
    16 typedef struct timeval timestruct;
    17 
    18 #define max(x,y) ((x) > (y) ? (x) : (y))
    19 #endif
    20 
    21 inline float uval(float x, float y)
    22 {
    23     return x * x + y * y;
    24 }
    25 
    26 int main()
    27 {
    28     const int row = 8191, col = 1023;
    29     const float height = 1.0, width = 2.0;
    30     const float hx = height / row, wy = width / col;
    31     const float fij = -4.0f;
    32     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2)), errControl = 0.0f;
    33     const int maxIter = 100;
    34     const int colPlus = col + 1;
    35 
    36     float *restrict u0 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    37     float *restrict u1 = (float *)malloc(sizeof(float)*(row + 1)*colPlus);
    38     float *utemp = NULL;
    39 
    40     // Initialization
    41     for (int ix = 0; ix <= row; ix++)
    42     {
    43         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
    44         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
    45     }
    46     for (int jy = 0; jy <= col; jy++)
    47     {
    48         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
    49         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
    50     }
    51     for (int ix = 1; ix < row; ix++)
    52     {
    53         for (int jy = 1; jy < col; jy++)
    54             u0[ix*colPlus + jy] = 0.0f;
    55     }
    56 
    57     // Computation
    58     timestruct t1, t2;
    59     acc_init(acc_device_nvidia);
    60     gettime(&t1);
    61 #pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])
    62     {
    63         for (int iter = 1; iter < maxIter; iter++)
    64         {
    65             float uerr = 0.0f;                                                              // uerr must be declared up front; otherwise its value is undefined once the block is left; the book gets this wrong
    66 #pragma acc kernels present(u0[0:(row + 1) * colPlus]) present(u1[0:(row + 1) * colPlus])
    67             {
    68 #pragma acc loop independent reduction(max:uerr)                                            // add a reduction clause to measure the improvement
    69                 for (int ix = 1; ix < row; ix++)
    70                 {
    71                     for (int jy = 1; jy < col; jy++)
    72                     {
    73                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) + 
    74                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
    75                         uerr = max(uerr, fabs(u0[ix * colPlus + jy] - u1[ix * colPlus + jy]));
    76                     }
    77                 }
    78             }
    79             printf("\niter = %d, uerr = %e\n", iter, uerr);
    80             if (uerr < errControl)
    81                 break;
    82             utemp = u0, u0 = u1, u1 = utemp;
    83         }
    84     }
    85     gettime(&t2);
    86 
    87     long long timeElapse = usec(t1, t2);
    88 #if defined(_WIN32) || defined(_WIN64)
    89     printf("\nElapsed time: %13ld ms.\n", timeElapse);
    90 #else    
    91     printf("\nElapsed time: %13ld us.\n", timeElapse);
    92 #endif
    93     free(u0);
    94     free(u1);
    95     acc_shutdown(acc_device_nvidia);
    96     //getchar();
    97     return 0;
    98 }

    ● Output: on Win10 this is fully twice as slow as the version without error control; nvvp shows no obvious change, so it is not included here

    D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
    main:
         53, Memory zero idiom, loop replaced by call to __c_mzero4
         61, Generating copy(u0[:colPlus*(row+1)])
             Generating copyin(u1[:colPlus*(row+1)])
         66, Generating present(u0[:colPlus*(row+1)])
             Generating implicit copy(uerr)
             Generating present(u1[:colPlus*(row+1)])
         69, Loop is parallelizable
         71, Loop is parallelizable
             Generating Tesla code
             69, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             71, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
                 Generating reduction(max:uerr)                                 // note the extra reduction information here
         71, FMA (fused multiply-add) instruction(s) generated
    uval:
         23, FMA (fused multiply-add) instruction(s) generated
    
    D:\Code\OpenACC>main.exe
    
    iter = 1, uerr = 2.496107e+00
    
    ...
    
    iter = 99, uerr = 2.202189e-02
    
    Elapsed time:           125 ms.

    ● Output, running on Ubuntu

    cuan@CUAN:~$ pgcc data+reduction.c -o data+reduction.exe -c99 -Minfo -acc
    main:
         53, Memory zero idiom, loop replaced by call to __c_mzero4
         61, Generating copyin(u1[:colPlus*(row+1)])
             Generating copy(u0[:colPlus*(row+1)])
         63, FMA (fused multiply-add) instruction(s) generated
         66, Generating present(u0[:colPlus*(row+1)])
             Generating implicit copy(uerr)
             Generating present(u1[:colPlus*(row+1)])
         69, Loop is parallelizable
         71, Loop is parallelizable
             Generating Tesla code
             69, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
             71, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
                 Generating reduction(max:uerr)
    uval:
         24, FMA (fused multiply-add) instruction(s) generated
    cuan@CUAN:~$ ./data+reduction.exe 
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 shared memory=1024
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=256 grid=1 block=256 shared memory=1024
    
    iter = 1, uerr = 2.496107e+00
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 shared memory=1024
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=256 grid=1 block=256 shared memory=1024
    
    ...
    
    iter = 98, uerr = 2.214956e-02
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 shared memory=1024
    launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line=71 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=256 grid=1 block=256 shared memory=1024
    
    iter = 99, uerr = 2.202189e-02
    
    Elapsed time:        101391 us.
    
    Accelerator Kernel Timing data
    /home/cuan/data+reduction.c
      main  NVIDIA  devicenum=0
        time(us): 73,976
        61: data region reached 2 times
            61: data copyin transfers: 4
                 device time(us): total=5,223 max=1,328 min=1,287 avg=1,305
            85: data copyout transfers: 3
                 device time(us): total=2,562 max=1,280 min=4 avg=854
        66: compute region reached 99 times
            71: kernel launched 99 times
                grid: [32x1024]  block: [32x4]
                 device time(us): total=61,336 max=622 min=618 avg=619
                elapsed time(us): total=63,876 max=1,645 min=630 avg=645
            71: reduction kernel launched 99 times
                grid: [1]  block: [256]
                 device time(us): total=4,161 max=48 min=41 avg=42
                elapsed time(us): total=7,376 max=1,082 min=51 avg=74
        66: data region reached 396 times
            66: data copyin transfers: 99
                 device time(us): total=297 max=18 min=2 avg=3
            79: data copyout transfers: 99
                 device time(us): total=397 max=14 min=3 avg=4

    ▶ Tried adding a collapse(2) clause to the compute loop directive, which merges the two smaller loops into one larger loop. The effect was not significant, so the results are not included here; a sketch of where the clause goes is below.
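
    The collapse(2) attempt would be written roughly as below (a sketch of where the clause goes, not the measured code; n abbreviates (row + 1) * colPlus): the clause on the outer loop directive asks the compiler to fuse the ix and jy loops into a single iteration space before scheduling gangs and vectors.

    #pragma acc kernels present(u0[0:n], u1[0:n])
    {
        #pragma acc loop independent collapse(2)   // fuse the two nested loops
        for (int ix = 1; ix < row; ix++)
            for (int jy = 1; jy < col; jy++)
                u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                    hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
    }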
