▶ 书中第4章,数据管理部分的代码和说明
● 代码,关于 copy,copyin,copyout,create
// Chapter 4 demo: explicit create(d) plus implicit copy/copyin/copyout
// clauses generated by the compiler for a, b, c.
#include <stdio.h>
#ifdef _OPENACC
#include <openacc.h>
#endif

int main()
{
    const int length = 1024;
    int a[length], b[length], c[length], d[length];

    // BUG FIX: the original `for (int i = 0; i < length; a[i] = b[i] = c[i] = 1);`
    // used the assignment as the increment expression, so i never advanced and
    // the loop spun forever on element 0. Initialize every element properly.
    for (int i = 0; i < length; i++)
        a[i] = b[i] = c[i] = 1;

    {
#pragma acc kernels create(d)
        for (int i = 0; i < length; i++)
        {
            a[i]++;             // read+written on device -> implicit copy(a)
            c[i] = a[i] + b[i]; // b read only -> copyin(b); c written only -> copyout(c)
            d[i] = 0;           // d lives only on the device (explicit create)
        }
    }
    for (int i = 0; i < 10; i++)
        printf("a[%d] = %d, c[%d] = %d ", i, a[i], i, c[i]);
    getchar();
    return 0;
}
● 输出结果,显式创建了中间变量 d,隐式创建了 a,b,c,并具有不同的拷贝属性
D:CodeOpenACCOpenACCProjectOpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe main: 11, Generating create(d[:]) Generating implicit copyout(c[:1024]) Generating implicit copyin(b[:1024]) Generating implicit copy(a[:1024]) 12, Loop is parallelizable Accelerator kernel generated Generating Tesla code 12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
● 在 kernels 里单独使用 copyout 时报警告:PGC-W-0996-The directive #pragma acc copyout is deprecated; use #pragma acc declare copyout instead (main.c: XX)
● enter data 和 exit data 用于 C++。
■ 首先,windows 中 pgi 不支持 C++ 编译,只有 pgcc.exe 而没有 pgc++*.exe,只能乖乖到 Linux 下去写!
■ 书上的代码有点问题,大意是 OpenACC 的 copy 是浅拷贝,对于内含指针的数据结构(如 vector,class)不会连着指针指向的对象一起拷。这里有两种解决办法,一种是去结构化,将 class 中的数据集中成简单数组来进行拷贝;另一种是使用 Managed 内存,也就不存在显式拷贝的问题了。【https://stackoverflow.com/questions/53860467/how-to-copy-on-gpu-a-vector-of-vector-pointer-memory-allocated-in-openacc】
■ 书上的代码没有采用这两种解决方案,会报错 “call to cuStreamSynchronize returned error 700: Illegal address during kernel execution”,这个问题还蛮常见的【https://stackoverflow.com/search?q=call+to+returned+error+700%3A+Illegal+address+during+kernel+execution】
● 使用去结构化来使用数组
#include <iostream>
#include <vector>
#include <cstdint>

using namespace std;

// Flatten a table of std::vector<int32_t> into raw pointers + sizes so the
// data (not the vector objects, which OpenACC would only shallow-copy) can be
// moved to the device, then reduce the total on the accelerator.
int main()
{
    const int vectorCount = 1024, vectorLength = 20;
    long sum = 0;

    // 1024 vectors, each holding 20 copies of its own index i.
    vector<int32_t> *vectorTable = new vector<int32_t>[vectorCount];
    for (int i = 0; i < vectorCount; i++)
    {
        for (int j = 0; j < vectorLength; j++)
            vectorTable[i].push_back(i);
    }
    int32_t **arrayTable = new int32_t *[vectorCount]; // raw-data view mirroring vectorTable
    int *vectorSize = new int[vectorCount];            // length of each vector

#pragma acc enter data create(arrayTable [0:vectorCount] [0:0]) // create the pointer array on the device; note the [0:0] inner shape
    for (int i = 0; i < vectorCount; i++)
    {
        int sze = vectorTable[i].size();
        vectorSize[i] = sze;
        arrayTable[i] = vectorTable[i].data(); // point each slot at the vector's storage
#pragma acc enter data copyin(arrayTable [i:1][:sze]) // copy each vector's payload to the device
    }
#pragma acc enter data copyin(vectorSize[:vectorCount]) // sizes go to the device too

#pragma acc parallel loop gang vector reduction(+: sum) present(arrayTable, vectorSize) // device-side reduction
    for (int i = 0; i < vectorCount; i++)
    {
        for (int j = 0; j < vectorSize[i]; ++j)
            sum += arrayTable[i][j];
    }
    cout << "Sum: " << sum << endl;

#pragma acc exit data delete (vectorSize)
#pragma acc exit data delete (arrayTable)
    delete[] vectorSize;
    delete[] arrayTable; // BUG FIX: arrayTable was leaked in the original
    delete[] vectorTable;
    return 0;
}
● 输出结果
cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla -Minfo -acc main: 95, Generating enter data create(arrayTable[:1024][:0]) 32, Generating enter data copyin(arrayTable[i][:sze+1],vectorSize[:1024]) Generating implicit copy(sum) Generating present(vectorSize[:]) Generating Tesla code 32, Generating reduction(+:sum) 32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 34, #pragma acc loop seq 32, Generating present(arrayTable[:][:]) 34, Loop is parallelizable 41, Generating exit data delete(vectorSize[:1],arrayTable[:1][:1]) cuan@CUAN:~$ ./main.exe Sum: 10475520
● 使用 Managed 内存
#include <iostream>

using namespace std;

// Device-mirrored integer vector: each constructor maps `this` and `arr`
// onto the accelerator; the destructor unmaps them in reverse order.
// Built to be compiled with -ta=tesla:managed (see the notes below).
class ivector
{
public:
    int len;
    int *arr;

    ivector(int length)
    {
        len = length;
        arr = new int[len];
#pragma acc enter data copyin(this)
#pragma acc enter data create(arr [0:len])
#pragma acc parallel loop present(arr [0:len])
        // Temporary iend keeps the compiler from assuming len might change
        // inside the loop, which would make it refuse to parallelize.
        for (int iend = len, i = 0; i < iend; i++)
            arr[i] = i;
    }

    ivector(const ivector &s)
    {
        len = s.len;
        arr = new int[len];
#pragma acc enter data copyin(this)
#pragma acc enter data create(arr [0:len])
#pragma acc parallel loop present(arr [0:len], s.arr [0:len]) // s is already mapped on the device
        for (int iend = len, i = 0; i < iend; i++)
            arr[i] = s.arr[i];
    }

    // BUG FIX (rule of three): with a user-defined destructor and copy ctor,
    // the compiler-generated copy assignment would shallow-copy arr, causing
    // a double delete[] and a stale device mapping. Deep-copy instead.
    ivector &operator=(const ivector &s)
    {
        if (this == &s)
            return *this;
#pragma acc exit data delete (arr)
        delete[] arr;
        len = s.len;
        arr = new int[len];
#pragma acc enter data create(arr [0:len])
#pragma acc parallel loop present(arr [0:len], s.arr [0:len])
        for (int iend = len, i = 0; i < iend; i++)
            arr[i] = s.arr[i];
        return *this;
    }

    ~ivector()
    {
#pragma acc exit data delete (arr) // unmap arr first, then this
#pragma acc exit data delete (this)
        cout << "deconstruction!" << endl;
        delete[] arr;
        len = 0;
    }

    int &operator[](int i)
    {
        if (i < 0 || i >= this->len)
            return arr[0]; // out-of-range access degrades to element 0
        return arr[i];
    }

    void add(int c)
    {
#pragma acc kernels loop present(arr [0:len]) // every mutation of arr must state present
        for (int iend = len, i = 0; i < iend; i++)
            arr[i] += c;
    }

    void updateHost() // manually refresh the host-side copy of arr
    {
#pragma acc update host(arr [0:len])
    }
};

int main()
{
    ivector s1(20);
    s1.add(10);
    s1.updateHost();
    cout << "s1[1] = " << s1[1] << endl;

    ivector s2(s1);
    s2.updateHost();
    cout << "s2[1] = " << s2[1] << endl;

    return 0;
}
● 输出结果,不加 -ta=tesla:managed 会报错【填坑】
cuan@CUAN:~$ pgc++ main.cpp -o main.exe --c++11 -ta=tesla:managed -Minfo -acc ivector::ivector(int): 13, Generating enter data copyin(this[:1]) Generating enter data create(arr[:len]) Generating Tesla code 17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 13, Generating implicit copy(this[:]) Generating present(arr[:len]) ivector::ivector(const ivector&): 24, Generating enter data create(arr[:len]) Generating enter data copyin(this[:1]) Generating present(arr[:len]) Generating Tesla code 28, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 24, Generating implicit copyin(s[:]) Generating implicit copy(this[:]) Generating present(s->arr[:len]) ivector::~ivector(): 36, Generating exit data delete(this[:1],arr[:1]) ivector::add(int): 0, Generating Tesla code 49, Accelerator serial kernel generated Generating implicit copy(this[:]) Generating present(arr[:len]) 51, Loop is parallelizable Generating Tesla code 51, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ ivector::updateHost(): 58, Generating update self(arr[:len]) cuan@CUAN:~$ ./main.exe launch CUDA kernel file=/home/cuan/main.cpp function=_ZN7ivectorC1Ei line=13 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128 launch CUDA kernel file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=49 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1 launch CUDA kernel file=/home/cuan/main.cpp function=_ZN7ivector3addEi line=51 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128 s1[1] = 11 launch CUDA kernel file=/home/cuan/main.cpp function=_ZN7ivectorC1ERKS_ line=24 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128 s2[1] = 11 deconstruction! deconstruction!
● 在这本书上找到了 C++ 中使用 OpenACC 的办法 【https://www.elsevier.com/books/parallel-programming-with-openacc/farber/978-0-12-410397-9】,代码是【https://github.com/rmfarber/ParallelProgrammingWithOpenACC/tree/master/Chapter05】下的 accList.double.cpp
1 // accList.h 2 #ifndef ACC_LIST_H 3 #define ACC_LIST_H 4 5 #include <cstdlib> 6 #include <cassert> 7 #ifdef _OPENACC 8 #include <openacc.h> 9 #endif 10 11 template<typename T> 12 class accList 13 { 14 public: 15 explicit accList() {} 16 explicit accList(size_t size) // 构造函数把 this 指针拷进设备,然后创建内存 17 { 18 #pragma acc enter data copyin(this) 19 allocate(size); 20 } 21 22 ~accList() // 析构时释放内存,再删除 this 指针 23 { 24 release(); 25 #pragma acc exit data delete(this) 26 } 27 28 #pragma acc routine seq 29 T& operator[](size_t idx) 30 { 31 return _A[idx]; 32 } 33 34 #pragma acc routine seq 35 const T& operator[](size_t idx) const 36 { 37 return _A[idx]; 38 } 39 40 size_t size() const 41 { 42 return _size; 43 } 44 45 accList& operator=(const accList& B) 46 { 47 allocate(B.size()); 48 for (size_t j = 0; j < _size; ++j) 49 { 50 _A[j] = B[j]; 51 } 52 accUpdateDevice(); 53 return *this; 54 } 55 56 void insert(size_t idx, const T& val) 57 { 58 _A[idx] = val; 59 } 60 void insert(size_t idx, const T* val) 61 { 62 _A[idx] = *val; 63 } 64 65 void accUpdateSelf() 66 { 67 accUpdateSelfT(_A, 0); 68 } 69 void accUpdateDevice() 70 { 71 accUpdateDeviceT(_A, 0); 72 } 73 74 private: 75 T * _A{ nullptr }; // 数据成员只有指针和长度 76 size_t _size{ 0 }; 77 78 void release() 79 { 80 if (_size > 0) 81 { 82 #pragma acc exit data delete(_A[0:_size]) // 释放内存时删除设备内存 83 delete[] _A; 84 _A = nullptr; 85 _size = 0; 86 } 87 } 88 89 void allocate(size_t size) 90 { 91 if (_size != size) // 申请内存尺寸与当前尺寸不一致时重新开辟一块 92 { 93 release(); 94 _size = size; 95 #pragma acc update device(_size) 96 if (_size > 0) 97 { 98 _A = new T[_size]; 99 #ifdef _OPENACC // 有 OpenACC 的话检查 _A 是否已经在设备上了 100 assert(!acc_is_present(&_A[0], sizeof(T))); 101 #endif 102 #pragma acc enter data create(_A[0:_size]) // 在设备上申请新内存 103 } 104 } 105 } 106 107 template<typename U> 108 void accUpdateSelfT(U *p, long) 109 { 110 #pragma acc update self(p[0:_size]) 111 } 112 113 template<typename U> 114 auto accUpdateSelfT(U *p, int) -> decltype(p->accUpdateSelf()) 
115 { 116 for (size_t j = 0; j < _size; ++j) 117 { 118 p[j].accUpdateSelf(); 119 } 120 } 121 122 template<typename U> 123 void accUpdateDeviceT(U *p, long) 124 { 125 #pragma acc update device(p[0:_size]) 126 } 127 128 template<typename U> 129 auto accUpdateDeviceT(U *p, int) -> decltype(p->accUpdateDevice()) 130 { 131 for (size_t j = 0; j < _size; ++j) 132 { 133 p[j].accUpdateDevice(); 134 } 135 } 136 }; 137 #endif 138 139 // main.cpp 140 #include <iostream> 141 #include <cstdlib> 142 #include <cstdint> 143 #include "accList.h" 144 using namespace std; 145 #ifndef N 146 #define N 1024 147 #endif 148 149 int main() 150 { 151 accList<double> A(N), B(N); 152 for (int i = 0; i < B.size(); ++i) 153 B[i] = 2.5; 154 B.accUpdateDevice(); // 手动更新设备内存 155 #pragma acc parallel loop gang vector present(A,B) 156 for (int i = 0; i < A.size(); ++i) 157 A[i] = B[i] + i; 158 A.accUpdateSelf(); // 手动更新主机内存 159 for (int i = 0; i<10; ++i) 160 cout << "A[" << i << "]: " << A[i] << endl; 161 cout << "......" << endl; 162 for (int i = N - 10; i<N; ++i) 163 cout << "A[" << i << "]: " << A[i] << endl; 164 return 0; 165 }
● 运行结果
cuan@CUAN:~/acc$ pgc++ main.cpp -o main.exe -Minfo -acc main: 16, Generating present(B,A) Generating Tesla code 18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ accList<double>::accList(unsigned long): 4, include "accList.h" 18, Generating enter data copyin(this[:1]) accList<double>::~accList(): 4, include "accList.h" 25, Generating exit data delete(this[:1]) accList<double>::operator [](unsigned long): 4, include "accList.h" 29, Generating acc routine seq Generating Tesla code accList<double>::size() const: 4, include "accList.h" 40, Generating implicit acc routine seq Generating acc routine seq Generating Tesla code accList<double>::release(): 4, include "accList.h" 82, Generating exit data delete(_A[:_size]) accList<double>::allocate(unsigned long): 4, include "accList.h" 95, Generating update device(_size) 102, Generating enter data create(_A[:_size]) void accList<double>::accUpdateSelfT<double>(T1 *, long): 4, include "accList.h" 110, Generating update self(p[:_size]) void accList<double>::accUpdateDeviceT<double>(T1 *, long): 4, include "accList.h" 125, Generating update device(p[:_size]) cuan@CUAN:~/acc$ ./main.exe launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line=16 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128 A[0]: 2.5 A[1]: 3.5 A[2]: 4.5 A[3]: 5.5 A[4]: 6.5 A[5]: 7.5 A[6]: 8.5 A[7]: 9.5 A[8]: 10.5 A[9]: 11.5 ...... 
A[1014]: 1016.5 A[1015]: 1017.5 A[1016]: 1018.5 A[1017]: 1019.5 A[1018]: 1020.5 A[1019]: 1021.5 A[1020]: 1022.5 A[1021]: 1023.5 A[1022]: 1024.5 A[1023]: 1025.5 Accelerator Kernel Timing data /home/cuan/acc/main.cpp main NVIDIA devicenum=0 time(us): 13 16: compute region reached 1 time 16: kernel launched 1 time grid: [1024] block: [128] device time(us): total=13 max=13 min=13 avg=13 elapsed time(us): total=329 max=329 min=329 avg=329 16: data region reached 2 times /home/cuan/acc/main.cpp _ZN7accListIdEC1Em NVIDIA devicenum=0 time(us): 12 18: data region reached 2 times 18: data copyin transfers: 2 device time(us): total=12 max=9 min=3 avg=6 /home/cuan/acc/main.cpp _ZN7accListIdED1Ev NVIDIA devicenum=0 time(us): 0 25: data region reached 2 times /home/cuan/acc/main.cpp _ZN7accListIdE7releaseEv NVIDIA devicenum=0 time(us): 7 82: data region reached 2 times 82: data copyin transfers: 2 device time(us): total=7 max=4 min=3 avg=3 /home/cuan/acc/main.cpp _ZN7accListIdE8allocateEm NVIDIA devicenum=0 time(us): 11 95: update directive reached 2 times 95: data copyin transfers: 2 device time(us): total=5 max=3 min=2 avg=2 102: data region reached 2 times 102: data copyin transfers: 2 device time(us): total=6 max=3 min=3 avg=3 /home/cuan/acc/main.cpp _ZN7accListIdE14accUpdateSelfTIdEEvPT_l NVIDIA devicenum=0 time(us): 10 110: update directive reached 1 time 110: data copyout transfers: 1 device time(us): total=10 max=10 min=10 avg=10 /home/cuan/acc/main.cpp _ZN7accListIdE16accUpdateDeviceTIdEEvPT_l NVIDIA devicenum=0 time(us): 3 125: update directive reached 1 time 125: data copyin transfers: 1 device time(us): total=3 max=3 min=3 avg=3