首先这里有两段代码:
main.cpp:
#include <stdio.h>
#include <iostream>

// func() is defined in test.cu. It is declared with C linkage so that the
// unmangled symbol name matches the `extern "C"` definition there.
extern "C" int func();

// Program entry point: prints a greeting, calls func(), and exits with 0.
int main()
{
    std::cout << "Hello C++" << std::endl;
    func();  // the returned status is discarded
    return 0;
}
test.cu:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

// Element-wise subtraction kernel: c[i] = b[i] - a[i].
//
// Launch layout: 1D grid of 1D blocks. Each thread handles the element at its
// global index and does nothing when that index is >= n, so the grid does not
// have to divide the element count evenly.
//
// n defaults to the largest unsigned value (i.e. no bound) so that a
// three-argument launch behaves as it did before the parameter was added; such
// launches must supply exactly one thread per element.
__global__ void testThread1(int *c, const int *a, const int *b,
                            unsigned int n = 0xFFFFFFFFu)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = b[i] - a[i];
    }
}

// Performs the device-side work for addWithCuda and returns the first CUDA
// error encountered (cudaSuccess if every step succeeded). Device buffers are
// released on every path.
static cudaError_t subtractOnDevice(int *c, const int *a, const int *b,
                                    unsigned int size)
{
    const size_t bytes = (size_t)size * sizeof(int);
    const unsigned int threads = 256;
    // Ceil-division written so that it cannot overflow for large sizes.
    const unsigned int blocks = size / threads + (size % threads != 0 ? 1u : 0u);

    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;

    cudaError_t err = cudaSetDevice(0);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_c, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_b, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        testThread1<<<blocks, threads>>>(dev_c, dev_a, dev_b, size);
        // A kernel launch returns nothing; configuration errors are reported here.
        err = cudaGetLastError();
    }
    // The blocking copy also waits for the kernel and reports execution errors.
    if (err == cudaSuccess) err = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return err;
}

// Computes c[i] = b[i] - a[i] for i in [0, size) on CUDA device 0.
//
// c, a, b are host arrays of at least `size` ints. On a CUDA failure a message
// is written to stderr and the contents of c are left unchanged.
void addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    if (size == 0) {
        return;  // nothing to compute; a zero-sized launch would be rejected
    }

    const cudaError_t err = subtractOnDevice(c, a, b, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "addWithCuda: CUDA error: %s\n", cudaGetErrorString(err));
    }
}

// Self-test entry point, exported with C linkage so a plain C++ translation
// unit can call it.
//
// Fills two arrays with pseudo-random values in [0, 100), computes b - a on the
// host and on the GPU, writes the paired results to out.txt, and prints "pass"
// when the two results match in every element, "no pass" otherwise.
// Always returns 0.
extern "C" int func()
{
    const int n = 1000;
    // cc is zero-filled so that it holds defined values even if the GPU path fails.
    std::vector<int> a(n), b(n), c(n), cc(n);

    for (int i = 0; i < n; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
        c[i] = b[i] - a[i];  // host reference result
    }

    addWithCuda(cc.data(), a.data(), b.data(), n);

    FILE *fp = fopen("out.txt", "w");
    if (fp != NULL) {
        for (int i = 0; i < n; i++)
            fprintf(fp, "%d %d ", c[i], cc[i]);
        fclose(fp);
    } else {
        // The dump is only diagnostic output; keep going with the comparison.
        perror("out.txt");
    }

    bool flag = true;
    for (int i = 0; i < n; i++) {
        if (c[i] != cc[i]) {
            flag = false;
            break;
        }
    }

    if (flag == false)
        printf("no pass");
    else
        printf("pass");

    cudaDeviceReset();
    return 0;
}
Linux下可以这样:
nvcc -c test.cu
g++ -c main.cpp
g++ -o main main.o test.o -lcudart -L/usr/local/cuda/lib64
Windows下可以这样:
nvcc -c test.cu
cl -c main.cpp
link -out:main.exe main.obj test.obj cudart.lib -libpath:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib\x64"
两个平台的流程基本一致：先用 nvcc 把 test.cu 编译成目标文件，再用主机编译器编译 main.cpp，最后链接两个目标文件和 cudart 库。