zoukankan      html  css  js  c++  java
  • cuda实现向量相加

    cuda实现向量相加

    博客最后附上整体代码

    如果有说的不对的地方还请前辈指出, 因为cuda真的接触没几天

    一些总结(建议看)

    1. cuda 程序并不是纯粹由 GPU 运行的, 而是 cpu 与 gpu 一起运行: cpu 负责调度, gpu 负责运算. cpu 称为 **HOST**, gpu 称为 **DEVICE**
    2. 记住三个东西 grid block thread ,关系分别是 grid 包含多个 block , block 包含多个 thread
    3. 一个block中thread个数选取一般为32的整数倍, 原因和warp有关, 有兴趣自行查阅
    4. 一个grid中block的个数选取和你的kernel函数以及thread数量有关, 举个例子, int a[1000] 加上 int b[1000] , 每个block的thread数为64, 那么 block 数取 ⌈1000/64⌉ = 16 个合适 (向上取整, 即 (1000 + 64 - 1) / 64, 这样才能覆盖全部 1000 个元素)
    5. __global__函数一般表示一个内核函数,是一组由GPU执行的并行计算任务,由cpu调用
    6. __host__一般是由CPU调用,由CPU执行的函数,
    7. __device__一般表示由GPU中一个线程调用的函数

    代码实现

    引入

    #include <stdio.h>
    #include <cuda_runtime.h>
    

    kernel函数

    // Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, num).
    // Each thread handles at most one element, addressed by its global index
    // along the x dimension.
    __global__ void
    vectorAdd(float *a, float *b, float *c, int num){
            int i = blockDim.x * blockIdx.x + threadIdx.x; // global index: threads per block * block index + thread index inside the block
            if(i >= num){
                    return; // surplus threads past the end of the vectors do nothing
            }
            c[i] = a[i] + b[i];
    }
    

    int i = blockDim.x * blockIdx.x + threadIdx.x;

    这句代码解释一下:

    blockDim.x 表示 block 在 x 方向上的大小 (如果是一维的 block, 即一个 block 中有多少个 thread)

    blockIdx.x 表示当前线程所在的 block 的编号, 从 0 开始 (一维 grid 的话, 即该 grid 中第几个 block)

    threadIdx.x 表示当前线程在其 block 内的编号, 从 0 开始 (一维的 block 的话, 即该 block 中第几个 thread)

    画个图解释一下

    比如上面这个图的话, ABCDE 各代表一个 block, 总的为一个 Grid, 每个 block 中有四个 thread, 图中我画了箭头的也就是代表着第 1 个 block 中的第 0 个 thread (编号都从 0 开始).

    那么 i = blockDim.x * blockIdx.x + threadIdx.x 就是指 i = 4 * 1 + 0

    申请内存空间与释放

    host中申请内存

    // Allocate the three host-side buffers with malloc; `size` is the byte
    // count passed to each allocation (the full listing defines it as
    // num * sizeof(float)).
    // NOTE(review): the malloc results are not checked for NULL here.
    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);
    
    // Release the host buffers once they are no longer needed.
    free(a);
    free(b);
    free(c);
    

    device中申请内存

    // Device-side counterparts of a, b and c; they start as NULL and are
    // filled in by cudaMalloc below.
    float *da = NULL;
    float *db = NULL;
    float *dc = NULL;
    
    // cudaMalloc receives the address of each pointer and `size` bytes to
    // allocate.
    // NOTE(review): the cudaError_t return values are ignored here.
    cudaMalloc((void **)&da, size);
    cudaMalloc((void **)&db, size);
    cudaMalloc((void **)&dc, size);
    
    // Release the device buffers with cudaFree (not free).
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    

    host中内存copy到device

    // Upload only the two input vectors. The output buffer dc does not need
    // a host-to-device copy: the kernel overwrites every element of it, and
    // the host buffer c holds no meaningful data yet.
    cudaMemcpy(da,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(db,b,size,cudaMemcpyHostToDevice);
    

    上面的 cudaMemcpyHostToDevice 用于指定拷贝方向, 常用的方向关键词有以下四种

    cudaMemcpyHostToDevice | cudaMemcpyHostToHost | cudaMemcpyDeviceToDevice | cudaMemcpyDeviceToHost

    启动 kernel函数

    int threadPerBlock = 256;                        
    int blockPerGrid = (num + threadPerBlock - 1)/threadPerBlock; // ceiling division so the last partial block is included
    vectorAdd <<< blockPerGrid, threadPerBlock >>> (da,db,dc,num);
    

    此处确定了block中的thread数量以及一个grid中block数量

    利用kernel function <<< blockPerGrid, threadPerBlock>>> (paras,...) 来实现在cuda中运算

    参考

    https://zhuanlan.zhihu.com/p/345877391

    https://docs.nvidia.com/cuda/cuda-c-programming-guide/

    源码展示

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    #include <cuda_runtime.h>
    
    // vectorAdd runs on the device.
    //
    // Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, num).
    //
    // a, b : input vectors with at least num elements each (only read)
    // c    : output vector with at least num elements
    // num  : number of elements to process; nothing is written when num <= 0
    //
    // Written for a 1-D launch. Each thread starts at its global index and
    // advances by the total number of launched threads, so every element is
    // covered even when the grid has fewer threads than num.
    __global__ void 
    vectorAdd(const float *a, const float *b, float *c, int num){
    	// 64-bit arithmetic keeps the index and the stride from overflowing on very large launches
    	const long long stride = (long long)gridDim.x * blockDim.x;
    	for(long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x; i < num; i += stride){
    		c[i] = a[i] + b[i];
    	}
    }
    
    // Print a readable message and terminate when a CUDA runtime call fails.
    // `what` names the failing step for the error message.
    static void
    checkCuda(cudaError_t err, const char *what){
    	if(err != cudaSuccess){
    		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    		exit(EXIT_FAILURE);
    	}
    }
    
    // main runs on the host.
    //
    // Fills two random vectors, adds them on the device with vectorAdd,
    // copies the result back and compares it against a sum computed on the
    // host. Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE
    // otherwise (or when an allocation or CUDA call fails).
    int
    main(void){
    	int num = 10000; // number of elements in each vector
    	size_t size = num * sizeof(float);
    
    	// host memory
    	float *a = (float *)malloc(size);
    	float *b = (float *)malloc(size);
    	float *c = (float *)malloc(size);
    	if(a == NULL || b == NULL || c == NULL){
    		fprintf(stderr, "Failed to allocate host vectors!\n");
    		free(a);
    		free(b);
    		free(c);
    		return EXIT_FAILURE;
    	}
    
    	// init the vectors; start at index 0 so that every element that is
    	// verified later has a defined value
    	for(int i=0;i<num;++i){
    		a[i] = rand()/(float)RAND_MAX;
    		b[i] = rand()/(float)RAND_MAX;
    	}
    
    	// device memory
    	float *da = NULL;
    	float *db = NULL;
    	float *dc = NULL;
    
    	checkCuda(cudaMalloc((void **)&da, size), "cudaMalloc(da)");
    	checkCuda(cudaMalloc((void **)&db, size), "cudaMalloc(db)");
    	checkCuda(cudaMalloc((void **)&dc, size), "cudaMalloc(dc)");
    
    	// copy only the inputs to the device; dc is fully overwritten by the
    	// kernel, so uploading the uninitialized host buffer c is unnecessary
    	checkCuda(cudaMemcpy(da,a,size,cudaMemcpyHostToDevice), "cudaMemcpy(a -> da)");
    	checkCuda(cudaMemcpy(db,b,size,cudaMemcpyHostToDevice), "cudaMemcpy(b -> db)");
    
    	// launch the add kernel; ceiling division gives the last partial block
    	int threadPerBlock = 256;
    	int blockPerGrid = (num + threadPerBlock - 1)/threadPerBlock;
    	printf("threadPerBlock: %d \nblockPerGrid: %d \n",threadPerBlock,blockPerGrid);
    
    	vectorAdd <<< blockPerGrid, threadPerBlock >>> (da,db,dc,num);
    	// a kernel launch returns no status itself; launch errors are read back here
    	checkCuda(cudaGetLastError(), "vectorAdd launch");
    
    	// copy the device result to the host (a blocking cudaMemcpy, so the
    	// kernel has finished once it returns)
    	checkCuda(cudaMemcpy(c,dc,size,cudaMemcpyDeviceToHost), "cudaMemcpy(dc -> c)");
    
    	// Verify that the result vector is correct
    	int firstBad = -1; // index of the first mismatching element, -1 if none
    	for (int i = 0; i < num; ++i){
    		if (fabs(a[i] + b[i] - c[i]) > 1e-5){
    			firstBad = i;
    			break;
    		}
    	}
    
    	if(firstBad >= 0){
    		fprintf(stderr, "Result verification failed at element %d!\n", firstBad);
    	}else{
    		printf("Test PASSED\n");
    	}
    
    	// Free device global memory (also on the failure path, so nothing leaks)
    	checkCuda(cudaFree(da), "cudaFree(da)");
    	checkCuda(cudaFree(db), "cudaFree(db)");
    	checkCuda(cudaFree(dc), "cudaFree(dc)");
    	// Free host memory
    	free(a);
    	free(b);
    	free(c);
    
    	printf("free is ok\n");
    	return firstBad >= 0 ? EXIT_FAILURE : EXIT_SUCCESS;
    }
    
  • 相关阅读:
    Xcode 自定义代码段
    Foundation和CoreFoundation之间的转换
    如何由新特性跳转到App首页
    判断是否是新特性,决定是否进入欢迎页
    iOS面试题01
    公司生日聚会-水长城之游
    jquery操作html元素之(设置内容和属性)
    jquery操作html元素之( 获得内容和属性)
    解决:Map的area属性标签鼠标Hover可以给area加背景
    在网页制作中,面对设计图,网页制作人员一般要遵循的原则是什么?
  • 原文地址:https://www.cnblogs.com/wangha/p/10803696.html
Copyright © 2011-2022 走看看