zoukankan      html  css  js  c++  java
  • 『CUDA C编程权威指南』第二章编程题选做

    第一题

    设置线程块中线程数为1024效果优于设置为1023,且提升明显,不过原因未知,以后章节看看能不能回答。

    第二题

    参考文件sumArraysOnGPUtimer.cu,设置block=256,新建内核,使每个线程处理两个元素。

    思路很简单,将数据的虚拟内存对半分为高低两块,每一内核线程同时处理两个索引区域序列相同的数据即可:

    # include <cuda_runtime.h>
    # include <stdio.h>
    # include <sys/time.h>
    # include "common.h"
    
    __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
    
        if (i < N/2) {
        	C[i] = A[i] + B[i];
        	C[i+N/2] = A[i+N/2] + B[i+N/2];
        }
    }
    
    int main(int argc, char **argv)
    {
        printf("%s Starting...
    ", argv[0]);
    
        // set up device
        int dev = 0;
        cudaDeviceProp deviceProp;
        CHECK(cudaGetDeviceProperties(&deviceProp, dev));
        printf("Using Device %d: %s
    ", dev, deviceProp.name);
        CHECK(cudaSetDevice(dev));
    
        // set up data size of vectors
        int nElem = 1 << 24;
        printf("Vector size %d
    ", nElem);
    
        // malloc host memory
        size_t nBytes = nElem * sizeof(float);
    
        float *h_A, *h_B, *hostRef, *gpuRef;
        h_A     = (float *)malloc(nBytes);
        h_B     = (float *)malloc(nBytes);
        hostRef = (float *)malloc(nBytes);
        gpuRef  = (float *)malloc(nBytes);
    
        double iStart, iElaps;
    
        // initialize data at host side
        iStart = cpuSecond();
        initialData(h_A, nElem);
        initialData(h_B, nElem);
        iElaps = cpuSecond() - iStart;
        printf("initialData Time elapsed %f sec
    ", iElaps);
        memset(hostRef, 0, nBytes);
        memset(gpuRef,  0, nBytes);
    
        // add vector at host side for result checks
        iStart = cpuSecond();
        sumArraysOnHost(h_A, h_B, hostRef, nElem);
        iElaps = cpuSecond() - iStart;
        printf("sumArraysOnHost Time elapsed %f sec
    ", iElaps);
    
        // malloc device global memory
        float *d_A, *d_B, *d_C;
        CHECK(cudaMalloc((float**)&d_A, nBytes));
        CHECK(cudaMalloc((float**)&d_B, nBytes));
        CHECK(cudaMalloc((float**)&d_C, nBytes));
    
        // transfer data from host to device
        CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
        CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
        CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));
    
        // invoke kernel at host side
        int iLen = 512;
        dim3 block (iLen);
        dim3 grid  ((nElem + block.x - 1) / block.x / 2);
        // <<<  16384, 512  >>>  Time elapsed 0.000747 sec 
        // <<<  32768, 512  >>>  Time elapsed  0.000709 sec
    
        iStart = cpuSecond();
        sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
        CHECK(cudaDeviceSynchronize());
        iElaps = cpuSecond() - iStart;
        printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec
    ", grid.x,
               block.x, iElaps);
    
        // check kernel error
        // CHECK(cudaGetLastError()) ;
    
        // copy kernel result back to host side
        CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    
        // check device results
        checkResult(hostRef, gpuRef, nElem);
    
        // free device global memory
        CHECK(cudaFree(d_A));
        CHECK(cudaFree(d_B));
        CHECK(cudaFree(d_C));
    
        // free host memory
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
    
        return(0);
    }
    

    第四题

    参考文件sumMatrixOnGPU-2D-gril-1D-block.cu,新建内核,使每个线程处理两个元素。

    思路同上,由于是二维索引,所以采取的划分是按照纵坐标y将数据对半划分,可以直观理解为沿着y/2将数据对折,然后同一个线程处理数据为两个块中对应位置即可:

    # include <cuda_runtime.h>
    # include <stdio.h>
    # include <sys/time.h>
    # include "common.h"
    
    // grid 2D block 1D
    __global__ void sumMatrixsOnGPUMix(float *MatA, float *MatB, float *MatC, 
    	int nx, int ny)
    {
        int ix = threadIdx.x + blockIdx.x * blockDim.x;
        int iy = blockIdx.y;
        int idx = iy * nx + ix;
    
        if (ix < nx && iy < ny/2) {
            MatC[idx] = MatA[idx] + MatB[idx];
        	MatC[idx + nx*ny/2] = MatA[idx + nx*ny/2] + MatB[idx + nx*ny/2];
        }
    }
    
    int main(int argc, char **argv){
    	printf("%s Startin... 
    ", argv[0]);
    
    	//set up device
    	int dev = 0;
    	cudaDeviceProp deviceProp;
    	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    	printf("Using Device %d: %s
    ", dev, deviceProp.name);
    	CHECK(cudaSetDevice(dev));
    
    	// matrix size
    	int nx = 1<<13;
    	int ny = 1<<5;  // 2**18
    
    	int nxy = nx * ny;
    	int nBytes = nxy * sizeof(float);
    	printf("Matrix size:nx %d, ny %d
    ", nx, ny);
    
    	float *h_A, *h_B, *hostRef, *gpuRef;
    	h_A = (float *)malloc(nBytes);
    	h_B = (float *)malloc(nBytes);
    	hostRef = (float *)malloc(nBytes);
    	gpuRef = (float *)malloc(nBytes);
    
    	// initialize data at host side
    	double iStart, iElaps;
    	iStart = cpuSecond();
    	initialData(h_A, nxy);
    	initialData(h_B, nxy);
    	iElaps = cpuSecond() - iStart;
    
    	memset(hostRef, 0, nBytes);
    	memset(gpuRef, 0, nBytes);
    
    	iStart = cpuSecond();
    	sumMatrixsOnHost(h_A, h_B, hostRef, nx, ny);
    	iElaps = cpuSecond() - iStart;
    
    	// malloc device global memory
    	float *d_MatA, *d_MatB, *d_MatC;
    	cudaMalloc((float **)&d_MatA, nBytes);
    	cudaMalloc((float **)&d_MatB, nBytes);
    	cudaMalloc((float **)&d_MatC, nBytes);
    
    	// transfer data from host to device
    	cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    	cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
    
    	// invoke kernel at host to device
    	dim3 block (256);  // 2维块设置
    	dim3 grid ((nx+block.x-1)/block.x, ny/2);  // 2维网格设置
    	/*
    	<<<(1024, 16384), (16, 1)>>> Time elapsed 0.021947sec
    	<<<(512, 16384), (32, 1)>>> Time elapsed  0.011039sec
    	<<<(64, 16384), (256, 1)>>> Time elapsed  0.009063sec
    	*/
    
    	iStart = cpuSecond();
    	sumMatrixsOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    	cudaDeviceSynchronize();  // 测试用,同步线程,实际无需等待子线程
    	iElaps = cpuSecond() - iStart;
    	printf("sumArraysOnGPU <<<(%d, %d), (%d, %d)>>> Time elapsed %f" 
    		   "sec
    ", grid.x, grid.y, block.x, block.y, iElaps);
    
    	cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);
    	checkResult(hostRef, gpuRef, nxy);
    
    	// free device global memory
    	cudaFree(d_MatA);
    	cudaFree(d_MatB);
    	cudaFree(d_MatC);
    
    	// free host memory
    	free(h_A);
    	free(h_B);
    	free(hostRef);
    	free(gpuRef);
    
    	// reset device
    	cudaDeviceReset();
    
    	return 0;
    }
    

    运行结果如下:

    附common.h文件

    # include <cuda_runtime.h>
    # include <stdio.h>
    # include <sys/time.h>
    # define CHECK(call)                                                           
    {                                                                              
        const cudaError_t error = call;                                            
        if (error != cudaSuccess)                                                  
        {                                                                          
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 
            fprintf(stderr, "code: %d, reason: %s
    ", error,                       
                    cudaGetErrorString(error));                                    
            exit(1);                                                               
        }                                                                          
    }
    
    void initialData(float *ip, int size)
    {
    	time_t t;
    	srand((unsigned int) time(&t));
    
    	for (int i=0; i<size; i++)
    	{
    		ip[i] = (float)(rand() & 0xFF)/10.0f;
    	}
    }
    
    double cpuSecond() {
    	struct timeval tp;
    	gettimeofday(&tp, NULL);
    	return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
    }
    
    void checkResult(float *hostRef, float *gpuRef, const int N) {
    	double epsilon = 1.0E-8;
    	bool match = 1;
    	for (int i=0; i<N; i++) {
    		if (abs(hostRef[i] - gpuRef[i]) > epsilon) {
    			match = 0;
    			printf("Arrays do not match!
    ");
    			printf("host %5.2f gpu %5.2f at current %d
    ", 
    				hostRef[i], gpuRef[i], i);
    			break;
    		}
    	}
    	if (match) printf("Arrays match.
    
    ");
    }
    
    void sumArraysOnHost(float *A, float *B, float *C, const int N) {
    	for (int idx=0; idx<N; idx++) 
    		C[idx] = A[idx] + B[idx];
    }
    
    void sumMatrixsOnHost(float *A, float *B, float *C, const int nx, const int ny){
    	float *ia = A;
    	float *ib = B;
    	float *ic = C;
    	for (int iy=0; iy<ny; iy++){
    		for (int ix=0; ix<nx; ix++){
    			ic[ix] = ia[ix] + ib[ix];
    		}
    		ia += nx;
    		ib += nx;
    		ic += nx;
    	}
    }
    
  • 相关阅读:
    linux下RTP编程(使用JRTPLIB)(转)
    omap 移植qt4.7.0
    mknod指令详解
    mplayer 在线播放错误
    移植Mplayer到OK6410开发板
    海量数据处理算法—Bit-Map
    海量数据面试题----分而治之/hash映射 + hash统计 + 堆/快速/归并排序
    并行、并发、同步和互斥
    阿里巴巴集团2014年校园招聘北京笔试题
    动态规划之矩阵连乘
  • 原文地址:https://www.cnblogs.com/hellcat/p/9585884.html
Copyright © 2011-2022 走看看