zoukankan      html  css  js  c++  java
  • 6.2 CUDA streams

    stream是什么

    NVIDIA给出的解释是:
    A sequence of operations that execute in issue-order on the GPU.  可以理解成在GPU上执行的操作序列.比如下面的这些动作.

    cudaMemcpy()
    kernel launch
    device sync
    cudaMemcpy()

    不同的流操作可能是交叉执行的,可能是同时执行的.

    流的API:

    cudaEvent_t start;
    cudaEventCreate(&start);
    cudaEventRecord( start, 0 );

    我们可以把一个应用程序的整体对的stream的情况称之为pipeline.优化程序以stream的角度就是优化pipeline 

    cuda overlap重叠

    支持设备重叠的cuda GPU设备能够在执行kernel函数时同时执行设备与主机之间的内存拷贝动作.可以用下面的代码查看设备是否支持overlap:

    int dev_count;
    cudaDeviceProp prop;
    cudaGetDeviceCount( &dev_count);
    for (int i = 0; i < dev_count; i++) {
        cudaGetDeviceProperties(&prop, i);
        if (prop.deviceOverlap) ...

    cudaMemcpyAsync()

    memcpy是以同步方式执行的,当函数返回时,复制操作已经完成.而cudaMemcpyAsync()是异步函数,它只是放置一个请求,表示在流中执行一次内存复制操作,这个复制操作是通过参数stream来指定的.当函数返回时我们无法保证函数已经执行完成,能够保证的是复制操作肯定会在下一个放入流的操作之前执行完成.任何传递给cudaMemcpyAsync()的主机内存指针都必须已经通过cudaHostAlloc()分配好内存,也就是,你只能以异步方式对页锁定内存进行复制操作.

    Vector stream add 向量流加法

    优化这个pipeline,最理想的pipeline如下:


    可以看到在同一时间,launch kernel, copy host to device, copy device back to host 三个任务同时执行. 有2个stream流,一个是copy, 一个用于执行kernel.

    实际优化pipeline的时候并不是这么简单和容易的,先看下面一段host代码:

        for (int i=0; i<n; i+=SegSize*2) {
            cudaMemcpyAsync(d_A0, h_A+i, SegSize*sizeof(float),..., stream0);
            cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),..., stream0);
            vecAdd<<<SegSize/256, 256, 0, stream0>>>(d_A0, d_B0,...);
            cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),..., stream0);
            cudaMemcpyAsync(d_A1, h_A+i+SegSize, SegSize*sizeof(float),...,
                            stream1);
            cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float) ,...,
                            stream1);
            vecAdd<<<SegSize/256, 256, 0, stream1>>>(d_A1, d_B1, ...);
            cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),...,
                            stream1);
        }

    这段代码的pipeline的情况是: 执行kernel计算和 下一块拷贝主机内存到设备是同时进行的.

     再看下面这段代码:

    for (int i=0; i<n; i+=SegSize*2) {
            cudaMemcpyAsync(d_A0, h_A+i, SegSize*sizeof(float),..., stream0);
            cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),..., stream0);
            cudaMemcpyAsync(d_A1, h_A+i+SegSize, SegSize*sizeof(float),...,
                            stream1);
            cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float),...,
                            stream1);
            vecAdd<<<SegSize/256, 256, 0, stream0>>>(d_A0, d_B0, ...);
            vecAdd<<<SegSize/256, 256, 0, stream1>>>(d_A1, d_B1, ...);
            cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),..., stream0);
            cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),...,
                            stream1);
    }

    这段代码的pipeline情况是:和上一种的区别是把拷贝A和B元素与kernel并行,可以形象的理解成,下一行向左移动一下,那么整个pipeline整体是缩短了的.

    stream 同步API

    cudaStreamSynchronize(stream_id): 等待一个stream中的所有任务执行完成.

    cudaDeviceSynchronize(): 不带参数等待设备中所有流任务执行完成

    Vector-stream-add Code

    首先使用2个stream来做:

    #include    <wb.h>
    // Error-check helper for CUDA runtime calls: on failure, logs the failing
    // statement and the CUDA error string, then returns -1 from the enclosing
    // function.  The trailing backslashes are mandatory line continuations
    // (they were lost when this listing was scraped, breaking the macro).
    #define wbCheck(stmt) do {                                                    \
            cudaError_t err = stmt;                                               \
            if (err != cudaSuccess) {                                             \
                wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
                wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
                return -1;                                                        \
            }                                                                     \
        } while(0)

    #define SegSize 256
    #define StreamNum 2
    
    // Element-wise vector addition: out[i] = in1[i] + in2[i].
    // Launch with a 1-D grid covering at least `len` threads; threads in the
    // grid tail (index >= len) return without touching memory.
    __global__ void vecAdd(float * in1, float * in2, float * out, int len) {
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= len)
            return;  // grid-tail guard
        out[idx] = in1[idx] + in2[idx];
    }
    
    // Host driver: streamed vector addition with 2 CUDA streams.
    // The input is processed in chunks of StreamNum*SegSize elements; each
    // stream owns one SegSize slice per chunk so its H2D copy, kernel launch
    // and D2H copy can overlap with the other stream's work.
    int main(int argc, char ** argv) {
        wbArg_t args;
        int inputLength;
        float * hostInput1;
        float * hostInput2;
        float * hostOutput;
        float *h_A, *h_B, *h_C;        // pinned staging buffers (async copies need page-locked memory)
        float *d_A0, *d_B0, *d_C0;     // device memory for stream 0
        float *d_A1, *d_B1, *d_C1;     // device memory for stream 1

        args = wbArg_read(argc, argv);
        int Csize = SegSize*sizeof(float);  // bytes per full segment

        wbTime_start(Generic, "Importing data and creating memory on host");
        hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
        hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
        hostOutput = (float *) malloc(inputLength * sizeof(float));
        printf("inputLength ==%d, SegSize =%d\n", inputLength, SegSize);
        wbTime_stop(Generic, "Importing data and creating memory on host");

        // cudaMemcpyAsync is only truly asynchronous with page-locked host
        // memory, so stage all input/output data in cudaHostAlloc'd buffers.
        // (The original allocated these, then copied from pageable memory
        // anyway and leaked them.)
        wbCheck(cudaHostAlloc((void**)&h_A, inputLength*sizeof(float), cudaHostAllocDefault));
        wbCheck(cudaHostAlloc((void**)&h_B, inputLength*sizeof(float), cudaHostAllocDefault));
        wbCheck(cudaHostAlloc((void**)&h_C, inputLength*sizeof(float), cudaHostAllocDefault));

        memcpy(h_A, hostInput1, inputLength*sizeof(float));
        memcpy(h_B, hostInput2, inputLength*sizeof(float));

        wbCheck(cudaMalloc((void **)&d_A0, Csize));
        wbCheck(cudaMalloc((void **)&d_A1, Csize));
        wbCheck(cudaMalloc((void **)&d_B0, Csize));
        wbCheck(cudaMalloc((void **)&d_B1, Csize));
        wbCheck(cudaMalloc((void **)&d_C0, Csize));
        wbCheck(cudaMalloc((void **)&d_C1, Csize));

        cudaStream_t streams[StreamNum];
        for (int s = 0; s < StreamNum; s++)
            wbCheck(cudaStreamCreate(&streams[s]));

        int fullChunks = inputLength/(SegSize*StreamNum);  // whole 2*SegSize chunks
        int left = inputLength%(SegSize*StreamNum);        // leftover elements
        printf("fullChunks =%d, left=%d\n", fullChunks, left);

        int i = 0;  // running element offset
        // Iterate over complete chunks only.  The original condition
        // `i < inputLength` also ran on the partial tail with full-sized
        // copies, reading and writing past the ends of the host arrays.
        for (; i + SegSize*StreamNum <= inputLength; i += SegSize*StreamNum)
        {
            cudaMemcpyAsync(d_A0, h_A+i, Csize, cudaMemcpyHostToDevice, streams[0]);
            cudaMemcpyAsync(d_B0, h_B+i, Csize, cudaMemcpyHostToDevice, streams[0]);
            cudaMemcpyAsync(d_A1, h_A+i+SegSize, Csize, cudaMemcpyHostToDevice, streams[1]);
            cudaMemcpyAsync(d_B1, h_B+i+SegSize, Csize, cudaMemcpyHostToDevice, streams[1]);

            // One thread per element, 256-thread blocks, no dynamic shared mem.
            vecAdd<<<(SegSize+255)/256, 256, 0, streams[0]>>>(d_A0, d_B0, d_C0, SegSize);
            vecAdd<<<(SegSize+255)/256, 256, 0, streams[1]>>>(d_A1, d_B1, d_C1, SegSize);
            wbCheck(cudaGetLastError());  // catch launch-configuration errors

            cudaMemcpyAsync(h_C+i, d_C0, Csize, cudaMemcpyDeviceToHost, streams[0]);
            cudaMemcpyAsync(h_C+i+SegSize, d_C1, Csize, cudaMemcpyDeviceToHost, streams[1]);
        }

        // Process the remaining elements (0 <= left < 2*SegSize).
        if (SegSize < left)
        {
            // Tail spans both streams: one full segment on stream 0 and a
            // partial one (left-SegSize elements) on stream 1.
            int rest = left - SegSize;
            cudaMemcpyAsync(d_A0, h_A+i, Csize, cudaMemcpyHostToDevice, streams[0]);
            cudaMemcpyAsync(d_B0, h_B+i, Csize, cudaMemcpyHostToDevice, streams[0]);
            cudaMemcpyAsync(d_A1, h_A+i+SegSize, rest*sizeof(float), cudaMemcpyHostToDevice, streams[1]);
            cudaMemcpyAsync(d_B1, h_B+i+SegSize, rest*sizeof(float), cudaMemcpyHostToDevice, streams[1]);

            vecAdd<<<1, SegSize, 0, streams[0]>>>(d_A0, d_B0, d_C0, SegSize);
            // Bug fix: the original launched this on d_A0/d_B0/d_C0 and
            // copied back from d_C0, clobbering stream 0's result; the
            // second tail segment lives in the stream-1 buffers.
            vecAdd<<<1, rest, 0, streams[1]>>>(d_A1, d_B1, d_C1, rest);
            wbCheck(cudaGetLastError());

            cudaMemcpyAsync(h_C+i, d_C0, Csize, cudaMemcpyDeviceToHost, streams[0]);
            cudaMemcpyAsync(h_C+i+SegSize, d_C1, rest*sizeof(float), cudaMemcpyDeviceToHost, streams[1]);
        }
        else if (left > 0)
        {
            // Tail fits in a single (partial) segment on stream 0.
            cudaMemcpyAsync(d_A0, h_A+i, left*sizeof(float), cudaMemcpyHostToDevice, streams[0]);
            cudaMemcpyAsync(d_B0, h_B+i, left*sizeof(float), cudaMemcpyHostToDevice, streams[0]);

            vecAdd<<<1, left, 0, streams[0]>>>(d_A0, d_B0, d_C0, left);
            wbCheck(cudaGetLastError());

            cudaMemcpyAsync(h_C+i, d_C0, left*sizeof(float), cudaMemcpyDeviceToHost, streams[0]);
        }

        // Drain all streams before the host reads the results.
        wbCheck(cudaDeviceSynchronize());
        memcpy(hostOutput, h_C, inputLength*sizeof(float));  // publish pinned results
        wbSolution(args, hostOutput, inputLength);

        free(hostInput1);
        free(hostInput2);
        free(hostOutput);

        for (int s = 0; s < StreamNum; s++)
            cudaStreamDestroy(streams[s]);

        // The original leaked the pinned buffers.
        cudaFreeHost(h_A);
        cudaFreeHost(h_B);
        cudaFreeHost(h_C);

        cudaFree(d_A0);
        cudaFree(d_A1);
        cudaFree(d_B0);
        cudaFree(d_B1);
        cudaFree(d_C0);
        cudaFree(d_C1);
        return 0;
    }
    View Code

    然后是使用4个流来做,code如下:

    #include    <wb.h>
    // Error-check helper for CUDA runtime calls: on failure, logs the failing
    // statement and the CUDA error string, then returns -1 from the enclosing
    // function.  The trailing backslashes are mandatory line continuations
    // (they were lost when this listing was scraped, breaking the macro).
    #define wbCheck(stmt) do {                                                    \
            cudaError_t err = stmt;                                               \
            if (err != cudaSuccess) {                                             \
                wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
                wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
                return -1;                                                        \
            }                                                                     \
        } while(0)

    #define SegSize 256
    #define StreamNum 4
    
    // vecAdd: element-wise sum of two float vectors on the device,
    // out[i] = in1[i] + in2[i].  One thread handles one element; threads
    // whose global index falls past `len` do nothing.
    __global__ void vecAdd(float * in1, float * in2, float * out, int len) {
        const int tid = blockDim.x * blockIdx.x + threadIdx.x;
        if (tid < len) {
            out[tid] = in1[tid] + in2[tid];
        }
    }
    
    // Host driver: streamed vector addition with 4 CUDA streams.
    // Full chunks of StreamNum*SegSize elements are round-robined across the
    // streams so copies and kernels overlap; the tail is handled by the same
    // per-stream loop with a (possibly partial) last segment.
    int main(int argc, char ** argv) {
        wbArg_t args;
        int inputLength;
        float * hostInput1;
        float * hostInput2;
        float * hostOutput;
        float *h_A, *h_B, *h_C;  // pinned staging buffers (async copies need page-locked memory)
        // Per-stream device buffers; arrays replace the original d_A0..d_A3
        // scalars, which led to a copy/paste bug in the tail handling
        // (stream 1's kernel was launched on stream 0's buffers).
        float *d_A[StreamNum], *d_B[StreamNum], *d_C[StreamNum];
        cudaStream_t streams[StreamNum];

        args = wbArg_read(argc, argv);
        int Csize = SegSize*sizeof(float);  // bytes per full segment

        wbTime_start(Generic, "Importing data and creating memory on host");
        hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
        hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
        hostOutput = (float *) malloc(inputLength * sizeof(float));
        printf("inputLength ==%d, SegSize =%d\n", inputLength, SegSize);
        wbTime_stop(Generic, "Importing data and creating memory on host");

        // cudaMemcpyAsync is only truly asynchronous with page-locked host
        // memory, so stage all input/output data in cudaHostAlloc'd buffers.
        wbCheck(cudaHostAlloc((void**)&h_A, inputLength*sizeof(float), cudaHostAllocDefault));
        wbCheck(cudaHostAlloc((void**)&h_B, inputLength*sizeof(float), cudaHostAllocDefault));
        wbCheck(cudaHostAlloc((void**)&h_C, inputLength*sizeof(float), cudaHostAllocDefault));

        memcpy(h_A, hostInput1, inputLength*sizeof(float));
        memcpy(h_B, hostInput2, inputLength*sizeof(float));

        for (int s = 0; s < StreamNum; s++) {
            wbCheck(cudaMalloc((void **)&d_A[s], Csize));
            wbCheck(cudaMalloc((void **)&d_B[s], Csize));
            wbCheck(cudaMalloc((void **)&d_C[s], Csize));
            wbCheck(cudaStreamCreate(&streams[s]));
        }

        printf("fullChunks =%d, left=%d\n",
               inputLength/(SegSize*StreamNum), inputLength%(SegSize*StreamNum));

        int i = 0;  // running element offset
        // Iterate over complete StreamNum*SegSize chunks only.  The original
        // condition `i < inputLength` also ran on the partial tail with
        // full-sized copies, reading and writing past the host arrays.
        for (; i + SegSize*StreamNum <= inputLength; i += SegSize*StreamNum)
        {
            for (int s = 0; s < StreamNum; s++) {
                cudaMemcpyAsync(d_A[s], h_A+i+s*SegSize, Csize, cudaMemcpyHostToDevice, streams[s]);
                cudaMemcpyAsync(d_B[s], h_B+i+s*SegSize, Csize, cudaMemcpyHostToDevice, streams[s]);
            }
            // One thread per element, 256-thread blocks, no dynamic shared mem.
            for (int s = 0; s < StreamNum; s++)
                vecAdd<<<(SegSize+255)/256, 256, 0, streams[s]>>>(d_A[s], d_B[s], d_C[s], SegSize);
            wbCheck(cudaGetLastError());  // catch launch-configuration errors
            for (int s = 0; s < StreamNum; s++)
                cudaMemcpyAsync(h_C+i+s*SegSize, d_C[s], Csize, cudaMemcpyDeviceToHost, streams[s]);
        }

        // Process the remaining elements (0 <= remaining < StreamNum*SegSize):
        // one segment per stream, the last of which may be partial.  This
        // single loop replaces the original four-branch if/else ladder.
        for (int s = 0; s < StreamNum; s++) {
            int off = i + s*SegSize;
            if (off >= inputLength)
                break;  // no data left for this and later streams
            int seg = inputLength - off;
            if (seg > SegSize)
                seg = SegSize;
            cudaMemcpyAsync(d_A[s], h_A+off, seg*sizeof(float), cudaMemcpyHostToDevice, streams[s]);
            cudaMemcpyAsync(d_B[s], h_B+off, seg*sizeof(float), cudaMemcpyHostToDevice, streams[s]);
            vecAdd<<<(seg+255)/256, 256, 0, streams[s]>>>(d_A[s], d_B[s], d_C[s], seg);
            cudaMemcpyAsync(h_C+off, d_C[s], seg*sizeof(float), cudaMemcpyDeviceToHost, streams[s]);
        }
        wbCheck(cudaGetLastError());

        // Drain all streams before the host reads the results.
        wbCheck(cudaDeviceSynchronize());
        memcpy(hostOutput, h_C, inputLength*sizeof(float));  // publish pinned results
        wbSolution(args, hostOutput, inputLength);

        free(hostInput1);
        free(hostInput2);
        free(hostOutput);

        for (int s = 0; s < StreamNum; s++) {
            cudaStreamDestroy(streams[s]);
            cudaFree(d_A[s]);
            cudaFree(d_B[s]);
            cudaFree(d_C[s]);
        }
        // The original leaked the pinned buffers.
        cudaFreeHost(h_A);
        cudaFreeHost(h_B);
        cudaFreeHost(h_C);
        return 0;
    }
    View Code

    运行成功,但是遗留一个问题,当我把拷贝内存的代码改成:

    cudaMemcpyAsync(d_A0, h_A+i, Csize, cudaMemcpyHostToDevice, streams[0]);  即使用页固定内存,结果就会错误,不明白为什么
  • 相关阅读:
    02 Filter过滤器
    Vue入门
    Git教程
    Markdown 常用语言关键字
    王爽《汇编语言》(第三版)实验16解析
    王爽《汇编语言》(第三版)实验15解析
    王爽《汇编语言》(第三版)实验14解析
    王爽《汇编语言》(第三版)实验13解析
    王爽《汇编语言》(第三版)实验12解析
    王爽《汇编语言》(第三版)实验11解析
  • 原文地址:https://www.cnblogs.com/biglucky/p/4313266.html
Copyright © 2011-2022 走看看