How to do high-performance large-scale matrix multiplication with CUDA in C++? | cublasSgemm for large matrix multiplication on gpu in C++

    This article was first published on my personal blog at https://kezunlin.me/post/ad5c5bd9/; the latest version is maintained there.


    Guide
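
    This post shows how to compute all pairwise dot products between two sets of feature vectors (count_m and count_n vectors of dimension size) as one large matrix multiplication on the GPU with cublasSgemm, including the row-major/column-major bookkeeping that cuBLAS requires.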

    code

    demo.cu

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result, 
    	int count_m, int count_n, int size, int gpu_id) {
    	float *dev_featureM = 0;
    	float *dev_featureN = 0;
    	float *dev_result = 0;
    	const float alpha = 1, beta = 0;
    	cublasHandle_t handle = nullptr;
    	cudaError_t cudaStatus;
    
    	cudaStatus = cudaSetDevice(gpu_id);
    	if (cudaStatus != cudaSuccess) {
    		printf("cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?
    ");
    		goto out;
    	}
    	cublasCreate(&handle);
    
    	cudaStatus = cudaMalloc((void**)&dev_featureM, count_m * size * sizeof(float));
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMalloc failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    	cudaStatus = cudaMalloc((void**)&dev_featureN, count_n * size * sizeof(float));
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMalloc failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    	cudaStatus = cudaMalloc((void**)&dev_result, count_m * count_n * sizeof(float));
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMalloc failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    
    	cudaStatus = cudaMemcpy(dev_featureM, featureM, count_m * size * sizeof(float), 
    		cudaMemcpyHostToDevice);
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMalloc failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    	cudaStatus = cudaMemcpy(dev_featureN, featureN, count_n * size * sizeof(float), 
    		cudaMemcpyHostToDevice);
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMalloc failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    
    	/*
    	cuBLAS assumes that matrices on the device are stored in column-major order.
    	From the cublasSgemm documentation:

    	"where α and β are scalars, and A, B and C are matrices stored in column-major
    	format with dimensions op(A) m × k, op(B) k × n and C m × n, respectively.
    	Also, for matrix A, op(A) = A if transa == CUBLAS_OP_N and op(A) = A^T if
    	transa == CUBLAS_OP_T."

    	// Multiply the arrays A and B on the GPU and save the result in C (column-major)
    	// C(m,n) = A(m,k) * B(k,n)
    	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);

    	Our feature buffers are row-major, so the call below treats them as their
    	column-major transposes and computes
    	result(count_m x count_n) = featureM(count_m x size) * featureN(count_n x size)^T,
    	i.e. result[i][j] is the dot product of row i of featureM with row j of featureN.
    	*/
    
    	cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size, 
    		&alpha, dev_featureN, size, dev_featureM, size, &beta, dev_result, count_n);
    	cudaStatus = cudaDeviceSynchronize();
    
    	cudaStatus = cudaMemcpy(result, dev_result, count_m * count_n  * sizeof(float), 
    		cudaMemcpyDeviceToHost);
    	if (cudaStatus != cudaSuccess) {
    		printf("%s, line %d, cudaMemcpy failed!
    ", __func__, __LINE__);
    		goto out;
    	}
    
    out:
    	if(dev_featureM) cudaFree(dev_featureM);
    	if(dev_featureN) cudaFree(dev_featureN);
    	if(dev_result) cudaFree(dev_result);
    	if (handle) cublasDestroy(handle);
    	return cudaStatus == cudaSuccess;
    }
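
    For small inputs it is easy to sanity-check the GPU result against a plain CPU loop. Below is a minimal sketch of such a reference; the helper name CompareFeatureMtoN_cpu is illustrative and not part of the original code. It produces the same row-major count_m x count_n layout as the cublasSgemm call above, so the two result buffers can be compared element by element.

    // CPU reference: result[i*count_n + j] = dot(featureM row i, featureN row j)
    void CompareFeatureMtoN_cpu(const float* featureM, const float* featureN, float* result,
    	int count_m, int count_n, int size) {
    	for (int i = 0; i < count_m; i++) {
    		for (int j = 0; j < count_n; j++) {
    			float sum = 0.f;
    			for (int k = 0; k < size; k++)
    				sum += featureM[i * size + k] * featureN[j * size + k];
    			result[i * count_n + j] = sum;
    		}
    	}
    }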
    

    usage

    demo.cu

    void test_feature_compare()
    {
    	/*
    	A (3 x 10, rows a1, a2, a3) compared against B (2 x 10, rows b1, b2):

    	[a1]                 [a1.b1  a1.b2]   [10  35]
    	[a2] * [b1, b2]^T  = [a2.b1  a2.b2] = [10  35]
    	[a3]                 [a3.b1  a3.b2]   [10  35]
    	*/
    	std::vector<float> f1{0,1,2,3,4,5,6,7,8,9};
    	std::vector<float> f2{1,1,1,1,1,0,0,0,0,0},f22{0,0,0,0,0,1,1,1,1,1};
    
    	std::vector<std::vector<float>> A,B;
    	// 3*10
    	A.push_back(f1);
    	A.push_back(f1);
    	A.push_back(f1);
    
    	// 2*10 in memory (rows b1 and b2); acts as the 10*2 matrix B in A*B
    	B.push_back(f2);
    	B.push_back(f22);
    
    
    	int m = 3;
    	int n = 2; 
    	int dim = 10;
    	int gpu_id = 0;
    
    	float* feature_m = new float[ m*dim ];
    	float* feature_n = new float[ n*dim ];
    	auto tmp = feature_m;
    	for (int i = 0; i < m; i++) {
    		for (int j = 0; j < dim; j++)
    			*tmp++ = A[i][j];
    	}
    
    	tmp = feature_n;
    	for (int i = 0; i < n; i++) {
    		for (int j = 0; j < dim; j++)
    			*tmp++ = B[i][j];
    	}
    
    	printf("m = %d, n= %d, size= %d 
    ", m, n, dim); // 3, 2, 10
    
    	//float* result = CompareFeatureMtoN(feature_m, m*dim, feature_n, n*dim, dim, gpu_id);
    
    	float* result = new float[m*n];
    	CompareFeatureMtoN_gpu(feature_m, feature_n, result, m, n, dim, gpu_id);
    
    	tmp = result;
    	for (int i = 0; i < m * n; i++)
    		printf("%f ", *(tmp++));
    
    	delete []feature_m;
    	delete []feature_n;
    	delete []result;
    }
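
    Assuming the two listings above live in a single demo.cu together with a main() that calls test_feature_compare() (that main() is not shown in the original post), a build along these lines should work; exact paths and architecture flags depend on your CUDA installation:

    nvcc -o demo demo.cu -lcublas
    ./demo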
    

    output

    m = 3, n= 2, size= 10 
    10.000000 35.000000 10.000000 35.000000 10.000000 35.000000
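
    Each value is a dot product of one row of A with one row of B: f1 · f2 = 0+1+2+3+4 = 10 and f1 · f22 = 5+6+7+8+9 = 35, so every row of the 3 x 2 result is [10, 35], as printed above.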
    

    History

    • 20191015: created.
