zoukankan      html  css  js  c++  java
  • C++中如何使用CUDA进行高性能大规模矩阵乘法运算?| cublasSgemm for large matrix multiplication on gpu in C++

    本文首发于个人博客https://kezunlin.me/post/ad5c5bd9/,欢迎阅读最新内容!

    cublasSgemm for large matrix multiplication on gpu in C++

    Guide

    code

    demo.cu

    #include <cstdio>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cublas.h>
    #include <cublas_api.h>
    #include <cublas_v2.h>
    
    /*
     * Computes every pairwise dot product between two sets of feature vectors
     * on the GPU with a single cublasSgemm call.
     *
     * featureM : host buffer, count_m vectors of `size` floats stored back to back
     *            (row-major count_m x size).
     * featureN : host buffer, count_n vectors of `size` floats stored back to back
     *            (row-major count_n x size).
     * result   : host buffer receiving count_m * count_n floats, row-major:
     *            result[i * count_n + j] = dot(featureM[i], featureN[j]).
     * gpu_id   : device ordinal passed to cudaSetDevice (this changes the
     *            calling thread's current device).
     *
     * Returns true on success. Returns false if an argument is invalid or any
     * CUDA / cuBLAS call fails; in that case `result` must be treated as
     * unspecified. All device resources acquired here are released before
     * returning, on both the success and the failure path.
     */
    bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result, 
    	int count_m, int count_n, int size, int gpu_id) {
    	if (count_m < 0 || count_n < 0 || size < 0) {
    		printf("%s, line %d, negative dimension!\n", __func__, __LINE__);
    		return false;
    	}
    	// An empty result matrix needs no GPU work at all.
    	if (count_m == 0 || count_n == 0) {
    		return true;
    	}
    	if (!featureM || !featureN || !result) {
    		printf("%s, line %d, null host pointer!\n", __func__, __LINE__);
    		return false;
    	}
    
    	// Element/byte counts are computed in size_t: the int product
    	// count_m * size overflows for large matrices.
    	const size_t elems_result = static_cast<size_t>(count_m) * count_n;
    	const size_t bytes_m = static_cast<size_t>(count_m) * size * sizeof(float);
    	const size_t bytes_n = static_cast<size_t>(count_n) * size * sizeof(float);
    	const size_t bytes_result = elems_result * sizeof(float);
    
    	// Zero-length vectors: every dot product is 0. cuBLAS would reject the
    	// call anyway because the leading dimension must be at least 1.
    	if (size == 0) {
    		for (size_t i = 0; i < elems_result; ++i) {
    			result[i] = 0.0f;
    		}
    		return true;
    	}
    
    	float *dev_featureM = 0;
    	float *dev_featureN = 0;
    	float *dev_result = 0;
    	const float alpha = 1.0f, beta = 0.0f;
    	cublasHandle_t handle = 0;
    	bool handle_created = false;  // cublasDestroy must only see a created handle
    	bool ok = false;
    	cudaError_t cudaStatus;
    	cublasStatus_t blasStatus;
    
    	// Single-pass block: `break` jumps to the shared cleanup below.
    	do {
    		cudaStatus = cudaSetDevice(gpu_id);
    		if (cudaStatus != cudaSuccess) {
    			printf("cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
    			break;
    		}
    
    		blasStatus = cublasCreate(&handle);
    		if (blasStatus != CUBLAS_STATUS_SUCCESS) {
    			printf("%s, line %d, cublasCreate failed! (status %d)\n",
    				__func__, __LINE__, (int)blasStatus);
    			break;
    		}
    		handle_created = true;
    
    		cudaStatus = cudaMalloc((void**)&dev_featureM, bytes_m);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMalloc failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    		cudaStatus = cudaMalloc((void**)&dev_featureN, bytes_n);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMalloc failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    		cudaStatus = cudaMalloc((void**)&dev_result, bytes_result);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMalloc failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    
    		cudaStatus = cudaMemcpy(dev_featureM, featureM, bytes_m, cudaMemcpyHostToDevice);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMemcpy failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    		cudaStatus = cudaMemcpy(dev_featureN, featureN, bytes_n, cudaMemcpyHostToDevice);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMemcpy failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    
    		/*
    		 * cuBLAS works on column-major matrices:
    		 *   C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n)
    		 *
    		 * A row-major (rows x size) buffer, read as column-major with leading
    		 * dimension `size`, is the (size x rows) transpose of that matrix.
    		 * So, in column-major terms:
    		 *   dev_featureN is N^T (size x count_n)  -> CUBLAS_OP_T gives N
    		 *   dev_featureM is M^T (size x count_m)  -> CUBLAS_OP_N keeps M^T
    		 *   C = N * M^T is (count_n x count_m) column-major with ldc = count_n,
    		 * which occupies memory exactly like the row-major (count_m x count_n)
    		 * matrix M * N^T that the caller expects.
    		 */
    		blasStatus = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size, 
    			&alpha, dev_featureN, size, dev_featureM, size, &beta, dev_result, count_n);
    		if (blasStatus != CUBLAS_STATUS_SUCCESS) {
    			printf("%s, line %d, cublasSgemm failed! (status %d)\n",
    				__func__, __LINE__, (int)blasStatus);
    			break;
    		}
    
    		// cudaThreadSynchronize is deprecated; cudaDeviceSynchronize also
    		// surfaces asynchronous execution errors from the GEMM.
    		cudaStatus = cudaDeviceSynchronize();
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaDeviceSynchronize failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    
    		cudaStatus = cudaMemcpy(result, dev_result, bytes_result, cudaMemcpyDeviceToHost);
    		if (cudaStatus != cudaSuccess) {
    			printf("%s, line %d, cudaMemcpy failed! (%s)\n",
    				__func__, __LINE__, cudaGetErrorString(cudaStatus));
    			break;
    		}
    
    		ok = true;
    	} while (false);
    
    	// Best-effort cleanup; failures here do not change the reported outcome.
    	if (dev_featureM) cudaFree(dev_featureM);
    	if (dev_featureN) cudaFree(dev_featureN);
    	if (dev_result) cudaFree(dev_result);
    	if (handle_created) cublasDestroy(handle);
    	return ok;
    }
    

    usage

    demo.cu

    /*
     * Smoke test for CompareFeatureMtoN_gpu.
     *
     * A holds 3 identical feature vectors [0..9]; B holds 2 selector vectors
     * (first five ones, last five ones). Each A row dotted with the B rows
     * gives 0+1+2+3+4 = 10 and 5+6+7+8+9 = 35, so the expected 3 x 2 result is
     *
     *   [10, 35]
     *   [10, 35]
     *   [10, 35]
     */
    void test_feature_compare()
    {
    	std::vector<float> f1{0,1,2,3,4,5,6,7,8,9};
    	std::vector<float> f2{1,1,1,1,1,0,0,0,0,0},f22{0,0,0,0,0,1,1,1,1,1};
    
    	std::vector<std::vector<float>> A,B;
    	// A: 3 vectors of 10 floats
    	A.push_back(f1);
    	A.push_back(f1);
    	A.push_back(f1);
    
    	// B: 2 vectors of 10 floats (the callee multiplies by B transposed)
    	B.push_back(f2);
    	B.push_back(f22);
    
    	const int m = static_cast<int>(A.size());
    	const int n = static_cast<int>(B.size());
    	const int dim = static_cast<int>(f1.size());
    	int gpu_id = 0;
    
    	// Flatten each set of vectors into one contiguous row-major buffer.
    	std::vector<float> feature_m;
    	feature_m.reserve(static_cast<size_t>(m) * dim);
    	for (const std::vector<float>& row : A)
    		feature_m.insert(feature_m.end(), row.begin(), row.end());
    
    	std::vector<float> feature_n;
    	feature_n.reserve(static_cast<size_t>(n) * dim);
    	for (const std::vector<float>& row : B)
    		feature_n.insert(feature_n.end(), row.begin(), row.end());
    
    	printf("m = %d, n= %d, size= %d \n", m, n, dim); // 3, 2, 10
    
    	std::vector<float> result(static_cast<size_t>(m) * n, 0.0f);
    	if (!CompareFeatureMtoN_gpu(feature_m.data(), feature_n.data(), result.data(),
    			m, n, dim, gpu_id)) {
    		// Do not print the buffer: its contents are unspecified on failure.
    		printf("CompareFeatureMtoN_gpu failed\n");
    		return;
    	}
    
    	// Print all m*n entries instead of a hard-coded count.
    	for (float value : result)
    		printf("%f ", value);
    	printf("\n");
    }
    

    output

    m = 3, n= 2, size= 10 
    10.000000 35.000000 10.000000 35.000000 10.000000 35.000000
    

    Reference

    History

    • 20191015: created.

    Copyright

  • 相关阅读:
    tomcat7的catalina.sh配置说明
    nginx防攻击的简单配置
    linux系统自签发免费ssl证书,为nginx生成自签名ssl证书
    mysql ERROR 1045 (28000): Access denied for user 'root'@'localhost'
    /var/log/secure 文件清空
    Linux日志文件
    记一次网站被挂马处理
    Uedit32对文本进行回车换行
    安装mysql血泪史。
    mysql-8.0.19安装教程(Windows)
  • 原文地址:https://www.cnblogs.com/kezunlin/p/12102414.html
Copyright © 2011-2022 走看看