zoukankan      html  css  js  c++  java
  • C++ vs Python向量运算速度评测

    本文的起源来自最近一个让我非常不爽的事。

    我最近在改一个开源RNN工具包currennt(http://sourceforge.net/projects/currennt/),想用它实现RNNLM功能。

    currennt使用了大量的面向对象的编程技巧,可以使用GPU,向量运算使用了thrust库(https://code.google.com/p/thrust/)。

    RNNLM(http://rnnlm.org/)也有相应开源实现,非常算法风格的代码,向量运算就是自己使用数组实现的。

    结果……大出我的意料,在不使用GPU的情况下,currennt慢成狗!我不断地修改,直到最后几乎完全在currennt里重写了一个RNNLM……速度才终于一致了。这花费了我大量时间,最关键的是我根本没打算花这些时间,算是计划外开销。

    所以这里干脆对常用的几种向量运算做个评测,下回遇到至少心里有数。


    参与评测的向量实现包括:

    1. C++ array
    2. C++ STL vector
    3. C++ thrust(CPU)
    4. C++ thrust(GPU)
    5. python
    6. python numpy

    评测指标包括:

    • 创建、填充向量
    • 向量点乘,相乘
    • 矩阵相乘

    测试环境:

    Intel Xeon CPU E5649@2.53GHz x24

    VS2010

    python 2.7.6 (32bit)

    thrust v1.5

    numpy 1.8.1


    C++ array

    创建全0向量:0.000s,几乎不占用时间

    int vector_size=100000000;
    float* vector=(float*)calloc(vector_size,sizeof(float));

    创建+填充向量:0.140s

    int vector_size=100000000;
    float* vector=(float*)calloc(vector_size,sizeof(float));
    for (int i=0;i<vector_size;++i){
    	vector[i]=0.01;
    }

    向量点乘:0.390s

    float sum=0;
    for(int i=0;i<vector_size;++i){
    	sum+=vector1[i]*vector2[i];
    }

    向量相乘:0.265s

    // Element-wise product: vector3[i] = vector1[i] * vector2[i].
    // No accumulator is needed here, so the unused `sum` local has been dropped.
    for(int i=0;i<vector_size;++i){
    	vector3[i]=vector1[i]*vector2[i];
    }
    

    矩阵乘向量:0.344s

    int matrix1_colnum=50000;
    int matrix1_rownum=2000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    float* vector1=(float*)calloc(matrix1_size,sizeof(float));
    for (int i=0;i<matrix1_size;++i){
    	vector1[i]=0.01;
    }
    
    float* vector2=(float*)calloc(matrix1_colnum,sizeof(float));
    for (int i=0;i<matrix1_colnum;++i){
    	vector2[i]=0.02;
    }
    
    start_t=clock();
    float* vector3=(float*)calloc(matrix1_rownum,sizeof(float));
    for(int row=0;row<matrix1_rownum;++row){
    	for(int col=0;col<matrix1_colnum;++col){
    		vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
    	}
    }
    end_t=clock();
    

    矩阵乘矩阵:0.749s

    (耗费时间与matrix1_rownum*matrix1_colnum*matrix2_colnum成正比)

    int matrix1_rownum=200;
    int matrix1_colnum=5000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    float* vector1=(float*)calloc(matrix1_size,sizeof(float));
    for (int i=0;i<matrix1_size;++i){
    	vector1[i]=0.01;
    }
    
    int matrix2_rownum=5000;
    int matrix2_colnum=200;
    int matrix2_size=matrix2_rownum*matrix2_colnum;
    float* vector2=(float*)calloc(matrix2_size,sizeof(float));
    for (int i=0;i<matrix2_size;++i){
    	vector2[i]=0.02;
    }
    
    int matrix3_size=matrix1_rownum*matrix2_colnum;
    float* vector3=(float*)calloc(matrix3_size,sizeof(float));
    start_t=clock();
    for(int row1=0;row1<matrix1_rownum;++row1){
    	for(int col2=0;col2<matrix2_colnum;++col2){
    		for(int col1=0;col1<matrix1_colnum;++col1){
    			vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
    		}
    	}
    }
    end_t=clock();
    

    C++ STL vector

    创建全0向量:0.140s

    int vect_size=100000000;
    vector<float> vector(vect_size);

    创建+填充向量:0.140s

    int vect_size=100000000;
    vector<float> vector(vect_size,0.01);
    

    向量点乘:0.375s

    int vect_size=100000000;
    vector<float> vector1(vect_size,0.01);
    vector<float> vector2(vect_size,0.02);
    start_t=clock();
    float sum=0;
    for(int i=0;i<vect_size;++i){
    	sum+=vector1[i]*vector2[i];
    }
    end_t=clock();
    

    向量相乘:0.250s

    int vect_size=100000000;
    vector<float> vector1(vect_size,0.01);
    vector<float> vector2(vect_size,0.02);
    vector<float> vector3(vect_size);
    start_t=clock();
    for(int i=0;i<vect_size;++i){
    	vector3[i]=vector1[i]*vector2[i];
    }
    end_t=clock();
    

    矩阵乘向量:0.390s

    int matrix1_colnum=50000;
    int matrix1_rownum=2000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    vector<float> vector1(matrix1_size,0.01);
    vector<float> vector2(matrix1_colnum,0.02);
    vector<float> vector3(matrix1_rownum);
    start_t=clock();
    for(int row=0;row<matrix1_rownum;++row){
    	for(int col=0;col<matrix1_colnum;++col){
    		vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];
    	}
    }
    end_t=clock();
    

    矩阵乘法:0.827s

    int matrix1_rownum=200;
    int matrix1_colnum=5000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    vector<float> vector1(matrix1_size,0.01);
    
    int matrix2_rownum=5000;
    int matrix2_colnum=200;
    int matrix2_size=matrix2_rownum*matrix2_colnum;
    vector<float> vector2(matrix2_size,0.02);
    
    int matrix3_size=matrix1_rownum*matrix2_colnum;
    vector<float> vector3(matrix3_size);
    start_t=clock();
    for(int row1=0;row1<matrix1_rownum;++row1){
    	for(int col2=0;col2<matrix2_colnum;++col2){
    		for(int col1=0;col1<matrix1_colnum;++col1){
    			vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];
    		}
    	}
    }
    end_t=clock();
    

    C++ thrust(CPU)

    创建全0向量:0.140s

    int vect_size=100000000;
    thrust::host_vector<float> vector1(vect_size);
    

    创建+填充向量:0.140s

    int vect_size=100000000;
    thrust::host_vector<float> vector1(vect_size,0.01);
    

    填充向量:0.078s

    thrust::fill(vector1.begin(),vector1.end(),0.01);
    

    向量点乘:0.359s

    int vect_size=100000000;
    thrust::host_vector<float> vector1(vect_size,(float)0.1);
    thrust::host_vector<float> vector2(vect_size,(float)0.2);
    // Scratch buffer for the element-wise products; its initial contents are overwritten.
    thrust::host_vector<float> vector3(vect_size,(float)0.2);
    
    start_t=clock();
    // Step 1: vector3[i] = vector1[i] * vector2[i]
    thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
    // Step 2: sum the products. The reduction operator must be plus: reducing with
    // multiplies starting from 0 always yields 0 instead of the dot product.
    float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
    end_t=clock();
    

    向量相乘:0.187s

    int vect_size=100000000;
    thrust::host_vector<float> vector1(vect_size,(float)0.1);
    thrust::host_vector<float> vector2(vect_size,(float)0.2);
    thrust::host_vector<float> vector3(vect_size);
    start_t=clock();
    thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
    end_t=clock();
    

    矩阵乘向量:0.110s

    struct matrixXvect_func
    {
    	thrust::host_vector<float>* matrix;
    	thrust::host_vector<float>* vector;
    	int matrix_rownum;
    	int matrix_colnum;
    
    	__host__ __device__
    	float operator()(const int& idx) const{
    		float t=0;
    		for(int col=0;col<matrix_colnum;++col){
    			t+=(*matrix)[idx*matrix_colnum+col]* (*vector)[col];
    		}
    		return t;
    	}
    };
    
    int matrix1_rownum=2000;
    int matrix1_colnum=50000; int matrix1_size=matrix1_colnum*matrix1_rownum; thrust::host_vector<float> vector1(matrix1_size,(float)0.1); thrust::host_vector<float> vector2(matrix1_colnum,(float)0.2); thrust::host_vector<float> vector3(matrix1_rownum); start_t=clock(); matrixXvect_func fn; fn.matrix=&vector1; fn.vector=&vector2; fn.matrix_rownum=matrix1_rownum; fn.matrix_colnum=matrix1_colnum; thrust::transform( thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(0) + matrix1_rownum, vector3.begin(), fn ); end_t=clock();

    矩阵乘矩阵:0.655s

    // Functor computing one entry of a matrix-matrix product. It is called with a
    // flat index into the result matrix and returns the dot product of the
    // corresponding row of matrix1 with the corresponding column of matrix2.
    // Both inputs are flat arrays indexed as row*colnum+col (row-major).
    struct matrixXmatrix_func
    {
    	thrust::host_vector<float>* matrix1;
    	thrust::host_vector<float>* matrix2;
    	// matrix1_rownum and matrix2_rownum are stored but not read by operator().
    	int matrix1_rownum;
    	int matrix1_colnum;
    	int matrix2_rownum;
    	int matrix2_colnum;
    
    	__host__ __device__
    	float operator()(const int& idx) const{
    		// Decode the flat result index: the result has matrix2_colnum columns.
    		int rownum=idx/matrix2_colnum;
    		int colnum=idx%matrix2_colnum;
    		float t=0;
    		// Walk along row `rownum` of matrix1 and down column `colnum` of matrix2.
    		for(int col=0;col<matrix1_colnum;++col){
    			t+=(*matrix1)[rownum*matrix1_colnum+col]* (*matrix2)[col*matrix2_colnum+colnum];
    		}
    		return t;
    	}
    };
    
    int matrix1_rownum=200;
    int matrix1_colnum=5000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
    
    int matrix2_rownum=5000;
    int matrix2_colnum=200;
    int matrix2_size=matrix2_rownum*matrix2_colnum;
    thrust::host_vector<float> vector2(matrix2_size,(float)0.2);
    
    int matrix3_size=matrix1_rownum*matrix2_colnum;
    thrust::host_vector<float> vector3(matrix3_size);
    
    start_t=clock();
    
    matrixXmatrix_func fn;
    fn.matrix1=&vector1;
    fn.matrix2=&vector2;
    fn.matrix1_rownum=matrix1_rownum;
    fn.matrix1_colnum=matrix1_colnum;
    fn.matrix2_rownum=matrix2_rownum;
    fn.matrix2_colnum=matrix2_colnum;
    
    thrust::transform(
                thrust::counting_iterator<int>(0),
                thrust::counting_iterator<int>(0) + matrix3_size,
                vector3.begin(),
                fn
                );
    
    end_t=clock();
    

    C++ thrust(GPU)

    创建全0向量:0.140s

    int vect_size=1000000;
    thrust::device_vector<float> vector1(vect_size);
    

    创建+填充向量:0.140s

    int vect_size=1000000;
    thrust::device_vector<float> vector1(vect_size,0.1);
    

    CPU向量赋值:0.141s

    int vect_size=1000000;
    thrust::host_vector<float> vector1(vect_size,0.1);
    start_t=clock();
    thrust::device_vector<float> vector2=vector1;
    end_t=clock();
    

    填充向量:0.000s

    int vect_size=1000000;
    thrust::device_vector<float> vector(vect_size);
    start_t=clock();
    thrust::fill(vector.begin(),vector.end(),(float)0.1);
    end_t=clock();
    

    向量点乘:0.016s

    int vect_size=100000000;
    thrust::device_vector<float> vector1(vect_size,(float)0.1);
    thrust::device_vector<float> vector2(vect_size,(float)0.2);
    // Scratch buffer for the element-wise products; its initial contents are overwritten.
    thrust::device_vector<float> vector3(vect_size,(float)0.2);
    
    start_t=clock();
    // Step 1: vector3[i] = vector1[i] * vector2[i]
    thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
    // Step 2: sum the products. The reduction operator must be plus: reducing with
    // multiplies starting from 0 always yields 0 instead of the dot product.
    float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
    end_t=clock();
    

    向量相乘:0.000s

    int vect_size=100000000;
    thrust::device_vector<float> vector1(vect_size,(float)0.1);
    thrust::device_vector<float> vector2(vect_size,(float)0.2);
    thrust::device_vector<float> vector3(vect_size);
    start_t=clock();
    thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
    end_t=clock();
    

    矩阵乘向量(实现1):0.530s

    int matrix1_rownum=2000;
    int matrix1_colnum=50000;
    int matrix1_size=matrix1_colnum*matrix1_rownum;
    
    thrust::device_vector<float> vector1(matrix1_size,(float)0.1);
    thrust::device_vector<float> vector2(matrix1_colnum,(float)0.2);
    // Scratch buffer holding the element-wise products of one matrix row with vector2.
    thrust::device_vector<float> tmp(matrix1_colnum);
    thrust::device_vector<float> vector3(matrix1_rownum);
    
    start_t=clock();
    for(int row=0;row<matrix1_rownum;++row){
    	// tmp[col] = vector1[row][col] * vector2[col]
    	thrust::transform(vector1.begin()+row*matrix1_colnum,vector1.begin()+(row+1)*matrix1_colnum,vector2.begin(),tmp.begin(),thrust::multiplies<float>());
    	// Sum the products with plus; reducing with multiplies from 0 would always give 0.
    	vector3[row]=thrust::reduce(tmp.begin(),tmp.end(),(float)0,thrust::plus<float>());
    }
    end_t=clock();
    

    矩阵乘向量(实现2)CUBLAS,待试

    矩阵乘矩阵CUBLAS,待试

    Python

    直接使用python的list实现上述功能实在太慢……而且由于无法指定float类型,其默认使用64位double类型来表示小数,使用10^8会超出list索引上限……故只使用10^7实验,速度差距可以自行换算。

    大致估算python的向量运算比c++慢50倍,矩阵运算慢1000倍。

    初始化向量并赋值:1.51s

    vector_size=10000000
    vector=[]
    for i in range(vector_size):
    	vector.append(0.1)
    

    向量点乘:1.75s

    vector_size=10000000
    vector1=[]
    for i in range(vector_size):
    	vector1.append(0.1)
    vector2=[]
    for i in range(vector_size):
    	vector2.append(0.1)
    # Only the accumulation loop is timed; building the lists is setup.
    start_t=time.time()
    sum=0
    for i in range(vector_size):
    	sum+=vector1[i]*vector2[i]
    end_t=time.time()

    向量相乘:2.39s

    vector_size=10000000
    vector1=[]
    for i in range(vector_size):
    	vector1.append(0.1)
    vector2=[]
    for i in range(vector_size):
    	vector2.append(0.1)
    vector3=[]
    for i in range(vector_size):
    	vector3.append(0.1)
    start_t=time.time()
    for i in range(vector_size):
    	vector3[i]=vector1[i]*vector2[i]
    end_t=time.time()
    

    矩阵乘向量:3.06s

    matrix1_rownum=2000
    matrix1_colnum=5000
    matrix1_size=matrix1_rownum*matrix1_colnum
    # Matrix stored as a flat list, indexed as row*matrix1_colnum+col.
    vector1=[]
    for i in range(matrix1_size):
    	vector1.append(0.1)
    vector2=[]
    for i in range(matrix1_colnum):
    	vector2.append(0.1)
    # Result accumulators must start at zero, otherwise every entry carries an offset.
    vector3=[]
    for i in range(matrix1_rownum):
    	vector3.append(0.0)
    start_t=time.time()
    for row in range(matrix1_rownum):
    	for col in range(matrix1_colnum):
    		# Accumulate with +=; plain assignment would keep only the last column's product.
    		vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col]
    end_t=time.time()
    

    矩阵相乘:11.37s

    matrix1_rownum=200
    matrix1_colnum=500
    matrix1_size=matrix1_rownum*matrix1_colnum
    # Both matrices are flat lists, indexed as row*colnum+col.
    vector1=[]
    for i in range(matrix1_size):
    	vector1.append(0.1)
    matrix2_rownum=500
    matrix2_colnum=200
    matrix2_size=matrix2_rownum*matrix2_colnum
    vector2=[]
    for i in range(matrix2_size):
    	vector2.append(0.1)
    matrix3_size=matrix1_rownum*matrix2_colnum
    # Result accumulators must start at zero; starting at 0.1 would shift every entry by 0.1.
    vector3=[]
    for i in range(matrix3_size):
    	vector3.append(0.0)
    start_t=time.time()
    for row in range(matrix1_rownum):
    	for col in range(matrix2_colnum):
    		for i in range(matrix1_colnum):
    			vector3[row*matrix2_colnum+col]+=vector1[row*matrix1_colnum+i]*vector2[i*matrix2_colnum+col]
    end_t=time.time()
    

    当然实际进行向量运算没人会拿python的list数据结构进行运算,这里只是好奇定量测一下list到底有多慢……

    Python numpy

    创建全0向量:0.0s

    vector_size=100000000
    vector=numpy.zeros(vector_size)
    

    创建+填充向量:0.25s

    vector_size=100000000
    vector=numpy.zeros(vector_size)
    vector.fill(0.01)
    

    向量点乘:0.125s(由于python是32位……内存原因,数据规模减半)

    vector_size=50000000
    vector1=numpy.zeros(vector_size)
    vector1.fill(0.01)
    vector2=numpy.zeros(vector_size)
    vector2.fill(0.02)
    start_t=time.time()
    sum=numpy.inner(vector1,vector2)
    end_t=time.time()
    

    向量相乘:0.234s

    vector_size=50000000
    vector1=numpy.zeros(vector_size)
    vector1.fill(0.01)
    vector2=numpy.zeros(vector_size)
    vector2.fill(0.02)
    start_t=time.time()
    vector3=numpy.multiply(vector1,vector2)
    end_t=time.time()
    

    矩阵乘向量:0.094s

    matrix1_rownum=2000
    matrix1_colnum=50000
    matrix1_size=matrix1_rownum*matrix1_colnum
    vector1=numpy.zeros(matrix1_size)
    vector1.fill(0.01)
    
    vector2=numpy.zeros(matrix1_colnum)
    vector2.fill(0.02)
    
    start_t=time.time()
    vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
    vector2=vector2.reshape(matrix1_colnum,1)
    vector3=numpy.dot(vector1,vector2)
    end_t=time.time()
    

    矩阵乘矩阵:23.16s(numpy.dot出乎意料的慢,使用numpy.matrix类时间为11.73s,依旧很慢而且占用更大内存,在创建matrix对象时也要0.4s。注意:此处矩阵规模为2000×50000乘50000×1000,运算量约为前文C++测试(200×5000乘5000×200)的500倍,耗时不能直接对比)

    matrix1_rownum=2000
    matrix1_colnum=50000
    matrix1_size=matrix1_rownum*matrix1_colnum
    vector1=numpy.zeros(matrix1_size)
    vector1.fill(0.01)
    matrix2_rownum=50000
    matrix2_colnum=1000
    matrix2_size=matrix2_rownum*matrix2_colnum
    vector2=numpy.zeros(matrix2_size)
    vector2.fill(0.02)
    start_t=time.time()
    vector1=vector1.reshape(matrix1_rownum,matrix1_colnum)
    vector2=vector2.reshape(matrix2_rownum,matrix2_colnum)
    vector3=numpy.dot(vector1,vector2)
    end_t=time.time()
    
  • 相关阅读:
    ActiveMQ消息队列技术融合Spring
    ActiveMQ消息队列技术Demo
    网页静态化技术Freemaker
    Solr的基本语法
    Solr的页面展示以及高亮显示
    Solr的了解与配置
    Angular中上传图片到分布式文件服务器FastDFS上
    分布式文件服务器FastDFS的使用
    自我学习笔记01
    数组转换成List集合
  • 原文地址:https://www.cnblogs.com/plwang1990/p/4147379.html
Copyright © 2011-2022 走看看