zoukankan      html  css  js  c++  java
  • CUDA大作业_进行图像特征匹配V2.0

    在V1.0的基础上改变了排序方式并对部分并行代码进行了优化

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <string>
    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    #include <fstream>
    #include <sstream>
    #include <string>
    #include <math.h>
    #include<vector>
    #include<algorithm>
    #include<time.h>

    using namespace std;


    // Loads a whitespace-separated text matrix from fileName into `matrix`
    // (definition further down in this file).
    template<typename T>
    void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix);
    // CPU reference: mean of the three smallest distances between the query
    // vector and the rows of `mat` (definition further down in this file).
    float CalDist(float*mat, int row, int col);

    // Helpers returning the column / row count of a data file.
    int getFileColumns(const char *fileName);
    int getFileRows(const char *fileName);

    // Per-dataset stream slots and device ordinals; CRunGPU indexes both with
    // its `flag` argument (0, 1 or 2) and passes gpu[flag] to cudaSetDevice.
    cudaStream_t stream[3];
    int gpu[3] = { 3,4,5 };

    // A value paired with the position it came from.
    // `i` is stored as a float but holds a small non-negative integer index;
    // users convert it back with int(...).
    // Declared as a plain struct: the former `typedef struct ... { };` had no
    // declarator, so the `typedef` keyword was ignored and only produced a
    // compiler warning.
    struct data_ind{
        float data;   // the value being compared
        float i;      // index of the slot the value originated from
    };

    // Sentinel used as "no value" in the minimum searches; larger than any
    // squared distance the kernels are expected to produce.
    #define inf 1000000

    // Evaluates a CUDA runtime call once; on failure prints the error string
    // with the source location to stderr and terminates the process.
    // Every line of the definition ends in a line continuation, and the body is
    // wrapped in do { } while (0) so that `CUDA_CHECK_RETURN(call);` behaves as
    // a single statement (safe inside unbraced if/else).
    #define CUDA_CHECK_RETURN(value) do {                                   \
        cudaError_t _m_cudaStat = (value);                                  \
        if (_m_cudaStat != cudaSuccess) {                                   \
            fprintf(stderr, "Error %s at line %d in file %s ",              \
                    cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);   \
            exit(1);                                                        \
        } } while (0)

    // Queries (and clears) the CUDA runtime's last error. If one is pending,
    // reports it on stderr prefixed with `msg` and exits with status -1;
    // otherwise returns silently.
    void checkCUDAError(const char *msg)
    {
        const cudaError_t lastError = cudaGetLastError();
        if (lastError == cudaSuccess)
        {
            return;
        }

        fprintf(stderr, "Cuda error: %s: %s. ", msg, cudaGetErrorString(lastError));
        exit(-1);
    }


    // Dataset dimensions. N, N1 and N2 are passed to CRunGPU as column counts in
    // main(); M, M1 and M2 are not referenced by the code visible in this file.
    const int M = 181, N = 14, M1 = 168, N1 = 14, M2 = 168, N2 = 14;
    // Query feature vector in device constant memory, read by CalDis_Sort:
    // element 0 gates candidate rows (|test[0] - row[0]| <= 10), elements 1..12
    // are the terms of the squared distance.
    __constant__ float test[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };

    // Host copy of the same query vector, read by the CPU reference CalDist.
    const float test1[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };


    // Per-block selection of the three smallest values of a slice of g_idata.
    //
    // Layout: block b owns the 2*blockSize consecutive inputs starting at
    //   b * 2 * blockSize; thread t loads elements t and t + blockSize of that
    //   slice. Positions at or beyond N are treated as `inf`.
    // Output: g_odata[3*b + 0..2] receive the block's three smallest values in
    //   ascending order (`inf` where fewer than three inputs exist).
    // Preconditions: blockDim.x == blockSize, blockSize is a power of two and
    //   blockSize <= 512. Uses 8 KB of static shared memory; no dynamic shared
    //   memory is required.
    //
    // Fixes over the previous hand-unrolled version:
    //   * both loaded values are kept; the earlier pairwise pre-minimum threw
    //     away the larger of each pair, so two of the three smallest could
    //     never both be reported when they shared a thread;
    //   * the winner's original slot is propagated through every reduction
    //     step (the 128/64 steps used to overwrite it with the local position,
    //     so the wrong slot was masked before the next pass);
    //   * a barrier follows every masking write and every reduction step; no
    //     implicit warp-synchronous execution is assumed;
    //   * the unreachable `blockSize >= 1024` step, which indexed past the
    //     512-element arrays, is gone.
    __global__ void reduceComp (float *g_idata, float *g_odata,int N, int blockSize)
    {
        enum { kMaxThreads = 512 };

        __shared__ float vals[2 * kMaxThreads];   // the block's working copy of its slice
        __shared__ float best[kMaxThreads];       // running minimum per thread
        __shared__ int   bestAt[kMaxThreads];     // slot in vals[] the minimum came from

        const int tid   = threadIdx.x;
        const int first = blockIdx.x * (blockSize * 2) + tid;
        const int other = first + blockSize;

        vals[tid]             = (first < N) ? g_idata[first] : (float)inf;
        vals[tid + blockSize] = (other < N) ? g_idata[other] : (float)inf;

        // One round per reported value: find the arg-min, emit it, mask it.
        for (int round = 0; round < 3; ++round)
        {
            // Each thread seeds the search with the smaller of its own two
            // slots (ties keep the lower slot, as before).
            const float a = vals[tid];
            const float c = vals[tid + blockSize];
            if (a <= c) { best[tid] = a; bestAt[tid] = tid; }
            else        { best[tid] = c; bestAt[tid] = tid + blockSize; }
            __syncthreads();

            // Tree reduction; the loop bounds are uniform across the block, so
            // every thread reaches every barrier.
            for (int stride = blockSize >> 1; stride > 0; stride >>= 1)
            {
                if (tid < stride && best[tid + stride] < best[tid])
                {
                    best[tid]   = best[tid + stride];
                    bestAt[tid] = bestAt[tid + stride];
                }
                __syncthreads();
            }

            if (tid == 0)
            {
                g_odata[3 * blockIdx.x + round] = best[0];
                vals[bestAt[0]] = inf;   // exclude the winner from later rounds
            }
            // Make the masking write visible before the next round re-reads vals[].
            __syncthreads();
        }
    }

    // Rank-based selection of the three smallest elements of a[0..N-1].
    //
    // Each thread with a global index below N computes the rank of its element
    // (number of elements that are smaller, ties broken by lower index) and, if
    // the rank is 0, 1 or 2, stores the element in first_thr[rank]. Ranks are
    // unique, so every output slot has at most one writer.
    //
    // Fix: the previous version had every thread of every block reset
    // first_thr[0..2] to `inf` before ranking, which raced with (and could
    // overwrite) results already stored by other threads. Now only the slots
    // that no rank can reach (when N < 3) are set to `inf`, by a single thread.
    __global__ void global_sort(float * a/*first three*blockNum1 element find by CalDis_Sort */, float*first_thr/*first three minimum */, int N)//improved rank sort
    {
        const int index = blockDim.x*blockIdx.x + threadIdx.x;

        if (index == 0)
        {
            // Slots N..2 are never written by the ranking below.
            for (int k = (N < 0 ? 0 : N); k < 3; ++k)
            {
                first_thr[k] = inf;
            }
        }

        if (index >= N)
        {
            return;
        }

        const float temp = a[index];
        int rank = 0;
        for (int j = 0; j < N; j++)
        {
            if (temp > a[j] || (temp == a[j] && index > j))
            {
                ++rank;
            }
        }
        if (rank < 3)
        {
            first_thr[rank] = temp;
        }
    }


    // Threads per block for CalDis_Sort; host code also pads row counts to a
    // multiple of this value.
    const int thdsPerblock_cal_dis=512;


    // Computes, for every row of the column-major matrix b, the squared
    // Euclidean distance to the query vector `test` (columns 1..12), and reports
    // the three smallest distances found by each block.
    //
    // b        column-major matrix; element (row r, column c) is b[c*R + r]
    // tosort   output; block k writes its three smallest squared distances, in
    //          ascending order, to tosort[3*k + 0..2] (`inf` when fewer exist)
    // R        number of rows (column stride of b)
    // C        number of columns; not used by the kernel
    // blockSize  must equal blockDim.x, be a power of two and be at most
    //          thdsPerblock_cal_dis
    //
    // A row only takes part when |test[0] - b[row]| <= 10; all other rows, and
    // threads with index >= R, contribute `inf`.
    // Uses 6 KB of static shared memory; no dynamic shared memory is required.
    //
    // Fixes over the previous hand-unrolled version: the winner's slot is now
    // propagated through every reduction step (the 128/64 steps used to replace
    // it with the local position, so the wrong element was masked), a barrier
    // follows each masking write, and no implicit warp-synchronous execution is
    // assumed. fabsf replaces abs so the float overload is always used.
    __global__ void CalDis_Sort( float *b/*matrix to cal Euclidean dis*/, float *tosort, int R/*Matrix row number*/, int C/*colum number*/,int blockSize)
    {
        __shared__ float vals[thdsPerblock_cal_dis];     // one distance per thread
        __shared__ float best[thdsPerblock_cal_dis];     // running minimum
        __shared__ int   bestAt[thdsPerblock_cal_dis];   // slot in vals[] the minimum came from

        const int tid   = threadIdx.x;
        const int index = tid + blockIdx.x*blockDim.x;

        /********************** distance ****************************/
        float dist = inf;
        if (index < R && fabsf(test[0] - b[index]) <= 10.0f)
        {
            dist = 0.0f;
            for (int i = 1; i < 13; i++)
            {
                const float diff = test[i] - b[i*R + index];
                dist += diff*diff;
            }
        }
        vals[tid] = dist;

        /********************** three smallest ****************************/
        for (int round = 0; round < 3; ++round)
        {
            // Each thread only reads its own slot here; masking writes from the
            // previous round are ordered by the barrier that closes the round.
            best[tid]   = vals[tid];
            bestAt[tid] = tid;
            __syncthreads();

            // Tree reduction (ties keep the lower slot); loop bounds are
            // uniform, so every thread reaches every barrier.
            for (int stride = blockSize >> 1; stride > 0; stride >>= 1)
            {
                if (tid < stride && best[tid + stride] < best[tid])
                {
                    best[tid]   = best[tid + stride];
                    bestAt[tid] = bestAt[tid + stride];
                }
                __syncthreads();
            }

            if (tid == 0)
            {
                tosort[3*blockIdx.x + round] = best[0];
                vals[bestAt[0]] = inf;   // exclude the winner from later rounds
            }
            __syncthreads();
        }
    }


    // Owns everything needed to evaluate one dataset on one GPU: the host copy
    // of the matrix, its device copy, two device scratch buffers and a stream.
    //
    // Usage: construct, call compute() any number of times, synchronise
    // temp_stream (getResult() does so itself), then call getResult().
    //
    // Changes relative to the previous version:
    //   * the stream created in the constructor is destroyed in the destructor
    //     (the destructor used to overwrite the handle with an unset global and
    //     never released it);
    //   * compute() reduces with a grid that matches reduceComp's
    //     2*blockSize-elements-per-block layout, so outputs always fit the
    //     buffers allocated here, the redundant duplicate first pass is gone,
    //     and blocksPerGrid2 is no longer mutated between calls;
    //   * the buffer holding the final three values is tracked in dev_result,
    //     so getResult() reads the right buffer for any number of passes;
    //   * cudaSetDevice / cudaStreamCreate / cudaMalloc and the host
    //     allocations are checked, and `flag` is validated;
    //   * kernels are launched without dynamic shared memory because both use
    //     static shared arrays only.
    class CRunGPU{
    public:
        cudaEvent_t start_gpu, end_gpu;   // not used by the code in this class
        float consume_gpu;                // not used by the code in this class
        int row;                          // R rounded up to a multiple of thdsPerblock_cal_dis
        float *BTR_train;                 // host matrix, row*C floats

        float *dev_first_thr;             // device scratch, 3*blocksPerGrid2 floats
        float *dev_BTR_train;             // device copy of BTR_train
        float *dev_tosort;                // device scratch, 3*blocksPerGrid floats
        float *tosort_host;               // 3 host floats; allocated and freed only
        float *sort_block;                // host buffer receiving the final three values
        int temp_gpu;                     // device ordinal, gpu[flag]
        cudaStream_t temp_stream;         // stream owned by this object


        int threadsPerBlock ;             // launch configuration of CalDis_Sort
        int blocksPerGrid ;

        int threadsPerBlock2;             // launch configuration of the first reduceComp pass
        int blocksPerGrid2 ;

        int flag,flag2;                   // flag: dataset slot (0..2); flag2: 1 when the result is in dev_first_thr
        int R;
        int C;
        const char*fileName;
        float *dev_result;                // whichever scratch buffer holds the final three values

        // Loads `fileName` (R rows, C columns), selects device gpu[flag], creates
        // the stream and uploads the matrix. Exits the process on any failure.
        CRunGPU(const char*fileName, int R/*row*/, int C/*column*/,int flag)//construc function
        {
            this->R=R;
            this->C=C;
            this->fileName=fileName;
            this->flag=flag;
            flag2 = 1;
            consume_gpu = 0;

            if (flag < 0 || flag > 2)
            {
                fprintf(stderr, "CRunGPU: flag must be 0, 1 or 2 (got %d) ", flag);
                exit(1);
            }
            temp_gpu = gpu[flag];

            row = thdsPerblock_cal_dis * int((R + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
            BTR_train = (float*)malloc(row*C*sizeof(float));

            threadsPerBlock = thdsPerblock_cal_dis;
            blocksPerGrid = (R + threadsPerBlock - 1) / threadsPerBlock;

            threadsPerBlock2 = 512;
            // Each reduceComp block consumes 2*threadsPerBlock2 inputs.
            blocksPerGrid2 = ((blocksPerGrid * 3+1)/2 + threadsPerBlock2 - 1) / threadsPerBlock2;

            cout<<"blocksPerGrid: "<<blocksPerGrid<<endl;
            cout<<"blocksPerGrid2: "<<blocksPerGrid2<<endl;

            cout << "BTR_train" << row << "C" << C << endl;

            tosort_host = (float *)malloc(3 * sizeof(float));
            // getResult() always copies three floats, so never allocate fewer.
            const int resultSlots = 3 * (blocksPerGrid2 > 0 ? blocksPerGrid2 : 1);
            sort_block = (float *)malloc(resultSlots * sizeof(float));
            if (!BTR_train || !tosort_host || !sort_block)
            {
                fprintf(stderr, "CRunGPU: host allocation failed for %s ", fileName);
                exit(1);
            }

            readMatrixfromTXT<float>(fileName, C, R, BTR_train);

            cout << "malloc and copy " << endl;
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));
            CUDA_CHECK_RETURN(cudaStreamCreate(&temp_stream));

            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_first_thr, 3*blocksPerGrid2* sizeof(float)));
            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_BTR_train, C*row*sizeof(float)));
            CUDA_CHECK_RETURN(cudaMemcpyAsync(dev_BTR_train, BTR_train, C*row*sizeof(float), cudaMemcpyHostToDevice, temp_stream));
            cout << "start calculate" << endl;
            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_tosort, blocksPerGrid * 3* sizeof(float)));
            dev_result = dev_first_thr;

            cout<<"malloc sort_block successful"<<endl;
        }

        // Enqueues the distance kernel followed by as many reduceComp passes as
        // are needed to shrink the per-block candidates to three values. All
        // work is asynchronous on temp_stream; nothing is synchronised here.
        void compute()
        {
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));

            CalDis_Sort<<<blocksPerGrid, threadsPerBlock, 0, temp_stream>>>( dev_BTR_train, dev_tosort, row, C,thdsPerblock_cal_dis);
            checkCUDAError("CalDis_Sort");

            // Ping-pong between the two scratch buffers. Pass k writes 3*grid
            // values, which never exceeds the size of the buffer it writes to:
            // the first pass writes 3*blocksPerGrid2 values into dev_first_thr,
            // and every later pass writes no more than the pass before it.
            float *src = dev_tosort;
            float *dst = dev_first_thr;
            int num = blocksPerGrid * 3;
            const int perBlock = 2 * threadsPerBlock2;
            do
            {
                const int grid = (num + perBlock - 1) / perBlock;
                reduceComp<<<grid, threadsPerBlock2, 0, temp_stream>>>(src, dst, num, threadsPerBlock2);
                checkCUDAError("reduceComp");

                dev_result = dst;
                num = grid * 3;

                float *swapTmp = src;
                src = dst;
                dst = swapTmp;
            } while (num > 3);

            flag2 = (dev_result == dev_first_thr) ? 1 : 0;
        }

        // Copies the three smallest squared distances back to the host, waits for
        // the stream, and returns the mean of their square roots.
        float getResult()
        {
            cout<<"get result"<<endl;
            CUDA_CHECK_RETURN(cudaMemcpyAsync(sort_block, dev_result, 3* sizeof(float), cudaMemcpyDeviceToHost, temp_stream));
            // Guarantees the copy (and every kernel queued before it) finished
            // before sort_block is read.
            CUDA_CHECK_RETURN(cudaStreamSynchronize(temp_stream));
            cout << fileName << "distance is" << endl;
            float result=(sqrt(sort_block[0])+sqrt(sort_block[1])+sqrt(sort_block[2]))/3;
            printf(":%f ",result);
            return result;
        }

        // Partial bubble sort: three passes that move the three smallest elements
        // of pData[0..count-1] to the front in ascending order.
        void BubbleSortFirst3(float* pData/*array*/, int count/*the dimension of array*/)
        {
            for (int i = 0; i < 3; i++)
            {
                for (int j = count - 1; j > i; --j)
                {
                    if (pData[j] < pData[j - 1])
                    {
                        const float temp = pData[j - 1];
                        pData[j - 1] = pData[j];
                        pData[j] = temp;
                    }
                }
            }
        }

        // Releases the stream, the device buffers and the host buffers.
        virtual ~CRunGPU()
        {
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));
            // cudaStreamDestroy lets already-queued work finish before the
            // stream's resources are released.
            CUDA_CHECK_RETURN(cudaStreamDestroy(temp_stream));
            CUDA_CHECK_RETURN(cudaFree(dev_BTR_train));
            CUDA_CHECK_RETURN(cudaFree(dev_tosort));
            CUDA_CHECK_RETURN(cudaFree(dev_first_thr));
            free(BTR_train);
            free(tosort_host);
            free(sort_block);
        }
    };


    //void runCPU(const char*fileName){}

    // Size multiplier for the CPU reference path.
    #define MULNUM 1
    // Number of timed repetitions of the GPU pipeline in main().
    #define GPUCYCLE 1000
    // Number of timed repetitions of the CPU reference in main().
    #define CPUCYCLE 10

    // Paths of the three datasets, relative to the working directory.
    const char *fileNameBTR = "data/BTR.txt";const char *fileNameBMP = "data/BMP.txt";const char *fileNameT = "data/T.txt";

    // Benchmark driver: runs the GPU pipeline GPUCYCLE times on the three
    // datasets, then the CPU reference CPUCYCLE times, prints the distances, the
    // closest dataset according to the CPU result, and the timing ratio.
    //
    // Changes relative to the previous version: host allocation failures abort
    // instead of passing a null matrix to CalDist, stream synchronisation errors
    // are checked, the never-read `string type` locals are gone, and row counts
    // read for the GPU pass are reused for the CPU pass instead of re-scanning
    // the files.
    int main()
    {
        /* ------------------------------ GPU first ------------------------------ */
        printf("***********************START OF GPU*********************** ");
        int row_M = getFileRows(fileNameBTR);
        int row_M1 = getFileRows(fileNameBMP);
        int row_M2 = getFileRows(fileNameT);
        cout << row_M << endl;
        cout << "creating array of stream and gpu success" << endl;

        CRunGPU btr(fileNameBTR, row_M, N,0) ;
        CRunGPU bmp(fileNameBMP, row_M1, N1,1) ;
        CRunGPU t(fileNameT, row_M2, N2,2) ;

        const clock_t gpuParelledTime_b=clock();
        for(int i=0;i<GPUCYCLE;i++)
        {
            // Queue all three pipelines before waiting so they can overlap.
            btr.compute();
            bmp.compute();
            t.compute();
            CUDA_CHECK_RETURN(cudaStreamSynchronize(btr.temp_stream));
            CUDA_CHECK_RETURN(cudaStreamSynchronize(bmp.temp_stream));
            CUDA_CHECK_RETURN(cudaStreamSynchronize(t.temp_stream));
        }
        const clock_t gpuParelledTime_e=clock();
        double consume_gpu=(double)( gpuParelledTime_e - gpuParelledTime_b ) / CLOCKS_PER_SEC * 1000 /GPUCYCLE;
        printf("gpu_time=%f ", consume_gpu);
        btr.getResult();
        bmp.getResult();
        t.getResult();

        printf("***********************END OF GPU************************* ");
        printf(" ");
        printf(" ");

        /* ------------------------------ CPU second ------------------------------ */
        printf("***********************START OF CPU*********************** ");
        int col;
        float distBMP = 0, distBTR = 0, distT = 0;
        float*dataMatrix1, *dataMatrix2, *dataMatrix3;

        // BTR
        col = getFileColumns(fileNameBTR);
        const int r_btr = thdsPerblock_cal_dis * int((row_M + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
        dataMatrix1 = (float*)malloc(col*r_btr*sizeof(float));
        if (!dataMatrix1)
        {
            fprintf(stderr, "main: host allocation failed for %s ", fileNameBTR);
            return 1;
        }
        readMatrixfromTXT<float>(fileNameBTR, col, row_M, dataMatrix1);
        cout << "col" << col << endl;

        // BMP
        col = getFileColumns(fileNameBMP);
        const int r_bmp = thdsPerblock_cal_dis * int((row_M1 + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
        dataMatrix2 = (float*)malloc(col*r_bmp*sizeof(float));
        if (!dataMatrix2)
        {
            fprintf(stderr, "main: host allocation failed for %s ", fileNameBMP);
            return 1;
        }
        readMatrixfromTXT<float>(fileNameBMP, col, row_M1, dataMatrix2);

        // T
        col = getFileColumns(fileNameT);
        const int r_t = thdsPerblock_cal_dis * int((row_M2 + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
        dataMatrix3 = (float*)malloc(col*r_t*sizeof(float));
        if (!dataMatrix3)
        {
            fprintf(stderr, "main: host allocation failed for %s ", fileNameT);
            return 1;
        }
        readMatrixfromTXT<float>(fileNameT, col, row_M2, dataMatrix3);

        const clock_t start_cpu = clock();
        for (int cnt = 0; cnt<CPUCYCLE; ++cnt){
            // The padded row count is passed because it is the column stride of
            // the matrices filled by readMatrixfromTXT.
            distBTR = CalDist(dataMatrix1, r_btr, col);
            distBMP = CalDist(dataMatrix2, r_bmp, col);
            distT = CalDist(dataMatrix3, r_t, col);
        }
        const clock_t end_cpu = clock();
        const clock_t consume_cpu = (end_cpu - start_cpu);

        printf("The distance of BTR is %f ", distBTR);
        printf("The distance of BMP is %f ", distBMP);
        printf("The distance of T is %f ", distT);
        free(dataMatrix1);
        free(dataMatrix2);
        free(dataMatrix3);

        // The dataset with the smallest mean distance wins; ties fall through to
        // the later branch exactly as before.
        if (distBMP < distBTR)
        {
            if (distBMP < distT)
            {
                printf("The type of picture is BMP ");
            }
            else
            {
                printf("The type of picture is T ");
            }
        }
        else
        {
            if (distBTR < distT)
            {
                printf("The type of picture is BTR ");
            }
            else
            {
                printf("The type of picture is T ");
            }
        }
        printf("cpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / CPUCYCLE);
        printf("***********************END OF CPU************************* ");
        printf("cpu_time/gpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / consume_gpu / CPUCYCLE);

        return 0;
    }


    // Partial bubble sort. Performs three bottom-up passes over
    // pData[0..count-1]; pass p carries the smallest remaining element down to
    // index p-1, so afterwards the first three entries are the three smallest
    // values in ascending order. Arrays with fewer than two elements are left
    // untouched.
    void BubbleSort(float* pData/*array*/, int count/*the dimension of array*/)
    {
        for (int pass = 1; pass <= 3; ++pass)
        {
            int pos = count - 1;
            while (pos >= pass)
            {
                float &upper = pData[pos];
                float &lower = pData[pos - 1];
                if (upper < lower)
                {
                    const float held = lower;
                    lower = upper;
                    upper = held;
                }
                --pos;
            }
        }
    }

    // Reads a whitespace-separated text matrix into `matrix` in column-major
    // order, padded to a multiple of thdsPerblock_cal_dis rows.
    //
    // fileName   text file, one matrix row per line
    // numColumn  number of columns stored per row
    // numRow     number of rows to read
    // matrix     destination with room for numColumn * numRow2 elements, where
    //            numRow2 is numRow rounded up to a multiple of
    //            thdsPerblock_cal_dis; element (r, c) is stored at
    //            matrix[c*numRow2 + r]
    //
    // Rows numRow..numRow2-1 are padding: they repeat the last data row, and
    // their column 0 is then set to 1000 so that the distance code, which only
    // accepts rows whose column 0 lies within 10 of the query's, skips them.
    // Exits the process if the file cannot be opened.
    //
    // Fixes: at most numColumn values are stored per line (longer lines used to
    // write past the end of the buffer), values missing from a short line are
    // stored as 0 instead of being left uninitialised, numRow <= 0 no longer
    // writes to matrix[-1], and the unreadable error message is replaced.
    template<typename T>
    void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix)
    {
        ifstream fin(fileName);
        if (!fin)
        {
            cerr << "cannot open file: " << fileName << endl;
            exit(1);
        }
        if (numRow <= 0 || numColumn <= 0)
        {
            return;   // nothing to load
        }

        const int numRow2 = thdsPerblock_cal_dis * int((numRow + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);

        string line;
        float tmp;
        for (int i = 0; i < numRow; i++){
            if (!getline(fin, line))
            {
                line.clear();   // file shorter than numRow: treat as an empty row
            }
            istringstream istr(line);
            int j = 0;
            while (j < numColumn && istr >> tmp){
                matrix[j*numRow2 + i]=tmp;
                ++j;
            }
            for (; j < numColumn; ++j){
                matrix[j*numRow2 + i]=0;
            }
        }
        fin.close();

        // Padding rows repeat the last data row ...
        const int lastRow = numRow - 1;
        for (int i = numRow; i < numRow2; i++){
            for (int j = 0; j < numColumn; ++j){
                matrix[j*numRow2 + i]=matrix[j*numRow2 + lastRow];
            }
        }
        // ... and are then disqualified through column 0.
        for(int i=numRow;i<numRow2;i++)
            matrix[i]=1000;
    }

    // Column count of a dataset file. The file is not inspected: every dataset
    // handled by this program is assumed to have the same fixed width.
    int getFileColumns(const char *fileName){
        const int kFixedColumnCount = 14;
        (void)fileName;   // intentionally unused
        return kFixedColumnCount;
    }

    // Returns the number of data rows (non-blank lines) in a text file, or 0 if
    // the file cannot be opened.
    //
    // Fix: lines are now split on the newline character. The previous code used
    // a space as the getline delimiter and therefore counted space-separated
    // tokens instead of rows. Blank lines (for example a trailing empty line)
    // are not counted, so the result matches the number of rows that actually
    // carry values.
    int getFileRows(const char *fileName){
        ifstream fileStream(fileName, ios::in);
        if (!fileStream){
            return 0;
        }

        int count = 0;
        string line;
        while (getline(fileStream, line)){
            if (line.find_first_not_of(" \t\r") != string::npos){
                count++;
            }
        }
        return count;
    }

    // CPU reference: mean of the three smallest Euclidean distances between the
    // query vector test1 and the rows of a column-major matrix.
    //
    // mat  column-major matrix; element (r, c) is mat[c*row + r]
    // row  number of rows (also the column stride)
    // col  number of columns; not used, columns 1..12 are always compared
    //
    // Only rows whose column 0 lies within 10 of test1[0] are considered. When
    // fewer than three rows qualify, the missing distances are taken as
    // sqrt(inf), i.e. the square root of the sentinel the GPU path uses for
    // empty slots.
    //
    // Fixes: the scratch buffer is sized from `row` (the former fixed 181-float
    // buffer overflowed for larger inputs), slots beyond the number of matches
    // are initialised (they used to be read uninitialised), and fabsf/sqrtf are
    // used so the float overloads are selected regardless of how abs resolves.
    float CalDist(float*mat, int row, int col){
        (void)col;

        const float missing = sqrtf((float)inf);
        const int capacity = (row > 3 ? row : 3);
        vector<float> dists(capacity, missing);

        int count=0;
        for (int i = 0; i<row; i++){
            if (fabsf(test1[0] - mat[i]) <= 10){
                float sum = 0;
                for (int k = 1; k<13; k++){
                    const float diff = test1[k] - mat[i + k*row];
                    sum += diff*diff;
                }
                dists[count++]=sqrtf(sum);
            }
        }

        // Moves the three smallest of the `count` collected distances to the front.
        BubbleSort(&dists[0], count);
        return (dists[0] + dists[1] + dists[2])/3;
    }

  • 相关阅读:
    使用 libevent 和 libev 提高网络应用性能
    An existing connection was forcibly closed by the remote host
    各种浏览器的兼容css
    vs输出窗口,显示build的时间
    sass
    网站设置404错误页
    List of content management systems
    css footer not displaying at the bottom of the page
    强制刷新css
    sp_executesql invalid object name
  • 原文地址:https://www.cnblogs.com/Erdos001/p/4544775.html
Copyright © 2011-2022 走看看