zoukankan      html  css  js  c++  java
  • c++和cuda混合编程 实现传统神经网络

    直接放代码了。。。

    实现的是x1+x2=y的预测,但梯度下降很慢。。。233333,gpu运行时间很快!!

    //
    //  main.cpp
    //  bp
    //
    //  Created by jzc on 2018/4/18.
    //  Copyright © 2018年 jzc. All rights reserved.
    //
    #include <stdio.h>
    #include <iostream>
    #include <time.h>
    #include <stdlib.h>
    #include <math.h>
    #include <fstream>
    #include <cuda_runtime.h>
    using namespace std;
    #define DATASIZE 10000
    #define TESTSIZE 100
    #define NEURESIZE 50
    #define RW 0.1
    #define EPOCH 1000
    #define E 2.71828
    //打印设备信息
    // Print the key properties of a CUDA device (name, memory sizes, launch
    // limits, compute capability, ...) to stdout.
    // FIX: restored the "\n" escapes that were lost (string literals were split
    // across lines) and use %zu for the size_t-typed fields instead of %ld.
    void printDeviceProp(const cudaDeviceProp &prop)
    {
        printf("Device Name : %s.\n", prop.name);
        printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
        printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
        printf("regsPerBlock : %d.\n", prop.regsPerBlock);
        printf("warpSize : %d.\n", prop.warpSize);
        printf("memPitch : %zu.\n", prop.memPitch);
        printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
        printf("maxThreadsDim[0 - 2] : %d %d %d.\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("maxGridSize[0 - 2] : %d %d %d.\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("totalConstMem : %zu.\n", prop.totalConstMem);
        printf("major.minor : %d.%d.\n", prop.major, prop.minor);
        printf("clockRate : %d.\n", prop.clockRate);
        printf("textureAlignment : %zu.\n", prop.textureAlignment);
        printf("deviceOverlap : %d.\n", prop.deviceOverlap);
        printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
    }
    
    //CUDA 初始化
    // CUDA initialization: pick the first device with compute capability >= 1.0,
    // print its properties, and make it the current device.
    // Returns false (with a message on stderr) if no usable device is found.
    // FIX: the original called cudaGetDeviceProperties twice per device and
    // printed properties from the first, unchecked call; query once and only
    // use the result when the call succeeded. Also restored the lost "\n"s.
    bool InitCUDA()
    {
        int count = 0;

        // Number of CUDA-capable devices.
        cudaGetDeviceCount(&count);

        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }

        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                printDeviceProp(prop);
                if (prop.major >= 1) {
                    break;  // found a usable device
                }
            }
        }

        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }

        cudaSetDevice(i);
        return true;
    }
    // Fill a[0..num-1] with pseudo-random values in [-offset, range - offset),
    // quantized to steps of 0.001. Uses rand(); seed with srand() beforehand.
    void init(int num, int range, double a[], double offset) {
        for (int idx = 0; idx < num; ++idx) {
            int ticks = rand() % (range * 1000);  // 0 .. range*1000 - 1
            a[idx] = ticks / 1000.0 - offset;
        }
    }
    
    // Compute m[0] = min(0, min(a)) and m[1] = max(0, max(a)) over a[0..num-1].
    // NOTE(review): both extremes start at 0, so an all-positive array yields
    // m[0] == 0 and an all-negative array yields m[1] == 0. Kept as-is because
    // normalize()/renorm() are calibrated against this behavior.
    void getM(int num, double a[], double m[]) {
        double lo = 0.0;
        double hi = 0.0;
        for (int k = 0; k < num; ++k) {
            double val = a[k];
            if (val < lo) {
                lo = val;
            } else if (val > hi) {
                hi = val;
            }
        }
        m[0] = lo;
        m[1] = hi;
    }
    
    
    // Map each a[i] into (0, 1] via (a[i] - m[0] + 1) / (m[1] - m[0] + 1),
    // where m = {min, max} as produced by getM. The +1 offsets keep the
    // denominator nonzero and the result strictly positive.
    void normalize(int num, double a[], double m[]) {
        double span = m[1] - m[0] + 1;
        for (int k = 0; k < num; ++k) {
            a[k] = (a[k] - m[0] + 1) / span;
        }
    }
    
    
    // Inverse of normalize(): map each a[i] from (0, 1] back to the original
    // range using the same m = {min, max} pair.
    void renorm(int num, double a[], double m[]) {
        double span = m[1] - m[0] + 1;
        for (int k = 0; k < num; ++k) {
            a[k] = a[k] * span + m[0] - 1;
        }
    }
    
    
    // Print a[0..num-1] to stdout as "%6.4lf ", ten values per output line.
    void printArray(int num, double a[]) {
        int k = 0;
        while (k < num) {
            printf("%6.4lf ", a[k]);
            ++k;
            if (k % 10 == 0) {
                cout << endl;
            }
        }
    }
    
    // Hidden-layer forward pass: one thread per neuron; thread i computes
    // yh[i] = sigmoid(w1[i]*x1 + w2[i]*x2).
    // Expects launch config <<<1, NEURESIZE>>> — there is no bounds guard, so
    // blockDim.x must equal the length of w1/w2/yh.
    // FIX: use exp(-z) instead of pow(E, -z) with the truncated constant
    // E = 2.71828, which introduced needless error into every sigmoid.
    __global__ static void hidenLayer(double x1, double x2, double w1[], double w2[], double yh[]) {
        const int i = threadIdx.x;
        double z = w1[i] * x1 + w2[i] * x2;
        yh[i] = 1.0 / (1.0 + exp(-z));
    }
    
    // Output layer (host side): sigmoid of the dot product of the hidden
    // activations yh and the output weights v (both of length NEURESIZE).
    // FIX: the accumulator was read uninitialized (`double y2; y2 += ...`),
    // which is undefined behavior; it now starts at 0. Also replaced
    // pow(E, -x) (truncated E constant) with exp(-x).
    double outLayer(double yh[], double v[]) {
        double acc = 0.0;
        for (int i = 0; i < NEURESIZE; i++) {
            acc += yh[i] * v[i];
        }
        return 1.0 / (1.0 + exp(-acc));
    }
    
    // Weight-update kernel: one thread per neuron; expects launch config
    // <<<1, NEURESIZE>>> (no bounds guard, so blockDim.x must match the array
    // length). `loss` points to a single device double holding the scaled
    // output error computed on the host; RW is the learning-rate macro.
    // NOTE(review): the w1/w2 updates multiply by x1[i]*(1-x1[i]) — the
    // sigmoid-derivative form applied to the *inputs*, not to the hidden
    // activations yh[i]; this does not look like a correct backprop gradient
    // and may explain the slow convergence mentioned in the post — confirm
    // before relying on it.
    __global__ static void update(double x1[],double x2[],double yh[],double v[],double w1[],double w2[],double *loss){
        const int tid = threadIdx.x;
        int i = tid;
        /*for(int i=0;i<NEURESIZE;i++){
            w1[i] += x1[i] * (1-x1[i]) * loss * RW;
            w2[i] += x2[i] * (1-x2[i]) * loss * RW;
            v[i] += yh[i] * loss * RW;
        }*/
        w1[i] += x1[i] * (1-x1[i]) * (*loss) * RW;
        w2[i] += x2[i] * (1-x2[i]) * (*loss) * RW;
        v[i] += yh[i] * (*loss) * RW;
    }
    
    /*double test(double w1[],double w2[],double v[],double m1[],double m2[],double my[]){
        double tx1[TESTSIZE],tx2[TESTSIZE],ty[TESTSIZE],tyh[NEURESIZE],ty2[TESTSIZE];
        double avLoss = 0.0;
        
        init(TESTSIZE,10,tx1,0.0);
        init(TESTSIZE,10,tx2,0.0);
        
        for(int i=0;i<TESTSIZE;i++){
            ty[i] = tx1[i] + tx2[i];
        }
        normalize(TESTSIZE,tx1,m1);
        normalize(TESTSIZE,tx2,m2);
        for(int q=0;q<TESTSIZE;q++){
            hidenLayer(tx1[q],tx2[q],w1,w2,tyh);
            ty2[q] = outLayer(tyh,v);
        }
        
        renorm(TESTSIZE,ty2,my);
        for(int i=0;i<TESTSIZE;i++){
            if(i<10){
                printf("%2d y=%2.4f y2=%2.4f
    ",i,ty[i],ty2[i]);
            }
            avLoss += pow(ty[i]-ty2[i],2);
        }
        avLoss /= TESTSIZE;
        //cout<<avLoss<<endl;
        return avLoss;
    }*/
    
    
    // Train a 2-input / NEURESIZE-hidden / 1-output network to learn y = x1 + x2
    // by stochastic gradient descent, with the per-neuron work done on the GPU.
    // Writes per-epoch training loss to trainloss.txt.
    //
    // Fixes relative to the original:
    //  * x1_g[i] / x2_g[i] dereferenced DEVICE pointers on the host when
    //    building kernel arguments — pass the host values instead (the kernel
    //    takes them by value).
    //  * cudaMemcpy(&cLoss, cLoss_g, sizeof(double)*NEURESIZE, ...) copied 50
    //    doubles into a single stack double (buffer overflow) — copy one.
    //  * the data was normalized AFTER being copied to the device, so the
    //    update kernel trained on raw values — normalize first, then copy.
    //  * y2_g was allocated but never freed.
    //  * the timing divided clock() ticks by a magic constant — use
    //    CLOCKS_PER_SEC.
    //  * restored the "\n" escapes lost from the printf format strings.
    int main(){
        ofstream outf;
        outf.open("trainloss.txt");
        srand((unsigned)time(NULL));
        long starttime = clock();
        double x1[DATASIZE], x2[DATASIZE], y[DATASIZE], y2[DATASIZE];
        double w1[NEURESIZE], w2[NEURESIZE], v[NEURESIZE], yh[NEURESIZE];
        double m1[2], m2[2], my[2];  // {min, max} of x1, x2, y (see getM)
        double cLoss, minTrainLoss = 1.0;

        init(DATASIZE, 10, x1, 0.0);   // inputs in [0, 10)
        init(DATASIZE, 10, x2, 0.0);
        init(NEURESIZE, 2, w1, 1.0);   // weights in [-1, 1)
        init(NEURESIZE, 2, w2, 1.0);
        init(NEURESIZE, 2, v, 1.0);

        // Ground truth: y = x1 + x2.
        for (int i = 0; i < DATASIZE; i++) {
            y[i] = x1[i] + x2[i];
        }

        // CUDA initialization.
        if (!InitCUDA()) {
            return 0;
        }

        // Normalize BEFORE copying to the device so host and device see the
        // same (normalized) training data.
        getM(DATASIZE, x1, m1);
        getM(DATASIZE, x2, m2);
        getM(DATASIZE, y, my);
        normalize(DATASIZE, x1, m1);
        normalize(DATASIZE, x2, m2);
        normalize(DATASIZE, y, my);

        // Device buffers.
        double *x1_g, *x2_g, *y_g, *y2_g;
        double *w1_g, *w2_g, *v_g, *yh_g;
        double *cLoss_g;
        cudaMalloc((void**)&x1_g, sizeof(double) * DATASIZE);
        cudaMalloc((void**)&x2_g, sizeof(double) * DATASIZE);
        cudaMalloc((void**)&y_g, sizeof(double) * DATASIZE);
        cudaMalloc((void**)&y2_g, sizeof(double) * DATASIZE);
        cudaMalloc((void**)&w1_g, sizeof(double) * NEURESIZE);
        cudaMalloc((void**)&w2_g, sizeof(double) * NEURESIZE);
        cudaMalloc((void**)&v_g, sizeof(double) * NEURESIZE);
        cudaMalloc((void**)&yh_g, sizeof(double) * NEURESIZE);
        cudaMalloc((void**)&cLoss_g, sizeof(double));

        // Host -> device copies of the (normalized) data and initial weights.
        // yh_g is written by the kernel before it is read, so it needs no copy.
        cudaMemcpy(w1_g, w1, sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(w2_g, w2, sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(v_g, v, sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(x1_g, x1, sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(x2_g, x2, sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(y_g, y, sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);

        for (int j = 0; j < EPOCH; j++) {
            double tLoss = 0.0;
            for (int i = 0; i < DATASIZE; i++) {
                // Kernel takes the sample's inputs by value — use host values.
                hidenLayer<<<1, NEURESIZE, 0>>>(x1[i], x2[i], w1_g, w2_g, yh_g);
                cudaMemcpy(yh, yh_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
                cudaMemcpy(v, v_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
                y2[i] = outLayer(yh, v);

                // Output delta for the sigmoid output unit.
                cLoss = y2[i] * (1 - y2[i]) * (y[i] - y2[i]);
                cudaMemcpy(cLoss_g, &cLoss, sizeof(double), cudaMemcpyHostToDevice);
                update<<<1, NEURESIZE, 0>>>(x1_g, x2_g, yh_g, v_g, w1_g, w2_g, cLoss_g);

                // One double lives at cLoss_g — copy exactly one back.
                cudaMemcpy(&cLoss, cLoss_g, sizeof(double), cudaMemcpyDeviceToHost);
                cLoss = pow(cLoss, 2);
                cLoss = cLoss * (my[1] - my[0] + 1);  // rescale to original units
                tLoss += cLoss;
            }
            tLoss /= DATASIZE;
            if (tLoss < minTrainLoss) {
                minTrainLoss = tLoss;
            }
            printf("EPOCH--%d, trainLoss--%0.4f\n", j, tLoss);
            outf << j << "\t" << tLoss << endl;
            cout << "------------------" << endl;
        }
        printf("minTrainLoss--%0.4f\n", minTrainLoss);
        outf.close();

        // Free device memory.
        cudaFree(x1_g);
        cudaFree(x2_g);
        cudaFree(y_g);
        cudaFree(y2_g);
        cudaFree(w1_g);
        cudaFree(w2_g);
        cudaFree(v_g);
        cudaFree(yh_g);
        cudaFree(cLoss_g);

        long endtime = clock() - starttime;
        float execution_time = (float)endtime / CLOCKS_PER_SEC;
        cout << "total time cost: " << execution_time << endl;

        return 0;
    }
  • 相关阅读:
    168. Excel Sheet Column Title
    171. Excel Sheet Column Number
    264. Ugly Number II java solutions
    152. Maximum Product Subarray java solutions
    309. Best Time to Buy and Sell Stock with Cooldown java solutions
    120. Triangle java solutions
    300. Longest Increasing Subsequence java solutions
    63. Unique Paths II java solutions
    221. Maximal Square java solutions
    279. Perfect Squares java solutions
  • 原文地址:https://www.cnblogs.com/jzcbest1016/p/9006650.html
Copyright © 2011-2022 走看看