  • Mixed C++ and CUDA programming: implementing a traditional neural network

    The full code is below.

    The network learns to predict y = x1 + x2. Gradient descent converges slowly, but the GPU execution time is fast!
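    For orientation before the listing: the model is one hidden layer of NEURESIZE sigmoid units feeding a single sigmoid output. Here is a minimal host-side sketch of the forward pass; it is my own summary of the hidenLayer and outLayer routines in the listing, not code from the original post.

    #include <cmath>
    #define NEURESIZE 50   // hidden-layer width, matching the listing

    // Forward pass for one sample (x1, x2); mirrors hidenLayer + outLayer.
    double forward(double x1, double x2,
                   const double w1[], const double w2[], const double v[]) {
        double y2 = 0.0;
        for (int i = 0; i < NEURESIZE; i++) {
            double h = 1.0 / (1.0 + exp(-(w1[i]*x1 + w2[i]*x2)));   // sigmoid hidden unit
            y2 += h * v[i];                                         // weighted sum into the output unit
        }
        return 1.0 / (1.0 + exp(-y2));                              // sigmoid output
    }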

    //
    //  main.cpp
    //  bp
    //
    //  Created by jzc on 2018/4/18.
    //  Copyright © 2018 jzc. All rights reserved.
    //
    #include <stdio.h>
    #include <iostream>
    #include <time.h>
    #include <stdlib.h>
    #include <math.h>
    #include <fstream>
    #include <cuda_runtime.h>
    using namespace std;
    #define DATASIZE 10000   // number of training samples
    #define TESTSIZE 100     // number of test samples
    #define NEURESIZE 50     // hidden-layer width
    #define RW 0.1           // learning rate
    #define EPOCH 1000       // training epochs
    #define E 2.71828        // base of the natural logarithm, for the sigmoid
    // Print the device properties
    void printDeviceProp(const cudaDeviceProp &prop)
    {
        printf("Device Name : %s.
    ", prop.name);
        printf("totalGlobalMem : %ld.
    ", prop.totalGlobalMem);
        printf("sharedMemPerBlock : %ld.
    ", prop.sharedMemPerBlock);
        printf("regsPerBlock : %d.
    ", prop.regsPerBlock);
        printf("warpSize : %d.
    ", prop.warpSize);
        printf("memPitch : %ld.
    ", prop.memPitch);
        printf("maxThreadsPerBlock : %d.
    ", prop.maxThreadsPerBlock);
        printf("maxThreadsDim[0 - 2] : %d %d %d.
    ", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("maxGridSize[0 - 2] : %d %d %d.
    ", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("totalConstMem : %ld.
    ", prop.totalConstMem);
        printf("major.minor : %d.%d.
    ", prop.major, prop.minor);
        printf("clockRate : %d.
    ", prop.clockRate);
        printf("textureAlignment : %ld.
    ", prop.textureAlignment);
        printf("deviceOverlap : %d.
    ", prop.deviceOverlap);
        printf("multiProcessorCount : %d.
    ", prop.multiProcessorCount);
    }
    
    // CUDA initialization
    bool InitCUDA()
    {
        int count;
        
        // Get the number of CUDA-capable devices
        cudaGetDeviceCount(&count);
        
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        
        int i;
        
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                // Print the device properties
                printDeviceProp(prop);
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        
        cudaSetDevice(i);
        
        return true;
    }
    // Fill a[] with num random values in [-offset, range-offset), at 0.001 granularity.
    void init(int num,int range,double a[],double offset){
        for(int i=0;i<num;i++){
            a[i] = (double)(rand()%(range*1000)/1000.0) - offset;
        }
    }
    
    // Record the minimum of a[] in m[0] and the maximum in m[1] (both seeded with 0).
    void getM(int num,double a[],double m[]){
        m[0] = m[1] = 0.0;
        for(int i=0;i<num;i++){
            if(a[i]<m[0]){
                m[0] = a[i];
            }else if(a[i]>m[1]){
                m[1] = a[i];
            }
        }
    }
    
    
    // Min-max normalize a[] into (0,1]; the +1 offset keeps values strictly positive.
    void normalize(int num,double a[],double m[]){
        for(int i =0;i<num;i++){
            a[i] = (a[i]-m[0]+1)/(m[1]-m[0]+1);
        }
    }
    
    
    // Undo normalize(): map a[] back to the original value range.
    void renorm(int num,double a[],double m[]){
        for(int i =0;i<num;i++){
            a[i] = a[i]*(m[1]-m[0]+1) + m[0] - 1;
        }
    }
    
    
    void printArray(int num,double a[]){
        for(int i=0;i<num;i++){
            printf("%6.4lf ",a[i]);
            if((i+1)%10==0){
                cout<<endl;
            }
        }
    }
    
    // Hidden-layer kernel: one thread per hidden unit computes
    // yh[i] = sigmoid(w1[i]*x1 + w2[i]*x2).
    __global__ static void hidenLayer(double x1,double x2,double w1[],double w2[],double yh[]){
        const int i = threadIdx.x;
        yh[i] = w1[i]*x1 + w2[i]*x2;
        yh[i] = 1/(1+pow(E,0-yh[i]));
    }
    
    // Output layer (host side): weighted sum of hidden activations through a sigmoid.
    double outLayer(double yh[],double v[]){
        double y2 = 0.0;   // must be initialized before accumulating
        for(int i=0;i<NEURESIZE;i++){
            y2 += yh[i] * v[i];
        }
        y2 = 1/(1+pow(E,0-y2));
        return y2;
    }
    
    // Weight-update kernel: one thread per hidden unit applies the post's
    // delta rule, scaled by the learning rate RW.
    __global__ static void update(double x1[],double x2[],double yh[],double v[],double w1[],double w2[],double *loss){
        const int i = threadIdx.x;
        w1[i] += x1[i] * (1-x1[i]) * (*loss) * RW;
        w2[i] += x2[i] * (1-x2[i]) * (*loss) * RW;
        v[i] += yh[i] * (*loss) * RW;
    }
    
    /*double test(double w1[],double w2[],double v[],double m1[],double m2[],double my[]){
        double tx1[TESTSIZE],tx2[TESTSIZE],ty[TESTSIZE],tyh[NEURESIZE],ty2[TESTSIZE];
        double avLoss = 0.0;
        
        init(TESTSIZE,10,tx1,0.0);
        init(TESTSIZE,10,tx2,0.0);
        
        for(int i=0;i<TESTSIZE;i++){
            ty[i] = tx1[i] + tx2[i];
        }
        normalize(TESTSIZE,tx1,m1);
        normalize(TESTSIZE,tx2,m2);
        for(int q=0;q<TESTSIZE;q++){
            hidenLayer(tx1[q],tx2[q],w1,w2,tyh);
            ty2[q] = outLayer(tyh,v);
        }
        
        renorm(TESTSIZE,ty2,my);
        for(int i=0;i<TESTSIZE;i++){
            if(i<10){
                printf("%2d y=%2.4f y2=%2.4f
    ",i,ty[i],ty2[i]);
            }
            avLoss += pow(ty[i]-ty2[i],2);
        }
        avLoss /= TESTSIZE;
        //cout<<avLoss<<endl;
        return avLoss;
    }*/
    
    
    int main(){
        ofstream outf;
        outf.open("trainloss.txt");
        srand( (unsigned)time(NULL) );
        long starttime = clock();
        double x1[DATASIZE],x2[DATASIZE],y[DATASIZE],y2[DATASIZE];
        double w1[NEURESIZE],w2[NEURESIZE],v[NEURESIZE],yh[NEURESIZE];
        double m1[2],m2[2],my[2];   // min/max of x1, x2, y for normalization
        double cLoss,realLoss,minTrainLoss = 1.0,minTestLoss = 1.0;
        init(DATASIZE,10,x1,0.0);   // inputs in [0,10)
        init(DATASIZE,10,x2,0.0);
        init(NEURESIZE,2,w1,1.0);   // weights in [-1,1)
        init(NEURESIZE,2,w2,1.0);
        init(NEURESIZE,2,v,1.0);
        
        for(int i=0;i<DATASIZE;i++){
            y[i] = x1[i] + x2[i];
        }
        
        // Initialize CUDA
        if (!InitCUDA()) {
            return 0;
        }
        // cudaMalloc: allocate buffers in device memory
        double *x1_g,*x2_g,*y_g,*y2_g;
        double *w1_g,*w2_g,*v_g,*yh_g;
        double *cLoss_g;
        cudaMalloc((void**)&x1_g, sizeof(double)* DATASIZE);
        cudaMalloc((void**)&x2_g, sizeof(double)* DATASIZE);
        cudaMalloc((void**)&y_g, sizeof(double)* DATASIZE);
        cudaMalloc((void**)&y2_g, sizeof(double)* DATASIZE);
        cudaMalloc((void**)&w1_g, sizeof(double)* NEURESIZE);
        cudaMalloc((void**)&w2_g, sizeof(double)* NEURESIZE);
        cudaMalloc((void**)&v_g, sizeof(double)* NEURESIZE);
        cudaMalloc((void**)&yh_g, sizeof(double)* NEURESIZE);
        cudaMalloc((void**)&cLoss_g, sizeof(double));
        
        // Normalize on the host before copying, so the device buffers hold
        // the same normalized values the training loop uses.
        getM(DATASIZE,x1,m1);
        getM(DATASIZE,x2,m2);
        getM(DATASIZE,y,my);
        normalize(DATASIZE,x1,m1);
        normalize(DATASIZE,x2,m2);
        normalize(DATASIZE,y,my);

        // cudaMemcpy copies the host data into device memory.
        // cudaMemcpyHostToDevice - copy from host memory to device memory
        // cudaMemcpyDeviceToHost - copy from device memory to host memory
        cudaMemcpy(w1_g,w1, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(w2_g,w2, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(v_g,v, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(x1_g,x1, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(x2_g,x2, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(y_g,y, sizeof(double)*DATASIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(yh_g,yh, sizeof(double)*NEURESIZE, cudaMemcpyHostToDevice);
        cudaMemcpy(cLoss_g,&cLoss, sizeof(double), cudaMemcpyHostToDevice);
        
        
        for(int j=0;j<EPOCH;j++){
            double tLoss = 0.0;
            for(int i=0;i<DATASIZE;i++){
                hidenLayer<< < 1, NEURESIZE, 0 >> >(x1_g[i],x2_g[i],w1_g,w2_g,yh_g);
                cudaMemcpy(yh,yh_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
                cudaMemcpy(v,v_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
                y2[i] = outLayer(yh,v);
                cLoss = y2[i] * (1-y2[i]) * (y[i]-y2[i]);
                cudaMemcpy(cLoss_g,&cLoss, sizeof(double), cudaMemcpyHostToDevice);
                update<< < 1, NEURESIZE, 0 >> >(x1_g,x2_g,yh_g,v_g,w1_g,w2_g,cLoss_g);
                cudaMemcpy(&cLoss,cLoss_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
                cLoss = pow(cLoss,2);
                cLoss = cLoss*(my[1]-my[0]+1);
                tLoss += cLoss;
            }
            tLoss /= DATASIZE;
            if(tLoss<minTrainLoss){
                minTrainLoss = tLoss;
            }
            printf("EPOCH--%d, trainLoss--%0.4f
    ",j,tLoss);
             outf<<j<<"	"<<tLoss<<endl;
            
            /*cudaMemcpy(w1,w1_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
            cudaMemcpy(w2,w2_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
            cudaMemcpy(v,v_g, sizeof(double)*NEURESIZE, cudaMemcpyDeviceToHost);
            double avLoss = test(w1,w2,v,m1,m2,my);
            printf("EPOCH--%d, avLoss--%0.4f
    ",j,avLoss);
            if(avLoss<minTestLoss){
                minTestLoss = avLoss;
            }*/
            cout<<"------------------"<<endl;
        }
        printf("minTrainLoss--%0.4f
    ",minTrainLoss);
        //printf("minTestLoss--%0.4f
    ",minTestLoss);
        outf.close();
        
        //Free
        cudaFree(x1_g);
        cudaFree(x2_g);
        cudaFree(y_g);
        cudaFree(y2_g);   // y2_g was allocated above, so free it too
        cudaFree(w1_g);
        cudaFree(w2_g);
        cudaFree(v_g);
        cudaFree(yh_g);
        cudaFree(cLoss_g);
        
        long endtime = clock()-starttime;
        float execution_time = (float)endtime / CLOCKS_PER_SEC;   // clock ticks -> seconds
        cout << "total time cost: " << execution_time << endl;
     
        
        
        return 0;
    }
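
    One caveat: the listing never checks CUDA API return codes, so a failed cudaMalloc or cudaMemcpy goes unnoticed. A small wrapper macro along these lines helps during debugging; CUDA_CHECK is my own name for this sketch, not part of the original code.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Abort with file/line and a readable message when a CUDA runtime call fails.
    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t err_ = (call);                                     \
            if (err_ != cudaSuccess) {                                     \
                fprintf(stderr, "CUDA error at %s:%d: %s\n",               \
                        __FILE__, __LINE__, cudaGetErrorString(err_));     \
                exit(EXIT_FAILURE);                                        \
            }                                                              \
        } while (0)

    // Example: CUDA_CHECK(cudaMalloc((void**)&x1_g, sizeof(double) * DATASIZE));

    Also, despite the main.cpp name in the header comment, a file containing __global__ kernels and <<<...>>> launches must be compiled with nvcc, normally saved as a .cu file, e.g. nvcc -o bp bp.cu.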
  • Original post: https://www.cnblogs.com/jzcbest1016/p/9006650.html