zoukankan      html  css  js  c++  java
  • 使用神经网络做二分类预测

    不想整理代码了。先给个结果图:

    train 0 loss: 1838.0616
    train 100 loss: 1441.5283
    train 200 loss: 1299.4546
    train 300 loss: 934.36536
    train 400 loss: 506.06702
    train 500 loss: 322.9782
    train 600 loss: 271.5825
    train 700 loss: 360.091
    train 800 loss: 237.25177
    train 900 loss: 332.97592
    train 1000 loss: 117.5983
    train 1100 loss: 173.39397
    train 1200 loss: 51.26674
    train 1300 loss: 82.82826
    train 1400 loss: 74.705734
    train 1500 loss: 113.63321
    train 1600 loss: 71.29809
    train 1700 loss: 38.41456
    train 1800 loss: 82.75247
    train 1900 loss: 44.553272
    test 0,accuracy:0.953125,auc: (0.0, 0.9708618)
    test 1,accuracy:0.9375,auc: (0.9708618, 0.96028894)
    test 2,accuracy:0.9609375,auc: (0.96028894, 0.9594982)
    test 3,accuracy:0.953125,auc: (0.9594982, 0.96195656)
    test 4,accuracy:0.9375,auc: (0.96195656, 0.9627208)

    loss这么大,结果这么准确。我也搞不懂是怎么肥事呀。

                                                         

    AUC也没什么问题。暂时认为是好的吧。

    下面是源码dataUtil用来对数据预处理:

    import pandas as pd
    import numpy as np
    def load_csv(filename, max_cols=39):
        """Load a CSV file and keep only the first ``max_cols`` columns.

        Args:
            filename: path of the CSV file to read.
            max_cols: number of leading columns to retain (default 39, the
                cutoff previously hard-coded); trailing columns are dropped.

        Returns:
            pandas.DataFrame with at most ``max_cols`` columns.
        """
        data = pd.read_csv(filename)
        # Drop every column past the cutoff; a no-op when the file is narrower.
        data = data.drop(data.columns[max_cols:], axis=1)
        return data
    def toInt(y):
        """Coerce a single label value to a plain int (truncating floats)."""
        converted = int(y)
        return converted
    def split_x_y(data):
        """Split a DataFrame into a feature matrix and a label vector.

        Every column except "Label" becomes the feature matrix x (original
        column order preserved); the "Label" column becomes y.

        Returns:
            (x, y) tuple of numpy arrays.
        """
        y = data["Label"]
        feature_cols = [col for col in data.keys().tolist() if col != "Label"]
        x = data[feature_cols]
        return (x.values, y.values)
    def max_min_nomalize(X):
        """Normalize every feature column to (value - mean) / (max - min).

        The "Index" and "Label" columns are left untouched. Each normalized
        column is dropped and re-appended, so feature columns migrate to the
        end of the frame in iteration order (callers rely on the resulting
        order). The caller's DataFrame is not modified; a new frame is
        returned.
        """
        feature_cols = X.keys().tolist()
        feature_cols.remove("Index")
        feature_cols.remove("Label")
        for col in feature_cols:
            series = X[col]
            # Mean-centre, then scale by the column's value range.
            spread = series.max() - series.min()
            scaled = (series - series.mean()) / spread
            X = X.drop(col, axis=1)
            X[col] = scaled
        return X
    
    if __name__=="__main__":
        # Ad-hoc exploration of the raw patient CSV: widen pandas' console
        # output, load the file, and print summary statistics.
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 500)
        data=load_csv("./data/patient_data.csv")
        #print(data.head())
        # NOTE: DataFrame.info() prints its report itself and returns None,
        # so this line also prints a trailing "None".
        print(data.info())
        print(data.describe())
        print(data.count())
        # Everything below is commented-out exploration kept for reference:
        # label distribution, filtering rows with JiGan == -1, per-column
        # value counts, and a normalization dry run.
        #print(data["Label"].value_counts())
        #data=data[data["JiGan"].isin([-1.0])==False]
       # print(data)
        #print(data)
        #print(data.describe())
        #x=max_min_nomalize(data)
        # for key in data.keys().tolist():
        #     print("********************************************************{}**********************************".format(key))
        #     print(data[key].value_counts())
        #data=load_csv("F:workspacespycharmPatientdatapatient_data.csv")
        #data=max_min_nomalize(data)
        #print(data.head())
    dataUtil

    然后是dnnModel用来构建模型:

    import tensorflow as tf
    import numpy as np
    from sklearn.metrics import roc_auc_score, roc_curve
    import matplotlib.pyplot as plt
    from tensorflow.contrib import layers
    
    
    class dnnModel():
        """Fully-connected feed-forward classifier built on TF1 graph APIs.

        Holds train/test numpy arrays, builds a multi-layer softmax network in
        train(), and also offers an unrelated linear-SVM trainer in
        svm_train(). Labels are expected one-hot encoded (shape (n, classes))
        for the DNN path.
        """
        def __init__(self,x_train,y_train,x_test,y_test,learn_rate):

            # `epoch` doubles as the batch cursor used by next_batch().
            self.epoch=0
            self.learn_rate=learn_rate
            # NOTE(review): these widths are never read; constructDnn()
            # hard-codes its own layer sizes (500/500/1024).
            self.h1_dimen=500
            self.h2_dimen=500
            self.load_data2(x_train,y_train,x_test,y_test)
            #self.load_data(x_train, y_train)
        def load_data2(self,x_train,y_train,x_test,y_test):
            """Store pre-split train/test arrays, derive dimensions, shuffle."""
            self.x_datas=x_train
            self.y_datas=y_train
            self.x_datas_test=x_test
            self.y_datas_test=y_test
            self.num_datas=self.y_datas.shape[0]
            self.num_datas_test=self.y_datas_test.shape[0]
            self.input_dimen=self.x_datas.shape[1]
            # One-hot labels: shape[1] is the number of classes.
            self.output_dimen = self.y_datas.shape[1]
            self.shullf()
        def load_data(self,x,y):
            """Alternative loader: split one dataset 80/20 into train/test."""
            datas_len=x.shape[0]
            self.x_datas=x[0:datas_len*8//10]
            self.y_datas=y[0:datas_len*8//10]
            self.x_datas_test=x[datas_len*8//10:]
            self.y_datas_test=y[datas_len*8//10:]
            self.num_datas=self.y_datas.shape[0]
            self.num_datas_test=self.y_datas_test.shape[0]
            self.input_dimen=self.x_datas.shape[1]
            self.output_dimen = self.y_datas.shape[1]
            self.shullf()
            #self.output_dimen = 1
        def shullf(self):
            """Shuffle train and test sets independently (typo for "shuffle")."""
            perm=np.arange(self.num_datas)
            np.random.shuffle(perm)
            self.x_datas=self.x_datas[perm]
            self.y_datas=self.y_datas[perm]
            perm=np.arange(self.num_datas_test)
            np.random.shuffle(perm)
            self.x_datas_test=self.x_datas_test[perm]
            self.y_datas_test=self.y_datas_test[perm]
        def weight_variable(self,shape,reg=True):
            """Create a weight Variable initialized from a standard normal.

            NOTE(review): the L2 regularizer constructed below is never
            attached to the variable or any collection — dead code; the `reg`
            flag has no effect on the returned Variable.
            """
            init=tf.random_normal(shape=shape,dtype=tf.float32)
            if reg:
                if reg == True:
                    regularizer = layers.l2_regularizer(0.05)
                else:
                    regularizer = None
            return tf.Variable(init)
        def bias_variable(self,shape):
            """Create a bias Variable initialized to the constant 0.1."""
            init=tf.constant(0.1,dtype=tf.float32,shape=shape)
            return tf.Variable(init)
        def next_batch(self,batchsize):
            """Return the next (x, y) TRAINING batch.

            Advances a cursor through the training arrays; when it would run
            past the end, the training data is reshuffled and the cursor
            restarts from 0.
            """
            start=self.epoch
            self.epoch+=batchsize
            if self.epoch>self.num_datas:
                perm=np.arange(self.num_datas)
                np.random.shuffle(perm)
                self.x_datas=self.x_datas[perm]
                self.y_datas=self.y_datas[perm]
                self.epoch=batchsize
                start=0
            end=self.epoch
            return self.x_datas[start:end],self.y_datas[start:end]
        def add_layer(self,x,input_dimen,output_dimen,name,relu=True):
            """Append one fully-connected layer (optionally ReLU-activated),
            logging weight/bias histograms to TensorBoard summaries."""
            with tf.name_scope(name):
                weight = self.weight_variable([input_dimen, output_dimen])
                bias = self.bias_variable([output_dimen])
                tf.summary.histogram(name+"/weight",weight)
                tf.summary.histogram(name+"/bias",bias)
                if relu:
                    return tf.nn.relu(tf.matmul(x,weight)+bias)
                else:
                    return tf.matmul(x,weight)+bias
        def constructDnn(self,input_x):
            """Build the graph: input -> 500 -> 500 -> 1024 -> logits."""
            # input layer
            input_layer=self.add_layer(input_x,self.input_dimen,500,name="input_layer",relu=True)
            # first hidden layer
            h1=self.add_layer(input_layer,500,500,relu=True,name="hidden_layer1")
            h1_drop=tf.nn.dropout(h1,keep_prob=0.7)
            # second hidden layer
            h2=self.add_layer(h1_drop,500,1024,relu=True,name="hidden_layer2")
            h2_drop=tf.nn.dropout(h2,keep_prob=0.8)
            # third hidden layer (disabled)
            # h3 = self.add_layer(h2_drop, 500, 500, relu=True, name="hidden_layer2")
            # h3_drop = tf.nn.dropout(h3, keep_prob=0.8)
            # output layer: raw logits; softmax is applied inside the loss op
            output_layer=self.add_layer(h2_drop,1024,self.output_dimen,"output_layer",relu=False)
            tf.summary.histogram('/outputs', output_layer)
            return output_layer

        def train(self,maxTrainTimes,batchsize):
            """Train with Adam on softmax cross-entropy, then evaluate.

            Runs `maxTrainTimes` minibatch steps (loss printed every 100),
            then loops over `num_datas_test // batchsize` evaluation batches
            printing accuracy and streaming AUC and plotting an ROC curve per
            batch.
            """
            X=tf.placeholder(dtype=tf.float32,shape=[None,self.input_dimen])
            Y=tf.placeholder(dtype=tf.float32,shape=[None,self.output_dimen])
            y_pre=self.constructDnn(X)
            entropy=tf.nn.softmax_cross_entropy_with_logits(logits=y_pre,labels=Y)
            #entropy=-tf.reduce_sum(Y*tf.log(tf.nn.softmax(y_pre)))
            loss = tf.reduce_mean(entropy)
            optimizer=tf.train.AdamOptimizer(self.learn_rate).minimize(loss)

            with tf.name_scope("evl"):
                correct=tf.equal(tf.argmax(y_pre,1),tf.argmax(Y,1))
                accuracy=tf.reduce_mean(tf.cast(correct,dtype=tf.float32))
                # NOTE(review): streaming_auc is fed hard argmax predictions
                # rather than class probabilities, which coarsens the AUC.
                a = tf.cast(tf.argmax(y_pre, 1), tf.float32)
                b = tf.cast(tf.argmax(Y, 1), tf.float32)
                auc = tf.contrib.metrics.streaming_auc(a, b)
            tf.summary.scalar("loss", loss)
            tf.summary.scalar("accuracy", accuracy)
            #tf.summary.scalar("auc", auc)
            merged_summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter('./tmp/mnist_logs')
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                # local variables back the streaming AUC accumulators
                sess.run(tf.initialize_local_variables())
                summary_writer.add_graph(sess.graph)
                for i in range(maxTrainTimes):
                    x_train,y_train=self.next_batch(batchsize)
                    sess.run(optimizer,feed_dict={X:x_train,Y:y_train})
                    #print(sess.run(y_pre,feed_dict={X:x_train,Y:y_train}))
                    #print(sess.run(entropy, feed_dict={X: x_train, Y: y_train}))
                    if i%100==0:
                        print("train {} loss:".format(i),sess.run(loss,feed_dict={X:x_train,Y:y_train}))
                        s = sess.run(merged_summary_op, feed_dict={X:x_train,Y:y_train})
                        summary_writer.add_summary(s, i)
                testTime=self.num_datas_test//batchsize
                # NOTE(review): next_batch() draws from the TRAINING arrays, so
                # this "test" loop actually evaluates on training data — the
                # held-out x_datas_test/y_datas_test are never fed. That likely
                # explains the suspiciously high reported accuracy.
                for i in range(testTime):
                    x_train, y_train = self.next_batch(batchsize)
                    testAcc=sess.run(accuracy, feed_dict={X: x_train, Y: y_train})
                    testAuc=sess.run(auc,feed_dict={X: x_train, Y: y_train})
                    # NOTE(review): raw logits (not softmax probabilities) are
                    # passed to roc_auc_score/roc_curve — verify intended.
                    y_pred_pro = sess.run(y_pre,feed_dict={X: x_train, Y: y_train})
                    y_scores = np.array(y_pred_pro)
                    auc_value = roc_auc_score(y_train, y_scores)
                    # column 1 = positive class, both for labels and scores
                    a=np.array(y_train)[:,1]
                    b=y_scores[:,1]
                    fpr, tpr, thresholds = roc_curve(a,b , pos_label=1.0)
                    plt.figure(figsize=(6, 4))
                    plt.plot(fpr, tpr, color='blue', linewidth=2, label='AUC (area:%0.4f)' % auc_value)
                    plt.plot([0, 1], [0, 1], color='black', linewidth=2, linestyle='--')
                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.0])
                    plt.xlabel('False Positive Rate')
                    plt.ylabel('True Positive Rate')
                    plt.title('ROC')
                    plt.legend(loc="lower right")
                    plt.show()
                    print("test {},accuracy:{},auc: {}".format(i,testAcc,testAuc))
        def svm_train(self,maxTrainTimes,batchsize):
            """Train a linear soft-margin SVM with hinge loss (alternative model).

            NOTE(review): hinge loss assumes targets in {-1, +1}; the labels
            produced elsewhere in this project look like 0/1 — confirm before
            using this path.
            """
            # placeholders for features and (batch, 1) targets
            x_data = tf.placeholder(shape=[None, self.input_dimen], dtype=tf.float32)
            y_target = tf.placeholder(shape=[None,1], dtype=tf.float32)

            # model variables
            A = tf.Variable(tf.random_normal(shape=[self.input_dimen, 1]))
            b = tf.Variable(tf.random_normal(shape=[1, 1]))

            # linear model: xA - b
            model_output = tf.subtract(tf.matmul(x_data, A), b)

            # Declare vector L2 'norm' function squared
            l2_norm = tf.reduce_sum(tf.square(A))

            # Loss = max(0, 1-pred*actual) + alpha * L2_norm(A)^2
            alpha = tf.constant([0.01])
            classification_term = tf.reduce_mean(tf.maximum(0., tf.subtract(1., tf.multiply(model_output, y_target))))
            loss = tf.add(classification_term, tf.multiply(alpha, l2_norm))
            my_opt = tf.train.GradientDescentOptimizer(0.01)
            train_step = my_opt.minimize(loss)
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                for i in range(maxTrainTimes):
                    train_x,train_y=self.next_batch(batchsize)
                    train_y=train_y.reshape([-1,1])
                    sess.run(train_step, feed_dict={x_data: train_x, y_target: train_y})
                    if i%100==0:
                        print("loss in train step {}: {}".format(i,sess.run(loss,feed_dict={x_data: train_x, y_target: train_y})))
    dnnModel

    最后是程序入口,读取数据,喂给神经网络:

    import dataUtil
    import dnnModel
    import tensorflow as tf
    import numpy as np
    import numpy as np
    import tensorflow as tf
    
    
    def one_hot(labels,class_num):
        """One-hot encode binary labels.

        Any non-zero label maps to class 1 and zero maps to class 0 (the same
        binarization changeToint() performs). Reimplemented with numpy fancy
        indexing instead of building a TF graph node and spawning a Session
        per call; also no longer mutates the caller's label sequence.

        Args:
            labels: iterable of numeric (or numeric-string) labels.
            class_num: number of classes / width of each one-hot row.

        Returns:
            numpy int32 array of shape (len(labels), class_num).
        """
        binary = [0 if int(v) == 0 else 1 for v in labels]
        return np.eye(class_num, dtype=np.int32)[binary]
    def changeToint(list):
        """Binarize labels in place: 0 stays 0, any other value becomes 1.

        The sequence is mutated in place and the same object is returned, so
        callers may chain on the result (e.g. ``changeToint(y).reshape(...)``
        when y is an ndarray).

        Fixes over the original: removed the unused ``a=range(len(list))``
        binding and the dead debug prints; collapsed the if/else to a single
        conditional expression. (The parameter name shadows the ``list``
        builtin but is kept unchanged for interface stability.)
        """
        for i in range(len(list)):
            list[i] = 0 if int(list[i]) == 0 else 1
        return list
    def select_x_y(x,y):
        """Split samples into train/test sets keyed on column 0 (the Index).

        Rows whose index falls strictly between 20000 and 30000, or above
        40000, become training data; every other row becomes test data.
        Column 0 itself is stripped from the returned feature matrices.

        Returns:
            (x_train, y_train, x_test, y_test) as numpy arrays.
        """
        train_rows, train_labels = [], []
        test_rows, test_labels = [], []
        for row, label in zip(x, y):
            index_value = row[0]
            is_train = 20000 < index_value < 30000 or index_value > 40000
            if is_train:
                train_rows.append(row[1:])
                train_labels.append(label)
            else:
                test_rows.append(row[1:])
                test_labels.append(label)
        return np.array(train_rows),np.array(train_labels),np.array(test_rows),np.array(test_labels)
    
    if __name__=="__main__":
        # End-to-end pipeline: load CSV -> normalize features -> one-hot the
        # labels -> split train/test by the Index column -> train the DNN.
        # NOTE(review): the path below appears to have lost its backslashes in
        # transcription ("F:workspaces..."); confirm the real path.
        data=dataUtil.load_csv("F:workspacespycharmPatientdatapatient_data.csv")
        #for key in data.keys().tolist():
        # print(data.info())
        # print(data["ShuZhangYa"].count())
        x,y=dataUtil.split_x_y(dataUtil.max_min_nomalize(data))
        # y becomes an (n, 2) one-hot integer matrix
        y=one_hot(y,2)
        #y=changeToint(y).reshape([-1,1])
        #print(y)
        # train/test split keyed on the Index column left in x[:,0]
        x_train,y_train,x_test,y_test=select_x_y(x,y)
        #from tensorflow.examples.tutorials.mnist import input_data
        #MNIST = input_data.read_data_sets("mnist", one_hot=True)
        #mydnn=dnnModel.dnnModel(MNIST.train.images,MNIST.train.labels,0.1)
        #mydnn = dnnModel.dnnModel(x,changeToint(y).reshape([-1,1]),0.1)
        # learning rate 0.001
        mydnn = dnnModel.dnnModel(x_train,y_train,x_test,y_test, 0.001)
        #mydnn.svm_train(1000,50)
        # 2000 optimizer steps, batch size 128
        mydnn.train(2000,128)
    main
  • 相关阅读:
    设置div内的内容不能被选中
    js中!和!!的区别及用法
    echars自定义提示框位置
    数组排序
    查询一个表的不同数据,显示到一个表
    document.body
    设置fixed,横向滚动条失效
    scrollTop
    JS 数组
    JS 数学函数
  • 原文地址:https://www.cnblogs.com/superxuezhazha/p/9544288.html
Copyright © 2011-2022 走看看