zoukankan      html  css  js  c++  java
  • 逻辑回归中通过KFold寻找最优正则化系数C、画混淆矩阵以及寻找最优概率阈值

    lr中通过kfold寻找最优C

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import recall_score
    def printing_kfold_scores(x_train_data,y_train_data):
        """Select the LogisticRegression ``C`` with the best mean recall.

        Runs 5-fold (unshuffled) cross-validation for each candidate ``C`` in
        ``[0.01, 0.1, 1, 10, 100]`` using an L1-penalised logistic regression,
        prints the recall of every fold and the mean per candidate, and
        returns the candidate with the highest mean recall.

        Args:
            x_train_data: Feature table. Must support two-dimensional
                ``.iloc[rows, :]`` indexing (i.e. a pandas DataFrame).
            y_train_data: Label table with the same rows as ``x_train_data``,
                also indexed with ``.iloc[rows, :]``; it is flattened with
                ``ravel()`` before fitting, so presumably a single column.

        Returns:
            The value from the candidate list whose mean recall is highest.
        """
        fold = KFold(5, shuffle=False)
        c_param_range = [0.01, 0.1, 1, 10, 100]

        # One row per candidate C. The original index was
        # range(len(c_param_range), 2), which is an empty range.
        result_table = pd.DataFrame(
            index=range(len(c_param_range)),
            columns=['C_parameter', 'Mean recall score'],
        )
        result_table['C_parameter'] = c_param_range

        for row, c_param in enumerate(c_param_range):
            print('-------------------------------------------')
            print('C parameter: ', c_param)
            print('-------------------------------------------')
            print('')

            recall_accs = []
            for iteration, (train_idx, test_idx) in enumerate(fold.split(y_train_data), start=1):
                # The L1 penalty needs a solver that supports it; the default
                # solver of recent scikit-learn releases (lbfgs) does not.
                lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
                # Fit and predict on plain arrays so both calls see the same
                # input type (avoids the feature-name mismatch warning).
                lr.fit(
                    x_train_data.iloc[train_idx, :].values,
                    y_train_data.iloc[train_idx, :].values.ravel(),
                )
                y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :].values)
                recall_acc = recall_score(
                    y_train_data.iloc[test_idx, :].values, y_pred_undersample
                )
                recall_accs.append(recall_acc)
                print('Iteration ', iteration, ': recall score = ', recall_acc)

            mean_recall = np.mean(recall_accs)
            result_table.loc[row, 'Mean recall score'] = mean_recall
            print('')
            print('Mean recall score ', mean_recall)
            print('')

        # The score column is created with object dtype, so cast to float
        # before asking for the position of the maximum.
        best_row = result_table['Mean recall score'].astype('float').idxmax()
        best_c = result_table.loc[best_row, 'C_parameter']
        print('*********************************************************************************')
        print('Best model to choose from cross validation is with C parameter = ', best_c)
        print('*********************************************************************************')

        return best_c
    # Cross-validate on the under-sampled training split to choose C.
    best_c = printing_kfold_scores(
        x_train_data=X_train_undersample,
        y_train_data=y_train_undersample,
    )
    

      

    画混淆矩阵部分

    def plot_confusion_matrix(cm,classes,title ='Confusion matrix',cmap=plt.cm.Blues):
        """Draw a confusion matrix as an annotated heat map on the current axes.

        Args:
            cm: Two-dimensional array of counts; it is indexed as ``cm[i, j]``
                and must provide ``.max()`` and ``.shape``.
            classes: Sequence of class labels used for both axis tick labels.
            title: Title placed above the plot.
            cmap: Matplotlib colormap used for the cell colours.

        The function only draws; it does not create a figure or call
        ``plt.show()``.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()

        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=0)
        plt.yticks(tick_marks, classes)

        # Cells darker than half the maximum get white text so the number
        # stays readable against the background colour.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        # Run the layout pass after the labels exist so it can reserve room
        # for them.
        plt.tight_layout()

    lr中实现混淆矩阵

    import itertools
    from sklearn.metrics import confusion_matrix
    # Refit on the whole under-sampled training split with the selected C.
    # solver='liblinear' is required for the L1 penalty: the default solver of
    # recent scikit-learn releases (lbfgs) rejects penalty='l1'.
    lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
    # Fit and predict on plain arrays so both calls see the same input type.
    lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
    y_pred_undersample = lr.predict(X_test_undersample.values)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
    np.set_printoptions(precision=2)

    # Recall = TP / (FN + TP), read from the row of the positive class.
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Confusion matrix')
    plt.show()
    

      

    lr通过混淆矩阵找最优概率阈值

    # Fit once with a fixed C, then vary the decision threshold applied to the
    # predicted probability of the positive class.
    # solver='liblinear' is required for the L1 penalty: the default solver of
    # recent scikit-learn releases (lbfgs) rejects penalty='l1'.
    lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    # Fit and predict on plain arrays so both calls see the same input type.
    lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
    y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    plt.figure(figsize=(10, 10))
    np.set_printoptions(precision=2)

    # One subplot per threshold, laid out on a 3x3 grid.
    for position, threshold in enumerate(thresholds, start=1):
        # Column 1 of predict_proba is the probability of the positive class.
        y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold

        plt.subplot(3, 3, position)

        # Compute confusion matrix
        cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)

        print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

        # Plot non-normalized confusion matrix. The title uses '>' because the
        # comparison above is strict.
        class_names = [0, 1]
        plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold > %s' % threshold)

    plt.show()
    

      

  • 相关阅读:
    Java中的阻塞队列
    大数据笔记
    物联网小笔记
    shell 笔记
    Redis笔记
    rabbitMQ笔记
    java.lang.NoClassDefFoundError: freemarker/template/Template
    分布式系列学习-事务处理
    免费无需破解xshell xftp下载
    idea maven模块变灰或者java文件夹非 Sources文件夹
  • 原文地址:https://www.cnblogs.com/yang520ming/p/13796929.html
Copyright © 2011-2022 走看看