zoukankan      html  css  js  c++  java
  • sklearn中机器学习算法评价指标

    # Evaluation metrics for machine-learning classification algorithms
    # Metrics for a binary classification problem
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    d = datasets.load_digits()
    x = d.data
    # Relabel into a binary problem: 1 for digit 9, 0 for every other digit.
    # The comparison allocates a new array, so d.target itself is never modified.
    y = (d.target == 9).astype(d.target.dtype)
    print(len(y))
    print(y)
    # pd.value_counts() is deprecated; count occurrences through a Series instead.
    print(pd.Series(y).value_counts())  # number of samples per class
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)
    log_reg = LogisticRegression(solver="newton-cg")  # classify with logistic regression
    log_reg.fit(x_train, y_train)
    print(log_reg.score(x_test, y_test))
    y_pre = log_reg.predict(x_test)
    def TN(y_true,y_pre):
        """Count true negatives: samples labelled 0 that were also predicted 0.

        Both inputs are converted with np.asarray so that plain Python
        sequences are compared element-wise instead of as whole objects.
        """
        y_true = np.asarray(y_true)
        y_pre = np.asarray(y_pre)
        return np.sum((y_true == 0) & (y_pre == 0))
    def FP(y_true,y_pre):
        """Count false positives: samples labelled 0 that were predicted 1.

        Both inputs are converted with np.asarray so that plain Python
        sequences are compared element-wise instead of as whole objects.
        """
        y_true = np.asarray(y_true)
        y_pre = np.asarray(y_pre)
        return np.sum((y_true == 0) & (y_pre == 1))
    def FN(y_true,y_pre):
        """Count false negatives: samples labelled 1 that were predicted 0.

        Both inputs are converted with np.asarray so that plain Python
        sequences are compared element-wise instead of as whole objects.
        """
        y_true = np.asarray(y_true)
        y_pre = np.asarray(y_pre)
        return np.sum((y_true == 1) & (y_pre == 0))
    def TP(y_true,y_pre):
        """Count true positives: samples labelled 1 that were also predicted 1.

        Both inputs are converted with np.asarray so that plain Python
        sequences are compared element-wise instead of as whole objects.
        """
        y_true = np.asarray(y_true)
        y_pre = np.asarray(y_pre)
        return np.sum((y_true == 1) & (y_pre == 1))
    # Print the four confusion counts of the test split, in the order TN, FP, FN, TP.
    for count_fn in (TN, FP, FN, TP):
        print(count_fn(y_test, y_pre))
    # Hand-written confusion matrix
    def confusion_matrix(y_true,y_pre):
        """Return the 2x2 confusion matrix [[TN, FP], [FN, TP]] as a NumPy array.

        Rows correspond to the true label (0, then 1) and columns to the
        predicted label (0, then 1).
        """
        true_negative_row = [TN(y_true, y_pre), FP(y_true, y_pre)]
        true_positive_row = [FN(y_true, y_pre), TP(y_true, y_pre)]
        return np.array([true_negative_row, true_positive_row])
    # Show the hand-written confusion matrix for the test predictions.
    binary_cm = confusion_matrix(y_test, y_pre)
    print(binary_cm)
    # Precision
    def precision(y_true,y_pre):
        """Return precision, TP / (FP + TP), or 0.0 when nothing was predicted positive.

        The zero case is tested explicitly: the counts are NumPy integers, and
        NumPy division by zero yields nan with a warning instead of raising, so
        an exception handler would never supply the 0.0 fallback.
        """
        true_pos = TP(y_true, y_pre)
        predicted_pos = FP(y_true, y_pre) + true_pos
        if predicted_pos == 0:
            return 0.0
        return true_pos / predicted_pos
    # Recall
    def recall(y_true,y_pre):
        """Return recall, TP / (FN + TP), or 0.0 when there are no positive samples.

        The zero case is tested explicitly: the counts are NumPy integers, and
        NumPy division by zero yields nan with a warning instead of raising, so
        an exception handler would never supply the 0.0 fallback.
        """
        true_pos = TP(y_true, y_pre)
        actual_pos = FN(y_true, y_pre) + true_pos
        if actual_pos == 0:
            return 0.0
        return true_pos / actual_pos
    # Precision and recall from the hand-written helpers.
    for own_metric in (precision, recall):
        print(own_metric(y_test, y_pre))
    # The confusion matrix, precision and recall called directly from scikit-learn.
    # From this point on the name confusion_matrix refers to the scikit-learn function.
    from sklearn.metrics import confusion_matrix, precision_score, recall_score
    for sk_metric in (confusion_matrix, precision_score, recall_score):
        print(sk_metric(y_test, y_pre))
    print(log_reg.score(x_test, y_test))
    # F1 is the harmonic mean of precision and recall; when the two differ a lot,
    # the combined score is pulled towards the smaller of them.
    def F1(pre,rec):
        """Return the harmonic mean of two scalar scores, or 0.0 when they sum to zero.

        pre: precision as a scalar (Python or NumPy number).
        rec: recall as a scalar (Python or NumPy number).

        The zero denominator is tested explicitly because NumPy floats return
        nan on 0/0 instead of raising, and a bare except would also hide
        unrelated errors such as passing a non-numeric argument.
        """
        total = pre + rec
        if total == 0:
            return 0.0
        return (2 * pre * rec) / total
    # F1 of the classifier, built from the hand-written precision and recall.
    own_precision = precision(y_test, y_pre)
    own_recall = recall(y_test, y_pre)
    print(F1(own_precision, own_recall))
    # Two unbalanced pairs of scores.
    print(F1(0.1, 0.9))
    print(F1(0, 1))
    # The same metric through scikit-learn's f1_score.
    from sklearn.metrics import f1_score
    print(f1_score(y_test, y_pre))
    # Decision scores of the test set: the theta*X value of every sample, which the
    # default prediction compares against the decision boundary 0.
    decision_scores = log_reg.decision_function(x_test)
    print(decision_scores)
    # Moving the threshold away from 0 changes the precision and recall of the model.
    y_pre2 = (decision_scores >= 5).astype("int")
    print(precision(y_test, y_pre2))  # goes up (threshold raised)
    print(recall(y_test, y_pre2))  # goes down
    print(confusion_matrix(y_test, y_pre2))
    y_pre3 = (decision_scores >= -5).astype("int")
    print(precision(y_test, y_pre3))  # goes down (threshold lowered)
    print(recall(y_test, y_pre3))  # goes up
    print(confusion_matrix(y_test, y_pre3))
    print(y_pre3)
    # Plot how precision and recall change with the decision threshold
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    # Candidate thresholds from the smallest score up to (not including) the largest, in steps of 0.1.
    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
    pre = []
    rec = []
    for threshold in thresholds:
        # Every sample scoring strictly above the candidate threshold is predicted positive.
        y_pre11 = np.array(decision_scores > threshold, dtype="int")
        pre.append(precision_score(y_test, y_pre11))
        rec.append(recall_score(y_test, y_pre11))
    plt.figure()
    plt.plot(thresholds, pre, "r", thresholds, rec, "g")  # red: precision, green: recall
    plt.show()
    # Plot precision and recall against each other
    plt.plot(pre, rec, "g", linewidth=1)
    plt.show()
    # scikit-learn's precision_recall_curve returns the precision values, the recall
    # values and the matching decision thresholds in a single call.
    from sklearn.metrics import precision_recall_curve
    decision_scores = log_reg.decision_function(x_test)
    pre1, rec1, thre1 = precision_recall_curve(y_test, decision_scores)
    # The threshold array is one element shorter than the other two: the final point
    # (precision 1, recall 0) has no threshold attached to it.
    for curve_part in (rec1, pre1, thre1):
        print(curve_part.shape)
    plt.figure()
    # Drop that final point so both curves line up with the thresholds.
    plt.plot(thre1, pre1[:-1], "r", thre1, rec1[:-1], "g")
    plt.show()
    plt.plot(pre1, rec1)
    plt.show()
    # ROC curve (TPR against FPR) through scikit-learn
    from sklearn.metrics import roc_auc_score, roc_curve
    # Decision scores computed for the test set
    decision_scores = log_reg.decision_function(x_test)
    fpr, tpr, thre2 = roc_curve(y_test, decision_scores)
    plt.plot(fpr, tpr, "r")
    # The larger the area between the curve and the x axis, the better the model.
    plt.show()
    # roc_auc_score reports that area under the ROC curve.
    auc_value = roc_auc_score(y_test, decision_scores)
    print(auc_value)
    # Applying the evaluation metrics to a multi-class problem
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_score
    # recall_score is imported here as well so this section runs on its own.
    from sklearn.metrics import recall_score
    from sklearn.model_selection import train_test_split

    d = datasets.load_digits()
    x = d.data
    y = d.target  # all ten digit classes are kept this time
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)
    log1 = LogisticRegression()
    log1.fit(x_train, y_train)
    print(log1.score(x_test, y_test))
    y_p = log1.predict(x_test)
    # Multi-class precision and recall need an explicit average argument.
    print(precision_score(y_test, y_p, average="micro"))
    print(recall_score(y_test, y_p, average="micro"))
    print(confusion_matrix(y_test, y_p))  # multi-class confusion matrix
    # Draw the confusion matrix as a grey-scale image to compare the relative size of its entries
    c = confusion_matrix(y_test, y_p)
    plt.matshow(c, cmap=plt.cm.gray)  # the brighter a cell, the larger the count stored in it
    plt.show()
    row_sum = np.sum(c, axis=1)
    # Divide every row by its own total. The sums are reshaped into a column vector
    # because a plain c / row_sum broadcasts them across the columns and would divide
    # column j by the total of row j instead.
    erro_matrix = c / row_sum[:, np.newaxis]
    np.fill_diagonal(erro_matrix, 0)  # zero the diagonal so only the mistakes remain
    print(erro_matrix)
    # Show where the classifier goes wrong: the brighter the cell, the more frequent the error
    plt.matshow(erro_matrix, cmap=plt.cm.gray)
    plt.show()
  • 相关阅读:
    uva400 Unix ls
    cf641 div2 abcd
    cf619 div2 abcd
    cf620 div2 abcde
    atc160
    cf638 div2 abcd
    CodeCraft-20 (Div. 2) abcd
    cf Round 621 abcd
    luogu1941 飞扬的小鸟
    UVA1601 The Morning after Halloween
  • 原文地址:https://www.cnblogs.com/Yanjy-OnlyOne/p/12526433.html
Copyright © 2011-2022 走看看