zoukankan      html  css  js  c++  java
  • SKlearn

    ROC、AUC 的理论知识 请参考我的博客 分类模型评估

    本文旨在 总结 其在 SKlearn 中的用法

    基础用法

    先看源码

    def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
                  drop_intermediate=True):
        """Compute Receiver operating characteristic (ROC)

        Parameters
        ----------
        y_true : array, shape = [n_samples]
            True binary labels. If labels are not either {-1, 1} or {0, 1}, then
            pos_label should be explicitly given.
    
        y_score : array, shape = [n_samples]
            Target scores, can either be probability estimates of the positive
            class, confidence values, or non-thresholded measure of decisions
            (as returned by "decision_function" on some classifiers).
    
        pos_label : int or str, default=None
            The label of the positive class.
            When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
            ``pos_label`` is set to 1, otherwise an error will be raised.
            In other words, this selects which of the labels counts as the
            positive class: with labels [1, 2] where 2 is the positive class,
            pass ``pos_label=2``.
    
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.
    
        drop_intermediate : boolean, optional (default=True)
            Whether to drop some suboptimal thresholds which would not appear
            on a plotted ROC curve. This is useful in order to create lighter
            ROC curves.
    
            .. versionadded:: 0.17
               parameter *drop_intermediate*.
    
        Returns
        -------
        fpr : array, shape = [>2]
            Increasing false positive rates such that element i is the false
            positive rate of predictions with score >= thresholds[i].
    
        tpr : array, shape = [>2]
            Increasing true positive rates such that element i is the true
            positive rate of predictions with score >= thresholds[i].
    
        thresholds : array, shape = [n_thresholds]
            Decreasing thresholds on the decision function used to compute
            fpr and tpr. `thresholds[0]` represents no instances being predicted
            and is arbitrarily set to `max(y_score) + 1`.
            NOTE(review): this sentinel is version-dependent; newer scikit-learn
            releases appear to use `np.inf` instead - confirm against the
            installed version.
        """

    然后看一个最普通的示例,包括 ROC 的计算、AUC 的计算、ROC 曲线绘制

    import numpy as np
    from sklearn.metrics import roc_curve, auc, roc_auc_score
    import matplotlib.pyplot as plt
    
    
    # ---------------- ROC and AUC on a four-sample toy problem ----------------
    # The labels are {1, 2} rather than {0, 1}, so roc_curve must be told
    # explicitly which label is the positive class.
    labels = np.array([1, 1, 2, 2])
    predicted_scores = np.array([0.1, 0.4, 0.35, 0.8])
    positive_label = 2

    # ---- ROC curve ----
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(
        labels, predicted_scores, pos_label=positive_label)
    # Values quoted by the article, in print order:
    #   fpr        -> 0, 0, 0.5, 0.5, 1
    #   tpr        -> 0, 0.5, 0.5, 1, 1
    #   thresholds -> 1.8, 0.8, 0.4, 0.35, 0.1
    # NOTE(review): the leading 1.8 is the "max(y_score) + 1" sentinel; the
    # exact value of that first threshold may differ between scikit-learn
    # versions - confirm against the installed one.
    for series in (false_pos_rate, true_pos_rate, cutoffs):
        print(series)

    # How to read the (threshold, fpr, tpr) triples, from lowest threshold up:
    #   0.1  - every negative AND every positive is called positive: the
    #          threshold is too low, everything is accepted.
    #   0.35 - half of the negatives are called positive while all positives
    #          are found: too many false accepts (for face-recognition cash
    #          withdrawal, strangers could empty the account).
    #   0.4  - half of the negatives and only half of the positives are called
    #          positive: a poor operating point.
    #   0.8  - no negative is called positive, but only half of the positives
    #          are found: nobody else can withdraw the money, though the owner
    #          may often be rejected too.
    #   1.8  - nothing is called positive: the threshold is too high.

    # ---- AUC (area under the ROC curve), computed two equivalent ways ----
    area_from_curve = auc(false_pos_rate, true_pos_rate)
    area_from_scores = roc_auc_score(labels, predicted_scores)
    print(area_from_curve)      # 0.75
    print(area_from_scores)     # 0.75

    # ---- Draw the ROC curve ----
    plt.plot(false_pos_rate, true_pos_rate)
    plt.show()

    输出

    EER 选择模型阈值

    ROC 用于优化模型

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris
    from sklearn.metrics import roc_auc_score,roc_curve
    import matplotlib.pyplot as plt
    import numpy as np
    
    # Load iris and collapse its three classes into a binary problem:
    # original class 2 becomes the positive class (1); classes 0 and 1 are
    # merged into the negative class (0). A new array is built so the
    # loaded dataset is not modified in place.
    iris = load_iris()
    binary_target = (iris.target == 2).astype(int)

    # A fixed random_state makes the split, and therefore every number printed
    # below, reproducible from run to run.
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, binary_target, test_size=0.3, random_state=0)

    # The target is binary, so no multi-class strategy has to be specified
    # (the `multi_class` argument is deprecated in recent scikit-learn).
    model = LogisticRegression(solver='newton-cg')
    model.fit(x_train, y_train)
    y_pre = model.predict_proba(x_test)
    print('predict_proba is', y_pre)

    # Column 1 holds P(class == 1): values near 0 mean "negative", values
    # near 1 mean "positive", which is the score roc_curve expects.
    pos_prob = y_pre[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, pos_prob)
    # Decreasing thresholds; the first entry is roc_curve's "predict nothing"
    # sentinel rather than a real score.
    print(thresholds)

    auc_value = roc_auc_score(y_test, pos_prob)
    print('AUC is', auc_value)

    # KS statistic: the largest gap between TPR and FPR along the curve.
    # np.argmax returns the first maximum, matching a strict ">" scan.
    ks_values = tpr - fpr
    best_idx = int(np.argmax(ks_values))
    KS_max = ks_values[best_idx]
    best_thr = thresholds[best_idx]
    # The article's own run (different random split) printed 1.0 and
    # 0.6998150731799142 here; actual values depend on the split.
    print('最大KS为:', KS_max)
    print('最佳阈值为:', best_thr)

    # Plot the ROC curve, mark each operating point, and draw the
    # anti-diagonal TPR = 1 - FPR in red (where false-accept and false-reject
    # rates are equal).
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot(fpr, tpr, 'o')
    plt.plot([0, 1], [1, 0], 'r')
    # Plain text title: wrapped in '$...$' it would be parsed as mathtext,
    # which drops the space between the two words.
    plt.title('ROC curve')
    plt.show()

    输出

    显然 第 3 个点(圆圈内) 离 直线 y = 1 - x(图中红线,从 (0,1) 到 (1,0))最近 

    one vs rest 多分类 ROC

    每个二分类都有一个 ROC

    import numpy as np
    import matplotlib.pyplot as plt
    from itertools import cycle

    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import label_binarize
    from sklearn.multiclass import OneVsRestClassifier
    # `scipy.interp` was only a deprecated alias of `numpy.interp`; importing
    # it from NumPy keeps the name available on current SciPy releases.
    from numpy import interp

    # Load the iris data set.
    iris = datasets.load_iris()
    X = iris.data  # X.shape==(150, 4)
    y = iris.target  # y.shape==(150, )

    # One-hot (binarized) labels: one column per class.
    y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
    n_classes = y.shape[1]  # n_classes==3

    # Append pure-noise features to make the problem harder.
    random_state = np.random.RandomState(0)
    n_samples, n_features = X.shape  # n_samples==150, n_features==4
    # 4 original columns + 200 * 4 noise columns
    X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]  # shape==(150, 804)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

    # Train one binary classifier per class (that class vs. the rest) and
    # score the test set with the raw decision function.
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    # ROC curve and AUC for each class, keyed by class index.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])


    plt.figure()
    lw = 2
    color = ['r', 'g', 'b']
    # Iterate over the actual number of classes and name the class in each
    # legend entry so the three curves can be told apart.
    for i, curve_color in zip(range(n_classes), color):
        plt.plot(fpr[i], tpr[i], color=curve_color, lw=lw,
                 label='ROC curve of class %d (area = %0.2f)' % (i, roc_auc[i]))
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    输出

    多分类 - 宏 ROC 微 ROC

    宏 ROC 和 微 ROC 好像有点绕,个人觉得 参考资料 中 这两个搞反了,本人做如下解释来区分这两个概念

    宏 ROC:先让 每个 二分类 独自计算,再算总的

    微 ROC:先把 每个 二分类 综合(加)起来,再算总的

    import numpy as np
    import matplotlib.pyplot as plt
    from itertools import cycle

    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import label_binarize
    from sklearn.multiclass import OneVsRestClassifier
    # `scipy.interp` was only a deprecated alias of `numpy.interp`; importing
    # it from NumPy keeps the name available on current SciPy releases.
    from numpy import interp
    from scipy.interpolate import lagrange, interp1d


    iris = datasets.load_iris()
    X = iris.data  # X.shape==(150, 4)
    y = iris.target  # y.shape==(150, )

    # One-hot (binarized) labels: one column per class.
    y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
    n_classes = y.shape[1]  # n_classes==3

    # Append pure-noise features to make the problem harder.
    random_state = np.random.RandomState(0)
    n_samples, n_features = X.shape  # n_samples==150, n_features==4
    # 4 original columns + 200 * 4 noise columns
    X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]  # shape==(150, 804)

    # Shuffle and split into training and test halves.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
    y_score = classifier.fit(X_train, y_train).decision_function(X_test)

    # ROC curve and AUC for each class, keyed by class index.
    fpr = dict()        # false positive rate
    tpr = dict()        # true positive rate
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # ---------------- Micro-average ROC curve and AUC ----------------
    # Pool first, then compute: flatten the label and score matrices so every
    # (sample, class) pair becomes one binary decision, and build a single
    # ROC curve from the pooled decisions.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # ---------------- Macro-average ROC curve and AUC ----------------
    # Compute first, then combine: each class already has its own curve
    # (above); average those curves with equal weight per class.
    # Common x-axis: the union of every FPR value seen in any class curve.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    print(all_fpr.shape)        # the article's run printed (42,)

    # Interpolate each class's TPR onto the common FPR grid and accumulate.
    # (scipy.interpolate.interp1d(fpr[i], tpr[i])(all_fpr) is the alternative
    # spelling the article mentions for other library versions.)
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Turn the sum into the per-class mean, then integrate for the AUC.
    mean_tpr /= n_classes
    print(mean_tpr)

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # ---------------- Plot every ROC curve ----------------
    plt.figure()
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # One solid curve per class, cycling through a fixed palette.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()

    输出

    参考资料:

    https://blog.csdn.net/hfutdog/article/details/88079934

    https://www.jianshu.com/p/90106243d231

  • 相关阅读:
    MyBatis 最常见错误,启动时控制台无限输出日志
    mybatis.generator.configurationFile
    MBG 相关资源链接
    MyBatis Generator 详解 专题
    随笔分类
    android 播放assets文件里视频文件的问题
    AssetManager中的路径参数不能包含"assets/"
    Android---intent传递putStringArrayListExtra
    android中使用setVideoURI()播放视频
    vitamio官方demo源码分析
  • 原文地址:https://www.cnblogs.com/yanshw/p/12691329.html
Copyright © 2011-2022 走看看