    [Model Evaluation] Implementations of commonly used functions

    Preface: The main task of machine learning is to train a model from data and then use the trained model to perform tasks such as classification, clustering, and recommendation.

    For a given problem, several different models can be built, and there are likewise several evaluation metrics for assessing them.

    This article collects the most commonly used evaluation metrics for quick reference. (The iris dataset and a logistic regression model are used throughout.)

    1. Common evaluation metrics and loss functions:

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import sklearn

    from sklearn import datasets

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import train_test_split

    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import confusion_matrix

    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import r2_score


    # 1. Load the data
    iris = datasets.load_iris()
    x, y = iris.data, iris.target

    # 2. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    # 3. Train a logistic regression classifier
    lr_clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear', multi_class='ovr')
    lr_clf.fit(x_train, y_train)

    # 4. Predict on the test set (classification)
    y_predict = lr_clf.predict(x_test)
    y_probs = lr_clf.predict_proba(x_test)  # predicted class probabilities

    # 5. Evaluation metrics
    # # Cross-validation
    # cross_score = cross_val_score(lr_clf, x_train, y_train, cv=3, scoring="accuracy")
    # print(cross_score)

    # 5.1 Accuracy
    accuracy = accuracy_score(y_test, y_predict)
    print(accuracy)
    print()

    # 5.2 Precision
    precision = precision_score(y_test, y_predict, average='macro')
    print(precision)
    print()

    # 5.3 Recall
    recall = recall_score(y_test, y_predict, average='weighted')
    print(recall)
    print()

    # 5.4 F1 score
    f1 = f1_score(y_test, y_predict, average='micro')
    print(f1)
    print()

    # 5.5 Confusion matrix
    conf_mat = confusion_matrix(y_test, y_predict)  # renamed so the imported function is not shadowed
    print(conf_mat)
    print()

    # The regression metrics below are applied to the integer class labels purely for demonstration.

    # 5.6 Mean squared error (MSE)
    mse = mean_squared_error(y_test, y_predict)
    print(mse)

    # 5.7 Mean squared log error (MSLE)
    msle = mean_squared_log_error(y_test, y_predict)
    print(msle)

    # 5.8 Mean absolute error (MAE)
    mae = mean_absolute_error(y_test, y_predict)
    print(mae)

    # 5.9 R^2 (coefficient of determination)
    r2 = r2_score(y_test, y_predict)
    print(r2)
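
    Accuracy can also be read straight off the confusion matrix: the diagonal counts the correctly classified samples. A minimal sketch, assuming the y_test and y_predict arrays from the block above, that reproduces accuracy_score from the matrix:

    import numpy as np
    from sklearn.metrics import accuracy_score, confusion_matrix

    def accuracy_from_confusion_matrix(y_true, y_pred):
        # Diagonal entries are the correct predictions per class;
        # dividing their sum by the total sample count gives the accuracy.
        cm = confusion_matrix(y_true, y_pred)
        return np.trace(cm) / cm.sum()

    # Should agree with accuracy_score(y_test, y_predict) computed above:
    # print(accuracy_from_confusion_matrix(y_test, y_predict))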

    2. ROC curve and P-R curve: used for binary classification problems

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import sklearn

    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import cross_val_predict
    from sklearn import datasets
    from sklearn.metrics import auc
    from sklearn.metrics import roc_curve
    from sklearn.metrics import precision_recall_curve

    # 1. Load the data
    iris = datasets.load_iris()
    x, y = iris.data, iris.target

    # 2. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    # Turn the task into binary classification: class 0 vs. the rest
    y_train = (y_train == 0)
    y_test = (y_test == 0)

    # 3. Train a logistic regression classifier
    lr_clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear', multi_class='ovr')
    lr_clf.fit(x_train, y_train)

    # 4. Predict on the test set (classification)
    y_predict = lr_clf.predict(x_test)
    y_probs = lr_clf.predict_proba(x_test)  # predicted class probabilities

    cross_score = cross_val_score(lr_clf, x_train, y_train, cv=3, scoring="accuracy")
    y_scores = cross_val_predict(lr_clf, x_train, y_train, cv=3, method="decision_function")


    # 5. Precision and recall at different thresholds
    precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        plt.plot(thresholds, precisions[:-1], "r-", label="Precision", linewidth=1)
        plt.plot(thresholds, recalls[:-1], "b-", label="Recall", linewidth=1)
        plt.xlabel("Threshold", fontsize=12)
        plt.legend(loc="upper left", fontsize=12)
        plt.ylim([0, 1])

    plt.figure(figsize=(8, 6))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.xlim([0, 6])
    plt.show()


    # 6. P-R curve
    def plot_precision_vs_recall(precisions, recalls):
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("Recall", fontsize=16)
        plt.ylabel("Precision", fontsize=16)
        plt.axis([0, 1, 0, 1])

    plt.figure(figsize=(8, 6))
    plot_precision_vs_recall(precisions, recalls)
    plt.show()


    # 7. ROC curve
    fpr, tpr, thresholds = roc_curve(y_train, y_scores)

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, 'r--', linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate', fontsize=16)
        plt.ylabel('True Positive Rate', fontsize=16)

    plt.figure(figsize=(8, 6))
    plot_roc_curve(fpr, tpr)
    plt.show()
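
    The auc helper is imported above but never called. A minimal sketch, assuming the fpr, tpr, y_train and y_scores arrays from the block above, of how the area under the ROC curve can be computed:

    from sklearn.metrics import auc, roc_auc_score

    # Area under the ROC curve from the fpr/tpr points returned by roc_curve
    roc_auc = auc(fpr, tpr)
    print(roc_auc)

    # Equivalent one-step computation directly from labels and scores
    # print(roc_auc_score(y_train, y_scores))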

    3. Common distance calculations: cosine distance = 1 - cosine similarity, Euclidean distance

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import sklearn

    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances
    from sklearn.metrics.pairwise import euclidean_distances

    # 1. Cosine similarity
    a = [[1, 3, 2], [2, 2, 1]]
    dist = cosine_similarity(a)
    print(dist)
    print()


    # 2. Cosine distance = 1 - cosine similarity
    dist = pairwise_distances(a, metric="cosine")
    print(dist)
    print()

    # 3. Euclidean distance
    dist = euclidean_distances(a)
    print(dist)
    print()
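
    As a sanity check on the relation cosine distance = 1 - cosine similarity, a small NumPy sketch using the same two vectors as above, computing the similarity from its definition:

    import numpy as np

    u = np.array([1, 3, 2])
    v = np.array([2, 2, 1])

    # Cosine similarity: dot product divided by the product of the norms
    cos_sim = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    # Cosine distance is one minus the similarity
    cos_dist = 1 - cos_sim

    # Compare with the off-diagonal entries printed by the block above
    print(cos_sim, cos_dist)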

    4. Dataset splitting: holdout, cross-validation, bootstrap

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import sklearn

    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from sklearn.model_selection import KFold
    from sklearn.model_selection import LeaveOneOut
    from sklearn.model_selection import ShuffleSplit

    from sklearn.model_selection import StratifiedKFold

    iris = datasets.load_iris()
    x, y = iris.data, iris.target

    # 1. Holdout: split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


    # 2. Cross-validation: split into training and validation sets

    ## 2.1 K-fold
    # kf = KFold(n_splits=2)
    # for train_idx, valid_idx in kf.split(x_train):
    #     train_data = x_train[train_idx]
    #     valid_data = x_train[valid_idx]
    #     print("%s %s" % (train_data, valid_data))
    # print()

    ## 2.2 Leave-one-out
    # loo = LeaveOneOut()
    # for train_idx, valid_idx in loo.split(x_train):
    #     train_data = x_train[train_idx]
    #     valid_data = x_train[valid_idx]
    #     print("%s %s" % (train_data, valid_data))
    # print()

    ## 2.3 Shuffle split
    # ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    # for train_idx, valid_idx in ss.split(x_train):
    #     train_data = x_train[train_idx]
    #     valid_data = x_train[valid_idx]
    #     print("%s %s" % (train_data, valid_data))
    # print()

    ## 2.4 Stratified K-fold
    # skf = StratifiedKFold(n_splits=3)
    # for train_idx, valid_idx in skf.split(x_train, y_train):
    #     train_data = x_train[train_idx]
    #     valid_data = x_train[valid_idx]
    #     print("%s %s" % (train_data, valid_data))
    # print()


    # 3. Bootstrap (sampling with replacement): split into training and test sets
    df_x = pd.DataFrame(x)
    df_y = pd.DataFrame(y)
    data = pd.concat([df_x, df_y], axis=1)
    train = data.sample(frac=1.0, replace=True)
    test = data.loc[data.index.difference(train.index)].copy()
    # train contains m samples, but with duplicates
    print(len(train))

    # test contains roughly m * 0.368 samples, all distinct and none of them present in train
    print(len(test))
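
    The 0.368 figure in the last comment comes from the bootstrap argument: with m draws with replacement, the probability that a given sample is never selected is (1 - 1/m)^m, which approaches 1/e as m grows. A quick numerical check, a sketch separate from the original code:

    import numpy as np

    m = 150  # number of samples in the iris dataset
    p_never_drawn = (1 - 1 / m) ** m
    print(p_never_drawn, np.exp(-1))  # both are roughly 0.368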

    5. Hyperparameter tuning: grid search, random search

    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import sklearn
    from scipy.stats import randint
    from sklearn import datasets
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV

    iris = datasets.load_iris()
    x, y = iris.data, iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    knn = KNeighborsClassifier(n_neighbors=5)

    # This example uses a KNN classifier and searches the hyperparameter n_neighbors
    # in the range 1~30, reporting the best parameter and its score.

    ## 1. Grid search
    # k_range = range(10, 15)
    # param_grid = dict(n_neighbors=k_range)
    # #param_grid = [{'n_neighbors':k_range},{'algorithm':['ball_tree','kd_tree'],'leaf_size':range(29,31),'n_neighbors':k_range}]
    # grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='accuracy')
    # grid.fit(x, y)


    ## 2. Randomized search
    param_grid = {'n_neighbors': randint(low=1, high=31)}
    grid = RandomizedSearchCV(estimator=knn, param_distributions=param_grid, cv=10, scoring='accuracy')
    grid.fit(x, y)


    # Best parameters found
    print(grid.best_params_)

    # Best cross-validated score
    print(grid.best_score_)

    # Best estimator, which can be trained directly
    best_knn = grid.best_estimator_
    best_knn.fit(x_train, y_train)

    pred_train = best_knn.predict(x_train)
    pred_test = best_knn.predict(x_test)

    train_acc = accuracy_score(y_train, pred_train)
    test_acc = accuracy_score(y_test, pred_test)

    # Training-set and test-set accuracy
    print(train_acc, test_acc)
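
    Note that grid.fit(x, y) above runs the search over the full dataset, so test_acc is not a fully held-out estimate. A minimal variant, a sketch assuming the same knn, x_train and y_train objects as above, that keeps the test set out of the search:

    from scipy.stats import randint
    from sklearn.model_selection import RandomizedSearchCV

    # Search only on the training split; n_iter controls how many candidates are sampled
    search = RandomizedSearchCV(estimator=knn,
                                param_distributions={'n_neighbors': randint(low=1, high=31)},
                                n_iter=10, cv=10, scoring='accuracy', random_state=0)
    search.fit(x_train, y_train)
    print(search.best_params_, search.best_score_)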