  • Machine Learning Basics: A Credit Card Fraud Detection Case Study

    In the previous posts we covered each part separately; here is the complete code in one piece.

    1. Count the two class labels and plot a bar chart

    2. Split features from labels, split train/test sets (train_test_split), and undersample the training data

    3. Use cross-validation (KFold) to choose the regularization hyperparameter

    4. Plot the confusion matrix and explain accuracy, recall, and F1 score

    5. How the probability threshold of logistic regression affects recall and accuracy

    6. Oversample the data

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    
    # Load the data
    data = pd.read_csv('creditcard.csv')

    # Standardize the raw Amount column into a new feature
    data['Normal_Amount'] = StandardScaler().fit_transform(np.array(data['Amount']).reshape(-1, 1))
    print(data.head())

    # Drop Time and the raw Amount column, keeping the standardized Normal_Amount
    data = data.drop(['Time', 'Amount'], axis=1)
    
    X = data.loc[:, data.columns != 'Class']
    y = data.loc[:, data.columns == 'Class']
    # 1. Count the class labels and plot a bar chart
    count_class = pd.value_counts(data.Class, sort=True).sort_index()
    count_class.plot(kind='bar')
    plt.show()
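    # The two classes are heavily imbalanced; printing the ratio makes that
    # concrete (a small addition, not in the original post):
    print('fraud ratio: {:.4%}'.format(count_class[1] / count_class.sum()))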
    
    # 2. Undersample the data

    # Indices of the fraud (positive) samples
    negative_len = len(data[data.Class == 1])
    negative_index = data[data.Class == 1].index

    # Indices of the normal samples
    normal_len = len(data[data.Class == 0])
    normal_index = data[data.Class == 0].index
    # Randomly draw as many normal samples as there are fraud samples, without replacement
    under_normal_index = np.random.choice(normal_index, negative_len, replace=False)
    # Combine the two index sets
    under_index = np.concatenate([negative_index, under_normal_index])

    under_data = data.loc[under_index, :]
    under_x = under_data.loc[:, under_data.columns != 'Class']
    under_y = under_data.loc[:, under_data.columns == 'Class']
    
    
    
    # Split the full dataset
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

    # Split the undersampled dataset
    under_train_x, under_test_x, under_train_y, under_test_y = train_test_split(under_x, under_y, test_size=0.3, random_state=0)
    from sklearn.model_selection import KFold
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import recall_score
    
    # 3. Use cross-validation to choose the regularization parameter
    def printing_KFold_score(train_x, train_y):
        """
        Cross-validate the training data over candidate C values
        :param train_x: training features
        :param train_y: training labels
        :return: the best C parameter
        """
        # Split the training data into 5 folds
        fold = KFold(n_splits=5, shuffle=False)
        # Candidate regularization parameters
        c_parameter = [0.01, 0.1, 1, 10, 100]
        # DataFrame to store each parameter and its mean recall score
        train_score = pd.DataFrame(index=range(len(c_parameter)), columns=['c_parameter', 'recall_mean'])
        train_score['c_parameter'] = c_parameter
        for idx, c in enumerate(c_parameter):
            scores = []
            for iteration, (train_idx, val_idx) in enumerate(fold.split(train_x), start=1):
                # penalty='l1' requires the liblinear solver in recent sklearn versions
                lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')
                lr.fit(train_x.iloc[train_idx, :], train_y.iloc[train_idx, :].values.ravel())
                pred_y = lr.predict(train_x.iloc[val_idx, :])
                # Compute recall on the held-out fold
                score = recall_score(train_y.iloc[val_idx, :], pred_y)
                print('{} {}'.format(iteration, score))
                scores.append(score)
            # Store the mean recall for this C in its own row
            train_score.loc[idx, 'recall_mean'] = np.mean(scores)
        print(train_score)
        # idxmax() gives the row index of the best mean recall; look up its C
        best_parameter = train_score.loc[train_score['recall_mean'].astype(float).idxmax(), 'c_parameter']
        print('the best_parameter is {}'.format(best_parameter))

        return best_parameter
    
    
    best_c = printing_KFold_score(under_train_x, under_train_y)
    
    import itertools
    # 4. Plot the confusion matrix
    def plot_matrix(conf, classes,
                    title='confusion matrix', cmap=plt.cm.Blues):
        """
        :param conf: confusion matrix to plot
        :param classes: class labels of the matrix
        :param title: figure title
        :param cmap: color map for the figure
        :return:
        """
        # Render the matrix as an image
        plt.imshow(conf, cmap=cmap)
        # Figure title
        plt.title(title)
        # Color bar
        plt.colorbar()
        # Tick positions for the x and y axes
        x_index = np.array(classes)
        # First argument: tick positions; second: tick labels
        plt.xticks(x_index, classes, rotation=0)
        plt.yticks(x_index, classes)
        conf_mean = conf.max() / 2
        # itertools.product over [0, 1] x [0, 1] yields
        # (0, 0), (0, 1), (1, 0), (1, 1)
        # Write each count into its cell of the matrix
        for i, j in itertools.product(range(conf.shape[0]), range(conf.shape[1])):
            plt.text(j, i, conf[i, j], horizontalalignment='center',
                     color='white' if conf[i, j] > conf_mean else 'black')
        # Tighten the layout
        plt.tight_layout()
    
    from sklearn.metrics import confusion_matrix
    # Build the logistic regression model
    lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
    # Train the model
    lr.fit(under_train_x, under_train_y.values.ravel())
    # Predict
    pred_y = lr.predict(under_test_x)
    # Build the confusion matrix
    conf = confusion_matrix(under_test_y, pred_y)
    # Plot it
    plot_matrix(conf, classes=[0, 1])
    # Accuracy: (TP + TN) / total
    accuracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
    # Recall: TP / (TP + FN)
    recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
    # Precision: TP / (TP + FP)
    precision = conf[1, 1] / (conf[0, 1] + conf[1, 1])
    # F1 score: the harmonic mean of precision and recall
    F1_score = 2 * precision * recall / (precision + recall)
    plt.show()
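    # Cross-check (a sketch, not in the original post): sklearn.metrics can
    # compute the same numbers directly from the labels and predictions.
    from sklearn.metrics import precision_score, f1_score
    print('precision: {:.4f}'.format(precision_score(under_test_y, pred_y)))
    print('recall   : {:.4f}'.format(recall_score(under_test_y, pred_y)))
    print('F1 score : {:.4f}'.format(f1_score(under_test_y, pred_y)))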
    #
    #
    # # Evaluate the same model on the full test set
    #
    # pred_y = lr.predict(test_x)
    # # Build the confusion matrix
    # conf = confusion_matrix(test_y, pred_y)
    # # Plot it
    # plot_matrix(conf, classes=[0, 1])
    # plt.show()
    #
    # 5. Test how different probability thresholds affect accuracy and recall
    lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
    lr.fit(under_train_x, under_train_y.values.ravel())

    pred_array = np.array(lr.predict_proba(under_test_x))

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    j = 1
    for threshold in thresholds:
        pred_y_new = np.zeros([len(under_test_x), 1])
        pred_y_new[pred_array[:, 1] > threshold] = 1
        # Build the confusion matrix for this threshold
        plt.subplot(3, 3, j)
        conf = confusion_matrix(under_test_y, pred_y_new)
        # Plot it
        plot_matrix(conf, classes=[0, 1], title='threshold is {}'.format(threshold))
        accuracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
        # Recall
        recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
        j = j + 1
    plt.show()
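    # Alternative sketch (an addition, not in the original post): sklearn's
    # precision_recall_curve scans every threshold at once, avoiding the loop above.
    from sklearn.metrics import precision_recall_curve
    precisions, recalls, thres = precision_recall_curve(under_test_y, pred_array[:, 1])
    plt.plot(thres, precisions[:-1], label='precision')
    plt.plot(thres, recalls[:-1], label='recall')
    plt.xlabel('threshold')
    plt.legend()
    plt.show()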
    #
    #
    # 6. Oversample the data with SMOTE
    from imblearn.over_sampling import SMOTE
    from sklearn.model_selection import train_test_split

    X = data.loc[:, data.columns != 'Class']
    y = data.loc[:, data.columns == 'Class']

    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)
    oversampler = SMOTE(random_state=0)

    # fit_resample replaced fit_sample in recent versions of imblearn
    SMOTE_train_x, SMOTE_train_y = oversampler.fit_resample(train_x, train_y.values.ravel())
    # Count the 0/1 labels after oversampling
    print(pd.value_counts(SMOTE_train_y, sort=True).sort_index())
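
    A natural follow-up, sketched here as an addition to the original post: fit the same logistic regression on the SMOTE-resampled training set, then evaluate on the untouched test split so that recall is measured on real, imbalanced data.

    # Sketch: train on the oversampled data, evaluate on the original test split
    lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
    lr.fit(SMOTE_train_x, SMOTE_train_y)
    pred_y = lr.predict(test_x)
    conf = confusion_matrix(test_y, pred_y)
    plot_matrix(conf, classes=[0, 1])
    plt.show()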
  • Original post: https://www.cnblogs.com/my-love-is-python/p/10271318.html