zoukankan      html  css  js  c++  java
  • 3.3.py 迁移学习

    本章节主要说的迁移学习的事情,迁移学习简单来说:举个例子,新产品上线,建模使用其他产品和少量新产品的数据建模,不码字,具体百度吧。

    全部代码:

    # -*- coding: utf-8 -*-
    """
    Created on Tue Dec 24 15:25:58 2019
    
    @author: zixing.mei
    """
    
    import pandas as pd  
    from sklearn.metrics import roc_auc_score,roc_curve,auc  
    from sklearn.model_selection import train_test_split  
    from sklearn import metrics  
    from sklearn.linear_model import LogisticRegression  
    from sklearn.svm import LinearSVC  
    import numpy as np  
    import random  
    import math  
    from sklearn.calibration import CalibratedClassifierCV   #概率矫正方法
    data = pd.read_excel('D:/陈桂梅/学习资料/64353 智能风控_源码及数据_2010/智能风控(数据集)/tra_sample.xlsx')  
    data.head()  
    feature_lst = ['zx_score','msg_cnt','phone_num_cnt','register_days']    
    train = data[data.type == 'target'].reset_index().copy()    #目标域
    diff = data[data.type == 'origin'].reset_index().copy()    #源域
    val = data[data.type == 'offtime'].reset_index().copy()   #oot 
      
    ''' 
    TrainS 目标域样本   
    TrainA 源域样本   
    LabelS 目标域标签   
    LabelA 源域标签   
    '''  
      
    train = train.loc[:1200]   #取前面1200行 
        
    trans_S = train[feature_lst].copy()    
    label_S = train['bad_ind'].copy()    
        
    trans_A = diff[feature_lst].copy()    
    label_A = diff['bad_ind'].copy()    
        
    val_x =  val[feature_lst].copy()    
    val_y = val['bad_ind'].copy()    
        
    test = val_x.copy()   
    
    
    #%% 使用目标域数据建模 ,ks相差10%,roc曲线不平滑,说明模型泛化能力差
    lr_model = LogisticRegression(C=0.1,class_weight = 'balanced',solver = 'liblinear')  
    lr_model.fit(trans_S,label_S)  
      
    y_pred = lr_model.predict_proba(trans_S)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.48500238435860754
      
    y_pred = lr_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  #val_ks :  0.3887057754389137
    
    
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()  
    trans_data = np.concatenate((trans_A, trans_S), axis=0)  
    trans_label = np.concatenate((label_A, label_S), axis=0)  
    
    
    #%%使用源域数据建模 ,ks不稳定,roc不稳定,泛化能力非常差
    lr_model = LogisticRegression(C=0.3,class_weight = 'balanced',solver = 'liblinear')  
    lr_model.fit(trans_A,label_A)  
      
    y_pred = lr_model.predict_proba(trans_data)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(trans_label,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.4910909493184976
      
    y_pred = lr_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  #val_ks :  0.33077621830414
    
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()
    
       
    #%%将源域和目标域的数据整合在一起   
    import numpy as np      
    import pandas as pd    
    from sklearn.linear_model import LogisticRegression     
    from sklearn.metrics import roc_curve  
       
    def Tr_lr_boost(trans_A,trans_S,label_A,label_S,test,label_test,
                      N=500,early_stopping_rounds =100):    
        """   
            逻辑回归的学习率、权重的大小,影响整体收敛的快慢    
            H 测试样本分类结果    
            TrainS 目标域样本    
            TrainA 源域样本    
            LabelS 目标域标签    
            LabelA 源域标签    
            Test  测试样本    
            N 迭代次数   
            early_stopping_rounds 提前停止轮次 
        """   
        #计算weight      
        def calculate_P(weights, label):      
            total = np.sum(weights)      
            return np.asarray(weights / total, order='C')      
              
        #用逻辑回归作为基分类器,输出概率      
        def train_classify(trans_data, trans_label, test_data, P):      
            clf = LogisticRegression(C=0.3,class_weight = 'balanced',solver='liblinear')      
            clf.fit(trans_data, trans_label, sample_weight=P[:, 0])      
            return clf.predict_proba(test_data)[:,1],clf      
              
        #计算在目标域上面的错误率      
        def calculate_error_rate(label_R, label_H, weight):      
            total = np.sum(weight)      
            return np.sum(weight[:, 0] / total * np.abs(label_R - label_H))      
              
        #根据逻辑回归输出的score的得到标签,注意这里不能用predict直接输出标签      
        def put_label(score_H,thred):      
            new_label_H = []      
            for i in score_H:      
                if i <= thred:      
                    new_label_H.append(0)      
                else:      
                    new_label_H.append(1)      
            return new_label_H      
              
        #指定迭代次数,相当于集成模型中基模型的数量      
             
              
        #拼接数据集    
        trans_data = np.concatenate((trans_A, trans_S), axis=0)      
        trans_label = np.concatenate((label_A, label_S), axis=0)      
            
        #三个数据集样本数    
        row_A = trans_A.shape[0]      
        row_S = trans_S.shape[0]      
        row_T = test.shape[0]      
            
        #三个数据集合并为打分数据集    
        test_data = np.concatenate((trans_data, test), axis=0)      
              
        # 初始化权重      
        weights_A = np.ones([row_A, 1])/row_A      
        weights_S = np.ones([row_S, 1])/row_S*2  # 目标域的权重会比源域的权重高一些   
        weights = np.concatenate((weights_A, weights_S), axis=0)      
            
        #按照公式初始化beta值    
        bata = 1 / (1 + np.sqrt(2 * np.log(row_A / N)))    #N是多少,N是迭代次数  
              
            
        # 存每一次迭代的bata值=error_rate / (1 - error_rate)      
        bata_T = np.zeros([1, N])      
        # 存储每次迭代的标签    
        result_label = np.ones([row_A + row_S + row_T, N])       
              
        trans_data = np.asarray(trans_data, order='C')      
        trans_label = np.asarray(trans_label, order='C')      
        test_data = np.asarray(test_data, order='C')      
            
        #最优KS      
        best_ks = -1      
        #最优基模型数量          
        best_round = -1    
        #最优模型      
        best_model = -1     
             
        """ 
        初始化结束    
        正式开始训练  
        """     
            
        for i in range(N):      
            P = calculate_P(weights, trans_label)      
              
            result_label[:, i],model = train_classify(trans_data, trans_label, test_data, P)  
            score_H = result_label[row_A:row_A + row_S, i]      
            pctg = np.sum(trans_label)/len(trans_label)      
            thred = pd.DataFrame(score_H).quantile(1-pctg)[0]      
            
            label_H = put_label(score_H,thred)      
            
            #计算在目标域上的错误率    
            error_rate = calculate_error_rate(label_S, label_H,   
                                                        weights[row_A:row_A + row_S, :])  
            # 防止过拟合     
            if error_rate > 0.5:      
                error_rate = 0.5      
            if error_rate == 0:      
                N = i      
                break       
                    
            bata_T[0, i] = error_rate / (1 - error_rate)      
              
            # 调整目标域样本权重      
            for j in range(row_S):      
                weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i],  
                                          (-np.abs(result_label[row_A + j, i] - label_S[j])))
              
            # 调整源域样本权重      
            for j in range(row_A):      
                weights[j] = weights[j] * np.power(bata,   
                                                   np.abs(result_label[j, i] - label_A[j]))  
            y_pred = result_label[(row_A + row_S):,i]      
            fpr_lr_train,tpr_lr_train,_ = roc_curve(label_test,y_pred)      
            train_ks = abs(fpr_lr_train - tpr_lr_train).max()      
            print('test_ks : ',train_ks,'当前第',i+1,'')      
                  
            # 不再使用后一半学习器投票,而是只保留效果最好的逻辑回归模型      
            if train_ks > best_ks :      
                best_ks = train_ks      
                best_round = i      
                best_model = model    
            # 当超过eadrly_stopping_rounds轮KS不再提升后,停止训练  
            if best_round < i - early_stopping_rounds:  
                break  
        return best_ks,best_round,best_model   
        
    # 训练并得到最优模型best_model    
    best_ks,best_round,best_model = Tr_lr_boost(trans_A,trans_S,label_A,label_S,
                                                test,label_test=val_y,N=300,
                                                early_stopping_rounds=20) 
    
    y_pred = best_model.predict_proba(trans_S)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.4629947544110634
      
    y_pred = best_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  # val_ks :  0.39846160021324123
      
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()  
    View Code

    感觉迁移学习,很多公司不采用,比较难办

    一、数据准备

    主要是划分目标域,源域,oot样本

    import pandas as pd  
    from sklearn.metrics import roc_auc_score,roc_curve,auc  
    from sklearn.model_selection import train_test_split  
    from sklearn import metrics  
    from sklearn.linear_model import LogisticRegression  
    from sklearn.svm import LinearSVC  
    import numpy as np  
    import random  
    import math  
    from sklearn.calibration import CalibratedClassifierCV   #概率矫正方法
    data = pd.read_excel('xxx/tra_sample.xlsx')  
    data.head()  
    feature_lst = ['zx_score','msg_cnt','phone_num_cnt','register_days']    
    train = data[data.type == 'target'].reset_index().copy()    #目标域
    diff = data[data.type == 'origin'].reset_index().copy()    #源域
    val = data[data.type == 'offtime'].reset_index().copy()   #oot 
      
    ''' 
    TrainS 目标域样本   
    TrainA 源域样本   
    LabelS 目标域标签   
    LabelA 源域标签   
    '''  
      
    train = train.loc[:1200]   #取前面1200行 
        
    trans_S = train[feature_lst].copy()    
    label_S = train['bad_ind'].copy()    
        
    trans_A = diff[feature_lst].copy()    
    label_A = diff['bad_ind'].copy()    
        
    val_x =  val[feature_lst].copy()    
    val_y = val['bad_ind'].copy()    
        
    test = val_x.copy()   

    二、目标域建模

    使用目标域建模,然后使用oot测试

    #%% 使用目标域数据建模 ,ks相差10%,roc曲线不平滑,说明模型泛化能力差
    lr_model = LogisticRegression(C=0.1,class_weight = 'balanced',solver = 'liblinear')  
    lr_model.fit(trans_S,label_S)  
      
    y_pred = lr_model.predict_proba(trans_S)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.48500238435860754
      
    y_pred = lr_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  #val_ks :  0.3887057754389137
    
    
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()  
    trans_data = np.concatenate((trans_A, trans_S), axis=0)  
    trans_label = np.concatenate((label_A, label_S), axis=0)  

    ks相差10%,roc曲线不平滑,说明模型泛化能力差

     三、使用源域建模

    使用源域建模,然后使用oot测试

    #%%使用源域数据建模 ,ks不稳定,roc不稳定,泛化能力非常差
    lr_model = LogisticRegression(C=0.3,class_weight = 'balanced',solver = 'liblinear')  
    lr_model.fit(trans_A,label_A)  
      
    y_pred = lr_model.predict_proba(trans_data)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(trans_label,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.4910909493184976
      
    y_pred = lr_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  #val_ks :  0.33077621830414
    
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()

    ks不稳定,roc不稳定,泛化能力非常差

     四、使用源域+目标域根据不同权重建模

    使用源域+目标域,不断调整错分的样本的权重,使用目标域、oot测试

    #%%将源域和目标域的数据整合在一起   
    import numpy as np      
    import pandas as pd    
    from sklearn.linear_model import LogisticRegression     
    from sklearn.metrics import roc_curve  
       
    def Tr_lr_boost(trans_A,trans_S,label_A,label_S,test,label_test,
                      N=500,early_stopping_rounds =100):    
        """   
            逻辑回归的学习率、权重的大小,影响整体收敛的快慢    
            H 测试样本分类结果    
            TrainS 目标域样本    
            TrainA 源域样本    
            LabelS 目标域标签    
            LabelA 源域标签    
            Test  测试样本    
            N 迭代次数   
            early_stopping_rounds 提前停止轮次 
        """   
        #计算weight      
        def calculate_P(weights, label):      
            total = np.sum(weights)      
            return np.asarray(weights / total, order='C')      
              
        #用逻辑回归作为基分类器,输出概率      
        def train_classify(trans_data, trans_label, test_data, P):      
            clf = LogisticRegression(C=0.3,class_weight = 'balanced',solver='liblinear')      
            clf.fit(trans_data, trans_label, sample_weight=P[:, 0])      
            return clf.predict_proba(test_data)[:,1],clf      
              
        #计算在目标域上面的错误率      
        def calculate_error_rate(label_R, label_H, weight):      
            total = np.sum(weight)      
            return np.sum(weight[:, 0] / total * np.abs(label_R - label_H))      
              
        #根据逻辑回归输出的score的得到标签,注意这里不能用predict直接输出标签      
        def put_label(score_H,thred):      
            new_label_H = []      
            for i in score_H:      
                if i <= thred:      
                    new_label_H.append(0)      
                else:      
                    new_label_H.append(1)      
            return new_label_H      
              
        #指定迭代次数,相当于集成模型中基模型的数量      
             
              
        #拼接数据集    
        trans_data = np.concatenate((trans_A, trans_S), axis=0)      
        trans_label = np.concatenate((label_A, label_S), axis=0)      
            
        #三个数据集样本数    
        row_A = trans_A.shape[0]      
        row_S = trans_S.shape[0]      
        row_T = test.shape[0]      
            
        #三个数据集合并为打分数据集    
        test_data = np.concatenate((trans_data, test), axis=0)      
              
        # 初始化权重      
        weights_A = np.ones([row_A, 1])/row_A      
        weights_S = np.ones([row_S, 1])/row_S*2  # 目标域的权重会比源域的权重高一些   
        weights = np.concatenate((weights_A, weights_S), axis=0)      
            
        #按照公式初始化beta值    
        bata = 1 / (1 + np.sqrt(2 * np.log(row_A / N)))    #N是多少,N是迭代次数  
              
            
        # 存每一次迭代的bata值=error_rate / (1 - error_rate)      
        bata_T = np.zeros([1, N])      
        # 存储每次迭代的标签    
        result_label = np.ones([row_A + row_S + row_T, N])       
              
        trans_data = np.asarray(trans_data, order='C')      
        trans_label = np.asarray(trans_label, order='C')      
        test_data = np.asarray(test_data, order='C')      
            
        #最优KS      
        best_ks = -1      
        #最优基模型数量          
        best_round = -1    
        #最优模型      
        best_model = -1     
             
        """ 
        初始化结束    
        正式开始训练  
        """     
            
        for i in range(N):      
            P = calculate_P(weights, trans_label)      
              
            result_label[:, i],model = train_classify(trans_data, trans_label, test_data, P)  
            score_H = result_label[row_A:row_A + row_S, i]      
            pctg = np.sum(trans_label)/len(trans_label)      
            thred = pd.DataFrame(score_H).quantile(1-pctg)[0]      
            
            label_H = put_label(score_H,thred)      
            
            #计算在目标域上的错误率    
            error_rate = calculate_error_rate(label_S, label_H,   
                                                        weights[row_A:row_A + row_S, :])  
            # 防止过拟合     
            if error_rate > 0.5:      
                error_rate = 0.5      
            if error_rate == 0:      
                N = i      
                break       
                    
            bata_T[0, i] = error_rate / (1 - error_rate)      
              
            # 调整目标域样本权重      
            for j in range(row_S):      
                weights[row_A + j] = weights[row_A + j] * np.power(bata_T[0, i],  
                                          (-np.abs(result_label[row_A + j, i] - label_S[j])))
              
            # 调整源域样本权重      
            for j in range(row_A):      
                weights[j] = weights[j] * np.power(bata,   
                                                   np.abs(result_label[j, i] - label_A[j]))  
            y_pred = result_label[(row_A + row_S):,i]      
            fpr_lr_train,tpr_lr_train,_ = roc_curve(label_test,y_pred)      
            train_ks = abs(fpr_lr_train - tpr_lr_train).max()      
            print('test_ks : ',train_ks,'当前第',i+1,'')      
                  
            # 不再使用后一半学习器投票,而是只保留效果最好的逻辑回归模型      
            if train_ks > best_ks :      
                best_ks = train_ks      
                best_round = i      
                best_model = model    
            # 当超过eadrly_stopping_rounds轮KS不再提升后,停止训练  
            if best_round < i - early_stopping_rounds:  
                break  
        return best_ks,best_round,best_model   
        
    # 训练并得到最优模型best_model    
    best_ks,best_round,best_model = Tr_lr_boost(trans_A,trans_S,label_A,label_S,
                                                test,label_test=val_y,N=300,
                                                early_stopping_rounds=20) 
    
    y_pred = best_model.predict_proba(trans_S)[:,1]  
    fpr_lr_train,tpr_lr_train,_ = roc_curve(label_S,y_pred)  
    train_ks = abs(fpr_lr_train - tpr_lr_train).max()  
    print('train_ks : ',train_ks)  #train_ks :  0.4629947544110634
      
    y_pred = best_model.predict_proba(test)[:,1]  
    fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)  
    val_ks = abs(fpr_lr - tpr_lr).max()  
    print('val_ks : ',val_ks)  # val_ks :  0.39846160021324123
      
    from matplotlib import pyplot as plt  
    plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  
    plt.plot(fpr_lr,tpr_lr,label = 'evl LR')  
    plt.plot([0,1],[0,1],'k--')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC Curve')  
    plt.legend(loc = 'best')  
    plt.show()  

     至于结果为什么和梅老师的结果不太一样,可能是数据的原因

  • 相关阅读:
    Android软件开发之SharedPreferences
    android软件开发之获取本地音乐属性
    项目开发日记
    MVC中C给V传值
    wtf忘了已经安装了Newtonsoft.json
    我的一些容易忘记的解决问题的方法
    ajax请求返回null
    验证码的一些细节
    项目做不下去
    关于HttpContex
  • 原文地址:https://www.cnblogs.com/cgmcoding/p/15357832.html
Copyright © 2011-2022 走看看