  • 「二分类算法」提供银行精准营销解决方案 (Binary Classification for Precision Bank Marketing), code archive

    import mglearn
    from numpy import int64
    from sklearn import metrics
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.model_selection import GridSearchCV, KFold
    from sklearn.model_selection import cross_val_score
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neural_network import MLPClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression, LogisticRegressionCV
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVR, SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    import seaborn as sns
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.decomposition import PCA, TruncatedSVD
    from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
    from sklearn.metrics import roc_auc_score
    import warnings
    warnings.filterwarnings('ignore')
    sns.set(style="darkgrid")
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
    
    # Field descriptions
    #
    # NO    Field       Type      Description
    # 1     ID          Int       Unique customer identifier
    # 2     age         Int       Customer age
    # 3     job         String    Customer occupation
    # 4     marital     String    Marital status
    # 5     education   String    Education level
    # 6     default     String    Whether the customer has a default record
    # 7     balance     Int       Average yearly account balance
    # 8     housing     String    Whether the customer has a housing loan
    # 9     loan        String    Whether the customer has a personal loan
    # 10    contact     String    Channel used to contact the customer
    # 11    day         Int       Day of month of the last contact
    # 12    month       String    Month of the last contact
    # 13    duration    Int       Duration of the last contact
    # 14    campaign    Int       Number of contacts with this customer during this campaign
    # 15    pdays       Int       Days since the customer was last contacted in a previous campaign (999 means never contacted)
    # 16    previous    Int       Number of contacts with this customer before this campaign
    # 17    poutcome    String    Outcome of the previous marketing campaign
    # 18    y           Int       Target: whether the customer will subscribe to a term deposit
    
    data_train = pd.read_csv('train_set.csv')
    data_test = pd.read_csv('test_set.csv')
    ids_test = data_test['ID']
    
    print(data_train.shape[0])
    
    # data_train['cppv']=data_train['campaign']+data_train['previous']
    # data_test['cppv']=data_test['campaign']+data_test['previous']
    # data_train.drop(['campaign','previous'], axis=1, inplace=True)
    # data_test.drop(['campaign','previous'], axis=1, inplace=True)
    
    # Rela_grouped=data_train.groupby(['cp'])
    # Rela_Survival_Rate=(Rela_grouped.sum()/Rela_grouped.count())['y']
    # Rela_count=Rela_grouped.count()['y']
    #
    # ax1=Rela_count.plot(kind='bar',color='g')
    # ax2=ax1.twinx()
    # ax2.plot(Rela_Survival_Rate.values,color='r')
    # ax1.set_xlabel('Relatives')
    # ax1.set_ylabel('Number')
    # ax2.set_ylabel('Survival Rate')
    # plt.title('Survival Rate by Relatives')
    # plt.grid(True,linestyle='-',color='0.7')
    # plt.show()
    
    # g = sns.FacetGrid(data_train, col='y')
    # g.map(plt.hist, 'day', bins=30)
    # plt.show()
    
    
    print("数值处理1:标签指标one-hot编码处理")
    
    
    data_train.drop(['ID'], axis=1, inplace=True)
    data_test.drop(['ID'], axis=1, inplace=True)
    
    dummy = pd.get_dummies(data_train[['month','job','marital','education','default','housing','loan','contact','poutcome']])
    dummyTest = pd.get_dummies(data_test[['month','job','marital','education','default','housing','loan','contact','poutcome']])
    data_train = pd.concat([dummy, data_train], axis=1)
    data_train.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
    data_test = pd.concat([dummyTest, data_test], axis=1)
    data_test.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
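    # Added note: pd.get_dummies is applied to the train and test files separately,
    # so a category value present in only one file would silently produce mismatched
    # columns. A cheap guard (a sketch, not part of the original run):
    missing_in_test = set(data_train.columns) - set(data_test.columns) - {'y'}
    missing_in_train = set(data_test.columns) - set(data_train.columns)
    assert not missing_in_test and not missing_in_train, \
        (missing_in_test, missing_in_train)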
    
    data_train['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
    data_train['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
    data_train['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
    data_train['day'].replace([23,25,26,10,29,19],1,inplace=True)
    data_train['day'].replace([1,24,31],0,inplace=True)
    
    data_test['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
    data_test['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
    data_test['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
    data_test['day'].replace([23,25,26,10,29,19],1,inplace=True)
    data_test['day'].replace([1,24,31],0,inplace=True)
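    # The chained replace() calls above can be collapsed into a single mapping;
    # an equivalent sketch (same bins, left commented out to keep the archive as run):
    # day_bins = {**{d: 4 for d in [30, 13, 15, 4, 14, 12, 18]},
    #             **{d: 3 for d in [5, 20, 21, 11, 8, 16, 2, 3]},
    #             **{d: 2 for d in [17, 9, 6, 27, 7, 22, 28]},
    #             **{d: 1 for d in [23, 25, 26, 10, 29, 19]},
    #             **{d: 0 for d in [1, 24, 31]}}
    # data_train['day'] = data_train['day'].map(day_bins)
    # data_test['day'] = data_test['day'].map(day_bins)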
    
    
    # data_train['month1'] = data_train.month.apply(lambda x: 4 if x in ['may'] else 0)
    # data_train['month1'] = data_train.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
    # data_train['month1'] = data_train.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
    # data_train['month1'] = data_train.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
    #
    # data_test['month1'] = data_test.month.apply(lambda x: 4 if x in ['may'] else 0)
    # data_test['month1'] = data_test.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
    # data_test['month1'] = data_test.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
    # data_test['month1'] = data_test.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
    # #
    data_train.drop(['month'], inplace=True, axis=1)
    data_test.drop(['month'], inplace=True, axis=1)
    # data_train.drop(['day','job_management','marital_single'], axis=1, inplace=True)
    # data_test.drop(['day','job_management','marital_single'], axis=1, inplace=True)
    
    
    # data_train['month'].replace(['may'],4,inplace=True)
    # data_train['month'].replace(['aug','jul','apr'],3,inplace=True)
    # data_train['month'].replace(['jun','feb','nov','oct'],2,inplace=True)
    # data_train['month'].replace(['sep','mar'],1,inplace=True)
    # data_train['month'].replace(['jan','dec'],0,inplace=True)
    
    # Experiment: drop more features
    # data_train.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
    # data_test.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
    
    # default, housing and loan are binary, so one dummy of each pair is redundant and can be dropped
    # data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
    # data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
    
    
    ################################
    ####### Data preparation #######
    ################################
    
    data_train['pdays'].replace(-1,9999,inplace=True)
    data_test['pdays'].replace(-1,9999,inplace=True)
    print("数值处理2:pdays将-1替换为999")
    # data_train.drop(['pdays'], inplace=True, axis=1)
    # data_test.drop(['pdays'], inplace=True, axis=1)
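    # An alternative worth trying (a sketch, not part of the original run): keep
    # pdays numeric but add an explicit "was contacted before" flag, so the model
    # need not infer the sentinel's meaning from its magnitude.
    # for df in (data_train, data_test):
    #     df['pdays_contacted'] = (df['pdays'] != 9999).astype(int)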
    
    
    # g = sns.FacetGrid(data_train, col='y')
    # g.map(plt.hist, 'pdays', bins=20)
    # plt.show()
    # data_train.drop(['pdays'], inplace=True, axis=1)
    # data_test.drop(['pdays'], inplace=True, axis=1)
    
    y = data_train['y']
    X = data_train[data_train.columns[: -1]]
    # # X.info()
    # The mean of pdays was seen earlier to be about 45; -1 sits close to that mean
    # but far from the max of 854, so every -1 should be mapped to a large sentinel
    # Preprocessing note:
    # pdays == -1 means the customer was never contacted; replaced with the sentinel above
    
    
    
    # Rescale the higher-variance numeric features with MinMaxScaler or StandardScaler
    print("Preprocessing step 3: scale the numeric features")
    scaler = MinMaxScaler()
    # numerical = ['age','balance', 'duration', 'pdays', 'previous']
    # X[numerical] = scaler.fit_transform(X[numerical])
    # data_test[numerical] = scaler.fit_transform(data_test[numerical])
    print(data_test.shape)
    X = scaler.fit_transform(X)
    data_test = scaler.fit_transform(data_test)
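    # Added note: fit_transform is called separately on train and test, so each file
    # gets its own min/max and identical raw values can scale differently. A
    # leakage-free sketch would reuse the statistics fitted on the training data:
    # scaler = MinMaxScaler().fit(X)
    # X = scaler.transform(X)
    # data_test = scaler.transform(data_test)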
    
    # tsvd = TruncatedSVD(n_components=46)
    # data_test = tsvd.fit_transform(data_test)
    # Split off a small holdout set for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=1)
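    # Added note: y is imbalanced (~11.7% positives, per describe() later in this
    # post), so with only a 6% holdout a stratified split keeps the class ratio
    # stable; a sketch:
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=0.06, random_state=1, stratify=y)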
    # X_train = tsvd.fit_transform(X_train)
    # X_test = tsvd.fit_transform(X_test)
    # print(X_train.shape)
    
    # Add degree-2 polynomial features
    # polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)
    # # Degree-2 interaction terms only
    # polynomial_interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
    # X_train = polynomial_interaction.fit_transform(X_train)
    # X_test = polynomial_interaction.fit_transform(X_test)
    # data_test = polynomial_interaction.fit_transform(data_test)
    # print('after Polynomial:',X_train.shape)
    #
    # # # Principal component analysis, aiming to keep 99% of the information
    # pca = PCA(n_components=100,whiten=True)
    # X_train = pca.fit_transform(X_train)
    # X_test = pca.fit_transform(X_test)
    # data_test = pca.fit_transform(data_test)
    # print('after PCA:',X_train.shape)
    
    # # Univariate feature selection (f_classif here; chi2 would be the chi-square variant)
    # selector = SelectKBest(f_classif,k=300)
    # X_train = selector.fit_transform(X_train,y_train)
    # X_test = selector.fit_transform(X_test,y_test)
    # print('after SelectKBest:',X_train.shape)
    
    # print(X_train['pdays'])
    
    ################################
    ####### Model evaluation #######
    ################################
    
    
    # print('Decision tree: unsatisfactory score')
    # clf = DecisionTreeClassifier(random_state=11)
    # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    #
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    #
    print('Random forest, 0.919203')
    clf = RandomForestClassifier(n_estimators=90, random_state=0,oob_score=True,n_jobs=-1)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test, predictions))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    print(clf.score(X_test, y_test))
    y_predprob = clf.predict_proba(X_test)
    y_predprob = y_predprob[:, 1]
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # Grid-search the random forest's best n_estimators; answer: 90
    # param_test1 ={'n_estimators':range(10,100,5)}
    # gsearch1= GridSearchCV(estimator =RandomForestClassifier(min_samples_split=100,
    #                                  min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10),
    #                        param_grid =param_test1,scoring='roc_auc',cv=5)
    # gsearch1.fit(X_train, y_train)
    # print(gsearch1.best_params_)
    # y_predprob = gsearch1.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    # predictions = gsearch1.predict(X_test)
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    #
    # print('Logistic regression, 0.904655, 0.915316')
    # # print(X_train)
    # #clf = Lasso(alpha=0.5)
    # clf = LogisticRegression(random_state=0,solver='newton-cg',class_weight='balanced',penalty='l2',n_jobs=-1)
    # # solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
    # clf.fit(X_train, y_train)
    # # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # # print(classification_report(y_test, predictions))
    # # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    #
    # relation = pd.DataFrame({"columns":list(data_train.columns)[0:-1], "coef":list(clf.coef_.T)})
    # print('coefficients:',relation)
    
    # # Grid-search the logistic regression's best hyperparameters; answer:
    # # best C : LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
    # #                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
    # #                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
    # #                    random_state=None, solver='warn', tol=0.0001, verbose=0,
    # #                    warm_start=False)
    # penalty = ['l1','l2']
    # C=np.logspace(0,4,10)
    # hyperparameters = dict(C=C,penalty=penalty)
    # gridsearch = GridSearchCV(clf,hyperparameters,cv=5,verbose=0)
    # best_clf= gridsearch.fit(X_train, y_train)
    # print('best C :',best_clf.best_estimator_)
    # print(gridsearch.best_params_)
    # y_predprob = gridsearch.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    # predictions = gridsearch.predict(X_test)
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # print('AdaBoost')
    # clf = AdaBoostClassifier(n_estimators=60, random_state=90)
    #
    # clf.fit(X_train, y_train)
    # predictionsByadaBoost = clf.predict(X_test)
    # print(classification_report(y_test, predictionsByadaBoost))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    # pred = clf.predict_proba(X_test)
    # dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
    # dataPred.drop('pred0', axis=1, inplace=True)
    # print(dataPred)
    #
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # predictions_train =  clf.predict(X_train)
    # y_predprob_train = clf.predict_proba(X_train)
    # y_predprob_train = y_predprob_train[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    # print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
    # print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
    # #
    #
    #
    # # #
    # print('Neural network')
    # # ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
    # # ‘sgd’ refers to stochastic gradient descent.
    # # ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
    # clf = MLPClassifier(solver='adam', hidden_layer_sizes=(80,80),
    #                     random_state=1)
    # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(clf.score(X_test, y_test))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    # print('Neural network end')
    # # # Export the results
    ID = list(range(25318,36170))
    submission = pd.DataFrame(ID)
    submission.rename(columns = {0: 'ID'}, inplace = True)
    # convert pred_y from an array into a DataFrame
    y_predprob_test = clf.predict_proba(data_test)
    y_predprob_test = y_predprob_test[:, 1]
    y_predprob_DataFrame = pd.DataFrame(y_predprob_test)
    submission['pred'] =y_predprob_DataFrame
    submission.to_csv('Result.csv', index = False)
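    # Added sanity check: the hard-coded ID range must line up with the test file
    # read at the top (ids_test); a mismatch here would silently misalign the scores.
    assert len(submission) == len(ids_test)
    assert submission['pred'].between(0, 1).all()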
    
    # To curb overfitting, halve the learning rate and double the number of iterations
    # gbm1 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=10000, max_depth=7, min_samples_leaf=70,
    #                                   min_samples_split=1300, subsample=0.8, random_state=10)
    # gbm1.fit(X_train, y_train)
    #
    # y_pred = gbm1.predict(X_test)
    # y_predprob = gbm1.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    # print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # print('KNN: unsatisfactory score')
    # clf = KNeighborsClassifier(n_neighbors=5)
    # clf.fit(X_train,y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # print('SVM')
    # clf = SVC(kernel='rbf',C=1,gamma='auto',probability=True).fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # Naive Bayes
    # print('Naive Bayes')
    # clf = GaussianNB()
    #
    # clf_sigmoid = CalibratedClassifierCV(clf,cv=5)
    # clf_sigmoid.fit(X_train,y_train)
    # predictions = clf_sigmoid.predict(X_test)
    # y_predprob = clf_sigmoid.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    ################################
    # AdaBoost was the algorithm chosen for the first submission
    ################################
    # print('AdaBoost')
    # adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
    # adaBoost.fit(X_train, y_train)
    #
    # age_null = pd.isnull(data_test['age'])
    # data_null = data_test[age_null == True]
    # # print(data_null)
    #
    # id = data_test["ID"]
    # print(id)
    # X_test.drop(['ID'], axis=1, inplace=True)
    #
    # submission = pd.DataFrame({
    #         "ID": id
    #     })
    #
    # submission[['ID']].astype(int)
    # # submission[['ID']] = submission[['ID']].astype(int)
    # submission.to_csv('submission.csv', index=False)
    
    # data_test.dropna(inplace=True)
    # print(np.isnan(data_test).any())
    # submission.replace(np.nan, 0, inplace=True)
    
    
    # predictionsByadaBoost = adaBoost.predict_proba(X_test)
    #
    # submission = pd.DataFrame({
    #         "ID": id,
    #         "pred": predictionsByadaBoost
    #     })
    # submission.to_csv('submission.csv', index=False)

    First submission. With hardly any feature engineering yet, the score was still underwhelming:

    0.9157894736842105
    Accuracy : 0.9158
    AUC Score (Test): 0.932477

    Process analysis

    from numpy import int64
    from sklearn import metrics
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import cross_val_score
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.datasets import make_classification
    from sklearn.metrics import classification_report
    import seaborn as sns
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest,chi2,f_classif
    from sklearn.metrics import roc_auc_score
    
    data_train = pd.read_csv('/home/kesci/input/firstdata1587/train_set.csv')
    data_test = pd.read_csv('/home/kesci/input/firstdata1587/test_set.csv')
    data_train.describe()
    
    Out[4]:
                     ID           age        balance           day      duration      campaign         pdays      previous             y
    count  25317.000000  25317.000000   25317.000000  25317.000000  25317.000000  25317.000000  25317.000000  25317.000000  25317.000000
    mean   12659.000000     40.935379    1357.555082     15.835289    257.732393      2.772050     40.248766      0.591737      0.116957
    std     7308.532719     10.634289    2999.822811      8.319480    256.975151      3.136097    100.213541      2.568313      0.321375
    min        1.000000     18.000000   -8019.000000      1.000000      0.000000      1.000000     -1.000000      0.000000      0.000000
    25%     6330.000000     33.000000      73.000000      8.000000    103.000000      1.000000     -1.000000      0.000000      0.000000
    50%    12659.000000     39.000000     448.000000     16.000000    181.000000      2.000000     -1.000000      0.000000      0.000000
    75%    18988.000000     48.000000    1435.000000     21.000000    317.000000      3.000000     -1.000000      0.000000      0.000000
    max    25317.000000     95.000000  102127.000000     31.000000   3881.000000     55.000000    854.000000    275.000000      1.000000

    25,317 records in total. Age ranges 18-95. balance (savings) ranges -8019 to 102127 with a standard deviation of 2999.82, which is large: the mean balance is 1357 and the 75th percentile is 1435, while the 25th percentile is only 73, so the wealth gap is substantial. day (day of month of the last contact) runs 1-31; a month obviously starts on the 1st and ends on the 31st, so this feature may well be unrelated to the target. duration (length of contact) runs 0-3881; my guess is that this is an elapsed time. campaign (contacts during this campaign) runs 1-55. pdays (time since the previous campaign's last contact) runs -1 to 854; there is no 999 here, so -1 must mean "never contacted" and anything above -1 is the number of days since that contact. previous (contacts before this campaign) runs 0-275, with a mean of 0.59, i.e. less than one contact on average.
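    The quartiles above also show pdays equal to -1 at the 25th, 50th and 75th percentiles, so the "never contacted" sentinel dominates the column. A quick check of just how dominant (an added sketch):

    In [ ]:
    # Fraction of customers never contacted in a previous campaign
    print((data_train['pdays'] == -1).mean())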

    In [5]:
    # Job vs. buying the term deposit
    y_0 = data_train.job[data_train.y == 0].value_counts()
    y_1 = data_train.job[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"job to buy")
    plt.ylabel(u"counts")
    plt.show()
    
     
    In [14]:
    # Marital status vs. buying the term deposit
    # No obvious pattern here
    y_0 = data_train.marital[data_train.y == 0].value_counts()
    y_1 = data_train.marital[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"marital to buy")
    plt.ylabel(u"counts")
    plt.show()
    
     
    In [15]:
    # Education vs. buying the term deposit
    y_0 = data_train.education[data_train.y == 0].value_counts()
    y_1 = data_train.education[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"education to buy")
    plt.ylabel(u"counts")
    plt.show()
    
     
    In [24]:
    # Previous campaign outcome vs. buying the term deposit
    # poutcome turns out to matter a lot: customers whose previous campaign succeeded buy again at a very high rate
    y_0 = data_train.poutcome[data_train.y == 0].value_counts()
    y_1 = data_train.poutcome[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"poutcome to buy")
    plt.ylabel(u"counts")
    plt.show()
    
     
     

    day and month merely record when the customer was contacted, so they are easily dismissed as noise features. Let the statistics decide.

    In [3]:
    # Effect of month on the target
    y_0 = data_train.month[data_train.y == 0].value_counts()
    y_1 = data_train.month[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"poutcome to buy")
    plt.ylabel(u"counts")
    plt.show()
    print(y_1/data_train.shape[0])
    # may (0.019789) and dec (0.001896) differ by a factor of 10, so this feature does matter (note these are shares of all records, not per-month conversion rates)
    
     
     
    may    0.019789
    aug    0.014773
    jul    0.014022
    apr    0.012916
    jun    0.011613
    feb    0.009954
    nov    0.009045
    oct    0.007465
    sep    0.006241
    mar    0.005727
    jan    0.003515
    dec    0.001896
    Name: month, dtype: float64
    
    In [4]:
    # Effect of day of month on the target
    y_0 = data_train.day[data_train.y == 0].value_counts()
    y_1 = data_train.day[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"poutcome to buy")
    plt.ylabel(u"counts")
    plt.show()
    print(y_1/data_train.shape[0])
    # The 30th closes the most deals; the 31st by far the fewest
    
     
     
    30    0.005964
    13    0.005253
    15    0.005135
    4     0.005016
    14    0.004977
    12    0.004898
    18    0.004898
    5     0.004661
    20    0.004661
    21    0.004621
    11    0.004582
    8     0.004463
    16    0.004345
    2     0.004345
    3     0.004266
    17    0.003950
    9     0.003910
    6     0.003792
    27    0.003792
    7     0.003476
    22    0.003436
    28    0.003160
    23    0.002923
    25    0.002646
    26    0.002528
    10    0.002528
    29    0.002409
    19    0.002370
    1     0.001777
    24    0.001303
    31    0.000869
    Name: day, dtype: float64
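    The shares printed above divide buyer counts by all 25,317 rows, so they mix a month's (or day's) call volume with its conversion. Separating the two via the per-group subscription rate is straightforward (an added sketch):

    In [ ]:
    # Mean of y within each month / day = conversion rate for that group
    print(data_train.groupby('month')['y'].mean().sort_values(ascending=False))
    print(data_train.groupby('day')['y'].mean().sort_values(ascending=False))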
    
    In [7]:
    # 'job','marital','education','default','housing','loan','contact','poutcome': these 8 fields
    # all get one-hot encoding; for now, keep unknown as a category of its own.
    
    dummy = pd.get_dummies(data_train[['day','month','job','marital','education','default','housing','loan','contact','poutcome']])
    dummyTest = pd.get_dummies(data_test[['day','month','job','marital','education','default','housing','loan','contact','poutcome']])
    data_train = pd.concat([dummy, data_train], axis=1)
    data_train.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
    data_test = pd.concat([dummyTest, data_test], axis=1)
    data_test.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
    print("数值处理1:标签指标one-hot编码处理")
    # default, housing and loan are binary, so one dummy of each pair is redundant and can be dropped
    #data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
    #data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
    
    data_train['pdays'].replace(-1,999,inplace=True)
    data_test['pdays'].replace(-1,999,inplace=True)
    print("数值处理2:pdays将-1替换为999")
    
     
    Preprocessing step 1: one-hot encode the categorical features
    Preprocessing step 2: replace -1 in pdays with 999
    
    In [20]:
    data_train.head()
    
    Out[20]:
       job_admin.  job_blue-collar  job_entrepreneur  job_housemaid  job_management  job_retired  job_self-employed  job_services  job_student  job_technician  ...  poutcome_other  poutcome_success  poutcome_unknown  age  balance  duration  campaign  pdays  previous  y
    0           0                0                 0              0               1            0                  0             0            0               0  ...               0                 0                 1   43      291       150         2     -1         0  0
    1           0                0                 0              0               0            0                  0             0            0               1  ...               1                 0                 0   42     5076        99         1    251         2  0
    2           1                0                 0              0               0            0                  0             0            0               0  ...               0                 0                 1   47      104        77         2     -1         0  0
    3           0                0                 0              0               1            0                  0             0            0               0  ...               0                 0                 1   28     -994       174         2     -1         0  0
    4           0                0                 0              0               0            0                  0             0            0               1  ...               0                 0                 1   42     2974       187         5     -1         0  0

    5 rows × 39 columns

    In [6]:
    # Inspect a single feature against the target
    #print('no default:',data_train[data_train['default_yes']==0].count())
    #print('has default:',data_train[data_train['default_yes']==1].count())
    print(data_train['default_yes'].value_counts())
    print(data_test['default_yes'].value_counts())
    #data_train.groupby(["default_yes"], as_index=False)['y'].count()
    
     
    0    24869
    1      448
    Name: default_yes, dtype: int64
    0    24869
    1      448
    Name: default_yes, dtype: int64
    
    Out[6]:
                       job_admin.  job_blue-collar  job_entrepreneur  job_housemaid  job_management  job_retired  job_self-employed  job_services  job_student  job_technician  ...  poutcome_other  poutcome_success  poutcome_unknown  age  balance  duration  campaign  pdays  previous  y
    job_admin. 1.000000 -0.188846 -0.067402 -0.059086 -0.185311 -0.082905 -0.068534 -0.115037 -0.052838 -0.161626 ... 0.013577 0.004200 -0.018840 -0.063839 -0.029366 -0.017629 -0.018559 0.021803 0.009821 0.000298
    job_blue-collar -0.188846 1.000000 -0.098047 -0.085951 -0.269568 -0.120600 -0.099695 -0.167341 -0.076863 -0.235113 ... -0.003148 -0.056453 0.025315 -0.044350 -0.056248 0.010505 0.009946 0.016488 -0.019208 -0.075065
    job_entrepreneur -0.067402 -0.098047 1.000000 -0.030677 -0.096212 -0.043044 -0.035583 -0.059726 -0.027433 -0.083915 ... -0.018659 -0.014969 0.013491 0.023331 0.010288 0.003927 -0.001803 -0.014705 -0.007958 -0.022519
    job_housemaid -0.059086 -0.085951 -0.030677 1.000000 -0.084342 -0.037733 -0.031193 -0.052357 -0.024049 -0.073562 ... -0.018467 -0.009511 0.029735 0.084754 0.008013 -0.001337 0.002692 -0.032321 -0.013129 -0.015041
    job_management -0.185311 -0.269568 -0.096212 -0.084342 1.000000 -0.118343 -0.097829 -0.164209 -0.075424 -0.230713 ... 0.008288 0.025737 -0.019421 -0.027075 0.078719 -0.010090 0.016234 -0.003619 0.025946 0.035234
    job_retired -0.082905 -0.120600 -0.043044 -0.037733 -0.118343 1.000000 -0.043767 -0.073464 -0.033743 -0.103217 ... -0.001619 0.054668 -0.024616 0.451285 0.046370 0.026569 -0.031805 -0.003046 0.007511 0.083868
    job_self-employed -0.068534 -0.099695 -0.035583 -0.031193 -0.097829 -0.043767 1.000000 -0.060730 -0.027894 -0.085325 ... -0.002526 0.004632 0.000565 -0.009973 0.000782 0.002657 -0.003602 -0.007433 -0.004029 0.001078
    job_services -0.115037 -0.167341 -0.059726 -0.052357 -0.164209 -0.073464 -0.060730 1.000000 -0.046821 -0.143221 ... 0.001367 -0.020796 0.005367 -0.060838 -0.036640 0.000364 -0.001615 0.011358 -0.006309 -0.026688
    job_student -0.052838 -0.076863 -0.027433 -0.024049 -0.075424 -0.033743 -0.027894 -0.046821 1.000000 -0.065784 ... 0.030733 0.049948 -0.045026 -0.195720 0.000799 -0.005165 -0.021539 0.024643 0.014206 0.069058
    job_technician -0.161626 -0.235113 -0.083915 -0.073562 -0.230713 -0.103217 -0.085325 -0.143221 -0.065784 1.000000 ... -0.001704 -0.004072 0.011010 -0.063478 -0.015668 -0.011605 0.023601 -0.015579 -0.004059 -0.004942
    job_unemployed -0.060802 -0.088448 -0.031568 -0.027673 -0.086792 -0.038829 -0.032099 -0.053879 -0.024747 -0.075699 ... -0.012716 0.016013 0.009008 0.005462 0.013252 0.023554 -0.021663 -0.013660 -0.008230 0.023980
    job_unknown -0.029004 -0.042192 -0.015059 -0.013201 -0.041402 -0.018523 -0.015312 -0.025701 -0.011805 -0.036110 ... -0.016910 0.007256 0.011327 0.045026 0.015479 -0.003483 0.012938 -0.014763 -0.006241 0.001438
    marital_divorced 0.027961 -0.062361 0.003040 0.016786 0.002196 0.053472 -0.017381 0.026199 -0.048590 0.007188 ... -0.001968 -0.002870 0.001999 0.165888 -0.028356 0.012815 -0.019830 0.003130 -0.004718 0.002723
    marital_married -0.056102 0.125532 0.044894 0.045362 -0.033545 0.073654 0.002060 -0.019572 -0.161869 -0.058949 ... -0.028606 -0.022959 0.028377 0.284516 0.026577 -0.022557 0.039452 -0.027329 -0.006380 -0.054746
    marital_single 0.041159 -0.092241 -0.050951 -0.061204 0.034904 -0.117958 0.010081 0.002703 0.210381 0.058978 ... 0.032488 0.026989 -0.032260 -0.426833 -0.008788 0.015434 -0.028825 0.027486 0.010278 0.057574
    education_primary -0.110105 0.348314 -0.011630 0.164128 -0.175814 0.119077 -0.040373 -0.058845 -0.042160 -0.161923 ... -0.004174 -0.033214 0.032773 0.194451 -0.026575 -0.000034 0.012495 -0.011621 -0.012038 -0.043154
    education_secondary 0.220828 0.037604 -0.051630 -0.062505 -0.405359 -0.037429 -0.053990 0.200833 0.007825 0.155845 ... 0.004079 -0.028471 0.002800 -0.093500 -0.074607 0.000568 -0.022185 0.017952 -0.011050 -0.038460
    education_tertiary -0.146154 -0.320429 0.061969 -0.055380 0.601275 -0.062459 0.095847 -0.170206 -0.024021 -0.036790 ... 0.003128 0.050667 -0.030504 -0.083080 0.094686 -0.001067 0.011818 -0.006720 0.024955 0.066901
    education_unknown -0.021208 0.010760 0.008699 -0.012186 -0.041017 0.022015 -0.010919 -0.008502 0.110442 -0.014967 ... -0.009791 0.015287 0.003656 0.073640 0.018380 0.001066 0.006071 -0.008665 -0.007600 0.021087
    default_yes -0.005145 0.012717 0.029592 -0.007002 -0.008630 -0.008948 0.008743 -0.002526 -0.017596 -0.004049 ... -0.010326 -0.021432 0.038027 -0.019272 -0.068299 -0.011327 0.019978 -0.029440 -0.015293 -0.024608
    housing_yes 0.043369 0.176937 0.017130 -0.074215 -0.063260 -0.159975 -0.023608 0.065284 -0.085328 -0.016506 ... 0.032566 -0.096285 -0.060478 -0.187364 -0.068780 0.002778 -0.024708 0.121740 0.032667 -0.143589
    loan_yes 0.032612 0.012896 0.040955 -0.012334 -0.032051 -0.016304 -0.006878 0.036603 -0.058082 0.009240 ... -0.011531 -0.053573 0.035315 -0.016286 -0.085854 -0.011356 0.020537 -0.024458 -0.006240 -0.065231
    contact_cellular -0.002431 -0.128760 -0.003751 -0.018765 0.101878 -0.010661 0.012462 -0.029756 0.027596 0.055623 ... 0.107764 0.104342 -0.263887 -0.072573 0.015821 0.018666 -0.027461 0.225438 0.122062 0.134791
    contact_telephone -0.012570 -0.002537 -0.012075 0.044074 -0.031565 0.105808 0.001363 -0.015583 0.026084 -0.037147 ... 0.025071 0.009642 -0.026306 0.174284 0.042785 -0.015570 0.056106 0.017672 0.021314 0.020747
    contact_unknown 0.009411 0.137290 0.010535 -0.004194 -0.090346 -0.046364 -0.013896 0.039893 -0.043332 -0.038483 ... -0.127399 -0.115385 0.292862 -0.018304 -0.039998 -0.011223 -0.001567 -0.247577 -0.140445 -0.153572
    poutcome_failure 0.012266 0.002967 0.003890 -0.019621 0.004027 0.000278 -0.001732 0.004389 0.007463 -0.010275 ... -0.073107 -0.064271 -0.734653 -0.006166 0.012700 -0.019398 -0.089085 0.704495 0.313898 0.011927
    poutcome_other 0.013577 -0.003148 -0.018659 -0.018467 0.008288 -0.001619 -0.002526 0.001367 0.030733 -0.001704 ... 1.000000 -0.038796 -0.443453 -0.021450 0.008611 -0.002584 -0.021604 0.384397 0.295747 0.038399
    poutcome_success 0.004200 -0.056453 -0.014969 -0.009511 0.025737 0.054668 0.004632 -0.020796 0.049948 -0.004072 ... -0.038796 1.000000 -0.389856 0.039246 0.031758 0.045017 -0.058443 0.223025 0.174036 0.305806
    poutcome_unknown -0.018840 0.025315 0.013491 0.029735 -0.019421 -0.024616 0.000565 0.005367 -0.045026 0.011010 ... -0.443453 -0.389856 1.000000 -0.002015 -0.029327 -0.003872 0.109688 -0.868084 -0.485981 -0.170697
    age -0.063839 -0.044350 0.023331 0.084754 -0.027075 0.451285 -0.009973 -0.060838 -0.195720 -0.063478 ... -0.021450 0.039246 -0.002015 1.000000 0.093740 0.000416 0.006171 -0.026431 0.006575 0.029916
    balance -0.029366 -0.056248 0.010288 0.008013 0.078719 0.046370 0.000782 -0.036640 0.000799 -0.015668 ... 0.008611 0.031758 -0.029327 0.093740 1.000000 0.026042 -0.010419 0.001032 0.015792 0.057564
    duration -0.017629 0.010505 0.003927 -0.001337 -0.010090 0.026569 0.002657 0.000364 -0.005165 -0.011605 ... -0.002584 0.045017 -0.003872 0.000416 0.026042 1.000000 -0.087780 0.000040 0.001315 0.394746
    campaign -0.018559 0.009946 -0.001803 0.002692 0.016234 -0.031805 -0.003602 -0.001615 -0.021539 0.023601 ... -0.021604 -0.058443 0.109688 0.006171 -0.010419 -0.087780 1.000000 -0.089224 -0.031667 -0.075173
    pdays 0.021803 0.016488 -0.014705 -0.032321 -0.003619 -0.003046 -0.007433 0.011358 0.024643 -0.015579 ... 0.384397 0.223025 -0.868084 -0.026431 0.001032 0.000040 -0.089224 1.000000 0.411688 0.107565
    previous 0.009821 -0.019208 -0.007958 -0.013129 0.025946 0.007511 -0.004029 -0.006309 0.014206 -0.004059 ... 0.295747 0.174036 -0.485981 0.006575 0.015792 0.001315 -0.031667 0.411688 1.000000 0.088337
    y 0.000298 -0.075065 -0.022519 -0.015041 0.035234 0.083868 0.001078 -0.026688 0.069058 -0.004942 ... 0.038399 0.305806 -0.170697 0.029916 0.057564 0.394746 -0.075173 0.107565 0.088337 1.000000

    36 rows × 36 columns
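    The matrix is easier to digest if we pull out the columns most correlated with y; from the values above, duration (0.39) and poutcome_success (0.31) lead. An added sketch:

    In [ ]:
    # Rank features by absolute correlation with the target
    corr_y = data_train.corr()['y'].drop('y')
    print(corr_y.reindex(corr_y.abs().sort_values(ascending=False).index).head(10))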

    In [8]:
    # Default record vs. subscribing
    fig = plt.figure()
    fig.set(alpha=0.2)  # set the chart's alpha
    y_0 = data_train.default_yes[data_train.y == 0].value_counts()
    y_1 = data_train.default_yes[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"buy or not")
    plt.xlabel(u"default")
    plt.ylabel(u"counts")
    plt.show()
    
     
    <Figure size 432x288 with 0 Axes>
    In [9]:
    # Housing loan vs. subscribing
    # Customers without a housing loan buy at a slightly higher rate, though not dramatically; mortgage payers may be short on cash
    fig = plt.figure()
    fig.set(alpha=0.2)  # set the chart's alpha
    y_0 = data_train.housing_yes[data_train.y == 0].value_counts()
    y_1 = data_train.housing_yes[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"buy or not")
    plt.xlabel(u"housing")
    plt.ylabel(u"counts")
    plt.show()
    # Customers without a default record buy at a slightly higher rate
    
     
    <Figure size 432x288 with 0 Axes>
     
    In [19]:
    # Personal loan vs. subscribing
    # Little difference between the two groups at first glance
    fig = plt.figure()
    fig.set(alpha=0.2)  # set the chart's alpha
    y_0 = data_train.loan_yes[data_train.y == 0].value_counts()
    y_1 = data_train.loan_yes[data_train.y == 1].value_counts()
    df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})
    df.plot(kind='bar', stacked=True)
    plt.title(u"buy or not")
    plt.xlabel(u"loan")
    plt.ylabel(u"counts")
    plt.show()
    data_train[["loan_yes", "y"]].groupby(['loan_yes'], as_index=False).mean().sort_values(by='y', ascending=False)
    # 12.6% of customers without a personal loan bought, versus only 6.9% of those with one,
    # so customers without a personal loan are the better prospects
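    The same rate comparison works for the other yes/no flags (an added sketch):

    In [ ]:
    # Subscription rate with and without each flag
    for col in ['default_yes', 'housing_yes']:
        print(data_train.groupby(col)['y'].mean())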
    
     
    <Figure size 432x288 with 0 Axes>
     
    Out[19]:
       loan_yes         y
    0         0  0.126117
    1         1  0.068983
    In [7]:
    # Histogram: which age bands buy and which don't
    g = sns.FacetGrid(data_train, col='y')
    g.map(plt.hist, 'age', bins=20)
    plt.show()
    # Hard to read much from this; buyers' ages are spread out, while non-buyers cluster around 30-40
    
     
    In [8]:
    # Histogram of "days since the previous campaign's last contact" for buyers vs. non-buyers
    # The shorter the gap, the higher the purchase rate, so pdays is quite an important feature
    g = sns.FacetGrid(data_train, col='y')
    g.map(plt.hist, 'pdays', bins=20)
    plt.show()
    # pdays is still hard to interpret; revisit it later
    
     
    In [9]:
    y = data_train['y']
    X = data_train[data_train.columns[: -1]]
    X.info()
    
     
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 25317 entries, 0 to 25316
    Data columns (total 51 columns):
    month_apr              25317 non-null uint8
    month_aug              25317 non-null uint8
    month_dec              25317 non-null uint8
    month_feb              25317 non-null uint8
    month_jan              25317 non-null uint8
    month_jul              25317 non-null uint8
    month_jun              25317 non-null uint8
    month_mar              25317 non-null uint8
    month_may              25317 non-null uint8
    month_nov              25317 non-null uint8
    month_oct              25317 non-null uint8
    month_sep              25317 non-null uint8
    job_admin.             25317 non-null uint8
    job_blue-collar        25317 non-null uint8
    job_entrepreneur       25317 non-null uint8
    job_housemaid          25317 non-null uint8
    job_management         25317 non-null uint8
    job_retired            25317 non-null uint8
    job_self-employed      25317 non-null uint8
    job_services           25317 non-null uint8
    job_student            25317 non-null uint8
    job_technician         25317 non-null uint8
    job_unemployed         25317 non-null uint8
    job_unknown            25317 non-null uint8
    marital_divorced       25317 non-null uint8
    marital_married        25317 non-null uint8
    marital_single         25317 non-null uint8
    education_primary      25317 non-null uint8
    education_secondary    25317 non-null uint8
    education_tertiary     25317 non-null uint8
    education_unknown      25317 non-null uint8
    default_no             25317 non-null uint8
    default_yes            25317 non-null uint8
    housing_no             25317 non-null uint8
    housing_yes            25317 non-null uint8
    loan_no                25317 non-null uint8
    loan_yes               25317 non-null uint8
    contact_cellular       25317 non-null uint8
    contact_telephone      25317 non-null uint8
    contact_unknown        25317 non-null uint8
    poutcome_failure       25317 non-null uint8
    poutcome_other         25317 non-null uint8
    poutcome_success       25317 non-null uint8
    poutcome_unknown       25317 non-null uint8
    ID                     25317 non-null int64
    age                    25317 non-null int64
    balance                25317 non-null int64
    duration               25317 non-null int64
    campaign               25317 non-null int64
    pdays                  25317 non-null int64
    previous               25317 non-null int64
    dtypes: int64(7), uint8(44)
    memory usage: 2.4 MB
    
    In [ ]:
    # Correlation matrix, with y included as a column
    #data_train.corr()
    # Heatmap of the correlation matrix
    #colormap = plt.cm.RdBu
    #plt.figure(figsize=(39,37))
    #plt.title('Correlation of Features', y=1.05, size=37)
    #sns.heatmap(data_train.astype(float).corr(),linewidths=0.1,vmax=1.0,
    #            square=True, cmap=colormap, linecolor='white', annot=True)
    #plt.show()
    
    In [11]:
    print("数值处理3:数值指标Scaler变换")
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    data_test = scaler.fit_transform(data_test)
    # Split off a holdout set for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=90)
    
     
    Preprocessing step 3: scale the numeric features
    
    In [12]:
    # print('Decision tree')
    # clf = DecisionTreeClassifier(random_state=11)
    # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    #
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    #
    # print('Random forest')
    # clf = RandomForestClassifier(n_estimators=10, random_state=11)
    # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    
    # print('Logistic regression')
    # clf = LogisticRegression()
    # clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    # print(classification_report(y_test, predictions))
    # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
    # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
    # print(clf.score(X_test, y_test))
    # y_predprob = clf.predict_proba(X_test)
    # y_predprob = y_predprob[:, 1]
    #
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    print('AdaBoost')
    adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
    adaBoost.fit(X_train, y_train)
    predictionsByadaBoost = adaBoost.predict(X_test)
    print(classification_report(y_test, predictionsByadaBoost))
    print(cross_val_score(adaBoost,X_train, y_train,scoring='f1'))
    print(cross_val_score(adaBoost,X_test, y_test,scoring='f1'))
    print(adaBoost.score(X_test, y_test))
    pred = adaBoost.predict_proba(X_test)
    dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
    dataPred.drop('pred0', axis=1, inplace=True)
    print(dataPred)
    
    y_predprob = adaBoost.predict_proba(X_test)
    y_predprob = y_predprob[:, 1]
    
    predictions_train =  adaBoost.predict(X_train)
    y_predprob_train = adaBoost.predict_proba(X_train)
    y_predprob_train = y_predprob_train[:, 1]
    
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
    
    
    ID = list(range(25318,36170))
    submission = pd.DataFrame(ID)
    
    submission.rename(columns = {0: 'ID'}, inplace = True)
    
    # convert pred_y from an array into a DataFrame
    y_predprob_test = adaBoost.predict_proba(data_test)
    y_predprob_test = y_predprob_test[:, 1]
    
    y_predprob_DataFrame = pd.DataFrame(y_predprob_test)
    submission['pred'] =y_predprob_DataFrame
    
    submission.to_csv('Result.csv', index = False)
    
     
    AdaBoost
                  precision    recall  f1-score   support
    
               0       1.00      1.00      1.00      2249
               1       1.00      1.00      1.00       283
    
        accuracy                           1.00      2532
       macro avg       1.00      1.00      1.00      2532
    weighted avg       1.00      1.00      1.00      2532
    
    [1.         1.         0.99943915]
    [1. 1. 1.]
    1.0
                  pred
    0     2.220446e-16
    1     1.000000e+00
    2     2.220446e-16
    3     2.220446e-16
    4     2.220446e-16
    5     2.220446e-16
    6     2.220446e-16
    7     2.220446e-16
    8     2.220446e-16
    9     2.220446e-16
    10    2.220446e-16
    11    2.220446e-16
    12    2.220446e-16
    13    2.220446e-16
    14    2.220446e-16
    15    2.220446e-16
    16    2.220446e-16
    17    2.220446e-16
    18    2.220446e-16
    19    2.220446e-16
    20    2.220446e-16
    21    2.220446e-16
    22    2.220446e-16
    23    2.220446e-16
    24    2.220446e-16
    25    2.220446e-16
    26    2.220446e-16
    27    2.220446e-16
    28    2.220446e-16
    29    2.220446e-16
    ...            ...
    2502  2.220446e-16
    2503  2.220446e-16
    2504  2.220446e-16
    2505  2.220446e-16
    2506  2.220446e-16
    2507  2.220446e-16
    2508  2.220446e-16
    2509  2.220446e-16
    2510  2.220446e-16
    2511  2.220446e-16
    2512  2.220446e-16
    2513  2.220446e-16
    2514  2.220446e-16
    2515  2.220446e-16
    2516  2.220446e-16
    2517  2.220446e-16
    2518  2.220446e-16
    2519  2.220446e-16
    2520  1.000000e+00
    2521  2.220446e-16
    2522  2.220446e-16
    2523  2.220446e-16
    2524  2.220446e-16
    2525  2.220446e-16
    2526  2.220446e-16
    2527  2.220446e-16
    2528  2.220446e-16
    2529  2.220446e-16
    2530  1.000000e+00
    2531  1.000000e+00
    
    [2532 rows x 1 columns]
    Accuracy : 1
    AUC Score (Test): 1.000000
    Accuracy y_train : 1
    AUC Score (Train): 1.000000
    
     
    /opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
      warnings.warn(CV_WARNING, FutureWarning)
    /opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
      warnings.warn(CV_WARNING, FutureWarning)
    
     
  • Original post: https://www.cnblogs.com/starcrm/p/11806712.html