zoukankan      html  css  js  c++  java
  • sklearn机器学习-泰坦尼克号

    sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频)

    https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

     

    医药统计项目可联系
     QQ:231469242

    randomForest.py

     调参后,预测最高准确性也达到了89%

    随机森林的参数

    # -*- coding: utf-8 -*-
    """
    Created on Sat Mar 31 09:30:24 2018
    
    @author: Administrator
    随机森林不需要预处理数据
    """
    #导入数据预处理,包括标准化处理或正则处理
    from sklearn import preprocessing
    from sklearn.preprocessing import Imputer
    from sklearn import metrics
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    import pandas as pd
    #中文字体设置
    from matplotlib.font_manager import FontProperties
    # Font for Chinese labels. The published listing had lost the backslashes of
    # this Windows path, leaving a path that cannot resolve to a font file.
    font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)
    
    # Workbook holding the predictor variables.
    varibleFileName="titantic.xlsx"
    # Workbook holding the target column.
    targetFileName="target.xlsx"
    # Load the predictors and one-hot encode them.
    data=pd.read_excel(varibleFileName)
    data_dummies=pd.get_dummies(data)
    print('features after one-hot encoding:\n',list(data_dummies.columns))
    # Label-based column slice. `.loc` replaces the `.ix` indexer, which was
    # removed in pandas 1.0.
    features=data_dummies.loc[:,"Pclass":'Embarked_S']
    x=features.values
    # Column names of the model inputs, used to label the importance plot.
    names=features.columns
    
    # Fill missing values with the most frequent value of each column.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need sklearn.impute.SimpleImputer here instead.
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imp.fit(x)
    x=imp.transform(x)
    
    
    target=pd.read_excel(targetFileName)
    y=target.values
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
    
    trees=1000
    max_depth=10
    # n_estimators is the number of trees; the author found 100 to be enough.
    forest=RandomForestClassifier(n_estimators=trees,random_state=0,max_depth=max_depth)
    forest.fit(x_train,y_train)
    
    print("random forest with %d trees:"%trees)  
    print("accuracy on the training subset:{:.3f}".format(forest.score(x_train,y_train)))
    print("accuracy on the test subset:{:.3f}".format(forest.score(x_test,y_test)))
    #print('Feature importances:{}'.format(forest.feature_importances_))
    
    importance=forest.feature_importances_
    # (importance, name) pairs ordered from most to least important.
    list1=sorted(zip(importance,names),reverse=True)
    #print(list1)
    
    
    # One bar per model input: the importances have one entry per column of x,
    # which can be fewer than the columns of data_dummies after the slice above.
    n_features=x.shape[1]
    plt.barh(range(n_features),importance,align='center')
    plt.yticks(np.arange(n_features),names)
    plt.title("random forest with %d trees,max_depth=%d:"%(trees,max_depth))
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.show()
    
    
    '''
    random forest with 1000 trees:
    accuracy on the training subset:0.983
    accuracy on the test subset:0.878
    
    
    random forest with 1000 trees,max_depth=4:
    accuracy on the training subset:0.854
    accuracy on the test subset:0.884
    
    random forest with 1000 trees,max_depth=5:
    accuracy on the training subset:0.853
    accuracy on the test subset:0.887
    
    random forest with 1000 trees,max_depth=9
    accuracy on the training subset:0.871
    accuracy on the test subset:0.890
    '''
    

      

      

    去掉覆盖率低的变量后,随机森林准确性反而下降,看来随机森林不需要去计算变量覆盖率

    训练数据准确性0.983

    测试数据准确性0.878

    '''
    random forest with 1000 trees:
    accuracy on the training subset:0.983
    accuracy on the test subset:0.878
    '''

    从重要因子来看,性别第一,占据40%重要性,

    年龄重要性18%左右,

    票价重要性17%左右

     logistic.py

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 29 22:39:35 2018
    
    @author: Administrator
    """
    
    # -*- coding: utf-8 -*-
    """
    Created on Sat Mar 31 09:30:24 2018
    
    @author: Administrator
    随机森林不需要预处理数据
    """
    from sklearn.linear_model import LogisticRegression
    #导入数据预处理,包括标准化处理或正则处理
    from sklearn import preprocessing
    from sklearn.preprocessing import Imputer
    from sklearn import metrics
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    import pandas as pd
    #中文字体设置
    from matplotlib.font_manager import FontProperties
    # Font for Chinese labels. The published listing had lost the backslashes of
    # this Windows path, leaving a path that cannot resolve to a font file.
    font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)
    
    # Workbook holding the predictor variables.
    varibleFileName="titantic.xlsx"
    # Workbook holding the target column.
    targetFileName="target.xlsx"
    # Load the predictors and one-hot encode them.
    data=pd.read_excel(varibleFileName)
    data_dummies=pd.get_dummies(data)
    print('features after one-hot encoding:\n',list(data_dummies.columns))
    # Label-based column slice. `.loc` replaces the `.ix` indexer, which was
    # removed in pandas 1.0.
    features=data_dummies.loc[:,"Pclass":'Embarked_S']
    x=features.values
    
    # Fill missing values with the most frequent value of each column.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need sklearn.impute.SimpleImputer here instead.
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imp.fit(x)
    x=imp.transform(x)
    
    
    target=pd.read_excel(targetFileName)
    y=target.values
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
    names=features.columns
    
    
    # Logistic regression with scikit-learn's default settings.
    logistic=LogisticRegression()
    logistic.fit(x_train,y_train)
    
    print("logistic:")  
    print("accuracy on the training subset:{:.3f}".format(logistic.score(x_train,y_train)))
    print("accuracy on the test subset:{:.3f}".format(logistic.score(x_test,y_test)))
    
    
    '''
    logistic:
    accuracy on the training subset:0.850
    accuracy on the test subset:0.875
    '''
    

      

     目前效果最好的是去掉低覆盖率的变量后,SVM准确率最高0.89

    # -*- coding: utf-8 -*-
    """
    Created on Sat Mar 31 09:30:24 2018
    
    @author: Administrator
    随机森林不需要预处理数据
    """
    from sklearn.svm import SVC
    #导入数据预处理,包括标准化处理或正则处理
    from sklearn import preprocessing
    from sklearn.preprocessing import Imputer
    from sklearn import metrics
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    import pandas as pd
    #中文字体设置
    from matplotlib.font_manager import FontProperties
    # Font for Chinese labels. The published listing had lost the backslashes of
    # this Windows path, leaving a path that cannot resolve to a font file.
    font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)
    
    # Workbook holding the predictor variables.
    varibleFileName="titantic.xlsx"
    # Workbook holding the target column.
    targetFileName="target.xlsx"
    # Load the predictors and one-hot encode them.
    data=pd.read_excel(varibleFileName)
    data_dummies=pd.get_dummies(data)
    print('features after one-hot encoding:\n',list(data_dummies.columns))
    # Label-based column slice. `.loc` replaces the `.ix` indexer, which was
    # removed in pandas 1.0.
    features=data_dummies.loc[:,"Pclass":'Embarked_S']
    x=features.values
    
    # Fill missing values with the most frequent value of each column.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need sklearn.impute.SimpleImputer here instead.
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imp.fit(x)
    x=imp.transform(x)
    
    
    target=pd.read_excel(targetFileName)
    y=target.values
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
    names=features.columns
    
    # Baseline: default SVC on the unscaled features.
    svm=SVC()
    svm.fit(x_train,y_train)
    print("svc:")  
    print("accuracy on the training subset:{:.3f}".format(svm.score(x_train,y_train)))
    print("accuracy on the test subset:{:.3f}".format(svm.score(x_test,y_test)))
    
    
    # Output recorded by the original author:
    #   svc:
    #   accuracy on the training subset:0.900
    #   accuracy on the test subset:0.726
    
    # Standardise the features. The scaler is fitted on the training split only
    # and then applied to both splits, so the test data is transformed with the
    # same mean/variance the model was trained on. (Scaling each split
    # independently gives the two sets different transformations.)
    scaler=preprocessing.StandardScaler().fit(x_train)
    X_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    # Default SVC on the standardised features.
    svm1=SVC()
    svm1.fit(X_train_scaled,y_train)
    print("accuracy on the scaled training subset:{:.3f}".format(svm1.score(X_train_scaled,y_train)))
    print("accuracy on the scaled test subset:{:.3f}".format(svm1.score(x_test_scaled,y_test)))
    
    # Output recorded by the original author, when each split was still scaled
    # independently (figures may differ slightly with the shared scaler):
    #   accuracy on the scaled training subset:0.866
    #   accuracy on the scaled test subset:0.881
    
    # Tune C; `kernel` selects the kernel function and `probability` controls
    # whether class probabilities are computed.
    svm2=SVC(C=10,gamma="auto",kernel='rbf',probability=True)
    svm2.fit(X_train_scaled,y_train)
    print("after c parameter=10,accuracy on the scaled training subset:{:.3f}".format(svm2.score(X_train_scaled,y_train)))
    print("after c parameter=10,accuracy on the scaled test subset:{:.3f}".format(svm2.score(x_test_scaled,y_test)))
    
    '''
    after c parameter=10,accuracy on the scaled training subset:0.878
    after c parameter=10,accuracy on the scaled test subset:0.890
    '''
    

    xgboost1.py 

    效果也相当好

    AUC: 0.9464
    ACC: 0.8841
    Recall: 0.8716
    F1-score: 0.8716
    Precesion: 0.8716
    # -*- coding: utf-8 -*-
    """
    Created on Sat Mar 31 09:30:24 2018
    
    @author: Administrator
    随机森林不需要预处理数据
    """
    import xgboost as xgb
    #导入数据预处理,包括标准化处理或正则处理
    from sklearn import preprocessing
    from sklearn.preprocessing import Imputer
    from sklearn import metrics
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    import pandas as pd
    #中文字体设置
    from matplotlib.font_manager import FontProperties
    # Font for Chinese labels. The published listing had lost the backslashes of
    # this Windows path, leaving a path that cannot resolve to a font file.
    font=FontProperties(fname=r"c:\windows\fonts\simsun.ttc",size=14)
    
    # Workbook holding the predictor variables.
    varibleFileName="titantic.xlsx"
    # Workbook holding the target column.
    targetFileName="target.xlsx"
    # Load the predictors and one-hot encode them.
    data=pd.read_excel(varibleFileName)
    data_dummies=pd.get_dummies(data)
    print('features after one-hot encoding:\n',list(data_dummies.columns))
    # Label-based column slice. `.loc` replaces the `.ix` indexer, which was
    # removed in pandas 1.0.
    features=data_dummies.loc[:,"Pclass":'Embarked_S']
    x=features.values
    
    # Fill missing values with the most frequent value of each column.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need sklearn.impute.SimpleImputer here instead.
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imp.fit(x)
    x=imp.transform(x)
    
    
    target=pd.read_excel(targetFileName)
    y=target.values
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
    names=features.columns
    
    # The test matrix is built without labels; it is only used for prediction.
    dtrain=xgb.DMatrix(x_train,label=y_train)
    dtest=xgb.DMatrix(x_test)
    
    # NOTE(review): 'silent' is an older xgboost option; confirm it is still
    # accepted by the installed version.
    params={'booster':'gbtree',
        #'objective': 'reg:linear',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth':4,
        'lambda':10,
        'subsample':0.75,
        'colsample_bytree':0.75,
        'min_child_weight':2,
        'eta': 0.025,
        'seed':0,
        'nthread':8,
         'silent':1}
    
    
    # Only the training set is monitored during boosting.
    watchlist = [(dtrain,'train')]
    
    bst=xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
    
    # Raw predictions for the test set (scored below with AUC).
    ypred=bst.predict(dtest)
    
    # Apply a 0.5 threshold to turn the predictions into 0/1 labels.
    y_pred = (ypred >= 0.5)*1
    
    # Evaluation metrics: AUC uses the raw scores, the rest use the 0/1 labels.
    print ('AUC: %.4f' % metrics.roc_auc_score(y_test,ypred))
    print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
    print ('Recall: %.4f' % metrics.recall_score(y_test,y_pred))
    print ('F1-score: %.4f' %metrics.f1_score(y_test,y_pred))
    print ('Precesion: %.4f' %metrics.precision_score(y_test,y_pred))
    # Print the confusion matrix; as a bare expression its value was discarded
    # whenever the file ran as a script.
    print(metrics.confusion_matrix(y_test,y_pred))
    
    print("xgboost:")  
    print('Feature importances:{}'.format(bst.get_fscore()))
    
    '''
    AUC: 0.9464
    ACC: 0.8841
    Recall: 0.8716
    F1-score: 0.8716
    Precesion: 0.8716
    xgboost:
    Feature importances:{'f5': 69, 'f1': 178, 'f2': 68, 'f4': 245, 'f6': 25, 'f0': 88, 'f3': 25, 'f194': 4, 'f193': 21, 'f195': 9}
    '''
    

      

     决策树

    decisionTree.py

    # -*- coding: utf-8 -*-
    """
    Created on Mon Apr 30 19:04:10 2018
    
    @author: Administrator
    """
    from sklearn.tree import export_graphviz
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import Imputer
    import pandas as pd
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    import matplotlib.pyplot as plt
    
    # Workbook holding the predictor variables.
    varibleFileName="titantic.xlsx"
    # Workbook holding the target column.
    targetFileName="target.xlsx"
    # Load the predictors and one-hot encode them.
    data=pd.read_excel(varibleFileName)
    data_dummies=pd.get_dummies(data)
    print('features after one-hot encoding:\n',list(data_dummies.columns))
    # Label-based column slice. `.loc` replaces the `.ix` indexer, which was
    # removed in pandas 1.0.
    features=data_dummies.loc[:,"Pclass":'Embarked_S']
    x=features.values
    
    # Fill missing values with the most frequent value of each column.
    # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer versions
    # need sklearn.impute.SimpleImputer here instead.
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imp.fit(x)
    x=imp.transform(x)
    
    
    target=pd.read_excel(targetFileName)
    y=target.values
    X_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0)
    # Column names of the model inputs.
    names=features.columns
    
    # Depth search: fit one tree per candidate depth and record the mean of its
    # training and test accuracy.
    # NOTE(review): the test split takes part in choosing the depth, so the test
    # accuracy reported below is not an independent estimate.
    list_average_accuracy=[]
    depth=range(1,30)
    for i in depth:
        # Capping max_depth limits the complexity of the tree.
        tree= DecisionTreeClassifier(max_depth=i,random_state=0)
        tree.fit(X_train,y_train)
        accuracy_training=tree.score(X_train,y_train)
        accuracy_test=tree.score(x_test,y_test)
        average_accuracy=(accuracy_training+accuracy_test)/2.0
        #print("average_accuracy:",average_accuracy)
        list_average_accuracy.append(average_accuracy)
        
    max_value=max(list_average_accuracy)
    # Map the position of the best score back to its depth through the candidate
    # range itself, so the result stays right if the range stops starting at 1.
    best_depth=depth[list_average_accuracy.index(max_value)]
    print("best_depth:",best_depth)
    
    # Refit a tree at the selected depth and score it once on each split.
    best_tree= DecisionTreeClassifier(max_depth=best_depth,random_state=0)
    best_tree.fit(X_train,y_train)
    accuracy_training=best_tree.score(X_train,y_train)
    accuracy_test=best_tree.score(x_test,y_test)
    
    print("decision tree:")    
    print("accuracy on the training subset:{:.3f}".format(accuracy_training))
    print("accuracy on the test subset:{:.3f}".format(accuracy_test))
    
    # Output recorded by the original author:
    #   best_depth: 19
    #   decision tree:
    #   accuracy on the training subset:0.976
    #   accuracy on the test subset:0.860
    
    # Plot the feature importances, one bar per model input.
    n_features=x.shape[1]
    plt.barh(range(n_features),best_tree.feature_importances_,align='center')
    plt.yticks(np.arange(n_features),names)
    plt.title("Decision Tree:")
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.show()
    
    # Write a dot file; it can be rendered to an image later from the command line.
    export_graphviz(best_tree,out_file="Titanic.dot",class_names=['death','live'],feature_names=names,impurity=False,filled=True)
    

      

    python风控评分卡建模和风控常识

  • 相关阅读:
    电话号码分身
    利用Geoerver+Mysql+openlayers实现gis空间数据线段、多边形的存储、编辑、平移等功能
    vue+openlayers图形交互,实现多边形绘制、编辑和保存
    JetBrains AppCode:用于 iOS/macOS 开发的智能 IDE
    GIS基础知识
    class java.time.LocalDateTime cannot be cast to class java.util.Date
    geoserver配置SQL图层 cql_filter模糊查询
    gis论坛
    Geoserver的WFS服务
    Linux 环境下修改 MySQL 时区
  • 原文地址:https://www.cnblogs.com/webRobot/p/8972030.html
Copyright © 2011-2022 走看看