zoukankan      html  css  js  c++  java
  • sklearn:决策分类树_红酒数据集

    from sklearn import tree
    from sklearn.datasets import load_wine  # 红酒数据
    from sklearn.model_selection import train_test_split
    
    wine = load_wine()  # 导入数据
    wine
    
    {'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
             1.065e+03],
            [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
             1.050e+03],
            [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
             1.185e+03],
            ...,
            [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
             8.350e+02],
            [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
             8.400e+02],
            [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
             5.600e+02]]),
     'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2]),
     'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
     'DESCR': '......rics).
    ',
     'feature_names': ['alcohol',
      'malic_acid',
      'ash',
      'alcalinity_of_ash',
      'magnesium',
      'total_phenols',
      'flavanoids',
      'nonflavanoid_phenols',
      'proanthocyanins',
      'color_intensity',
      'hue',
      'od280/od315_of_diluted_wines',
      'proline']}
    
    import pandas as pd
    
    # 将特征数据与 target拼接起来
    wine_df = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)] ,axis=1)
    wine_df.columns=list(wine.feature_names) + ['target']  # 将数据特征名称与数据对应
    wine_df['target'] = wine_df['target'].map(dict(zip(range(3), wine.target_names)))  # 显示类别名称
    wine_df
    
    alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
    0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 class_0
    1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 class_0
    2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 class_0
    3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 class_0
    4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 class_0
    ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
    173 13.71 5.65 2.45 20.5 95.0 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740.0 class_2
    174 13.40 3.91 2.48 23.0 102.0 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750.0 class_2
    175 13.27 4.28 2.26 20.0 120.0 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835.0 class_2
    176 13.17 2.59 2.37 20.0 120.0 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840.0 class_2
    177 14.13 4.10 2.74 24.5 96.0 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560.0 class_2

    178 rows × 14 columns

    # 拆分数据为:训练集和测试集
    X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.3)
    
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # 返回准确度
    score
    
    0.9444444444444444
    
    import graphviz  # 需要提前安装graphviz
    
    dot_data = tree.export_graphviz(clf
        ,feature_names = wine.feature_names  # 特征名
        ,class_names = wine.target_names  # 标签名
        ,filled = True  # 颜色填充
        ,rounded = True  # 圆角边框
    )
    
    graph = graphviz.Source(dot_data)
    graph
    

    jpg

    clf.feature_importances_  # 查看各特征的重要性,没有被使用的特征 重要性为0
    
    array([0.        , 0.        , 0.        , 0.        , 0.        ,
           0.        , 0.3918564 , 0.        , 0.        , 0.1160134 ,
           0.02128596, 0.        , 0.47084424])
    
    dict(zip(wine.feature_names, clf.feature_importances_))  # 将特征名称与重要性对应
    
    {'alcohol': 0.0,
     'malic_acid': 0.0,
     'ash': 0.0,
     'alcalinity_of_ash': 0.0,
     'magnesium': 0.0,
     'total_phenols': 0.0,
     'flavanoids': 0.26190367697120653,
     'nonflavanoid_phenols': 0.0,
     'proanthocyanins': 0.0,
     'color_intensity': 0.11601339710491781,
     'hue': 0.0,
     'od280/od315_of_diluted_wines': 0.15123868318487035,
     'proline': 0.47084424273900527}
    
    
    

    增加决策树随机性

    • 决策树的随机性在高维度的数据集中表现的会比较好
    • 在低维度数据集(比如鸢尾花数据集中),随机性就表现得不够好
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      ,random_state=50  # 随机种子
                                      # splitter:默认是best,就是上面的重要性。虽然随机,但是还是选择最重要的。
                                      # random让决策树更加随机,树会更大更深
                                      ,splitter="random"  
                                     )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # 返回准确度
    score
    
    0.8888888888888888
    
    import graphviz
    dot_data = tree.export_graphviz(clf
        ,feature_names = wine.feature_names  # 特征名
        ,class_names = wine.target_names  # 标签名
        ,filled = True  # 颜色填充
        ,rounded = True  # 圆角边框
    )
    
    graph = graphviz.Source(dot_data)
    graph
    

    jpg

    
    

    剪枝参数:min_samples_leaf & min_samples_split

    • 为了使决策树具有更大的泛化能力
    • 限制树的最大深度,建议从3开始逐渐尝试
    • 限制叶子节点数量
    • 限制划分节点数量
    import graphviz
    
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      ,random_state=50
                                      ,splitter="random"
                                      ,max_depth = 3
                                      ,min_samples_leaf=10  # 将样本数量小于10的叶子节点剪掉
                                      ,min_samples_split=10  # 将中间节点样本数量小于10的剪掉
                                     )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # 返回准确度
    print(score)
    dot_data = tree.export_graphviz(clf
        ,feature_names = wine.feature_names  # 特征名
        ,class_names = wine.target_names  # 标签名
        ,filled = True  # 颜色填充
        ,rounded = True  # 圆角边框
    )
    
    graph = graphviz.Source(dot_data)
    graph
    
    0.8518518518518519
    

    jpg

    
    

    max_features & min_impurity_decrease

    • max_features:最大特征数量限制,超过限制的特征会被舍弃,是一种降维方式,使用较少
    • min_impurity_decrease:限制信息增益大小,当信息增益小于这个值,就不再进行分支了
    import graphviz
    
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      ,random_state=50
    #                                   ,splitter="random"
                                      ,max_depth = 5
    #                                   ,min_samples_leaf=10  # 将样本数量小于10的叶子节点剪掉
    #                                   ,min_samples_split=10  # 将中间节点样本数量小于10的剪掉
    #                                   ,max_features = 2
                                      ,min_impurity_decrease=0.1
                                     )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # 返回准确度
    print(score)
    dot_data = tree.export_graphviz(clf
        ,feature_names = wine.feature_names  # 特征名
        ,class_names = wine.target_names  # 标签名
        ,filled = True  # 颜色填充
        ,rounded = True  # 圆角边框
    )
    
    graph = graphviz.Source(dot_data)
    graph
    
    0.9444444444444444
    

    jpg

    
    

    确认最优参数,画学习曲线

    import  matplotlib.pyplot as plt
    
    deths_rt = []
    
    for dep in range(1, 10):
        clf = tree.DecisionTreeClassifier(criterion="entropy"
                                          ,max_depth = dep
                                         )
        clf = clf.fit(X_train, Y_train)
        score = clf.score(X_test, Y_test)  # 返回准确度
        deths_rt.append(score)
        
    
    plt.plot(range(1, 10), deths_rt)
    

    png

    
    

    目标权重参数

    • class_weight & min_weight_fraction_leaf
    • 注意:sklearn不接受一维矩阵
    class_weight  # 目标类型的权重,其数据类型为dict或者列表内的dict,或者为"balanced"
    
    min_weight_fraction_leaf  # 权重剪枝参数,搭配目标权重使用,比min_samples_leaf更偏向于主导类
    

    其他常用接口

    # 返回样本所在叶子节点的索引
    clf.apply(X_test)
    
    array([ 5,  5,  5,  4,  3,  5,  5,  5,  5, 10, 10,  5,  5,  3, 10, 10, 10,
            5,  4,  5, 10,  4,  5, 10,  5,  5,  4,  5,  4,  4,  5,  4,  4, 10,
           10,  5,  4,  5,  5,  5,  4, 10, 10, 10,  5,  5, 10,  4, 10, 10,  5,
            5,  5, 10], dtype=int64)
    
    # 返回预测标签
    clf.predict(X_test
    
    array([1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 0, 2,
           1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0,
           1, 1, 0, 2, 0, 0, 1, 1, 1, 0])
  • 相关阅读:
    概率论——随机事件及其概率
    Web应用程序项目以配置使用IIS。未找到Web服务器”链接地址”
    LaTeX中.sty文件缺失解决办法
    IIS中的经典模式和集成模式有什么区别
    判断有序整型数组中是否存在两数,相加之和等于给定的任意整数
    51Job的搜索技巧
    登录失败。该登录名来自不受信任的域,不能与 Windows 身份验证一起使用。
    Ubuntu使用Latex模板moderncv写简历
    COM相关操作(C#)
    什么是单线程单元(STA)什么是多线程单元(MTA)
  • 原文地址:https://www.cnblogs.com/jaysonteng/p/14217115.html
Copyright © 2011-2022 走看看