  • Implementing the decision tree algorithm with sklearn

    1. A decision tree is a nonparametric algorithm: it partitions the data through a hierarchy of tests on individual features and reads the prediction off the leaf it reaches. It can solve both classification and regression problems, and the resulting model is easy to interpret. There are several ways to construct a tree; they differ mainly in two details decided at every node, namely which feature (dimension) to split on and at which threshold value of that feature to split.
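
    To make the idea concrete, here is a hand-written two-level tree over the iris petal features used below. This is a minimal illustrative sketch: the thresholds 2.45 and 1.75 are the splits a depth-2 tree typically learns on this data, quoted here for illustration only.

    def toy_tree(petal_length,petal_width):
        #root node: test one feature against a threshold
        if petal_length<=2.45:
            return 0 #setosa
        #second level: test the other feature
        if petal_width<=1.75:
            return 1 #versicolor
        return 2 #virginica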

    The following code calls sklearn's decision tree estimators to solve a classification problem and then a regression problem:


    #1-1 load the training data (iris, petal features only)
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets
    d=datasets.load_iris()
    x=d.data[:,2:] #keep only petal length and petal width
    y=d.target
    plt.figure()
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.scatter(x[y==2,0],x[y==2,1],color="b")
    plt.show()
    #1-2 fit sklearn's decision tree classifier on the data
    from sklearn.tree import DecisionTreeClassifier
    dt1=DecisionTreeClassifier(max_depth=2,criterion="entropy") #set the classifier's hyperparameters: depth 2, entropy criterion
    dt1.fit(x,y)
    def plot_decision_boundary(model,axis): #plot the decision boundary of a classifier over 2-D data points
        x0,x1=np.meshgrid(
            np.linspace(axis[0],axis[1],int((axis[1]-axis[0])*100)).reshape(-1,1),
            np.linspace(axis[2],axis[3],int((axis[3]-axis[2])*100)).reshape(-1,1)
        )
        x_new=np.c_[x0.ravel(),x1.ravel()]
        y_pre=model.predict(x_new)
        zz=y_pre.reshape(x0.shape)
        from matplotlib.colors import ListedColormap
        cus=ListedColormap(["#EF9A9A","#FFF59D","#90CAF9"])
        plt.contourf(x0,x1,zz,cmap=cus)
    plot_decision_boundary(dt1,axis=[0.5,8,0,3])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.scatter(x[y==2,0],x[y==2,1],color="b")
    plt.show()
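    #the learned splits can also be inspected as text (illustrative addition;
    #export_text is available in scikit-learn>=0.21)
    from sklearn.tree import export_text
    print(export_text(dt1,feature_names=["petal length","petal width"]))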
    #information entropy of a binary split as a function of p: H(p)=-p*log(p)-(1-p)*log(1-p)
    def entropy(p):
        return -p*np.log(p)-(1-p)*np.log(1-p)
    x1=np.linspace(0.01,0.99,100)
    y1=entropy(x1)
    plt.plot(x1,y1,"r")
    plt.show()
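    #sanity check (illustrative): binary entropy peaks at p=0.5, where H=log(2)~0.693
    print(entropy(0.5),np.log(2))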
    #building a decision tree split by hand using information entropy
    def split(x,y,d,value): #split the data on feature d at the given threshold value
        index_a=(x[:,d]<=value)
        index_b=(x[:,d]>value)
        return x[index_a],x[index_b],y[index_a],y[index_b]
    from collections import Counter
    def entropy(y): #entropy of a label array, summed over the class frequencies
        counter=Counter(y)
        res=0.0
        for num in counter.values():
            p=num/len(y)
            res+=-p*np.log(p)
        return res
    def try_split(x,y): #search every feature and every candidate threshold for the lowest-entropy split
        best_entropy=float("inf")
        best_d,best_v=-1,-1
        for d in range(x.shape[1]):
            sorted_index=np.argsort(x[:,d])
            for i in range(1,len(x)):
                if x[sorted_index[i-1],d]!=x[sorted_index[i],d]:
                    v=(x[sorted_index[i-1],d]+x[sorted_index[i],d])/2
                    x_l,x_r,y_l,y_r=split(x,y,d,v)
                    e=entropy(y_l)+entropy(y_r)
                    if e<best_entropy:
                        best_entropy,best_d,best_v=e,d,v
        return best_entropy,best_d,best_v
    best_entropy,best_d,best_v=try_split(x,y)
    print(best_entropy,best_d,best_v)
    x_l,x_r,y_l,y_r=split(x,y,best_d,best_v)
    print(entropy(y_l))
    print(entropy(y_r))
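    #illustrative variant: the search above sums the children's entropies directly;
    #the usual information-gain criterion instead weights each child by its share of
    #the samples, which can be done like this
    def weighted_entropy(y_l,y_r):
        n=len(y_l)+len(y_r)
        return len(y_l)/n*entropy(y_l)+len(y_r)/n*entropy(y_r)
    print(weighted_entropy(y_l,y_r))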
    #building the tree with the Gini impurity criterion instead
    from sklearn.tree import DecisionTreeClassifier
    dt2=DecisionTreeClassifier(max_depth=2,criterion="gini") #same classifier, Gini criterion
    dt2.fit(x,y)
    plot_decision_boundary(dt2,axis=[0.5,8,0,3])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.scatter(x[y==2,0],x[y==2,1],color="b")
    plt.show()
    def split(x,y,d,value): #same splitting helper as in the entropy version
        index_a=(x[:,d]<=value)
        index_b=(x[:,d]>value)
        return x[index_a],x[index_b],y[index_a],y[index_b]
    from collections import Counter
    def gini(y): #Gini impurity of a label array: 1-sum(p_k^2)
        counter=Counter(y)
        res=1.0
        for num in counter.values():
            p=num/len(y)
            res-=p**2
        return res
    def try_split_gini(x,y): #same exhaustive search, minimizing Gini impurity
        best_gini=float("inf")
        best_d,best_v=-1,-1
        for d in range(x.shape[1]):
            sorted_index=np.argsort(x[:,d])
            for i in range(1,len(x)):
                if x[sorted_index[i-1],d]!=x[sorted_index[i],d]:
                    v=(x[sorted_index[i-1],d]+x[sorted_index[i],d])/2
                    x_l,x_r,y_l,y_r=split(x,y,d,v)
                    g=gini(y_l)+gini(y_r)
                    if g<best_gini:
                        best_gini,best_d,best_v=g,d,v
        return best_gini,best_d,best_v
    best_gini,best_d,best_v=try_split_gini(x,y)
    print(best_gini,best_d,best_v)
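    #cross-check (illustrative, reuses the fitted dt2 from above): the split sklearn
    #chose at the root is stored in the tree_ attribute; node index 0 is the root
    print(dt2.tree_.feature[0],dt2.tree_.threshold[0])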
    #pruning the tree with hyperparameters to reduce overfitting as much as possible
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets
    x,y=datasets.make_moons(noise=0.25,random_state=666) #generates 100 samples by default
    print(x.shape)
    print(y.shape)
    plt.figure()
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
    from sklearn.tree import DecisionTreeClassifier
    #the main pruning hyperparameters; criterion defaults to "gini", and without a depth
    #limit the tree keeps splitting until every leaf has zero Gini impurity
    dt2=DecisionTreeClassifier(max_depth=2,min_samples_split=10,min_samples_leaf=6,max_leaf_nodes=4)
    dt2.fit(x,y)
    plot_decision_boundary(dt2,axis=[-2,3,-1,1.5])
    plt.scatter(x[y==0,0],x[y==0,1],color="r")
    plt.scatter(x[y==1,0],x[y==1,1],color="g")
    plt.show()
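    #an alternative pruning route (illustrative sketch, needs scikit-learn>=0.22):
    #cost-complexity pruning trades training purity against leaf count via ccp_alpha;
    #the value 0.02 here is arbitrary, chosen only to show the effect
    dt3=DecisionTreeClassifier(ccp_alpha=0.02)
    dt3.fit(x,y)
    print(dt3.get_n_leaves())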
    #using a decision tree to solve a regression problem
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets
    d=datasets.load_boston() #note: load_boston was removed in scikit-learn 1.2
    x=d.data
    y=d.target
    print(x.shape)
    print(y.shape)
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
    from sklearn.tree import DecisionTreeRegressor
    dr=DecisionTreeRegressor()
    dr.fit(x_train,y_train)
    print(dr.score(x_test,y_test))
    print(dr.score(x_train,y_train)) #R^2=1 on the training set but much lower on the test
    #set, so the model has overfit; a learning curve shows this clearly
    #plot learning curves for different hyperparameter settings
    from sklearn.metrics import mean_squared_error
    def plot_learning_curve(algo,x_train,x_test,y_train,y_test):
        train_score=[]
        test_score=[]
        for i in range(1,len(x_train)):
            algo.fit(x_train[:i],y_train[:i])
            y_train_pre=algo.predict(x_train[:i])
            y_test_pre=algo.predict(x_test)
            train_score.append(mean_squared_error(y_train[:i],y_train_pre))
            test_score.append(mean_squared_error(y_test,y_test_pre))
        plt.figure()
        plt.plot([i for i in range(1,len(x_train))],np.sqrt(train_score),"g",label="train_error")
        plt.plot([i for i in range(1,len(x_train))],np.sqrt(test_score),"r",label="test_error")
        plt.legend()
        #plt.axis([0,len(x_train)+1,0,5])
        plt.show()
    plot_learning_curve(DecisionTreeRegressor(max_depth=1),x_train,x_test,y_train,y_test) #underfitting
    plot_learning_curve(DecisionTreeRegressor(max_depth=5),x_train,x_test,y_train,y_test) #a good fit
    plot_learning_curve(DecisionTreeRegressor(max_depth=15),x_train,x_test,y_train,y_test) #overfitting
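
    Rather than judging max_depth by eye from the learning curves, the depth can be tuned with cross-validation. A minimal sketch, reusing x_train and y_train from above; the grid of depths 1 to 15 is illustrative:

    from sklearn.model_selection import GridSearchCV
    param_grid={"max_depth":list(range(1,16))}
    grid=GridSearchCV(DecisionTreeRegressor(random_state=666),param_grid,cv=5)
    grid.fit(x_train,y_train)
    print(grid.best_params_)
    print(grid.best_score_)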












