zoukankan      html  css  js  c++  java
  • 决策树之分类与回归

    决策树是一种利用树形结构进行数据分析的模型。

    from collections import Counter
    
    import numpy as np
    
    """
    [
                     [0.5, 6.5, 1, 1],
                     [0.7, 9, 2, 1],
                     [0.5, 7, 1, 0],
    
                     [0.6, 8.5, 0, 1],
    
                     [0.9, 10, 0, 1],
                     [1, 12, 1, 0],
                     [0.6, 9, 1, 0],
                     ]
    """
    # Sample data set. Columns, in order: width, height, length, result.
    # The last column (result) is the one the script below passes as the
    # label vector (data[:, -1]) when it searches for a split.
    data = np.array([[0.3, 5, 2, 0],
                     [0.4, 6, 0, 0],
                     [0.5, 6.5, 1, 1],
                     [0.6, 6, 0, 0],
                     [0.7, 9, 2, 1],
                     [0.5, 7, 1, 0],
                     [0.4, 6, 0, 0],
                     [0.6, 8.5, 0, 1],
                     [0.3, 5.5, 2, 0],
                     [0.9, 10, 0, 1],
                     [1, 12, 1, 0],
                     [0.6, 9, 1, 0],
                     ])
    
    
    def split(data, j, value):
        """Partition the rows of ``data`` on column ``j`` at ``value``.

        Returns a pair ``(data_l, data_r)``: the rows whose entry in column
        ``j`` is ``<= value`` and the rows whose entry is ``> value``.  Rows
        keep their original relative order.
        """
        column = data[:, j]
        at_or_below = column <= value
        above = column > value
        return data[at_or_below], data[above]
    
    
    def entropy(data):
        """Return the Shannon entropy (natural log) of the labels in ``data``.

        ``data`` is a 1-D sequence of hashable label values; a NumPy array or
        a plain list/tuple both work.  An empty input yields ``0.0``.
        """
        labels = np.asarray(data)
        total = labels.shape[0]
        result = 0.0
        # tolist() turns the labels into plain Python scalars so Counter can
        # hash them; counts are visited in first-seen order.
        for count in Counter(labels.tolist()).values():
            p = count / total
            result -= p * np.log(p)
        return result
    
    
    def gain(entropy0, entropy1, entropy2, p):
        """Return the information gain ratio of a binary split.

        ``entropy0`` is the entropy before the split, ``entropy1`` and
        ``entropy2`` are the entropies of the two parts, and ``p`` is the
        fraction of samples that fall into the first part.

        The information gain ``entropy0 - (p*entropy1 + (1-p)*entropy2)`` is
        divided by the split information ``-(p*log(p) + (1-p)*log(1-p))``.
        ``p`` must lie strictly between 0 and 1, because the formula takes
        ``log(p)`` and ``log(1 - p)``.
        """
        q = 1 - p
        conditional = p * entropy1 + q * entropy2
        # This is the *negated* split information; the leading minus sign on
        # the numerator below cancels it.
        negative_split_info = p * np.log(p) + q * np.log(q)
        return -(entropy0 - conditional) / negative_split_info
    
    
    def try_split(x, y):
        """Find the column and threshold with the largest information gain ratio.

        ``x`` is a 2-D array whose last column holds the labels; every other
        column is a candidate feature.  ``y`` is the label vector used for the
        entropy before splitting.  Candidate thresholds are the midpoints
        between consecutive distinct values of each feature column.

        Returns ``(split_colum, split_value)``.  Both stay ``0`` when no
        candidate threshold exists.
        """
        best_ratio = float("-inf")
        base_entropy = entropy(y)  # entropy before any split
        split_colum = 0
        split_value = 0
        n_rows = x.shape[0]
        # x is never modified below, so one argsort serves every column.
        order = x.argsort(axis=0)
        for i in range(x.shape[1] - 1):
            ranked = x[order[:, i], i]  # column i in ascending order
            for j in range(1, n_rows):
                lower, upper = ranked[j - 1], ranked[j]
                if upper == lower:
                    # Equal neighbours cannot be separated by a threshold.
                    continue
                v = (upper + lower) / 2
                data_l, data_r = split(x, i, v)
                # Exactly j rows sort below the midpoint, so j / n_rows is the
                # share of samples going to the left part.
                p = j / n_rows
                data_l_entropy = entropy(data_l[:, -1])
                data_r_entropy = entropy(data_r[:, -1])
                ratio = gain(base_entropy, data_l_entropy, data_r_entropy, p)
                if ratio > best_ratio:
                    best_ratio = ratio
                    split_colum = i
                    split_value = v
        return split_colum, split_value
    
    
    if __name__ == '__main__':
        # First split: search the whole data set.
        split_colum, split_value = try_split(data, data[:, -1])
        print(split_colum, split_value, 111)
        data_l, data_r = split(data, split_colum, split_value)
        # Second split: search the right-hand partition.  The entropy before
        # splitting must come from that partition's own labels, so pass
        # data_r[:, -1] rather than the labels of the full data set.
        split_colum, split_value = try_split(data_r, data_r[:, -1])
        print(split_colum, split_value, 222)

     2.回归树:

      利用决策树也可以进行回归预测,其实际上就是 CART 树模型,其损失函数是方差。通过对所有属性的分割点进行比较,求出最优的分割点。

    import numpy as np
    
    DEEPTH = 4  # default tree depth used by Regression_Tree.bitree


    class Regression_Tree():
        """Regression tree grown breadth-first with a variance-based split loss.

        Every data set handled here is a 2-D array whose last column is the
        target; all other columns are candidate features.  The loss of a split
        is ``n_left * var(left targets) + n_right * var(right targets)``.
        """

        def __init__(self):
            # One [column, threshold, loss] entry per node examined by min_loss.
            self.result = []
            # Left / right partitions produced by the most recent min_loss call.
            self.data_l = None
            self.data_r = None
            # FIFO work list of node data sets still waiting to be split.
            self.queen = []

        def min_loss(self, data):
            """Variant 1: find the best split of ``data`` via ``argsort``.

            Appends ``[column, threshold, loss]`` to ``self.result`` and stores
            the two partitions in ``self.data_l`` (feature ``<= threshold``) and
            ``self.data_r``.  When no split is possible the entry is
            ``[0, 0, inf]``, ``data_l`` is empty and ``data_r`` holds every row.
            """
            n_rows = data.shape[0]
            best_loss = float("inf")
            best_col, best_threshold, best_pos = 0, 0, 0
            order = np.argsort(data, axis=0)
            for col in range(data.shape[1] - 1):
                rows = order[:, col]
                features = data[rows, col]  # column `col` in ascending order
                targets = data[rows, -1]    # targets in that same row order
                for pos in range(1, n_rows):
                    # A threshold can only separate rows whose feature values
                    # differ, so positions between equal neighbours are not
                    # valid split points.
                    if features[pos] == features[pos - 1]:
                        continue
                    loss = (pos * np.var(targets[:pos])
                            + (n_rows - pos) * np.var(targets[pos:]))
                    if loss < best_loss:
                        best_loss = loss
                        best_col, best_pos = col, pos
                        best_threshold = features[pos - 1]
            self.result.append([best_col, best_threshold, best_loss])
            chosen = order[:, best_col]
            self.data_l = data[chosen[:best_pos], :]
            self.data_r = data[chosen[best_pos:], :]

        def min_loss1(self, data):
            """Variant 2: find the best split of ``data`` via boolean indexing.

            Every distinct value of a feature column is tried as a threshold.
            Returns ``(column, threshold, loss)``, or ``(0, 0, inf)`` when no
            threshold leaves both sides non-empty.  Does not touch ``self``.
            """
            best_loss = float("inf")
            best_col, best_threshold = 0, 0
            for col in range(data.shape[1] - 1):
                for key in Counter(data[:, col]):
                    left = data[data[:, col] <= key]
                    right = data[data[:, col] > key]
                    # np.var of an empty slice is nan, so skip thresholds that
                    # leave one side empty (the largest value always does).
                    if not (left.shape[0] and right.shape[0]):
                        continue
                    loss = (left.shape[0] * np.var(left[:, -1])
                            + right.shape[0] * np.var(right[:, -1]))
                    if loss < best_loss:
                        best_loss = loss
                        best_col = col
                        best_threshold = key
            return best_col, best_threshold, best_loss

        def min_loss2(self, data):
            """Variant 3: like ``min_loss1`` but over sorted distinct values.

            The distinct values of each feature column are sorted and the
            largest one is dropped, so the right-hand side is never empty and
            no nan can arise.  Returns ``(column, threshold, loss)``, or
            ``(0, 0, inf)`` when no column has two distinct values.
            """
            best_loss = float("inf")
            best_col, best_threshold = 0, 0
            for col in range(data.shape[1] - 1):
                # [:-1] keeps every key except the largest.  (list.pop() would
                # return the largest key instead of the shortened list.)
                for key in sorted(Counter(data[:, col]))[:-1]:
                    left = data[data[:, col] <= key]
                    right = data[data[:, col] > key]
                    loss = (left.shape[0] * np.var(left[:, -1])
                            + right.shape[0] * np.var(right[:, -1]))
                    if loss < best_loss:
                        best_loss = loss
                        best_col = col
                        best_threshold = key
            return best_col, best_threshold, best_loss

        def bitree(self, data1, depth=None):
            """Grow the tree breadth-first from ``data1``.

            ``depth`` defaults to the module constant ``DEEPTH``.  At most
            ``2 ** (depth - 1) - 1`` nodes are examined; each one adds an entry
            to ``self.result`` through ``min_loss``.
            """
            if depth is None:
                depth = DEEPTH
            self.queen.append(data1)
            for _ in range(2 ** (depth - 1) - 1):  # upper bound on splits
                if not self.queen:
                    break  # nothing left to split
                data = self.queen.pop(0)  # take the oldest pending node
                self.min_loss(data)
                # Only a split with rows on both sides can be split further.
                if self.data_l.shape[0] and self.data_r.shape[0]:
                    self.queen += [self.data_l, self.data_r]


    a = np.array([[5, 20, 1.1],
                  [70, 30, 1.3],
                  [21, 70, 1.7],
                  [300, 60, 1.8],
                  [25, 65, 1.72],
                  [28, 80, 1.67],
                  [56, 70, 1.65],
                  [30, 55, 1.62]])
    tree1 = Regression_Tree()
    tree1.bitree(a)
    print(tree1.result)
  • 相关阅读:
    《多处理器编程的艺术》读书笔记(2) 互斥
    《多处理器编程的艺术》读书笔记(7) CLH队列锁
    rdlc 套打实现
    《多处理器编程的艺术》读书笔记(6) 队列锁
    《多处理器编程的艺术》读书笔记(3) 双线程解决方案
    《多处理器编程的艺术》读书笔记(4) 自旋锁(1)
    《多处理器编程的艺术》读书笔记(1) 并行的困境和加速比
    反色,霓虹灯,浮雕
    《多处理器编程的艺术》读书笔记(5) 自旋锁(2)
    Open ESRI shape files in Quantum GIS Anny
  • 原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/11629490.html
Copyright © 2011-2022 走看看