  • Decision Trees: Classification and Regression

    A decision tree is a model that uses a tree structure to analyze data.
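
    The classification code below selects splits by the information gain ratio. For reference while reading it, the quantities involved are (standard definitions, matching what the gain function computes):

        H(D) = -\sum_k p_k \ln p_k                                            (empirical entropy)
        H(D \mid A) = p\,H(D_l) + (1 - p)\,H(D_r)                             (conditional entropy; p is the left-branch fraction)
        g_R(D, A) = \frac{H(D) - H(D \mid A)}{-(p \ln p + (1 - p) \ln(1 - p))}  (gain ratio)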

    from collections import Counter
    
    import numpy as np
    
    """
    [
                     [0.5, 6.5, 1, 1],
                     [0.7, 9, 2, 1],
                     [0.5, 7, 1, 0],
    
                     [0.6, 8.5, 0, 1],
    
                     [0.9, 10, 0, 1],
                     [1, 12, 1, 0],
                     [0.6, 9, 1, 0],
                     ]
    """
    # Data: width, height, length, label
    data = np.array([[0.3, 5, 2, 0],
                     [0.4, 6, 0, 0],
                     [0.5, 6.5, 1, 1],
                     [0.6, 6, 0, 0],
                     [0.7, 9, 2, 1],
                     [0.5, 7, 1, 0],
                     [0.4, 6, 0, 0],
                     [0.6, 8.5, 0, 1],
                     [0.3, 5.5, 2, 0],
                     [0.9, 10, 0, 1],
                     [1, 12, 1, 0],
                     [0.6, 9, 1, 0],
                     ])
    
    
    # Split data on column j at the given value
    def split(data, j, value):
        data_l = data[data[:, j] <= value]
        data_r = data[data[:, j] > value]
        return data_l, data_r
    
    
    # Empirical entropy of a label vector
    def entropy(data):
        data_count = Counter(data)
        length = data.shape[0]
        total = 0
        for key in data_count:
            total -= data_count[key] / length * np.log(data_count[key] / length)
        return total
    
    
    # Gain ratio. entropy0 is the empirical entropy; entropy1 and entropy2 are the
    # entropies of the left and right branches; p is the left-branch proportion.
    def gain(entropy0, entropy1, entropy2, p):
        cond_entropy = p * entropy1 + (1 - p) * entropy2  # conditional entropy
        # information gain divided by the split entropy -(p*ln p + (1-p)*ln(1-p))
        gain_ratio = -(entropy0 - cond_entropy) / (p * np.log(p) + (1 - p) * np.log(1 - p))
        # gain_ratio = entropy0 - cond_entropy  # plain information gain, for comparison
        return gain_ratio
    
    
    # Compute the information gain ratio of every candidate split and cut the
    # data at the split with the largest ratio.
    def try_split(x, y):
        result = float("-inf")
        empirical_entropy = entropy(y)  # empirical entropy of the labels
        split_column = 0
        split_value = 0
        datax = x.argsort(axis=0)  # per-column sort indices
        for i in range(x.shape[1] - 1):
            for j in range(1, x.shape[0]):
                if x[datax[j, i], i] != x[datax[j - 1, i], i]:
                    # candidate threshold: midpoint of two adjacent distinct values
                    v = (x[datax[j, i], i] + x[datax[j - 1, i], i]) / 2
                    data_l, data_r = split(x, i, v)
                    p = j / x.shape[0]  # proportion of samples in the left branch
                    data_l_entropy = entropy(data_l[:, -1])
                    data_r_entropy = entropy(data_r[:, -1])
                    result_entropy = gain(empirical_entropy, data_l_entropy, data_r_entropy, p)
                    if result_entropy > result:
                        result = result_entropy
                        split_column = i
                        split_value = v
        return split_column, split_value
    
    
    if __name__ == '__main__':
        split_column, split_value = try_split(data, data[:, -1])
        print(split_column, split_value, 111)
        data_l, data_r = split(data, split_column, split_value)
        # recurse on the right branch, using its own labels
        split_column, split_value = try_split(data_r, data_r[:, -1])
        print(split_column, split_value, 222)
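
    The main block above only performs two splits by hand. Below is a minimal sketch of how try_split and split could be applied recursively to grow a full tree; the build_tree helper, its depth limit, and the purity stopping rule are assumptions, not part of the original post:

    def build_tree(subset, depth=0, max_depth=3):
        labels = subset[:, -1]
        # stop when the node is pure or the depth limit is reached
        if len(set(labels)) == 1 or depth >= max_depth:
            return Counter(labels).most_common(1)[0][0]  # majority label
        column, value = try_split(subset, labels)
        left, right = split(subset, column, value)
        if left.shape[0] == 0 or right.shape[0] == 0:  # no usable split was found
            return Counter(labels).most_common(1)[0][0]
        return {"column": column, "value": value,
                "left": build_tree(left, depth + 1, max_depth),
                "right": build_tree(right, depth + 1, max_depth)}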

  2. Regression trees:

    Decision trees can also be used for regression prediction. This is essentially the CART model, whose loss function is the variance: the optimal split point is found by comparing every candidate split point of every attribute.
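
    Concretely, the code below minimizes the weighted within-node variance n_l * Var(y_l) + n_r * Var(y_r) over all candidate splits. As a standalone sketch of that criterion for a single (column, threshold) pair (the split_loss helper name is an assumption):

    import numpy as np

    def split_loss(data, column, threshold):
        # n * np.var(y) is the sum of squared deviations from the mean, so this
        # weighted-variance form is the usual CART regression criterion.
        left = data[data[:, column] <= threshold]
        right = data[data[:, column] > threshold]
        if left.shape[0] == 0 or right.shape[0] == 0:
            return float("inf")  # degenerate split, never chosen
        return (left.shape[0] * np.var(left[:, -1])
                + right.shape[0] * np.var(right[:, -1]))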

    from collections import Counter

    import numpy as np
    
    DEPTH = 4  # maximum depth of the tree
    
    
    class Regression_Tree():
        def __init__(self):
            self.result = []    # [column, threshold, loss] of each accepted split
            self.data_l = None  # left part of the most recent split
            self.data_r = None  # right part of the most recent split
            self.queue = []     # nodes still waiting to be split
    
        # Method 1: sort the row indices with argsort, then split positionally.
        def min_loss(self, data):
            loss = float("inf")
            length1 = data.shape[0]
            split1, split2, split3 = 0, 0, 0
            data1 = np.argsort(data, axis=0)
            for i in range(data.shape[1] - 1):
                for j in range(1, length1):
                    # only split between two distinct feature values
                    if data[data1[j, i], i] != data[data1[j - 1, i], i]:
                        # weighted variance of the two sides: n_l*Var(y_l) + n_r*Var(y_r)
                        loss1 = j * np.var(data[data1[:j, i], -1]) + (length1 - j) * np.var(data[data1[j:, i], -1])
                        if loss1 < loss:
                            loss = loss1
                            split1 = i  # column
                            split3 = j  # position in sorted order
                            split2 = data[data1[j - 1, i], i]  # threshold value
            self.result.append([split1, split2, loss])
            self.data_l = data[data1[:split3, split1], :]
            self.data_r = data[data1[split3:, split1], :]

        # Method 2: split the array with boolean indexing instead of argsort;
        # otherwise the idea is the same as in min_loss.
        def min_loss1(self, data):
            loss = float("inf")
            split1, split2 = 0, 0
            for i in range(data.shape[1] - 1):
                data_list = Counter(data[:, i])
                for key in data_list.keys():
                    data_l = data[data[:, i] <= key]
                    data_r = data[data[:, i] > key]
                    # When data_r is empty, np.var(data_r[:, -1]) is nan, so
                    # guard against empty sides before computing the loss.
                    if data_l.shape[0] and data_r.shape[0]:
                        loss1 = data_l.shape[0] * np.var(data_l[:, -1]) + data_r.shape[0] * np.var(data_r[:, -1])
                        if loss1 < loss:
                            loss = loss1
                            split1 = i
                            split2 = key
            return split1, split2, loss

    A third approach improves on the second: sort the keys of data_list and drop the largest one, so a "<=" split can never leave data_r empty and no nan appears (note that sorted(...).pop() would bind the popped element, so the pop must be a separate statement):

            for i in range(data.shape[1] - 1):
                data_list = sorted(Counter(data[:, i]))
                data_list.pop()  # drop the largest key; the rest of the loop is as in min_loss1
        def bitree(self, data1):
            self.queue.append(data1)
            for i in range(2 ** (DEPTH - 1) - 1):  # at most this many splits are needed
                if self.queue:
                    data = self.queue.pop(0)  # take the first waiting node
                    self.min_loss(data)
                    if self.data_l.shape[0] and self.data_r.shape[0]:
                        # if neither side of the split is empty, the children
                        # can in principle be split again
                        self.queue += [self.data_l, self.data_r]


    a = np.array([[5, 20, 1.1], [70, 30, 1.3], [21, 70, 1.7], [300, 60, 1.8],
                  [25, 65, 1.72], [28, 80, 1.67], [56, 70, 1.65], [30, 55, 1.62]])
    tree1 = Regression_Tree()
    tree1.bitree(a)
    print(tree1.result)
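
    Note that self.result stores the splits in breadth-first order rather than as a linked tree, so it records where the cuts happen but cannot be used for prediction directly. Below is a minimal sketch of a depth-1 regression stump built on the same loss, reusing the array a from above (the stump_predict helper is an assumption, not part of the original code):

    def stump_predict(train, x_new):
        # choose the single best split by the weighted-variance loss
        tree = Regression_Tree()
        column, value, _ = tree.min_loss1(train)
        left = train[train[:, column] <= value]
        right = train[train[:, column] > value]
        # each leaf predicts the mean target of its training samples
        if x_new[column] <= value:
            return left[:, -1].mean()
        return right[:, -1].mean()

    print(stump_predict(a, np.array([26, 60])))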
  • Original post: https://www.cnblogs.com/xuehaiwuya0000/p/11629490.html