1. Decision tree:
A decision tree is a model that analyzes data by recursively splitting it with a tree structure.
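For reference, the quantities the code below computes are the standard empirical entropy, conditional entropy, and information gain ratio (the C4.5 criterion):

$$H(D) = -\sum_k p_k \log p_k, \qquad H(D \mid A) = p\,H(D_1) + (1-p)\,H(D_2)$$

$$g_R(D, A) = \frac{H(D) - H(D \mid A)}{-p \log p - (1-p)\log(1-p)}$$

where $p = |D_1|/|D|$ is the fraction of samples falling into the left subset; this is exactly what the gain function below returns.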
from collections import Counter
import numpy as np

# data: width, height, length, label
data = np.array([[0.3, 5, 2, 0],
                 [0.4, 6, 0, 0],
                 [0.5, 6.5, 1, 1],
                 [0.6, 6, 0, 0],
                 [0.7, 9, 2, 1],
                 [0.5, 7, 1, 0],
                 [0.4, 6, 0, 0],
                 [0.6, 8.5, 0, 1],
                 [0.3, 5.5, 2, 0],
                 [0.9, 10, 0, 1],
                 [1, 12, 1, 0],
                 [0.6, 9, 1, 0]])

# split the data on column j at the given threshold value
def split(data, j, value):
    data_l = data[data[:, j] <= value]
    data_r = data[data[:, j] > value]
    return data_l, data_r

# empirical entropy of a label vector
def entropy(data):
    data_count = Counter(data)
    length = data.shape[0]
    total = 0
    for key in data_count:
        total -= data_count[key] / length * np.log(data_count[key] / length)
    return total

# information gain ratio; entropy0 is the empirical entropy H(D),
# entropy1/entropy2 are the entropies of the two subsets, p = |D1|/|D|
def gain(entropy0, entropy1, entropy2, p):
    tiaojian_entropy = p * entropy1 + (1 - p) * entropy2  # conditional entropy H(D|A)
    ratio = -(entropy0 - tiaojian_entropy) / (p * np.log(p) + (1 - p) * np.log(1 - p))
    return ratio

# evaluate every candidate split and keep the one with the largest gain ratio
def try_split(x, y):
    result = float("-inf")
    jingyan_entropy = entropy(y)  # empirical entropy of the labels
    split_colum = 0
    split_value = 0
    datax = x.argsort(axis=0)  # per-column sorted row indices
    for i in range(x.shape[1] - 1):
        for j in range(1, x.shape[0]):
            if x[datax[j, i], i] != x[datax[j - 1, i], i]:
                v = (x[datax[j, i], i] + x[datax[j - 1, i], i]) / 2  # midpoint threshold
                data_l, data_r = split(x, i, v)
                p = j / x.shape[0]  # fraction of samples in the left subset
                data_l_entropy = entropy(data_l[:, -1])
                data_r_entropy = entropy(data_r[:, -1])
                result_entropy = gain(jingyan_entropy, data_l_entropy, data_r_entropy, p)
                if result_entropy > result:
                    result = result_entropy
                    split_colum = i
                    split_value = v
    return split_colum, split_value

if __name__ == '__main__':
    split_colum, split_value = try_split(data, data[:, -1])
    print(split_colum, split_value, 111)
    data_l, data_r = split(data, split_colum, split_value)
    split_colum, split_value = try_split(data_r, data_r[:, -1])  # use the labels of data_r, not the full set
    print(split_colum, split_value, 222)
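The __main__ block above only performs two splits by hand. A minimal sketch of growing the full tree recursively with the same try_split and split helpers could look like this (build_tree and its dict node layout are my own illustration, not part of the original):

def build_tree(x, depth=0, max_depth=3):
    # hypothetical helper: recursively grow the tree with try_split/split
    labels = x[:, -1]
    # stop on a pure node or at the depth limit; predict the majority label
    if len(set(labels)) == 1 or depth >= max_depth:
        return {"leaf": Counter(labels).most_common(1)[0][0]}
    col, value = try_split(x, labels)
    data_l, data_r = split(x, col, value)
    if data_l.shape[0] == 0 or data_r.shape[0] == 0:
        return {"leaf": Counter(labels).most_common(1)[0][0]}
    return {"col": col, "value": value,
            "left": build_tree(data_l, depth + 1, max_depth),
            "right": build_tree(data_r, depth + 1, max_depth)}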
2. Regression tree:
A decision tree can also be used for regression prediction; this is the CART regression tree, whose loss function is the variance (squared error). The optimal split is found by comparing every candidate split point of every feature.
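Written out, the criterion the code minimizes is the standard CART squared-error loss over feature $j$ and threshold $s$:

$$\min_{j,s}\Big[\sum_{x_i \in R_1(j,s)} (y_i - \bar{y}_1)^2 + \sum_{x_i \in R_2(j,s)} (y_i - \bar{y}_2)^2\Big] = \min_{j,s}\big[N_1 \operatorname{Var}(R_1) + N_2 \operatorname{Var}(R_2)\big]$$

since the best constant prediction on each side is its mean; this is the `count * np.var(...)` sum that min_loss computes below.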
import numpy as np
from collections import Counter

DEEPTH = 4  # maximum tree depth

class Regression_Tree():
    def __init__(self):
        self.result = []   # one [feature, value, loss] entry per split
        self.data_l = None
        self.data_r = None
        self.queen = []    # queue of nodes waiting to be split

    # Method 1: sort row indices with argsort, then split on the sorted order
    def min_loss(self, data):
        loss = float("inf")
        length1 = data.shape[0]
        split1, split2, split3 = 0, 0, 0
        data1 = np.argsort(data, axis=0)
        for i in range(data.shape[1] - 1):
            for j in range(1, length1):
                if data[data1[j, i], -1] != data[data1[j - 1, i], -1]:
                    loss1 = j * np.var(data[data1[:j, i], -1]) + (length1 - j) * np.var(data[data1[j:, i], -1])
                    if loss1 < loss:
                        loss = loss1
                        split1 = i
                        split3 = j
                        split2 = data[data1[j - 1, i], i]
        self.result.append([split1, split2, loss])
        self.data_l = data[data1[:split3, split1], :]
        self.data_r = data[data1[split3:, split1], :]
    # Method 2: split the array with boolean indexing instead of argsort; the result is the same.
    def min_loss1(self, data):
        loss = float("inf")
        split1, split2 = 0, 0
        for i in range(data.shape[1] - 1):
            data_list = Counter(data[:, i])
            for key in data_list.keys():
                data_l = data[data[:, i] <= key]
                data_r = data[data[:, i] > key]
                # when data_r is empty, np.var(data_r[:, -1]) is nan (numpy emits a warning),
                # so skip any split that leaves one side empty
                if data_l.shape[0] and data_r.shape[0]:
                    loss1 = data_l.shape[0] * np.var(data_l[:, -1]) + data_r.shape[0] * np.var(data_r[:, -1])
                    if loss1 < loss:
                        loss = loss1
                        split1 = i
                        split2 = key
        return split1, split2, loss
Method 3 is an improvement on method 2: sort data_list to get the list of keys directly and drop the largest value, so the split can never produce an empty (nan) side. The key change is the inner loop; a complete version is sketched after the snippet below.
        for i in range(data.shape[1] - 1):
            data_list = sorted(Counter(data[:, i]))[:-1]  # sorted keys, largest dropped so data_r is never empty
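Put together, a full version of method 3 could read as follows (a sketch; the method name min_loss2 is mine, not the author's):

    # Sketch of method 3: dropping the largest key guarantees data_r is
    # never empty, so the nan check from min_loss1 is unnecessary.
    def min_loss2(self, data):
        loss = float("inf")
        split1, split2 = 0, 0
        for i in range(data.shape[1] - 1):
            for key in sorted(Counter(data[:, i]))[:-1]:
                data_l = data[data[:, i] <= key]
                data_r = data[data[:, i] > key]
                loss1 = data_l.shape[0] * np.var(data_l[:, -1]) + data_r.shape[0] * np.var(data_r[:, -1])
                if loss1 < loss:
                    loss = loss1
                    split1 = i
                    split2 = key
        return split1, split2, loss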
    def bitree(self, data1):
        self.queen.append(data1)
        for i in range(2 ** (DEEPTH - 1) - 1):  # a tree of depth DEEPTH needs at most 2^(DEEPTH-1) - 1 splits
            if self.queen:
                data = self.queen.pop(0)  # pop the oldest node (FIFO)
                self.min_loss(data)
                if self.data_l.shape[0] and self.data_r.shape[0]:
                    # if neither side is empty after the split, both children may be split further
                    self.queen += [self.data_l, self.data_r]

a = np.array([[5, 20, 1.1], [70, 30, 1.3], [21, 70, 1.7], [300, 60, 1.8],
              [25, 65, 1.72], [28, 80, 1.67], [56, 70, 1.65], [30, 55, 1.62]])
tree1 = Regression_Tree()
tree1.bitree(a)
print(tree1.result)
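Each entry of tree1.result is [feature index, split value, loss] for one node. Because the queue is processed first-in-first-out and each split appends its two children, the entries come out in breadth-first order: the first entry is the root split, followed level by level.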