"""Toy ID3-style decision tree built on Shannon entropy.

Rows of a data set are lists whose last element is the class label and whose
preceding elements are categorical feature values.
"""
import math

import numpy as np  # noqa: F401  (not used below; import kept from the original file)


def createdatabase():
    """Return the sample data set and the names of its feature columns.

    Returns:
        A ``(dataSet, labels)`` pair.  Each row of ``dataSet`` is
        ``[feature_0, feature_1, class_label]``.  ``labels`` holds one
        single-element list per feature column, wrapping that column's name.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = [['no surfacing'], ['flippers']]
    return dataSet, labels


dataSet, labels = createdatabase()
print('dataSet:', dataSet)
print()


def XN(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in ``dataSet``.

    The class label is the last element of every row.  A higher entropy means
    the classes are more mixed.  An empty data set yields ``0``.
    """
    datasize = len(dataSet)
    class_counts = {}
    for row in dataSet:
        class_counts[row[-1]] = class_counts.get(row[-1], 0) + 1

    entropy = 0
    for count in class_counts.values():
        prob = count / datasize
        entropy -= prob * math.log(prob, 2)
    return entropy


def D_split(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, minus that column.

    The input rows are not modified; every returned row is a new sequence that
    is one element shorter than the original.

    Example: on the sample data, ``D_split(dataSet, 0, 1)`` gives
    ``[[1, 'yes'], [1, 'yes'], [0, 'no']]``.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]


def chooseaxis(dataSet):
    """Return the index of the feature column that best splits ``dataSet``.

    For every feature column the weighted entropy of the partitions produced
    by splitting on that column is computed; the column with the lowest value
    wins.  Column ``0`` is returned when no column lowers the entropy of the
    unsplit data.  The chosen column and its entropy are printed as a trace.
    """
    datasize = len(dataSet)
    best_entropy = XN(dataSet)  # entropy before any split
    bestaxis = 0
    for axis in range(len(dataSet[0]) - 1):  # last column is the class label
        distinct_values = {row[axis] for row in dataSet}
        split_entropy = 0.0
        for value in distinct_values:
            partdata = D_split(dataSet, axis, value)
            weight = len(partdata) / datasize  # share of rows with this value
            split_entropy += weight * XN(partdata)
        # On the sample data: axis 0 -> 0.5509775004326937, axis 1 -> 0.8
        if split_entropy < best_entropy:
            best_entropy = split_entropy
            bestaxis = axis
    print('bestaxis:{},XN:{}'.format(bestaxis, best_entropy))
    return bestaxis


def major(classlist):
    """Return the most frequent item of ``classlist`` (majority vote).

    When several items share the highest count, the one that first appeared
    latest in ``classlist`` is returned (stable ascending sort, last element).
    """
    classcount = {}
    for item in classlist:
        classcount[item] = classcount.get(item, 0) + 1
    ranked = sorted(classcount, key=classcount.get)
    return ranked[-1]


classlist = [i[2] for i in dataSet]
print('classlist:', classlist)


def createtree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: rows of feature values followed by a class label.
        labels: one single-element list per feature column, wrapping the
            column name.  This list is NOT modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    classlist = [row[-1] for row in dataSet]
    # Leaf: every remaining row has the same class.
    if len(dataSet) == classlist.count(classlist[0]):
        return classlist[0]
    # Leaf: no feature columns left, fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return major(classlist)

    axis = chooseaxis(dataSet)
    label_choose = labels[axis]
    # Build a new list instead of `del labels[axis]` so the caller's list
    # is left untouched.
    remaining_labels = labels[:axis] + labels[axis + 1:]

    mytree = {label_choose[0]: {}}
    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # distinct value gets exactly one subtree and the key order is unchanged.
    for value in dict.fromkeys(row[axis] for row in dataSet):
        mytree[label_choose[0]][value] = createtree(
            D_split(dataSet, axis, value), remaining_labels
        )
    print("mytree: ", mytree)
    return mytree


createtree(dataSet, labels)
# Result for the sample data:
# {'no surfacing': {1: {'flippers': {1: 'yes', 0: 'no'}}, 0: 'no'}}