zoukankan      html  css  js  c++  java
  • 植树节快到了-那就种棵决策树吧

    from math import log
    import operator
    
    def calcShannonEnt(dataSet):
        """Return the Shannon entropy (base 2) of the class labels in dataSet.

        The class label of each row is its last element. An empty dataSet
        yields 0 because no class contributes to the sum.
        """
        total = len(dataSet)
        # Tally how many rows carry each class label (insertion-ordered).
        tally = {}
        for row in dataSet:
            label = row[-1]
            tally[label] = tally.get(label, 0) + 1
        entropy = 0
        for count in tally.values():
            # p is the empirical probability of this class.
            p = float(count) / total
            entropy -= p * log(p, 2)
        return entropy
    
    def createDataSet1():
        """Build the toy data set used by the demo.

        Returns:
            A ``(dataSet, labels)`` pair. Each row of ``dataSet`` is
            ``[hair, voice, sex]``; ``labels`` names the two feature columns.
        """
        featureNames = ['头发', '声音']  # names of the two feature columns
        rows = [
            ['长', '粗', '男'],
            ['短', '粗', '男'],
            ['短', '粗', '男'],
            ['长', '细', '女'],
            ['短', '细', '女'],
            ['短', '粗', '女'],
            ['长', '粗', '女'],
            ['长', '粗', '女'],
        ]
        return rows, featureNames
    
    def splitDataSet(dataSet,axis,value):
        """Select the rows whose column ``axis`` equals ``value``.

        Args:
            dataSet: iterable of rows (lists).
            axis: index of the column to test; negative indices count from
                the end of each row, as with ordinary list indexing.
            value: the value the selected column must equal.

        Returns:
            A new list of new rows, each being the matching row with the
            tested column removed. The input rows are not modified.
        """
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] != value:
                continue
            # Resolve a negative axis to its absolute position first;
            # otherwise featVec[axis+1:] wraps around (axis=-1 gives
            # featVec[0:]) and the whole row is appended again instead of
            # the column being dropped.
            pos = axis if axis >= 0 else axis + len(featVec)
            retDataSet.append(featVec[:pos] + featVec[pos + 1:])
        return retDataSet
    
    def chooseBestFeatureToSplit(dataSet):
        """Return the index of the feature with the largest information gain.

        Every column except the last (the class label) is a candidate. For
        each candidate the entropy of the partitions it induces, weighted by
        partition size, is subtracted from the entropy of the whole set.
        Returns -1 when no candidate yields a strictly positive gain.
        """
        featureCount = len(dataSet[0]) - 1
        entropyBefore = calcShannonEnt(dataSet)
        rowCount = float(len(dataSet))
        winner, winnerGain = -1, 0
        for col in range(featureCount):
            entropyAfter = 0
            for val in set(row[col] for row in dataSet):
                part = splitDataSet(dataSet, col, val)
                weight = len(part) / rowCount
                entropyAfter += weight * calcShannonEnt(part)
            gain = entropyBefore - entropyAfter
            # Strict comparison: on ties the earliest feature is kept.
            if gain > winnerGain:
                winner, winnerGain = col, gain
        return winner
    
    def majorityCnt(classList):
        """Return the most frequent class label in classList.

        For example, two '男' and one '女' yields '男'. On a tie the label
        that appears first in classList wins, because the sort is stable.
        """
        tally = {}
        for label in classList:
            tally[label] = tally.get(label, 0) + 1
        # Order labels by descending count; equal counts keep first-seen order.
        ranked = sorted(tally, key=tally.get, reverse=True)
        return ranked[0]
    
    def createTree(dataSet,labels):
        """Recursively build a decision tree as nested dictionaries.

        Args:
            dataSet: non-empty list of rows; each row holds the feature
                values followed by the class label in the last position.
            labels: feature names, one per feature column. The list is
                left untouched.

        Returns:
            A class label for a leaf, otherwise a dict of the form
            ``{feature_name: {feature_value: subtree_or_label}}``.
        """
        classList = [example[-1] for example in dataSet]
        # Every row has the same class: this node is a pure leaf.
        if classList.count(classList[0]) == len(classList):
            return classList[0]
        # Only the class column is left: fall back to a majority vote.
        if len(dataSet[0]) == 1:
            return majorityCnt(classList)
        bestFeat = chooseBestFeatureToSplit(dataSet)
        if bestFeat < 0:
            # chooseBestFeatureToSplit returns -1 when no feature gives a
            # positive information gain. Using -1 as an index would select
            # the class column itself, so stop with a majority-vote leaf.
            return majorityCnt(classList)
        bestFeatLabel = labels[bestFeat]
        # Build the reduced label list without mutating the caller's list.
        subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
        myTree = {bestFeatLabel: {}}
        for value in set(example[bestFeat] for example in dataSet):
            myTree[bestFeatLabel][value] = createTree(
                splitDataSet(dataSet, bestFeat, value), subLabels)
        return myTree
    
    
    if __name__ == '__main__':
        # Build the sample data, fit a decision tree on it, and print the tree.
        dataSet, labels = createDataSet1()
        decisionTree = createTree(dataSet, labels)
        print(decisionTree)
    
    
  • 相关阅读:
    Java中的集合类-详解
    wargames-Leviathan
    词霸阿涛的英语学习经历
    《小王子》阅读笔记
    linux的mysql密码忘了怎么办
    redis事务实现
    缓存穿透、缓存击穿、缓存雪崩
    单线程redis为什么快?
    redis和memcached的区别
    如何解决缓存污染
  • 原文地址:https://www.cnblogs.com/pteromyini/p/12510729.html
Copyright © 2011-2022 走看看