  • Zheng Jie (郑捷), "Machine Learning Algorithms: Principles and Programming Practice" — study notes (Chapter 3: The Evolution of Decision Trees) (II): C4.5

    (Continued from Chapter 3)

      3.3.1 Information Gain Ratio

      The information gain ratio is defined as:

      GainRatio(S,A) = Gain(S,A)/SplitInfo(S,A)

      where Gain(S,A) is the information gain from the ID3 algorithm, and the split information SplitInfo(S,A) measures the breadth and uniformity with which feature A partitions the sample set S:

      SplitInfo(S,A) = -Σ_{i=1}^{c} (|S_i|/|S|) * log2(|S_i|/|S|)

      where S_1 through S_c are the sample subsets formed by the c distinct values of feature A.
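
      As a quick sanity check, here is a minimal standalone sketch of the SplitInfo computation (my own illustration, not the book's listing): a feature whose four samples split into two equal subsets has SplitInfo = 1 bit, while a lopsided 3-to-1 split scores lower.

    import math

    def split_info(feature_column):
        # SplitInfo = -sum over values v of (|Sv|/|S|) * log2(|Sv|/|S|)
        n = float(len(feature_column))
        counts = [feature_column.count(v) for v in set(feature_column)]
        return -sum((c/n)*math.log(c/n,2) for c in counts)

    print split_info(["A","A","B","B"])   # 1.0   -- two equal halves
    print split_info(["A","A","A","B"])   # 0.811 -- a lopsided split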

      3.3.2 Implementing C4.5

      

    #coding:utf-8
    
    from numpy import *
    import math
    import copy
    import cPickle as pickle
    
    # Define a class, ID3DTree, to encapsulate the algorithm:
    class ID3DTree(object):
        def __init__(self):        # constructor
            self.tree    = {}      # the generated tree
            self.dataSet = []      # the data set
            self.labels  = []      # the feature labels
    
        # load the data set from a space-separated text file
        def loadDataSet(self,path,labels):
            fp           = open(path,"rb")
            content      = fp.read()
            fp.close()
            rowlist      = content.splitlines() # one record per line
            recordlist   = [row.split(" ") for row in rowlist if row.strip()]
            self.dataSet = recordlist
            self.labels  = labels
    
        # train the decision tree
        def train(self):
            labels    = copy.deepcopy(self.labels) # deep copy: buildTree deletes labels in place
            self.tree = self.buildTree(self.dataSet,labels)
    
        # 3.2.3 decision-tree main method
        # (1) build the decision tree: the main recursive routine
        def buildTree(self,dataSet,labels):
            cateList = [data[-1] for data in dataSet]  # extract the class-label column
            # termination condition 1: only one class label remains -- return it
            if cateList.count(cateList[0]) == len(cateList):
                return cateList[0]
            # termination condition 2: no features left (rows hold only the label) -- return the majority label
            if len(dataSet[0]) == 1:
                return self.maxCate(cateList)
            # core of the algorithm: find the best feature axis for this data set
            bestFeat,featValueList = self.getBestFeat(dataSet)
            bestFeatLabel = labels[bestFeat]
            tree          = {bestFeatLabel:{}}
            del(labels[bestFeat])
            # grow one subtree per distinct value of the best feature
            for value in featValueList:  # recursive growth of the tree
                subLabels = labels[:]  # label list with the consumed feature removed
                # split the data set on the best feature axis and this value
                splitDataset = self.splitDataSet(dataSet,bestFeat,value)
                subTree      = self.buildTree(splitDataset,subLabels)
                tree[bestFeatLabel][value] = subTree
            return tree
    
        # return the class label that occurs most often
        def maxCate(self,catelist):
            items = dict([(catelist.count(i),i) for i in catelist]) # count -> label
            return items[max(items.keys())]
    
        # find the best feature by C4.5's information gain ratio
        def getBestFeat(self,dataSet):
            # feature dimension = row width - 1 (the last column is the class label)
            Num_Feats    = len(dataSet[0][:-1])
            totality     = len(dataSet)
            BaseEntropy  = self.computeEntropy(dataSet)  # base entropy: Shannon entropy of the source data
            ConditionEntropy = []                        # conditional entropy per feature
            splitInfo    = []                            # for C4.5: split information per feature
            allFeatVList = []
            for f in xrange(Num_Feats):
                featList = [example[f] for example in dataSet]
                [splitI,featureValueList] = self.computeSplitInfo(featList)
                allFeatVList.append(featureValueList)
                splitInfo.append(splitI)
                resultGain = 0.0
                for value in featureValueList:
                    subSet     = self.splitDataSet(dataSet,f,value)
                    appearNum  = float(len(subSet))
                    subEntropy = self.computeEntropy(subSet)
                    resultGain += (appearNum/totality)*subEntropy
                ConditionEntropy.append(resultGain)     # total conditional entropy
            infoGainArray    = BaseEntropy*ones(Num_Feats)-array(ConditionEntropy)
            infoGainRatio    = infoGainArray/array(splitInfo) # C4.5: gain ratio = gain / split info
            # C4.5 ranks features by gain ratio (note: a feature with a single
            # distinct value has splitInfo 0 and would divide by zero here)
            bestFeatureIndex = argsort(-infoGainRatio)[0]
            return bestFeatureIndex,allFeatVList[bestFeatureIndex]
        # compute the Shannon entropy of a data set
        def computeEntropy(self,dataSet):
            datalen  = float(len(dataSet))
            cateList = [data[-1] for data in dataSet]  # class labels of the data set
            # dict with class label as key and occurrence count as value
            items    = dict([(i,cateList.count(i)) for i in cateList])
            infoEntropy = 0.0
            for key in items: # Shannon entropy: infoEntropy -= prob*log2(prob)
                prob = float(items[key])/datalen
                infoEntropy -= prob*math.log(prob,2)
            return infoEntropy
    
        # (5) split the data set: keep rows whose value on the feature axis matches,
        #     drop that column, and return the remaining data set
        def splitDataSet(self,dataSet,axis,value):
            rtnList = []
            for featVec in dataSet:
                if featVec[axis] == value:
                    rFeatVec     = featVec[:axis]     # list op: take elements 0..axis-1
                    rFeatVec.extend(featVec[axis+1:]) # list op: append the elements after the axis
                    rtnList.append(rFeatVec)
            return rtnList                            # the selected column has been removed
    
        # compute the split information of one feature column
        def computeSplitInfo(self,featureVList):
            numEntries = len(featureVList)
            featureValueSetList = list(set(featureVList))
            valueCount = [featureVList.count(featVec) for featVec in featureValueSetList]
            # calculate the Shannon entropy of the value distribution
            pList = [float(item)/numEntries for item in valueCount]
            lList = [item*math.log(item,2) for item in pList]
            splitInfo = -sum(lList)
            return splitInfo,featureValueSetList
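
      With the class in place, a typical call sequence looks like the sketch below (the file name dataset.dat and the four feature labels are placeholders, not from this excerpt; loadDataSet expects a space-separated text file whose last column is the class label):

    dtree = ID3DTree()
    dtree.loadDataSet("dataset.dat",["age","revenue","student","credit"]) # hypothetical file and labels
    dtree.train()
    print dtree.tree  # nested dicts: {feature label: {feature value: subtree or class label}}
    # the cPickle import above can persist the trained tree:
    with open("c45.tree","wb") as fp:
        pickle.dump(dtree.tree,fp)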

      
