  • Machine Learning in Action --- Decision Trees: the ID3 Algorithm

    Part 1: Implementation

    import numpy as np
    from math import log

    (1) Creating the dataset

    # Create the dataset (the toy "fish" data from Machine Learning in Action)
    def createDataSet():
        dataSet = [[1, 1],
                   [1, 1],
                   [1, 0],
                   [0, 1],
                   [0, 1]]
        labels = [1, 1, 0, 0, 0]
        features_names = ['no surfacing', 'flippers']  # feature names

        return dataSet, labels, features_names

    (2) Computing the information entropy
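
    ID3 measures the impurity of a label set with Shannon entropy, H(Y) = -Σ_v p(v)·log2 p(v), where p(v) is the fraction of samples whose label is v; calcEnt below computes exactly this.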

    # Compute the Shannon entropy of a label set
    def calcEnt(data_Y):   # expects a numpy array
        cnt = len(data_Y)
        EntVal = 0.0

        val_nums = np.unique(data_Y)
        for val in val_nums:
            num = data_Y[np.where(data_Y==val)].size  # count of samples with this label
            EntVal += num/cnt*log(num/cnt,2)
        return -EntVal
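
    As a quick sanity check (a small example of our own, not part of the original listing): the toy labels contain two 1s and three 0s, so the entropy should be -(2/5)·log2(2/5) - (3/5)·log2(3/5) ≈ 0.971 bits:

    y = np.array([1, 1, 0, 0, 0])
    print(calcEnt(y))  # expected output: 0.9709505944546686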

    (3) Choosing the best feature by information gain
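
    The information gain of feature A on dataset D is Gain(D, A) = H(D) - Σ_v (|D_v|/|D|)·H(D_v), i.e. the reduction in entropy after partitioning D by the values of A; the function below returns the index of the feature with the largest gain.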

    # Choose the best feature by information gain
    def chooseBestFeature(data_X,data_Y):
        samp_num,fea_num = data_X.shape   # number of samples and of features
        # compute the information gain of every feature
        BaseEntval = calcEnt(data_Y)
        BestFeature = -1
        BestEntGain = 0.0
        for i in range(fea_num):    # loop over the features
            newEntVal = 0.0  # conditional entropy after splitting on this feature
            val_nums = np.unique(data_X[:,i])   # distinct values of this feature
            for val in val_nums:
                new_dataY = data_Y[np.where(data_X[:, i] == val)]
                newEntVal += new_dataY.size/samp_num*calcEnt(new_dataY)
            if BaseEntval - newEntVal > BestEntGain:    # keep the feature with the largest gain
                BestEntGain = BaseEntval - newEntVal
                BestFeature = i

        return BestFeature
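
    Worked by hand on the toy data (our own check): splitting on 'no surfacing' yields label subsets [1, 1, 0] and [0, 0], so the conditional entropy is (3/5)·0.918 + (2/5)·0 ≈ 0.551 and the gain is 0.971 - 0.551 ≈ 0.420, while 'flippers' only achieves 0.971 - (4/5)·1.0 ≈ 0.171. Feature 0 should therefore be chosen first:

    X = np.array([[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]])
    Y = np.array([[1], [1], [0], [0], [0]])
    print(chooseBestFeature(X, Y))  # expected output: 0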

    (4) Splitting the data by a feature and one of its values

    # Split off the subset where the given feature takes the given value.
    # Note: the returned subset no longer contains that feature's column.
    def splitDataByFeature(data_X,data_Y,fea_idx,fea_axis,fea_val): # fea_idx keeps the original indices of the remaining features
        new_dataX = data_X[np.where(data_X[:,fea_axis]==fea_val)]
        new_dataY = data_Y[np.where(data_X[:, fea_axis] == fea_val)]

        new_dataX = np.delete(new_dataX,fea_axis,1) # delete the feature column
        new_feaIdx = np.delete(fea_idx,fea_axis,0)  # delete the matching entry of the index array
        return new_dataX,new_dataY,new_feaIdx  # new_dataY is extracted before the column is deleted, so new_dataX can end up with zero columns while new_dataY is still non-empty
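
    For illustration (our own snippet, reusing X and Y from the check above): splitting on feature 0 with value 1 keeps rows 0-2 and drops the first column:

    Xs, Ys, idxs = splitDataByFeature(X, Y, np.arange(2), 0, 1)
    # Xs   -> [[1], [1], [0]]  (only the 'flippers' column remains)
    # Ys   -> [[1], [1], [0]]
    # idxs -> [1]              (original index of the remaining feature)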

    (5) Building the decision tree

    # Recursively build the tree
    def createTree(data_X,data_Y,fea_idx):
        y_valNums = np.unique(data_Y)   # distinct label values
        if y_valNums.size == 1: # only one class left: return it as a leaf
            return int(data_Y[0])

        if data_X.shape[1] == 0:  # no features left on this path: return the majority class (see the note in splitDataByFeature)
            bestCls,bestCnt = 0,0
            for i in y_valNums:
                cnt_i = data_Y[np.where(data_Y==i)].size
                if cnt_i > bestCnt:
                    bestCnt = cnt_i
                    bestCls = i
            return bestCls

        # otherwise recurse on every value of the best feature
        BestFeature = chooseBestFeature(data_X,data_Y)
        my_tree = {fea_idx[BestFeature]:{}}
        uniFeaVals = np.unique(data_X[:,BestFeature])
        for i in uniFeaVals:
            new_dataX,new_dataY,new_feaIdx = splitDataByFeature(data_X,data_Y,fea_idx,BestFeature,i)   # split off the subset for this value; the labels are taken before the column is deleted, so data_X can become empty while data_Y is not
            my_tree[fea_idx[BestFeature]][i] = createTree(new_dataX,new_dataY,new_feaIdx)
        return my_tree
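
    Traced by hand on the toy data (our own trace; the keys are original feature indices, mapping to sub-dictionaries keyed by feature values), the recursion should yield:

    # {0: {0: 0,                    # 'no surfacing' == 0 -> not a fish
    #      1: {1: {0: 0, 1: 1}}}}   # otherwise test 'flippers'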

    (6) Making predictions

    def classify(inputTree,testVec):    # walk the tree recursively until a leaf is reached
        rootTag = list(inputTree.keys())[0]   # root key: the original index of the feature tested at this node
        FeaVal = inputTree[rootTag]     # subtree keyed by the values of that feature

        for k in FeaVal.keys():
            if k == testVec[rootTag]:
                if type(inputTree[rootTag][k]) != dict:
                    return inputTree[rootTag][k]
                return classify(inputTree[rootTag][k],testVec)
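
    Note that classify returns None when it meets a feature value never seen during training. Two hand-checked lookups of ours against the tree traced above:

    # classify(myTree, np.array([1, 1]))  -> 1 (a fish)
    # classify(myTree, np.array([0, 1]))  -> 0 (not a fish)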

    (7) Testing

    data_x,data_y,fea_names = createDataSet()
    fea_Idx = np.arange(len(fea_names))
    data_X,data_Y = np.array(data_x),np.array([data_y]).T
    myTree = createTree(data_X,data_Y,fea_Idx)
    print(myTree)
    testData = np.zeros(len(fea_names))
    for i in range(len(fea_names)):
        testData[i] = input("{}(0/1)>:".format(fea_names[i]))
    
    print(classify(myTree,testData))

    (8) Full code

    import numpy as np
    from math import log
    
    # Create the dataset (the toy "fish" data from Machine Learning in Action)
    def createDataSet():
        dataSet = [[1, 1],
                   [1, 1],
                   [1, 0],
                   [0, 1],
                   [0, 1]]
        labels = [1, 1, 0, 0, 0]
        features_names = ['no surfacing', 'flippers']  # feature names

        return dataSet, labels, features_names
    
    # Compute the Shannon entropy of a label set
    def calcEnt(data_Y):   # expects a numpy array
        cnt = len(data_Y)
        EntVal = 0.0

        val_nums = np.unique(data_Y)
        for val in val_nums:
            num = data_Y[np.where(data_Y==val)].size  # count of samples with this label
            EntVal += num/cnt*log(num/cnt,2)
        return -EntVal
    
    # Choose the best feature by information gain
    def chooseBestFeature(data_X,data_Y):
        samp_num,fea_num = data_X.shape   # number of samples and of features
        # compute the information gain of every feature
        BaseEntval = calcEnt(data_Y)
        BestFeature = -1
        BestEntGain = 0.0
        for i in range(fea_num):    # loop over the features
            newEntVal = 0.0  # conditional entropy after splitting on this feature
            val_nums = np.unique(data_X[:,i])   # distinct values of this feature
            for val in val_nums:
                new_dataY = data_Y[np.where(data_X[:, i] == val)]
                newEntVal += new_dataY.size/samp_num*calcEnt(new_dataY)
            if BaseEntval - newEntVal > BestEntGain:    # keep the feature with the largest gain
                BestEntGain = BaseEntval - newEntVal
                BestFeature = i

        return BestFeature
    
    
    # Split off the subset where the given feature takes the given value.
    # Note: the returned subset no longer contains that feature's column.
    def splitDataByFeature(data_X,data_Y,fea_idx,fea_axis,fea_val): # fea_idx keeps the original indices of the remaining features
        new_dataX = data_X[np.where(data_X[:,fea_axis]==fea_val)]
        new_dataY = data_Y[np.where(data_X[:, fea_axis] == fea_val)]

        new_dataX = np.delete(new_dataX,fea_axis,1) # delete the feature column
        new_feaIdx = np.delete(fea_idx,fea_axis,0)  # delete the matching entry of the index array
        return new_dataX,new_dataY,new_feaIdx  # new_dataY is extracted before the column is deleted, so new_dataX can end up with zero columns while new_dataY is still non-empty
    
    # Recursively build the tree
    def createTree(data_X,data_Y,fea_idx):
        y_valNums = np.unique(data_Y)   # distinct label values
        if y_valNums.size == 1: # only one class left: return it as a leaf
            return int(data_Y[0])

        if data_X.shape[1] == 0:  # no features left on this path: return the majority class (see the note in splitDataByFeature)
            bestCls,bestCnt = 0,0
            for i in y_valNums:
                cnt_i = data_Y[np.where(data_Y==i)].size
                if cnt_i > bestCnt:
                    bestCnt = cnt_i
                    bestCls = i
            return bestCls

        # otherwise recurse on every value of the best feature
        BestFeature = chooseBestFeature(data_X,data_Y)
        my_tree = {fea_idx[BestFeature]:{}}
        uniFeaVals = np.unique(data_X[:,BestFeature])
        for i in uniFeaVals:
            new_dataX,new_dataY,new_feaIdx = splitDataByFeature(data_X,data_Y,fea_idx,BestFeature,i)   # split off the subset for this value; the labels are taken before the column is deleted, so data_X can become empty while data_Y is not
            my_tree[fea_idx[BestFeature]][i] = createTree(new_dataX,new_dataY,new_feaIdx)
        return my_tree
    
    def classify(inputTree,testVec):    # walk the tree recursively until a leaf is reached
        rootTag = list(inputTree.keys())[0]   # root key: the original index of the feature tested at this node
        FeaVal = inputTree[rootTag]     # subtree keyed by the values of that feature

        for k in FeaVal.keys():
            if k == testVec[rootTag]:
                if type(inputTree[rootTag][k]) != dict:
                    return inputTree[rootTag][k]
                return classify(inputTree[rootTag][k],testVec)
    
    data_x,data_y,fea_names = createDataSet()
    fea_Idx = np.arange(len(fea_names))
    data_X,data_Y = np.array(data_x),np.array([data_y]).T
    myTree = createTree(data_X,data_Y,fea_Idx)
    print(myTree)
    testData = np.zeros(len(fea_names))
    for i in range(len(fea_names)):
        testData[i] = input("{}(0/1)>:".format(fea_names[i]))
    
    print(classify(myTree,testData))

    Part 2: Persisting the Decision Tree

    Building a decision tree is time-consuming, so we want to construct one as rarely as possible. We therefore store each tree we build and simply read it back when we need it, rather than rebuilding it every time.

    import pickle

    def storeTree(inputTree,filename):
        with open(filename,"wb") as fw:   # serialize the tree to disk
            pickle.dump(inputTree,fw)

    def getTree(filename):
        with open(filename,"rb") as fr:   # load a previously stored tree
            return pickle.load(fr)

    data_x,data_y,fea_names = createDataSet()
    fea_Idx = np.arange(len(fea_names))
    data_X,data_Y = np.array(data_x),np.array([data_y]).T
    myTree = createTree(data_X,data_Y,fea_Idx)
    print(myTree)
    storeTree(myTree,"mytreeStorage.txt")
    myTree = getTree("mytreeStorage.txt")
    
    testData = np.zeros(len(fea_names))
    for i in range(len(fea_names)):
        testData[i] = input("{}(0/1)>:".format(fea_names[i]))
    
    print(classify(myTree,testData))

    Part 3: Case Study --- Predicting Contact Lens Type

    (1) Data preprocessing: use pandas to read the string-valued file and convert it to numeric data

    def preDealData(filename):
        df = pd.read_table(filename, sep='\t', header=None)
        columns = ["age","prescript","astigmatic","tearRate"]  # feature names; alternatively df.columns = ["age","prescript","astigmatic","tearRate","Result"]   # https://zhuanlan.zhihu.com/p/60248460

        # Preprocessing: map the string-valued columns to numbers we can handle    # https://blog.csdn.net/liuweiyuxiang/article/details/78222818
        new_df = pd.DataFrame()
        for i in range(len(columns)):
            new_df[i] = pd.factorize(df[i])[0]  # factorize maps the nominal values of a Series to integers; equal values get the same code
        data_X = new_df.values
        data_Y = pd.factorize(df[df.shape[1]-1])[0] # factorize returns an ndarray
        data_Y = np.array([data_Y]).T

        return data_X,data_Y,columns
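
    For intuition, a small sketch of ours showing what pd.factorize does (integer codes follow order of first appearance):

    import pandas as pd
    codes, uniques = pd.factorize(pd.Series(["young", "pre", "young", "presbyopic"]))
    # codes   -> array([0, 1, 0, 2])
    # uniques -> Index(['young', 'pre', 'presbyopic'], dtype='object')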

    (2) Full code

    import numpy as np
    from math import log
    import pandas as pd
    import pickle
    
    # Create the dataset (the toy "fish" data from Machine Learning in Action)
    def createDataSet():
        dataSet = [[1, 1],
                   [1, 1],
                   [1, 0],
                   [0, 1],
                   [0, 1]]
        labels = [1, 1, 0, 0, 0]
        features_names = ['no surfacing', 'flippers']  # feature names

        return dataSet, labels, features_names
    
    # Compute the Shannon entropy of a label set
    def calcEnt(data_Y):   # expects a numpy array
        cnt = len(data_Y)
        EntVal = 0.0

        val_nums = np.unique(data_Y)
        for val in val_nums:
            num = data_Y[np.where(data_Y==val)].size  # count of samples with this label
            EntVal += num/cnt*log(num/cnt,2)
        return -EntVal
    
    # Choose the best feature by information gain
    def chooseBestFeature(data_X,data_Y):
        samp_num,fea_num = data_X.shape   # number of samples and of features
        # compute the information gain of every feature
        BaseEntval = calcEnt(data_Y)
        BestFeature = -1
        BestEntGain = 0.0
        for i in range(fea_num):    # loop over the features
            newEntVal = 0.0  # conditional entropy after splitting on this feature
            val_nums = np.unique(data_X[:,i])   # distinct values of this feature
            for val in val_nums:
                new_dataY = data_Y[np.where(data_X[:, i] == val)]
                newEntVal += new_dataY.size/samp_num*calcEnt(new_dataY)
            if BaseEntval - newEntVal > BestEntGain:    # keep the feature with the largest gain
                BestEntGain = BaseEntval - newEntVal
                BestFeature = i

        return BestFeature
    
    
    # Split off the subset where the given feature takes the given value.
    # Note: the returned subset no longer contains that feature's column.
    def splitDataByFeature(data_X,data_Y,fea_idx,fea_axis,fea_val): # fea_idx keeps the original indices of the remaining features
        new_dataX = data_X[np.where(data_X[:,fea_axis]==fea_val)]
        new_dataY = data_Y[np.where(data_X[:, fea_axis] == fea_val)]

        new_dataX = np.delete(new_dataX,fea_axis,1) # delete the feature column
        new_feaIdx = np.delete(fea_idx,fea_axis,0)  # delete the matching entry of the index array
        return new_dataX,new_dataY,new_feaIdx  # new_dataY is extracted before the column is deleted, so new_dataX can end up with zero columns while new_dataY is still non-empty
    
    # Recursively build the tree
    def createTree(data_X,data_Y,fea_idx):
        y_valNums = np.unique(data_Y)   # distinct label values
        if y_valNums.size == 1: # only one class left: return it as a leaf (kept as a length-1 array here, which is why the leaves in the result below print as array([...]))
            return data_Y[0]

        if data_X.shape[1] == 0:  # no features left on this path: return the majority class (see the note in splitDataByFeature)
            bestCls,bestCnt = 0,0
            for i in y_valNums:
                cnt_i = data_Y[np.where(data_Y==i)].size
                if cnt_i > bestCnt:
                    bestCnt = cnt_i
                    bestCls = i
            return bestCls

        # otherwise recurse on every value of the best feature
        BestFeature = chooseBestFeature(data_X,data_Y)
        my_tree = {fea_idx[BestFeature]:{}}
        uniFeaVals = np.unique(data_X[:,BestFeature])
        for i in uniFeaVals:
            new_dataX,new_dataY,new_feaIdx = splitDataByFeature(data_X,data_Y,fea_idx,BestFeature,i)   # split off the subset for this value; the labels are taken before the column is deleted, so data_X can become empty while data_Y is not
            my_tree[fea_idx[BestFeature]][i] = createTree(new_dataX,new_dataY,new_feaIdx)
        return my_tree
    
    def classify(inputTree,testVec):    # walk the tree recursively until a leaf is reached
        rootTag = list(inputTree.keys())[0]   # root key: the original index of the feature tested at this node
        FeaVal = inputTree[rootTag]     # subtree keyed by the values of that feature

        for k in FeaVal.keys():
            if k == testVec[rootTag]:
                if type(inputTree[rootTag][k]) != dict:
                    return inputTree[rootTag][k]
                return classify(inputTree[rootTag][k],testVec)
    
    def storeTree(inputTree,filename):
        with open(filename,"wb") as fw:   # serialize the tree to disk
            pickle.dump(inputTree,fw)

    def getTree(filename):
        with open(filename,"rb") as fr:   # load a previously stored tree
            return pickle.load(fr)
    
    def preDealData(filename):
        df = pd.read_table(filename, sep='\t', header=None)
        columns = ["age","prescript","astigmatic","tearRate"]  # feature names; alternatively df.columns = ["age","prescript","astigmatic","tearRate","Result"]   # https://zhuanlan.zhihu.com/p/60248460

        # Preprocessing: map the string-valued columns to numbers we can handle    # https://blog.csdn.net/liuweiyuxiang/article/details/78222818
        new_df = pd.DataFrame()
        for i in range(len(columns)):
            new_df[i] = pd.factorize(df[i])[0]  # factorize maps the nominal values of a Series to integers; equal values get the same code
        data_X = new_df.values
        data_Y = pd.factorize(df[df.shape[1]-1])[0] # factorize returns an ndarray
        data_Y = np.array([data_Y]).T

        return data_X,data_Y,columns
    
    
    data_X,data_Y,fea_names = preDealData("lenses.txt")
    fea_Idx = np.arange(len(fea_names))
    myTree = createTree(data_X,data_Y,fea_Idx)
    print(myTree)

    (3) Result

    {3: {
            0: array([0], dtype=int64), 
            1: {
                2: {
                    0: {
                        0: {
                            0: array([1], dtype=int64), 
                            1: array([1], dtype=int64), 
                            2: {
                                1: {
                                    0: array([0], dtype=int64), 
                                    1: array([1], dtype=int64)
                                    }
                                }
                            }
                        }, 
                    1: {
                        1: {
                            0: array([2], dtype=int64), 
                            1: {
                                0: {
                                    0: array([2], dtype=int64), 
                                    1: array([0], dtype=int64), 
                                    2: array([0], dtype=int64)
                                    }
                                }
                            }
                        }
                    }
                }
        }
    }

    Substituting the original feature names and values, the tree corresponds to:

    ["age","prescript","astigmatic","tearRate"]
    {tearRate: {
            reduced: array([0], dtype=int64), 
            normal:{
                astigmatic: {
                    no: {
                        age: {
                            young: array([1], dtype=int64), ---> soft
                            pre: array([1], dtype=int64), ---> soft
                            presbyopic: {
                                prescript: {
                                    myope: array([0], dtype=int64), ---> no lenses
                                    hyper: array([1], dtype=int64) ---> soft
                                    }
                                }
                            }
                        }, 
                    yes: {
                        prescript: {
                            myope: array([2], dtype=int64), ---> hard
                            hyper: {
                                age: {
                                    young: array([2], dtype=int64), ---> hard
                                    pre: array([0], dtype=int64), ---> no lenses
                                    presbyopic: array([0], dtype=int64) ---> no lenses
                                    }
                                }
                            }
                        }
                    }
                }
        }
    }