zoukankan      html  css  js  c++  java
  • python实现ID3

    # -*- coding: utf-8 -*-
    # Compute the (negated) entropy of the class labels in a data subset
    import numpy as np
    def H(tdata):
        """Return the negated Shannon entropy of the class column of *tdata*.

        The class labels are taken from the last column of the frame. The
        value returned is ``sum(p * log2(p))`` over the relative frequency
        ``p`` of each label, i.e. a number <= 0; callers negate it to obtain
        the entropy.

        Args:
            tdata: pandas DataFrame whose last column holds the class labels.

        Returns:
            ``sum(p * log2(p))`` over the label frequencies (0 for an empty
            frame or a frame with a single distinct label).
        """
        # ``DataFrame.ix`` was removed in pandas 1.0; select the last column
        # by position with ``iloc`` instead.
        labels = tdata.iloc[:, -1]
        total = len(labels)
        result = 0
        for count in labels.value_counts():
            p = count / total
            result = result + p * np.log2(p)
        print('H')
        return result
            
    # Conditional entropy of the class column given one attribute
    def tiaojiandi(dataset,T):
        """Return the conditional entropy of the class column given column *T*.

        The data set is partitioned by each distinct value of ``dataset[T]``;
        each partition contributes ``H(partition)`` weighted by its share of
        the rows. ``H`` yields a negated entropy, so the weighted sum is
        negated before it is returned.

        Args:
            dataset: pandas DataFrame containing column *T*.
            T: label of the column to partition on.

        Returns:
            The negated weighted sum of ``H`` over the partitions.
        """
        column = dataset[T]
        total = len(dataset)
        # One sub-frame per distinct value, in order of first appearance.
        parts = (dataset[column == level] for level in column.unique())
        weighted = sum(len(part) / total * H(part) for part in parts)
        print('tiaojiandi')
        return -weighted
    
    
    # Pick the attribute with the largest information gain
    def maxgain(dataset):
        """Return the feature column with the smallest conditional entropy.

        Every column except the last is treated as a candidate feature.
        Choosing the minimum of ``tiaojiandi`` over the candidates selects
        the attribute this script regards as having maximal information gain.

        Args:
            dataset: pandas DataFrame; the last column is not a candidate.

        Returns:
            The label of the selected feature column (the first one on ties).
        """
        candidates = list(dataset.columns[:-1])
        scores = np.array([tiaojiandi(dataset, name) for name in candidates])
        print('maxgain')
        return candidates[scores.argmin()]
    
    
    # Once an attribute is chosen, split the data set on one of its values
    def split(dataset, feature, value):
        """Return the rows where ``dataset[feature] == value``, minus that column.

        Args:
            dataset: pandas DataFrame to filter; it is not modified.
            feature: label of the column to filter on and then remove.
            value: value the column must equal for a row to be kept.

        Returns:
            A new DataFrame with the matching rows and without *feature*.
        """
        matching_rows = dataset.loc[dataset[feature] == value]
        subset = matching_rows.drop(columns=feature)
        print('split')
        return subset
    
    # When no attributes remain, the majority class becomes the leaf
    def classfiy(C):
        """Return the most frequent label in *C* as a string.

        Args:
            C: pandas Series of class labels.

        Returns:
            ``str`` of the label with the highest count.

        Raises:
            IndexError: if *C* contains no labels.
        """
        # value_counts() orders labels by descending frequency, so the first
        # index entry is the majority class. (Sorting by index and taking the
        # last entry would return the largest label, not the most common one.)
        counts = C.value_counts()
        print('classfiy')
        return str(counts.index[0])
    
    # Build the decision tree
    def decision_tree(dataset):
        """Recursively build an ID3 decision tree from *dataset*.

        The last column holds the class labels; all other columns are
        features.

        Args:
            dataset: non-empty pandas DataFrame of features plus a final
                class column.

        Returns:
            A class label when the node is a leaf, otherwise a nested dict of
            the form ``{feature: {value: subtree_or_label, ...}}``.

        Raises:
            IndexError: if *dataset* has no rows.
        """
        # ``DataFrame.ix`` was removed in pandas 1.0; use positional ``iloc``.
        class_column = dataset.iloc[:, -1]
        features = list(dataset.columns[:-1])
        labels = list(class_column)
        # All rows share one label: this node is a pure leaf.
        if labels.count(labels[0]) == len(labels):
            return labels[0]
        # No features left to split on: fall back to the majority label.
        if not features:
            return classfiy(class_column)
        feature = maxgain(dataset)
        tree = {feature: {}}
        for value in dataset[feature].unique():
            print('ok')
            newdata = split(dataset, feature, value)
            tree[feature][value] = decision_tree(newdata)
        return tree
    
    
    
    import pandas as pd
    # Load the training set and build the tree from it.
    # The path separators had been lost (and "\t" had turned into a tab), so the
    # Windows path is restored here; it is still machine-specific.
    train = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\train.csv')
    tree = decision_tree(train)
    
    # Predict labels for a test set
    def predict(tree,test):
        """Predict a class label for every row of *test* by walking *tree*.

        Args:
            tree: a leaf label, or a nested dict of the form
                ``{feature: {value: subtree_or_label}}`` as produced by
                ``decision_tree``.
            test: pandas DataFrame containing the feature columns named in
                *tree*.

        Returns:
            A list with one predicted label per row, in row order. The list
            is also printed.

        Raises:
            KeyError: if a row holds a feature value that has no branch in
                the tree, or lacks a feature column the tree tests.
        """
        result = []
        # Whole rows are converted to dicts, so the number and position of
        # feature columns is not hard-coded.
        for record in test.to_dict(orient='records'):
            # Start every row at the root; descending on the ``tree`` name
            # itself would leave later rows stuck at the first row's leaf.
            node = tree
            while isinstance(node, dict):
                key = next(iter(node))
                node = node[key][record[key]]
            result.append(node)
        print(result)
        return result
    
    # Compute the accuracy
    def pinggu(tree, test):
        """Return the fraction of rows in *test* whose prediction matches 'Play'.

        The predictions are stored on *test* in a new ``'result'`` column, so
        the caller's DataFrame is modified.

        Args:
            tree: decision tree passed through to ``predict``.
            test: pandas DataFrame with a ``'Play'`` column of true labels.

        Returns:
            Number of rows where ``'Play'`` equals ``'result'`` divided by the
            total number of rows.
        """
        test['result'] = predict(tree, test)
        correct_rows = test.loc[test['Play'] == test['result']]
        return len(correct_rows) / len(test)
    
    # Load the test set and score the tree on it.
    # The path separators had been lost (and "\t" had turned into a tab), so the
    # Windows path is restored here; it is still machine-specific.
    test = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\test.csv')
    accuary = pinggu(tree,test)
    
    
       
  • 相关阅读:
    典型相关性分析(刷题)
    轻音少女美图分享
    动漫美景
    linux下安装redis(全操作)
    前端限制对后端的请求频率
    idea自定义java方法的注释模板
    sql 语句,判断某个值在某个字段中是否存在,存在返回1,不存在返回0
    Error: Module not specified
    解决sql语句中DISTINCT和order by的冲突
    将后端传来的数据放入ul中
  • 原文地址:https://www.cnblogs.com/chenyaling/p/7234997.html
Copyright © 2011-2022 走看看