zoukankan      html  css  js  c++  java
  • 决策树-缺失值处理

    缺失值算是决策树里处理起来比较麻烦的了,其他简单的我就不发布了。

    # encoding:utf-8
    from __future__ import division
    __author__ = 'HP'
    import copy
    import math
    import numpy as np
    import pandas as pd
    from collections import Counter
    from sklearn.preprocessing import LabelEncoder
    
    ################################
    # id3
    # 离散属性
    # 多分类
    # 多重字典记录学习规则
    
    # 非递归
    
    # 深度优先
    
    # 预剪枝
    
    ### 缺失值处理
        # 解决两个问题
            # 如何进行划分属性选择,缺失值如何处理
            # 如何进行样本划分,缺失值对应的样本如何划分
    ################################
    
    ''' 缺失值处理
    1. 如何进行属性选择
        a. 第一次选择划分属性时,样本等权重,均为1,找出未缺失的样本集,计算该样本集的信息增益 和 该样本集的占比,两者相乘即为真正的信息增益
            . 注意这时计算占比,就是数个数,因为权重都是1
            . 计算信息增益时,P也是数个数
        b. 后面选择划分属性时,样本不等权重,找出未缺失的样本集,计算该样本集的信息增益 和 该样本集的占比,两者相乘即为真正的信息增益
            . 此时样本权重不全为1
            . 计算占比时不是数个数,而是求权重和
            . 计算信息增益的P时,也是求权重和
    2. 如何划分节点
        a. 未缺失按照正常方法划分,权重都为1
        b. 属性值缺失的样本同时划入所有子集,其权重不再为1,而是该子集对应的属性值在未缺失样本集中所占的比例
    '''
    
    
    def mydata():
        """Load ``xg3.txt`` and return it as a label-encoded NumPy array.

        The file is read as GBK-encoded CSV with its first column used as the
        index.  The remaining columns are renamed to ``0..8``; columns
        0-5 and 8 are label-encoded into integer codes, the others are returned
        as read.

        Returns:
            numpy.ndarray: ``data.values`` of the encoded frame.
        """
        data = pd.read_csv('xg3.txt', index_col=[0], encoding='gbk')

        # Strip surrounding whitespace from the last column.  Address it by
        # position with ``iloc``: integer keys such as ``-1`` are not column
        # labels here, and positional fallback for them is gone in current pandas.
        data.iloc[:, -1] = data.iloc[:, -1].str.strip()

        # Replace the original header with positional integer labels.
        data.columns = range(9)

        encode_str = LabelEncoder()

        # Columns holding string categories (the last one is the class label).
        str_cols = [0, 1, 2, 3, 4, 5, 8]
        for i in str_cols:
            # LabelEncoder expects a 1-D sequence, so pass the column as a Series.
            data[i] = encode_str.fit_transform(data[i])
        return data.values
    
    def get_label(labels):
        """Return the label that occurs most often in ``labels``.

        When several labels share the highest count, the one the ``Counter``
        yields first wins.  An empty input gives ``None``.
        """
        tally = Counter(labels)
        if not tally:
            return None
        # ``max`` keeps the first maximal element, matching a strict ">" scan.
        return max(tally, key=tally.get)
    
    def entropy(attr):
        """Return the Shannon entropy (base 2) of the values in ``attr``.

        Every element counts equally; the probability of a value is its
        frequency divided by ``len(attr)``.  An empty input yields ``0``.
        """
        total = len(attr)
        probabilities = (occurrences / total for occurrences in Counter(attr).values())
        return sum(-p * math.log(p, 2) for p in probabilities)
    
    def gain_queshi_equal_weight(attr, label):
        """Information gain of an attribute with missing values, equal weights.

        Used for the first split, where every sample has weight 1.  The gain is
        computed on the samples whose attribute value is present and then scaled
        by the fraction of samples that have a value.

        Args:
            attr: 1-D float array of attribute values; missing entries are NaN.
            label: 1-D array of class labels aligned with ``attr``.

        Returns:
            float: ``rho * (H(labels) - sum_v p_v * H(labels | attr == v))`` where
            ``rho`` is the fraction of non-missing samples.  ``0.0`` when no
            sample has a value.
        """
        # Detect missing values via NaN rather than ``attr >= 0`` so that
        # legitimate negative attribute values are not mistaken for missing ones.
        present = ~np.isnan(attr)

        # Attribute values and labels of the non-missing samples.
        attr_new = attr[present]
        label_new = label[present]

        # Number of non-missing samples.
        count_nonan = label_new.shape[0]
        if count_nonan == 0:
            # Nothing to split on (also covers an empty input).
            return 0.0

        # Fraction of samples whose attribute value is present.
        zhanbi = count_nonan / attr.shape[0]

        # Entropy of the labels before splitting (non-missing samples only).
        ori_entropy = entropy(label_new)

        # Conditional entropy after splitting on the attribute.
        new_entropy = 0
        for key, count in Counter(attr_new).items():
            # Share of value ``key`` among non-missing samples times the entropy
            # of the labels in that subset.
            new_entropy += count / count_nonan * entropy(label_new[attr_new == key])

        # Information gain, discounted by the non-missing fraction.
        return zhanbi * (ori_entropy - new_entropy)
    
    def split_node_queshi(node, attr_split):
        """Split a sample set on an attribute that may contain missing values.

        Samples with a value for the attribute go to the child of that value
        with weight 1.  Samples whose value is missing (NaN) are appended to
        every child, weighted by that child's share of the non-missing samples.

        Args:
            node: 2-D float array, one sample per row; missing entries are NaN.
            attr_split: Column index of the attribute to split on.

        Returns:
            list: One ``[key, samples, weights]`` entry per distinct attribute
            value, in ascending order of ``key``.  ``samples`` holds the rows
            with that value followed by the missing-value rows; ``weights`` is
            the matching 1-D array.  Empty when no sample has a value.
        """
        column = node[:, attr_split]

        # Use a NaN mask for both subsets.  Testing ``column >= 0`` would drop
        # rows with negative attribute values from every child.
        missing_mask = np.isnan(column)
        present_rows = node[~missing_mask]
        missing_rows = node[missing_mask]

        # Number of samples that have a value for this attribute.
        n_present = present_rows.shape[0]
        present_values = present_rows[:, attr_split]

        split = []
        for key in np.unique(present_values):
            child = present_rows[present_values == key]

            # Share of value ``key`` among non-missing samples (rounded to three
            # decimals, so the shares may not add up to exactly 1).
            share = round(len(child) / n_present, 3)

            # Present samples get weight 1, missing samples get the share.
            weight = [1] * len(child) + [share] * len(missing_rows)

            split.append([key, np.vstack((child, missing_rows)), np.array(weight)])
        return split
    
    def entropy_no_equal_weight(attr, weight):
        """Return the base-2 entropy of ``attr`` under per-sample weights.

        The probability of each distinct value is the summed weight of the
        samples carrying it divided by the total weight.  With no samples the
        result is ``0``.
        """
        total_weight = np.sum(weight)
        result = 0
        for value in Counter(attr):
            positions = np.where(attr == value)
            share = np.sum(weight[positions]) / total_weight
            result = result + (-share) * math.log(share, 2)
        return result
    
    def gain_queshi_no_equal_weight(attr, weight, label):
        """Information gain of an attribute with missing values, weighted samples.

        Used after the first split, when samples that were distributed over
        several children carry fractional weights.  All proportions are
        computed from summed weights rather than from sample counts.

        Args:
            attr: 1-D float array of attribute values; missing entries are NaN.
            weight: 1-D array of sample weights aligned with ``attr``.
            label: 1-D array of class labels aligned with ``attr``.

        Returns:
            float: ``rho * (H(labels) - sum_v r_v * H(labels | attr == v))`` where
            ``rho`` is the weight share of non-missing samples and ``r_v`` the
            weight share of value ``v`` among them.  ``0.0`` when the
            non-missing samples carry no weight.
        """
        # Detect missing values via NaN rather than ``attr >= 0`` so that
        # legitimate negative attribute values are not mistaken for missing ones.
        present = ~np.isnan(attr)

        # Attribute values, labels and weights of the non-missing samples.
        attr_new = attr[present]
        label_new = label[present]
        weight_new = weight[present]

        weight_present = np.sum(weight_new)
        if weight_present == 0:
            # No usable samples for this attribute.
            return 0.0

        # Weight share of the non-missing samples.
        zhanbi = weight_present / np.sum(weight)

        # Weighted entropy of the labels before splitting.
        ori_entropy = entropy_no_equal_weight(label_new, weight_new)

        # Weighted conditional entropy after splitting on the attribute.
        new_entropy = 0
        for key in Counter(attr_new):
            mask = attr_new == key
            label_key = label_new[mask]
            weight_key = weight_new[mask]
            # Each branch counts by its share of the weight, not of the row
            # count: rows with fractional weight must contribute fractionally.
            branch_share = np.sum(weight_key) / weight_present
            new_entropy += branch_share * entropy_no_equal_weight(label_key, weight_key)

        # Information gain, discounted by the non-missing weight share.
        return zhanbi * (ori_entropy - new_entropy)
    
    
    if __name__ == '__main__':
        data = mydata()
        # Keep only the discrete attributes plus the class label (last column).
        data = data[:, [0, 1, 2, 3, 4, 5, 8]]

        # Inject missing values into attribute columns 0 and 3 for the demo.
        data[0, 0] = np.nan
        data[4, 0] = np.nan
        data[12, 0] = np.nan
        data[7, 3] = np.nan
        data[9, 3] = np.nan
        print(data)

        # Information gain with missing values, all samples weighted equally.
        # print(...) with a single argument behaves the same on Python 2 and 3.
        for i in range(data.shape[1]):
            print(gain_queshi_equal_weight(data[:, i], data[:, -1]))

        # Split the samples on attribute 3, which contains missing values.
        split = split_node_queshi(data, 3)
        print(split)

        # Information gain with missing values, samples weighted unequally.
        # Alternative call with a hand-written weight vector:
        # weight = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3, 1/3])
        # gain_queshi_no_equal_weight(data[:,0], weight, data[:,-1])

        # Example: gain of attribute 0 (colour) inside the third child of the split.
        gain = gain_queshi_no_equal_weight(split[2][1][:, 0], split[2][2], split[2][1][:, -1])
        print(gain)
  • 相关阅读:
    数据库中Schema(模式)概念的理解
    git错误处理
    mysql存储过程
    bunyan
    golang 小问题
    操作系统
    数据库优化
    内存控制
    MySQL优化2
    mysql优化1
  • 原文地址:https://www.cnblogs.com/yanshw/p/10451983.html
Copyright © 2011-2022 走看看