zoukankan      html  css  js  c++  java
  • 有监督的卡方分箱算法

    实现代码

    import numpy as np
    import pandas as pd
    from collections import Counter
    def chimerge(data, attr, label, max_intervals):
        """Discretize a column with the supervised ChiMerge algorithm.

        Every distinct value of ``data[attr]`` starts as its own interval.
        The pair of adjacent intervals whose class distributions are most
        alike (lowest chi-square statistic) is merged, and this repeats until
        at most ``max_intervals`` intervals remain.  When several pairs share
        the lowest statistic, the left-most pair is merged.

        Args:
            data: DataFrame holding both the attribute and the label column.
            attr: Name of the column to discretize.
            label: Name of the class-label column.
            max_intervals: Upper bound on the number of intervals (>= 1).

        Returns:
            The final intervals as a list of ``[low, high]`` pairs (both
            bounds inclusive), in ascending order.  Each interval is also
            printed as ``[low,high]``, one per line.

        Raises:
            ValueError: If ``max_intervals`` is smaller than 1.
        """
        if max_intervals < 1:
            raise ValueError('max_intervals must be at least 1')

        # Tabulate class counts once: one row per distinct attribute value
        # (ascending), one column per label.  Merging two intervals is then
        # just adding two rows, so the DataFrame is never filtered again.
        # NOTE: pd.crosstab skips rows with a missing attribute or label.
        table = pd.crosstab(data[attr].to_numpy(), data[label].to_numpy())
        counts = table.to_numpy()
        intervals = [[value, value] for value in table.index.tolist()]

        while len(intervals) > max_intervals:
            # Row j of left/right describes the adjacent pair (j, j + 1).
            left, right = counts[:-1], counts[1:]
            class_totals = left + right
            left_sizes = left.sum(axis=1, keepdims=True)
            right_sizes = right.sum(axis=1, keepdims=True)
            pair_sizes = left_sizes + right_sizes

            # A label absent from both intervals has an expected count of 0
            # and yields 0/0; such cells contribute nothing to the statistic.
            with np.errstate(divide='ignore', invalid='ignore'):
                expected_left = class_totals * left_sizes / pair_sizes
                expected_right = class_totals * right_sizes / pair_sizes
                cells = ((left - expected_left) ** 2 / expected_left
                         + (right - expected_right) ** 2 / expected_right)
            chi = np.nan_to_num(cells).sum(axis=1)

            # argmin returns the first minimum, i.e. the left-most pair.
            merge_at = int(np.argmin(chi))
            intervals[merge_at:merge_at + 2] = [
                [intervals[merge_at][0], intervals[merge_at + 1][1]]
            ]
            counts = np.vstack([
                counts[:merge_at],
                counts[merge_at:merge_at + 2].sum(axis=0, keepdims=True),
                counts[merge_at + 2:],
            ])

        for low, high in intervals:
            print('[', low, ',', high, ']', sep='')
        return intervals
    

    使用例子

    # Load the UCI iris measurements; the file is read without a header row.
    IRIS_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    feature_columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w']
    iris = pd.read_csv(IRIS_URL, header=None)
    iris.columns = feature_columns + ['type']

    # Run ChiMerge on each measurement column, asking for at most 3 intervals.
    for attr in feature_columns:
        print('Interval for', attr)
        chimerge(data=iris, attr=attr, label='type', max_intervals=3)
    
    

    结果:

  • 相关阅读:
    2018——测试与信仰
    面试必备----测试用例笔试题分享
    软件测试人员必备网络知识(一):什么是cookie?
    Postman和Selenium IDE开局自带红蓝BUFF属性,就问你要还是不要
    【Loadrunner】LR参数化:利用mysql数据库里面的数据进行参数化
    因果图法设计测试用例
    场景法设计测试用例
    Linux Centos7下安装Python
    Vmware安装与VMware下Linux系统安装
    Python运算符与表达式
  • 原文地址:https://www.cnblogs.com/hichens/p/13585854.html
Copyright © 2011-2022 走看看