zoukankan      html  css  js  c++  java
  • 离散化和面元划分

    from pandas import DataFrame,Series
    import pandas as pd
    import numpy as np
    
    # Demo: discretization and binning of a list of ages with pd.cut / pd.qcut.
    # The "Expected output" comments below are the outputs recorded with the
    # original snippet; exact dtype/footer formatting varies by pandas version.
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]

    # Assign every age to one of the explicit bins; the result is a Categorical.
    cats = pd.cut(ages, bins)
    print(cats)
    # Expected output:
    # [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
    # Length: 12
    # Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

    # Integer code (index into cats.categories) of the bin each age fell into.
    print(cats.codes)
    # Expected output:
    # [0 0 0 1 0 0 2 1 3 2 2 1]

    # The bins themselves, as an IntervalIndex.
    print(cats.categories)
    # Expected output:
    # IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
    #               closed='right',
    #               dtype='interval[int64]')

    # Same meaning as mathematical interval notation: a parenthesis is an
    # exclusive bound, a square bracket an inclusive one. right=False closes
    # the left side instead of the right. The edges are shifted by one here so
    # that the sample ages still land in the same groups as above.
    print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
    # Expected output:
    # [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
    # Length: 12
    # Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

    # Custom bin names can be supplied through labels=.
    group_names = ["AAA", "BBB", "CCC", "DDD"]
    print(pd.cut(ages, bins, labels=group_names))
    # Expected output:
    # ['AAA', 'AAA', 'AAA', 'BBB', 'AAA', ..., 'BBB', 'DDD', 'CCC', 'CCC', 'BBB']
    # Length: 12
    # Categories (4, object): ['AAA' < 'BBB' < 'CCC' < 'DDD']

    # When cut is given a number of bins instead of explicit edges, it computes
    # equal-width bins from the minimum and maximum of the data.
    print(pd.cut(ages, 4))
    # Expected output:
    # [(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61.0], (40.5, 50.75], (40.5, 50.75], (30.25, 40.5]]
    # Length: 12
    # Categories (4, interval[float64]): [(19.959, 30.25] < (30.25, 40.5] < (40.5, 50.75] < (50.75, 61.0]]

    # qcut produces bins holding roughly the same number of observations.
    ages_qcut = pd.qcut(ages, 4)
    print(ages_qcut)
    # Expected output:
    # [(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0], (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0], (38.0, 61.0], (29.0, 38.0]]
    # Length: 12
    # Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] < (29.0, 38.0] < (38.0, 61.0]]

    # The top-level pd.value_counts() is deprecated (pandas >= 2.1 emits a
    # FutureWarning); wrapping in a Series and calling .value_counts() is the
    # documented replacement.
    qcut_counts = pd.Series(ages_qcut).value_counts()
    print(qcut_counts)
    # Expected output:
    # (19.999, 22.75]    3
    # (22.75, 29.0]      3
    # (29.0, 38.0]       3
    # (38.0, 61.0]       3
    # dtype: int64
  • 相关阅读:
    打印二叉树中节点的所有祖先
    1.把2叉查找树转换成双向链表
    Linux下tar.xz结尾的文件的解压方法
    Floyd算法
    c缺陷与陷阱笔记-第七章 可移植性代码
    c缺陷与陷阱笔记-第六章 预处理器
    c缺陷与陷阱笔记-第四章 连接
    C语言小程序(四)、杨辉三角
    C语言小程序(三)、判断两个日期之差
    C语言小程序(二)、计算第二天日期
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/14959191.html
Copyright © 2011-2022 走看看