zoukankan      html  css  js  c++  java
  • 离散化和面元划分

    from pandas import DataFrame,Series
    import pandas as pd
    import numpy as np
    
    # Demo: discretizing a list of ages into bins with pd.cut / pd.qcut.
    # The "Output" comments below were recorded by the original author; the
    # exact repr (dtype spelling, quoting, trailing Name/dtype line) may
    # differ between pandas versions.
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]

    # Bin each age using explicit edges; for list input the result is a
    # Categorical whose categories are the intervals.
    cats = pd.cut(ages, bins)
    print(cats)
    # Output:
    # [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
    # Length: 12
    # Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

    # Integer code of the bin each age fell into (position in cats.categories).
    print(cats.codes)
    # Output:
    # [0 0 0 1 0 0 2 1 3 2 2 1]

    # The bins themselves, as an IntervalIndex.
    print(cats.categories)
    # Output:
    # IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
    #               closed='right',
    #               dtype='interval[int64]')

    # As in mathematical interval notation, a parenthesis means the endpoint is
    # excluded and a square bracket means it is included. right=False closes
    # the left edge instead of the right one; the edges here are shifted by one
    # so these integer ages land in the same groups as above.
    print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
    # Output:
    # [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
    # Length: 12
    # Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

    # Custom bin names can be supplied through `labels`.
    group_names = ["AAA", "BBB", "CCC", "DDD"]
    print(pd.cut(ages, bins, labels=group_names))
    # Output:
    # ['AAA', 'AAA', 'AAA', 'BBB', 'AAA', ..., 'BBB', 'DDD', 'CCC', 'CCC', 'BBB']
    # Length: 12
    # Categories (4, object): ['AAA' < 'BBB' < 'CCC' < 'DDD']

    # When cut is given a number of bins instead of explicit edges, it computes
    # equal-width bins from the minimum and maximum of the data.
    print(pd.cut(ages, 4))
    # Output:
    # [(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61.0], (40.5, 50.75], (40.5, 50.75], (30.25, 40.5]]
    # Length: 12
    # Categories (4, interval[float64]): [(19.959, 30.25] < (30.25, 40.5] < (40.5, 50.75] < (50.75, 61.0]]

    # qcut bins by sample quantiles, so each bin holds roughly the same number
    # of observations.
    ages_qcut = pd.qcut(ages, 4)
    print(ages_qcut)
    # Output:
    # [(19.999, 22.75], (19.999, 22.75], (22.75, 29.0], (22.75, 29.0], (19.999, 22.75], ..., (29.0, 38.0], (38.0, 61.0], (38.0, 61.0], (38.0, 61.0], (29.0, 38.0]]
    # Length: 12
    # Categories (4, interval[float64]): [(19.999, 22.75] < (22.75, 29.0] < (29.0, 38.0] < (38.0, 61.0]]

    # Count the observations per quantile bin. The top-level pd.value_counts()
    # is deprecated (FutureWarning since pandas 2.1); wrapping the Categorical
    # in a Series and calling .value_counts() is the supported spelling.
    qcut_counts = pd.Series(ages_qcut).value_counts()
    print(qcut_counts)
    # Output:
    # (19.999, 22.75]    3
    # (22.75, 29.0]      3
    # (29.0, 38.0]       3
    # (38.0, 61.0]       3
    # dtype: int64
  • 相关阅读:
    Android Studio在android Emulator中运行的项目黑屏
    【.NET开发福音】使用Visual Studio将JSON格式数据自动转化为对应的类
    ASP.NET Core获取请求完整的Url
    解决Cannot find module '@angular/compiler-cli'
    必备三件套:xshell6+xftp6+navicat
    关于bertTokenizer
    关于warm up(transformers.get_linear_schedule_with_warmup)
    一文弄懂pytorch搭建网络流程+多分类评价指标
    python实现多分类评价指标
    如何使用flask将模型部署为服务
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/14959191.html
Copyright © 2011-2022 走看看