zoukankan      html  css  js  c++  java
  • pandas高级操作总结

    1.pandas中的列的分位数

    # Quantile bins of a column
    import numpy as np
    import pandas as pd

    # cast the column to float before binning
    my_df['col'] = my_df['col'].astype(np.float64)

    # computations for 4 quantiles : quartiles
    # every value is replaced by the quantile interval it falls into
    bins_col = pd.qcut(my_df['col'], 4)
    # integer code of the bin of every value (-1 marks a missing value);
    # `.cat.codes` replaces the `.labels` attribute that current pandas no
    # longer provides, and reuses the bins computed above
    bins_col_label = bins_col.cat.codes
    分位数

    2.多重聚合(组函数)

    # Multiple aggregations (group functions)
    # columns settings
    grouped_on = 'col_0'  # ['col_0', 'col_2'] for multiple columns
    aggregated_column = 'col_1'

    ### Choice of aggregate functions
    ## On non-NA values in the group
    ## - numeric choice :: mean, median, sum, std, var, min, max, prod
    ## - group choice :: first, last, count
    # list of functions to compute
    agg_funcs = ['mean', 'max']

    # compute aggregate values: one row per group, one column per function
    # (uses `aggregated_column`, the name actually defined above)
    aggregated_values = my_df.groupby(grouped_on)[aggregated_column].agg(agg_funcs)

    # get the aggregate of group; `group` must hold one value of the grouping
    # key and be defined beforehand. `.loc` replaces the removed `.ix` indexer.
    aggregated_values.loc[group]
    多重聚合

    3.使用自定义函数进行聚合

    # Aggregate with a user-defined function
    # columns settings (lists, so several columns can be given)
    grouped_on = ['col_0']
    aggregated_columns = ['col_1']
    
    def my_func(my_group_array):
        """Return the minimum of the group multiplied by its ``count()``."""
        smallest = my_group_array.min()
        value_count = my_group_array.count()
        return smallest * value_count
    
    ## list of functions to compute
    # the list holds the function object itself, not its name
    agg_funcs = [my_func] # could be many
    
    # compute aggregate values
    # every function of agg_funcs is applied to the selected columns of each group
    aggregated_values = my_df.groupby(grouped_on)[aggregated_columns].agg(agg_funcs)
    自定义函数进行聚合

    4.在聚合的dataframe上使用apply

    在聚合中使用apply
    # Use apply on a grouped dataframe
    # top n in aggregate dataframe
    def top_n(group_df, col, n=2):
        """Return the first ``n`` entries of ``group_df[col].value_counts()``."""
        frequencies = group_df[col].value_counts()
        return frequencies.iloc[:n]
    
    # columns settings
    grouped_on = 'col_0'
    aggregated_column = 'col'
    
    # split the frame by the grouping key
    grouped = my_df.groupby(grouped_on)
    # call top_n on every group; the extra arguments (column name and n) are
    # passed on to top_n
    groups_top_n = grouped.apply(top_n, aggregated_column, n=3)

    5.移动平均

    # Moving average (cumulative-sum method)
    # X and w must be defined before this snippet runs (see the notes at the end)
    import numpy as np
    
    # running total of X, accumulated as floats
    ret = np.cumsum(np.array(X), dtype=float)
    # subtract the running total from w positions earlier, so that from index w
    # on every entry holds the sum of the w values ending at that position
    ret[w:] = ret[w:] - ret[:-w]
    # drop the first w - 1 entries (incomplete windows) and turn sums into means
    result = ret[w - 1:] / w
    
    # X: array-like
    # w (the window length): int
    移动平均

    6.组数据的基本信息

    # Basic information about grouped data
    # columns settings
    grouped_on = 'col_0'  # ['col_0', 'col_1'] for multiple columns
    aggregated_column = 'col_1'

    ### Choice of aggregate functions
    ## On non-NA values in the group
    ## - numeric choice : mean, median, sum, std, var, min, max, prod
    ## - group choice : first, last, count
    ## On the group lines
    ## - size of the group : size
    # one value per group: the mean of `aggregated_column`
    aggregated_values = my_df.groupby(grouped_on)[aggregated_column].mean()
    aggregated_values.name = 'mean'

    # get the aggregate of group; `group` must hold one value of the grouping
    # key and be defined beforehand. `.loc` replaces the removed `.ix` indexer.
    aggregated_values.loc[group]
    组数据的基本信息

    7.数据组的遍历

    数据组的遍历
    # Iterate over the groups of a dataframe
    from itertools import islice

    # columns settings
    grouped_on = 'col_0'  # ['col_0', 'col_1'] for multiple columns

    grouped = my_df.groupby(grouped_on)

    # preview only the first groups instead of walking the whole frame
    # (11 is the number of groups the original counter loop printed)
    max_groups = 11
    for i, (group_name, group_dataframe) in enumerate(islice(grouped, max_groups), start=1):
        # numeric_only=True restricts the mean to the numerical columns;
        # without it pandas 2.x raises as soon as the frame has a text column
        print(i, group_name, group_dataframe.mean(numeric_only=True))

    8.最大信息系数(MIC)

    # Maximal information coefficient (MIC) of the first column of X against
    # every other column
    # Expected to be defined beforehand:
    #   X: array-like, one variable per column
    #   MINE: presumably minepy.MINE (`from minepy import MINE`) -- confirm
    import numpy as np

    # after the transpose every row of `matrix` is one column of X
    matrix = np.transpose(np.array(X)).astype(float)
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mic_result = []
    for other_column in matrix[1:]:
        # score the first column against the current one, then read the MIC
        mine.compute_score(matrix[0], other_column)
        mic_result.append(mine.mic())
    # mic_result[k] is the score between column 0 and column k + 1 of X.
    # The scores stay in `mic_result`: a bare `return` is not valid outside a
    # function.
    最大信息系数(MIC)

    9.pearson相关系数

    import numpy as np

    # Pearson correlation coefficient between the first two columns of X
    # X: array-like
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.corrcoef.html
    # transposing makes every row of `matrix` one column of X
    matrix = np.array(X).T
    # corrcoef returns a 2x2 matrix; the off-diagonal entry is the coefficient
    np.corrcoef(matrix[0], matrix[1])[0, 1]
    pearson相关系数

    10.自定义聚合函数

    # User-defined group function
    def zscore(x):
        """Return ``x`` centred on its mean and divided by its ``std()``."""
        centered = x - x.mean()
        return centered / x.std()
    
    # apply zscore inside every group and store the result as a new column;
    # `grouped_on` and `aggregated_column` must be defined beforehand
    my_df['zscore_col'] = my_df.groupby(grouped_on)[aggregated_column].transform(zscore)
    
    自定义聚合函数

    11.标准聚合使用groupby

    # Standard aggregation with groupby
    # columns settings
    grouped_on = 'col_1'
    aggregated_column = 'col_0'

    ### Choice of aggregate functions
    ## On non-NA values in the group
    ## - numeric choice : mean, median, sum, std, var, min, max, prod
    ## - group choice : first, last, count
    # broadcast the aggregate of every group back onto its rows; passing the
    # function name lets pandas use its built-in implementation instead of
    # calling a Python lambda once per group
    my_df['aggregate_values_on_col'] = my_df.groupby(grouped_on)[aggregated_column].transform('mean')
    
    标准聚合使用groupby

    12.使用自定义函数设值

    # Set values with a user-defined function
    def to_log(v):
        """Return the natural logarithm of ``v``, or NaN when it cannot be computed.

        NaN is returned for zero, negative numbers and values that are not
        numbers, so the function can be mapped over a whole column without
        raising.
        """
        import math  # local import: the snippet imports neither log nor numpy

        try:
            return math.log(v)
        except (TypeError, ValueError):
            # TypeError: v is not a number; ValueError: v <= 0
            return math.nan
    # call to_log on every value of col_0 and store the results in a new column
    my_df['new_col'] = my_df['col_0'].map(to_log)
    使用自定义函数设值

    13.使用复杂函数设值

    # 使用复杂的函数设值
    import numpy as np
    def complex_formula(col0_value, col1_value):
        """Return the text ``"<col0_value> (<col1_value>)"``."""
        template = "%s (%s)"
        values = (col0_value, col1_value)
        return template % values
    
    # np.vectorize wraps complex_formula so that it can be called with the two
    # whole columns; the combined values are stored in a new column
    my_df['new_col'] = np.vectorize(complex_formula)(my_df['col_0'], my_df['col_1'])
    使用复杂函数设值

    14.使用字典dict设值

    # Set values with a dict
    # lookup table: original label -> numeric code
    gender_dict={'男':1,'女':2}
    # replace every label of the column by its code from the table
    # NOTE(review): labels missing from the dict presumably become NaN -- confirm
    df['gender'] = df['gender'].map(gender_dict)
    使用字典设值
    参考信息:https://www.kesci.com/
  • 相关阅读:
    不用加减乘除做加法
    数组中只出现一次的数字
    平衡二叉树
    二叉树的深度
    两个链表的第一个公共结点
    连续子数组的最大和
    最小的K个数
    数组中出现次数超过一半的数字
    二叉搜索树与双向链表
    复杂链表的复制
  • 原文地址:https://www.cnblogs.com/jean925/p/9315291.html
Copyright © 2011-2022 走看看