zoukankan      html  css  js  c++  java
  • 【数据分析&数据挖掘】pandas分组聚合

     1 import pandas as pd
     2 import numpy as np
     3 
     4 # 加载数据
     5 detail = pd.read_excel("../day05/meal_order_detail.xlsx")
     6 print("detail: \n", detail)
     7 print("detail的列名称: \n", detail.columns)
     8 
     9 # 删除法
    10 # 先进行判断
    11 drop_list = []
    12 for column in detail.columns:
    13     # print(column)
    14     # 统计每一列非空数据的数量
    15     res = detail.loc[:, column].count()
    16     # print("res: \n", res)
    17     if res == 0:
    18         drop_list.append(column)
    19 
    20 print(drop_list)
    21 
    22 # 再进行删除:
    23 detail.drop(labels=drop_list, axis=1, inplace=True)
    24 print("删除全部为空列之后的结果: \n" ,detail.shape)
    25 print("删除全部为空列之后的结果的列名称: \n" ,detail.columns)
    26 print("^"*60)
    27 
    28 # 分组进行统计指标
    29 # 按照单列进行分组——统计菜品id的最大值
    30 res_ = detail.groupby(by="order_id")["dishes_id"].max()
    31 res_ = detail.groupby(by=detail["order_id"])["dishes_id"].max()
    32 
    33 print("res_: \n", res_)
    34 
    35 # 统计所有python班级各个小组的平均成绩
    36 df = pd.DataFrame(
    37     data={
    38         "cls_id": ["A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B","C"],
    39         "group_id": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    40         "name": ["xixi", "haha", "taotao", "huihui", "ranran", "Island", "Tree" ,"bamao", "simao", "hanhan", "qimao", "sanmao"],
    41         "score": [92, 93, 39, 89, 90.5, 80, 91, 92, 65, 73, 34.5, 56],
    42         "height": [165, 166, 167, 168, 152, 193, 192, 190, 173, 172, 170, 169]
    43     },
    44 )
    45 print("df: \n", df)
    46 
    47 # 按照班级分组,统计班级的平均分
    48 # 按照单列进行分组
    49 res = df.groupby(by="cls_id")["score"].mean()
    50 print(res)
    51 
    52 # 先按照班级分组,再统计各小组的平均成绩
    53 res = df.groupby(by=["cls_id", "group_id"])["score"].mean()
    54 print("res: \n", res)
    55 
    56 # 按照多列分组,既要统计成绩的平均值,又要统计身高的平均值
    57 res = df.groupby(by=["cls_id", "group_id"])[["score", "height"]].mean()
    58 print("res: \n", res)
    59 
    60 # 对成绩求最大值,身高求平均值
    61 # res = df.agg({"score": np.max, "height": np.mean})
    62 # print("res: \n", res)
    63 
    64 # 对不同的列求取不同的指标
    65 res = detail.agg({"counts": np.sum, "amounts": np.mean})
    66 
    67 # 对不同的列求取多个相同的指标
    68 res = detail[["counts", "amounts"]].agg([np.max, np.mean])
    69 #
    70 # # 对不同单列求取不同个数的指标
    71 res = detail.agg({"counts": [np.mean, np.max], "amounts": np.min})
    72 
    73 print("res :\n", res)
    74 
    75 # 对某列进行指定的运算
    76 res = detail[["counts", "amounts"]].apply(lambda x: x+1)
    77 res = detail[["counts", "amounts"]].transform(lambda x: x+1)
    78 # res = detail[["counts", "amounts"]].apply(lambda x, y: x+y)  # 错误的, 不能跨列运算
    79 
    80 print('detail[["counts", "amounts"]]: \n', detail[["counts", "amounts"]])
    81 print("res :\n", res)
    82 print(detail["counts"])
  • 相关阅读:
    自定义的tabBarController的几种方法
    JAVA如何把一个float四舍五入到小数点后2位,4位,或者其它指定位数.
    ALAssetsLibrary使用
    UITabBarController详解
    学习笔记:Tab Bar 控件使用详解
    iOS开发 跳转场景的三种方式
    Java中文件与字节数组转换
    'NSUnknownKeyException', reason:....etValue:forUndefinedKey:]: this class is not key value coding-compliant for the key
    开源项目
    object-c的异常处理机制
  • 原文地址:https://www.cnblogs.com/Tree0108/p/12116067.html
Copyright © 2011-2022 走看看