zoukankan      html  css  js  c++  java
  • pandas常用方法示例 2

    参考:https://mp.weixin.qq.com/s/Z1iNkI0ulstyNAeULKACFQ
    # Imports and the sample frame that the following snippets operate on.
    from pandas import DataFrame
    import numpy as np
    import pandas as pd

    # Sample records; the np.nan / None entries are missing values.
    t = {
        "age": [18, 30, np.nan, 40, np.nan, 30],
        "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen", "BeiJing", "ShangHai"],
        "sex": [None, "male", "female", "male", np.nan, "unknown"],
        "birth": ["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"],
        "score": [11, 7, 33, 44, 22, 33],
        "naem": ["sdf", "aa", "bb", "tt", "ere", "tt"],
        "name": ["sdf 11", "aa 22", "bb 33", "tt 44", "ere 55", "tt 66"],
    }
    # Rows are labelled num1..num6 rather than the default 0..5.
    df = DataFrame(t, index=["num1", "num2", "num3", "num4", "num5", "num6"])
    将第1行和第3行的 'age','name' 返回:print(df.loc[df.index[[0,2]],['age','name']])
    增加一行:df.loc['num']=[11,'shandong','male','2020-02-02',11,'sdf','sdf']
    按sex进行分组,求每组age的平均值:print(df.groupby('sex')['age'].mean())

    对列求和、平均、最大
    # Print the mean, then the sum, then the maximum of the 'age' column.
    age_col = df['age']
    print(age_col.mean())
    print(age_col.sum())
    print(age_col.max())

    按 age降序birth升序排:print(df.sort_values(by=['age', 'birth'], ascending=[False, True]))
    naem列中sdf替换成sss,ere替换成eee :print(df['naem'].replace(["sdf","ere"],['sss','eee']))
    print(df['age'].replace(18,88))
    读csv文件,指定编码,分隔符Separator:df = pd.read_csv('test.csv', encoding='gbk', sep=';')
    # Sales figures given as a mix of plain numbers and currency-decorated strings.
    d = {
        "customer": ["A", "B", "C", "D"],
        "sales": [1100, "950.5RMB", "$400", " $1250.75"],
    }

    df = pd.DataFrame(d)
    print(df)
    # Remove every '$', ',', 'R', 'M' and 'B' character (the pattern is a regex
    # character class), then convert the cleaned column to float.
    stripped_sales = df["sales"].replace("[$,RMB]", "", regex=True)
    df["sales"] = stripped_sales.astype("float")
    print(df)
    # Wide table: one row per district, one price column per fruit.
    d = {
        "district_code": [12345, 56789, 101112, 131415],
        "apple": [5.2, 2.4, 4.2, 3.6],
        "banana": [3.5, 1.9, 4.0, 2.3],
        "orange": [8.0, 7.5, 6.4, 3.9],
    }
    df = pd.DataFrame(d)
    print(df)
    print('====合并多列为一列=======')
    # Unpivot to long form: the fruit columns collapse into a
    # (fruit_name, price) pair per district.
    df = pd.melt(
        df,
        id_vars="district_code",
        var_name="fruit_name",
        value_name="price",
    )
    print(df)
    import pandas as pd
    import numpy as np
    
    # 48 rows of random integers in [1, 10) for three product columns.
    df = pd.DataFrame(
        np.random.randint(1, 10, size=(48, 3)),
        columns=['商品编码', '商品销量', '商品库存'],
    )
    # Hourly timestamps starting 2020-03-01. Lowercase 'h' is the hourly alias;
    # the uppercase 'H' spelling is deprecated in recent pandas releases.
    df.index = pd.date_range(start='2020-03-01', periods=48, freq='h')
    print(df)
    # Aggregate per calendar day with resample(); no groupby is needed.
    day_df = df.resample("D")["商品销量"].sum().to_frame()
    print(day_df)
    
                商品销量
    2020-03-01   122
    2020-03-02   129
    不是按列值做分组,而是自定义分组
    people = pd.DataFrame(
        np.random.randn(5, 5),
        columns=['a', 'b', 'c', 'd', 'e'],
        index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    )
    # Column label -> custom group name. 'f' matches no column of `people`.
    mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
    # groupby(..., axis=1) is deprecated, so group the transposed frame by its
    # row labels and transpose the summed result back to the original layout.
    by_column = people.T.groupby(mapping)
    print(by_column.sum().T)

    一列拆成多列
    # Every cell of '列2' holds a two-element list.
    df = pd.DataFrame({
        '列1': ['a', 'b', 'c'],
        '列2': [[10, 20], [20, 30], [30, 40]],
    })

    # Converting each list to a Series spreads its items over separate columns.
    df_new = df['列2'].apply(pd.Series)
    
    字符串拆成多列
    df = pd.DataFrame({
        '姓名': ['张 三', '李 四', '王 五'],
        '所在地': ['北京-东城区', '上海-黄浦区', '广州-白云区'],
    })
    # Split on the space; expand=True returns the pieces as separate columns.
    # The result is not assigned, so df itself stays unchanged.
    df['姓名'].str.split(' ', expand=True)
    根据出现频率筛选 DataFrame(挑选出现次数多的行)
    t = {
        "age": [18, 30, 3, 30, np.nan, 30],
        "city": ["BeiJing1", "ShangHai", "GuangZhou", "ShangHai", 'BeiJing', "ShangHai"],
        "sex": [None, "male", "female", "male", np.nan, "unknown"],
        "birth": ["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"],
        "score": [7, 7, 33, 44, 7, 33],
        "naem": ['sdf', 'aa', 'bb', 'aa', 'sdf', 'tt'],
        "name": ['sdf 11', 'aa 22', 'bb 33', 'tt 44', 'ere 55', 'tt 66'],
    }

    df = DataFrame(t)
    # How often each distinct score occurs.
    c = df['score'].value_counts()
    # The single most frequent score value.
    top_scores = c.nlargest(1).index
    print(top_scores)
    # Keep only the rows whose score is that most frequent value.
    print(df[df.score.isin(top_scores)])
    类型转换:
    df = pd.DataFrame({
        '列1': ['1', '2', '3'],
        '列2': ['4.4', '5.5', '6.6'],
        '列3': ['7.7', '8.8', '-'],
    })

    # Cast only the first two columns; the result is a new frame (df2) and
    # '列3' is not part of the cast.
    target_types = {'列1': 'int', '列2': 'float'}
    df2 = df.astype(target_types)
    print(df2.dtypes)
    print('==========')
    print(df.dtypes)

    用这种方式转换第三列会出错,因为这列里包含一个代表 0 的短横线“-”,pandas 无法自动判断这个短横线。为了解决这个问题,可以使用 to_numeric() 函数来处理第三列,让 pandas 把任意无效输入转为 NaN。
    # errors='coerce' turns unparseable entries into NaN; those NaNs are then
    # replaced with 0.
    coerced = df.apply(pd.to_numeric, errors='coerce')
    df = coerced.fillna(0)
    挑选指定类型的列:
    print(df.select_dtypes(include=['number','datetime']))
    排除指定类型的列:
    print(df.select_dtypes(exclude=['number']))

    多行合并为一行(按 v_id 分组后聚合)

    df = pd.DataFrame({
        'id_part': ['a', 'b', 'c', 'd'],
        'pred': [0.1, 0.2, 0.3, 0.4],
        'pred_class': ['women', 'man', 'cat', 'dog'],
        'v_id': ['d1', 'd2', 'd3', 'd1'],
    })
    print(df)
    # Per v_id: join the class labels with ', ', collect the preds into a list,
    # and keep the first id_part.
    aggregations = {
        'pred_class': [', '.join],
        'pred': lambda x: list(x),
        'id_part': 'first',
    }
    df2 = df.groupby(['v_id']).agg(aggregations).reset_index()
    print(df2)
    获取分组里最大值所在的行方法
    df = pd.DataFrame({
        'Sp': ['a', 'b', 'c', 'd', 'e', 'f'],
        'Mt': ['s1', 's1', 's2', 's2', 's2', 's3'],
        'Value': [1, 2, 3, 4, 5, 6],
        'Count': [3, 2, 5, 10, 10, 6],
    })
    # idxmax() yields index *labels*, so the rows are selected with the
    # label-based .loc rather than the position-based .iloc. Taking the 'Count'
    # column from the groupby directly also removes the per-group apply/lambda.
    df.loc[df.groupby('Mt')['Count'].idxmax()]
    
    有重复值的情况
    # NOTE(review): the original snippet referenced columns ID / score / class,
    # which the sample frame (Sp / Mt / Value / Count) does not have; the names
    # here are adapted to that frame -- confirm this matches the intent.
    # Rank Count within each Mt group, highest first. method="min" gives tied
    # rows the same rank, so every row sharing its group's maximum gets rank 1.
    df["rank"] = df.groupby("Mt")["Count"].rank(method="min", ascending=False).astype(np.int64)
    # Keep all top-ranked rows, ties included.
    df[df["rank"] == 1][["Mt", "Sp"]]
  • 相关阅读:
    赔了多少钱
    datatables使用
    Django开发汇总
    STF的DOCKER搭建
    ubuntu基本
    python 列表、元组、字典、字符串
    Appium环境搭建
    AppCrawler环境搭建
    TASK 总结
    python & jira
  • 原文地址:https://www.cnblogs.com/testzcy/p/12400834.html
Copyright © 2011-2022 走看看