参考:https://mp.weixin.qq.com/s/Z1iNkI0ulstyNAeULKACFQ
from pandas import DataFrame
import numpy as np
import pandas as pd
t={
"age": [18, 30, np.nan, 40, np.nan, 30],
"city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen", 'BeiJing', "ShangHai"],
"sex": [None, "male", "female", "male", np.nan, "unknown"],
"birth": ["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"],
"score":[11,7,33,44,22,33],
"naem":['sdf','aa','bb','tt','ere','tt'],
"name":['sdf 11','aa 22','bb 33','tt 44','ere 55','tt 66']
}
df =DataFrame(t,index=['num1','num2','num3','num4','num5','num6'])
将第1行和第2行的'age','name'返回:print(df.loc[df.index[[0,2]],['age','name']])
增加一行:df.loc['num']=[11,'shandong','male','2020-02-02',11,'sdf','sdf']
按sex进行分组,求每组age的平均值:print(df.groupby('sex')['age'].mean())
对列求和、平均、最大
print(df['age'].mean())
print(df['age'].sum())
print(df['age'].max())
按 age降序birth升序排:print(df.sort_values(by=['age', 'birth'], ascending=[False, True]))
naem列中sdf替换成sss,ere替换成eee :print(df['naem'].replace(["sdf","ere"],['sss','eee']))
print(df['age'].replace(18,88))
读csv文件,指定编码,分隔符Separator:df = pd.read_csv('test.csv', encoding='gbk, sep=';')
d = {"customer": ["A", "B", "C", "D"],
"sales":[1100, "950.5RMB", "$400", " $1250.75"]}
df = pd.DataFrame(d)
print(df)
df["sales"] = df["sales"].replace("[$,RMB]", "", regex = True).astype("float") # 清洗掉 RMB
,$
符号,转化这一列为浮点型
print(df)
d = {
"district_code": [12345, 56789, 101112, 131415],
"apple": [5.2, 2.4, 4.2, 3.6],
"banana": [3.5, 1.9, 4.0, 2.3],
"orange": [8.0, 7.5, 6.4, 3.9]
}
df = pd.DataFrame(d)
print(df)
print('====合并多列为一列=======')
df = df.melt(
id_vars = "district_code",
var_name = "fruit_name",
value_name = "price")
print(df)
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1,10,size=(48,3)),
columns = ['商品编码','商品销量','商品库存'])
df.index=pd.date_range(start='2020-03-01',periods=48,freq='H')
print(df)
#按天聚合,不用通过groupby实现
day_df = df.resample("D")["商品销量"].sum().to_frame()
print(day_df)
商品销量
2020-03-01 122
2020-03-02 129
不是按列值做分组,而是自定义分组
people = pd.DataFrame(np.random.randn(5, 5),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
mapping = {'a':'red', 'b':'red', 'c':'blue',
'd':'blue', 'e':'red', 'f':'orange'}
by_column = people.groupby(mapping, axis=1)
print(by_column.sum())
一列拆成多列
df = pd.DataFrame({'列1':['a','b','c'],'列2':[[10,20], [20,30], [30,40]]})
df_new = df.列2.apply(pd.Series)
字符串拆成多列
df = pd.DataFrame({'姓名':['张 三','李 四','王 五'],
'所在地':['北京-东城区','上海-黄浦区','广州-白云区']})
df.姓名.str.split(' ', expand=True)
根据出现频率筛选 DataFrame(挑选出现次数多的行)
t={
"age": [18, 30, 3, 30, np.nan, 30],
"city": ["BeiJing1", "ShangHai", "GuangZhou", "ShangHai", 'BeiJing', "ShangHai"],
"sex": [None, "male", "female", "male", np.nan, "unknown"],
"birth": ["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"],
"score":[7,7,33,44,7,33],
"naem":['sdf','aa','bb','aa','sdf','tt'],
"name":['sdf 11','aa 22','bb 33','tt 44','ere 55','tt 66']
}
df =DataFrame(t)
c = (df['score'].value_counts())
print(c.nlargest(1).index)
print(df[df.score.isin(c.nlargest(1).index)])
类型转换:
df = pd.DataFrame({'列1':['1','2','3'],
'列2':['4.4','5.5','6.6'],
'列3':['7.7','8.8','-']})
df2=df.astype({'列1':'int','列2':'float'})
print(df2.dtypes)
print('==========')
print(df.dtypes)
用这种方式转换第三列会出错,因为这列里包含一个代表 0 的下划线,pandas 无法自动判断这个下划线。为了解决这个问题,可以使用 to_numeric() 函数来处理第三列,让 pandas 把任意无效输入转为 NaN。
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
挑选指定类型的列:
print(df.select_dtypes(include=['number','datetime']))
排除指定类型的列:
print(df.select_dtypes(exclude=['number']))
多列合并为一行
df = pd.DataFrame({'id_part':['a','b','c','d'], 'pred':[0.1,0.2,0.3,0.4], 'pred_class':['women','man','cat','dog'], 'v_id':['d1','d2','d3','d1']})
print(df)
df2 = df.groupby(['v_id']).agg({'pred_class': [', '.join],'pred': lambda x: list(x),'id_part': 'first'}).reset_index()
print(df2)
获取分组里最大值所在的行方法
df = pd.DataFrame({'Sp':['a','b','c','d','e','f'], 'Mt':['s1', 's1', 's2','s2','s2','s3'], 'Value':[1,2,3,4,5,6], 'Count':[3,2,5,10,10,6]})
df.iloc[df.groupby(['Mt']).apply(lambda x: x['Count'].idxmax())]
有重复值的情况
df["rank"] = df.groupby("ID")["score"].rank(method="min", ascending=False).astype(np.int64)
df[df["rank"] == 1][["ID", "class"]]