- 如何得到列中前n个最大值对应的索引
# Build a 5x3 frame of random integers in [1, 15) with columns 'a', 'b', 'c'.
df = pd.DataFrame(np.random.randint(1, 15, 15).reshape(5, -1), columns=list('abc'))
print(df)

# Row positions of the n largest values in column 'a', largest first.
# argsort() yields positions in ascending value order, so reversing it puts
# the largest values first; .iloc[:n] then keeps the first n of them.
# The *values* of the result are the row positions; its index labels are
# left over from the argsort result and are not the answer.
n = 3
top_idx = df['a'].argsort()[::-1].iloc[:n]
print(top_idx)

# Sample output (the data is random, so values differ between runs):
#>     a   b   c
#> 0   5   5   2
#> 1  12   7   1
#> 2   5   2  12
#> 3   5  14  12
#> 4   1  13  13

#> 4    1
#> 3    3
#> 2    2
#> Name: a, dtype: int64
- 如何获得dataframe行的和大于100的最末n行索引
# Build a 4x4 frame of random integers in [10, 40).
df = pd.DataFrame(np.random.randint(10, 40, 16).reshape(-1, 4))
print(df)

# Sum of every row, computed with the vectorised DataFrame.sum instead of
# applying np.sum to each row separately.
rowsums = df.sum(axis=1)

# Positions of the last n rows whose sum exceeds 100.
# To get the rows themselves rather than their positions: df.iloc[nline, :]
n = 2
matches = np.where(rowsums > 100)[0]
# A plain matches[-n:] would return every match when n == 0, so guard it.
nline = matches[-n:] if n > 0 else matches[:0]
print(repr(nline))

# Sample output (the data is random, so values differ between runs):
#>     0   1   2   3
#> 0  19  34  15  12
#> 1  38  35  14  26
#> 2  39  32  18  20
#> 3  28  27  36  38

#> array([2, 3], dtype=int64)
- 如何从series中查找异常值并赋值
# 30 values spaced evenly on a log scale between 1e-2 and 1e2.
ser = pd.Series(np.logspace(-2, 2, 30))


def cap_outliers(ser, low_perc, high_perc):
    """Return a copy of ``ser`` with its values clamped to two quantiles.

    Values below the ``low_perc`` quantile are replaced by that quantile,
    and values above the ``high_perc`` quantile are replaced by that
    quantile. The input series is left unmodified.

    Args:
        ser: pandas Series to cap.
        low_perc: Lower quantile, given as a fraction (e.g. 0.05).
        high_perc: Upper quantile, given as a fraction (e.g. 0.95).

    Returns:
        A new Series with every value limited to the range [low, high].
    """
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    # clip() builds a new Series, so the caller's data is not overwritten
    # the way boolean-mask assignment on ``ser`` would.
    return ser.clip(lower=low, upper=high)


capped_ser = cap_outliers(ser, .05, .95)

#> 0.05 %ile: 0.016049294076965887 | 0.95 %ile: 63.876672220183934
- 如何交换dataframe的两行
# 3x3 frame holding the integers 0..8.
df = pd.DataFrame(np.arange(9).reshape(3, -1))
print(df)


def swap_rows(df, i1, i2):
    """Exchange the rows at positions ``i1`` and ``i2`` of ``df`` in place.

    Args:
        df: DataFrame whose rows are swapped; it is modified directly.
        i1: Integer position of the first row.
        i2: Integer position of the second row.

    Returns:
        The same ``df`` object, after the swap.
    """
    # Copy both rows first so that writing one does not disturb the other.
    first_row = df.iloc[i1, :].copy()
    second_row = df.iloc[i2, :].copy()
    df.iloc[i1, :] = second_row
    df.iloc[i2, :] = first_row
    return df


# Swap the 2nd and 3rd rows (positions 1 and 2).
print(swap_rows(df, 1, 2))

#>    0  1  2
#> 0  0  1  2
#> 1  3  4  5
#> 2  6  7  8

#>    0  1  2
#> 0  0  1  2
#> 1  6  7  8
#> 2  3  4  5
- 如何倒转dataframe的行
# 3x3 frame holding the integers 0..8.
df = pd.DataFrame(np.arange(9).reshape(3, -1))
print(df)

# Option 1: positional slice with a negative step.
# The result is neither stored nor printed here.
df.iloc[::-1]

# Option 2: select rows by label, passing the index labels in reverse order.
reversed_labels = df.index[::-1]
print(df.loc[reversed_labels, :])

#>    0  1  2
#> 0  0  1  2
#> 1  3  4  5
#> 2  6  7  8

#>    0  1  2
#> 2  6  7  8
#> 1  3  4  5
#> 0  0  1  2
- 如何对分类变量进行one-hot编码
# 5x5 frame holding the integers 0..24 with columns 'a'..'e'.
df = pd.DataFrame(np.arange(25).reshape(5, -1), columns=list('abcde'))
print(df)

# One-hot encode column 'a' and keep every other column unchanged.
# dtype=int is passed explicitly because pandas >= 2.0 returns boolean dummy
# columns by default, whereas the output below shows 0/1 integers.
dummies = pd.get_dummies(df['a'], dtype=int)
# Dropping 'a' rather than listing 'bcde' keeps this correct if the set of
# other columns changes.
df_onehot = pd.concat([dummies, df.drop(columns='a')], axis=1)
print(df_onehot)

#>     a   b   c   d   e
#> 0   0   1   2   3   4
#> 1   5   6   7   8   9
#> 2  10  11  12  13  14
#> 3  15  16  17  18  19
#> 4  20  21  22  23  24

#>    0  5  10  15  20   b   c   d   e
#> 0  1  0   0   0   0   1   2   3   4
#> 1  0  1   0   0   0   6   7   8   9
#> 2  0  0   1   0   0  11  12  13  14
#> 3  0  0   0   1   0  16  17  18  19
#> 4  0  0   0   0   1  21  22  23  24