import numpy as np
import pandas as pd
df = pd.read_csv('table.csv',index_col='ID') #用来指定表格的索引值
df.head(2)
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
| 1102 | S_1 | C_1 | F | street_2 | 192 | 73 | 32.5 | B+ |
2.排序
a)索引排序index
df.set_index('Weight').head(6)
| School | Class | Gender | Address | Height | Math | Physics | |
|---|---|---|---|---|---|---|---|
| Weight | |||||||
| 63 | S_1 | C_1 | M | street_1 | 173 | 34.0 | A+ |
| 73 | S_1 | C_1 | F | street_2 | 192 | 32.5 | B+ |
| 82 | S_1 | C_1 | M | street_2 | 186 | 87.2 | B+ |
| 81 | S_1 | C_1 | F | street_2 | 167 | 80.4 | B- |
| 64 | S_1 | C_1 | F | street_4 | 159 | 84.8 | B+ |
| 68 | S_1 | C_2 | M | street_5 | 188 | 97.0 | A- |
</div>
df.set_index('Weight').sort_index(ascending=False).head(3) #ascending=False为降序,默认是升序
| School | Class | Gender | Address | Height | Math | Physics | |
|---|---|---|---|---|---|---|---|
| Weight | |||||||
| 100 | S_2 | C_2 | M | street_5 | 193 | 39.1 | B |
| 99 | S_2 | C_3 | F | street_7 | 190 | 65.9 | C |
| 97 | S_2 | C_1 | F | street_5 | 159 | 72.2 | B+ |
b)值排序values
df.sort_values(by='Height').head(3)
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 2203 | S_2 | C_2 | M | street_4 | 155 | 91 | 73.8 | A+ |
| 2103 | S_2 | C_1 | M | street_4 | 157 | 61 | 52.5 | B- |
| 2301 | S_2 | C_3 | F | street_4 | 157 | 78 | 72.3 | B+ |
df.sort_values(by=['Height','Weight'],ascending=False).head(4) #多值的排序,先对第一次层排序,在第一层相同的情况下,再对下一层排序
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1304 | S_1 | C_3 | M | street_2 | 195 | 70 | 85.2 | A |
| 2202 | S_2 | C_2 | F | street_7 | 194 | 77 | 68.5 | B+ |
| 2201 | S_2 | C_2 | M | street_5 | 193 | 100 | 39.1 | B |
| 2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
3.重复值
a)duplicated 告诉我们是否重复,以布尔值的形式告知
df.duplicated('Address').head(6)
#True和False指的是:一个数值,自上而下,若第一次出现,为False;若之后再次出现(可多次出现),则为True
ID 1101 False 1102 False 1103 True 1104 True 1105 False 1201 False dtype: bool
df.duplicated('Class',keep='last').tail(8)
ID 2303 True 2304 True 2305 False 2401 True 2402 True 2403 True 2404 True 2405 False dtype: bool
df.tail(8)
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 2303 | S_2 | C_3 | F | street_7 | 190 | 99 | 65.9 | C |
| 2304 | S_2 | C_3 | F | street_6 | 164 | 81 | 95.5 | A- |
| 2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
| 2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
| 2402 | S_2 | C_4 | M | street_7 | 166 | 82 | 48.7 | B |
| 2403 | S_2 | C_4 | F | street_6 | 158 | 60 | 59.7 | B+ |
| 2404 | S_2 | C_4 | F | street_2 | 160 | 84 | 67.7 | B |
| 2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
b)drop_duplicates
单个字段重复值处理
df['Class'].value_counts()
C_1 10 C_3 10 C_2 10 C_4 5 Name: Class, dtype: int64
df.drop_duplicates('Class') #保留删除重复值后的唯一项
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
| 1201 | S_1 | C_2 | M | street_5 | 188 | 68 | 97.0 | A- |
| 1301 | S_1 | C_3 | M | street_4 | 161 | 68 | 31.5 | B+ |
| 2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
df.drop_duplicates('Class',keep='last') #last的意思是:保留重复值中最后出现的,first与之相反
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 2105 | S_2 | C_1 | M | street_4 | 170 | 81 | 34.2 | A |
| 2205 | S_2 | C_2 | F | street_7 | 183 | 76 | 85.4 | B |
| 2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
| 2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
</div>
多个字段重复值处理
df.drop_duplicates(['School','Class']) #多列的重复值,在这里可以联想为多层的索引
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
| 1201 | S_1 | C_2 | M | street_5 | 188 | 68 | 97.0 | A- |
| 1301 | S_1 | C_3 | M | street_4 | 161 | 68 | 31.5 | B+ |
| 2101 | S_2 | C_1 | M | street_7 | 174 | 84 | 83.3 | C |
| 2201 | S_2 | C_2 | M | street_5 | 193 | 100 | 39.1 | B |
| 2301 | S_2 | C_3 | F | street_4 | 157 | 78 | 72.3 | B+ |
| 2401 | S_2 | C_4 | F | street_2 | 192 | 62 | 45.3 | A |
</div>
df.drop_duplicates(['School','Class'],keep='last')
| School | Class | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1105 | S_1 | C_1 | F | street_4 | 159 | 64 | 84.8 | B+ |
| 1205 | S_1 | C_2 | F | street_6 | 167 | 63 | 68.4 | B- |
| 1305 | S_1 | C_3 | F | street_5 | 187 | 69 | 61.7 | B- |
| 2105 | S_2 | C_1 | M | street_4 | 170 | 81 | 34.2 | A |
| 2205 | S_2 | C_2 | F | street_7 | 183 | 76 | 85.4 | B |
| 2305 | S_2 | C_3 | M | street_4 | 187 | 73 | 48.9 | B |
| 2405 | S_2 | C_4 | F | street_6 | 193 | 54 | 47.6 | B |
4.缺失值
a)了解缺失的信息
qs_data = pd.read_csv('table_missing.csv')
qs_data.head()
| School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | S_1 | C_1 | NaN | M | street_1 | 173 | NaN | 34.0 | A+ |
| 1 | S_1 | C_1 | NaN | F | street_2 | 192 | NaN | 32.5 | B+ |
| 2 | S_1 | C_1 | 1103.0 | M | street_2 | 186 | NaN | 87.2 | B+ |
| 3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
| 4 | S_1 | C_1 | 1105.0 | NaN | street_4 | 159 | 64.0 | 84.8 | A- |
isna
qs_data['Physics'].isna().head()
0 False 1 False 2 False 3 True 4 False Name: Physics, dtype: bool
qs_data.isna().sum() #通过以上操作能得出每列有多少缺失值
School 0 Class 4 ID 6 Gender 7 Address 0 Height 0 Weight 13 Math 5 Physics 4 dtype: int64
info
qs_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 35 entries, 0 to 34 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 School 35 non-null object 1 Class 31 non-null object 2 ID 29 non-null float64 3 Gender 28 non-null object 4 Address 35 non-null object 5 Height 35 non-null int64 6 Weight 22 non-null float64 7 Math 30 non-null float64 8 Physics 31 non-null object dtypes: float64(3), int64(1), object(5) memory usage: 2.6+ KB
b)查看缺失值所在行
qs_data[qs_data['Physics'].isna()]
| School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
| 8 | S_1 | C_2 | 1204.0 | F | street_5 | 162 | 63.0 | 33.8 | NaN |
| 13 | S_1 | C_3 | 1304.0 | NaN | street_2 | 195 | 70.0 | 85.2 | NaN |
| 22 | S_2 | C_2 | 2203.0 | M | street_4 | 155 | 91.0 | 73.8 | NaN |
c)选出所有缺失或者非缺失列
qs_data[qs_data.notna().all(1)]
| School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|---|
| 5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
| 6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
| 12 | S_1 | C_3 | 1303.0 | M | street_7 | 188 | 82.0 | 49.7 | B |
| 17 | S_2 | C_1 | 2103.0 | M | street_4 | 157 | 61.0 | 52.5 | B- |
| 21 | S_2 | C_2 | 2202.0 | F | street_7 | 194 | 77.0 | 68.5 | B+ |
| 25 | S_2 | C_3 | 2301.0 | F | street_4 | 157 | 78.0 | 72.3 | B+ |
| 27 | S_2 | C_3 | 2303.0 | F | street_7 | 190 | 99.0 | 65.9 | C |
| 28 | S_2 | C_3 | 2304.0 | F | street_6 | 164 | 81.0 | 95.5 | A- |
| 29 | S_2 | C_3 | 2305.0 | M | street_4 | 187 | 73.0 | 48.9 | B |
</div>
d)数据的填充或放置或删除
fillna
qs_data['Physics'].fillna('2333').head(4)
0 A+ 1 B+ 2 B+ 3 2333 Name: Physics, dtype: object
qs_data['Physics'].fillna(method='ffill').head(4) #用缺失值所在行的上一行的值进行填充
0 A+ 1 B+ 2 B+ 3 B+ Name: Physics, dtype: object
qs_data['Physics'].fillna(method='backfill').head(4) #用缺失值所在行的下一行的值进行填充
0 A+ 1 B+ 2 B+ 3 A- Name: Physics, dtype: object
qs_data['Weight'].fillna(qs_data['Weight'].mean()).head() #填充还可以使用平均值、中位数、众数等 #前提,已知的缺失数据量不能较大
0 76.863636 1 76.863636 2 76.863636 3 81.000000 4 64.000000 Name: Weight, dtype: float64
dropna
qs_data.dropna(axis=1)
qs_data.dropna()
| School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|---|
| 5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
| 6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
| 12 | S_1 | C_3 | 1303.0 | M | street_7 | 188 | 82.0 | 49.7 | B |
| 17 | S_2 | C_1 | 2103.0 | M | street_4 | 157 | 61.0 | 52.5 | B- |
| 21 | S_2 | C_2 | 2202.0 | F | street_7 | 194 | 77.0 | 68.5 | B+ |
| 25 | S_2 | C_3 | 2301.0 | F | street_4 | 157 | 78.0 | 72.3 | B+ |
| 27 | S_2 | C_3 | 2303.0 | F | street_7 | 190 | 99.0 | 65.9 | C |
| 28 | S_2 | C_3 | 2304.0 | F | street_6 | 164 | 81.0 | 95.5 | A- |
| 29 | S_2 | C_3 | 2305.0 | M | street_4 | 187 | 73.0 | 48.9 | B |
qs_data.head(8)
| School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | S_1 | C_1 | NaN | M | street_1 | 173 | NaN | 34.0 | A+ |
| 1 | S_1 | C_1 | NaN | F | street_2 | 192 | NaN | 32.5 | B+ |
| 2 | S_1 | C_1 | 1103.0 | M | street_2 | 186 | NaN | 87.2 | B+ |
| 3 | S_1 | NaN | NaN | F | street_2 | 167 | 81.0 | 80.4 | NaN |
| 4 | S_1 | C_1 | 1105.0 | NaN | street_4 | 159 | 64.0 | 84.8 | A- |
| 5 | S_1 | C_2 | 1201.0 | M | street_5 | 159 | 68.0 | 97.0 | A- |
| 6 | S_1 | C_2 | 1202.0 | F | street_4 | 176 | 94.0 | 63.5 | B- |
| 7 | S_1 | C_2 | NaN | M | street_6 | 160 | 53.0 | 58.8 | A+ |