import numpy as np
import pandas as pd
df = pd.read_csv('table.csv',index_col='ID') #用来指定表格的索引值
df.head(2)
School | Class | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1101 | S_1 | C_1 | M | street_1 | 173 | 63 | 34.0 | A+ |
1102 | S_1 | C_1 | F | street_2 | 192 | 73 | 32.5 | B+ |
2.分组
单列分组
danlie = df.groupby('School')
danlie.sum()
Height | Weight | Math | |
---|---|---|---|
School | |||
S_1 | 2636 | 1050 | 956.2 |
S_2 | 3459 | 1563 | 1191.1 |
多列分组
duolie = df.groupby(['School','Class'])
duolie.sum()
Height | Weight | Math | ||
---|---|---|---|---|
School | Class | |||
S_1 | C_1 | 877 | 363 | 318.9 |
C_2 | 853 | 341 | 321.5 | |
C_3 | 906 | 346 | 315.8 | |
S_2 | C_1 | 821 | 384 | 292.8 |
C_2 | 900 | 418 | 314.0 | |
C_3 | 869 | 419 | 315.3 | |
C_4 | 869 | 342 | 269.0 |
提取指定字段并指定分组的列
tiqu = df['Address Math'.split()].groupby(by='Address')
tiqu.count()
Math | |
---|---|
Address | |
street_1 | 3 |
street_2 | 6 |
street_4 | 8 |
street_5 | 6 |
street_6 | 6 |
street_7 | 6 |
</div>
3.合并
sheet = pd.ExcelFile('用户借款数据.xlsx')
sheet.sheet_names
['基础信息', '就业信息', '借款信息']
hb1 = pd.read_excel('用户借款数据.xlsx',sheet_name='基础信息') hb2 = pd.read_excel('用户借款数据.xlsx',sheet_name='就业信息') hb3 = pd.read_excel('用户借款数据.xlsx',sheet_name='借款信息')
hb1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 编号 999 non-null int64 1 城市 999 non-null object 2 国家 999 non-null object 3 出生日期 999 non-null object 4 教育程度 999 non-null int64 5 性别 999 non-null int64 6 房屋所有权类型 763 non-null float64 7 婚姻状况 999 non-null int64 dtypes: float64(1), int64(4), object(3) memory usage: 62.6+ KB
hb1.head(6)
编号 | 城市 | 国家 | 出生日期 | 教育程度 | 性别 | 房屋所有权类型 | 婚姻状况 | |
---|---|---|---|---|---|---|---|---|
0 | 0 | AESPA | EE | 21/11/1975 | 4 | 0 | 6.0 | 4 |
1 | 2 | P脛RNU | EE | 25/10/1969 | 4 | 1 | 9.0 | 1 |
2 | 3 | TALLINN | EE | 22/11/1965 | 4 | 0 | 1.0 | 4 |
3 | 4 | KEHTNA | EE | 05/12/1983 | 2 | 0 | 1.0 | 3 |
4 | 5 | KIVI脮LI | EE | 13/07/1980 | 4 | 1 | 4.0 | 3 |
5 | 6 | P脛RNU | EE | 05/07/1993 | 3 | 1 | 2.0 | 3 |
</div>
hb2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 编号 999 non-null int64 1 申请签字时间 999 non-null int64 2 申请签证周 999 non-null int64 3 当前雇主的雇佣期限 926 non-null object 4 就业岗位 962 non-null object 5 就业状况 977 non-null float64 6 主要工作收入 999 non-null int64 7 收入总额 999 non-null int64 8 工作经验 999 non-null object dtypes: float64(1), int64(5), object(3) memory usage: 70.4+ KB
hb2.head(6)
编号 | 申请签字时间 | 申请签证周 | 当前雇主的雇佣期限 | 就业岗位 | 就业状况 | 主要工作收入 | 收入总额 | 工作经验 | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 12 | 4 | UpTo5Years | Worker | 3.0 | 1000 | 1000 | 15To25Years |
1 | 2 | 14 | 2 | MoreThan5Years | SpecialistOfficeWorker | 3.0 | 633 | 633 | 15To25Years |
2 | 3 | 22 | 4 | MoreThan5Years | Owner | 5.0 | 550 | 550 | MoreThan25Years |
3 | 4 | 15 | 3 | MoreThan5Years | Worker | 3.0 | 833 | 833 | 5To10Years |
4 | 5 | 10 | 3 | MoreThan5Years | Worker | 3.0 | 341 | 341 | 5To10Years |
5 | 6 | 14 | 4 | UpTo1Year | Worker | 3.0 | 430 | 430 | LessThan2Years |
hb3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 899 entries, 0 to 898 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 编号 899 non-null int64 1 状态 899 non-null object 2 金额 899 non-null float64 3 现有负债 899 non-null int64 4 贷款日期 899 non-null object 5 贷款期限 899 non-null int64 6 新信用客户 899 non-null bool 7 贷前贷款数 899 non-null int64 8 占用区 859 non-null float64 9 贷款用途 899 non-null int64 10 先前得分 763 non-null float64 11 违约 899 non-null bool dtypes: bool(2), float64(3), int64(5), object(2) memory usage: 72.1+ KB
hb3.head()
编号 | 状态 | 金额 | 现有负债 | 贷款日期 | 贷款期限 | 新信用客户 | 贷前贷款数 | 占用区 | 贷款用途 | 先前得分 | 违约 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Current | 5000.0 | 9 | 05/05/2015 | 60 | False | 1 | 8.0 | 0 | 0.0957 | False |
1 | 2 | Repaid | 530.0 | 7 | 19/10/2015 | 60 | True | 0 | 1.0 | 2 | 0.1034 | False |
2 | 3 | Current | 5500.0 | 1 | 02/09/2015 | 60 | True | 0 | 13.0 | 2 | 0.0772 | False |
3 | 4 | Repaid | 6900.0 | 10 | 20/05/2015 | 60 | False | 1 | 19.0 | 7 | 0.0773 | False |
4 | 5 | Current | 2655.0 | 6 | 29/12/2015 | 60 | True | 0 | 17.0 | 2 | 0.1898 | False |
data2 = pd.merge(hb1,hb2,how='outer')
data2
编号 | 城市 | 国家 | 出生日期 | 教育程度 | 性别 | 房屋所有权类型 | 婚姻状况 | 申请签字时间 | 申请签证周 | 当前雇主的雇佣期限 | 就业岗位 | 就业状况 | 主要工作收入 | 收入总额 | 工作经验 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | AESPA | EE | 21/11/1975 | 4 | 0 | 6.0 | 4 | 12 | 4 | UpTo5Years | Worker | 3.0 | 1000 | 1000 | 15To25Years |
1 | 2 | P脛RNU | EE | 25/10/1969 | 4 | 1 | 9.0 | 1 | 14 | 2 | MoreThan5Years | SpecialistOfficeWorker | 3.0 | 633 | 633 | 15To25Years |
2 | 3 | TALLINN | EE | 22/11/1965 | 4 | 0 | 1.0 | 4 | 22 | 4 | MoreThan5Years | Owner | 5.0 | 550 | 550 | MoreThan25Years |
3 | 4 | KEHTNA | EE | 05/12/1983 | 2 | 0 | 1.0 | 3 | 15 | 3 | MoreThan5Years | Worker | 3.0 | 833 | 833 | 5To10Years |
4 | 5 | KIVI脮LI | EE | 13/07/1980 | 4 | 1 | 4.0 | 3 | 10 | 3 | MoreThan5Years | Worker | 3.0 | 341 | 341 | 5To10Years |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
994 | 1257 | P脛RISPEA | EE | 08/04/1971 | 3 | 0 | NaN | 3 | 19 | 5 | UpTo2Years | valvur | 3.0 | 3820 | 8452 | 10To15Years |
995 | 1258 | TALLINN | EE | 29/11/1949 | 4 | 1 | NaN | 5 | 13 | 6 | UpTo4Years | klienditeenindaja | 3.0 | 9000 | 9000 | MoreThan25Years |
996 | 1259 | UUEM脮ISA | EE | 21/02/1979 | 4 | 1 | NaN | 1 | 17 | 1 | UpTo1Year | Noorem referent | 3.0 | 7200 | 7500 | 10To15Years |
997 | 1260 | TARTU | EE | 21/10/1983 | 5 | 1 | NaN | 3 | 16 | 7 | NaN | administraator | 3.0 | 6500 | 10500 | 2To5Years |
998 | 1261 | VILJANDI | EE | 30/08/1977 | 1 | 1 | NaN | 2 | 21 | 7 | MoreThan5Years | NaN | 3.0 | 8000 | 8300 | 10To15Years |
999 rows × 16 columns
</div>
4.透视表
shuju = pd.read_csv('table.csv')
shuju.head()
School | Class | ID | Gender | Address | Height | Weight | Math | Physics | |
---|---|---|---|---|---|---|---|---|---|
0 | S_1 | C_1 | 1101 | M | street_1 | 173 | 63 | 34.0 | A+ |
1 | S_1 | C_1 | 1102 | F | street_2 | 192 | 73 | 32.5 | B+ |
2 | S_1 | C_1 | 1103 | M | street_2 | 186 | 82 | 87.2 | B+ |
3 | S_1 | C_1 | 1104 | F | street_2 | 167 | 81 | 80.4 | B- |
4 | S_1 | C_1 | 1105 | F | street_4 | 159 | 64 | 84.8 | B+ |
</div>
pivot_table
pd.pivot_table(shuju,index='ID',columns='Gender',values='Height').head()
Gender | F | M |
---|---|---|
ID | ||
1101 | NaN | 173.0 |
1102 | 192.0 | NaN |
1103 | NaN | 186.0 |
1104 | 167.0 | NaN |
1105 | 159.0 | NaN |
四个参数
pd.pivot_table(shuju,index='Gender',columns='School',values='Height',aggfunc=['mean','sum'])
mean | sum | |||
---|---|---|---|---|
School | S_1 | S_2 | S_1 | S_2 |
Gender | ||||
F | 173.125000 | 173.727273 | 1385 | 1911 |
M | 178.714286 | 172.000000 | 1251 | 1548 |
pd.pivot_table(shuju,index=['School','Class'],columns=['Gender','Address'],values=['Height','Weight'])
Height | ... | Weight | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Gender | F | M | ... | F | M | |||||||||||||||||
Address | street_1 | street_2 | street_4 | street_5 | street_6 | street_7 | street_1 | street_2 | street_4 | street_5 | ... | street_4 | street_5 | street_6 | street_7 | street_1 | street_2 | street_4 | street_5 | street_6 | street_7 | |
School | Class | |||||||||||||||||||||
S_1 | C_1 | NaN | 179.5 | 159.0 | NaN | NaN | NaN | 173.0 | 186.0 | NaN | NaN | ... | 64.0 | NaN | NaN | NaN | 63.0 | 82.0 | NaN | NaN | NaN | NaN |
C_2 | NaN | NaN | 176.0 | 162.0 | 167.0 | NaN | NaN | NaN | NaN | 188.0 | ... | 94.0 | 63.0 | 63.0 | NaN | NaN | NaN | NaN | 68.0 | 53.0 | NaN | |
C_3 | 175.0 | NaN | NaN | 187.0 | NaN | NaN | NaN | 195.0 | 161.0 | NaN | ... | NaN | 69.0 | NaN | NaN | NaN | 70.0 | 68.0 | NaN | NaN | 82.0 | |
S_2 | C_1 | NaN | NaN | NaN | 159.0 | 161.0 | NaN | NaN | NaN | 163.5 | NaN | ... | NaN | 97.0 | 61.0 | NaN | NaN | NaN | 71.0 | NaN | NaN | 84.0 |
C_2 | NaN | NaN | NaN | NaN | NaN | 188.5 | 175.0 | NaN | 155.0 | 193.0 | ... | NaN | NaN | NaN | 76.5 | 74.0 | NaN | 91.0 | 100.0 | NaN | NaN | |
C_3 | NaN | NaN | 157.0 | NaN | 164.0 | 190.0 | NaN | NaN | 187.0 | 171.0 | ... | 78.0 | NaN | 81.0 | 99.0 | NaN | NaN | 73.0 | 88.0 | NaN | NaN | |
C_4 | NaN | 176.0 | NaN | NaN | 175.5 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 57.0 | NaN | NaN | NaN | NaN | NaN | NaN | 82.0 |
7 rows × 24 columns