数据科学第三章
1.Pandas 的数据结构
(1)引入pandas包
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
(2)Series对象
# Player heights keyed by jersey number; the index labels are strings, not ints.
jersey_numbers = ['13', '14', '7', '2', '9']
height = Series([187, 190, 185, 178, 185], index=jersey_numbers)
height
13 187
14 190
7 185
2 178
9 185
dtype: int64
# A Series can also be built from a dict: the keys become the index labels
# and the values become the data.
height1 = Series({'13': 187, '14': 190, '7': 185, '2': 178, '9': 185})
# Display the dict-built Series itself (the original cell echoed `height` instead).
height1
13 187
14 190
7 185
2 178
9 185
dtype: int64
(3)Series数据访问
height['13'] # look up player 13's height by index label; the position-based equivalent is height.iloc[0]
187
height[['13', '2', '7']] # look up players 13, 2 and 7 by label; the position-based equivalent is height.iloc[[0, 3, 2]]
13 187
2 178
7 185
dtype: int64
height[1:3] # 截断, 选取序号为1~2的球员身高
14 190
7 185
dtype: int64
height[height.values >= 186]# boolean-mask selection: players whose height is at least 186 (>= is inclusive)
13 187
14 190
dtype: int64
height['13'] = 188 # 修改13号球员身高
height['13']
188
height[1: 3] = 160 # 修改序号为1,2的数据,标量赋值
height
13 188
14 160
7 160
2 178
9 185
dtype: int64
a = Series([190, 187], index=['23', '5'])  # Series holding the two new players
# Series.append was removed in pandas 2.0. pd.concat builds a new Series made of
# height followed by a; height itself is left unchanged.
newheight = pd.concat([height, a])
newheight
13 188
14 160
7 160
2 178
9 185
23 190
5 187
dtype: int64
newheight = height.drop(['13', '9']) # 删除13号和9号球员数据
newheight
14 160
7 160
2 178
dtype: int64
height.index=[1, 2, 3, 4, 5] # 用新的列表替换即可
height
1 188
2 160
3 160
4 178
5 185
dtype: int64
height[ [1, 3] ] # 使用索引访问,注意:这是数字类型
1 188
3 160
dtype: int64
height.iloc[0] # 基于位置序号的访问
188
DataFrame对象
# Three students, one row each, recorded as [age, height, weight].
data = [
    [19, 170, 68],
    [20, 165, 65],
    [18, 175, 65],
]
# Rows are labelled 1..3; columns carry the field names.
students = DataFrame(
    data,
    columns=['age', 'height', 'weight'],
    index=[1, 2, 3],
)
students
|
age |
height |
weight |
1 |
19 |
170 |
68 |
2 |
20 |
165 |
65 |
3 |
18 |
175 |
65 |
DataFrame数据选取方法
选取类型 |
选取方法 |
说明 |
基于索引名选取 |
obj[ col ] |
选取某列 |
* |
obj[ colList ] |
选取某几列 |
* |
obj.loc[index, col] |
选取某行某列 |
* |
obj.loc[indexList, colList] |
选取多行多列 |
基于位置序号选取 |
obj.iloc[iloc, cloc] |
选取某行某列 |
* |
obj.iloc[ilocList, clocList] |
选取多行多列 |
* |
obj.iloc[a:b, c:d] |
选取a~(b-1)行,c~(d-1)列 |
条件筛选 |
obj.loc[ condition, colList ] |
使用索引构造条件表达式,选取满足条件的行 |
* |
obj.iloc[ condition, clocList ] |
使用位置序号构造条件表达式,选取满足条件的行 |
(1)查询
students.loc[1, 'age'] # 查询1号同学的年龄
19
students.iloc[[0, 2], [0, 1]] # 查询第0、2行,0、1列的信息
|
age |
height |
1 |
19 |
170 |
3 |
18 |
175 |
* part 18
students.loc[:, ['height', 'age']] # 特定列
|
height |
age |
1 |
170 |
19 |
2 |
165 |
20 |
3 |
175 |
18 |
* part 19
students[['height', 'age']] # 与上同
|
height |
age |
1 |
170 |
19 |
2 |
165 |
20 |
3 |
175 |
18 |
* part 20
students.iloc[1:, 0:2] # 截取
|
age |
height |
2 |
20 |
165 |
3 |
18 |
175 |
* part 21
students[1:3] # 截取行数据
|
age |
height |
weight |
2 |
20 |
165 |
65 |
3 |
18 |
175 |
65 |
* part 22
students['height'] >= 168
1 True
2 False
3 True
Name: height, dtype: bool
students.loc[ students['height']>=168, ['height', 'weight'] ]
|
height |
weight |
1 |
170 |
68 |
3 |
175 |
65 |
(2)修改操作
# 增加列
students['expense'] = [1500, 1600, 1200]
students
|
age |
height |
weight |
expense |
1 |
19 |
170 |
68 |
1500 |
2 |
20 |
165 |
65 |
1600 |
3 |
18 |
175 |
65 |
1200 |
* part 25
# 增加行
students.loc[4] = [5, 5, 5, 5]
students
|
age |
height |
weight |
expense |
1 |
19 |
170 |
68 |
1500 |
2 |
20 |
165 |
65 |
1600 |
3 |
18 |
175 |
65 |
1200 |
4 |
5 |
5 |
5 |
5 |
* part 26
# 修改列
students['expense'] = 1000
students
|
age |
height |
weight |
expense |
1 |
19 |
170 |
68 |
1000 |
2 |
20 |
165 |
65 |
1000 |
3 |
18 |
175 |
65 |
1000 |
4 |
5 |
5 |
5 |
1000 |
* part 27
# 修改行
students.loc[1, :] = [21, 188, 70, 20]
students
|
age |
height |
weight |
expense |
1 |
21 |
188 |
70 |
20 |
2 |
20 |
165 |
65 |
1000 |
3 |
18 |
175 |
65 |
1000 |
4 |
5 |
5 |
5 |
1000 |
* part 28
# 修改点
students.loc[students['expense']<500, 'expense'] = 1200
students
|
age |
height |
weight |
expense |
1 |
21 |
188 |
70 |
1200 |
2 |
20 |
165 |
65 |
1000 |
3 |
18 |
175 |
65 |
1000 |
4 |
5 |
5 |
5 |
1000 |
(3)删除
# 删除行输出
students.drop(1, axis=0)
|
age |
height |
weight |
expense |
2 |
20 |
165 |
65 |
1000 |
3 |
18 |
175 |
65 |
1000 |
4 |
5 |
5 |
5 |
1000 |
* part 30
# 删除列输出
students.drop('expense', axis=1)
|
age |
height |
weight |
1 |
21 |
188 |
70 |
2 |
20 |
165 |
65 |
3 |
18 |
175 |
65 |
4 |
5 |
5 |
5 |
* part 31
# 删除多行,且直接修改students
students.drop([1, 2], axis=0, inplace=True)
students
|
age |
height |
weight |
expense |
3 |
18 |
175 |
65 |
1000 |
4 |
5 |
5 |
5 |
1000 |
2.数据文件读写
(1)csv文件的读写,txt雷同
# 读取
GINI = pd.read_csv('3th dataGINI.csv')
GINI[-3:]
|
Country |
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
155 |
乌干达 |
NaN |
NaN |
45.2 |
NaN |
NaN |
42.9 |
NaN |
NaN |
NaN |
44.2 |
NaN |
NaN |
41.0 |
NaN |
NaN |
156 |
刚果(金) |
NaN |
NaN |
NaN |
NaN |
42.2 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
42.1 |
NaN |
NaN |
157 |
津巴布韦 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
43.2 |
NaN |
NaN |
NaN |
* part 33
# 保存
students.to_csv('3th dataStudents.csv', mode='w', header=True, index=False)
(2)读取Excel文件
# 读取
GINI = pd.read_excel('3th dataGINI.xlsx', 'GINI', index_col=0)
3.数据清洗
GINI.dropna(thresh=7, inplace=True) # keep only rows with at least 7 non-NaN values (thresh is inclusive); modifies GINI in place
GINI
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
NaN |
NaN |
42.10 |
NaN |
NaN |
41.0 |
NaN |
NaN |
42.9 |
NaN |
43.70 |
42.40 |
42.20 |
39.7 |
39.1 |
泰国 |
42.8 |
NaN |
41.90 |
NaN |
42.50 |
NaN |
41.80 |
39.80 |
40.3 |
39.60 |
39.40 |
37.50 |
39.30 |
37.8 |
37.0 |
印度尼西亚 |
28.5 |
29.0 |
31.70 |
31.8 |
32.70 |
33.0 |
34.20 |
35.70 |
35.1 |
35.10 |
36.40 |
39.70 |
39.60 |
39.9 |
39.4 |
越南 |
NaN |
NaN |
37.00 |
NaN |
36.80 |
NaN |
35.80 |
NaN |
35.6 |
NaN |
39.30 |
NaN |
35.60 |
NaN |
34.8 |
柬埔寨 |
NaN |
NaN |
NaN |
NaN |
35.46 |
NaN |
NaN |
41.14 |
35.1 |
34.65 |
33.44 |
31.70 |
30.76 |
NaN |
NaN |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
54.2 |
NaN |
56.94 |
54.1 |
50.73 |
NaN |
54.60 |
NaN |
NaN |
NaN |
NaN |
52.35 |
NaN |
NaN |
48.3 |
洪都拉斯 |
NaN |
55.6 |
55.70 |
58.1 |
58.10 |
59.5 |
57.50 |
55.80 |
55.5 |
51.30 |
53.10 |
56.20 |
56.10 |
52.6 |
50.4 |
巴拉圭 |
NaN |
54.6 |
57.30 |
54.9 |
52.30 |
51.4 |
53.00 |
53.00 |
50.7 |
49.10 |
51.00 |
52.30 |
47.60 |
47.9 |
50.7 |
萨尔瓦多 |
51.5 |
51.4 |
51.90 |
50.4 |
47.80 |
48.5 |
45.70 |
45.20 |
46.9 |
45.80 |
43.50 |
42.30 |
41.80 |
43.4 |
41.6 |
南非 |
57.8 |
NaN |
NaN |
NaN |
NaN |
64.8 |
64.79 |
NaN |
63.0 |
NaN |
63.40 |
63.38 |
NaN |
NaN |
63.0 |
67 rows × 15 columns
* part 36
# Linear interpolation along each row (axis=1, i.e. across the year columns), in place.
GINI.interpolate(method='linear', axis=1, inplace=True)
# Backward fill along each row: every NaN still present takes the next valid value
# to its right. DataFrame.bfill replaces fillna(method='bfill'), whose `method`
# argument is deprecated in recent pandas releases.
GINI.bfill(axis=1, inplace=True)
GINI
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
42.10 |
42.10 |
42.10 |
41.733333 |
41.366667 |
41.000000 |
41.633333 |
42.266667 |
42.9 |
43.30 |
43.70 |
42.40 |
42.200000 |
39.700000 |
39.10 |
泰国 |
42.80 |
42.35 |
41.90 |
42.200000 |
42.500000 |
42.150000 |
41.800000 |
39.800000 |
40.3 |
39.60 |
39.40 |
37.50 |
39.300000 |
37.800000 |
37.00 |
印度尼西亚 |
28.50 |
29.00 |
31.70 |
31.800000 |
32.700000 |
33.000000 |
34.200000 |
35.700000 |
35.1 |
35.10 |
36.40 |
39.70 |
39.600000 |
39.900000 |
39.40 |
越南 |
37.00 |
37.00 |
37.00 |
36.900000 |
36.800000 |
36.300000 |
35.800000 |
35.700000 |
35.6 |
37.45 |
39.30 |
37.45 |
35.600000 |
35.200000 |
34.80 |
柬埔寨 |
35.46 |
35.46 |
35.46 |
35.460000 |
35.460000 |
37.353333 |
39.246667 |
41.140000 |
35.1 |
34.65 |
33.44 |
31.70 |
30.760000 |
30.760000 |
30.76 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
54.20 |
55.57 |
56.94 |
54.100000 |
50.730000 |
52.665000 |
54.600000 |
54.150000 |
53.7 |
53.25 |
52.80 |
52.35 |
51.000000 |
49.650000 |
48.30 |
洪都拉斯 |
55.60 |
55.60 |
55.70 |
58.100000 |
58.100000 |
59.500000 |
57.500000 |
55.800000 |
55.5 |
51.30 |
53.10 |
56.20 |
56.100000 |
52.600000 |
50.40 |
巴拉圭 |
54.60 |
54.60 |
57.30 |
54.900000 |
52.300000 |
51.400000 |
53.000000 |
53.000000 |
50.7 |
49.10 |
51.00 |
52.30 |
47.600000 |
47.900000 |
50.70 |
萨尔瓦多 |
51.50 |
51.40 |
51.90 |
50.400000 |
47.800000 |
48.500000 |
45.700000 |
45.200000 |
46.9 |
45.80 |
43.50 |
42.30 |
41.800000 |
43.400000 |
41.60 |
南非 |
57.80 |
59.20 |
60.60 |
62.000000 |
63.400000 |
64.800000 |
64.790000 |
63.895000 |
63.0 |
63.20 |
63.40 |
63.38 |
63.253333 |
63.126667 |
63.00 |
67 rows × 15 columns
* part 37
# 去除重复值,即重复行
GINI.drop_duplicates()
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
42.10 |
42.10 |
42.10 |
41.733333 |
41.366667 |
41.000000 |
41.633333 |
42.266667 |
42.9 |
43.30 |
43.70 |
42.40 |
42.200000 |
39.700000 |
39.10 |
泰国 |
42.80 |
42.35 |
41.90 |
42.200000 |
42.500000 |
42.150000 |
41.800000 |
39.800000 |
40.3 |
39.60 |
39.40 |
37.50 |
39.300000 |
37.800000 |
37.00 |
印度尼西亚 |
28.50 |
29.00 |
31.70 |
31.800000 |
32.700000 |
33.000000 |
34.200000 |
35.700000 |
35.1 |
35.10 |
36.40 |
39.70 |
39.600000 |
39.900000 |
39.40 |
越南 |
37.00 |
37.00 |
37.00 |
36.900000 |
36.800000 |
36.300000 |
35.800000 |
35.700000 |
35.6 |
37.45 |
39.30 |
37.45 |
35.600000 |
35.200000 |
34.80 |
柬埔寨 |
35.46 |
35.46 |
35.46 |
35.460000 |
35.460000 |
37.353333 |
39.246667 |
41.140000 |
35.1 |
34.65 |
33.44 |
31.70 |
30.760000 |
30.760000 |
30.76 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
54.20 |
55.57 |
56.94 |
54.100000 |
50.730000 |
52.665000 |
54.600000 |
54.150000 |
53.7 |
53.25 |
52.80 |
52.35 |
51.000000 |
49.650000 |
48.30 |
洪都拉斯 |
55.60 |
55.60 |
55.70 |
58.100000 |
58.100000 |
59.500000 |
57.500000 |
55.800000 |
55.5 |
51.30 |
53.10 |
56.20 |
56.100000 |
52.600000 |
50.40 |
巴拉圭 |
54.60 |
54.60 |
57.30 |
54.900000 |
52.300000 |
51.400000 |
53.000000 |
53.000000 |
50.7 |
49.10 |
51.00 |
52.30 |
47.600000 |
47.900000 |
50.70 |
萨尔瓦多 |
51.50 |
51.40 |
51.90 |
50.400000 |
47.800000 |
48.500000 |
45.700000 |
45.200000 |
46.9 |
45.80 |
43.50 |
42.30 |
41.800000 |
43.400000 |
41.60 |
南非 |
57.80 |
59.20 |
60.60 |
62.000000 |
63.400000 |
64.800000 |
64.790000 |
63.895000 |
63.0 |
63.20 |
63.40 |
63.38 |
63.253333 |
63.126667 |
63.00 |
67 rows × 15 columns
* part 38
GINI1 = GINI[:30]
GINI2 = GINI[30:]
# 数据合并
NEW_GINI = pd.concat([GINI1, GINI2], axis=0) # axis=0表示按行追加
NEW_GINI
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
42.10 |
42.10 |
42.10 |
41.733333 |
41.366667 |
41.000000 |
41.633333 |
42.266667 |
42.9 |
43.30 |
43.70 |
42.40 |
42.200000 |
39.700000 |
39.10 |
泰国 |
42.80 |
42.35 |
41.90 |
42.200000 |
42.500000 |
42.150000 |
41.800000 |
39.800000 |
40.3 |
39.60 |
39.40 |
37.50 |
39.300000 |
37.800000 |
37.00 |
印度尼西亚 |
28.50 |
29.00 |
31.70 |
31.800000 |
32.700000 |
33.000000 |
34.200000 |
35.700000 |
35.1 |
35.10 |
36.40 |
39.70 |
39.600000 |
39.900000 |
39.40 |
越南 |
37.00 |
37.00 |
37.00 |
36.900000 |
36.800000 |
36.300000 |
35.800000 |
35.700000 |
35.6 |
37.45 |
39.30 |
37.45 |
35.600000 |
35.200000 |
34.80 |
柬埔寨 |
35.46 |
35.46 |
35.46 |
35.460000 |
35.460000 |
37.353333 |
39.246667 |
41.140000 |
35.1 |
34.65 |
33.44 |
31.70 |
30.760000 |
30.760000 |
30.76 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
54.20 |
55.57 |
56.94 |
54.100000 |
50.730000 |
52.665000 |
54.600000 |
54.150000 |
53.7 |
53.25 |
52.80 |
52.35 |
51.000000 |
49.650000 |
48.30 |
洪都拉斯 |
55.60 |
55.60 |
55.70 |
58.100000 |
58.100000 |
59.500000 |
57.500000 |
55.800000 |
55.5 |
51.30 |
53.10 |
56.20 |
56.100000 |
52.600000 |
50.40 |
巴拉圭 |
54.60 |
54.60 |
57.30 |
54.900000 |
52.300000 |
51.400000 |
53.000000 |
53.000000 |
50.7 |
49.10 |
51.00 |
52.30 |
47.600000 |
47.900000 |
50.70 |
萨尔瓦多 |
51.50 |
51.40 |
51.90 |
50.400000 |
47.800000 |
48.500000 |
45.700000 |
45.200000 |
46.9 |
45.80 |
43.50 |
42.30 |
41.800000 |
43.400000 |
41.60 |
南非 |
57.80 |
59.20 |
60.60 |
62.000000 |
63.400000 |
64.800000 |
64.790000 |
63.895000 |
63.0 |
63.20 |
63.40 |
63.38 |
63.253333 |
63.126667 |
63.00 |
67 rows × 15 columns
- 合并时对另一轴(列)的对齐方式由 pd.concat 的 join 参数决定:inner 内连接-取交集;outer 外连接-取并集(默认)。left 左连接、right 右连接仅适用于 merge/join,pd.concat 不支持。
数据排序
值排序
GINI.sort_values(by=[2014, 2013], ascending=False)
# by-可以多列排序
# ascending-False为降序
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
南非 |
57.80 |
59.20 |
60.60 |
62.00 |
63.4 |
64.8 |
64.79 |
63.895 |
63.0 |
63.2 |
63.4 |
63.38 |
63.253333 |
63.126667 |
63.0 |
哥伦比亚 |
58.70 |
57.20 |
55.80 |
53.40 |
54.8 |
53.7 |
60.08 |
59.370 |
55.5 |
54.4 |
54.7 |
53.50 |
52.700000 |
52.800000 |
52.7 |
巴西 |
58.40 |
58.40 |
58.10 |
57.60 |
56.5 |
56.3 |
55.60 |
54.900 |
54.0 |
53.7 |
53.3 |
52.90 |
52.700000 |
52.800000 |
51.5 |
巴拉圭 |
54.60 |
54.60 |
57.30 |
54.90 |
52.3 |
51.4 |
53.00 |
53.000 |
50.7 |
49.1 |
51.0 |
52.30 |
47.600000 |
47.900000 |
50.7 |
巴拿马 |
56.80 |
56.90 |
56.20 |
55.80 |
54.8 |
53.8 |
54.60 |
52.700 |
52.7 |
51.8 |
51.6 |
51.30 |
51.700000 |
51.500000 |
50.5 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
挪威 |
27.60 |
27.60 |
27.60 |
27.60 |
31.6 |
30.6 |
26.40 |
27.100 |
27.0 |
26.2 |
25.7 |
25.30 |
25.700000 |
26.400000 |
26.8 |
斯洛伐克 |
27.10 |
27.10 |
27.10 |
27.10 |
27.1 |
29.3 |
25.80 |
24.700 |
26.0 |
27.2 |
27.3 |
26.50 |
26.100000 |
28.100000 |
26.1 |
捷克 |
27.50 |
27.50 |
27.50 |
27.50 |
27.5 |
26.9 |
26.70 |
26.000 |
26.3 |
26.2 |
26.6 |
26.40 |
26.100000 |
26.500000 |
25.9 |
斯洛文尼亚 |
29.12 |
29.12 |
29.12 |
30.87 |
24.8 |
24.6 |
24.40 |
24.400 |
23.7 |
24.8 |
24.9 |
24.90 |
25.600000 |
26.200000 |
25.7 |
乌克兰 |
29.00 |
29.00 |
29.00 |
28.70 |
28.9 |
29.0 |
29.80 |
27.000 |
26.6 |
25.3 |
24.8 |
24.60 |
24.700000 |
24.600000 |
24.0 |
67 rows × 15 columns
* part 40
# 添加排名
GINI['排名'] = GINI[2014].rank(method='min', ascending=False)
GINI
|
2000 |
2001 |
2002 |
2003 |
2004 |
2005 |
2006 |
2007 |
2008 |
2009 |
2010 |
2011 |
2012 |
2013 |
2014 |
排名 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
42.10 |
42.10 |
42.10 |
41.733333 |
41.366667 |
41.000000 |
41.633333 |
42.266667 |
42.9 |
43.30 |
43.70 |
42.40 |
42.200000 |
39.700000 |
39.10 |
20.0 |
泰国 |
42.80 |
42.35 |
41.90 |
42.200000 |
42.500000 |
42.150000 |
41.800000 |
39.800000 |
40.3 |
39.60 |
39.40 |
37.50 |
39.300000 |
37.800000 |
37.00 |
24.0 |
印度尼西亚 |
28.50 |
29.00 |
31.70 |
31.800000 |
32.700000 |
33.000000 |
34.200000 |
35.700000 |
35.1 |
35.10 |
36.40 |
39.70 |
39.600000 |
39.900000 |
39.40 |
19.0 |
越南 |
37.00 |
37.00 |
37.00 |
36.900000 |
36.800000 |
36.300000 |
35.800000 |
35.700000 |
35.6 |
37.45 |
39.30 |
37.45 |
35.600000 |
35.200000 |
34.80 |
32.0 |
柬埔寨 |
35.46 |
35.46 |
35.46 |
35.460000 |
35.460000 |
37.353333 |
39.246667 |
41.140000 |
35.1 |
34.65 |
33.44 |
31.70 |
30.760000 |
30.760000 |
30.76 |
46.0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
54.20 |
55.57 |
56.94 |
54.100000 |
50.730000 |
52.665000 |
54.600000 |
54.150000 |
53.7 |
53.25 |
52.80 |
52.35 |
51.000000 |
49.650000 |
48.30 |
8.0 |
洪都拉斯 |
55.60 |
55.60 |
55.70 |
58.100000 |
58.100000 |
59.500000 |
57.500000 |
55.800000 |
55.5 |
51.30 |
53.10 |
56.20 |
56.100000 |
52.600000 |
50.40 |
6.0 |
巴拉圭 |
54.60 |
54.60 |
57.30 |
54.900000 |
52.300000 |
51.400000 |
53.000000 |
53.000000 |
50.7 |
49.10 |
51.00 |
52.30 |
47.600000 |
47.900000 |
50.70 |
4.0 |
萨尔瓦多 |
51.50 |
51.40 |
51.90 |
50.400000 |
47.800000 |
48.500000 |
45.700000 |
45.200000 |
46.9 |
45.80 |
43.50 |
42.30 |
41.800000 |
43.400000 |
41.60 |
14.0 |
南非 |
57.80 |
59.20 |
60.60 |
62.000000 |
63.400000 |
64.800000 |
64.790000 |
63.895000 |
63.0 |
63.20 |
63.40 |
63.38 |
63.253333 |
63.126667 |
63.00 |
1.0 |
67 rows × 16 columns
4.统计分析
# NOTE(review): GINI already contains the '排名' (rank) column added above, so each
# country's statistics cover 16 values (15 yearly figures plus its rank) - hence
# count == 16 in the output. Confirm whether the rank was meant to be included.
GINI.T.describe() # summary statistics per country (after transposing, each column is one country)
Country |
中国 |
泰国 |
印度尼西亚 |
越南 |
柬埔寨 |
奥地利 |
比利时 |
瑞士 |
捷克 |
德国 |
... |
厄瓜多尔 |
墨西哥 |
巴拿马 |
秘鲁 |
玻利维亚 |
危地马拉 |
洪都拉斯 |
巴拉圭 |
萨尔瓦多 |
南非 |
count |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.00000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
... |
16.00000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
mean |
40.475000 |
39.400000 |
33.800000 |
36.243750 |
35.513125 |
31.28125 |
30.075000 |
33.450000 |
29.131250 |
31.506250 |
... |
48.36250 |
44.831250 |
50.481250 |
46.056250 |
50.306250 |
50.125312 |
52.318750 |
49.025000 |
44.481250 |
58.740313 |
std |
5.584429 |
4.526919 |
5.362586 |
1.575106 |
4.053855 |
4.77678 |
6.685656 |
1.097877 |
9.583299 |
3.417498 |
... |
10.61652 |
9.515301 |
12.310766 |
9.394606 |
12.146315 |
11.445617 |
12.598054 |
12.285249 |
8.852285 |
15.512899 |
min |
20.000000 |
24.000000 |
19.000000 |
32.000000 |
30.760000 |
28.70000 |
27.500000 |
31.600000 |
25.900000 |
28.800000 |
... |
11.00000 |
10.000000 |
5.000000 |
13.000000 |
9.000000 |
8.000000 |
6.000000 |
4.000000 |
14.000000 |
1.000000 |
25% |
41.275000 |
38.925000 |
31.775000 |
35.600000 |
33.005000 |
29.50000 |
28.100000 |
32.575000 |
26.275000 |
30.300000 |
... |
46.70000 |
45.337500 |
51.575000 |
44.625000 |
47.637500 |
50.932500 |
52.975000 |
50.300000 |
43.125000 |
61.650000 |
50% |
42.100000 |
40.050000 |
34.650000 |
36.550000 |
35.460000 |
30.35000 |
28.100000 |
33.900000 |
26.650000 |
30.650000 |
... |
50.95000 |
45.975000 |
52.700000 |
48.300000 |
52.650000 |
53.025000 |
55.650000 |
51.850000 |
45.750000 |
63.163333 |
75% |
42.300000 |
42.162500 |
37.150000 |
37.000000 |
35.933333 |
30.65000 |
28.675000 |
33.900000 |
27.500000 |
31.225000 |
... |
53.52500 |
48.712500 |
55.050000 |
50.325000 |
57.212500 |
54.162500 |
56.525000 |
53.400000 |
48.975000 |
63.400000 |
max |
43.700000 |
42.800000 |
39.900000 |
39.300000 |
46.000000 |
49.00000 |
55.000000 |
36.000000 |
65.000000 |
44.000000 |
... |
56.40000 |
51.400000 |
56.900000 |
53.600000 |
61.600000 |
56.940000 |
59.500000 |
57.300000 |
51.900000 |
64.800000 |
8 rows × 67 columns
* part 42
# Covariance matrix between countries, computed over the yearly values only.
# The '排名' (rank) column is dropped by name instead of slicing GINI.T[0:15], which
# silently depended on the rank being the 16th column. errors='ignore' keeps this
# working when the rank column has not been added.
result = GINI.drop(columns='排名', errors='ignore').T.cov()
result
Country |
中国 |
泰国 |
印度尼西亚 |
越南 |
柬埔寨 |
奥地利 |
比利时 |
瑞士 |
捷克 |
德国 |
... |
厄瓜多尔 |
墨西哥 |
巴拿马 |
秘鲁 |
玻利维亚 |
危地马拉 |
洪都拉斯 |
巴拉圭 |
萨尔瓦多 |
南非 |
Country |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
中国 |
1.472413 |
0.574690 |
-1.138000 |
0.961833 |
0.990535 |
0.136190 |
0.030381 |
0.068476 |
0.042095 |
-0.323381 |
... |
0.851048 |
-0.308119 |
0.322762 |
0.832905 |
0.049214 |
1.650081 |
0.367333 |
0.460762 |
0.765381 |
-0.248414 |
泰国 |
0.574690 |
3.887452 |
-6.833905 |
0.524238 |
3.745933 |
-1.119286 |
0.528905 |
1.387714 |
0.990286 |
-0.412095 |
... |
6.916690 |
3.435619 |
3.936405 |
5.758643 |
9.400071 |
2.907842 |
3.463024 |
3.382095 |
6.296405 |
-1.531205 |
印度尼西亚 |
-1.138000 |
-6.833905 |
14.122667 |
-1.356048 |
-6.489586 |
1.886429 |
-0.741238 |
-2.683857 |
-1.907286 |
1.303905 |
... |
-13.407095 |
-6.943881 |
-7.688381 |
-10.532714 |
-18.070571 |
-6.005233 |
-4.305619 |
-7.516762 |
-12.982667 |
4.659531 |
越南 |
0.961833 |
0.524238 |
-1.356048 |
1.286024 |
0.095826 |
-0.047500 |
0.106048 |
-0.028000 |
0.267429 |
-0.462810 |
... |
0.910262 |
0.341690 |
0.487833 |
0.707929 |
0.389357 |
1.017324 |
0.105167 |
0.893524 |
0.997833 |
-0.490467 |
柬埔寨 |
0.990535 |
3.745933 |
-6.489586 |
0.095826 |
9.228565 |
-0.930857 |
1.087014 |
2.284157 |
0.442090 |
0.426067 |
... |
8.361343 |
2.486307 |
3.289538 |
7.452195 |
11.003314 |
4.353009 |
4.024590 |
4.168433 |
5.390538 |
0.507772 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
1.650081 |
2.907842 |
-6.005233 |
1.017324 |
4.353009 |
-0.665786 |
-0.105945 |
1.220221 |
0.702843 |
-0.544824 |
... |
6.064755 |
2.332612 |
3.492340 |
6.009764 |
7.909443 |
5.156259 |
2.263938 |
4.513252 |
6.001876 |
-1.842574 |
洪都拉斯 |
0.367333 |
3.463024 |
-4.305619 |
0.105167 |
4.024590 |
-1.337857 |
0.729190 |
1.060857 |
0.825429 |
0.308048 |
... |
5.653190 |
2.753190 |
3.137762 |
5.501714 |
7.913857 |
2.263938 |
6.586381 |
2.893381 |
4.284190 |
0.413193 |
巴拉圭 |
0.460762 |
3.382095 |
-7.516762 |
0.893524 |
4.168433 |
-1.205714 |
0.243190 |
1.571286 |
1.186000 |
-0.922810 |
... |
7.842762 |
4.199548 |
4.723905 |
7.370429 |
11.137214 |
4.513252 |
2.893381 |
7.250667 |
7.554619 |
-2.830776 |
萨尔瓦多 |
0.765381 |
6.296405 |
-12.982667 |
0.997833 |
5.390538 |
-1.812143 |
0.501952 |
2.529571 |
1.915143 |
-1.159619 |
... |
12.506381 |
6.511738 |
7.451952 |
10.684857 |
17.325214 |
6.001876 |
4.284190 |
7.554619 |
13.171238 |
-4.599483 |
南非 |
-0.248414 |
-1.531205 |
4.659531 |
-0.490467 |
0.507772 |
0.369179 |
0.430743 |
-0.412436 |
-0.717974 |
1.246205 |
... |
-3.732493 |
-2.743832 |
-2.793590 |
-2.007002 |
-4.964467 |
-1.842574 |
0.413193 |
-2.830776 |
-4.599483 |
3.824582 |
67 rows × 67 columns
* part 43
# 保存
result.to_csv('3th dataGINI_COV.csv', mode='w', header=True, index=True)
# Build a sign matrix of the "strong" covariances between countries:
#   +1 / -1 where |cov| >= 1 (the sign of the covariance),
#    0 where |cov| < 1, where the covariance is NaN, and on the diagonal.
# NOTE: the threshold test is inclusive (>= 1).
# `result` is the country-by-country covariance matrix computed earlier, so its row
# and column labels are the countries of GINI. The exported loop had lost its
# indentation; the per-column concat and the element-wise .iloc loops are replaced
# by a single vectorised pass.
strong = result.where(result.abs() >= 1, 0)    # weak or missing covariances become 0
signs = np.sign(strong.values.astype(float))   # fresh float array holding -1.0 / 0.0 / 1.0
np.fill_diagonal(signs, 0)                     # a country is not compared with itself
RES = DataFrame(signs, index=result.index, columns=result.columns)
RES
|
中国 |
泰国 |
印度尼西亚 |
越南 |
柬埔寨 |
奥地利 |
比利时 |
瑞士 |
捷克 |
德国 |
... |
厄瓜多尔 |
墨西哥 |
巴拿马 |
秘鲁 |
玻利维亚 |
危地马拉 |
洪都拉斯 |
巴拉圭 |
萨尔瓦多 |
南非 |
中国 |
0.0 |
0.0 |
-1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
泰国 |
0.0 |
0.0 |
-1.0 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
-1.0 |
印度尼西亚 |
-1.0 |
-1.0 |
0.0 |
-1.0 |
-1.0 |
1.0 |
0.0 |
-1.0 |
-1.0 |
1.0 |
... |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
1.0 |
越南 |
0.0 |
0.0 |
-1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
柬埔寨 |
0.0 |
1.0 |
-1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
0.0 |
0.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
危地马拉 |
1.0 |
1.0 |
-1.0 |
1.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
1.0 |
1.0 |
1.0 |
-1.0 |
洪都拉斯 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
1.0 |
1.0 |
0.0 |
巴拉圭 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
1.0 |
0.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
1.0 |
-1.0 |
萨尔瓦多 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
-1.0 |
0.0 |
1.0 |
1.0 |
-1.0 |
... |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
-1.0 |
南非 |
0.0 |
-1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
... |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
0.0 |
-1.0 |
-1.0 |
0.0 |
67 rows × 67 columns
* part 44
# 原本想做一个有向连通图的,比较懒,没有写完
# import networkx as nx
# import matplotlib.pyplot as plt
# G = nx.DiGraph()
# G.add_node(tuple(RES.index))
# nx.draw(G)
# plt.show()
# G = nx.DiGraph()
# G.add_node(1)
# G.add_node(2)
# G.add_nodes_from([3,4,5,6])
# G.add_edge(1,3)
# G.add_edges_from([(3,5),(3,6),(6,7)])
# G = G.to_undirected()
# nx.draw(G)
# plt.show()