pandas基础(2)_多重索引

zoukankan html css js c++ java

pandas基础(2)_多重索引

1：多重索引的构造

>>> #下面显示构造pd.MultiIndex

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'])

>>> import pandas as pd

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'],index=pd.MultiIndex.from_arrays([['张三','张三','侯少','侯少','a','a'],['M','E','M','E','M','E']]))

>>> df1#因为Python自身的原因，对汉字的识别不是太好，所以汉字被？代替了

        java html5 python

???? M     2     13      76

     E   141     67      84

     M   116     83       8

     E    70    118     125

a    M    74      0      76

     E   111     31       8

>>> #使用元组tuple创建

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_tuples([('a','1'),('a','11'),('b','1'),('b','11'),('c','1'),('c','11')]))

>>> df2

      java html python

a 1     32   144      99

  11   104   101      16

b 1     93    98      41

  11    59    30      45

c 1     91    17     149

  11     9    28      59

>>> #使用product

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_product([['zhangsan ','lisi','wangwu'],['mid','end']]))

>>> df2

               java html python

zhangsan mid    50   128      54

          end     3     4      91

lisi      mid     4    93     110

          end   116   123     122

wangwu    mid    88    25      54

          end    48   146      57

>>> #对dataFrame同样可以设置成多重索引

df2=DataFrame(np.random.randint(0,150,size=(3,6)),columns=pd.MultiIndex.from_product([['java','html','python'],['mid','end']]),index=['张三','李四','王五'])

>>> df2

     java     html     python

      mid end mid end    mid end

????   33 38 112 70    113 110

????   29 46 132 91    117 128

????   73 56 118 82    132   39

>>>

>>> df2['java','mid']#查询某一列

????    33

????    29

????    73

Name: (java, mid), dtype: int32

>>> s['zhangsan':'lisi']#其实就是一个Series

Series([], dtype: int64)

>>> s.iloc[0:3]

a 0    1

   1    2

b 0    3

dtype: int64

>>> #切片

>>> df2['张三':'王五']

     java     html     python

      mid end mid end    mid end

????   33 38 112 70    113 110

????   29 46 132 91    117 128

????   73 56 118 82    132   39

>>>df2.iloc[0:4]#推荐使用

Df2[‘张三’，‘期中’]和df2.loc[‘张三’].loc[‘期中’]

#如何一级索引有多个，对二级索引会遇到问题，也就是说，无法直接对二级进行索引

必须把二级索引变成一级索引才可以进行索引

>>> df2.stack()

          html java python

???? end    70    38     110

     mid   112    33     113

     end    91    46     128

     mid   132    29     117

     end    82    56      39

     mid   118    73     132

>>> #stack =堆----》行

             end mid

???? html     70 112

     java     38   33

     python 110 113

     html     91 132

     java     46   29

     python 128 117

     html     82 118

     java     56   73

     python   39 132

>>> #默认为-1

2：多重索引的计算

>>> df2

     java     html     python

      mid end mid end    mid end

????   33 38 112 70    113 110

????   29 46 132 91    117 128

????   73 56 118 82    132   39

>>> df1.sum()

java      514

html5     312

python    377

dtype: int64

>>> df1.sum(axis=0)

java      514

html5     312

python    377

dtype: int64

>>> df1.sum(axis=1)#对列

???? M     91

      E    292

      M    207

      E    313

a     M    150

      E    150

dtype: int64

>>> df1.sum(axis=1)#对列求和，得到每行的和

???? M     91

      E    292

      M    207

      E    313

a     M    150

      E    150

dtype: int64

>>> df1.std

<bound method DataFrame.std of         java html5 python

???? M     2     13      76

     E   141     67      84

     M   116     83       8

     E    70    118     125

a    M    74      0      76

     E   111     31       8>

>>> #求方差

>>> df1.std(axis=1)

???? M    39.929104

      E    38.759945

      M    55.344376

      E    29.938827

a     M    43.312816

      E    54.064776

dtype: float64

>>> df1.max()

java      141

html5     118

python    125

dtype: int32

3多重索引的拼接

>>> nd = np.random.randint(0,10,size=(3,3))

>>> nd

array([[9, 9, 4],

       [7, 2, 4],

       [1, 6, 1]])

>>> np.concatenate ((nd,nd),axis=0)#在列方向就行拼接

array([[9, 9, 4],

       [7, 2, 4],

       [1, 6, 1],

       [9, 9, 4],

       [7, 2, 4],

       [1, 6, 1]])

>>> np.concatenate ([nd,nd],axis=1)#在行方向进行拼接

array([[9, 9, 4, 9, 9, 4],

       [7, 2, 4, 7, 2, 4],

       [1, 6, 1, 1, 6, 1]])

>>> def make_df(cols,inds):

data = {c:[c+str(i) for i in cols]for c in cols}

return DataFrame(data,index=inds,columns=cols)

>>> make_df(['A','B'],[1,2])

    A   B

1 AA BA

2 AB BB

>>> df1=make_df(list('AB'),[0,1])

>>> df2=make_df(list('AB'),[2,3])

>>> pd.concat ([df1,df2])#默认在列方向进行拼接

    A   B

0 AA BA

1 AB BB

2 AA BA

3 AB BB

>>> #优先增加行数

>>> pd.concat ((df1,df2),axis=1)

     A    B    A    B

0   AA   BA NaN NaN

1   AB   BB NaN NaN

2 NaN NaN   AA   BA

3 NaN NaN   AB   BB

>>> #注意index在级联时可以重复

3)

>>> #列名可以相同但是不建议

>>> df3= make_df(list('AB'),[0,1])

>>> df4=make_df(list('VB'),[1,2])

>>> pd.concat((df3,df4))#只能传入一个参数

     A   B    V

0   AA BA NaN

1   AB BB NaN

1 NaN BV   VV

2 NaN BB   VB

>>> #3种连接方式

>>> #1：外连接：补NaN（默认模式）

>>> df1= make_df(list('AB'),[1,3])

>>> df2= make_df(list('AB'),[2,4])

>>> df2= make_df(list('BC'),[2,4])

>>> pd.concat ([df1,df2],join='inner')#连接都有的部分

    B

1 BA

3 BB

2 BB

4 BC

>>> pd.concat ([df1,df2],join='outer')

     A   B    C

1   AA BA NaN

3   AB BB NaN

2 NaN BB   CB

4 NaN BC   CC

>>> #内连接只连接匹配项

>>> #3：连接指定轴 join_axes所以CDF的F便不显示了

>>> df3= make_df(list('ACD'),[0,1,2])

>>> df4= make_df(list('CDF'),[3,4,5])

>>> pd.concat([df3,df4],join_axes=[df3.columns])

     A   C   D

0   AA CA DA

1   AC CC DC

2   AD CD DD

3 NaN CC DC

4 NaN CD DD

5 NaN CF DF

>>> #join_axes 某一个DataFrame列索引为新的列索引值

>>> #3使用append()函数添加

>>> #concat方法属于pandas

>>> #append()在后面添加

>>> #concat([df1,df2])

>>> #df1.append(df2)

>>> #merge与concat的区别是，merge需要依据某一共同的行或列来进行合并

>>> #使用pd.merge（）合并时，会自动根据两者相同column名称的那一属性，作为key来进行合并，注意每一列的顺序不要求一致

>>> #一对一合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df1

  employee       group

0       po        sail

1     sara    counting

2    danis marcketing

>>> df2

  employee work_time

0       po          2

1     sara          3

2    danis          1

>>> pd.merge (df1,df2)

  employee       group work_time

0       po        sail          2

1     sara    counting          3

2    danis marcketing          1

>>> pd.concat([df1,df2])

  employee       group work_time

0       po        sail        NaN

1     sara    counting        NaN

2    danis marcketing        NaN

0       po         NaN        2.0

1     sara         NaN        3.0

2    danis         NaN        1.0

>>> df3 = DataFrame({'employee':['po','sara','liulei'],'work_time':[2,3,1]})

>>> pd.merge(df1,df3)

  employee     group work_time

0       po      sail          2

1     sara counting          3

>>> #merge只合并相同属性里面都有的项

>>> #下面是merge的多对一的合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df2 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> pd.merge(df1,df2)

  employee work_time       group

0       po          2        sail

1       po          2    counting

2    danis          1 marcketing

>>> #出现了两个po

>>> #下面是多对多的合并

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df1 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','po','danis'],'work_time':[2,3,1]})

>>> pd.merge(df1,df2)

  employee       group work_time

0       po        sail          2

1       po        sail          3

2       po    counting          2

3       po    counting          3

4    danis marcketing          1

>>> #1*2*2的模式

>>> #使用merge多对多可以来处理重名等数据的情况

>>> df3= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> df3

   WorkTime employee    group

0         1       po     sail

1         2   Summer marking

2         3   Flower    serch

>>> df4

  employee    group salary

0       po     sail   12000

1   Summer marking   20000

2   Flower    serch   10002

>>> pd.merge(df3,df4)

   WorkTime employee    group salary

0         1       po     sail   12000

1         2   Summer marking   20000

2         3   Flower    serch   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> pd.merge(df3,df4)

   WorkTime employee group salary

0         3   Flower serch   10002

>>> pd.merge(df3,df4,on='employee')

   WorkTime employee    group_x group_y salary

0         1       po marketing    sail   12000

1         3   Flower      serch   serch   10002

>>> #出现两行数据的原因是指定了employee相同就可以合并

>> pd.merge(df3,df4,on='group')

   WorkTime employee_x    group employee_y salary

0         2     Winter marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> pd.merge(df3,df4,on='group',suffixes=['_A','_B'])

   WorkTime employee_A    group employee_B salary

0         2     Winter marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employer':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

   WorkTime employee    group employer salary

0         2   Winter marking   Summer   20000

1         3   Flower    serch   Flower   10002

>>> pd.merge(df3,df4,left_on='employee',right_on='employer')

   WorkTime employee    group_x employer group_y salary

0         1       po marketing       po    sail   12000

1         3   Flower      serch   Flower   serch   10002

>>> #df3主键key为employee和df4主键为employer,两者不同但又想相互匹配时，可以指定前者的left_on为employee和后者的right_on为employer这时两者可以进行匹配

>>> #内合并与外合并

>>> #内合并只保留两者都有的数据

>>> df1=DataFrame({'age':[18,22,33],'height':[175,169,180]})

>>> df1=DataFrame({'age':[18,23,32],'height':[175,169,180]})

>>> df2=DataFrame({'age':[18,22,33],'weight':[175,169,180]})

>>> pd.merge(df1,df2)

   age height weight

0   18     175     175

>>> pd.merge(df1,df2,how='outer')

   age height weight

0   18   175.0   175.0

1   23   169.0     NaN

2   32   180.0     NaN

3   22     NaN   169.0

4   33     NaN   180.0

>>> #默认为内合并，通过how可以指定合并类型

>>>

>>> pd.merge(df1,df2,how='left')

   age height weight

0   18     175   175.0

1   23     169     NaN

2   32     180     NaN

>>> pd.merge(df1,df2,how='right')

   age height weight

0   18   175.0     175

1   22     NaN     169

2   33     NaN     180

>>> #left保留前者的数据，right保留后者数据

>>> #left保留前者df1的数据，right保留后者df2数据

>>> #下面是列冲突

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

   WorkTime employee group salary

0         3   Flower serch   10002

>>> pd.merge(df3,df4,on='employee',suffixes=['_李','_王'])

   WorkTime employee   group_?? group_?? salary

0         1       po marketing     sail   12000

1         3   Flower      serch    serch   10002

>>> #因为两者的employee和group相同，当指定employee为主键时，suffixes修改的就是group

4：总结：

多重索引也是pandas里非常重要的知识点，要牢牢掌握

查看全文

相关阅读:
Java Web编程的主要组件技术——JSP
Java Web编程的主要组件技术——Servlet
closest()一个在评论里很有用的函数
 ThinkPHP I方法
 PHPstorm 的快捷键
 ThinkPHP的缓存 F方法
 console.log的使用
 选择使用接口和抽象类的依据
 OOP三类继承的区别
 Thinkphp C方法

原文地址：https://www.cnblogs.com/henuliulei/p/9368260.html