zoukankan      html  css  js  c++  java
  • Pandas重塑和轴向旋转

    重塑和轴向旋转

    Se

    import pandas as pd
    import numpy as np
    from pandas import Series
    
    data=pd.DataFrame(np.arange(6).reshape(2,3),
                      index=['Ohio','Colorado'],
                      columns=['one','two','three']
                     )
    data.index.names=['state']
    data.columns.names=['number']
    data
    
    number	   one	two	three
       state			
       Ohio	    0	1	2
    Colorado	3	4	5
    
    
    # 使用stack()方法可将列转为行,一一对应,得到一个Series
    result = data.stack()
    result
    
    state     number
    Ohio      one       0
              two       1
              three     2
    Colorado  one       3
              two       4
              three     5
    dtype: int32
    
    # unstack()可以将其重排为一个DataFrame
    result.unstack()
    
    number	one	two	three
       state			
       Ohio	    0	1	2
    Colorado	3	4	5
    
    # 默认情况下,unstack操作的是最内层(这里是number);也可以传入层级的编号或名称来指定要旋转的层级
    result.unstack('state')
    
    state	  Ohio Colorado
    number		
      one	    0	3
      two	    1	4
      three	    2	5
    # 传入层级编号0(即state)与传入名称'state'效果相同,都是将state这一层索引旋转为DataFrame的列
    result.unstack(0)
    
    state	   Ohio	Colorado
    number		
     one	    0	3
     two	    1	4
     three	    2	5
    
    # 当各分组中的值不完全一致时,unstack会引入缺失数据(NaN)
    s1 = Series([0,1,2,3],index=['a','b','c','d'])
    s2 = Series([4,5,6], index=['c','d','e'])
    data2 = pd.concat([s1,s2],keys=['one','two'])
    data2
    
    one  a    0
         b    1
         c    2
         d    3
    two  c    4
         d    5
         e    6
    dtype: int64
    
    data2.unstack()
    
        a	b	c	d	e
    one	0.0	1.0	2.0	3.0	NaN
    two	NaN	NaN	4.0	5.0	6.0
    
    # 而stack默认会过滤掉缺失数据;如果不想过滤,可以传入dropna=False
    data2.unstack().stack()
    one  a    0.0
         b    1.0
         c    2.0
         d    3.0
    two  c    4.0
         d    5.0
         e    6.0
    dtype: float64
    
    # 这是不过滤的效果
    data2.unstack().stack(dropna=False)
    
    one  a    0.0
         b    1.0
         c    2.0
         d    3.0
         e    NaN
    two  a    NaN
         b    NaN
         c    4.0
         d    5.0
         e    6.0
    dtype: float64
    
    # DataFrame中的stack和unstack
    
    result
    
    state     number
    Ohio      one       0
              two       1
              three     2
    Colorado  one       3
              two       4
              three     5
    dtype: int32
    
    df = pd.DataFrame({'left':result, 'right':result+5},columns=pd.Index(['left','right'],name='side'))
    df
    
              side	   left	   right
    state	  number		
    Ohio	  one	     0	      5
              two	     1	      6
              three	     2	      7
    Colorado  one	     3	      8
              two	     4	      9
              three	     5	      10
    
    # 对DataFrame进行unstack操作时,被旋转的层级(这里是state)会成为结果列索引中的最低级别
    df.unstack('state')
    
    side	left	            right
    state	Ohio	Colorado	Ohio	Colorado
    number				
    one	     0	  3	               5	  8
    two	     1	  4	               6	  9
    three	 2	  5	               7	  10
    
    # stack同理:把side从列折叠回行,side会成为结果行索引中的最低级别
    df.unstack('state').stack('side')
    
    state	       Colorado	Ohio
    number	side		
    one	    left	3	      0
            right	8	      5
    two	    left	4	      1
            right	9	      6
    three	left	5	      2
            right	10	      7
    

    时间序列数据的堆叠格式

    data_c = [
        ['1959-03-31','realgdb',2710.349],
        ['1959-03-31','infl',0.000],
        ['1959-03-31','unemp',5.800],
        ['1959-06-30','realgdb',2778.801],
        ['1959-06-30','infl',2.340],
        ['1959-06-30','unemp',5.100],
        ['1959-09-30','realgdb',2775.488],
        ['1959-09-30','infl',2.740],
        ['1959-09-30','unemp',5.300],
    ]
    ldata = pd.DataFrame(data_c,columns=['data','item','value'])
    ldata
    
            data	item	value
    0	1959-03-31	realgdb	2710.349
    1	1959-03-31	infl	0.000
    2	1959-03-31	unemp	5.800
    3	1959-06-30	realgdb	2778.801
    4	1959-06-30	infl	2.340
    5	1959-06-30	unemp	5.100
    6	1959-09-30	realgdb	2775.488
    7	1959-09-30	infl	2.740
    8	1959-09-30	unemp	5.300
    
    
    # 将data作为行索引,item作为列索引,最简单的方法是使用pivot快捷函数
    # 注意:pandas 2.0起pivot的参数必须用关键字传入,即 ldata.pivot(index='data', columns='item', values='value')
    ldata.pivot('data','item','value')
    
    item	    infl	realgdb	    unemp
          data			
    1959-03-31	0.00	2710.349	5.8
    1959-06-30	2.34	2778.801	5.1
    1959-09-30	2.74	2775.488	5.3
    
    
    # pivot其实相当于执行了如下两步:先用set_index创建层次化索引,再用unstack重塑
    #第一步
    ldata.set_index(['data','item'])
    
    
    	                value
        data	item	
    1959-03-31	realgdb	2710.349
                infl	0.000
                unemp	5.800
    1959-06-30	realgdb	2778.801
                infl	2.340
                unemp	5.100
    1959-09-30	realgdb	2775.488
                infl	2.740
                unemp	5.300
    
    # 第二步
    ldata.set_index(['data','item']).unstack()
    
    
    value
    item	    infl	realgdb	   unemp
       data			
    1959-03-31	0.00	2710.349	5.8
    1959-06-30	2.34	2778.801	5.1
    1959-09-30	2.74	2775.488	5.3
    
  • 相关阅读:
    Promise笔记
    srping-cloud-stream集成rocketmq
    mysql锁
    profiling分析
    mysql慢查询
    sql语句中in与exists的使用区别
    数据库死锁的解决办法
    死锁的形成以及处理
    百万数据修改索引,百万数据修改主键
    创建视图索引
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10039201.html
Copyright © 2011-2022 走看看