  • Data Mining with pandas

    #Assumed setup for the whole session:
    import os
    import numpy as np
    import pandas as pd
    from pandas import Series,DataFrame

    sdata={'语文':89,'数学':96,'音乐':39,'英语':78,'化学':88}
    #Converting a dict to a Series
    >>> studata=Series(sdata)
    >>> studata
    化学    88
    数学    96
    英语    78
    语文    89
    音乐    39
    dtype: int64
    >>> obj=Series(sdata,index=['物理','数学','化学'])
    >>> obj
    物理     NaN   [there is no 物理 score in sdata, so it is NaN; this also forces the values below to float]
    数学    96.0
    化学    88.0
    dtype: float64
    #Test whether each entry is null
    >>> pd.isnull(obj)
    物理     True
    数学    False
    化学    False
    dtype: bool
    >>> pd.notnull(obj)
    物理    False
    数学     True
    化学     True
    dtype: bool
    >>> obj.isnull()
    物理     True
    数学    False
    化学    False
    dtype: bool
    
    #Adding corresponding values
    >>> en=Series([84,94,51,81],index=['张三','李四','王五','赵六'])
    >>> sx=Series([94,81,31,91],index=['张三','赵六','王五','李四'])
    >>> en+sx   [indexes are aligned automatically when adding]
    张三    178
    李四    185
    王五     82
    赵六    162
    dtype: int64
    
    #The name attribute of a Series
    >>> en.name='英语成绩'
    >>> en
    张三    84
    李四    94
    王五    51
    赵六    81
    Name: 英语成绩, dtype: int64
    >>> en.index.name='姓名'
    >>> en
    姓名
    张三    84
    李四    94
    王五    51
    赵六    81
    Name: 英语成绩, dtype: int64
    
    #The index can be modified
    >>> en.index=['zs','ll','ww','zl']
    >>> en
    zs    84
    ll    94
    ww    51
    zl    81
    Name: 英语成绩, dtype: int64
    
    #############DataFrame##############
    
    >>> data={
    	'name':['张三','张三','张三','李四','李四','李四'],
    	'year':[2001,2002,2003,2001,2002,2003],
    	'weight':[54,50,60,61,63,65],
    }
    >>> frame=DataFrame(data)
    >>> frame
      name  weight  year
    0   张三      54  2001
    1   张三      50  2002
    2   张三      60  2003
    3   李四      61  2001
    4   李四      63  2002
    5   李四      65  2003
    
    #columns can change the display order and the selection of columns
    >>> DataFrame(data,columns=['year','weight','name'])
       year  weight name
    0  2001      54   张三
    1  2002      50   张三
    2  2003      60   张三
    3  2001      61   李四
    4  2002      63   李四
    5  2003      65   李四
    
    >>> a=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
    >>> a
           year  weight name  sex
    one    2001      54   张三  NaN
    two    2002      50   张三  NaN
    three  2003      60   张三  NaN
    four   2001      61   李四  NaN
    five   2002      63   李四  NaN
    five   2003      65   李四  NaN
    
    #Querying with duplicate index labels: fetch one or several rows
    >>> a.ix['five']
          year  weight name  sex
    five  2002      63   李四  NaN
    five  2003      65   李四  NaN
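    #.ix was later deprecated; in modern pandas the same duplicate-label
    #lookup is written with .loc (a sketch, not part of the original session):
    >>> a.loc['five']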
    
    #DataFrame --> Series dimensionality reduction
    #Fetch a single column
    >>> info=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
    >>> info['name']
    one      张三
    two      张三
    three    张三
    four     李四
    five     李四
    five     李四
    Name: name, dtype: object
    
    #Column assignment
    >>> info['sex']='男'
    >>> info
           year  weight name sex
    one    2001      54   张三   男
    two    2002      50   张三   男
    three  2003      60   张三   男
    four   2001      61   李四   男
    five   2002      63   李四   男
    five   2003      65   李四   男
    
    #Column assignment - assigning only part of a column via a Series
    >>> val=Series(['man','woman','man'],index=['two','four','five'])
    >>> info['sex']=val
    >>> info
           year  weight name    sex
    one    2001      54   张三    NaN
    two    2002      50   张三    man
    three  2003      60   张三    NaN
    four   2001      61   李四  woman
    five   2002      63   李四    man
    five   2003      65   李四    man
    
    #Create and assign a column that does not yet exist
    >>> info['sexflag']=info.sex=='man'
    >>> info
           year  weight name    sex sexflag
    one    2001      54   张三    NaN   False
    two    2002      50   张三    man    True
    three  2003      60   张三    NaN   False
    four   2001      61   李四  woman   False
    five   2002      63   李四    man    True
    five   2003      65   李四    man    True
    
    #Delete a column
    >>> del info['sex']
    >>> info
           year  weight name sexflag
    one    2001      54   张三   False
    two    2002      50   张三    True
    three  2003      60   张三   False
    four   2001      61   李四   False
    five   2002      63   李四    True
    five   2003      65   李四    True
    
    #Nested dict -----convert---> DataFrame
    #Outer keys become the columns; inner keys become the rows
    >>> studata={'张三':{'语文':91,'数学':99,'物理':90},'李四':{'语文':31,'数学':65,'物理':45}}
    >>> info2=DataFrame(studata)
    >>> info2
        张三  李四
    数学  99  65
    物理  90  45
    语文  91  31
    >>> info2.T
        数学  物理  语文
    张三  99  90  91
    李四  65  45  31
    
    #The index.name and columns.name attributes
    >>> info
           year  weight name sexflag
    one    2001      54   张三   False
    two    2002      50   张三    True
    three  2003      60   张三   False
    four   2001      61   李四   False
    five   2002      63   李四    True
    five   2003      65   李四    True
    >>> info.index.name='个人信息'
    >>> info.columns.name='索引'
    >>> info
    索引 year  weight name sexflag
    个人信息                            
    one    2001      54   张三   False
    two    2002      50   张三    True
    three  2003      60   张三   False
    four   2001      61   李四   False
    five   2002      63   李四    True
    five   2003      65   李四    True
    
    >>> info.index
    Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'个人信息')
    #Deduplicate the index labels
    >>> info.index.unique
    <bound method Index.unique of Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'个人信息')>
    >>> info.index.unique()
    array(['one', 'two', 'three', 'four', 'five'], dtype=object)
    #Whether the labels are unique
    >>> info.index.is_unique
    False
    #Returns True when every element is greater than or equal to the previous one
    >>> DataFrame(range(1,4),index=range(1,4)).index.is_monotonic
    True
    >>> info.index.is_monotonic
    False
    #Drop the passed values and get a new Index
    >>> DataFrame(range(1,4),index=range(1,4)).index.drop(1)
    Int64Index([2, 3], dtype='int64')
    
    >>> obj=Series([33,23],index=['a','b'])
    >>> obj
    a    33
    b    23
    dtype: int64
    >>> obj2=obj.reindex(['b','a','c'])
    >>> obj2
    b    23.0
    a    33.0
    c     NaN
    dtype: float64
    >>> obj2=obj.reindex(['b','a','c'],fill_value=0)
    >>> obj2
    b    23
    a    33
    c     0
    dtype: int64
    
    >>> obj3=Series(['blue','purple','yellow'],index=[0,2,4])
    >>> obj3
    0      blue
    2    purple
    4    yellow
    dtype: object
    #ffill: forward fill
    >>> obj3.reindex(range(6),method='ffill')
    0      blue
    1      blue
    2    purple
    3    purple
    4    yellow
    5    yellow
    dtype: object
    #bfill: backward fill
    >>> obj3.reindex(range(6),method='bfill')
    0      blue
    1    purple
    2    purple
    3    yellow
    4    yellow
    5       NaN
    dtype: object
    
    >>> frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','d'],columns=['Ohio','Texas','california'])
    >>> frame
       Ohio  Texas  california
    a     0      1           2
    b     3      4           5
    d     6      7           8
    #Reindex the rows
    >>> frame2=frame.reindex(['a','b','c','d'])
    >>> frame2
       Ohio  Texas  california
    a   0.0    1.0         2.0
    b   3.0    4.0         5.0
    c   NaN    NaN         NaN
    d   6.0    7.0         8.0
    #Reindex the columns
    >>> cols=['Texas','Ohio','uknown']
    >>> frame.reindex(columns=cols)
       Texas  Ohio  uknown
    a      1     0     NaN
    b      4     3     NaN
    d      7     6     NaN
    
    >>> frame.reindex(index=['a','b','c','d'],method='ffill',columns=cols)
       Texas  Ohio  uknown
    a      1     0     NaN
    b      4     3     NaN
    c      4     3     NaN
    d      7     6     NaN
    >>> frame.ix[['a','b','c','d'],cols]
       Texas  Ohio  uknown
    a    1.0   0.0     NaN
    b    4.0   3.0     NaN
    c    NaN   NaN     NaN
    d    7.0   6.0     NaN
    
    
    >>> data=frame.ix[['a','b','c','d'],cols]
    >>> data
       Texas  Ohio  uknown
    a    1.0   0.0     NaN
    b    4.0   3.0     NaN
    c    NaN   NaN     NaN
    d    7.0   6.0     NaN
    #Drop rows; pass axis=1 to drop a column
    >>> data.drop(['c','b'])
       Texas  Ohio  uknown
    a    1.0   0.0     NaN
    d    7.0   6.0     NaN
    >>> data.drop('uknown',axis=1)
       Texas  Ohio
    a    1.0   0.0
    b    4.0   3.0
    c    NaN   NaN
    d    7.0   6.0
    
    #Conditional queries on a column
    >>> info[info['weight']>60]
    索引 year  weight name sexflag
    个人信息                           
    four  2001      61   李四   False
    five  2002      63   李四    True
    five  2003      65   李四    True
    
    #Select one row and several columns
    >>> info.ix['one',['name','year']]
    索引
    name      张三
    year    2001
    Name: one, dtype: object
    
    
    >>> data=DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','NewYork'],columns=['one','two','three','four'])
    >>> data
              one  two  three  four
    Ohio        0    1      2     3
    Colorado    4    5      6     7
    Utah        8    9     10    11
    NewYork    12   13     14    15
    >>> data['two']
    Ohio         1
    Colorado     5
    Utah         9
    NewYork     13
    Name: two, dtype: int64
    >>> data[['three','one']]
              three  one
    Ohio          2    0
    Colorado      6    4
    Utah         10    8
    NewYork      14   12
    >>> 
    >>> data[:2]
              one  two  three  four
    Ohio        0    1      2     3
    Colorado    4    5      6     7
    >>> data[data['three']>5]
              one  two  three  four
    Colorado    4    5      6     7
    Utah        8    9     10    11
    NewYork    12   13     14    15
    
    >>> data<5
                one    two  three   four
    Ohio       True   True   True   True
    Colorado   True  False  False  False
    Utah      False  False  False  False
    NewYork   False  False  False  False
    >>> data[data<5]=0
    >>> data
              one  two  three  four
    Ohio        0    0      0     0
    Colorado    0    5      6     7
    Utah        8    9     10    11
    NewYork    12   13     14    15
    
    #行列组合查询
    >>> data.ix['Colorado',['two','three']]
    two      5
    three    6
    Name: Colorado, dtype: int64
    >>> data.ix[['Colorado','Utah'],[3,0,1]]
              four  one  two
    Colorado     7    0    5
    Utah        11    8    9
    
    >>> data.ix[:'Utah','two']
    Ohio        0
    Colorado    5
    Utah        9
    Name: two, dtype: int64
    >>> 
    
    >>> data.ix[data.three>5,:3]
              one  two  three
    Colorado    0    5      6
    Utah        8    9     10
    NewYork    12   13     14
    
    #obj[val]          select a single column or a group of columns from the DataFrame; handy in some special cases
    #obj.ix[val]       select a single row or a group of rows
    #obj.ix[:,val]     select a single column or a subset of columns
    #obj.ix[val1,val2] select rows and columns at the same time
    #reindex           conform the data to a new index (modern .loc/.iloc equivalents are sketched below)
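    #.ix was eventually removed from pandas; rough modern equivalents of the
    #patterns above use .loc (labels) and .iloc (positions); a sketch against
    #the data frame from the example above:
    >>> data.loc['Colorado',['two','three']]   # obj.ix[val1,val2]
    >>> data.iloc[[1,2],[3,0,1]]               # select by integer position
    >>> data.loc[data.three>5,:'three']        # boolean rows plus a column slice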
    
    #Data alignment in DataFrame
    >>> df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['good','bad','normal'])
    >>> df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','normal','bad','supper'])
    >>> df1
              b    c    d
    good    0.0  1.0  2.0
    bad     3.0  4.0  5.0
    normal  6.0  7.0  8.0
    >>> df2
              b     d     e
    good    0.0   1.0   2.0
    normal  3.0   4.0   5.0
    bad     6.0   7.0   8.0
    supper  9.0  10.0  11.0
    
    >>> df1+df2
              b   c     d   e
    bad     9.0 NaN  12.0 NaN
    good    0.0 NaN   3.0 NaN
    normal  9.0 NaN  12.0 NaN
    supper  NaN NaN   NaN NaN
    
    #Missing slots are treated as 0 when adding
    >>> df1.add(df2,fill_value=0)
              b    c     d     e
    bad     9.0  4.0  12.0   8.0
    good    0.0  1.0   3.0   2.0
    normal  9.0  7.0  12.0   5.0
    supper  9.0  NaN  10.0  11.0
    #fill_value during reindex
    >>> df1.reindex(columns=df2.columns,fill_value=0)
              b    d  e
    good    0.0  2.0  0
    bad     3.0  5.0  0
    normal  6.0  8.0  0
    
    #Other arithmetic methods (a short sketch follows the list):
    add +
    sub -
    div /
    mul *
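    #The same alignment rules and fill_value apply to each of them; a short
    #sketch reusing df1 and df2 from the example above:
    >>> df1.sub(df2,fill_value=0)   # aligned subtraction, absent slots count as 0
    >>> df1.mul(df2,fill_value=1)   # aligned multiplication, absent slots count as 1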
    
    
    Arithmetic between DataFrame and Series
    >>> frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','bad','supper','uknown'])
    >>> frame
              b     d     e
    good    0.0   1.0   2.0
    bad     3.0   4.0   5.0
    supper  6.0   7.0   8.0
    uknown  9.0  10.0  11.0
    >>> series=frame.ix[0]
    >>> series
    b    0.0
    d    1.0
    e    2.0
    Name: good, dtype: float64
    >>> 
    >>> frame-series
              b    d    e
    good    0.0  0.0  0.0
    bad     3.0  3.0  3.0
    supper  6.0  6.0  6.0
    uknown  9.0  9.0  9.0
    
    #Arithmetic between a frame and a Series broadcasts
    >>> series2=Series(range(3),index=list('bef'))
    >>> series2
    b    0
    e    1
    f    2
    dtype: int64
    >>> frame+series2
              b   d     e   f
    good    0.0 NaN   3.0 NaN
    bad     3.0 NaN   6.0 NaN
    supper  6.0 NaN   9.0 NaN
    uknown  9.0 NaN  12.0 NaN
    
    #Match on the index and broadcast across the columns
    >>> series3=frame['d']
    >>> frame.sub(series3,axis=0)
              b    d    e
    good   -1.0  0.0  1.0
    bad    -1.0  0.0  1.0
    supper -1.0  0.0  1.0
    uknown -1.0  0.0  1.0
    
    
    >>> frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['good','bad','nice','supper'])
    >>> frame
                   b         d         e
    good    0.428420 -0.951975  0.862226
    bad    -0.666254 -0.988423  2.442255
    nice    1.617591  0.377867 -1.069077
    supper -1.417150  0.449853  0.685007
    #Convert everything to absolute values
    >>> np.abs(frame)
                   b         d         e
    good    0.428420  0.951975  0.862226
    bad     0.666254  0.988423  2.442255
    nice    1.617591  0.377867  1.069077
    supper  1.417150  0.449853  0.685007
    
    >>> f=lambda x: x.max()-x.min()
    >>> frame.apply(f,axis=0)
    b    3.034740
    d    1.438276
    e    3.511332
    dtype: float64
    >>> frame.apply(f,axis=1)
    good      1.814201
    bad       3.430677
    nice      2.686668
    supper    2.102157
    dtype: float64
    
    >>> def f(x):return Series([x.min(),x.max()],index=['min','max'])
    ... 
    >>> frame.apply(f)
                b         d         e
    min -1.417150 -0.988423 -1.069077
    max  1.617591  0.449853  2.442255
    
    #Format every element
    >>> format=lambda x:'%.2f' % x
    >>> frame.applymap(format)
                b      d      e
    good     0.43  -0.95   0.86
    bad     -0.67  -0.99   2.44
    nice     1.62   0.38  -1.07
    supper  -1.42   0.45   0.69
    
    
    
    #############Sorting and ranking#############
    #ascending: sort in ascending or descending order
    >>> frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=list('nalv'))
    >>> frame
           n  a  l  v
    three  0  1  2  3
    one    4  5  6  7
    >>> frame.sort_index()
           n  a  l  v
    one    4  5  6  7
    three  0  1  2  3
    >>> frame.sort_index(axis=1)
           a  l  n  v
    three  1  2  0  3
    one    5  6  4  7
    >>> frame.sort_index(axis=1,ascending=False)
           v  n  l  a
    three  3  0  2  1
    one    7  4  6  5
    
    >>> obj=Series([4,5,-3,2])
    >>> obj.order()
    2   -3
    3    2
    0    4
    1    5
    dtype: int64
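    #order() comes from older pandas; later versions dropped it in favor of
    #sort_values(), which behaves the same way here:
    >>> obj.sort_values()
    >>> obj.sort_values(ascending=False)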
    
    #Sort by column v in descending order
    >>> frame.sort_index(axis=0,ascending=False,by='v')
           n  a  l  v
    one    4  5  6  7
    three  0  1  2  3
    
    >>> frame.sort_index(axis=0,ascending=False,by=['v','l'])
           n  a  l  v
    one    4  5  6  7
    three  0  1  2  3
    
    >>> obj=Series([7,-5,7,4,2,0,4])
    >>> obj.rank(method='first')
    0    6.0
    1    1.0
    2    7.0
    3    4.0
    4    3.0
    5    2.0
    6    5.0
    dtype: float64
    >>> obj.rank(ascending=False,method='max')
    0    2.0
    1    7.0
    2    2.0
    3    4.0
    4    5.0
    5    6.0
    6    4.0
    dtype: float64
    
    
    >>> DataFrame(studata).T
        数学  物理  语文
    张三  99  90  91
    李四  65  45  31
    >>> DataFrame(studata).T.rank(axis=1,ascending=False)
         数学   物理   语文
    张三  1.0  3.0  2.0
    李四  1.0  2.0  3.0
    >>> DataFrame(studata).T.rank(axis=0,ascending=False)
         数学   物理   语文
    张三  1.0  1.0  1.0
    李四  2.0  2.0  2.0
    
    
    >>> datastu=pd.read_csv('/Users/similarface/Downloads/jnn.csv')
    >>> datastu
               准考证号   姓名  班级     语文  数学     英语  化学  物理
    0  304040250124   罗茜   1  101.0  94  102.5  79  74
    1  304040250128  沈怡君   1   91.5  96   69.0  82  69
    2  304040250321   魏华   2   74.0  28   42.0  56  56
    3  304040250233  何仕林   2   60.5  42   34.5  49  46
    4  304040250725   屈妮   5   93.5  63   77.5  55  66
    5  304040250709  邓培蓓   5  102.5  81   47.0  65  58
    6  304040250805  郑清霞   5   89.0  80   63.5  63  65
    7  304040250827   明杨   6  108.5  92   79.0  89  83
    8  304040250819   李倩   6   93.5  61   44.0  45  32
    9  304040250912  江明悦   6    0.0   0    0.0   0   0
    
    >>> datastu.rank(axis=1,ascending=False,method='min')
       准考证号   姓名   班级   语文   数学   英语   化学   物理
    0   2.0  1.0  8.0  4.0  5.0  3.0  6.0  7.0
    1   2.0  1.0  8.0  4.0  3.0  6.0  5.0  6.0
    2   2.0  1.0  8.0  3.0  7.0  6.0  4.0  4.0
    3   2.0  1.0  8.0  3.0  6.0  7.0  4.0  5.0
    4   2.0  1.0  8.0  3.0  6.0  4.0  7.0  5.0
    5   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
    6   2.0  1.0  8.0  3.0  4.0  6.0  7.0  5.0
    7   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
    8   2.0  1.0  8.0  3.0  4.0  6.0  5.0  7.0
    9   2.0  1.0  3.0  4.0  4.0  4.0  4.0  4.0
    >>> datastu.rank(axis=0,ascending=False,method='min')
       准考证号    姓名   班级    语文    数学    英语    化学    物理
    0  10.0   4.0  9.0   3.0   2.0   1.0   3.0   2.0
    1   9.0   5.0  9.0   6.0   1.0   4.0   2.0   3.0
    2   7.0   1.0  7.0   8.0   9.0   8.0   6.0   7.0
    3   8.0  10.0  7.0   9.0   8.0   9.0   8.0   8.0
    4   5.0   9.0  4.0   4.0   6.0   3.0   7.0   4.0
    5   6.0   3.0  4.0   2.0   4.0   6.0   4.0   6.0
    6   4.0   2.0  4.0   7.0   5.0   5.0   5.0   5.0
    7   2.0   8.0  1.0   1.0   3.0   2.0   1.0   1.0
    8   3.0   7.0  1.0   4.0   7.0   7.0   9.0   9.0
    9   1.0   6.0  1.0  10.0  10.0  10.0  10.0  10.0
    
    >>> data=datastu[['语文','数学','物理','英语','化学']]
    >>> data
          语文  数学  物理     英语  化学
    0  101.0  94  74  102.5  79
    1   91.5  96  69   69.0  82
    2   74.0  28  56   42.0  56
    3   60.5  42  46   34.5  49
    4   93.5  63  66   77.5  55
    5  102.5  81  58   47.0  65
    6   89.0  80  65   63.5  63
    7  108.5  92  83   79.0  89
    8   93.5  61  32   44.0  45
    9    0.0   0   0    0.0   0
    
    
    >>> data.sum()
    语文    814.0
    数学    637.0
    物理    549.0
    英语    559.0
    化学    583.0
    dtype: float64
    
    >>> data.sum(axis=1)
    0    450.5
    1    407.5
    2    256.0
    3    232.0
    4    355.0
    5    353.5
    6    360.5
    7    451.5
    8    275.5
    9      0.0
    dtype: float64
    
    #axis: the axis to reduce over
    #skipna: exclude missing values (NaN); True by default (a quick sketch follows)
    #level: group by level when the axis is hierarchically indexed
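    #A quick sketch of skipna (the frame name data2 is illustrative only):
    >>> data2=DataFrame({'x':[1.0,np.nan,3.0]})
    >>> data2.sum()              # x = 4.0, NaN excluded by default
    >>> data2.sum(skipna=False)  # x = NaN, the missing value propagates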
    
    
    >>> data
          语文  数学  物理     英语  化学
    0  101.0  94  74  102.5  79
    1   91.5  96  69   69.0  82
    2   74.0  28  56   42.0  56
    3   60.5  42  46   34.5  49
    4   93.5  63  66   77.5  55
    5  102.5  81  58   47.0  65
    6   89.0  80  65   63.5  63
    7  108.5  92  83   79.0  89
    8   93.5  61  32   44.0  45
    9    0.0   0   0    0.0   0
    #Indirect statistics: the index of the maximum
    >>> data.idxmax()
    语文    7   [the highest 语文 score is at index 7]
    数学    1   [the highest 数学 score is at index 1]
    物理    7   [the highest 物理 score is at index 7]
    英语    0   [the highest 英语 score is at index 0]
    化学    7   [the highest 化学 score is at index 7]
    dtype: int64
    #Cumulative sum
    >>> data.cumsum()
          语文     数学     物理     英语     化学
    0  101.0   94.0   74.0  102.5   79.0
    1  192.5  190.0  143.0  171.5  161.0
    2  266.5  218.0  199.0  213.5  217.0
    3  327.0  260.0  245.0  248.0  266.0
    4  420.5  323.0  311.0  325.5  321.0
    5  523.0  404.0  369.0  372.5  386.0
    6  612.0  484.0  434.0  436.0  449.0
    7  720.5  576.0  517.0  515.0  538.0
    8  814.0  637.0  549.0  559.0  583.0
    9  814.0  637.0  549.0  559.0  583.0
    
    >>> data.describe()
                   语文        数学         物理          英语         化学
    count   10.000000  10.00000  10.000000   10.000000  10.000000
    mean    81.400000  63.70000  54.900000   55.900000  58.300000
    std     31.857146  31.86447  24.052951   28.670349  25.117723
    min      0.000000   0.00000   0.000000    0.000000   0.000000
    25%     77.750000  46.75000  48.500000   42.500000  50.500000
    50%     92.500000  71.50000  61.500000   55.250000  59.500000
    75%     99.125000  89.25000  68.250000   75.375000  75.500000
    max    108.500000  96.00000  83.000000  102.500000  89.000000
    
    '''
    
    DataFrame.abs()	Return an object with absolute value taken–only applicable to objects that are all numeric.
    DataFrame.all([axis, bool_only, skipna, level])	Return whether all elements are True over requested axis
    DataFrame.any([axis, bool_only, skipna, level])	Return whether any element is True over requested axis
    DataFrame.clip([lower, upper, out, axis])	Trim values at input threshold(s).
    DataFrame.clip_lower(threshold[, axis])	Return copy of the input with values below given value(s) truncated.
    DataFrame.clip_upper(threshold[, axis])	Return copy of input with values above given value(s) truncated.
    DataFrame.corr([method, min_periods])	Compute pairwise correlation of columns, excluding NA/null values
    DataFrame.corrwith(other[, axis, drop])	Compute pairwise correlation between rows or columns of two DataFrame objects.
    DataFrame.count([axis, level, numeric_only])	Return Series with number of non-NA/null observations over requested axis.
    DataFrame.cov([min_periods])	Compute pairwise covariance of columns, excluding NA/null values
    DataFrame.cummax([axis, dtype, out, skipna])	Return cumulative max over requested axis.
    DataFrame.cummin([axis, dtype, out, skipna])	Return cumulative min over requested axis.
    DataFrame.cumprod([axis, dtype, out, skipna])	Return cumulative prod over requested axis.
    DataFrame.cumsum([axis, dtype, out, skipna])	Return cumulative sum over requested axis.
    DataFrame.describe([percentiles, include, ...])	Generate various summary statistics, excluding NaN values.
    First discrete difference (useful for time series): DataFrame.diff([periods, axis])	1st discrete difference of object
    DataFrame.eval(expr[, inplace])	Evaluate an expression in the context of the calling DataFrame instance.
    Sample kurtosis (fourth moment): DataFrame.kurt([axis, skipna, level, ...])	Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
    Mean absolute deviation: DataFrame.mad([axis, skipna, level])	Return the mean absolute deviation of the values for the requested axis
    DataFrame.max([axis, skipna, level, ...])	This method returns the maximum of the values in the object.
    DataFrame.mean([axis, skipna, level, ...])	Return the mean of the values for the requested axis
    DataFrame.median([axis, skipna, level, ...])	Return the median of the values for the requested axis
    DataFrame.min([axis, skipna, level, ...])	This method returns the minimum of the values in the object.
    DataFrame.mode([axis, numeric_only])	Gets the mode(s) of each element along the axis selected.
    Percent change: DataFrame.pct_change([periods, fill_method, ...])	Percent change over given number of periods.
    DataFrame.prod([axis, skipna, level, ...])	Return the product of the values for the requested axis
    DataFrame.quantile([q, axis, numeric_only, ...])	Return values at the given quantile over requested axis, a la numpy.percentile.
    DataFrame.rank([axis, method, numeric_only, ...])	Compute numerical data ranks (1 through n) along axis.
    DataFrame.round([decimals, out])	Round a DataFrame to a variable number of decimal places.
    DataFrame.sem([axis, skipna, level, ddof, ...])	Return unbiased standard error of the mean over requested axis.
    Sample skewness (third moment): DataFrame.skew([axis, skipna, level, ...])	Return unbiased skew over requested axis
    DataFrame.sum([axis, skipna, level, ...])	Return the sum of the values for the requested axis
    Standard deviation: DataFrame.std([axis, skipna, level, ddof, ...])	Return sample standard deviation over requested axis.
    Variance: DataFrame.var([axis, skipna, level, ddof, ...])	Return unbiased variance over requested axis.
    '''
    
    
    >>> import pandas.io.data as web
    >>> all_data={}
    >>> for ticker in ['AAPL','IBM','MSFT','GOOG']: all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
    >>> price=DataFrame({tic:data['Adj Close'] for tic ,data in all_data.iteritems()})
    >>> volume=DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
    >>> returns=price.pct_change()
    >>> returns.tail()
                    AAPL      GOOG       IBM      MSFT
    Date                                              
    2009-12-24  0.034339  0.011117  0.004385  0.002587
    2009-12-28  0.012294  0.007098  0.013326  0.005484
    2009-12-29 -0.011861 -0.005571 -0.003477  0.007058
    2009-12-30  0.012147  0.005376  0.005461 -0.013699
    2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
    #Compute correlation coefficients
    >>> returns.IBM.corr(returns.GOOG)
    0.39068882087254675
    >>> returns.corrwith(returns.IBM)
    AAPL    0.410011
    GOOG    0.390689
    IBM     1.000000
    MSFT    0.495980
    dtype: float64
    
    >>> returns.corrwith(volume)
    AAPL   -0.057549
    GOOG    0.062647
    IBM    -0.007892
    MSFT   -0.014245
    dtype: float64
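    #pandas.io.data was later split out of pandas into the separate
    #pandas-datareader package; roughly the same download would look like this
    #(a sketch, assuming the package is installed and the Yahoo endpoint still responds):
    >>> from pandas_datareader import data as web
    >>> aapl=web.get_data_yahoo('AAPL','2000-01-01','2010-01-01')
    >>> returns=aapl['Adj Close'].pct_change()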
    
    >>> obj=Series(['c','b','c','c','d','a','g','b'])
    >>> obj.value_counts()
    c    3
    b    2
    g    1
    d    1
    a    1
    dtype: int64
    >>> pd.value_counts(obj.values,sort=False)
    a    1
    c    3
    b    2
    d    1
    g    1
    dtype: int64
    
    #Membership test
    >>> mask=obj.isin(['b','c'])
    >>> mask
    0     True
    1     True
    2     True
    3     True
    4    False
    5    False
    6    False
    7     True
    dtype: bool
    
    >>> obj[mask]
    0    c
    1    b
    2    c
    3    c
    7    b
    dtype: object
    
    #Frequency table (histogram) of the values in each column
    >>> data=DataFrame({'Qu1':[1,3,4,5,3],'Qu2':[2,4,1,2,4],'Qu3':[3,4,2,1,1]})
    >>> data
       Qu1  Qu2  Qu3
    0    1    2    3
    1    3    4    4
    2    4    1    2
    3    5    2    1
    4    3    4    1
    >>> data.apply(pd.value_counts).fillna(0)
       Qu1  Qu2  Qu3
    1  1.0  1.0  2.0
    2  0.0  2.0  1.0
    3  2.0  0.0  1.0
    4  1.0  2.0  1.0
    5  1.0  0.0  0.0
    
    
    
    #Handling missing data
    >>> string_data=Series(['张三','李四',np.nan,'赵六'])
    >>> string_data
    0     张三
    1     李四
    2    NaN
    3     赵六
    dtype: object
    >>> string_data.isnull()
    0    False
    1    False
    2     True
    3    False
    dtype: bool
    
    ######Filtering out missing data
    >>> from numpy import nan as NA
    >>> data=Series([1,NA,3.5,NA,7])
    >>> data.dropna()
    0    1.0
    2    3.5
    4    7.0
    dtype: float64
    >>> data
    0    1.0
    1    NaN
    2    3.5
    3    NaN
    4    7.0
    dtype: float64
    >>> data[data.notnull()]
    0    1.0
    2    3.5
    4    7.0
    dtype: float64
    
    #By default DataFrame.dropna drops any row containing an NA
    >>> data=DataFrame([[1.,6.5,3.],[1,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
    >>> data
         0    1    2
    0  1.0  6.5  3.0
    1  1.0  NaN  NaN
    2  NaN  NaN  NaN
    3  NaN  6.5  3.0
    >>> data.dropna()
         0    1    2
    0  1.0  6.5  3.0
    #how='all' drops only rows that are entirely NA
    >>> data.dropna(how='all')
         0    1    2
    0  1.0  6.5  3.0
    1  1.0  NaN  NaN
    3  NaN  6.5  3.0
    
    #Drop columns that are entirely null
    >>> data[4]=NA
    >>> data
         0    1    2   4
    0  1.0  6.5  3.0 NaN
    1  1.0  NaN  NaN NaN
    2  NaN  NaN  NaN NaN
    3  NaN  6.5  3.0 NaN
    >>> data.dropna(axis=1,how='all')
         0    1    2
    0  1.0  6.5  3.0
    1  1.0  NaN  NaN
    2  NaN  NaN  NaN
    3  NaN  6.5  3.0
    
    #thresh: keep only rows holding at least that many non-NA values
    >>> df=DataFrame(np.random.randn(7,3))
    >>> df.ix[:4,1]=NA; df.ix[:2,2]=NA
    >>> df.dropna(thresh=3)
              0         1         2
    5  0.519277  1.182077 -0.500918
    6 -0.050867 -0.051302  1.368309
    
    #Filling in missing data
    >>> df.fillna(-1)
              0         1         2
    0  0.581403 -1.000000 -1.000000
    1 -1.709160 -1.000000 -1.000000
    2  2.496074 -1.000000 -1.000000
    3  0.329339 -1.000000  0.736299
    4 -0.638106 -1.000000  0.756044
    5  0.519277  1.182077 -0.500918
    6 -0.050867 -0.051302  1.368309
    #Per-column fill values
    >>> df.fillna({1:0.5,3:-1})
              0         1         2
    0  0.581403  0.500000       NaN
    1 -1.709160  0.500000       NaN
    2  2.496074  0.500000       NaN
    3  0.329339  0.500000  0.736299
    4 -0.638106  0.500000  0.756044
    5  0.519277  1.182077 -0.500918
    6 -0.050867 -0.051302  1.368309
    
    #inplace=True modifies the original object (by default a new object is returned)
    >>> df.fillna({1:0.5,3:-1},inplace=True)
              0         1         2
    0  0.581403  0.500000       NaN
    1 -1.709160  0.500000       NaN
    2  2.496074  0.500000       NaN
    3  0.329339  0.500000  0.736299
    4 -0.638106  0.500000  0.756044
    5  0.519277  1.182077 -0.500918
    6 -0.050867 -0.051302  1.368309
    >>> df
              0         1         2
    0  0.581403  0.500000       NaN
    1 -1.709160  0.500000       NaN
    2  2.496074  0.500000       NaN
    3  0.329339  0.500000  0.736299
    4 -0.638106  0.500000  0.756044
    5  0.519277  1.182077 -0.500918
    6 -0.050867 -0.051302  1.368309
    
    >>> info=DataFrame(np.random.randn(6,3))
    >>> info.ix[:2,1]=NA;info.ix[4:,2]=NA
    >>> info
              0         1         2
    0  1.217480       NaN  0.479981
    1 -2.104463       NaN -2.917539
    2 -2.141440       NaN -1.371574
    3  0.925971  1.697813  0.814347
    4 -1.463290 -0.526497       NaN
    5 -0.300475  0.839098       NaN
    #limit caps the number of consecutively filled rows
    >>> info.fillna(method='bfill',limit=1)
              0         1         2
    0  1.217480       NaN  0.479981
    1 -2.104463       NaN -2.917539
    2 -2.141440  1.697813 -1.371574
    3  0.925971  1.697813  0.814347
    4 -1.463290 -0.526497       NaN
    5 -0.300475  0.839098       NaN
    
    #Hierarchical indexing
    >>> data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
    >>> data
    a  1    1.148945
       2   -0.489120
       3    1.151546
    b  1    0.840938
       2   -1.992375
       3    0.039002
    c  1    2.157531
       2    0.963063
    d  2    0.130796
       3    0.012320
    dtype: float64
    >>> data.index
    MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
               labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
    >>> data['b']
    1    0.840938
    2   -1.992375
    3    0.039002
    dtype: float64
    >>> data['b':'c']
    b  1    0.840938
       2   -1.992375
       3    0.039002
    c  1    2.157531
       2    0.963063
    dtype: float64
    >>> data.ix[['b','d']]
    b  1    0.840938
       2   -1.992375
       3    0.039002
    d  2    0.130796
       3    0.012320
    dtype: float64
    
    >>> data[:,2]
    a   -0.489120
    b   -1.992375
    c    0.963063
    d    0.130796
    dtype: float64
    
    #Convert to a DataFrame
    >>> data.unstack()
              1         2         3
    a  1.148945 -0.489120  1.151546
    b  0.840938 -1.992375  0.039002
    c  2.157531  0.963063       NaN
    d       NaN  0.130796  0.012320
    
    >>> data.unstack().stack()
    a  1    1.148945
       2   -0.489120
       3    1.151546
    b  1    0.840938
       2   -1.992375
       3    0.039002
    c  1    2.157531
       2    0.963063
    d  2    0.130796
       3    0.012320
    dtype: float64
    
    >>> frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['good','good','bad'],['G','R','G']])
    >>> frame
        good     bad
           G   R   G
    a 1    0   1   2
      2    3   4   5
    b 1    6   7   8
      2    9  10  11
    
    >>> frame.index.names=['key1','key2']
    >>> frame.columns.names=['state','color']
    >>> frame
    state     good     bad
    color        G   R   G
    key1 key2             
    a    1       0   1   2
         2       3   4   5
    b    1       6   7   8
         2       9  10  11
    
    >>> frame['good']
    color      G   R
    key1 key2       
    a    1     0   1
         2     3   4
    b    1     6   7
         2     9  10
    
    #Rearranging level order
    >>> frame.swaplevel('key1','key2')
    state     good     bad
    color        G   R   G
    key2 key1             
    1    a       0   1   2
    2    a       3   4   5
    1    b       6   7   8
    2    b       9  10  11
    >>> frame.sortlevel(1)
    state     good     bad
    color        G   R   G
    key1 key2             
    a    1       0   1   2
    b    1       6   7   8
    a    2       3   4   5
    b    2       9  10  11
    >>> frame.swaplevel(0,1).sortlevel(0)
    state     good     bad
    color        G   R   G
    key2 key1             
    1    a       0   1   2
         b       6   7   8
    2    a       3   4   5
         b       9  10  11
    #Summing by level
    >>> frame.sum(level='key2')
    state good     bad
    color    G   R   G
    key2              
    1        6   8  10
    2       12  14  16
    >>> frame.sum(level='color',axis=1)
    color       G   R
    key1 key2        
    a    1      2   1
         2      8   4
    b    1     14   7
         2     20  10
    
    #Using a DataFrame's columns as its index
    >>> frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
    >>> frame
       a  b    c  d
    0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
    3  3  4  two  0
    4  4  3  two  1
    5  5  2  two  2
    6  6  1  two  3
    >>> frame2=frame.set_index(['c','d'])
    >>> frame2
           a  b
    c   d      
    one 0  0  7
        1  1  6
        2  2  5
    two 0  3  4
        1  4  3
        2  5  2
        3  6  1
    >>> frame2=frame.set_index(['c','d'],drop=False)
    >>> frame2
           a  b    c  d
    c   d              
    one 0  0  7  one  0
        1  1  6  one  1
        2  2  5  one  2
    two 0  3  4  two  0
        1  4  3  two  1
        2  5  2  two  2
        3  6  1  two  3
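
    #reset_index is the inverse of set_index; the hierarchical index levels
    #move back into columns (a sketch on the frame above):
    >>> frame.set_index(['c','d']).reset_index()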
     
    ##############Reading files################
    >>> os.system('cat  /Users/similarface/Downloads/jnn.csv')
    准考证号,姓名,班级,语文,数学,英语,化学,物理
    304040250124,罗茜,1,101,94,102.5,79,74
    304040250128,沈怡君,1,91.5,96,69,82,69
    304040250321,魏华,2,74,28,42,56,56
    304040250233,何仕林,2,60.5,42,34.5,49,46
    304040250725,屈妮,5,93.5,63,77.5,55,66
    304040250709,邓培蓓,5,102.5,81,47,65,58
    304040250805,郑清霞,5,89,80,63.5,63,65
    304040250827,明杨,6,108.5,92,79,89,83
    304040250819,李倩,6,93.5,61,44,45,32
    304040250912,江明悦,6,0,0,0,0,00
    >>> pd.read_csv('/Users/similarface/Downloads/jnn.csv')
               准考证号   姓名  班级     语文  数学     英语  化学  物理
    0  304040250124   罗茜   1  101.0  94  102.5  79  74
    1  304040250128  沈怡君   1   91.5  96   69.0  82  69
    2  304040250321   魏华   2   74.0  28   42.0  56  56
    3  304040250233  何仕林   2   60.5  42   34.5  49  46
    4  304040250725   屈妮   5   93.5  63   77.5  55  66
    5  304040250709  邓培蓓   5  102.5  81   47.0  65  58
    6  304040250805  郑清霞   5   89.0  80   63.5  63  65
    7  304040250827   明杨   6  108.5  92   79.0  89  83
    8  304040250819   李倩   6   93.5  61   44.0  45  32
    9  304040250912  江明悦   6    0.0   0    0.0   0   0
    >>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考证号')
                   姓名  班级     语文  数学     英语  化学  物理
    准考证号                                           
    304040250124   罗茜   1  101.0  94  102.5  79  74
    304040250128  沈怡君   1   91.5  96   69.0  82  69
    304040250321   魏华   2   74.0  28   42.0  56  56
    304040250233  何仕林   2   60.5  42   34.5  49  46
    304040250725   屈妮   5   93.5  63   77.5  55  66
    304040250709  邓培蓓   5  102.5  81   47.0  65  58
    304040250805  郑清霞   5   89.0  80   63.5  63  65
    304040250827   明杨   6  108.5  92   79.0  89  83
    304040250819   李倩   6   93.5  61   44.0  45  32
    304040250912  江明悦   6    0.0   0    0.0   0   0
    
    #Fields separated by a variable amount of whitespace
    >>> result=pd.read_table('ext3.txt',sep='\s+')
    
    #Rows to skip
    >>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考证号',skiprows=[5,9])
                   姓名  班级     语文  数学     英语  化学  物理
    准考证号                                           
    304040250124   罗茜   1  101.0  94  102.5  79  74
    304040250128  沈怡君   1   91.5  96   69.0  82  69
    304040250321   魏华   2   74.0  28   42.0  56  56
    304040250233  何仕林   2   60.5  42   34.5  49  46
    304040250709  邓培蓓   5  102.5  81   47.0  65  58
    304040250805  郑清霞   5   89.0  80   63.5  63  65
    304040250827   明杨   6  108.5  92   79.0  89  83
    304040250912  江明悦   6    0.0   0    0.0   0   0
    
    #Missing-value sentinels recognized by default:
    NA -1.#IND NULL
    >>> os.system('cat  /Users/similarface/Downloads/ex5.csv')
    something,a,b,c,d,message
    one,1,2,IND,4,NA
    tow,-1,-1,,8,world
    three,.,10,11,NULL,foo
    >>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['NULL'])
      something   a   b    c    d message
    0       one   1   2  IND  4.0     NaN
    1       tow  -1  -1  NaN  8.0   world
    2     three   .  10   11  NaN     foo
    #Specify additional NA sentinels
    >>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['-1'])
      something    a     b    c    d message
    0       one    1   2.0  IND  4.0     NaN
    1       tow  NaN   NaN  NaN  8.0   world
    2     three    .  10.0   11  NaN     foo
    >>> sentinels={'message':['foo','NA'],'something':['tow']}
    >>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=sentinels)
      something   a   b    c    d message
    0       one   1   2  IND  4.0     NaN
    1       NaN  -1  -1  NaN  8.0   world
    2     three   .  10   11  NaN     NaN
    
    '''
    filepath_or_buffer : str, pathlib.Path, py._path.local.LocalPath or any object with a read() method (such as a file handle or StringIO)
    The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv
    sep : str, default ‘,’
    Delimiter to use. If sep is None, will try to automatically determine this. Regular expressions are accepted and will force use of the python parsing engine and will ignore quotes in the data.
    delimiter : str, default None
    Alternative argument name for sep.
    header : int or list of ints, default ‘infer’
    Row number(s) to use as the column names, and the start of the data. Default behavior is as if set to 0 if no names passed, otherwise None. Explicitly pass header=0 to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if skip_blank_lines=True, so header=0 denotes the first line of data rather than the first line of the file.
    names : array-like, default None
    List of column names to use. If file contains no header row, then you should explicitly pass header=None
    index_col : int or sequence or False, default None
    Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names)
    usecols : array-like, default None
    Return a subset of the columns. Results in much faster parsing time and lower memory usage.
    squeeze : boolean, default False
    If the parsed data only contains one column then return a Series
    prefix : str, default None
    Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1, ...
    mangle_dupe_cols : boolean, default True
    Duplicate columns will be specified as ‘X.0’...’X.N’, rather than ‘X’...’X’
    dtype : Type name or dict of column -> type, default None
    Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} (Unsupported with engine=’python’). Use str or object to preserve and not interpret dtype.
    engine : {‘c’, ‘python’}, optional
    Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.
    converters : dict, default None
    Dict of functions for converting values in certain columns. Keys can either be integers or column labels
    true_values : list, default None
    Values to consider as True
    false_values : list, default None
    Values to consider as False
    skipinitialspace : boolean, default False
    Skip spaces after delimiter.
    skiprows : list-like or integer, default None
    Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file
    skipfooter : int, default 0
    Number of lines at bottom of file to skip (Unsupported with engine=’c’)
    nrows : int, default None
    Number of rows of file to read. Useful for reading pieces of large files
    na_values : str or list-like or dict, default None
    Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘nan’.
    keep_default_na : bool, default True
    If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they’re appended to.
    na_filter : boolean, default True
    Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file
    verbose : boolean, default False
    Indicate number of NA values placed in non-numeric columns
    skip_blank_lines : boolean, default True
    If True, skip over blank lines rather than interpreting as NaN values
    parse_dates : boolean or list of ints or names or list of lists or dict, default False
    boolean. If True -> try parsing the index.
    list of ints or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
    list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
    a single date column.
    dict, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call result ‘foo’
    Note: A fast-path exists for iso8601-formatted dates.
    infer_datetime_format : boolean, default False
    If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing
    keep_date_col : boolean, default False
    If True and parse_dates specifies combining multiple columns then keep the original columns.
    date_parser : function, default None
    Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser to do the conversion. Pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments.
    dayfirst : boolean, default False
    DD/MM format dates, international and European format
    iterator : boolean, default False
    Return TextFileReader object for iteration or getting chunks with get_chunk().
    chunksize : int, default None
    Return TextFileReader object for iteration. See IO Tools docs for more information on iterator and chunksize.
    compression : {‘infer’, ‘gzip’, ‘bz2’, None}, default ‘infer’
    For on-the-fly decompression of on-disk data. If ‘infer’, then use gzip or bz2 if filepath_or_buffer is a string ending in ‘.gz’ or ‘.bz2’, respectively, and no decompression otherwise. Set to None for no decompression.
    thousands : str, default None
    Thousands separator
    decimal : str, default ‘.’
    Character to recognize as decimal point (e.g. use ‘,’ for European data).
    lineterminator : str (length 1), default None
    Character to break file into lines. Only valid with C parser.
    quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.
    quoting : int or csv.QUOTE_* instance, default None
    Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior.
    escapechar : str (length 1), default None
    One-character string used to escape delimiter when quoting is QUOTE_NONE.
    comment : str, default None
    Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as skip_blank_lines=True), fully commented lines are ignored by the parameter header but not by skiprows. For example, if comment=’#’, parsing ‘#emptyna,b,cn1,2,3’ with header=0 will result in ‘a,b,c’ being treated as the header.
    encoding : str, default None
    Encoding to use for UTF when reading/writing (ex. ‘utf-8’). List of Python standard encodings
    dialect : str or csv.Dialect instance, default None
    If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details
    tupleize_cols : boolean, default False
    Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns)
    error_bad_lines : boolean, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these “bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
    warn_bad_lines : boolean, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each “bad line” will be output. (Only valid with C parser).
    '''
    #Writing data out
    data.to_csv('filename or sys.stdout',sep='|',index=True/False,header=True/False,cols=[columns to select])
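    #A minimal runnable sketch of the write path (the small frame is illustrative):
    >>> import sys
    >>> out=DataFrame({'a':[1,2],'b':[3,4]})
    >>> out.to_csv(sys.stdout,sep='|',index=False)
    a|b
    1|3
    2|4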
    #Database operations
    import pandas as pd
    from pandas import *
    import sqlite3
    query="""
    create table test(
    a varchar(20),b VARCHAR(20),c REAL ,d INTEGER
    );
    """
    con=sqlite3.connect(':memory:')
    con.execute(query)
    con.commit()
    data=[('Atlanta','Georgia',1.25,6),
          ('Tallahassee','Florida',2.6,3),
          ('Sacramento','California',1.7,5)
           ]
    stmt="INSERT INTO test VALUES (?,?,?,?)"
    con.executemany(stmt,data)
    con.commit()
    cursor=con.execute('select * from test')
    rows=cursor.fetchall()
    DataFrame(rows,columns=zip(*cursor.description)[0])
    #Read a DataFrame straight from a SQL query
    import pandas.io.sql as sql
    sql.read_sql('select * from test',con)
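    #The reverse direction works too; a sketch against the same in-memory
    #connection (the table name test2 is illustrative):
    frame=sql.read_sql('select * from test',con)
    frame.to_sql('test2',con,index=False)
    sql.read_sql('select * from test2',con)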
    
    
    
    #Merging datasets
    >>> df1 = DataFrame(
    ...     {'key': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
    ...      'major0': ['计算机','生物','化学','物理','医学']
    ...     })
    >>> df2 = DataFrame(
    ...     {'key': ['北京大学', '四川大学', '云南大学'],
    ...      'major1': ['外国语', '口腔', '旅游']
    ... })
    >>> df1
        key major0
    0  北京大学    计算机
    1  四川大学     生物
    2  天津大学     化学
    3  山东大学     物理
    4  清华大学     医学
    >>> df2
        key major1
    0  北京大学    外国语
    1  四川大学     口腔
    2  云南大学     旅游
    
    >>> pd.merge(df1,df2)
        key major0 major1
    0  北京大学    计算机    外国语
    1  四川大学     生物     口腔
    
    >>> df3 = DataFrame(
    ...     {'lkey': ['北京大学', '四川大学', '天津大学', '山东大学', '清华大学'],
    ...      'major0': ['计算机','生物','化学','物理','医学']
    ...     })
    >>> df4 = DataFrame(
    ...     {'rkey': ['北京大学', '四川大学', '云南大学'],
    ...      'major1': ['外国语', '口腔', '旅游']
    ... })
    
    >>> df3
       lkey major0
    0  北京大学    计算机
    1  四川大学     生物
    2  天津大学     化学
    3  山东大学     物理
    4  清华大学     医学
    >>> df4
      major1  rkey
    0    外国语  北京大学
    1     口腔  四川大学
    2     旅游  云南大学
    
    >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey')
       lkey major0 major1  rkey
    0  北京大学    计算机    外国语  北京大学
    1  四川大学     生物     口腔  四川大学
    #Outer join
    >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='outer')
       lkey major0 major1  rkey
    0  北京大学    计算机    外国语  北京大学
    1  四川大学     生物     口腔  四川大学
    2  天津大学     化学    NaN   NaN
    3  山东大学     物理    NaN   NaN
    4  清华大学     医学    NaN   NaN
    5   NaN    NaN     旅游  云南大学
    #Left join
    >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')
       lkey major0 major1  rkey
    0  北京大学    计算机    外国语  北京大学
    1  四川大学     生物     口腔  四川大学
    2  天津大学     化学    NaN   NaN
    3  山东大学     物理    NaN   NaN
    4  清华大学     医学    NaN   NaN
    #Right join
    >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='right')
       lkey major0 major1  rkey
    0  北京大学    计算机    外国语  北京大学
    1  四川大学     生物     口腔  四川大学
    2   NaN    NaN     旅游  云南大学
    #Inner join
    >>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='inner')
       lkey major0 major1  rkey
    0  北京大学    计算机    外国语  北京大学
    1  四川大学     生物     口腔  四川大学
    
    #Merging on multiple keys
    
    left=DataFrame({
        'key1':['foo','foo','bar'],
        'key2':['one','two','one'],
        'lval':[1,2,3]
    })
    
    right=DataFrame({
        'key1':['foo','foo','bar','bar'],
        'key2':['one','one','one','two'],
        'lval':[4,5,6,7]
    })
    
    >>> pd.merge(left,right,on=['key1','key2'],how='outer')
      key1 key2  lval_x  lval_y
    0  foo  one     1.0     4.0
    1  foo  one     1.0     5.0
    2  foo  two     2.0     NaN
    3  bar  one     3.0     6.0
    4  bar  two     NaN     7.0
    
    #Handling overlapping column names
    >>> pd.merge(left,right,on='key1',suffixes=('_lef','_right'))
      key1 key2_lef  lval_lef key2_right  lval_right
    0  foo      one         1        one           4
    1  foo      one         1        one           5
    2  foo      two         2        one           4
    3  foo      two         2        one           5
    4  bar      one         3        one           6
    5  bar      one         3        two           7
    
    #Merging on the index
    >>> right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
    >>> left1=DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
    #Merge matching left_on against the right frame's index
    >>> pd.merge(left1,right1,left_on='key',right_index=True)
      key  value  group_val
    0   a      0        3.5
    2   a      2        3.5
    3   a      3        3.5
    1   b      1        7.0
    4   b      4        7.0
    
    lefth=DataFrame(
        {'key1':['similar','similar','similar','face','face'],
         'key2':[2000,2001,2002,2001,2002],
         'data':np.arange(5.)
         })
    
    righth=DataFrame(np.arange(12).reshape((6,2)),
                     index=[['face','face','similar','similar','similar','similar'],
                            [2001,2000,2000,2000,2001,2002]
                            ],
                     columns=['event1','event2']
                     )
    >>> lefth
       data     key1  key2
    0   0.0  similar  2000
    1   1.0  similar  2001
    2   2.0  similar  2002
    3   3.0     face  2001
    4   4.0     face  2002
    >>> righth
                  event1  event2
    face    2001       0       1
            2000       2       3
    similar 2000       4       5
            2000       6       7
            2001       8       9
            2002      10      11
    
    >>> pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
       data     key1  key2  event1  event2
    0   0.0  similar  2000       4       5
    0   0.0  similar  2000       6       7
    1   1.0  similar  2001       8       9
    2   2.0  similar  2002      10      11
    3   3.0     face  2001       0       1
    
    >>> left2=DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['similar','face'])
    >>> left2
       similar  face
    a      1.0   2.0
    c      3.0   4.0
    e      5.0   6.0
    >>> right2=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['M','A'])
    >>> right2
          M     A
    b   7.0   8.0
    c   9.0  10.0
    d  11.0  12.0
    e  13.0  14.0
    
    >>> pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
       similar  face     M     A
    a      1.0   2.0   NaN   NaN
    b      NaN   NaN   7.0   8.0
    c      3.0   4.0   9.0  10.0
    d      NaN   NaN  11.0  12.0
    e      5.0   6.0  13.0  14.0
    >>> left2.join(right2,how='outer')
       similar  face     M     A
    a      1.0   2.0   NaN   NaN
    b      NaN   NaN   7.0   8.0
    c      3.0   4.0   9.0  10.0
    d      NaN   NaN  11.0  12.0
    e      5.0   6.0  13.0  14.0
    >>> another=DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['NK','O'])
    >>> left2.join([right2,another])
       similar  face     M     A  NK   O
    a      1.0   2.0   NaN   NaN   7   8
    c      3.0   4.0   9.0  10.0   9  10
    e      5.0   6.0  13.0  14.0  11  12
    
    #Concatenating along an axis
    >>> arr=np.arange(12).reshape((3,4))
    >>> arr
    array([[ 0,  1,  2,  3],
           [ 4,  5,  6,  7],
           [ 8,  9, 10, 11]])
    >>> np.concatenate([arr,arr],axis=1)
    array([[ 0,  1,  2,  3,  0,  1,  2,  3],
           [ 4,  5,  6,  7,  4,  5,  6,  7],
           [ 8,  9, 10, 11,  8,  9, 10, 11]])
    >>> s1=Series([0,1],index=['a','b'])
    >>> s2=Series([2,3,4],index=['c','d','e'])
    >>> s3=Series([5,6],index=['f','g'])
    >>> s1
    a    0
    b    1
    dtype: int64
    >>> s2
    c    2
    d    3
    e    4
    dtype: int64
    >>> s3
    f    5
    g    6
    dtype: int64
    >>> pd.concat([s1,s2,s3])
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    dtype: int64
    >>> pd.concat([s1,s2,s3,s1])
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    a    0
    b    1
    dtype: int64
    >>> pd.concat([s1,s2,s3,s1],axis=1)
         0    1    2    3
    a  0.0  NaN  NaN  0.0
    b  1.0  NaN  NaN  1.0
    c  NaN  2.0  NaN  NaN
    d  NaN  3.0  NaN  NaN
    e  NaN  4.0  NaN  NaN
    f  NaN  NaN  5.0  NaN
    g  NaN  NaN  6.0  NaN
    
    df1=DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
    df2=DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
    >>> pd.concat([df1,df2],axis=1,keys=['level1','level2'])
      level1     level2     
         one two  three four
    a      0   1    5.0  6.0
    b      2   3    NaN  NaN
    c      4   5    7.0  8.0
    >>> pd.concat({'level1':df1,'level2':df2},axis=1)
      level1     level2     
         one two  three four
    a      0   1    5.0  6.0
    b      2   3    NaN  NaN
    c      4   5    7.0  8.0
    >>> pd.concat([df1,df2],axis=1,keys=['L1','L2'],names=['u','l'])
    u  L1        L2     
    l one two three four
    a   0   1   5.0  6.0
    b   2   3   NaN  NaN
    c   4   5   7.0  8.0
    >>> df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
    >>> df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
    >>> df1
              a         b         c         d
    0 -1.487358  0.077565  0.209403 -0.712507
    1  1.990047 -0.221415  1.381161 -0.876811
    2 -0.153150  0.391847  1.180728 -0.972548
    >>> df2
              b         d         a
    0 -0.200611  0.321759 -0.201620
    1 -1.842735 -1.924933  0.281712
    
    >>> pd.concat([df1,df2])
              a         b         c         d
    0 -1.487358  0.077565  0.209403 -0.712507
    1  1.990047 -0.221415  1.381161 -0.876811
    2 -0.153150  0.391847  1.180728 -0.972548
    0 -0.201620 -0.200611       NaN  0.321759
    1  0.281712 -1.842735       NaN -1.924933
    
    >>> pd.concat([df1,df2],ignore_index=True)
              a         b         c         d
    0 -1.487358  0.077565  0.209403 -0.712507
    1  1.990047 -0.221415  1.381161 -0.876811
    2 -0.153150  0.391847  1.180728 -0.972548
    3 -0.201620 -0.200611       NaN  0.321759
    4  0.281712 -1.842735       NaN -1.924933
    
    >>> pd.concat([df1,df2],ignore_index=True,axis=1)
              0         1         2         3         4         5         6
    0 -1.487358  0.077565  0.209403 -0.712507 -0.200611  0.321759 -0.201620
    1  1.990047 -0.221415  1.381161 -0.876811 -1.842735 -1.924933  0.281712
    2 -0.153150  0.391847  1.180728 -0.972548       NaN       NaN       NaN
    
    #combine_first patches missing values in the caller with values from the argument
    >>> a=Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
    >>> b=Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
    >>> b[-1]=np.nan
    >>> b[:-2]
    f    0.0
    e    1.0
    d    2.0
    c    3.0
    dtype: float64
    >>> a[2:]
    d    NaN
    c    3.5
    b    4.5
    a    NaN
    dtype: float64
    >>> b[:-2].combine_first(a[2:])
    a    NaN
    b    4.5
    c    3.0
    d    2.0
    e    1.0
    f    0.0
    dtype: float64
    
    >>> df1=DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],'c':range(2,18,4)})
    >>> df2=DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]})
    >>> df2
         a    b
    0  5.0  NaN
    1  4.0  3.0
    2  NaN  4.0
    3  3.0  6.0
    4  7.0  8.0
    >>> df1
         a    b   c
    0  1.0  NaN   2
    1  NaN  2.0   6
    2  5.0  NaN  10
    3  NaN  6.0  14
    >>> df1.combine_first(df2)
         a    b     c
    0  1.0  NaN   2.0
    1  4.0  2.0   6.0
    2  5.0  4.0  10.0
    3  3.0  6.0  14.0
    4  7.0  8.0   NaN
    
#reshaping and pivoting (stack/unstack)
    >>> data=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['similar','face'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
    >>> data
    number   one  two  three
    state                   
    similar    0    1      2
    face       3    4      5
    >>> data.stack()
    state    number
    similar  one       0
             two       1
             three     2
    face     one       3
             two       4
             three     5
    dtype: int64
    >>> data.stack().unstack()
    number   one  two  three
    state                   
    similar    0    1      2
    face       3    4      5
    
    >>> data.stack().unstack(0)
    state   similar  face
    number               
    one           0     3
    two           1     4
    three         2     5
    
    >>> data.stack().unstack('state')
    state   similar  face
    number               
    one           0     3
    two           1     4
    three         2     5
    
    >>> s1=Series([0,1,2,3],index=['a','b','c','d'])
    >>> s2=Series([4,5,6],index=['c','d','e'])
    >>> s1
    a    0
    b    1
    c    2
    d    3
    dtype: int64
    >>> s2
    c    4
    d    5
    e    6
    dtype: int64
    >>> pd.concat([s1,s2],keys=['one','two'])
    one  a    0
         b    1
         c    2
         d    3
    two  c    4
         d    5
         e    6
    dtype: int64
    >>> pd.concat([s1,s2],keys=['one','two']).unstack()
           a    b    c    d    e
    one  0.0  1.0  2.0  3.0  NaN
    two  NaN  NaN  4.0  5.0  6.0
    >>> pd.concat([s1,s2],keys=['one','two']).unstack().stack()
    one  a    0.0
         b    1.0
         c    2.0
         d    3.0
    two  c    4.0
         d    5.0
         e    6.0
    dtype: float64
    >>> pd.concat([s1,s2],keys=['one','two']).unstack().stack(dropna=False)
    one  a    0.0
         b    1.0
         c    2.0
         d    3.0
         e    NaN
    two  a    NaN
         b    NaN
         c    4.0
         d    5.0
         e    6.0
    dtype: float64
    
    
#transforming data with a function or mapping
    
    data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                               'corned beef', 'Bacon', 'pastrami', 'honey ham',
                               'nova lox'],
                      'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
    
    meat_to_animal = {
      'bacon': 'pig',
      'pulled pork': 'pig',
      'pastrami': 'cow',
      'corned beef': 'cow',
      'honey ham': 'pig',
      'nova lox': 'salmon'
    }
    
    data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
    >>> data
              food  ounces  animal
    0        bacon     4.0     pig
    1  pulled pork     3.0     pig
    2        bacon    12.0     pig
    3     Pastrami     6.0     cow
    4  corned beef     7.5     cow
    5        Bacon     8.0     pig
    6     pastrami     3.0     cow
    7    honey ham     5.0     pig
    8     nova lox     6.0  salmon
    
    >>> data['food'].map(lambda x: meat_to_animal[x.lower()])
    0       pig
    1       pig
    2       pig
    3       cow
    4       cow
    5       pig
    6       cow
    7       pig
    8    salmon
    Name: food, dtype: object
    
Discretization and binning:
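#ages, bins and cats are not shown in this excerpt; presumably they were defined
#earlier as in the canonical example (hypothetical values, consistent with the
#Length: 12 output and the bin counts further down):
>>> ages=[20,22,25,27,21,23,37,31,61,45,41,32]
>>> bins=[18,25,35,60,100]
>>> cats=pd.cut(ages,bins)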
#assign group names
    >>> group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    >>> pd.cut(ages,bins,labels=group_names)
    [Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
    Length: 12
    Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
#equal-width bins: cut the random numbers below into 4 segments; precision sets the number of decimal places shown
    >>> data=np.random.rand(20)
    >>> data
    array([ 0.42519089,  0.18981873,  0.29726754,  0.37843724,  0.31072184,
            0.20240683,  0.99244468,  0.61880299,  0.9948212 ,  0.32893834,
            0.87701908,  0.25638677,  0.02344737,  0.15162624,  0.31874342,
            0.16534997,  0.43495775,  0.83059911,  0.57975644,  0.53763544])
    >>> pd.cut(data,4,precision=2)
    [(0.27, 0.51], (0.022, 0.27], (0.27, 0.51], (0.27, 0.51], (0.27, 0.51], ..., (0.022, 0.27], (0.27, 0.51], (0.75, 0.99], (0.51, 0.75], (0.51, 0.75]]
    Length: 20
    Categories (4, object): [(0.022, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]
#value counts per bin (cats comes from the ages example above)
    >>> pd.value_counts(cats)
    (18, 25]     5
    (35, 60]     3
    (25, 35]     3
    (60, 100]    1
    dtype: int64
#right=False makes the intervals left-closed, right-open
    pd.cut(ages, [18, 26, 36, 61, 100], right=False)
    
#detecting and filtering outliers
    >>> np.random.seed(12345)
    >>> data=DataFrame(np.random.randn(1000,4))
    >>> data.describe()
                     0            1            2            3
    count  1000.000000  1000.000000  1000.000000  1000.000000
    mean     -0.067684     0.067924     0.025598    -0.002298
    std       0.998035     0.992106     1.006835     0.996794
    min      -3.428254    -3.548824    -3.184377    -3.745356
    25%      -0.774890    -0.591841    -0.641675    -0.644144
    50%      -0.116401     0.101143     0.002073    -0.013611
    75%       0.616366     0.780282     0.680391     0.654328
    max       3.366626     2.653656     3.260383     3.927528
    >>> col=data[3]
    >>> col[np.abs(col)>3]
    97     3.927528
    305   -3.399312
    400   -3.745356
    Name: 3, dtype: float64
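#a common follow-up (a sketch): select every row containing any |value| > 3,
#then cap the outliers at +/-3
>>> data[(np.abs(data)>3).any(1)]
>>> data[np.abs(data)>3]=np.sign(data)*3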
    
#random permutation (reordering rows)
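#df is not shown in this excerpt; presumably it was created like this
#(hypothetical, but consistent with the output below):
>>> df=DataFrame(np.arange(5*4).reshape(5,4))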
    >>> sampler=np.random.permutation(5)
    >>> df.take(sampler)
        0   1   2   3
    4  16  17  18  19
    2   8   9  10  11
    1   4   5   6   7
    3  12  13  14  15
    0   0   1   2   3
    
    >>> df.take(np.random.permutation(len(df))[:3])
       0  1   2   3
    1  4  5   6   7
    2  8  9  10  11
    0  0  1   2   3
    
    
#sampling with replacement: draw a larger set from the values of a given array
    >>> bag=np.array([5,7,-1,6,4])
    >>> sampler=np.random.randint(0,len(bag),size=10)
    >>> sampler
    array([1, 0, 4, 1, 2, 1, 4, 4, 3, 4])
    >>> draws=bag.take(sampler)
    >>> draws
    array([ 7,  5,  4,  7, -1,  7,  4,  4,  6,  4])
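#numpy can also do the draw-with-replacement in a single call:
>>> np.random.choice(bag,size=10,replace=True)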
    
#dummy variable (indicator) matrix
#one column per distinct value, flagging whether it appears in each row
    >>> df=DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
    >>> df
       data1 key
    0      0   b
    1      1   b
    2      2   a
    3      3   c
    4      4   a
    5      5   b
    >>> pd.get_dummies(df['key'])
         a    b    c
    0  0.0  1.0  0.0
    1  0.0  1.0  0.0
    2  1.0  0.0  0.0
    3  0.0  0.0  1.0
    4  1.0  0.0  0.0
    5  0.0  1.0  0.0
    
#prefix the dummy column names via prefix=
    >>> dummies=pd.get_dummies(df['key'],prefix='key')
    >>> dummies
       key_a  key_b  key_c
    0    0.0    1.0    0.0
    1    0.0    1.0    0.0
    2    1.0    0.0    0.0
    3    0.0    0.0    1.0
    4    1.0    0.0    0.0
    5    0.0    1.0    0.0
    >>> df_with_dummy=df[['data1']].join(dummies)
    >>> df_with_dummy
       data1  key_a  key_b  key_c
    0      0    0.0    1.0    0.0
    1      1    0.0    1.0    0.0
    2      2    1.0    0.0    0.0
    3      3    0.0    0.0    1.0
    4      4    1.0    0.0    0.0
    5      5    0.0    1.0    0.0
    
    
    >>> values
    array([ 0.86789062,  0.4187927 ,  0.48191735,  0.44540277,  0.6855452 ,
            0.33193716,  0.20772778,  0.21461227,  0.50985294,  0.95327048])
    >>> 
    >>> bins=[0,0.2,0.4,0.6,0.8,1]
    >>> pd.get_dummies(pd.cut(values,bins))
       (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
    0       0.0         0.0         0.0         0.0       1.0
    1       0.0         0.0         1.0         0.0       0.0
    2       0.0         0.0         1.0         0.0       0.0
    3       0.0         0.0         1.0         0.0       0.0
    4       0.0         0.0         0.0         1.0       0.0
    5       0.0         1.0         0.0         0.0       0.0
    6       0.0         1.0         0.0         0.0       0.0
    7       0.0         1.0         0.0         0.0       0.0
    8       0.0         0.0         1.0         0.0       0.0
    9       0.0         0.0         0.0         0.0       1.0
    
#e-mail regular expression
>>> pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
    >>> regex=re.compile(pattern,flags=re.IGNORECASE)
    >>> regex.match('jaflfbs@sina.com')
    <_sre.SRE_Match object at 0x111ceab78>
    >>> m=regex.match('jaflfbs@sina.com')
    >>> m.groups()
    ('jaflfbs', 'sina', 'com')
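#findall returns every non-overlapping match as a tuple of groups
#(a sketch with hypothetical input):
>>> text='Dave dave@google.com Steve steve@gmail.com'
>>> regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com')]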
    
    
    
#grouping: group by / groupby
    >>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','tow','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
    >>> df
          data1     data2 key1 key2
    0 -0.893905  0.311668    a  one
    1  1.274761  0.885820    a  two
    2  1.115914  0.887069    b  one
    3  0.054165  0.267643    b  tow
    4 -0.819516  0.933495    a  one
    >>> grouped=df['data1'].groupby(df['key1'])
    >>> grouped
    <pandas.core.groupby.SeriesGroupBy object at 0x111e11e10>
    >>> grouped.mean()
    key1
    a   -0.14622
    b    0.58504
    Name: data1, dtype: float64
    >>> means=df['data1'].groupby([df['key1'],df['key2']]).mean()
    >>> means
    key1  key2
    a     one    -0.856710
          two     1.274761
    b     one     1.115914
          tow     0.054165
    Name: data1, dtype: float64
    
    >>> means.unstack()
    key2       one       tow       two
    key1                              
    a    -0.856710       NaN  1.274761
    b     1.115914  0.054165       NaN
    
#the group keys can be explicitly supplied arrays
    >>> states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
    >>> years = np.array([2005, 2005, 2006, 2005, 2006])
    >>> df['data1'].groupby([states,years]).mean()
#group keys can also be column names; key2 is missing from the result because it is not numeric
    >>> df.groupby('key1').mean()
            data1     data2
    key1                   
    a    -0.14622  0.710328
    b     0.58504  0.577356
    
    >>> df.groupby(['key1','key2']).mean()
                  data1     data2
    key1 key2                    
    a    one  -0.856710  0.622582
         two   1.274761  0.885820
    b    one   1.115914  0.887069
         tow   0.054165  0.267643
    
#size of each group
    >>> df.groupby(['key1','key2']).size()
    key1  key2
    a     one     2
          two     1
    b     one     1
          tow     1
    
#turn the groups into a dict of DataFrames
    >>> pieces=dict(list(df.groupby('key1')))
    >>> pieces['b']
          data1     data2 key1 key2
    2  1.115914  0.887069    b  one
    3  0.054165  0.267643    b  tow
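
#groupby objects are also iterable, yielding (key, sub-frame) pairs; a quick sketch:
>>> for name,group in df.groupby('key1'):
...     print name,len(group)
... 
a 3
b 2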
    
    
    
    
    
    
############ Date and time operations
    >>> from datetime import datetime
    >>> now=datetime.now()
    >>> now
    datetime.datetime(2016, 4, 12, 14, 31, 50, 995484)
    >>> now.year,now.month,now.day
    (2016, 4, 12)
    >>> now.day
    12
>>> # datetime.timedelta represents the difference between two datetimes, stored as days, seconds and microseconds
    >>> delta=datetime(2016,5,1)-datetime(2016,5,2)
    >>> delta
    datetime.timedelta(-1)
    >>> delta.days
    -1
    >>> delta.seconds
    0
    >>> from datetime import timedelta
    >>> start=datetime(2011,1,1)
    >>> start+timedelta(12)
    datetime.datetime(2011, 1, 13, 0, 0)
    >>> start-2*timedelta(12)
    datetime.datetime(2010, 12, 8, 0, 0)
    >>> stamp=datetime(2011,1,3)
    >>> str(stamp)
    '2011-01-03 00:00:00'
    >>> value='2016-01-01'
    >>> datetime.strptime(value,'%Y-%m-%d')
    datetime.datetime(2016, 1, 1, 0, 0)
    >>> value='2016-01-13'
    >>> datetime.strptime(value,'%Y-%m-%d')
    datetime.datetime(2016, 1, 13, 0, 0)
    >>> value='2016-13-13'
    >>> datetime.strptime(value,'%Y-%m-%d')
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_strptime.py", line 325, in _strptime
        (data_string, format))
ValueError: time data '2016-13-13' does not match format '%Y-%m-%d'
    
    >>> datestrs=['7/6/2016','1/1/1111']
    >>> [datetime.strptime(x,'%m/%d/%Y')  for x in datestrs]
    [datetime.datetime(2016, 7, 6, 0, 0), datetime.datetime(1111, 1, 1, 0, 0)]
    
    >>> from dateutil.parser import parse
    >>> parse('2016-01-09')
    datetime.datetime(2016, 1, 9, 0, 0)
    >>> parse('Jan 31,2015 10:31 PM')
    datetime.datetime(2015, 1, 31, 22, 31)
    >>> parse('1/3/2018',dayfirst=True)
    datetime.datetime(2018, 3, 1, 0, 0)
    >>> parse('1/3/2018',dayfirst=False)
    datetime.datetime(2018, 1, 3, 0, 0)
    
    
    >>> datestrs=['1/4/2016','4/1/2017']
    >>> pd.to_datetime(datestrs)
    DatetimeIndex(['2016-01-04', '2017-04-01'], dtype='datetime64[ns]', freq=None)
    >>> idx=pd.to_datetime(datestrs+[None])
    >>> idx
    DatetimeIndex(['2016-01-04', '2017-04-01', 'NaT'], dtype='datetime64[ns]', freq=None)
    
    >>> pd.isnull(idx)
    array([False, False,  True], dtype=bool)
    
    
    >>> dates=[datetime(2011,1,2),datetime(2016,1,1),datetime(2016,1,2),datetime(2016,1,3),datetime(2016,1,4),datetime(2016,1,5)]
    >>> dates
    [datetime.datetime(2011, 1, 2, 0, 0), datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0), datetime.datetime(2016, 1, 4, 0, 0), datetime.datetime(2016, 1, 5, 0, 0)]
    >>> from pandas import *
    >>> ts=Series(np.random.randn(6),index=dates)
    >>> ts
    2011-01-02    0.734018
    2016-01-01    1.661590
    2016-01-02    0.839504
    2016-01-03   -1.295834
    2016-01-04    0.190545
    2016-01-05    0.267724
    dtype: float64
    
    
    >>> ts+ts[::2]
    2011-01-02    1.468037
    2016-01-01         NaN
    2016-01-02    1.679008
    2016-01-03         NaN
    2016-01-04    0.381091
    2016-01-05         NaN
    dtype: float64
    
    >>> ts.index.dtype
    dtype('<M8[ns]')
    >>> stamp=ts.index[0]
    >>> stamp
    Timestamp('2011-01-02 00:00:00')
    >>> stamp=ts.index[2]
    >>> ts[stamp]
    0.83950398236998658
    >>> ts['1/1/2016']
    1.6615901161098698
    
    >>> longer_ts=Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
    >>> longer_ts['2002-09-21':'2002-09-23']
    2002-09-21   -0.105898
    2002-09-22    1.708342
    2002-09-23   -0.815799
    Freq: D, dtype: float64
    >>> longer_ts['2002-09-21':'09/23/2002']
    2002-09-21   -0.105898
    2002-09-22    1.708342
    2002-09-23   -0.815799
    Freq: D, dtype: float64
    >>> longer_ts['2002-09-21':'23/09/2002']
    2002-09-21   -0.105898
    2002-09-22    1.708342
    2002-09-23   -0.815799
    Freq: D, dtype: float64
    
    >>> longer_ts.truncate(before='2002-09-23')
    2002-09-23   -0.815799
    2002-09-24   -0.140892
    2002-09-25   -0.397591
    2002-09-26    0.451815
    Freq: D, dtype: float64
    >>> longer_ts.truncate(after='2002-09-23')
    
#time series with duplicate timestamps
    
    >>> dates=pd.DatetimeIndex(['1/1/2016','1/2/2016','1/2/2016','1/2/2016','1/3/2016'])
    >>> dates
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-02', '2016-01-02',
                   '2016-01-03'],
                  dtype='datetime64[ns]', freq=None)
    >>> dup_ts=Series(range(5),index=dates)
    >>> dup_ts
    2016-01-01    0
    2016-01-02    1
    2016-01-02    2
    2016-01-02    3
    2016-01-03    4
    dtype: int64
    >>> dup_ts.index.is_unique
    False
    >>> dup_ts['1/2/2016']
    2016-01-02    1
    2016-01-02    2
    2016-01-02    3
    dtype: int64
    >>> grouped=dup_ts.groupby(level=0)
    >>> grouped.mean()
    2016-01-01    0
    2016-01-02    2
    2016-01-03    4
    dtype: int64
    >>> grouped.max()
    2016-01-01    0
    2016-01-02    3
    2016-01-03    4
    dtype: int64
    >>> grouped.count()
    2016-01-01    1
    2016-01-02    3
    2016-01-03    1
    dtype: int64
    
#daily dates covering April through June
    >>> index=pd.date_range('4/1/2016','6/1/2016')
#from a start date, generate the next N days
    >>> pd.date_range(start='4/1/2016',periods=20)
    DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
                   '2016-04-05', '2016-04-06', '2016-04-07', '2016-04-08',
                   '2016-04-09', '2016-04-10', '2016-04-11', '2016-04-12',
                   '2016-04-13', '2016-04-14', '2016-04-15', '2016-04-16',
                   '2016-04-17', '2016-04-18', '2016-04-19', '2016-04-20'],
                  dtype='datetime64[ns]', freq='D')
    
    >>> pd.date_range(end='2016-12-12',periods=10)
    DatetimeIndex(['2016-12-03', '2016-12-04', '2016-12-05', '2016-12-06',
                   '2016-12-07', '2016-12-08', '2016-12-09', '2016-12-10',
                   '2016-12-11', '2016-12-12'],
                  dtype='datetime64[ns]', freq='D')
    
    >>> pd.date_range('1/1/2016','12/2/2016',freq='BM')
    DatetimeIndex(['2016-01-29', '2016-02-29', '2016-03-31', '2016-04-29',
                   '2016-05-31', '2016-06-30', '2016-07-29', '2016-08-31',
                   '2016-09-30', '2016-10-31', '2016-11-30'],
                  dtype='datetime64[ns]', freq='BM')
    
    >>> pd.date_range('5/2/2012 12:12:12',periods=5)
    DatetimeIndex(['2012-05-02 12:12:12', '2012-05-03 12:12:12',
                   '2012-05-04 12:12:12', '2012-05-05 12:12:12',
                   '2012-05-06 12:12:12'],
                  dtype='datetime64[ns]', freq='D')
#normalize=True snaps the timestamps to midnight
    >>> pd.date_range('5/2/2016 12:13:14',periods=5,normalize=True)
    DatetimeIndex(['2016-05-02', '2016-05-03', '2016-05-04', '2016-05-05',
                   '2016-05-06'],
                  dtype='datetime64[ns]', freq='D')
    
    >>> from pandas.tseries.offsets import Hour,Minute
    >>> hour=Hour
    >>> hour
    <class 'pandas.tseries.offsets.Hour'>
    >>> four_hours=Hour(4)
    >>> four_hours
    <4 * Hours>
    >>> 
    >>> pd.date_range('1/1/2016','1/2/2016',freq='4h')
    DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 04:00:00',
                   '2016-01-01 08:00:00', '2016-01-01 12:00:00',
                   '2016-01-01 16:00:00', '2016-01-01 20:00:00',
                   '2016-01-02 00:00:00'],
                  dtype='datetime64[ns]', freq='4H')
    
    >>> pd.date_range('1/1/2000',periods=2,freq='1h30min')
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00'], dtype='datetime64[ns]', freq='90T')
    
    freq
    -----------------------------
    http://pandas.pydata.org/pandas-docs/version/0.18.0/timeseries.html#dateoffset-objects
    -----------------------------
D     calendar day
B     business day
H     hour
T     minute
S     second
L     millisecond
U     microsecond
M     last calendar day of each month
BM    last business day of each month
MS    first calendar day of each month
BMS   first business day of each month
W-MON W-TUE [WED THU FRI SAT SUN]   weekly, anchored on the given weekday
WOM-1MON WOM-2MON ...   first, second ... given weekday of each month
Q-JAN ...   quarter end anchored on the given month [JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC]
BQ-JAN ...  business quarter end anchored on the given month

AS-JAN ...  year start: first calendar day of the given month
BAS-JAN BAS-FEB ...  year start: first business day of the given month
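
#a couple of these aliases in action (a sketch):
>>> pd.date_range('1/1/2016',periods=4,freq='W-MON')
DatetimeIndex(['2016-01-04', '2016-01-11', '2016-01-18', '2016-01-25'], dtype='datetime64[ns]', freq='W-MON')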
    
#(note: the end date here precedes the start date, so the range comes back empty)
>>> rng=pd.date_range('1/1/2016','9/1/2012',freq='WOM-3FRI')
    >>> rng
    DatetimeIndex([], dtype='datetime64[ns]', freq='WOM-3FRI')
    >>> rng=pd.date_range('1/1/2016','9/1/2016',freq='WOM-3FRI')
    >>> rng
    DatetimeIndex(['2016-01-15', '2016-02-19', '2016-03-18', '2016-04-15',
                   '2016-05-20', '2016-06-17', '2016-07-15', '2016-08-19'],
                  dtype='datetime64[ns]', freq='WOM-3FRI')
    
    >>> ts=Series(np.random.randn(4),index=pd.date_range('1/1/2000',periods=4,freq='M'))
    >>> ts
    2000-01-31    0.246254
    2000-02-29    0.426385
    2000-03-31    0.832971
    2000-04-30    1.163773
    Freq: M, dtype: float64
    >>> ts.shift(2)
    2000-01-31         NaN
    2000-02-29         NaN
    2000-03-31    0.246254
    2000-04-30    0.426385
    Freq: M, dtype: float64
    >>> ts.shift(-2)
    2000-01-31    0.832971
    2000-02-29    1.163773
    2000-03-31         NaN
    2000-04-30         NaN
    Freq: M, dtype: float64
    
#computing percent change
    >>> ts/ts.shift(1)-1
    2000-01-31         NaN
    2000-02-29    0.731486
    2000-03-31    0.953564
    2000-04-30    0.397135
    Freq: M, dtype: float64
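
#pandas provides the same computation directly:
>>> ts.pct_change()
2000-01-31         NaN
2000-02-29    0.731486
2000-03-31    0.953564
2000-04-30    0.397135
Freq: M, dtype: float64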
    
    >>> ts.shift(2,freq='M')
    2000-03-31    0.246254
    2000-04-30    0.426385
    2000-05-31    0.832971
    2000-06-30    1.163773
    Freq: M, dtype: float64
    
    >>> ts.shift(3,freq='D')
    2000-02-03    0.246254
    2000-03-03    0.426385
    2000-04-03    0.832971
    2000-05-03    1.163773
    dtype: float64
    
    >>> ts.shift(1,freq='3D')
    2000-02-03    0.246254
    2000-03-03    0.426385
    2000-04-03    0.832971
    2000-05-03    1.163773
    dtype: float64
    
    >>> ts.shift(1,freq='90T')
    2000-01-31 01:30:00    0.246254
    2000-02-29 01:30:00    0.426385
    2000-03-31 01:30:00    0.832971
    2000-04-30 01:30:00    1.163773
    Freq: M, dtype: float64
    
    
    >>> from pandas.tseries.offsets import Day,MonthEnd
    >>> now=datetime(2011,11,17)
    >>> now
    datetime.datetime(2011, 11, 17, 0, 0)
    >>> now+3*Day()
    Timestamp('2011-11-20 00:00:00')
    >>> now+MonthEnd()
    Timestamp('2011-11-30 00:00:00')
    >>> now+MonthEnd(2)
    Timestamp('2011-12-31 00:00:00')
    
    >>> offset=MonthEnd()
    >>> offset.rollforward(now)
    Timestamp('2011-11-30 00:00:00')
    >>> now
    datetime.datetime(2011, 11, 17, 0, 0)
    >>> offset.rollback(now)
    Timestamp('2011-10-31 00:00:00')
    
>>> ts=Series(np.random.randn(20),index=pd.date_range('1/12/2016',periods=20,freq='4d'))
    >>> ts.groupby(offset.rollforward).mean()
    2016-01-31   -0.023515
    2016-02-29    0.332412
    2016-03-31    0.445600
    dtype: float64
    
    >>> ts.resample('M',how='mean')
    2016-01-31    0.705208
    2016-02-29   -0.174444
    2016-03-31    0.534282
    Freq: M, dtype: float64
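
#note: in later pandas versions resample's how= argument was removed; the equivalent spelling is:
>>> ts.resample('M').mean()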
    
#period arithmetic
    >>> p=pd.Period(2016,freq='A-DEC')
    >>> p
    Period('2016', 'A-DEC')
    >>> p+5
    Period('2021', 'A-DEC')
    >>> p-2
    Period('2014', 'A-DEC')
    >>> pd.Period('2014',freq='A-DEC')-p
    -2
    >>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
    >>> rng
    PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06'], dtype='int64', freq='M')
    
    >>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
    >>> Series(np.random.randn(6),index=rng)
    2016-01   -0.739693
    2016-02   -0.928667
    2016-03    0.176348
    2016-04    1.343980
    2016-05   -1.513816
    2016-06    0.654137
    Freq: M, dtype: float64
    
    >>> values=['2010Q3','2012Q2','2013Q1']
    >>> index=pd.PeriodIndex(values,freq='Q-DEC')
    >>> index
    PeriodIndex(['2010Q3', '2012Q2', '2013Q1'], dtype='int64', freq='Q-DEC')
    
    
#period frequency conversion
    >>> p=pd.Period('2007',freq='A-DEC')
    >>> p.asfreq('M',how='start')
    Period('2007-01', 'M')
    >>> p.asfreq('M',how='end')
    Period('2007-12', 'M')
    >>> p=pd.Period('2007',freq='A-FEB')
    >>> p.asfreq('M',how='start')
    Period('2006-03', 'M')
    >>> p.asfreq('M',how='end')
    Period('2007-02', 'M')
    
    
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    
#select the rows where a column is null
fNull=full[full.End_y.isnull()]

#using isin
    >>> p1299del[p1299del.Gene_symbol.isin(['TP53','EGFR'])].count()[1]
    >>> p1299snp[p1299snp.Gene_symbol.isin(['TP53','EGFR'])].count()[1]
    
    >>> p297['TUMOR']
    44667              0/1:.:114:110:4:3.51%:26,84,3,1
    44668             1/1:.:111:6:104:94.55%:3,3,25,79
    44669                 0/1:.:19:12:7:36.84%:5,7,1,6
    Name: TUMOR, dtype: object
    
    >>> p297['TUMOR'].str.split(':').str.get(5).str.replace('%','')
    44666    20.69
    44667     3.51
    44668    94.55
    44669    36.84
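
#the s1 concatenated below is presumably this FREQ column, left as a string and
#renamed 'Q' (a hypothetical reconstruction, consistent with the p297all output
#and the string comparison further down):
>>> s1=p297['TUMOR'].str.split(':').str.get(5).str.replace('%','')
>>> s1.name='Q'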
    
#insert the new column into the DataFrame
    >>> p297all=pd.concat([p297,s1],axis=1)
    >>> p297.ix[1]
    #CHROM                                        chr1
    POS                                         131114
    ID                                               .
    REF                                              C
    ALT                                              T
    QUAL                                             .
    FILTER                                        PASS
    INFO      DP=339;SS=1;SSC=13;GPV=1E0;SPV=4.0882E-2
    FORMAT                     GT:GQ:DP:RD:AD:FREQ:DP4
    NORMAL                0/1:.:69:67:2:2.9%:53,14,1,1
    TUMOR             0/1:.:270:243:27:10%:188,55,20,7
    Name: 1, dtype: object
    >>> p297all.ix[1]
    #CHROM                                        chr1
    POS                                         131114
    ID                                               .
    REF                                              C
    ALT                                              T
    QUAL                                             .
    FILTER                                        PASS
    INFO      DP=339;SS=1;SSC=13;GPV=1E0;SPV=4.0882E-2
    FORMAT                     GT:GQ:DP:RD:AD:FREQ:DP4
    NORMAL                0/1:.:69:67:2:2.9%:53,14,1,1
    TUMOR             0/1:.:270:243:27:10%:188,55,20,7
Q                                               10    [the extra column added by the concat]
    Name: 1, dtype: object
    
#rows where the column is greater than or equal to a given value
    p297all[p297all.Q.ge('1')]
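
#Q is a string column, so ge('1') compares lexicographically ('9' > '10');
#casting to a number first is safer (a sketch):
>>> p297all[p297all.Q.astype(float)>=1]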
    
    # if then
    >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]})
    >>> df
       AAA  BBB  CCC
    0    4   10  100
    1    5   20   50
    2    6   30  -30
    3    7   40  -50
    
    >>> df.ix[df.AAA>=5]
       AAA  BBB  CCC
    1    5   20   50
    2    6   30  -30
    3    7   40  -50
    >>> df.ix[df.AAA>=5,'BBB']=-1
    >>> df
       AAA  BBB  CCC
    0    4   10  100
    1    5   -1   50
    2    6   -1  -30
    3    7   -1  -50
    >>> df.ix[df.AAA>=5,['BBB','CCC']]=555
    >>> df
       AAA  BBB  CCC
    0    4   10  100
    1    5  555  555
    2    6  555  555
    3    7  555  555
    >>> df.ix[df.AAA<5,['BBB','CCC']]=2000
    >>> df
       AAA   BBB   CCC
    0    4  2000  2000
    1    5   555   555
    2    6   555   555
    3    7   555   555
    >>> df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2})
    >>> df_mask
        AAA    BBB    CCC
    0  True  False   True
    1  True  False  False
    2  True  False   True
    3  True  False  False
    >>> 
    >>> df.where(df_mask,-1000)
       AAA   BBB   CCC
    0    4 -1000  2000
    1    5 -1000 -1000
    2    6 -1000   555
    3    7 -1000 -1000
    
    >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
    >>> df
       AAA  BBB  CCC
    0    4   10  100
    1    5   20   50
    2    6   30  -30
    3    7   40  -50
    >>> 
    >>> df['logic'] = np.where(df['AAA'] > 5,'high','low'); 
    >>> df
       AAA  BBB  CCC logic
    0    4   10  100   low
    1    5   20   50   low
    2    6   30  -30  high
    3    7   40  -50  high
    
    >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
    >>> df
       AAA  BBB  CCC
    0    4   10  100
    1    5   20   50
    2    6   30  -30
    3    7   40  -50
    >>> dflow = df[df.AAA <= 5]
    >>> dfhigh = df[df.AAA > 5]
    >>> dflow,dfhigh
    (   AAA  BBB  CCC
    0    4   10  100
    1    5   20   50,    AAA  BBB  CCC
    2    6   30  -30
    3    7   40  -50)
    >>> df = pd.DataFrame({'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]});
    >>> newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries
    0    4
    1    5
    Name: AAA, dtype: int64
    >>> df.loc[1]
    AAA     5
    BBB    20
    CCC    50
    Name: 1, dtype: int64
    >>> newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40)]; newseries
       AAA  BBB  CCC
    0    4   10  100
    1    5   20   50
    >>> newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries;
    0    4
    1    5
    2    6
    3    7
    Name: AAA, dtype: int64
#sort rows by the absolute value of (df.CCC-50)
    >>> df.ix[(df.CCC-50).abs().argsort()]
       AAA  BBB  CCC
    1    5   20   50
    0    4   10  100
    2    6   30  -30
    3    7   40  -50
    
    
    
>>> import functools
>>> Crit1 = df.AAA <= 5.5
>>> Crit2 = df.BBB == 10.0
>>> Crit3 = df.CCC > -40.0
>>> AllCrit = Crit1 & Crit2 & Crit3
>>> CritList = [Crit1,Crit2,Crit3]
>>> AllCrit = functools.reduce(lambda x,y: x & y, CritList)
>>> df[AllCrit]
   AAA  BBB  CCC
0    4   10  100
    
    
    #Selection
    
    >>> df[(df.AAA<=6)&(df.index.isin([0,2,4]))]
       AAA  BBB  CCC
    0    4   10  100
    2    6   30  -30
    
#(hypothetical intervening step: the index is relabeled first, which is why
# df.index.isin([0,2,4]) is now all False and the negation returns every row)
>>> df.index=['foo','bar','boo','kar']
>>> df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))]
         AAA  BBB  CCC
    foo    4   10  100
    bar    5   20   50
    boo    6   30  -30
    kar    7   40  -50
    
    
    
    >>> rng = pd.date_range(start="2014-10-07",periods=10,freq='2min')
    >>> ts = pd.Series(data = list(range(10)), index = rng)
#note: x is the list of values falling in each resample bin; e.g. for groups [1,1,1,2,2,3,3,3,4,4], x=[1,1,1] ... x=[4,4]
    >>> def MyCust(x):
    ...  if len(x)>2:
    ...   return x[1]*2
    ...  return pd.NaT
    ... 
    >>> mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust}
    >>> ts.resample("5min").apply(mhc)
                         Max Custom  Mean
    2014-10-07 00:00:00    2      2   1.0
    2014-10-07 00:05:00    4    NaT   3.5
    2014-10-07 00:10:00    7     12   6.0
    2014-10-07 00:15:00    9    NaT   8.5
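
#the df below is not shown in this excerpt; a hypothetical construction
#consistent with the output:
>>> df=pd.DataFrame({'Color':['Red','Red','Red','Blue'],'Value':[100,150,50,50]})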
    
    
    >>> df['Counts'] = df.groupby(['Color']).transform(len)
    >>> df
      Color  Value  Counts
    0   Red    100       3
    1   Red    150       3
    2   Red     50       3
    3  Blue     50       1
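
#the horse-race df below is likewise not shown; a hypothetical frame consistent
#with the output (groupby(level=0) then groups by the repeated index labels):
>>> df=pd.DataFrame({'beyer':[99,102,103,103,88,100],'line_race':[10,10,8,10,10,8]},index=['Last Gunfighter']*3+['Paynter']*3)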
    
    >>> df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1)
    >>> df
                     beyer  line_race  beyer_shifted
    Last Gunfighter     99         10            NaN
    Last Gunfighter    102         10           99.0
    Last Gunfighter    103          8          102.0
    Paynter            103         10            NaN
    Paynter             88         10          103.0
    Paynter            100          8           88.0
    
#how to replace matching values

    >>> data.ix[1:3]
    barcode sex age rsid genotype proj question answer
    1 111-1112-0082 女 27 rs17822931 CT 耳垢 你的耳垢类型是? 不清楚
    2 111-1112-4110 男 38 rs17822931 CT 耳垢 你的耳垢类型是? 湿耳
    3 111-1112-7043 男 33 rs17822931 TT 耳垢 你的耳垢类型是? 干耳
    >>> data.loc[(data.sex==u'男'),'sex']='male'
    >>> data.loc[(data.sex==u'女'),'sex']='female'
    >>> data.ix[1:3]
    barcode sex age rsid genotype proj question answer
    1 111-1112-0082 female 27 rs17822931 CT 耳垢 你的耳垢类型是? 不清楚
    2 111-1112-4110 male 38 rs17822931 CT 耳垢 你的耳垢类型是? 湿耳
    3 111-1112-7043 male 33 rs17822931 TT 耳垢 你的耳垢类型是? 干耳

#flatten sex ['male','female'] into two indicator columns 'sex_male' and 'sex_female'
    >>> data_sex = pd.get_dummies(data['sex'], prefix= 'sex')
    >>> data_sex.ix[1:3]
    sex_female sex_male
    1 1.0 0.0
    2 0.0 1.0
    3 0.0 1.0

    data.loc[(data.answer==u'干耳'),'answer']='dry'
    data.loc[(data.answer==u'湿耳'),'answer']='wet'
#filter out the rows whose answer is '不清楚' (unsure)
    filterdata=data[data.answer!=u'不清楚']
    filterdata.ix[1:3]
    barcode sex age rsid genotype proj question answer
    2 111-1112-4110 male 38 rs17822931 CT 耳垢 你的耳垢类型是? wet
    3 111-1112-7043 male 33 rs17822931 TT 耳垢 你的耳垢类型是? dry

    dummies_answer = pd.get_dummies(filterdata['answer'], prefix= 'answer')
    dummies_sex = pd.get_dummies(filterdata['sex'], prefix= 'sex')
    dummies_genotype=pd.get_dummies(filterdata['genotype'], prefix= 'genotype')
    filterdatafull=pd.concat([filterdata,dummies_answer,dummies_sex,dummies_genotype], axis=1)
    filterdatafull.drop(['sex','rsid','genotype','answer','proj','question'], axis=1, inplace=True)
