zoukankan      html  css  js  c++  java
  • python数据分析基础——pandas Tutorial

    参考pandas官方文档:

    http://pandas.pydata.org/pandas-docs/stable/10min.html#min

    1.pandas中的数据类型

    Series 带有索引标记的一维数组,可以存储任何数据类型

     1 #基本方法
     2 >>s =pd.Series(data, index=index)
     3 
     4 >>import pandas as pd
     5 >>import numpy as np
     6 
     7 # 使用ndarray创建
     8 >>indexs = ['a', 'b', 'c']
     9 >>s  = pd.Series(np.random.randn(3), index=indexs)
    10 >>s
    11 a   -1.817485
    12 b    0.012912
    13 c    0.866929
    14 dtype: float64
    15 >>s.index
    16 Index(['a', 'b', 'c'], dtype='object')
    17 
    18 #默认索引值
    19 >>s  = pd.Series(np.random.randn(3))
    20 >>s
    21 0    1.985833
    22 1    0.467035
    23 2    0.636828
    24 dtype: float64
    25 
    26 #使用dict创建
    27 #默认使用dict的索引
    28 >>d = {'a' : 0., 'b' : 1., 'c' : 2.}
    29 >>pd.Series(d)
    30 a    0.0
    31 b    1.0
    32 c    2.0
    33 dtype: float64
    34 
    35 #指明索引值
    36 >>pd.Series(d, index=['b', 'c', 'd', 'a'])
    37 b    1.0
    38 c    2.0
    39 d    NaN
    40 a    0.0
    41 dtype: float64
    42 
    43 #使用标量值创建
    44 >>pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
    45 a    5.0
    46 b    5.0
    47 c    5.0
    48 d    5.0
    49 e    5.0
    50 dtype: float64

    Series 类似ndarray,可以使用Numpy的很多语法

    >>s = pd.Series(np.random.randn(5),index=['a', 'b', 'c', 'd', 'e'])
    >>s
    a   -1.329486
    b    0.396057
    c   -1.156737
    d   -1.152107
    e   -0.787661
    dtype: float64
    
    # 索引
    >>s[0]
    -1.3294860342555725
    
    #切片
    >>s[:3]
    a   -1.329486
    b    0.396057
    c   -1.156737
    dtype: float64
    
    # 布尔索引(按条件筛选)
    >>s[s > s.median()]
    b    0.396057
    e   -0.787661
    dtype: float64
    
    # 按序索引
    >>s[[4,3,1]]
    e   -0.787661
    d   -1.152107
    b    0.396057
    dtype: float64
    
    >>np.exp(s)
    a    0.264613
    b    1.485954
    c    0.314511
    d    0.315970
    e    0.454908
    dtype: float64

    Series 类似dict类型,可以操作索引值

    >>s['a']
    -1.3294860342555725
    
    >>s['e']=12
    >>s
    a    -1.329486
    b     0.396057
    c    -1.156737
    d    -1.152107
    e    12.000000
    dtype: float64
    
    >>'e' in s
    True
    
    >>s.get('e')
    12.0
    
    >>s+s
    a    -2.658972
    b     0.792115
    c    -2.313474
    d    -2.304214
    e    24.000000
    dtype: float64
    
    >>s*2
    a    -2.658972
    b     0.792115
    c    -2.313474
    d    -2.304214
    e    24.000000
    dtype: float64
    
    #索引值自动对齐
    #s[1:]中没有a, s[:-1]中没有e,未对齐的标签结果为NaN
    >>s[1:] + s[:-1]
    a         NaN
    b    0.792115
    c   -2.313474
    d   -2.304214
    e         NaN
    dtype: float64

    Series的name属性,创建新对象

    #注意 name属性
    >>s = pd.Series(np.random.randn(5),name='sth')
    >>s
    0    1.338578
    1    2.074678
    2   -0.462777
    3    0.518763
    4   -0.372692
    Name: sth, dtype: float64
    
    # 使用rename方法
    >>s2 = s.rename('dif')
    >>s2
    0    1.338578
    1    2.074678
    2   -0.462777
    3    0.518763
    4   -0.372692
    Name: dif, dtype: float64
    
    >>id(s)
    2669465319632
    
    >>id(s2)
    2669465320416
    
    #s 与 s2是不同的对象,两者尽管值相同,但地址不同

    DataFrame  带索引值的二维数组,类似SQL的表,列项通常是不同的数据类型

    index 行索引,columns列索引

    #使用Series字典或字典创建DataFrame
    >>d= {'one':pd.Series([1.,2.,3.], index=['a','b','c']),         'two':pd.Series([1.,2.,3.,4.], index=['a','b','c','d'])}
    >>df = pd.DataFrame(d)
    >>df
       one  two
    a  1.0  1.0
    b  2.0  2.0
    c  3.0  3.0
    d  NaN  4.0
    
    # 按序输出
    >>pd.DataFrame(d, index=['d','b','a'])
       one  two
    d  NaN  4.0
    b  2.0  2.0
    a  1.0  1.0
    
    >>df.index
    Index(['a', 'b', 'c', 'd'], dtype='object')
    >>df.columns
    Index(['one', 'two'], dtype='object')
    
    #使用ndarrays/list字典
    >>d = {'one':[1.,2.,3.,4.],'two':[4.,3.,2.,1.]}
    >>pd.DataFrame(d)
       one  two
    0  1.0  4.0
    1  2.0  3.0
    2  3.0  2.0
    3  4.0  1.0
    
    #指定index
    >>pd.DataFrame(d,index=['a','b','c','d'])
       one  two
    a  1.0  4.0
    b  2.0  3.0
    c  3.0  2.0
    d  4.0  1.0

    DataFrame操作

    列选择、添加、删除

    >>df['one']
    a    1.0
    b    2.0
    c    3.0
    d    NaN
    Name: one, dtype: float64
    
    #添加 three 与 flag 列,总在尾部添加
    >>df['three'] = df['one'] * df['two']
    >>df['flag']=df['one']>2
    >>df
       one  two  three   flag
    a  1.0  1.0    1.0  False
    b  2.0  2.0    4.0  False
    c  3.0  3.0    9.0   True
    d  NaN  4.0    NaN  False
    
    # 删除
    >>del df['two']
    >>three = df.pop('three')
    >>three
    a    1.0
    b    4.0
    c    9.0
    d    NaN
    Name: three, dtype: float64
    
    >>df
       one   flag
    a  1.0  False
    b  2.0  False
    c  3.0   True
    d  NaN  False
    
    #可以将列数据截断
    >>df['one_trunc'] = df['one'][:2]
    >>df
       one   flag  one_trunc
    a  1.0  False        1.0
    b  2.0  False        2.0
    c  3.0   True        NaN
    d  NaN  False        NaN
    
    >>df['foo'] = 'bar'
    >>df
       one   flag  one_trunc  foo
    a  1.0  False        1.0  bar
    b  2.0  False        2.0  bar
    c  3.0   True        NaN  bar
    d  NaN  False        NaN  bar
    
    #使用insert函数可以在指定列后插入
    #在第1列后插入
    >>df.insert(1,'ba',df['one'])
    >>df
       one   ba   flag  one_trunc  foo
    a  1.0  1.0  False        1.0  bar
    b  2.0  2.0  False        2.0  bar
    c  3.0  3.0   True        NaN  bar
    d  NaN  NaN  False        NaN  bar

     索引、选择行

    选择列                df[col]              Series

    按照标签选择行    df.loc[label]   Series

    按照整数位置选择行   df.iloc[loc]  Series

    切分行      df[5:10]           DataFrame

    按照布尔向量选择行  df[bool_vec]   DataFrame

    >>d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
         'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
    >>df = pd.DataFrame(d)
    >>df
       one  two
    a  1.0  1.0
    b  2.0  2.0
    c  3.0  3.0
    d  NaN  4.0
    
    #按照标签选择行
    >>df.loc['b']
    one    2.0
    two    2.0
    Name: b, dtype: float64
    >>type(df.loc['b'])
    pandas.core.series.Series
    
    #按照整数位置选择行
    >>df.iloc[2]
    one    3.0
    two    3.0
    Name: c, dtype: float64
    
    #切分行
    >>df[1:3]
       one  two
    b  2.0  2.0
    c  3.0  3.0
    >>type(df[1:3])
    pandas.core.frame.DataFrame

    选择列

    >>df.one
    a    1.0
    b    2.0
    c    3.0
    d    NaN
    Name: one, dtype: float64
    
    >>df['one']
    a    1.0
    b    2.0
    c    3.0
    d    NaN
    Name: one, dtype: float64

    数据对齐与计算

    对齐:列与行标签自动对齐

    >>da = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
    >>db = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
    >>da +db
              A         B         C   D
    0 -0.920370 -0.529455 -2.386419 NaN
    1 -1.277148  1.292130  1.196099 NaN
    2  1.182199  0.454546  0.381586 NaN
    3  1.100170 -1.830894  1.105932 NaN
    4  0.507649  1.291516 -2.084368 NaN
    5 -1.198811 -2.180978  0.342185 NaN
    6  0.667211  2.141364  0.044136 NaN
    7       NaN       NaN       NaN NaN
    8       NaN       NaN       NaN NaN
    9       NaN       NaN       NaN NaN
    
    #支持Numpy操作
    >>np.exp(da)
    >>np.asarray(da)

    3维数据类型Panel,自0.20.0版本起已被弃用,并在0.25.0版本中移除

    新的类型xarray,用于支持多维数据

     

  • 相关阅读:
    模板元编程实现素数判定
    JDBC开发
    4.9 当相应行存在时更新
    QT5中如何使用QFtp类(这个类虽然没有被收录,但一直在更新)
    gcc和g++的区别
    Awesome C/C++(图像部分)
    Ubuntu更新源
    GO的GDB调试
    内核探测工具systemtap简介
    列举一下项目中使用的产品和技术
  • 原文地址:https://www.cnblogs.com/feinaio2017/p/8763610.html
Copyright © 2011-2022 走看看