zoukankan      html  css  js  c++  java
  • pandas tutorial

    import pandas as pd
    import numpy as np
    

    Series

    s = pd.Series()
    s
    
    Series([], dtype: float64)
    
    data1 = [1, 2, 3]
    data2 = np.array(data1, dtype=float)
    s1 = pd.Series(data1)
    s2 = pd.Series(data2)
    print(s1)
    print(s2)
    
    0    1
    1    2
    2    3
    dtype: int64
    0    1.0
    1    2.0
    2    3.0
    dtype: float64
    
    s3 = pd.Series(data1, dtype=float)
    s3
    
    0    1.0
    1    2.0
    2    3.0
    dtype: float64
    

    我们可以看到,如果我们不指定dtype, 那么其会自行推断

    data = np.array(['a', 'b', 'c', 'd'])
    s = pd.Series(data, index=np.arange(100, 104))
    s
    
    100    a
    101    b
    102    c
    103    d
    dtype: object
    

    利用dict来创建series

    data = {'a':0, "b":1, 'c':2}
    s = pd.Series(data)
    s
    
    a    0
    b    1
    c    2
    dtype: int64
    
    s.index
    
    Index(['a', 'b', 'c'], dtype='object')
    
    s = pd.Series(data, index=['b', 'c', 'd', 'a'])
    s
    
    b    1.0
    c    2.0
    d    NaN
    a    0.0
    dtype: float64
    

    利用标量创建series

    s = pd.Series(5, index=np.arange(5, 9))
    s
    
    5    5
    6    5
    7    5
    8    5
    dtype: int64
    

    s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
    s
    
    a    1
    b    2
    c    3
    d    4
    e    5
    dtype: int64
    
    s[0], s[1], s[2]
    
    (1, 2, 3)
    
    s[:2], s[2:]
    
    (a    1
     b    2
     dtype: int64, c    3
     d    4
     e    5
     dtype: int64)
    
    s[-3:]
    
    c    3
    d    4
    e    5
    dtype: int64
    
    s['a'], s['b'], s['c']
    
    (1, 2, 3)
    
    s[['a', 'b', 'e']]
    
    a    1
    b    2
    e    5
    dtype: int64
    

    Dataframe

    pandas.DataFrame(data, index, columns, dtype, copy)

    df = pd.DataFrame()
    df
    
    data = [1, 2, 3, 4, 5]
    df = pd.DataFrame(data)
    df
    
    0
    0 1
    1 2
    2 3
    3 4
    4 5
    data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
    df = pd.DataFrame(data, columns=['Name', 'Age'])
    df
    
    Name Age
    0 Alex 10
    1 Bob 12
    2 Clarke 13

    利用dict创建dataframe

    data = {'Name':['Alex', 'Bob', 'Clarke'], 'Age':[10., 12., 13.]}
    df = pd.DataFrame(data)
    df
    
    Name Age
    0 Alex 10.0
    1 Bob 12.0
    2 Clarke 13.0
    data = {'Name':['Alex', 'Bob', 'Clarke'], 'Age':[10., 12., 'NaN']}  #长度需要匹配
    df = pd.DataFrame(data, index=['rank1', 'rank2', 'rank3'])
    df
    
    Name Age
    rank1 Alex 10
    rank2 Bob 12
    rank3 Clarke NaN
    data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}] #长度无需匹配
    df = pd.DataFrame(data)
    df
    
    a b c
    0 1 2 NaN
    1 5 10 20.0
    df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b'])
    df2 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b1'])
    print(df1)
    print(df2)
    
            a   b
    first   1   2
    second  5  10
            a  b1
    first   1 NaN
    second  5 NaN
    
    data = {
        'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        'two': pd.Series([1, 2, 3, 4.], index=['a', 'b', 'c', 'd'])
    }
    df = pd.DataFrame(data)
    df
    
    one two
    a 1.0 1.0
    b 2.0 2.0
    c 3.0 3.0
    d NaN 4.0

    选择

    df['one']
    
    a    1.0
    b    2.0
    c    3.0
    d    NaN
    Name: one, dtype: float64
    

    添加列

    df['three'] = pd.Series([10, 20, 30])
    
    df
    
    one two three
    a 1.0 1.0 NaN
    b 2.0 2.0 NaN
    c 3.0 3.0 NaN
    d NaN 4.0 NaN
    df['three'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
    df
    
    one two three
    a 1.0 1.0 10.0
    b 2.0 2.0 20.0
    c 3.0 3.0 30.0
    d NaN 4.0 NaN
    df['four'] = df['one'] + df['two']
    df
    
    one two three four
    a 1.0 1.0 10.0 2.0
    b 2.0 2.0 20.0 4.0
    c 3.0 3.0 30.0 6.0
    d NaN 4.0 NaN NaN

    列移除

    del df['one']
    df
    
    two three four
    a 1.0 10.0 2.0
    b 2.0 20.0 4.0
    c 3.0 30.0 6.0
    d 4.0 NaN NaN
    df.pop('three')
    df
    
    two four
    a 1.0 2.0
    b 2.0 4.0
    c 3.0 6.0
    d 4.0 NaN

    行的选择, 添加, 移除

    data = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
       'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
    df = pd.DataFrame(data)
    df
    
    one two
    a 1.0 1
    b 2.0 2
    c 3.0 3
    d NaN 4
    df.loc['b']  # row b
    
    one    2.0
    two    2.0
    Name: b, dtype: float64
    
    df.iloc[1] # 按照0, 1, 2...的顺序选择
    
    one    2.0
    two    2.0
    Name: b, dtype: float64
    
    df[2:4]  #df[0]是错的
    
    one two
    c 3.0 3
    d NaN 4
    df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
    df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])
    df = df.append(df2)  #通过append可以在数据框后面添加数据,但是需要注意的这个操作并不会改变数据本身而是返回一个副本
    df
    
    a b
    0 1 2
    1 3 4
    0 5 6
    1 7 8
    df.drop(1)  #利用drop可以依照index来删除某些行,比如1,即把index=1的行均移除, 同样也是返回一个副本
    
    a b
    0 1 2
    0 5 6

    Panel

    pandas.Panel(data, items, major_axis, minor_axis, dtype, copy)

    items: axis=0

    major_axis: axis=1

    minor_axis: axis=2

    data = np.random.rand(2, 4, 5)
    data
    
    array([[[0.13766405, 0.31453832, 0.51876265, 0.97380794, 0.28314695],
            [0.02942928, 0.28957222, 0.38716041, 0.67941481, 0.54108452],
            [0.84420857, 0.60339649, 0.49242029, 0.34838561, 0.91342058],
            [0.1127622 , 0.28420695, 0.22687715, 0.06842055, 0.87414373]],
    
           [[0.07591772, 0.86028356, 0.30468089, 0.15491769, 0.04969857],
            [0.31649918, 0.85154403, 0.73062637, 0.99916418, 0.3809675 ],
            [0.63817574, 0.81089715, 0.41390597, 0.6660661 , 0.91651907],
            [0.24497635, 0.43923643, 0.01833888, 0.98348271, 0.89717517]]])
    
    p = pd.Panel(data)
    p
    
    C:\Ana\lib\site-packages\IPython\core\interactiveshell.py:3267: FutureWarning: 
    Panel is deprecated and will be removed in a future version.
    The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
    Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
    Pandas provides a `.to_xarray()` method to help automate this conversion.
    
      exec(code_obj, self.user_global_ns, self.user_ns)
    
    
    
    
    
    <class 'pandas.core.panel.Panel'>
    Dimensions: 2 (items) x 4 (major_axis) x 5 (minor_axis)
    Items axis: 0 to 1
    Major_axis axis: 0 to 3
    Minor_axis axis: 0 to 4
    
    data = {'Item1' : pd.DataFrame(np.random.randn(4, 3)), 
       'Item2' : pd.DataFrame(np.random.randn(4, 2))}
    p = pd.Panel(data)
    p
    
    C:\Ana\lib\site-packages\IPython\core\interactiveshell.py:3267: FutureWarning: 
    Panel is deprecated and will be removed in a future version.
    The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
    Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
    Pandas provides a `.to_xarray()` method to help automate this conversion.
    
      exec(code_obj, self.user_global_ns, self.user_ns)
    
    
    
    
    
    <class 'pandas.core.panel.Panel'>
    Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
    Items axis: Item1 to Item2
    Major_axis axis: 0 to 3
    Minor_axis axis: 0 to 2
    
    p['Item1']
    
    0 1 2
    0 1.796552 1.614647 -2.199413
    1 -1.213886 -1.438678 -1.045931
    2 -2.178608 1.212732 0.526674
    3 -0.360727 -0.135351 0.678293
    p.major_xs(0)
    
    Item1 Item2
    0 1.796552 0.845528
    1 1.614647 -0.708260
    2 -2.199413 NaN
    p.minor_xs(0)
    
    Item1 Item2
    0 1.796552 0.845528
    1 -1.213886 0.555775
    2 -2.178608 0.925129
    3 -0.360727 -0.380906
    p.major_axis, p.minor_axis
    
    (RangeIndex(start=0, stop=4, step=1), RangeIndex(start=0, stop=3, step=1))
    

    Basic Functionality

    Series

    s = pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
    s
    
    a    0.795298
    b    0.141144
    c    0.125098
    d    0.965541
    e    0.957783
    dtype: float64
    
    s.axes  #返回index
    
    [Index(['a', 'b', 'c', 'd', 'e'], dtype='object')]
    
    s.dtype
    
    dtype('float64')
    
    s.empty  #是否为空
    
    False
    
    s.ndim
    
    1
    
    s.size
    
    5
    
    s.values  #以ndarray的形式返回数值
    
    array([0.79529816, 0.14114367, 0.12509848, 0.96554135, 0.95778323])
    
    s.head(3)  #返回前n个
    
    a    0.795298
    b    0.141144
    c    0.125098
    dtype: float64
    
    s.tail(3)  #返回后n个
    
    c    0.125098
    d    0.965541
    e    0.957783
    dtype: float64
    

    DataFrame

    data = {"a":[1, 2, 3], 'b':['r', 'g', 'b']}
    df = pd.DataFrame(data, index=['first', 'second', "third"])
    df
    
    a b
    first 1 r
    second 2 g
    third 3 b
    df.T  #转置
    
    first second third
    a 1 2 3
    b r g b
    df.axes  #有俩个Index
    
    [Index(['first', 'second', 'third'], dtype='object'),
     Index(['a', 'b'], dtype='object')]
    
    df.dtypes
    
    a     int64
    b    object
    dtype: object
    
    df.empty
    
    False
    
    df.ndim
    
    2
    
    df.shape
    
    (3, 2)
    
    df.size  #3 x 2
    
    6
    
    df.values  #以ndarray 的形式返回
    
    array([[1, 'r'],
           [2, 'g'],
           [3, 'b']], dtype=object)
    
    df.head(2)
    
    a b
    first 1 r
    second 2 g
    df.tail(2)
    
    a b
    second 2 g
    third 3 b

    Descriptive Statistic

    #Create a Dictionary of series
    d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack',
       'Lee','David','Gasper','Betina','Andres']),
       'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
       'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])
    }
    
    #Create a DataFrame
    df = pd.DataFrame(d)
    df
    
    Name Age Rating
    0 Tom 25 4.23
    1 James 26 3.24
    2 Ricky 25 3.98
    3 Vin 23 2.56
    4 Steve 30 3.20
    5 Smith 29 4.60
    6 Jack 23 3.80
    7 Lee 34 3.78
    8 David 40 2.98
    9 Gasper 30 4.80
    10 Betina 51 4.10
    11 Andres 46 3.65
    df.sum()  #默认axis=0, 即对每一列求和(从输出可见得到的是各列的总和)
    
    Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
    Age                                                     382
    Rating                                                44.92
    dtype: object
    
    df.sum()[1]
    
    382
    
    df.sum(1) #axis=1, 即对每一行求和 居然自动避开了字符串...
    
    0     29.23
    1     29.24
    2     28.98
    3     25.56
    4     33.20
    5     33.60
    6     26.80
    7     37.78
    8     42.98
    9     34.80
    10    55.10
    11    49.65
    dtype: float64
    
    df.mean()
    
    Age       31.833333
    Rating     3.743333
    dtype: float64
    
    df.mean()['Age']
    
    31.833333333333332
    
    df.std()
    
    Age       9.232682
    Rating    0.661628
    dtype: float64
    

    1 count() Number of non-null observations

    2 sum() Sum of values

    3 mean() Mean of Values

    4 median() Median of Values

    5 mode() Mode of values

    6 std() Standard Deviation of the Values

    7 min() Minimum Value

    8 max() Maximum Value

    9 abs() Absolute Value

    10 prod() Product of Values

    11 cumsum() Cumulative Sum

    12 cumprod() Cumulative Product

    df.describe()
    
    Age Rating
    count 12.000000 12.000000
    mean 31.833333 3.743333
    std 9.232682 0.661628
    min 23.000000 2.560000
    25% 25.000000 3.230000
    50% 29.500000 3.790000
    75% 35.500000 4.132500
    max 51.000000 4.800000
    df.describe(include=['object'])
    
    Name
    count 12
    unique 12
    top Gasper
    freq 1

    object: 针对字符列

    number: 针对数字列

    all: 针对全部

    df.describe(include='all') #注意'all'不要以列表的形式传入, 否则会报错
    
    Name Age Rating
    count 12 12.000000 12.000000
    unique 12 NaN NaN
    top Gasper NaN NaN
    freq 1 NaN NaN
    mean NaN 31.833333 3.743333
    std NaN 9.232682 0.661628
    min NaN 23.000000 2.560000
    25% NaN 25.000000 3.230000
    50% NaN 29.500000 3.790000
    75% NaN 35.500000 4.132500
    max NaN 51.000000 4.800000
    df.describe(include='object')
    
    Name
    count 12
    unique 12
    top Gasper
    freq 1

    绑定自定义函数 pipe, apply, applymap

    pipe(func, ...) -- 作用于整个表格

    apply(func, 0) -- 作用于列或者行

    applymap(func) -- 作用于每个元素

    df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
    df
    
    col1 col2 col3
    0 1.363701 0.533478 1.850151
    1 0.541553 -1.190178 1.204944
    2 0.181793 -0.199892 -0.602374
    3 -0.411247 1.978019 1.183671
    4 -0.045223 1.444328 -0.121690
    def adder(ele1,ele2):
        print("****")
        print(ele1)
        print("****")
        print(ele2)
        return ele1+ele2
    
    df.pipe(adder, 2)  #可以看出,ele1 == df, ele2 == 2
    
    ****
           col1      col2      col3
    0  1.363701  0.533478  1.850151
    1  0.541553 -1.190178  1.204944
    2  0.181793 -0.199892 -0.602374
    3 -0.411247  1.978019  1.183671
    4 -0.045223  1.444328 -0.121690
    ****
    2
    
    col1 col2 col3
    0 3.363701 2.533478 3.850151
    1 2.541553 0.809822 3.204944
    2 2.181793 1.800108 1.397626
    3 1.588753 3.978019 3.183671
    4 1.954777 3.444328 1.878310
    df2 = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
    df2
    
    col1 col2 col3
    0 0.386211 1.297222 -0.413626
    1 -1.873829 -0.007802 -0.857307
    2 -0.881874 -2.026235 0.540769
    3 0.458257 -0.590630 0.685780
    4 0.177258 -1.843835 0.131939
    def inf_print(x):
        print(type(x))
        print("*******")
        print(x)
    
    df2.apply(inf_print, 0)
    
    <class 'pandas.core.series.Series'>
    *******
    0    0.386211
    1   -1.873829
    2   -0.881874
    3    0.458257
    4    0.177258
    Name: col1, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    0    1.297222
    1   -0.007802
    2   -2.026235
    3   -0.590630
    4   -1.843835
    Name: col2, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    0   -0.413626
    1   -0.857307
    2    0.540769
    3    0.685780
    4    0.131939
    Name: col3, dtype: float64
    
    
    
    
    
    col1    None
    col2    None
    col3    None
    dtype: object
    
    df2.apply(inf_print, 1)
    
    <class 'pandas.core.series.Series'>
    *******
    col1    0.386211
    col2    1.297222
    col3   -0.413626
    Name: 0, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    col1   -1.873829
    col2   -0.007802
    col3   -0.857307
    Name: 1, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    col1   -0.881874
    col2   -2.026235
    col3    0.540769
    Name: 2, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    col1    0.458257
    col2   -0.590630
    col3    0.685780
    Name: 3, dtype: float64
    <class 'pandas.core.series.Series'>
    *******
    col1    0.177258
    col2   -1.843835
    col3    0.131939
    Name: 4, dtype: float64
    
    
    
    
    
    0    None
    1    None
    2    None
    3    None
    4    None
    dtype: object
    
    df2.apply(np.mean)  #可以知道, apply会将列(默认,如果第二个参数为1则为行)一次次传入
    
    col1   -0.346795
    col2   -0.634256
    col3    0.017511
    dtype: float64
    
    df2.applymap(lambda x: x*100)
    
    col1 col2 col3
    0 38.621115 129.722168 -41.362553
    1 -187.382897 -0.780203 -85.730689
    2 -88.187450 -202.623474 54.076880
    3 45.825709 -59.062993 68.578010
    4 17.725785 -184.383525 13.193880

    Reindex

    N=20
    
    df = pd.DataFrame({
       'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
       'x': np.linspace(0,stop=N-1,num=N),
       'y': np.random.rand(N),
       'C': np.random.choice(['Low','Medium','High'],N).tolist(),
       'D': np.random.normal(100, 10, size=(N)).tolist()
    })
    df
    
    A x y C D
    0 2016-01-01 0.0 0.173488 High 117.632385
    1 2016-01-02 1.0 0.493186 Low 114.066702
    2 2016-01-03 2.0 0.982273 High 102.389228
    3 2016-01-04 3.0 0.329518 Low 104.405035
    4 2016-01-05 4.0 0.392182 Medium 85.867100
    5 2016-01-06 5.0 0.905708 High 103.248690
    6 2016-01-07 6.0 0.731801 Low 100.177698
    7 2016-01-08 7.0 0.772975 High 97.365013
    8 2016-01-09 8.0 0.953258 Low 90.228303
    9 2016-01-10 9.0 0.503579 High 99.946431
    10 2016-01-11 10.0 0.580698 Low 88.411279
    11 2016-01-12 11.0 0.268562 High 91.238630
    12 2016-01-13 12.0 0.462713 High 86.720994
    13 2016-01-14 13.0 0.482387 High 104.549789
    14 2016-01-15 14.0 0.963168 Medium 108.565120
    15 2016-01-16 15.0 0.692654 High 112.370992
    16 2016-01-17 16.0 0.716956 High 112.949463
    17 2016-01-18 17.0 0.897878 Low 107.860172
    18 2016-01-19 18.0 0.289202 Medium 90.430672
    19 2016-01-20 19.0 0.957986 High 115.225753
    df.reindex(index=(0, 2, 5), columns=['A', 'C', 'B'])  #没有B所以会补
    
    A C B
    0 2016-01-01 High NaN
    2 2016-01-03 High NaN
    5 2016-01-06 High NaN

    reindex_like

    df1 = pd.DataFrame(np.random.randn(4, 4))
    df2 = pd.DataFrame(np.random.randn(3, 3))
    df1
    
    0 1 2 3
    0 -2.515820 -0.027034 -0.695420 0.368491
    1 -1.055241 0.778208 -1.062983 -1.715173
    2 0.178253 -0.186661 0.615827 1.379872
    3 -1.316952 -0.209785 -0.953194 -0.138620
    df2
    
    0 1 2
    0 -0.913434 1.641339 -2.418425
    1 0.113041 0.721168 0.446690
    2 2.606504 -0.972984 -2.588228
    df1.reindex_like(df2)
    
    0 1 2
    0 -2.515820 -0.027034 -0.695420
    1 -1.055241 0.778208 -1.062983
    2 0.178253 -0.186661 0.615827

    插补

    pad/ffill - 用前面的值填充

    bfill/backfill - 用后面的值填充

    nearest - 用最近的值填充

    df2.reindex_like(df1, method="ffill")
    
    0 1 2 3
    0 -0.913434 1.641339 -2.418425 -2.418425
    1 0.113041 0.721168 0.446690 0.446690
    2 2.606504 -0.972984 -2.588228 -2.588228
    3 2.606504 -0.972984 -2.588228 -2.588228
    df2.reindex_like(df1, method="bfill")
    
    0 1 2 3
    0 -0.913434 1.641339 -2.418425 NaN
    1 0.113041 0.721168 0.446690 NaN
    2 2.606504 -0.972984 -2.588228 NaN
    3 NaN NaN NaN NaN
    df2.reindex_like(df1, method="nearest")
    
    0 1 2 3
    0 -0.913434 1.641339 -2.418425 -2.418425
    1 0.113041 0.721168 0.446690 0.446690
    2 2.606504 -0.972984 -2.588228 -2.588228
    3 2.606504 -0.972984 -2.588228 -2.588228

    limit

    df1 = pd.DataFrame(np.random.randn(7, 7))
    df2 = pd.DataFrame(np.random.randn(3, 3))
    df1, df2
    
    (          0         1         2         3         4         5         6
     0  0.592257  0.913287  1.276314  0.064212  1.338661 -0.110666 -0.459020
     1 -0.104347  0.388397 -1.822243  1.927027  0.890738  0.577283 -0.302798
     2 -0.016216 -1.101383  0.128118 -0.138639  1.642480 -1.382323 -0.835393
     3  1.411169 -0.395379 -0.412377  0.661016 -0.602245  0.558017  0.588833
     4  0.609378  0.338787 -0.858829  0.006657  1.509428 -0.283262 -0.563293
     5 -1.316789  0.152338 -1.027535  0.026238 -0.052540  1.233837 -1.028193
     6  0.992425  1.364755 -1.384109 -1.888707 -0.259932 -0.207928  0.135734,
               0         1         2
     0 -0.297591  1.019611  0.892070
     1  0.881763 -0.498356  1.708343
     2  0.123616  0.875709  0.387768)
    
    df2.reindex_like(df1, method="ffill", limit=1)
    
    0 1 2 3 4 5 6
    0 -0.297591 1.019611 0.892070 0.892070 NaN NaN NaN
    1 0.881763 -0.498356 1.708343 1.708343 NaN NaN NaN
    2 0.123616 0.875709 0.387768 0.387768 NaN NaN NaN
    3 0.123616 0.875709 0.387768 0.387768 NaN NaN NaN
    4 NaN NaN NaN NaN NaN NaN NaN
    5 NaN NaN NaN NaN NaN NaN NaN
    6 NaN NaN NaN NaN NaN NaN NaN
    df2.reindex_like(df1, method="ffill", limit=2)  #可以发现,limit限制了填补的最大数量
    
    0 1 2 3 4 5 6
    0 -0.297591 1.019611 0.892070 0.892070 0.892070 NaN NaN
    1 0.881763 -0.498356 1.708343 1.708343 1.708343 NaN NaN
    2 0.123616 0.875709 0.387768 0.387768 0.387768 NaN NaN
    3 0.123616 0.875709 0.387768 0.387768 0.387768 NaN NaN
    4 0.123616 0.875709 0.387768 0.387768 0.387768 NaN NaN
    5 NaN NaN NaN NaN NaN NaN NaN
    6 NaN NaN NaN NaN NaN NaN NaN

    Renaming

    df2
    
    0 1 2
    0 -0.297591 1.019611 0.892070
    1 0.881763 -0.498356 1.708343
    2 0.123616 0.875709 0.387768
    df2.rename(columns={0:'A', 1:'B', 2:'C', 3:'D'},
              index={0:'a', 1:'b', 2:'c', 3:'d'})
    
    A B C
    a -0.297591 1.019611 0.892070
    b 0.881763 -0.498356 1.708343
    c 0.123616 0.875709 0.387768
    df2  #注意返回的是一个副本
    
    0 1 2
    0 -0.297591 1.019611 0.892070
    1 0.881763 -0.498356 1.708343
    2 0.123616 0.875709 0.387768

    Iteration

    N=20
    df = pd.DataFrame({
       'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
       'x': np.linspace(0,stop=N-1,num=N),
       'y': np.random.rand(N),
       'C': np.random.choice(['Low','Medium','High'],N).tolist(),
       'D': np.random.normal(100, 10, size=(N)).tolist()
       })
    df
    
    A x y C D
    0 2016-01-01 0.0 0.529153 Low 110.430517
    1 2016-01-02 1.0 0.713513 Low 125.401221
    2 2016-01-03 2.0 0.751809 Medium 112.446846
    3 2016-01-04 3.0 0.124047 High 108.633343
    4 2016-01-05 4.0 0.472205 Medium 102.750572
    5 2016-01-06 5.0 0.221076 High 108.208930
    6 2016-01-07 6.0 0.231904 High 104.982321
    7 2016-01-08 7.0 0.567697 Medium 117.178737
    8 2016-01-09 8.0 0.384391 Medium 94.160408
    9 2016-01-10 9.0 0.109675 Medium 108.560830
    10 2016-01-11 10.0 0.681480 High 101.400936
    11 2016-01-12 11.0 0.918687 Medium 102.421124
    12 2016-01-13 12.0 0.332227 High 99.464727
    13 2016-01-14 13.0 0.373779 High 107.219963
    14 2016-01-15 14.0 0.412173 Low 97.184597
    15 2016-01-16 15.0 0.194842 Medium 96.671218
    16 2016-01-17 16.0 0.372288 Low 105.270272
    17 2016-01-18 17.0 0.068876 Low 101.112631
    18 2016-01-19 18.0 0.391142 High 102.240937
    19 2016-01-20 19.0 0.942600 Low 92.492350
    for col in df:
        print(col)
    
    A
    x
    y
    C
    D
    

    iteritems() (key, value)

    for key, value in df.iteritems():
        print(key)
        print("*******")
        print(value)
        print("#######")
    
    A
    *******
    0    2016-01-01
    1    2016-01-02
    2    2016-01-03
    3    2016-01-04
    4    2016-01-05
    5    2016-01-06
    6    2016-01-07
    7    2016-01-08
    8    2016-01-09
    9    2016-01-10
    10   2016-01-11
    11   2016-01-12
    12   2016-01-13
    13   2016-01-14
    14   2016-01-15
    15   2016-01-16
    16   2016-01-17
    17   2016-01-18
    18   2016-01-19
    19   2016-01-20
    Name: A, dtype: datetime64[ns]
    #######
    x
    *******
    0      0.0
    1      1.0
    2      2.0
    3      3.0
    4      4.0
    5      5.0
    6      6.0
    7      7.0
    8      8.0
    9      9.0
    10    10.0
    11    11.0
    12    12.0
    13    13.0
    14    14.0
    15    15.0
    16    16.0
    17    17.0
    18    18.0
    19    19.0
    Name: x, dtype: float64
    #######
    y
    *******
    0     0.529153
    1     0.713513
    2     0.751809
    3     0.124047
    4     0.472205
    5     0.221076
    6     0.231904
    7     0.567697
    8     0.384391
    9     0.109675
    10    0.681480
    11    0.918687
    12    0.332227
    13    0.373779
    14    0.412173
    15    0.194842
    16    0.372288
    17    0.068876
    18    0.391142
    19    0.942600
    Name: y, dtype: float64
    #######
    C
    *******
    0        Low
    1        Low
    2     Medium
    3       High
    4     Medium
    5       High
    6       High
    7     Medium
    8     Medium
    9     Medium
    10      High
    11    Medium
    12      High
    13      High
    14       Low
    15    Medium
    16       Low
    17       Low
    18      High
    19       Low
    Name: C, dtype: object
    #######
    D
    *******
    0     110.430517
    1     125.401221
    2     112.446846
    3     108.633343
    4     102.750572
    5     108.208930
    6     104.982321
    7     117.178737
    8      94.160408
    9     108.560830
    10    101.400936
    11    102.421124
    12     99.464727
    13    107.219963
    14     97.184597
    15     96.671218
    16    105.270272
    17    101.112631
    18    102.240937
    19     92.492350
    Name: D, dtype: float64
    #######
    

    iterrows() (index, series)

    for index, row in df.iterrows():
        print(index)
        print("********")
        print(row)
        print("########")
    
    0
    ********
    A    2016-01-01 00:00:00
    x                      0
    y               0.529153
    C                    Low
    D                110.431
    Name: 0, dtype: object
    ########
    1
    ********
    A    2016-01-02 00:00:00
    x                      1
    y               0.713513
    C                    Low
    D                125.401
    Name: 1, dtype: object
    ########
    2
    ********
    A    2016-01-03 00:00:00
    x                      2
    y               0.751809
    C                 Medium
    D                112.447
    Name: 2, dtype: object
    ########
    3
    ********
    A    2016-01-04 00:00:00
    x                      3
    y               0.124047
    C                   High
    D                108.633
    Name: 3, dtype: object
    ########
    4
    ********
    A    2016-01-05 00:00:00
    x                      4
    y               0.472205
    C                 Medium
    D                102.751
    Name: 4, dtype: object
    ########
    5
    ********
    A    2016-01-06 00:00:00
    x                      5
    y               0.221076
    C                   High
    D                108.209
    Name: 5, dtype: object
    ########
    6
    ********
    A    2016-01-07 00:00:00
    x                      6
    y               0.231904
    C                   High
    D                104.982
    Name: 6, dtype: object
    ########
    7
    ********
    A    2016-01-08 00:00:00
    x                      7
    y               0.567697
    C                 Medium
    D                117.179
    Name: 7, dtype: object
    ########
    8
    ********
    A    2016-01-09 00:00:00
    x                      8
    y               0.384391
    C                 Medium
    D                94.1604
    Name: 8, dtype: object
    ########
    9
    ********
    A    2016-01-10 00:00:00
    x                      9
    y               0.109675
    C                 Medium
    D                108.561
    Name: 9, dtype: object
    ########
    10
    ********
    A    2016-01-11 00:00:00
    x                     10
    y                0.68148
    C                   High
    D                101.401
    Name: 10, dtype: object
    ########
    11
    ********
    A    2016-01-12 00:00:00
    x                     11
    y               0.918687
    C                 Medium
    D                102.421
    Name: 11, dtype: object
    ########
    12
    ********
    A    2016-01-13 00:00:00
    x                     12
    y               0.332227
    C                   High
    D                99.4647
    Name: 12, dtype: object
    ########
    13
    ********
    A    2016-01-14 00:00:00
    x                     13
    y               0.373779
    C                   High
    D                 107.22
    Name: 13, dtype: object
    ########
    14
    ********
    A    2016-01-15 00:00:00
    x                     14
    y               0.412173
    C                    Low
    D                97.1846
    Name: 14, dtype: object
    ########
    15
    ********
    A    2016-01-16 00:00:00
    x                     15
    y               0.194842
    C                 Medium
    D                96.6712
    Name: 15, dtype: object
    ########
    16
    ********
    A    2016-01-17 00:00:00
    x                     16
    y               0.372288
    C                    Low
    D                 105.27
    Name: 16, dtype: object
    ########
    17
    ********
    A    2016-01-18 00:00:00
    x                     17
    y              0.0688757
    C                    Low
    D                101.113
    Name: 17, dtype: object
    ########
    18
    ********
    A    2016-01-19 00:00:00
    x                     18
    y               0.391142
    C                   High
    D                102.241
    Name: 18, dtype: object
    ########
    19
    ********
    A    2016-01-20 00:00:00
    x                     19
    y                 0.9426
    C                    Low
    D                92.4924
    Name: 19, dtype: object
    ########
    

    itertuples()

    for row in df.itertuples():
        print(row)
        print("*********")
    
    Pandas(Index=0, A=Timestamp('2016-01-01 00:00:00'), x=0.0, y=0.5291527485322772, C='Low', D=110.43051702923863)
    *********
    Pandas(Index=1, A=Timestamp('2016-01-02 00:00:00'), x=1.0, y=0.713512538332376, C='Low', D=125.40122093094763)
    *********
    Pandas(Index=2, A=Timestamp('2016-01-03 00:00:00'), x=2.0, y=0.7518093449140011, C='Medium', D=112.44684623090683)
    *********
    Pandas(Index=3, A=Timestamp('2016-01-04 00:00:00'), x=3.0, y=0.12404682661025335, C='High', D=108.63334270085768)
    *********
    Pandas(Index=4, A=Timestamp('2016-01-05 00:00:00'), x=4.0, y=0.47220500135853094, C='Medium', D=102.75057211144569)
    *********
    Pandas(Index=5, A=Timestamp('2016-01-06 00:00:00'), x=5.0, y=0.22107632396965704, C='High', D=108.20892974035311)
    *********
    Pandas(Index=6, A=Timestamp('2016-01-07 00:00:00'), x=6.0, y=0.23190410081052582, C='High', D=104.98232144314449)
    *********
    Pandas(Index=7, A=Timestamp('2016-01-08 00:00:00'), x=7.0, y=0.5676969704991909, C='Medium', D=117.17873695254926)
    *********
    Pandas(Index=8, A=Timestamp('2016-01-09 00:00:00'), x=8.0, y=0.38439055971010483, C='Medium', D=94.16040790153708)
    *********
    Pandas(Index=9, A=Timestamp('2016-01-10 00:00:00'), x=9.0, y=0.10967465769586215, C='Medium', D=108.56083032097501)
    *********
    Pandas(Index=10, A=Timestamp('2016-01-11 00:00:00'), x=10.0, y=0.6814801929159177, C='High', D=101.40093570017285)
    *********
    Pandas(Index=11, A=Timestamp('2016-01-12 00:00:00'), x=11.0, y=0.9186874162117078, C='Medium', D=102.42112353899493)
    *********
    Pandas(Index=12, A=Timestamp('2016-01-13 00:00:00'), x=12.0, y=0.33222699128916544, C='High', D=99.46472715055548)
    *********
    Pandas(Index=13, A=Timestamp('2016-01-14 00:00:00'), x=13.0, y=0.37377940932622644, C='High', D=107.21996306704972)
    *********
    Pandas(Index=14, A=Timestamp('2016-01-15 00:00:00'), x=14.0, y=0.41217288447139533, C='Low', D=97.1845970026168)
    *********
    Pandas(Index=15, A=Timestamp('2016-01-16 00:00:00'), x=15.0, y=0.19484179666549728, C='Medium', D=96.67121785562782)
    *********
    Pandas(Index=16, A=Timestamp('2016-01-17 00:00:00'), x=16.0, y=0.3722882537710307, C='Low', D=105.27027217632694)
    *********
    Pandas(Index=17, A=Timestamp('2016-01-18 00:00:00'), x=17.0, y=0.068875657049556, C='Low', D=101.11263086450178)
    *********
    Pandas(Index=18, A=Timestamp('2016-01-19 00:00:00'), x=18.0, y=0.3911420688006072, C='High', D=102.24093699498466)
    *********
    Pandas(Index=19, A=Timestamp('2016-01-20 00:00:00'), x=19.0, y=0.9425996619637542, C='Low', D=92.49235045195462)
    *********
    

    教程上说,迭代的东西是一个副本,所以对其中的元素进行更改是不会影响原数据的.

    Sorting

    sort_index

    unsorted_df=pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns=['col2','col1'])
    unsorted_df
    
    col2 col1
    1 0.418578 -0.556598
    4 0.513646 1.436592
    6 0.830816 1.500456
    2 -0.373790 -0.578432
    3 0.961146 0.991754
    5 0.826093 -0.345533
    9 0.881435 0.934766
    8 -1.388952 -1.276708
    0 -0.685924 -0.210499
    7 1.556807 -0.652186
    unsorted_df.sort_index(ascending=False) #通过label排序
    
    col2 col1
    9 0.881435 0.934766
    8 -1.388952 -1.276708
    7 1.556807 -0.652186
    6 0.830816 1.500456
    5 0.826093 -0.345533
    4 0.513646 1.436592
    3 0.961146 0.991754
    2 -0.373790 -0.578432
    1 0.418578 -0.556598
    0 -0.685924 -0.210499
    help(unsorted_df.sort_index)
    
    Help on method sort_index in module pandas.core.frame:
    
    sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None) method of pandas.core.frame.DataFrame instance
        Sort object by labels (along an axis)
        
        Parameters
        ----------
        axis : index, columns to direct sorting
        level : int or level name or list of ints or list of level names
            if not None, sort on values in specified index level(s)
        ascending : boolean, default True
            Sort ascending vs. descending
        inplace : bool, default False
            if True, perform operation in-place
        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
             Choice of sorting algorithm. See also ndarray.np.sort for more
             information.  `mergesort` is the only stable algorithm. For
             DataFrames, this option is only applied when sorting on a single
             column or label.
        na_position : {'first', 'last'}, default 'last'
             `first` puts NaNs at the beginning, `last` puts NaNs at the end.
             Not implemented for MultiIndex.
        sort_remaining : bool, default True
            if true and sorting by level and index is multilevel, sort by other
            levels too (in order) after sorting by specified level
        
        Returns
        -------
        sorted_obj : DataFrame
    
    unsorted_df.sort_index(axis=1)
    
    col1 col2
    1 -0.556598 0.418578
    4 1.436592 0.513646
    6 1.500456 0.830816
    2 -0.578432 -0.373790
    3 0.991754 0.961146
    5 -0.345533 0.826093
    9 0.934766 0.881435
    8 -1.276708 -1.388952
    0 -0.210499 -0.685924
    7 -0.652186 1.556807

    sort_values

    unsorted_df = pd.DataFrame({'col1':[2,1,1,1],'col2':[1,3,2,4]})
    unsorted_df
    
    col1 col2
    0 2 1
    1 1 3
    2 1 2
    3 1 4
    unsorted_df.sort_values(by="col1")
    
    col1 col2
    1 1 3
    2 1 2
    3 1 4
    0 2 1
    help(unsorted_df.sort_values)
    
    Help on method sort_values in module pandas.core.frame:
    
    sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last') method of pandas.core.frame.DataFrame instance
        Sort by the values along either axis
        
        Parameters
        ----------
        by : str or list of str
            Name or list of names to sort by.
        
            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels
        
            .. versionchanged:: 0.23.0
               Allow specifying index or column level names.
        axis : {0 or 'index', 1 or 'columns'}, default 0
             Axis to be sorted
        ascending : bool or list of bool, default True
             Sort ascending vs. descending. Specify list for multiple sort
             orders.  If this is a list of bools, must match the length of
             the by.
        inplace : bool, default False
             if True, perform operation in-place
        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
             Choice of sorting algorithm. See also ndarray.np.sort for more
             information.  `mergesort` is the only stable algorithm. For
             DataFrames, this option is only applied when sorting on a single
             column or label.
        na_position : {'first', 'last'}, default 'last'
             `first` puts NaNs at the beginning, `last` puts NaNs at the end
        
        Returns
        -------
        sorted_obj : DataFrame
        
        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'col1' : ['A', 'A', 'B', np.nan, 'D', 'C'],
        ...     'col2' : [2, 1, 9, 8, 7, 4],
        ...     'col3': [0, 1, 9, 4, 2, 3],
        ... })
        >>> df
            col1 col2 col3
        0   A    2    0
        1   A    1    1
        2   B    9    9
        3   NaN  8    4
        4   D    7    2
        5   C    4    3
        
        Sort by col1
        
        >>> df.sort_values(by=['col1'])
            col1 col2 col3
        0   A    2    0
        1   A    1    1
        2   B    9    9
        5   C    4    3
        4   D    7    2
        3   NaN  8    4
        
        Sort by multiple columns
        
        >>> df.sort_values(by=['col1', 'col2'])
            col1 col2 col3
        1   A    1    1
        0   A    2    0
        2   B    9    9
        5   C    4    3
        4   D    7    2
        3   NaN  8    4
        
        Sort Descending
        
        >>> df.sort_values(by='col1', ascending=False)
            col1 col2 col3
        4   D    7    2
        5   C    4    3
        2   B    9    9
        0   A    2    0
        1   A    1    1
        3   NaN  8    4
        
        Putting NAs first
        
        >>> df.sort_values(by='col1', ascending=False, na_position='first')
            col1 col2 col3
        3   NaN  8    4
        4   D    7    2
        5   C    4    3
        2   B    9    9
        0   A    2    0
        1   A    1    1
    
    df = pd.DataFrame(np.random.randn(8, 4),
    index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])
    df
    
    A B C D
    a -0.235229 0.214813 0.838116 1.081632
    b 0.530365 0.600021 0.753903 -1.958886
    c 1.760542 -1.027882 -0.053263 0.299710
    d -0.241942 0.455707 -0.684968 -0.513217
    e 0.866758 1.035051 -0.451651 -0.987964
    f 1.620520 0.236408 0.478373 -1.012238
    g -0.236978 0.352751 -0.514737 -0.195936
    h -0.046064 0.129530 -0.874676 1.740141
    df.loc[:, 'A']
    
    a    2.539530
    b   -0.278140
    c    1.291831
    d   -0.231592
    e   -2.047005
    f   -0.720743
    g   -0.995131
    h    0.190029
    Name: A, dtype: float64
    
    df.loc[:, ['A', 'C']]
    
    A C
    a 2.539530 -0.290170
    b -0.278140 1.575699
    c 1.291831 0.038547
    d -0.231592 0.117562
    e -2.047005 -0.569768
    f -0.720743 0.321223
    g -0.995131 1.530757
    h 0.190029 -0.068202
    df.loc['a':'h']
    
    A B C D
    a 2.539530 0.046380 -0.290170 -1.540302
    b -0.278140 1.420046 1.575699 0.533353
    c 1.291831 2.595299 0.038547 -0.488134
    d -0.231592 -0.162497 0.117562 1.452291
    e -2.047005 -0.046110 -0.569768 1.328672
    f -0.720743 0.339251 0.321223 -0.310041
    g -0.995131 0.831769 1.530757 0.975214
    h 0.190029 1.056606 -0.068202 -1.127776
    df.loc[df.loc[:, 'A'] > 0, 'A'] 
    
    b    0.530365
    c    1.760542
    e    0.866758
    f    1.620520
    Name: A, dtype: float64
    

    iloc()

    df.iloc[:4]
    
    A B C D
    a 2.539530 0.046380 -0.290170 -1.540302
    b -0.278140 1.420046 1.575699 0.533353
    c 1.291831 2.595299 0.038547 -0.488134
    d -0.231592 -0.162497 0.117562 1.452291
    df.iloc[1:5, 2:4]
    
    C D
    b 1.575699 0.533353
    c 0.038547 -0.488134
    d 0.117562 1.452291
    e -0.569768 1.328672
    import collections
    p = collections.defaultdict(int)
    p['A'] += 1
    p['B'] += 1
    p
    
    defaultdict(int, {'A': 1, 'B': 1})
    
    df = pd.DataFrame({'thing':['A', 'A', 'B', 'A', 'B', 'A', 'C', 'C', 'C']})
    for row in df.loc[df.loc[:, 'thing']== 'D'].iterrows():
        print(1)
    
    x = set([1, 2, 3, 1])
    x
    
    {1, 2, 3}
    
    df2 = df.copy()
    df2
    
    A D
    a -0.235229 1.081632
    b 0.530365 -1.958886
    c 1.760542 0.299710
    d -0.241942 -0.513217
    e 0.866758 -0.987964
    f 1.620520 -1.012238
    g -0.236978 -0.195936
    h -0.046064 1.740141
    df2.loc['a'][0] = 3
    
    df3 = df['A']
    df3
    
    a   -0.235229
    b    0.530365
    c    1.760542
    d   -0.241942
    e    0.866758
    f    1.620520
    g   -0.236978
    h   -0.046064
    Name: A, dtype: float64
    
    df3['a'] = 3
    df3
    
    a    3.000000
    b    0.530365
    c    1.760542
    d   -0.241942
    e    0.866758
    f    1.620520
    g   -0.236978
    h   -0.046064
    Name: A, dtype: float64
    
    df3 = df.iloc[:4]
    df3
    
    A D
    a 3.000000 1.081632
    b 0.530365 -1.958886
    c 1.760542 0.299710
    d -0.241942 -0.513217
    df3.pop('A')
    
    a    0.000000
    b    0.530365
    c    1.760542
    d   -0.241942
    Name: A, dtype: float64
    
    d = {1:1, 2:2}
    u = {3:3}
    d.update(u)
    d
    
    {1: 1, 2: 2, 3: 3}
    

    Working with Text Data

    s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234','SteveSmith'])
    s
    
    0             Tom
    1    William Rick
    2            John
    3         Alber@t
    4             NaN
    5            1234
    6      SteveSmith
    dtype: object
    

    s.str.lower() s.str.upper()

    list(s.str)
    
    [0      T
     1      W
     2      J
     3      A
     4    NaN
     5      1
     6      S
     dtype: object, 0      o
     1      i
     2      o
     3      l
     4    NaN
     5      2
     6      t
     dtype: object, 0      m
     1      l
     2      h
     3      b
     4    NaN
     5      3
     6      e
     dtype: object, 0    NaN
     1      l
     2      n
     3      e
     4    NaN
     5      4
     6      v
     dtype: object, 0    NaN
     1      i
     2    NaN
     3      r
     4    NaN
     5    NaN
     6      e
     dtype: object, 0    NaN
     1      a
     2    NaN
     3      @
     4    NaN
     5    NaN
     6      S
     dtype: object, 0    NaN
     1      m
     2    NaN
     3      t
     4    NaN
     5    NaN
     6      m
     dtype: object, 0    NaN
     1       
     2    NaN
     3    NaN
     4    NaN
     5    NaN
     6      i
     dtype: object, 0    NaN
     1      R
     2    NaN
     3    NaN
     4    NaN
     5    NaN
     6      t
     dtype: object, 0    NaN
     1      i
     2    NaN
     3    NaN
     4    NaN
     5    NaN
     6      h
     dtype: object, 0    NaN
     1      c
     2    NaN
     3    NaN
     4    NaN
     5    NaN
     6    NaN
     dtype: object, 0    NaN
     1      k
     2    NaN
     3    NaN
     4    NaN
     5    NaN
     6    NaN
     dtype: object]
    
    s.str.lower()  #所以这一步实际上就是把s.str的第一个部分全部改为小写?
    
    0             tom
    1    william rick
    2            john
    3         alber@t
    4             NaN
    5            1234
    6      stevesmith
    dtype: object
    
    s.str.upper()
    
    0             TOM
    1    WILLIAM RICK
    2            JOHN
    3         ALBER@T
    4             NaN
    5            1234
    6      STEVESMITH
    dtype: object
    

    s.str.len()

    s.str.len()  #实际上就是把每一个元素的长度弄出来
    
    0     3.0
    1    12.0
    2     4.0
    3     7.0
    4     NaN
    5     4.0
    6    10.0
    dtype: float64
    

    s.str.strip()

    s = pd.Series(['Tom            ', ' William Rick', 'John', 'Alber@t'])
    s
    
    0    Tom            
    1       William Rick
    2               John
    3            Alber@t
    dtype: object
    
    s.str.strip()
    
    0             Tom
    1    William Rick
    2            John
    3         Alber@t
    dtype: object
    
    s.str.strip('k')
    
    0    Tom            
    1        William Ric
    2               John
    3            Alber@t
    dtype: object
    

    s.str.split()

    s.str.split('o')
    
    0    [T, m            ]
    1       [ William Rick]
    2               [J, hn]
    3             [Alber@t]
    dtype: object
    
    s.str.split()
    
    0              [Tom]
    1    [William, Rick]
    2             [John]
    3          [Alber@t]
    dtype: object
    
    s.str.split(' ')
    
    0    [Tom, , , , , , , , , , , , ]
    1                [, William, Rick]
    2                           [John]
    3                        [Alber@t]
    dtype: object
    

    s.str.cat()

    s.str.cat()
    
    'Tom             William RickJohnAlber@t'
    
    s.str.cat(sep='A')
    
    'Tom            A William RickAJohnAAlber@t'
    
    s.str.cat(sep='_____')
    
    'Tom            _____ William Rick_____John_____Alber@t'
    
    help(s.str.cat) #所以如果不是series之间相连接,需要通过关键词sep来传入分隔符
    
    Help on method cat in module pandas.core.strings:
    
    cat(others=None, sep=None, na_rep=None, join=None) method of pandas.core.strings.StringMethods instance
        Concatenate strings in the Series/Index with given separator.
        
        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.
        
        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarrary or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.
        
            If others is a list-like that contains a combination of Series,
            np.ndarray (1-dim) or list-like, then all elements will be unpacked
            and must satisfy the above criteria individually.
        
            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : string or None, default None
            If None, concatenates without any separator.
        na_rep : string or None, default None
            Representation that is inserted for all missing values:
        
            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default None
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). If None,
            alignment is disabled, but this option will be removed in a future
            version of pandas and replaced with a default of `'left'`. To
            disable alignment, use `.values` on any Series/Index/DataFrame in
            `others`.
        
            .. versionadded:: 0.23.0
        
        Returns
        -------
        concat : str or Series/Index of objects
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.
        
        See Also
        --------
        split : Split each string in the Series/Index
        
        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:
        
        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'
        
        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:
        
        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'
        
        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.
        
        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object
        
        Missing values will remain missing in the result, but can again be
        represented using `na_rep`
        
        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object
        
        If `sep` is not specified, the values are concatenated without
        separation.
        
        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object
        
        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.
        
        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join=None, na_rep='-')
        0    ad
        1    ba
        2    -e
        3    dc
        dtype: object
        >>>
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object
        
        For more examples, see :ref:`here <text.concatenate>`.
    

    s.str.get_dummies()

    s.str.get_dummies()
    
    William Rick Alber@t John Tom
    0 0 0 0 1
    1 1 0 0 0
    2 0 0 1 0
    3 0 1 0 0
    s = pd.Series(['Tom ', ' William Rick', 'Alber@t', 'John'])
    s.str.get_dummies()
    
    William Rick Alber@t John Tom
    0 0 0 0 1
    1 1 0 0 0
    2 0 1 0 0
    3 0 0 1 0

    所以 get_dummies 是以 DataFrame 的形式对各个取值做 one-hot 编码:每一列对应一个出现过的值,标记每个元素是否等于该值,而与出现顺序无关.

    s.str.contains()

    s.str.contains(' ')
    
    0     True
    1     True
    2    False
    3    False
    dtype: bool
    
    s.str.contains('o')
    
    0     True
    1    False
    2    False
    3     True
    dtype: bool
    

    s.str.replace()

    s.str.replace('@', '$')
    
    0             Tom 
    1     William Rick
    2          Alber$t
    3             John
    dtype: object
    

    s.str.repeat()

    s = pd.Series(['Tom ', ' William Rick', 'Alber@t', 'John', np.nan])
    
    s.str.repeat(3)
    
    0                               Tom Tom Tom 
    1     William Rick William Rick William Rick
    2                      Alber@tAlber@tAlber@t
    3                               JohnJohnJohn
    dtype: object
    
    s.str.repeat([1, 2, 3, 4])
    
    0                          Tom 
    1     William Rick William Rick
    2         Alber@tAlber@tAlber@t
    3              JohnJohnJohnJohn
    dtype: object
    

    s.str.count(s2)

    统计s2在每个元素中出现的次数(不一定要相等,被包含也是可以的)

    s.str.count('m')
    
    0    1
    1    1
    2    0
    3    0
    dtype: int64
    

    s.str.startswith()

    s.str.startswith('To')
    
    0     True
    1    False
    2    False
    3    False
    4      NaN
    dtype: object
    
    s.str.startswith('To', na=False)
    
    0     True
    1    False
    2    False
    3    False
    4    False
    dtype: bool
    

    s.str.find()

    s.str.find('e')  # -1表示不存在, 其它的数字表示其位置
    
    0   -1.0
    1   -1.0
    2    3.0
    3   -1.0
    4    NaN
    dtype: float64
    

    s.str.findall()

    s.str.findall('e')  #?这样感觉还不如上面的好用啊
    
    0     []
    1     []
    2    [e]
    3     []
    4    NaN
    dtype: object
    

    s.str.islower()

    s.str.islower()
    
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object
    

    s.str.isupper()

    s.str.isupper()
    
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object
    

    s.str.isnumeric()

    s.str.isnumeric()
    
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object
    

    Option & Customization

    get_option()

    set_option()

    reset_option()

    describe_option()

    option_context()

    display.max_rows

    display.max_columns

    display.expand_frame_repr

    display.max_colwidth

    display.precision #精确度

    get_option()

    pd.get_option("display.max_rows")  #能够显示的最大行数
    
    60
    
    pd.get_option("display.max_columns") #能够显示的最大列数
    
    20
    

    set_option()

    pd.set_option("display.max_rows", 10)  #设置能够显示的最大行数为10
    pd.get_option("display.max_rows")   
    
    10
    
    s = pd.Series(np.arange(11))
    s
    
    0      0
    1      1
    2      2
    3      3
    4      4
          ..
    6      6
    7      7
    8      8
    9      9
    10    10
    Length: 11, dtype: int32
    
    pd.set_option("display.max_columns", 2) #设置能够显示最大列数为2
    pd.get_option("display.max_columns")
    
    2
    
    data = {1:np.arange(10), 2:np.arange(1, 11), 3:np.arange(2, 12)}
    df = pd.DataFrame(data)
    df
    
    1 ... 3
    0 0 ... 2
    1 1 ... 3
    2 2 ... 4
    3 3 ... 5
    4 4 ... 6
    5 5 ... 7
    6 6 ... 8
    7 7 ... 9
    8 8 ... 10
    9 9 ... 11

    10 rows × 3 columns

    reset_option()

    reset_option 接受一个参数,设置其属性为默认的属性

    pd.reset_option("display.max_rows")
    pd.reset_option("display.max_columns")
    
    s
    
    0      0
    1      1
    2      2
    3      3
    4      4
    5      5
    6      6
    7      7
    8      8
    9      9
    10    10
    dtype: int32
    

    describe_option()

    打印对参数的说明

    pd.describe_option("display.max_rows")
    
    display.max_rows : int
        If max_rows is exceeded, switch to truncate view. Depending on
        `large_repr`, objects are either centrally truncated or printed as
        a summary view. 'None' value means unlimited.
    
        In case python/IPython is running in a terminal and `large_repr`
        equals 'truncate' this can be set to 0 and pandas will auto-detect
        the height of the terminal and print a truncated object which fits
        the screen height. The IPython notebook, IPython qtconsole, or
        IDLE do not run in a terminal and hence it is not possible to do
        correct auto-detection.
        [default: 60] [currently: 60]
    

    option_context()

    with pd.option_context("display.max_rows", 10):
        print(pd.get_option("display.max_rows"))
    print(pd.get_option("display.max_rows"))
    
    10
    60
    

    所以其实这就是一个上下文,在上下文管理器中,参数会被暂时性地调整,离开控制之后,便会回到原先的状态

    统计函数

    percent_change pct_change

    比较每一个元素与其之前的元素的变化率

    s = pd.Series([1,2,3,4,5,4])
    s.pct_change()
    
    0         NaN
    1    1.000000
    2    0.500000
    3    0.333333
    4    0.250000
    5   -0.200000
    dtype: float64
    
    df = pd.DataFrame(np.random.randn(5, 2))
    df
    
    0 1
    0 0.855450 -0.673131
    1 0.610321 0.389186
    2 0.386450 -0.209481
    3 -0.159426 1.941561
    4 0.692407 0.332914
    df.pct_change()
    
    0 1
    0 NaN NaN
    1 -0.286549 -1.578173
    2 -0.366809 -1.538254
    3 -1.412541 -10.268438
    4 -5.343109 -0.828533
    help(df.pct_change)
    
    Help on method pct_change in module pandas.core.generic:
    
    pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) method of pandas.core.frame.DataFrame instance
        Percentage change between the current and a prior element.
        
        Computes the percentage change from the immediately previous row by
        default. This is useful in comparing the percentage of change in a time
        series of elements.
        
        Parameters
        ----------
        periods : int, default 1
            Periods to shift for forming percent change.
        fill_method : str, default 'pad'
            How to handle NAs before computing percent changes.
        limit : int, default None
            The number of consecutive NAs to fill before stopping.
        freq : DateOffset, timedelta, or offset alias string, optional
            Increment to use from time series API (e.g. 'M' or BDay()).
        **kwargs
            Additional keyword arguments are passed into
            `DataFrame.shift` or `Series.shift`.
        
        Returns
        -------
        chg : Series or DataFrame
            The same type as the calling object.
        
        See Also
        --------
        Series.diff : Compute the difference of two elements in a Series.
        DataFrame.diff : Compute the difference of two elements in a DataFrame.
        Series.shift : Shift the index by some number of periods.
        DataFrame.shift : Shift the index by some number of periods.
        
        Examples
        --------
        **Series**
        
        >>> s = pd.Series([90, 91, 85])
        >>> s
        0    90
        1    91
        2    85
        dtype: int64
        
        >>> s.pct_change()
        0         NaN
        1    0.011111
        2   -0.065934
        dtype: float64
        
        >>> s.pct_change(periods=2)
        0         NaN
        1         NaN
        2   -0.055556
        dtype: float64
        
        See the percentage change in a Series where filling NAs with last
        valid observation forward to next valid.
        
        >>> s = pd.Series([90, 91, None, 85])
        >>> s
        0    90.0
        1    91.0
        2     NaN
        3    85.0
        dtype: float64
        
        >>> s.pct_change(fill_method='ffill')
        0         NaN
        1    0.011111
        2    0.000000
        3   -0.065934
        dtype: float64
        
        **DataFrame**
        
        Percentage change in French franc, Deutsche Mark, and Italian lira from
        1980-01-01 to 1980-03-01.
        
        >>> df = pd.DataFrame({
        ...     'FR': [4.0405, 4.0963, 4.3149],
        ...     'GR': [1.7246, 1.7482, 1.8519],
        ...     'IT': [804.74, 810.01, 860.13]},
        ...     index=['1980-01-01', '1980-02-01', '1980-03-01'])
        >>> df
                        FR      GR      IT
        1980-01-01  4.0405  1.7246  804.74
        1980-02-01  4.0963  1.7482  810.01
        1980-03-01  4.3149  1.8519  860.13
        
        >>> df.pct_change()
                          FR        GR        IT
        1980-01-01       NaN       NaN       NaN
        1980-02-01  0.013810  0.013684  0.006549
        1980-03-01  0.053365  0.059318  0.061876
        
        Percentage of change in GOOG and APPL stock volume. Shows computing
        the percentage change between columns.
        
        >>> df = pd.DataFrame({
        ...     '2016': [1769950, 30586265],
        ...     '2015': [1500923, 40912316],
        ...     '2014': [1371819, 41403351]},
        ...     index=['GOOG', 'APPL'])
        >>> df
                  2016      2015      2014
        GOOG   1769950   1500923   1371819
        APPL  30586265  40912316  41403351
        
        >>> df.pct_change(axis='columns')
              2016      2015      2014
        GOOG   NaN -0.151997 -0.086016
        APPL   NaN  0.337604  0.012002
    
    s.pct_change(2)
    
    0         NaN
    1         NaN
    2    2.000000
    3    1.000000
    4    0.666667
    5    0.000000
    dtype: float64
    

    covariance

    计算协方差,会自动省略NA

    s1 = pd.Series(np.random.randn(10))
    s2 = pd.Series(np.random.randn(10))
    s1.cov(s2)
    
    0.05172017259428779
    

    当cov作用于DataFrame的时候,会计算列之间的协方差

    frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
    frame.cov()
    
    a b c d e
    a 1.737924 0.114498 0.279058 -0.325639 -0.224395
    b 0.114498 0.543846 0.421225 -0.103112 -0.373280
    c 0.279058 0.421225 1.326316 -0.635491 -0.573974
    d -0.325639 -0.103112 -0.635491 0.978824 0.764530
    e -0.224395 -0.373280 -0.573974 0.764530 0.856515
    frame['a'].cov(frame.loc[:, 'b'])
    
    0.11449787431144376
    

    correlation

    计算相关系数

    frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
    frame.corr()
    
    a b c d e
    a 1.000000 0.312138 0.173514 0.247459 -0.389265
    b 0.312138 1.000000 -0.410371 0.104755 -0.480116
    c 0.173514 -0.410371 1.000000 0.124817 -0.230226
    d 0.247459 0.104755 0.124817 1.000000 0.259208
    e -0.389265 -0.480116 -0.230226 0.259208 1.000000
    frame['a'].corr(frame.iloc[:, 1])
    
    0.3121378592734573
    

    Ranking

    排序

    s = pd.Series(np.random.randn(5), index=list('abcde'))
    s['d'] = s['b'] # so there's a tie
    s
    
    a   -2.179226
    b    0.614786
    c    1.801039
    d    0.614786
    e   -1.604162
    dtype: float64
    
    s.rank()
    
    a    1.0
    b    3.5
    c    5.0
    d    3.5
    e    2.0
    dtype: float64
    

    window function 一块一块来,蛮有用的

    .rolling()

    df = pd.DataFrame(np.random.randn(10, 4),
       index = pd.date_range('1/1/2000', periods=10),
       columns = ['A', 'B', 'C', 'D'])
    df
    
    A B C D
    2000-01-01 -0.602802 0.294063 -0.803316 -0.500838
    2000-01-02 -0.968835 -1.745470 0.027664 -1.012092
    2000-01-03 -0.047073 0.440166 -0.338257 1.551372
    2000-01-04 0.136861 0.357544 -0.370691 -0.312876
    2000-01-05 1.257872 -1.126768 -0.539122 -0.478309
    2000-01-06 -0.954518 -0.067380 0.139257 -0.908213
    2000-01-07 1.501658 -1.189674 0.794113 -0.155611
    2000-01-08 0.400153 0.291841 -0.450429 1.044665
    2000-01-09 -0.797415 0.346594 -0.107653 -0.605027
    2000-01-10 -0.532034 1.296260 0.303357 -0.056933
    df.rolling(window=3).mean()
    
    A B C D
    2000-01-01 NaN NaN NaN NaN
    2000-01-02 NaN NaN NaN NaN
    2000-01-03 -0.539570 -0.337081 -0.371303 0.012814
    2000-01-04 -0.293015 -0.315920 -0.227095 0.075468
    2000-01-05 0.449220 -0.109686 -0.416023 0.253396
    2000-01-06 0.146739 -0.278868 -0.256852 -0.566466
    2000-01-07 0.601671 -0.794607 0.131416 -0.514044
    2000-01-08 0.315764 -0.321738 0.160980 -0.006386
    2000-01-09 0.368132 -0.183746 0.078677 0.094676
    2000-01-10 -0.309765 0.644898 -0.084908 0.127569

    注意到, window=3, 所以每次的作用于是3行,后面跟的函数是mean, 所以第n行的结果是n, n-1, n-2三行的平均值,前俩行自然而然是NaN

    .expanding()

    df.expanding(min_periods=3).mean()
    
    A B C D
    2000-01-01 NaN NaN NaN NaN
    2000-01-02 NaN NaN NaN NaN
    2000-01-03 -0.539570 -0.337081 -0.371303 0.012814
    2000-01-04 -0.370462 -0.163425 -0.371150 -0.068609
    2000-01-05 -0.044795 -0.356093 -0.404744 -0.150549
    2000-01-06 -0.196416 -0.307974 -0.314078 -0.276826
    2000-01-07 0.046166 -0.433932 -0.155765 -0.259510
    2000-01-08 0.090415 -0.343210 -0.192598 -0.096488
    2000-01-09 -0.008233 -0.266565 -0.183159 -0.152992
    2000-01-10 -0.060613 -0.110283 -0.134508 -0.143386
    df['A'][:4].mean()
    
    -0.37046210746336367
    

    注意到 2000-01-03是一样的,后面的就不一样了,这是因为, min_periods=3限制了作用域最小为3行,所以n=4的时候,实际上是会把前面的4行取平均

    .ewm()

    指数加权滑动平均 (exponentially weighted moving average)
    实现方式

    df
    
    A B C D
    2000-01-01 -0.602802 0.294063 -0.803316 -0.500838
    2000-01-02 -0.968835 -1.745470 0.027664 -1.012092
    2000-01-03 -0.047073 0.440166 -0.338257 1.551372
    2000-01-04 0.136861 0.357544 -0.370691 -0.312876
    2000-01-05 1.257872 -1.126768 -0.539122 -0.478309
    2000-01-06 -0.954518 -0.067380 0.139257 -0.908213
    2000-01-07 1.501658 -1.189674 0.794113 -0.155611
    2000-01-08 0.400153 0.291841 -0.450429 1.044665
    2000-01-09 -0.797415 0.346594 -0.107653 -0.605027
    2000-01-10 -0.532034 1.296260 0.303357 -0.056933
    df.ewm(com=0.5, adjust=True).mean()
    
    A B C D
    2000-01-01 -0.602802 0.294063 -0.803316 -0.500838
    2000-01-02 -0.877327 -1.235587 -0.180081 -0.884278
    2000-01-03 -0.302535 -0.075451 -0.289588 0.801941
    2000-01-04 -0.005943 0.216821 -0.344332 0.049439
    2000-01-05 0.840082 -0.682606 -0.474729 -0.303847
    2000-01-06 -0.357961 -0.271892 -0.064843 -0.707311
    2000-01-07 0.882352 -0.884027 0.508056 -0.339343
    2000-01-08 0.560837 -0.099995 -0.131032 0.583470
    2000-01-09 -0.344710 0.197746 -0.115445 -0.208901
    2000-01-10 -0.469595 0.930101 0.163761 -0.107587
    a = 1 / (1+0.5)
    x1 = df.iloc[0, 0]
    x2 = df.iloc[1, 0]
    (x2 + (1-a) * x1) / (1 + 1 - a)
    
    -0.8773265770527067
    
    df.mean()
    
    A   -0.060613
    B   -0.110283
    C   -0.134508
    D   -0.143386
    dtype: float64
    

    Aggregations(聚合):更灵活的窗口计算

    df = pd.DataFrame(np.random.randn(10, 4),
       index = pd.date_range('1/1/2000', periods=10),
       columns = ['A', 'B', 'C', 'D'])
    df
    
    A B C D
    2000-01-01 0.371455 -0.191824 -0.146096 -1.347259
    2000-01-02 -1.571376 0.061927 0.149302 -0.507093
    2000-01-03 0.015607 1.637870 -0.642065 -0.228584
    2000-01-04 -0.236157 0.366852 -0.117198 1.373123
    2000-01-05 -0.390561 -0.670603 -2.022454 0.964826
    2000-01-06 -0.309272 1.234031 -0.383297 0.234326
    2000-01-07 -0.925264 0.417228 -0.432956 -1.331263
    2000-01-08 0.223505 0.160549 -0.247965 0.262888
    2000-01-09 -2.442173 0.757845 -0.704929 0.037361
    2000-01-10 -0.936853 -0.479592 -0.274561 0.146732
    r = df.rolling(window=3, min_periods=1)
    r
    
    Rolling [window=3,min_periods=1,center=False,axis=0]
    
    r.aggregate(np.sum)
    
    A B C D
    2000-01-01 0.371455 -0.191824 -0.146096 -1.347259
    2000-01-02 -1.199921 -0.129897 0.003207 -1.854353
    2000-01-03 -1.184314 1.507972 -0.638858 -2.082937
    2000-01-04 -1.791925 2.066648 -0.609961 0.637446
    2000-01-05 -0.611110 1.334119 -2.781717 2.109366
    2000-01-06 -0.935989 0.930280 -2.522949 2.572276
    2000-01-07 -1.625096 0.980656 -2.838707 -0.132111
    2000-01-08 -1.011030 1.811808 -1.064218 -0.834049
    2000-01-09 -3.143932 1.335622 -1.385850 -1.031014
    2000-01-10 -3.155521 0.438802 -1.227455 0.446981
    r['A'].aggregate(np.sum)
    
    2000-01-01    0.371455
    2000-01-02   -1.199921
    2000-01-03   -1.184314
    2000-01-04   -1.791925
    2000-01-05   -0.611110
    2000-01-06   -0.935989
    2000-01-07   -1.625096
    2000-01-08   -1.011030
    2000-01-09   -3.143932
    2000-01-10   -3.155521
    Freq: D, Name: A, dtype: float64
    
    r[['A', 'B']].aggregate(np.sum)
    
    A B
    2000-01-01 0.371455 -0.191824
    2000-01-02 -1.199921 -0.129897
    2000-01-03 -1.184314 1.507972
    2000-01-04 -1.791925 2.066648
    2000-01-05 -0.611110 1.334119
    2000-01-06 -0.935989 0.930280
    2000-01-07 -1.625096 0.980656
    2000-01-08 -1.011030 1.811808
    2000-01-09 -3.143932 1.335622
    2000-01-10 -3.155521 0.438802
    r['A'].aggregate([np.sum, np.mean])
    
    sum mean
    2000-01-01 0.371455 0.371455
    2000-01-02 -1.199921 -0.599960
    2000-01-03 -1.184314 -0.394771
    2000-01-04 -1.791925 -0.597308
    2000-01-05 -0.611110 -0.203703
    2000-01-06 -0.935989 -0.311996
    2000-01-07 -1.625096 -0.541699
    2000-01-08 -1.011030 -0.337010
    2000-01-09 -3.143932 -1.047977
    2000-01-10 -3.155521 -1.051840
    r.aggregate({'A':np.sum, 'B':np.mean})
    
    A B
    2000-01-01 0.371455 -0.191824
    2000-01-02 -1.199921 -0.064949
    2000-01-03 -1.184314 0.502657
    2000-01-04 -1.791925 0.688883
    2000-01-05 -0.611110 0.444706
    2000-01-06 -0.935989 0.310093
    2000-01-07 -1.625096 0.326885
    2000-01-08 -1.011030 0.603936
    2000-01-09 -3.143932 0.445207
    2000-01-10 -3.155521 0.146267

    Missing Data

    df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
    'h'],columns=['one', 'two', 'three'])
    
    df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
    df
    
    one two three
    a 0.560200 0.244413 -1.814612
    b NaN NaN NaN
    c 0.210847 0.014889 -0.711094
    d NaN NaN NaN
    e -0.340756 1.657751 0.419182
    f -0.699982 0.258028 1.324182
    g NaN NaN NaN
    h -1.271993 1.477846 0.488302

    NaN: Not a Number

    isnull() notnull()

    df.isnull()
    
    one two three
    a False False False
    b True True True
    c False False False
    d True True True
    e False False False
    f False False False
    g True True True
    h False False False
    df.notnull()
    
    one two three
    a True True True
    b False False False
    c True True True
    d False False False
    e True True True
    f True True True
    g False False False
    h True True True

    关于缺失值的计算

    当数据求和的时候,缺失值视为0;注意:在较新版本的 pandas(0.22 及以后)中,即使数据全为 NA,求和结果也是 0(旧版本返回 NA,此行为已被修改)

    df['one'].sum()
    
    -1.541684617991043
    
    df = pd.DataFrame(index=[0,1,2,3,4,5],columns=['one','two'])
    df
    
    one two
    0 NaN NaN
    1 NaN NaN
    2 NaN NaN
    3 NaN NaN
    4 NaN NaN
    5 NaN NaN
    df['one'].sum()
    
    0
    

    清理,替换缺失值

    df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one',
    'two', 'three'])
    
    df = df.reindex(['a', 'b', 'c'])
    df
    
    one two three
    a 0.326460 1.529225 1.230027
    b NaN NaN NaN
    c 1.296313 0.032379 2.182915
    df.fillna(0)
    
    one two three
    a 0.326460 1.529225 1.230027
    b 0.000000 0.000000 0.000000
    c 1.296313 0.032379 2.182915
    df  #看来上面是返回一个副本
    
    one two three
    a 0.326460 1.529225 1.230027
    b NaN NaN NaN
    c 1.296313 0.032379 2.182915
    df.fillna('haha')
    
    one two three
    a 0.32646 1.52922 1.23003
    b haha haha haha
    c 1.29631 0.0323788 2.18292

    前向后向替换

    df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
    'h'],columns=['one', 'two', 'three'])
    
    df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
    df
    
    one two three
    a 2.218769 -0.742408 -1.068846
    b NaN NaN NaN
    c 0.467651 0.372357 1.387020
    d NaN NaN NaN
    e -0.868840 -0.648827 -2.261319
    f -0.755799 0.159130 -0.129401
    g NaN NaN NaN
    h 0.703744 -1.665470 0.166229
    df.fillna(method='pad')  # 或者 method='ffill'
    
    one two three
    a 2.218769 -0.742408 -1.068846
    b 2.218769 -0.742408 -1.068846
    c 0.467651 0.372357 1.387020
    d 0.467651 0.372357 1.387020
    e -0.868840 -0.648827 -2.261319
    f -0.755799 0.159130 -0.129401
    g -0.755799 0.159130 -0.129401
    h 0.703744 -1.665470 0.166229

    可以看到,会用前面的元素来替换

    df.fillna(method="backfill")  #或者  method='bfill'
    
    one two three
    a 2.218769 -0.742408 -1.068846
    b 0.467651 0.372357 1.387020
    c 0.467651 0.372357 1.387020
    d -0.868840 -0.648827 -2.261319
    e -0.868840 -0.648827 -2.261319
    f -0.755799 0.159130 -0.129401
    g 0.703744 -1.665470 0.166229
    h 0.703744 -1.665470 0.166229
    help(df.fillna)
    
    Help on method fillna in module pandas.core.frame:
    
    fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs) method of pandas.core.frame.DataFrame instance
        Fill NA/NaN values using the specified method
        
        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). (values not
            in the dict/Series/DataFrame will not be filled). This value cannot
            be a list.
        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
            Method to use for filling holes in reindexed Series
            pad / ffill: propagate last valid observation forward to next valid
            backfill / bfill: use NEXT valid observation to fill gap
        axis : {0 or 'index', 1 or 'columns'}
        inplace : boolean, default False
            If True, fill in place. Note: this will modify any
            other views on this object, (e.g. a no-copy slice for a column in a
            DataFrame).
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            a dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible)
        
        See Also
        --------
        interpolate : Fill NaN values using interpolation.
        reindex, asfreq
        
        Returns
        -------
        filled : DataFrame
        
        Examples
        --------
        >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
        ...                    [3, 4, np.nan, 1],
        ...                    [np.nan, np.nan, np.nan, 5],
        ...                    [np.nan, 3, np.nan, 4]],
        ...                    columns=list('ABCD'))
        >>> df
             A    B   C  D
        0  NaN  2.0 NaN  0
        1  3.0  4.0 NaN  1
        2  NaN  NaN NaN  5
        3  NaN  3.0 NaN  4
        
        Replace all NaN elements with 0s.
        
        >>> df.fillna(0)
            A   B   C   D
        0   0.0 2.0 0.0 0
        1   3.0 4.0 0.0 1
        2   0.0 0.0 0.0 5
        3   0.0 3.0 0.0 4
        
        We can also propagate non-null values forward or backward.
        
        >>> df.fillna(method='ffill')
            A   B   C   D
        0   NaN 2.0 NaN 0
        1   3.0 4.0 NaN 1
        2   3.0 4.0 NaN 5
        3   3.0 3.0 NaN 4
        
        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
        2, and 3 respectively.
        
        >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        >>> df.fillna(value=values)
            A   B   C   D
        0   0.0 2.0 2.0 0
        1   3.0 4.0 2.0 1
        2   0.0 1.0 2.0 5
        3   0.0 3.0 2.0 4
        
        Only replace the first NaN element.
        
        >>> df.fillna(value=values, limit=1)
            A   B   C   D
        0   0.0 2.0 2.0 0
        1   3.0 4.0 NaN 1
        2   NaN 1.0 NaN 5
        3   NaN 3.0 NaN 4
    

    丢弃缺失数据

    我们可以利用dropna来舍弃缺失数据所在的轴(默认为行)

    df.dropna()
    
    one two three
    a 2.218769 -0.742408 -1.068846
    c 0.467651 0.372357 1.387020
    e -0.868840 -0.648827 -2.261319
    f -0.755799 0.159130 -0.129401
    h 0.703744 -1.665470 0.166229
    df = pd.DataFrame({'one':np.arange(10), 'two':np.arange(10), 'three':np.arange(10)})
    df.iloc[1, 2] = np.nan
    df
    
    one two three
    0 0 0 0.0
    1 1 1 NaN
    2 2 2 2.0
    3 3 3 3.0
    4 4 4 4.0
    5 5 5 5.0
    6 6 6 6.0
    7 7 7 7.0
    8 8 8 8.0
    9 9 9 9.0
    df.dropna(axis=1)
    
    one two
    0 0 0
    1 1 1
    2 2 2
    3 3 3
    4 4 4
    5 5 5
    6 6 6
    7 7 7
    8 8 8
    9 9 9

    替换数据 .replace()

    df = pd.DataFrame({'one':[10,20,30,40,50,2000], 'two':[1000,0,30,40,50,60]})
    df
    
    one two
    0 10 1000
    1 20 0
    2 30 30
    3 40 40
    4 50 50
    5 2000 60
    df.replace({1000:10, 2000:66})
    
    one two
    0 10 10
    1 20 0
    2 30 30
    3 40 40
    4 50 50
    5 66 60
    help(df.replace)
    
    Help on method replace in module pandas.core.frame:
    
    replace(to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad') method of pandas.core.frame.DataFrame instance
        Replace values given in `to_replace` with `value`.
        
        Values of the DataFrame are replaced with other values dynamically.
        This differs from updating with ``.loc`` or ``.iloc``, which require
        you to specify a location to update with some value.
        
        Parameters
        ----------
        to_replace : str, regex, list, dict, Series, int, float, or None
            How to find the values that will be replaced.
        
            * numeric, str or regex:
        
                - numeric: numeric values equal to `to_replace` will be
                  replaced with `value`
                - str: string exactly matching `to_replace` will be replaced
                  with `value`
                - regex: regexs matching `to_replace` will be replaced with
                  `value`
        
            * list of str, regex, or numeric:
        
                - First, if `to_replace` and `value` are both lists, they
                  **must** be the same length.
                - Second, if ``regex=True`` then all of the strings in **both**
                  lists will be interpreted as regexs otherwise they will match
                  directly. This doesn't matter much for `value` since there
                  are only a few possible substitution regexes you can use.
                - str, regex and numeric rules apply as above.
        
            * dict:
        
                - Dicts can be used to specify different replacement values
                  for different existing values. For example,
                  ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
                  'y' with 'z'. To use a dict in this way the `value`
                  parameter should be `None`.
                - For a DataFrame a dict can specify that different values
                  should be replaced in different columns. For example,
                  ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
                  and the value 'z' in column 'b' and replaces these values
                  with whatever is specified in `value`. The `value` parameter
                  should not be ``None`` in this case. You can treat this as a
                  special case of passing two lists except that you are
                  specifying the column to search in.
                - For a DataFrame nested dictionaries, e.g.,
                  ``{'a': {'b': np.nan}}``, are read as follows: look in column
                  'a' for the value 'b' and replace it with NaN. The `value`
                  parameter should be ``None`` to use a nested dict in this
                  way. You can nest regular expressions as well. Note that
                  column names (the top-level dictionary keys in a nested
                  dictionary) **cannot** be regular expressions.
        
            * None:
        
                - This means that the `regex` argument must be a string,
                  compiled regular expression, or list, dict, ndarray or
                  Series of such elements. If `value` is also ``None`` then
                  this **must** be a nested dictionary or Series.
        
            See the examples section for examples of each of these.
        value : scalar, dict, list, str, regex, default None
            Value to replace any values matching `to_replace` with.
            For a DataFrame a dict of values can be used to specify which
            value to use for each column (columns not in the dict will not be
            filled). Regular expressions, strings and lists or dicts of such
            objects are also allowed.
        inplace : boolean, default False
            If True, in place. Note: this will modify any
            other views on this object (e.g. a column from a DataFrame).
            Returns the caller if this is True.
        limit : int, default None
            Maximum size gap to forward or backward fill.
        regex : bool or same types as `to_replace`, default False
            Whether to interpret `to_replace` and/or `value` as regular
            expressions. If this is ``True`` then `to_replace` *must* be a
            string. Alternatively, this could be a regular expression or a
            list, dict, or array of regular expressions in which case
            `to_replace` must be ``None``.
        method : {'pad', 'ffill', 'bfill', `None`}
            The method to use when for replacement, when `to_replace` is a
            scalar, list or tuple and `value` is ``None``.
        
            .. versionchanged:: 0.23.0
                Added to DataFrame.
        
        See Also
        --------
        DataFrame.fillna : Fill NA values
        DataFrame.where : Replace values based on boolean condition
        Series.str.replace : Simple string replacement.
        
        Returns
        -------
        DataFrame
            Object after replacement.
        
        Raises
        ------
        AssertionError
            * If `regex` is not a ``bool`` and `to_replace` is not
              ``None``.
        TypeError
            * If `to_replace` is a ``dict`` and `value` is not a ``list``,
              ``dict``, ``ndarray``, or ``Series``
            * If `to_replace` is ``None`` and `regex` is not compilable
              into a regular expression or is a list, dict, ndarray, or
              Series.
            * When replacing multiple ``bool`` or ``datetime64`` objects and
              the arguments to `to_replace` does not match the type of the
              value being replaced
        ValueError
            * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
              `value` but they are not the same length.
        
        Notes
        -----
        * Regex substitution is performed under the hood with ``re.sub``. The
          rules for substitution for ``re.sub`` are the same.
        * Regular expressions will only substitute on strings, meaning you
          cannot provide, for example, a regular expression matching floating
          point numbers and expect the columns in your frame that have a
          numeric dtype to be matched. However, if those floating point
          numbers *are* strings, then you can do this.
        * This method has *a lot* of options. You are encouraged to experiment
          and play with this method to gain intuition about how it works.
        * When dict is used as the `to_replace` value, it is like
          key(s) in the dict are the to_replace part and
          value(s) in the dict are the value parameter.
        
        Examples
        --------
        
        **Scalar `to_replace` and `value`**
        
        >>> s = pd.Series([0, 1, 2, 3, 4])
        >>> s.replace(0, 5)
        0    5
        1    1
        2    2
        3    3
        4    4
        dtype: int64
        
        >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
        ...                    'B': [5, 6, 7, 8, 9],
        ...                    'C': ['a', 'b', 'c', 'd', 'e']})
        >>> df.replace(0, 5)
           A  B  C
        0  5  5  a
        1  1  6  b
        2  2  7  c
        3  3  8  d
        4  4  9  e
        
        **List-like `to_replace`**
        
        >>> df.replace([0, 1, 2, 3], 4)
           A  B  C
        0  4  5  a
        1  4  6  b
        2  4  7  c
        3  4  8  d
        4  4  9  e
        
        >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
           A  B  C
        0  4  5  a
        1  3  6  b
        2  2  7  c
        3  1  8  d
        4  4  9  e
        
        >>> s.replace([1, 2], method='bfill')
        0    0
        1    3
        2    3
        3    3
        4    4
        dtype: int64
        
        **dict-like `to_replace`**
        
        >>> df.replace({0: 10, 1: 100})
             A  B  C
        0   10  5  a
        1  100  6  b
        2    2  7  c
        3    3  8  d
        4    4  9  e
        
        >>> df.replace({'A': 0, 'B': 5}, 100)
             A    B  C
        0  100  100  a
        1    1    6  b
        2    2    7  c
        3    3    8  d
        4    4    9  e
        
        >>> df.replace({'A': {0: 100, 4: 400}})
             A  B  C
        0  100  5  a
        1    1  6  b
        2    2  7  c
        3    3  8  d
        4  400  9  e
        
        **Regular expression `to_replace`**
        
        >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
        ...                    'B': ['abc', 'bar', 'xyz']})
        >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
              A    B
        0   new  abc
        1   foo  new
        2  bait  xyz
        
        >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
              A    B
        0   new  abc
        1   foo  bar
        2  bait  xyz
        
        >>> df.replace(regex=r'^ba.$', value='new')
              A    B
        0   new  abc
        1   foo  new
        2  bait  xyz
        
        >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'})
              A    B
        0   new  abc
        1   xyz  new
        2  bait  xyz
        
        >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
              A    B
        0   new  abc
        1   new  new
        2  bait  xyz
        
        Note that when replacing multiple ``bool`` or ``datetime64`` objects,
        the data types in the `to_replace` parameter must match the data
        type of the value being replaced:
        
        >>> df = pd.DataFrame({'A': [True, False, True],
        ...                    'B': [False, True, False]})
        >>> df.replace({'a string': 'new value', True: False})  # raises
        Traceback (most recent call last):
            ...
        TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
        
        This raises a ``TypeError`` because one of the ``dict`` keys is not of
        the correct type for replacement.
        
        Compare the behavior of ``s.replace({'a': None})`` and
        ``s.replace('a', None)`` to understand the pecularities
        of the `to_replace` parameter:
        
        >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])
        
        When one uses a dict as the `to_replace` value, it is like the
        value(s) in the dict are equal to the `value` parameter.
        ``s.replace({'a': None})`` is equivalent to
        ``s.replace(to_replace={'a': None}, value=None, method=None)``:
        
        >>> s.replace({'a': None})
        0      10
        1    None
        2    None
        3       b
        4    None
        dtype: object
        
        When ``value=None`` and `to_replace` is a scalar, list or
        tuple, `replace` uses the method parameter (default 'pad') to do the
        replacement. So this is why the 'a' values are being replaced by 10
        in rows 1 and 2 and 'b' in row 4 in this case.
        The command ``s.replace('a', None)`` is actually equivalent to
        ``s.replace(to_replace='a', value=None, method='pad')``:
        
        >>> s.replace('a', None)
        0    10
        1    10
        2    10
        3     b
        4     b
        dtype: object
    
    df = pd.DataFrame({'one':np.arange(10), 'two':np.arange(2, 12)})
    df
    
    one two
    0 0 2
    1 1 3
    2 2 4
    3 3 5
    4 4 6
    5 5 7
    6 6 8
    7 7 9
    8 8 10
    9 9 11
    df.replace(r'4', 'haha', regex=True)
    
    one two
    0 0 2
    1 1 3
    2 2 4
    3 3 5
    4 4 6
    5 5 7
    6 6 8
    7 7 9
    8 8 10
    9 9 11
    s = pd.Series(['1', '2', '3'])
    s
    
    0    1
    1    2
    2    3
    dtype: object
    
    s.replace(r'3', 4, regex=True)
    
    0    1
    1    2
    2    4
    dtype: object
    
    df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                 'B': ['abc', 'bar', 'xyz']})
    df
    
    A B
    0 bat abc
    1 foo bar
    2 bait xyz
    df.replace(r'ba', 'new', regex=True)
    
    A B
    0 newt abc
    1 foo newr
    2 newit xyz

    看来只有str才能用regex

  • 相关阅读:
    docker相关
    多线程
    设计模式
    ftp下载乱码问题
    Windows无法启动SQL server 代理服务(服务器)错误1067:进程意外终止
    Struts2 if标签
    Java项目编译时经常会出现不编译,或者报一些假性错误
    ajaxSubmit 上传文件 提示下载json处理
    MySQL中优化sql语句查询常用的30种方法
    mybatis 中的where标签
  • 原文地址:https://www.cnblogs.com/MTandHJ/p/11632540.html
Copyright © 2011-2022 走看看