  • pandas learning notes

    # Construct a Series
    >>> import numpy as np
    >>> import pandas as pd
    >>> s = pd.Series([1,3,6,np.nan,44,1])
    >>> s
    0 1.0
    1 3.0
    2 6.0
    3 NaN
    4 44.0
    5 1.0
    dtype: float64

    # Create a date index
    >>> dates = pd.date_range('20160101',periods=6)
    >>> dates
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
    '2016-01-05', '2016-01-06'],
    dtype='datetime64[ns]', freq='D')
    >>> df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
    >>> df
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df.index
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
    '2016-01-05', '2016-01-06'],
    dtype='datetime64[ns]', freq='D')
    >>> df.columns
    Index([u'a', u'b', u'c', u'd'], dtype='object')
    >>> df.values
    array([[ 0.60799315, -0.93716631, -0.11229586, -0.07298433],
    [-0.04063099, 0.53524821, 0.87249982, 0.04360029],
    [-2.34581586, 0.43345385, 0.8452626 , -1.18162629],
    [-0.02130354, 0.11952033, -0.827935 , 0.34708601],
    [-1.62655855, 0.27206008, 0.85482623, -0.97291868],
    [-1.21306154, -0.98024449, -1.01122044, -0.11965997]])
    >>> df.describe()
    a b c d
    count 6.000000 6.000000 6.000000 6.000000
    mean -0.773230 -0.092855 0.103523 -0.326084
    std 1.131770 0.685580 0.878954 0.607757
    min -2.345816 -0.980244 -1.011220 -1.181626
    25% -1.523184 -0.672995 -0.649025 -0.759604
    50% -0.626846 0.195790 0.366483 -0.096322
    75% -0.026135 0.393105 0.852435 0.014454
    max 0.607993 0.535248 0.872500 0.347086
    >>> df.T
    2016-01-01 2016-01-02 2016-01-03 2016-01-04 2016-01-05 2016-01-06
    a 0.607993 -0.040631 -2.345816 -0.021304 -1.626559 -1.213062
    b -0.937166 0.535248 0.433454 0.119520 0.272060 -0.980244
    c -0.112296 0.872500 0.845263 -0.827935 0.854826 -1.011220
    d -0.072984 0.043600 -1.181626 0.347086 -0.972919 -0.119660

    # Sort the columns by column label (axis=1)
    >>> df.sort_index(axis=1,ascending=False)
    d c b a
    2016-01-01 -0.072984 -0.112296 -0.937166 0.607993
    2016-01-02 0.043600 0.872500 0.535248 -0.040631
    2016-01-03 -1.181626 0.845263 0.433454 -2.345816
    2016-01-04 0.347086 -0.827935 0.119520 -0.021304
    2016-01-05 -0.972919 0.854826 0.272060 -1.626559
    2016-01-06 -0.119660 -1.011220 -0.980244 -1.213062

    # Sort the rows by index value
    >>> df.sort_index(axis=0,ascending=False)
    a b c d
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984

    # Sort the rows by the values in column 'a'
    >>> df.sort_values(by='a')
    a b c d
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
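
    # Sketch (not from the original session): sort_values also accepts several columns
    # and a per-column sort direction.
    df.sort_values(by=['a','b'], ascending=[False, True])  # 'a' descending, ties broken by 'b' ascending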

    # Selecting data in pandas
    >>> df.a # or df['a']
    2016-01-01 0.607993
    2016-01-02 -0.040631
    2016-01-03 -2.345816
    2016-01-04 -0.021304
    2016-01-05 -1.626559
    2016-01-06 -1.213062
    Freq: D, Name: a, dtype: float64
    >>> df[0:3]
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626

    # Select by label: loc selects rows by index value
    >>> df.loc['2016-01-01']
    a 0.607993
    b -0.937166
    c -0.112296
    d -0.072984
    Name: 2016-01-01 00:00:00, dtype: float64

    # Select rows with a slice
    >>> df[:1]
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984

    # Select rows and columns by index value and column name
    >>> df.loc['2016-01-01',['a','b']]
    a 0.607993
    b -0.937166
    Name: 2016-01-01 00:00:00, dtype: float64
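
    # Sketch (not from the original session): loc also takes label slices on both axes,
    # and label slices include the end label.
    df.loc['2016-01-02':'2016-01-04', 'a':'c']  # rows 01-02 through 01-04, columns 'a' through 'c'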


    # Select by position: iloc
    # Select the row at position 3 (the fourth row)
    >>> df.iloc[3]
    a -0.021304
    b 0.119520
    c -0.827935
    d 0.347086
    Name: 2016-01-04 00:00:00, dtype: float64

    # Select the value at row position 3, column position 1
    >>> df.iloc[3,1]
    0.11952032779945752

    # Select the rows at positions 1, 3 and 4, and the columns at positions 1 to 2 (slice 1:3)
    >>> df.iloc[[1,3,4],1:3]
    b c
    2016-01-02 0.535248 0.872500
    2016-01-04 0.119520 -0.827935
    2016-01-05 0.272060 0.854826

    # Select data by positional slice plus column labels
    # Select the first 3 rows and the values of columns 'a' and 'c'
    # Note: .ix was deprecated in pandas 0.20 and removed in 1.0; see the .iloc/.loc sketch below
    >>> df.ix[:3,['a','c']]
    a c
    2016-01-01 0.607993 -0.112296
    2016-01-02 -0.040631 0.872500
    2016-01-03 -2.345816 0.845263
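
    # Sketch (not from the original session): .ix no longer exists in current pandas;
    # the same selection with the current indexers.
    df.iloc[:3].loc[:, ['a','c']]      # first 3 rows by position, then columns by label
    df.loc[df.index[:3], ['a','c']]    # or convert the positions to labels and stay with .loc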

    # Select rows where column 'a' is less than 0
    >>> df[df.a<0]
    a b c d
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
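
    # Sketch (not from the original session): boolean conditions can be combined with
    # & and |; each condition needs its own parentheses.
    df[(df.a < 0) & (df.b > 0)]  # rows where 'a' is negative and 'b' is positive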

    # Setting values in pandas
    >>> df.iloc[2,2]=111
    >>> df
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 111.000000 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df.loc['2016-01-01','b'] = 22222
    >>> df
    a b c d
    2016-01-01 0.607993 22222.000000 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 111.000000 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df[df.a<0] = 0
    >>> df
    a b c d
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984
    2016-01-02 0.000000 0.0 0.000000 0.000000
    2016-01-03 0.000000 0.0 0.000000 0.000000
    2016-01-04 0.000000 0.0 0.000000 0.000000
    2016-01-05 0.000000 0.0 0.000000 0.000000
    2016-01-06 0.000000 0.0 0.000000 0.000000
    # Chained assignment like this triggers SettingWithCopyWarning in newer pandas; see the .loc sketch below
    >>> df.c[df.c==0]=111
    >>> df
    a b c d
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984
    2016-01-02 0.000000 0.0 111.000000 0.000000
    2016-01-03 0.000000 0.0 111.000000 0.000000
    2016-01-04 0.000000 0.0 111.000000 0.000000
    2016-01-05 0.000000 0.0 111.000000 0.000000
    2016-01-06 0.000000 0.0 111.000000 0.000000
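
    # Sketch (not from the original session): the chained assignment above may not update df
    # in newer pandas; this is the single-step .loc form.
    df.loc[df.c == 0, 'c'] = 111  # select rows and column in one call, then assign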


    >>> df['e']=np.nan
    >>> df
    a b c d e
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 NaN
    2016-01-02 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-03 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-04 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-05 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-06 0.000000 0.0 111.000000 0.000000 NaN


    >>> df['f'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('2016-01-01',periods=6))
    >>> df
    a b c d e f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 NaN 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 NaN 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 NaN 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 NaN 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 NaN 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 NaN 6

    # Handling missing data
    # axis=1 drops any column containing NaN; axis=0 drops any row containing NaN
    >>> df.dropna(axis=1,how='any') #how={'any','all'}
    a b c d f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 6
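
    # Sketch (not from the original session): dropna can also require that every value be NaN,
    # or that a minimum number of values be present.
    df.dropna(axis=1, how='all')   # drop a column only if all of its values are NaN (column 'e' here)
    df.dropna(axis=0, thresh=5)    # keep only rows with at least 5 non-NaN values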

    # Fill NaN values with 0
    >>> df.fillna(value=0)
    a b c d e f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 0.0 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 0.0 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 0.0 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 0.0 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 0.0 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 0.0 6
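
    # Sketch (not from the original session): fillna also takes a per-column mapping;
    # columns not listed are left untouched.
    df.fillna(value={'e': 0, 'f': -1})  # fill NaN in 'e' with 0 and in 'f' with -1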

    # Check which values are missing
    >>> df.isnull()
    a b c d e f
    2016-01-01 False False False False True False
    2016-01-02 False False False False True False
    2016-01-03 False False False False True False
    2016-01-04 False False False False True False
    2016-01-05 False False False False True False
    2016-01-06 False False False False True False


    # Check whether the whole table contains any missing data
    >>> np.any(df.isnull())==True
    True
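
    # Sketch (not from the original session): per-column missing-value counts and an
    # equivalent whole-frame check.
    df.isnull().sum()           # number of NaN values in each column
    df.isnull().values.any()    # True if any value in the frame is NaN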


    # pandas data import and export
    >>> pd.read_csv('/Users/lijie/Downloads/student.csv')
    Student ID name age gender
    0 1100 Kelly 22 Female
    1 1101 Clo 21 Female
    2 1102 Tilly 22 Female
    3 1103 Tony 24 Male
    4 1104 David 20 Male
    5 1105 Catty 22 Female
    6 1106 M 3 Female
    7 1107 N 43 Male
    8 1108 A 13 Male
    9 1109 S 12 Male
    10 1110 David 33 Male
    11 1111 Dw 3 Female
    12 1112 Q 23 Male
    13 1113 W 21 Female
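
    # Sketch (not from the original session): the matching export calls; the output paths
    # below are hypothetical.
    data = pd.read_csv('/Users/lijie/Downloads/student.csv')
    data.to_csv('/Users/lijie/Downloads/student_copy.csv', index=False)  # write CSV without the index column
    data.to_pickle('/Users/lijie/Downloads/student.pickle')             # pickle preserves dtypes exactly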


    # Combining data in pandas
    # concat
    >>> df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
    >>> df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
    >>> df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
    >>> df1
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    >>> df2
    a b c d
    0 1.0 1.0 1.0 1.0
    1 1.0 1.0 1.0 1.0
    2 1.0 1.0 1.0 1.0
    >>> df3
    a b c d
    0 2.0 2.0 2.0 2.0
    1 2.0 2.0 2.0 2.0
    2 2.0 2.0 2.0 2.0
    >>> res = pd.concat([df1,df2,df3],axis=0)
    >>> res
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    0 1.0 1.0 1.0 1.0
    1 1.0 1.0 1.0 1.0
    2 1.0 1.0 1.0 1.0
    0 2.0 2.0 2.0 2.0
    1 2.0 2.0 2.0 2.0
    2 2.0 2.0 2.0 2.0
    >>> res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
    >>> res
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    3 1.0 1.0 1.0 1.0
    4 1.0 1.0 1.0 1.0
    5 1.0 1.0 1.0 1.0
    6 2.0 2.0 2.0 2.0
    7 2.0 2.0 2.0 2.0
    8 2.0 2.0 2.0 2.0


    >>> df4=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
    >>> df5=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
    >>> df4
    a b c d
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    3 0.0 0.0 0.0 0.0
    >>> df5
    b c d e
    2 1.0 1.0 1.0 1.0
    3 1.0 1.0 1.0 1.0
    4 1.0 1.0 1.0 1.0
    >>> res = pd.concat([df4,df5])
    >>> res
    a b c d e
    1 0.0 0.0 0.0 0.0 NaN
    2 0.0 0.0 0.0 0.0 NaN
    3 0.0 0.0 0.0 0.0 NaN
    2 NaN 1.0 1.0 1.0 1.0
    3 NaN 1.0 1.0 1.0 1.0
    4 NaN 1.0 1.0 1.0 1.0
    >>> res = pd.concat([df4,df5],join='inner')
    >>> res
    b c d
    1 0.0 0.0 0.0
    2 0.0 0.0 0.0
    3 0.0 0.0 0.0
    2 1.0 1.0 1.0
    3 1.0 1.0 1.0
    4 1.0 1.0 1.0
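
    # Sketch (not from the original session): concat with axis=1 aligns df4 and df5 on the
    # row index and places them side by side; rows missing from one frame get NaN.
    pd.concat([df4, df5], axis=1)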

    >>> left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
    ... 'A': ['A0', 'A1', 'A2', 'A3'],
    ... 'B': ['B0', 'B1', 'B2', 'B3']})
    >>> right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
    ... 'C': ['C0', 'C1', 'C2', 'C3'],
    ... 'D': ['D0', 'D1', 'D2', 'D3']})
    >>> left
    A B key
    0 A0 B0 K0
    1 A1 B1 K1
    2 A2 B2 K2
    3 A3 B3 K3
    >>> right
    C D key
    0 C0 D0 K0
    1 C1 D1 K1
    2 C2 D2 K2
    3 C3 D3 K3
    >>> res = pd.merge(left,right,on='key')
    >>> res
    A B key C D
    0 A0 B0 K0 C0 D0
    1 A1 B1 K1 C1 D1
    2 A2 B2 K2 C2 D2
    3 A3 B3 K3 C3 D3


    >>> left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
    ... 'key2': ['K0', 'K1', 'K0', 'K1'],
    ... 'A': ['A0', 'A1', 'A2', 'A3'],
    ... 'B': ['B0', 'B1', 'B2', 'B3']})
    >>> right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
    ... 'key2': ['K0', 'K0', 'K0', 'K0'],
    ... 'C': ['C0', 'C1', 'C2', 'C3'],
    ... 'D': ['D0', 'D1', 'D2', 'D3']})
    >>> res = pd.merge(left,right,on=['key1','key2'])
    >>> res
    A B key1 key2 C D
    0 A0 B0 K0 K0 C0 D0
    1 A2 B2 K1 K0 C1 D1
    2 A2 B2 K1 K0 C2 D2
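
    # Sketch (not from the original session): merge defaults to how='inner'; other join types
    # keep unmatched keys, and indicator=True records which frame each row came from.
    pd.merge(left, right, on=['key1','key2'], how='outer', indicator=True)
    pd.merge(left, right, on=['key1','key2'], how='left')   # keep every row of left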

    >>> import matplotlib.pyplot as plt
    >>> data = pd.Series(np.random.randn(1000),index=np.arange(1000))
    >>> data=data.cumsum()
    >>> data.plot()
    <matplotlib.axes.AxesSubplot object at 0x10409ffd0>
    >>> plt.show()

    >>> data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
    >>> data.head()
    A B C D
    0 -0.219043 -0.116109 -0.227378 -1.246710
    1 0.603295 -2.291828 -0.245817 0.178349
    2 -0.661455 1.234543 1.193432 0.145587
    3 2.185926 -1.254439 0.029333 -0.475892
    4 -0.282924 -0.127020 0.359198 -0.719617
    >>> data=data.cumsum()
    >>> data.plot()
    <matplotlib.axes.AxesSubplot object at 0x107d8f490>
    >>> plt.show()

    >>> ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
    >>> data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class2',ax=ax)
    <matplotlib.axes.AxesSubplot object at 0x103c5fc10>
    >>> plt.show()
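
    # Sketch (not from the original session): DataFrame.plot supports other kinds, and figures
    # can be written to disk; the filename below is hypothetical.
    data.plot.hist(alpha=0.5, bins=30)       # overlaid histograms of the four columns
    plt.savefig('cumsum_hist.png', dpi=150)  # save the current figure
    plt.show()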
