zoukankan      html  css  js  c++  java
  • pandas学习笔记

    #构造一行数据
    >>> s = pd.Series([1,3,6,np.nan,44,1])
    >>> s
    0 1.0
    1 3.0
    2 6.0
    3 NaN
    4 44.0
    5 1.0
    dtype: float64

    #创建一个索引列
    >>> dates = pd.date_range('20160101',periods=6)
    >>> dates
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
    '2016-01-05', '2016-01-06'],
    dtype='datetime64[ns]', freq='D')
    >>> df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
    >>> df
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df.index
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
    '2016-01-05', '2016-01-06'],
    dtype='datetime64[ns]', freq='D')
    >>> df.columns
    Index([u'a', u'b', u'c', u'd'], dtype='object')
    >>> df.values
    array([[ 0.60799315, -0.93716631, -0.11229586, -0.07298433],
    [-0.04063099, 0.53524821, 0.87249982, 0.04360029],
    [-2.34581586, 0.43345385, 0.8452626 , -1.18162629],
    [-0.02130354, 0.11952033, -0.827935 , 0.34708601],
    [-1.62655855, 0.27206008, 0.85482623, -0.97291868],
    [-1.21306154, -0.98024449, -1.01122044, -0.11965997]])
    >>> df.describe()
    a b c d
    count 6.000000 6.000000 6.000000 6.000000
    mean -0.773230 -0.092855 0.103523 -0.326084
    std 1.131770 0.685580 0.878954 0.607757
    min -2.345816 -0.980244 -1.011220 -1.181626
    25% -1.523184 -0.672995 -0.649025 -0.759604
    50% -0.626846 0.195790 0.366483 -0.096322
    75% -0.026135 0.393105 0.852435 0.014454
    max 0.607993 0.535248 0.872500 0.347086
    >>> df.T
    2016-01-01 2016-01-02 2016-01-03 2016-01-04 2016-01-05 2016-01-06
    a 0.607993 -0.040631 -2.345816 -0.021304 -1.626559 -1.213062
    b -0.937166 0.535248 0.433454 0.119520 0.272060 -0.980244
    c -0.112296 0.872500 0.845263 -0.827935 0.854826 -1.011220
    d -0.072984 0.043600 -1.181626 0.347086 -0.972919 -0.119660

    #给列的先后顺序排序
    >>> df.sort_index(axis=1,ascending=False)
    d c b a
    2016-01-01 -0.072984 -0.112296 -0.937166 0.607993
    2016-01-02 0.043600 0.872500 0.535248 -0.040631
    2016-01-03 -1.181626 0.845263 0.433454 -2.345816
    2016-01-04 0.347086 -0.827935 0.119520 -0.021304
    2016-01-05 -0.972919 0.854826 0.272060 -1.626559
    2016-01-06 -0.119660 -1.011220 -0.980244 -1.213062

    #根据索引值排序
    >>> df.sort_index(axis=0,ascending=False)
    a b c d
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984

    #根据a列的值排序
    >>> df.sort_values(by='a')
    a b c d
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984

    #pandas选择数据
    >>> df.a #或者df['a']
    2016-01-01 0.607993
    2016-01-02 -0.040631
    2016-01-03 -2.345816
    2016-01-04 -0.021304
    2016-01-05 -1.626559
    2016-01-06 -1.213062
    Freq: D, Name: a, dtype: float64
    >>> df[0:3]
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626

    #select by label: loc 根据索引的值选择行
    >>> df.loc['2016-01-01']
    a 0.607993
    b -0.937166
    c -0.112296
    d -0.072984
    Name: 2016-01-01 00:00:00, dtype: float64

    #根据切片选择行
    >>> df[:1]
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984

    #根据索引值和列名选择行和列
    >>> df.loc['2016-01-01',['a','b']]
    a 0.607993
    b -0.937166
    Name: 2016-01-01 00:00:00, dtype: float64


    #select by position: iloc
    #按位置选择第3行(位置从0开始计数,因此是第4条记录 2016-01-04)
    >>> df.iloc[3]
    a -0.021304
    b 0.119520
    c -0.827935
    d 0.347086
    Name: 2016-01-04 00:00:00, dtype: float64

    #选择第3行第1列(行、列位置均从0开始计数)
    >>> df.iloc[3,1]
    0.11952032779945752

    #选择第1,3,4行,第1到2列(切片1:3不包含结束位置3)
    >>> df.iloc[[1,3,4],1:3]
    b c
    2016-01-02 0.535248 0.872500
    2016-01-04 0.119520 -0.827935
    2016-01-05 0.272060 0.854826

    #根据切片和列名选择数据
    #选择前面3行,第a列和c列的值
    #注意:ix 在 pandas 0.20 起已弃用,1.0 起已移除,新版本请改用 df.loc 或 df.iloc
    >>> df.ix[:3,['a','c']]
    a c
    2016-01-01 0.607993 -0.112296
    2016-01-02 -0.040631 0.872500
    2016-01-03 -2.345816 0.845263

    #选择a列中值小于0的数据
    >>> df[df.a<0]
    a b c d
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 0.845263 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660

    #pandas设置值
    >>> df.iloc[2,2]=111
    >>> df
    a b c d
    2016-01-01 0.607993 -0.937166 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 111.000000 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df.loc['2016-01-01','b'] = 22222
    >>> df
    a b c d
    2016-01-01 0.607993 22222.000000 -0.112296 -0.072984
    2016-01-02 -0.040631 0.535248 0.872500 0.043600
    2016-01-03 -2.345816 0.433454 111.000000 -1.181626
    2016-01-04 -0.021304 0.119520 -0.827935 0.347086
    2016-01-05 -1.626559 0.272060 0.854826 -0.972919
    2016-01-06 -1.213062 -0.980244 -1.011220 -0.119660
    >>> df[df.a<0] = 0
    >>> df
    a b c d
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984
    2016-01-02 0.000000 0.0 0.000000 0.000000
    2016-01-03 0.000000 0.0 0.000000 0.000000
    2016-01-04 0.000000 0.0 0.000000 0.000000
    2016-01-05 0.000000 0.0 0.000000 0.000000
    2016-01-06 0.000000 0.0 0.000000 0.000000
    #注意:这是链式赋值,新版 pandas 中会触发 SettingWithCopyWarning,更稳妥的写法是 df.loc[df.c==0,'c']=111
    >>> df.c[df.c==0]=111
    >>> df
    a b c d
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984
    2016-01-02 0.000000 0.0 111.000000 0.000000
    2016-01-03 0.000000 0.0 111.000000 0.000000
    2016-01-04 0.000000 0.0 111.000000 0.000000
    2016-01-05 0.000000 0.0 111.000000 0.000000
    2016-01-06 0.000000 0.0 111.000000 0.000000


    >>> df['e']=np.nan
    >>> df
    a b c d e
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 NaN
    2016-01-02 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-03 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-04 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-05 0.000000 0.0 111.000000 0.000000 NaN
    2016-01-06 0.000000 0.0 111.000000 0.000000 NaN


    >>> df['f'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('2016-01-01',periods=6))
    >>> df
    a b c d e f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 NaN 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 NaN 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 NaN 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 NaN 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 NaN 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 NaN 6

    #处理丢失数据
    axis=1时丢掉任何一列中有nan的数据,axis=0时丢掉有nan的那一行数据
    >>> df.dropna(axis=1,how='any') #how={'any','all'}
    a b c d f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 6

    #将数据中为nan的数据填充为0
    >>> df.fillna(value=0)
    a b c d e f
    2016-01-01 0.607993 22222.0 -0.112296 -0.072984 0.0 1
    2016-01-02 0.000000 0.0 111.000000 0.000000 0.0 2
    2016-01-03 0.000000 0.0 111.000000 0.000000 0.0 3
    2016-01-04 0.000000 0.0 111.000000 0.000000 0.0 4
    2016-01-05 0.000000 0.0 111.000000 0.000000 0.0 5
    2016-01-06 0.000000 0.0 111.000000 0.000000 0.0 6

    #判断是否有缺失数据
    >>> df.isnull()
    a b c d e f
    2016-01-01 False False False False True False
    2016-01-02 False False False False True False
    2016-01-03 False False False False True False
    2016-01-04 False False False False True False
    2016-01-05 False False False False True False
    2016-01-06 False False False False True False


    #判断整个表格中是否有丢失的数据
    >>> np.any(df.isnull())==True
    True


    #pandas数据导入导出
    >>> pd.read_csv('/Users/lijie/Downloads/student.csv')
    Student ID name age gender
    0 1100 Kelly 22 Female
    1 1101 Clo 21 Female
    2 1102 Tilly 22 Female
    3 1103 Tony 24 Male
    4 1104 David 20 Male
    5 1105 Catty 22 Female
    6 1106 M 3 Female
    7 1107 N 43 Male
    8 1108 A 13 Male
    9 1109 S 12 Male
    10 1110 David 33 Male
    11 1111 Dw 3 Female
    12 1112 Q 23 Male
    13 1113 W 21 Female


    #pandas数据合并
    #concat
    >>> df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d']
    ... )
    >>> df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
    >>> df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
    >>> df1
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    >>> df2
    a b c d
    0 1.0 1.0 1.0 1.0
    1 1.0 1.0 1.0 1.0
    2 1.0 1.0 1.0 1.0
    >>> df3
    a b c d
    0 2.0 2.0 2.0 2.0
    1 2.0 2.0 2.0 2.0
    2 2.0 2.0 2.0 2.0
    >>> res = pd.concat([df1,df2,df3],axis=0)
    >>> res
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    0 1.0 1.0 1.0 1.0
    1 1.0 1.0 1.0 1.0
    2 1.0 1.0 1.0 1.0
    0 2.0 2.0 2.0 2.0
    1 2.0 2.0 2.0 2.0
    2 2.0 2.0 2.0 2.0
    >>> res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
    >>> res
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    3 1.0 1.0 1.0 1.0
    4 1.0 1.0 1.0 1.0
    5 1.0 1.0 1.0 1.0
    6 2.0 2.0 2.0 2.0
    7 2.0 2.0 2.0 2.0
    8 2.0 2.0 2.0 2.0


    >>> df4=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
    >>> df5=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
    >>> df4
    a b c d
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    3 0.0 0.0 0.0 0.0
    >>> df5
    b c d e
    2 1.0 1.0 1.0 1.0
    3 1.0 1.0 1.0 1.0
    4 1.0 1.0 1.0 1.0
    >>> res = pd.concat([df4,df5])
    >>> res
    a b c d e
    1 0.0 0.0 0.0 0.0 NaN
    2 0.0 0.0 0.0 0.0 NaN
    3 0.0 0.0 0.0 0.0 NaN
    2 NaN 1.0 1.0 1.0 1.0
    3 NaN 1.0 1.0 1.0 1.0
    4 NaN 1.0 1.0 1.0 1.0
    >>> res = pd.concat([df4,df5],join='inner')
    >>> res
    b c d
    1 0.0 0.0 0.0
    2 0.0 0.0 0.0
    3 0.0 0.0 0.0
    2 1.0 1.0 1.0
    3 1.0 1.0 1.0
    4 1.0 1.0 1.0

    >>> left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
    ... 'A': ['A0', 'A1', 'A2', 'A3'],
    ... 'B': ['B0', 'B1', 'B2', 'B3']})
    >>> right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
    ... 'C': ['C0', 'C1', 'C2', 'C3'],
    ... 'D': ['D0', 'D1', 'D2', 'D3']})
    >>> left
    A B key
    0 A0 B0 K0
    1 A1 B1 K1
    2 A2 B2 K2
    3 A3 B3 K3
    >>> right
    C D key
    0 C0 D0 K0
    1 C1 D1 K1
    2 C2 D2 K2
    3 C3 D3 K3
    >>> res = pd.merge(left,right,on='key')
    >>> res
    A B key C D
    0 A0 B0 K0 C0 D0
    1 A1 B1 K1 C1 D1
    2 A2 B2 K2 C2 D2
    3 A3 B3 K3 C3 D3


    >>> left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
    ... 'key2': ['K0', 'K1', 'K0', 'K1'],
    ... 'A': ['A0', 'A1', 'A2', 'A3'],
    ... 'B': ['B0', 'B1', 'B2', 'B3']})
    >>> right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
    ... 'key2': ['K0', 'K0', 'K0', 'K0'],
    ... 'C': ['C0', 'C1', 'C2', 'C3'],
    ... 'D': ['D0', 'D1', 'D2', 'D3']})
    >>> res = pd.merge(left,right,on=['key1','key2'])
    >>> res
    A B key1 key2 C D
    0 A0 B0 K0 K0 C0 D0
    1 A2 B2 K1 K0 C1 D1
    2 A2 B2 K1 K0 C2 D2

    >>> import matplotlib.pyplot as plt
    >>> data = pd.Series(np.random.randn(1000),index=np.arange(1000))
    >>> data=data.cumsum()
    >>> data.plot()
    <matplotlib.axes.AxesSubplot object at 0x10409ffd0>
    >>> plt.show()

    >>> data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list('ABCD'))
    >>> data.head()
    A B C D
    0 -0.219043 -0.116109 -0.227378 -1.246710
    1 0.603295 -2.291828 -0.245817 0.178349
    2 -0.661455 1.234543 1.193432 0.145587
    3 2.185926 -1.254439 0.029333 -0.475892
    4 -0.282924 -0.127020 0.359198 -0.719617
    >>> data=data.cumsum()
    >>> data.plot()
    <matplotlib.axes.AxesSubplot object at 0x107d8f490>
    >>> plt.show()

    >>> ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
    >>> data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class2',ax=ax)
    <matplotlib.axes.AxesSubplot object at 0x103c5fc10>
    >>> plt.show()

  • 相关阅读:
    将网站从WSS2.0升级到WSS3.0的心得
    Teched 2008课程:ADO.NET Data Service & UC开发概览
    我的基于Silverlight2的相册,也刚刚升级到了RTW了。
    ubuntu16.04设置python3为默认及一些库的安装
    CDMA手机的MEID
    CDMA Subscription 模式设置
    Android架构纵横谈之——软件自愈能力(转载)
    CDMA系统中的用户识别卡(UIM)和空中激活技术(OTA)
    手机信号强度全解析
    GSMPhone与CDMAPhone切换过程
  • 原文地址:https://www.cnblogs.com/sprouts/p/7650062.html
Copyright © 2011-2022 走看看