zoukankan      html  css  js  c++  java
  • pandas

    pandas

    a.基本使用

    #结构化的数据分析
    
    pip3 install jupyter
    pip3 install pandas
    
    
    #import pandas as pd
    #import numpy as np
    
    #事例一:
    
    #s = pd.Series([1,3,5,np.NaN,8,4])
    #s
    
        Out[5]: 
        0    1.0
        1    3.0
        2    5.0
        3    NaN
        4    8.0
        5    4.0
        dtype: float64
    
    #事例二:
    
    #创建日期序列,里面有6个日期
    #datas = pd.date_range('20170821',periods=6)    
    #datas
    
        DatetimeIndex(['2017-08-21', '2017-08-22', '2017-08-23', '2017-08-24','2017-08-25', '2017-08-26'], dtype='datetime64[ns]', freq='D')
    
    
    #创建二维数组
    
    #创建6行4列的随机数,索引datas,列ABCD
    
    #data = pd.DataFrame(np.random.randn(6,4),index=datas,columns=list('ABCD'))
    #data
    
                           A         B         C         D
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
        2017-08-24  0.761821 -0.159621 -1.172729 -1.317056
        2017-08-25  0.186582  0.739702 -1.688458  0.480121
        2017-08-26 -0.519489 -0.002741  0.875164 -0.783657
    
    #data.values
    
        array([[ -2.45343824e-01,   2.60401419e-01,  -2.00362100e+00,  4.27142120e-01],
               [ -2.77384841e+00,   1.60472878e+00,  -7.11768546e-01,  -6.77211441e-01],
               [  3.96086166e-01,  -7.31910686e-01,  -1.00241967e-01,   9.66344486e-01],
               [  7.61820910e-01,  -1.59621471e-01,  -1.17272904e+00,   -1.31705593e+00],
               [  1.86582291e-01,   7.39702155e-01,  -1.68845777e+00,   4.80121151e-01],
               [ -5.19488872e-01,  -2.74128435e-03,   8.75164076e-01,  -7.83656946e-01]])
    
    
    #data.head(2)    #看前两行的数据
                           A         B         C         D
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
    
    
    #data.T         #行列转置
    
           2017-08-21  2017-08-22  2017-08-23  2017-08-24  2017-08-25  2017-08-26
        A   -0.245344   -2.773848    0.396086    0.761821    0.186582   -0.519489
        B    0.260401    1.604729   -0.731911   -0.159621    0.739702   -0.002741
        C   -2.003621   -0.711769   -0.100242   -1.172729   -1.688458    0.875164
        D    0.427142   -0.677211    0.966344   -1.317056    0.480121   -0.783657
    
    
    #data.sort_index(axis=1)        #列正序
    
                           A         B         C         D
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
        2017-08-24  0.761821 -0.159621 -1.172729 -1.317056
        2017-08-25  0.186582  0.739702 -1.688458  0.480121
        2017-08-26 -0.519489 -0.002741  0.875164 -0.783657
    
    
    #data.sort_index(axis=1,ascending=False)        #列倒序
    
                           D         C         B         A
        2017-08-21  0.427142 -2.003621  0.260401 -0.245344
        2017-08-22 -0.677211 -0.711769  1.604729 -2.773848
        2017-08-23  0.966344 -0.100242 -0.731911  0.396086
        2017-08-24 -1.317056 -1.172729 -0.159621  0.761821
        2017-08-25  0.480121 -1.688458  0.739702  0.186582
        2017-08-26 -0.783657  0.875164 -0.002741 -0.519489
    
    
    #data.sort_index(axis=0,ascending=False)        #行倒序
    
                           A         B         C         D
        2017-08-26 -0.519489 -0.002741  0.875164 -0.783657
        2017-08-25  0.186582  0.739702 -1.688458  0.480121
        2017-08-24  0.761821 -0.159621 -1.172729 -1.317056
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
    
    
    #data.sort_values(by='A')        # A列排序
    
                           A         B         C         D
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
        2017-08-26 -0.519489 -0.002741  0.875164 -0.783657
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
        2017-08-25  0.186582  0.739702 -1.688458  0.480121
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
        2017-08-24  0.761821 -0.159621 -1.172729 -1.317056
    
    
    #data['A'] 或 data.A                #数据选择
     
        2017-08-21   -0.245344
        2017-08-22   -2.773848
        2017-08-23    0.396086
        2017-08-24    0.761821
        2017-08-25    0.186582
        2017-08-26   -0.519489
    
    
    data.loc['2017-08-21':'2017-08-23']        #选择哪几行
    
                           A         B         C         D
        2017-08-21 -0.245344  0.260401 -2.003621  0.427142
        2017-08-22 -2.773848  1.604729 -0.711769 -0.677211
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
    
    
    #data.iloc[2:4]                            #行
    
                           A         B         C         D
        2017-08-23  0.396086 -0.731911 -0.100242  0.966344
        2017-08-24  0.761821 -0.159621 -1.172729 -1.317056
    
    
    #data.loc[:,['B','C']]                #取出列
    
                           B         C
        2017-08-21  0.260401 -2.003621
        2017-08-22  1.604729 -0.711769
        2017-08-23 -0.731911 -0.100242
        2017-08-24 -0.159621 -1.172729
        2017-08-25  0.739702 -1.688458
        2017-08-26 -0.002741  0.875164
    
    
    #data.loc['2017-08-21':'2017-08-23',['B','C']]        #指定行和列
    
                           B         C
        2017-08-21  0.260401 -2.003621
        2017-08-22  1.604729 -0.711769
        2017-08-23 -0.731911 -0.100242
    
    
    #data.loc['2017-08-21','B']            #访问特定的值
    
        0.26040141861580018
    
    #data.at[pd.Timestamp('2017-08-21'),'B']    #比上面效率高
        
        #0.26040141861580018
    
    
    #data.A = range(6)                #修改列数据
    #data
                    A         B         C         D
        2017-08-21  0  0.260401 -2.003621  0.427142
        2017-08-22  1  1.604729 -0.711769 -0.677211
        2017-08-23  2 -0.731911 -0.100242  0.966344
        2017-08-24  3 -0.159621 -1.172729 -1.317056
        2017-08-25  4  0.739702 -1.688458  0.480121
        2017-08-26  5 -0.002741  0.875164 -0.783657
    
    
    #data.iloc[:,2:5] = 1000        #修改某几列
    #data
    
                    A         B     C     D
        2017-08-21  0  0.260401  1000  1000
        2017-08-22  1  1.604729  1000  1000
        2017-08-23  2 -0.731911  1000  1000
        2017-08-24  3 -0.159621  1000  1000
        2017-08-25  4  0.739702  1000  1000
        2017-08-26  5 -0.002741  1000  1000
    
    
    #事例三:
    
    #d = {'A':1,'B':pd.Timestamp("20160821"),'C':list(range(4)),'D':np.arange(4)}
    #d
    
        {'A': 1,
         'B': Timestamp('2016-08-21 00:00:00'),
         'C': [0, 1, 2, 3],
         'D': array([0, 1, 2, 3])}
    
    df = pd.DataFrame(d)
    #df
     
           A          B  C  D
        0  1 2016-08-21  0  0
        1  1 2016-08-21  1  1
        2  1 2016-08-21  2  2
        3  1 2016-08-21  3  3
    
    #df.dtypes 
        A             int64
        B    datetime64[ns]
        C             int64
        D             int64
    View Code

    b. DataFrame 创建

    #创建方式
    
    #pd.DataFrame({'one':[1,2,3,4],'two':[4,3,2,1]})
    
            one  two
        0    1    4
        1    2    3
        2    3    2
        3    4    1
    
    
    #pd.DataFrame({'one':[1,2,3],'two':[3,2,1]},index=list('abc'))
    
           one  two
        a    1    3
        b    2    2
        c    3    1
    
    
    #pd.DataFrame({'one': pd.Series([1,2,3],index=['a','b','c']),'two' :pd.Series([1,2,3,4],index=['b','a','c','d'])})
    
           one   two
        a  1.0    2
        b  2.0    1
        c  3.0    3
        d  NaN    4
    View Code

    c. xx

    #obj = pd.read_csv('601318.csv')    #从文件中导入
    #obj.to_csv('new_obj.csv')            #导出
    
    #obj.index        #获取行索引
    #obj.columns    #获取列索引
    #obj.values        #返回所有的值
    #obj.rename(columns={'close':'new_close'})    #给列改名称
    
    # loc 
    
    #obj.loc[:,['close','open']]        #所有行的两列
    #obj.loc[0:10,['close','open']]        #前10行的两列 0可以省略
    
    # iloc 只能位置(下标)
    
    #obj.iloc[0:10,0:3]            #前10行的前3列
    #obj[obj['open']<20]        #open小于20的
    View Code
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    
    #资本显示
    #dates = pd.date_range('20160821',periods=6)
    #df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
    #df
    
                        A            B            C            D
        2016-08-21    0.395062    -0.713272    0.560768    1.175623
        2016-08-22    0.487270    0.541010    -1.451927    -0.374084
        2016-08-23    1.190535    -0.017564    1.065334    -0.716946
        2016-08-24    -0.231604    0.890002    -2.085083    -0.421781
        2016-08-25    -0.282863    0.039613    0.900477    -1.048180
        2016-08-26    0.238394    -0.086092    0.002821    1.893919
    
    
    # 增加一列 E
    
    #df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
    #df1
    
                        A            B            C            D         E
        2016-08-21    0.395062    -0.713272    0.560768    1.175623    NaN
        2016-08-22    0.487270    0.541010    -1.451927    -0.374084    NaN
        2016-08-23    1.190535    -0.017564    1.065334    -0.716946    NaN
        2016-08-24    -0.231604    0.890002    -2.085083    -0.421781    NaN
    
    
    # 修改列值
    
    #df1.loc[dates[1:3],'E'] = 2
    #df1
    
                        A            B            C            D         E
        2016-08-21    0.395062    -0.713272    0.560768    1.175623    NaN
        2016-08-22    0.487270    0.541010    -1.451927    -0.374084    2.0
        2016-08-23    1.190535    -0.017564    1.065334    -0.716946    2.0
        2016-08-24    -0.231604    0.890002    -2.085083    -0.421781    NaN
    
    
    #有空数据行的丢掉
    
    #df1.dropna()
    
                        A            B            C            D         E
        2016-08-22    0.487270    0.541010    -1.451927    -0.374084    2.0
        2016-08-23    1.190535    -0.017564    1.065334    -0.716946    2.0
    
    
    # 有空的值用默认值替换
    
    #df1.fillna(value=5)
    
                        A            B            C            D         E
        2016-08-21    0.395062    -0.713272    0.560768    1.175623    5.0
        2016-08-22    0.487270    0.541010    -1.451927    -0.374084    2.0
        2016-08-23    1.190535    -0.017564    1.065334    -0.716946    2.0
        2016-08-24    -0.231604    0.890002    -2.085083    -0.421781    5.0
    
    
    # 判断是否有空数据
    
    #pd.isnull(df1)
    
                      A           B          C          D         E
        2016-08-21    False    False    False    False    True
        2016-08-22    False    False    False    False    False
        2016-08-23    False    False    False    False    False
        2016-08-24    False    False    False    False    True
    
    
    # 求行平均值,空数据不参与计算
    
    #df1.mean()
    
        A    0.460316
        B    0.175044
        C   -0.477727
        D   -0.084297
        E    2.000000
        dtype: float64
    
    
    # 求列平均值
    
    #df1.mean(axis=1)
    
        A    0.460316
        B    0.175044
        C   -0.477727
        D   -0.084297
        E    2.000000
        dtype: float64
    
    
    # 累加 (把列传给参数处理)
    
    #df.apply(np.cumsum)
    
                        A              B            C            D
        2016-08-21    -2.251855    0.967517    0.045508    1.011237
        2016-08-22    -0.159759    -1.011472    0.642207    1.451304
        2016-08-23    0.595429    -1.164885    -1.323172    2.214757
        2016-08-24    0.582950    0.299636    1.421000    2.018076
        2016-08-25    0.624217    0.838486    1.840071    1.241466
        2016-08-26    0.102015    -0.251384    0.422820    0.166844
    
    
    #每列的最大值 减去最小值
    
    #df.apply(lambda x: x.max() - x.min())
    
        A    4.343951
        B    3.443510
        C    4.709552
        D    2.085859
        dtype: float64
    
    
    #追加一行
    
    #df.append(s,ignore_index=True)
    
    
    ####--------------  concat()---------------
    
    #df = pd.DataFrame(np.random.randn(10,4),columns=list('ABCD'))
    #df
    
                A            B            C            D
        0    0.448883    0.667777    1.974642    -0.598013
        1    -0.384818    -1.616549    1.551960    0.201467
        2    -1.428748    0.122964    0.837514    1.102314
        3    0.634624    0.553064    -0.248767    0.019210
        4    2.518095    0.240933    -0.722795    -0.815324
        5    0.517742    -2.444094    1.270380    -0.160528
        6    -0.800297    -1.242557    -1.118255    0.616456
        7    0.953936    0.318387    0.151009    0.340203
        8    0.558308    -1.344539    0.846960    1.264978
        9    1.549733    1.496383    0.208102    1.265871
    
    #df.iloc[:3]    # 取前3行
    #df.iloc[3:7]    # 3-7行
    #df.iloc[7:]    # 7-最后一行
    #df1 = pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]])    #等于上面3行
    #df1
    
                A            B            C            D
        0    -0.151648    -0.077165    1.130773    1.587466
        1    -0.854935    0.490470    -0.438102    -0.066202
        2    -0.622838    -1.524100    -0.995986    -1.378272
        3    -0.718499    0.184935    -1.164265    1.127623
        4    2.180069    0.799651    1.630780    -1.592882
        5    -0.590875    -0.824192    -0.109713    -1.502221
        6    0.437851    2.268239    0.305557    -0.515982
        7    -1.084059    -0.325458    1.279105    -0.404447
        8    0.134603    0.012700    0.935611    1.017555
        9    0.521873    0.630876    -1.714364    -1.243454
    
    
    #判断 df1 和 df 是否相等
    
    #df == df1
    
              A          B          C          D
        0    True    True    True    True
        1    True    True    True    True
        2    True    True    True    True
        3    True    True    True    True
        4    True    True    True    True
        5    True    True    True    True
        6    True    True    True    True
        7    True    True    True    True
        8    True    True    True    True
        9    True    True    True    True
    
    
    #(df == df1).all()
    
        A    True
        B    True
        C    True
        D    True
    
    #(df == df1).all().all()
        True
    
    ###-------------    merge()    ----------
    
    #left = pd.DataFrame({'key':['foo','foo'],'rval':[1,2]})
    #right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
    #left
            key     rval
        0    foo       1
        1    foo       2
    
    #right 
            key     rval
        0    foo      4
        1    foo      5
    
    #按照key关联
    
    #pd.merge(left,right,on='key')
    
            key      rval_x    rval_y
        0    foo        1          4
        1    foo        1          5
        2    foo        2          4
        3    foo        2          5
    
    
    ###-------------  groupby()    -----------
    
    #df = pd.DataFrame({'A' : ['foo','bar','foo','bar'],
                      'B' : ['one','two','one','two'],
                      'C': np.random.randn(4),
                      'D': np.random.randn(4)
                      })
    #df
    
             A     B      C                D
        0    foo    one    1.261140    1.109300
        1    bar    two    -2.072375    -0.533420
        2    foo    one    1.470090    0.243500
        3    bar    two    -0.243230    -0.721201
    
    
    #分组
    #df.groupby('A').sum()
                    
        A        C          D
        bar     -2.315605    -1.254621
        foo     2.731229    1.352800
    
    
    #df.groupby(['A','B']).sum()
                
        A    B        C          D
        bar    two    -1.039401    -0.338988
        foo    one    -1.004454    1.169985
    
    
    -------------------------序列---------------------
    
    #s = pd.Series(np.random.randint(10,20,size=20))
    #s
        0     13
        1     16
        2     11
        3     13
        4     17
        5     18
        6     12
        7     11
        8     13
        9     12
        10    18
        11    15
        12    19
        13    10
        14    16
        15    10
        16    16
        17    17
        18    10
        19    13
    
    
    # 每个数字产生的次数
    #s.value_counts()
    
        15    5
        18    3
        13    3
        17    2
        12    2
        11    2
        19    1
        16    1
        10    1
    
    # 产生最多的数
    #s.mode()
        0    15
    
    
    
    
    
    
    ----------------------------------------------------------
    # 创建序列 包含空值
    
    #s = pd.Series([1,3,5,np.nan,6,8],index=dates)
    #s
    
        2016-08-21    1.0
        2016-08-22    3.0
        2016-08-23    5.0
        2016-08-24    NaN
        2016-08-25    6.0
        2016-08-26    8.0
        Freq: D, dtype: float64
        In [12]:
    
    
    #s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
    #s
    
        2016-08-21    NaN
        2016-08-22    NaN
        2016-08-23    1.0
        2016-08-24    3.0
        2016-08-25    5.0
        2016-08-26    NaN
        Freq: D, dtype: float64
    
    
    #df
    
                        A            B            C            D
        2016-08-21    -2.251855    0.967517    0.045508    1.011237
        2016-08-22    2.092096    -1.978989    0.596700    0.440067
        2016-08-23    0.755188    -0.153414    -1.965380    0.763453
        2016-08-24    -0.012479    1.464521    2.744173    -0.196681
        2016-08-25    0.041267    0.538850    0.419071    -0.776610
        2016-08-26    -0.522202    -1.089871    -1.417252    -1.074622
    
    # 二维 减去 序列  (空值不参与运算)
    #df.sub(s,axis='index')
        
                        A             B             C             D
        2016-08-21        NaN            NaN            NaN            NaN
        2016-08-22        NaN            NaN            NaN            NaN
        2016-08-23    -0.244812    -1.153414    -2.965380    -0.236547
        2016-08-24    -3.012479    -1.535479    -0.255827    -3.196681
        2016-08-25    -4.958733    -4.461150    -4.580929    -5.776610
        2016-08-26        NaN            NaN            NaN            NaN
    View Code

    b. 数据整形

    #数据整形 (把数据的行和列互换)
    
    
    %matplotlib inline
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    #行索引
    
    #tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],['one','two','one','two','one','two','one','two']]))
    #tuples
    
        [('bar', 'one'),
         ('bar', 'two'),
         ('baz', 'one'),
         ('baz', 'two'),
         ('foo', 'one'),
         ('foo', 'two'),
         ('qux', 'one'),
         ('qux', 'two')]
    #index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
    #index
    
        MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
                   labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
                   names=['first', 'second'])
    
    
    #df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
    #df
                            A            B
        first    second        
        bar        one        1.109199    -0.482758
                two        0.947951    1.284311
        baz        one        -0.733705    -0.088907
                two        1.146346    -0.946909
        foo        one        0.254840    1.868951
                two        1.052604    -0.684604
        qux        one        0.133846    0.065647
                two        -1.137040    1.010329
    
    
    #把列索引变成行索引
    
    #stacked = df.stack()
    #stacked
    
        first  second   
        bar    one     A    1.109199
                       B   -0.482758
               two     A    0.947951
                       B    1.284311
        baz    one     A   -0.733705
                       B   -0.088907
               two     A    1.146346
                       B   -0.946909
        foo    one     A    0.254840
                       B    1.868951
               two     A    1.052604
                       B   -0.684604
        qux    one     A    0.133846
                       B    0.065647
               two     A   -1.137040
                       B    1.010329
    View Code

     

  • 相关阅读:
    Discuz!NT代码阅读笔记(4)一切皆可配置:页面的显示
    WAP网站开发

    Discuz!NT代码阅读笔记(2)网站安装自动化论坛程序安装及初始化过程
    Discuz!NT代码阅读笔记(1)从HttpModule开始
    c#使用Transactions类完成多个数据库的事务操作(分布式事务处理)
    自我介绍 简历
    Discuz!NT 系统架构分析
    缓存应用Memcached分布式缓存简介(二)
    数组类型EDT的元素个数
  • 原文地址:https://www.cnblogs.com/golangav/p/7401177.html
Copyright © 2011-2022 走看看