zoukankan      html  css  js  c++  java
  • Python数据处理-v1.0

    7.2 Pandas-数据结构

    • 一维数据:序列(Series)
    • 二维数据:数据框(DataFrame)
    • 三维数据:面板(MultiIndex/Panel(后面版本可能放弃))

    从数据结构角度,一般实现“增删改查”操作,官方接口提供了如下操作:

    7.2.1 Series

    接口文档

    pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

    1. 创建

    # 通过列表
    import numpy as np
    import pandas as pd
    s1 = pd.Series([1,3,'555',np.nan,'6.66',8.8],index=list('abcdef'),name='value')
    s1
    
    a       1
    b       3
    c     555
    d     NaN
    e    6.66
    f     8.8
    Name: value, dtype: object
    
    # 通过字典
    import numpy as np
    import pandas as pd
    d = {'b': 1, 'a': 0, 'c': 2}
    s2 = pd.Series(d,name='value')
    s2
    
    b    1
    a    0
    c    2
    Name: value, dtype: int64
    

    2. 查找

    获取元素
    s1.get('c')
    
    '555'
    
    s1[['a','c','d']]
    
    a      1
    c    555
    d    NaN
    Name: value, dtype: object
    
    s1[[1,2,4]]
    
    b       3
    c     555
    e    6.66
    Name: value, dtype: object
    
    索引、列名、值
    # 索引
    s1.index
    
    Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
    
    # 列名
    s1.name
    
    'value'
    
    # 值
    type(s1.values) #返回ndarray类型
    type(s1.items()) #返回zip对象(可迭代出(index, value)元组)
    
    zip
    
    条件查询
    # 查找空值数据
    s1[s1.isna()]
    
    d    NaN
    Name: value, dtype: object
    
    # 条件查找
    d = {'b': 1, 'a': 0, 'c': 2}
    s2 = pd.Series(d,name='value')
    s2[s2.values>0]
    
    b    1
    c    2
    Name: value, dtype: int64
    
    切片
    # 切片
    s1['b':'e']
    
    b       3
    c     555
    d     NaN
    e    6.66
    Name: value, dtype: object
    
    # 切片-前5行
    s1.head()
    
    a       1
    b       3
    c     555
    d     NaN
    e    6.66
    Name: value, dtype: object
    
    # 切片-后3行
    s1.tail(3)
    
    d     NaN
    e    6.66
    f     8.8
    Name: value, dtype: object
    

    3. 修改

    排序
    # 索引排序
    s2.sort_index()
    
    a    0
    b    1
    c    2
    Name: value, dtype: int64
    
    # 值排序,要求类型相同
    s2.sort_values()
    
    a    0
    b    1
    c    2
    Name: value, dtype: int64
    
    运算
    # 算术运算
    s1*2
    
    a           2
    b           6
    c      555555
    d         NaN
    e    6.666.66
    f        17.6
    Name: value, dtype: object
    
    # 统计运算
    s2.sum()
    
    3
    
    类型转换、输出
    # 类型转换
    s1 = s1.astype("float64")
    
    # 导出到csv
    s1.to_csv("./data/666.csv")
    
    # 导出到json
    s1.to_json("./data/666.json")
    

    7.2.2 数据框(dataframe)

    接口链接

    DataFrame([data, index, columns, dtype, copy])

    1. 创建

    # 通过Series
    import numpy as np
    import pandas as pd
    d = {'col1': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
         'col2': pd.Series([1., 2., 3., 4.], index=list('abcd'))}
    df1 = pd.DataFrame(d)
    df1
    
    col1 col2
    a 1.0 1.0
    b 2.0 2.0
    c 3.0 3.0
    d NaN 4.0
    # 通过列表
    import numpy as np
    import pandas as pd
    dates = pd.date_range('20200801',periods=5)
    df2 = pd.DataFrame(np.random.randn(5,3),index=dates,columns=list('ABC'))
    df2
    
    A B C
    2020-08-01 0.781995 1.349165 -2.274934
    2020-08-02 -1.671108 1.352948 -0.700606
    2020-08-03 -0.693292 1.014148 0.599076
    2020-08-04 0.264373 0.620617 -0.235280
    2020-08-05 0.109606 0.452433 0.615102
    # 通过字典
    import numpy as np
    import pandas as pd
    df3 = pd.DataFrame({'A':1.,
                       'B':pd.Timestamp('20200827'),
                       'C':pd.Series(1,index=list(range(4)),dtype='float64'),
                       'D':np.array([3]*4,dtype='int64'),
                       'E':pd.Categorical(['test','train','test','train']),
                       'F':'foo'})
    df3
    
    A B C D E F
    0 1.0 2020-08-27 1.0 3 test foo
    1 1.0 2020-08-27 1.0 3 train foo
    2 1.0 2020-08-27 1.0 3 test foo
    3 1.0 2020-08-27 1.0 3 train foo

    2. 属性

    # index
    df3.index
    
    Int64Index([0, 1, 2, 3], dtype='int64')
    
    # columns
    df3.columns
    
    Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
    
    # type
    df3.dtypes
    
    A           float64
    B    datetime64[ns]
    C           float64
    D             int64
    E          category
    F            object
    dtype: object
    
    # values
    df3.values
    
    array([[1.0, Timestamp('2020-08-27 00:00:00'), 1.0, 3, 'test', 'foo'],
           [1.0, Timestamp('2020-08-27 00:00:00'), 1.0, 3, 'train', 'foo'],
           [1.0, Timestamp('2020-08-27 00:00:00'), 1.0, 3, 'test', 'foo'],
           [1.0, Timestamp('2020-08-27 00:00:00'), 1.0, 3, 'train', 'foo']],
          dtype=object)
    
    # 转置
    df1.T
    
    a b c d
    col1 1.0 2.0 3.0 NaN
    col2 1.0 2.0 3.0 4.0
    # 统计量
    df1.describe()
    
    col1 col2
    count 3.0 4.000000
    mean 2.0 2.500000
    std 1.0 1.290994
    min 1.0 1.000000
    25% 1.5 1.750000
    50% 2.0 2.500000
    75% 2.5 3.250000
    max 3.0 4.000000

    3. 查询与赋值

    获取单元
    # 直接索引(先列后行)--不推荐
    df1['col2']['b']
    
    2.0
    
    # select by label
    # 标签索引(先行后列)
    df1.loc['b','col2']
    
    2.0
    
    # select by position
    # 位置索引
    df1.iloc[1,1]
    
    2.0
    
    获取行
    df1.loc['c':]
    
    col1 col2
    c 3.0 3.0
    d NaN 4.0
    df1.iloc[2:]
    
    col1 col2
    c 3.0 3.0
    d NaN 4.0
    获取列
    df1['col2']
    
    a    1.0
    b    2.0
    c    3.0
    d    4.0
    Name: col2, dtype: float64
    
    df1.col2
    
    a    1.0
    b    2.0
    c    3.0
    d    4.0
    Name: col2, dtype: float64
    
    df1.loc[:,'col2']
    
    a    1.0
    b    2.0
    c    3.0
    d    4.0
    Name: col2, dtype: float64
    
    df1.iloc[:,1]
    
    a    1.0
    b    2.0
    c    3.0
    d    4.0
    Name: col2, dtype: float64
    
    条件查询
    # 条件查询--不推荐
    df2[df2['B'].values>0]
    
    A B C
    2020-08-01 0.781995 1.349165 -2.274934
    2020-08-02 -1.671108 1.352948 -0.700606
    2020-08-03 -0.693292 1.014148 0.599076
    2020-08-04 0.264373 0.620617 -0.235280
    2020-08-05 0.109606 0.452433 0.615102
    # 条件查询--推荐
    df2.query("B > 0")
    
    A B C
    2020-08-01 0.781995 1.349165 -2.274934
    2020-08-02 -1.671108 1.352948 -0.700606
    2020-08-03 -0.693292 1.014148 0.599076
    2020-08-04 0.264373 0.620617 -0.235280
    2020-08-05 0.109606 0.452433 0.615102
    # 条件查询
    df3['E'].isin(['test'])
    
    0     True
    1    False
    2     True
    3    False
    Name: E, dtype: bool
    

    赋值

    # 单元赋值
    df1.loc['d','col1']=666
    df1
    
    col1 col2
    a 1.0 1.0
    b 2.0 2.0
    c 3.0 3.0
    d 666.0 4.0
    # 列赋值
    df1.col1=2
    df1
    
    col1 col2
    a 2 1.0
    b 2 2.0
    c 2 3.0
    d 2 4.0
    # 行赋值
    df1.loc['d']=888
    df1
    
    col1 col2
    a 2 1.0
    b 2 2.0
    c 2 3.0
    d 888 888.0
    df1.loc['c':,'col2']=444
    df1
    
    col1 col2
    a 2 1.0
    b 2 2.0
    c 2 444.0
    d 888 444.0

    4. 操作

    排序

    # 值排序
    df2=df2.sort_values('B',ascending=False) #降序
    df2
    
    A B C
    2020-08-02 -1.671108 1.352948 -0.700606
    2020-08-01 0.781995 1.349165 -2.274934
    2020-08-03 -0.693292 1.014148 0.599076
    2020-08-04 0.264373 0.620617 -0.235280
    2020-08-05 0.109606 0.452433 0.615102
    # 索引排序
    df2=df2.sort_index()
    df2
    
    A B C
    2020-08-01 0.781995 1.349165 -2.274934
    2020-08-02 -1.671108 1.352948 -0.700606
    2020-08-03 -0.693292 1.014148 0.599076
    2020-08-04 0.264373 0.620617 -0.235280
    2020-08-05 0.109606 0.452433 0.615102

    7.3 Pandas-数据处理

    7.3.1 缺失值处理

    • 查询缺失值 df.isnull().any()
    • 移除缺失值 df.dropna(axis=0,how='all') # how={'any','all'}
    • 替换缺失值 df.fillna(value,inplace=True)
    • 替换标记缺失值(非NaN) df.replace(to_replace=,value=)
    # 通过字典
    import numpy as np
    import pandas as pd
    df3 = pd.DataFrame({'A':1.,
                       'B':pd.Timestamp('20200827'),
                       'C':pd.Series(1,index=list(range(4)),dtype='float64'),
                       'D':np.array(range(1,5),dtype='int64'),
                       'E':pd.Categorical(['test','train',np.nan,'?']),
                       'F':[np.nan,np.nan,np.nan,np.nan]})
    df3
    
    A B C D E F
    0 1.0 2020-08-27 1.0 1 test NaN
    1 1.0 2020-08-27 1.0 2 train NaN
    2 1.0 2020-08-27 1.0 3 NaN NaN
    3 1.0 2020-08-27 1.0 4 ? NaN
    # 某列中含有NaN,则返回True
    df3.isnull().any()
    
    A    False
    B    False
    C    False
    D    False
    E     True
    F     True
    dtype: bool
    
    # 某列中全部数据为NaN,则返回True
    df3.isnull().all()
    
    A    False
    B    False
    C    False
    D    False
    E    False
    F     True
    dtype: bool
    
    # numpy查询整个dataframe
    np.any(pd.isnull(df3))
    
    True
    
    data1=df3.dropna(axis=1,how='all') #删除整列为NaN的数据
    data1
    
    A B C D E
    0 1.0 2020-08-27 1.0 1 test
    1 1.0 2020-08-27 1.0 2 train
    2 1.0 2020-08-27 1.0 3 NaN
    3 1.0 2020-08-27 1.0 4 ?
    df3.F.fillna(df3.D.mean(),inplace=True) # inplace表示在原有表修改
    df3
    
    A B C D E F
    0 1.0 2020-08-27 1.0 1 test 2.5
    1 1.0 2020-08-27 1.0 2 train 2.5
    2 1.0 2020-08-27 1.0 3 NaN 2.5
    3 1.0 2020-08-27 1.0 4 ? 2.5
    # 替换标记缺失值(非NaN)
    data2=df3.replace(to_replace='?',value=df3.D.mean())
    data2
    
    A B C D E F
    0 1.0 2020-08-27 1.0 1 test 2.5
    1 1.0 2020-08-27 1.0 2 train 2.5
    2 1.0 2020-08-27 1.0 3 NaN 2.5
    3 1.0 2020-08-27 1.0 4 2.5 2.5

    7.3.2 离散化

    • 分组
      • sr = pd.qcut(data,q) #自动分组(按分位数等频划分为q组)
      • sr = pd.cut(data,[区间]) #手动分组
    • 将分组好的结果换成one-hot编码
      • pd.get_dummies(sr,prefix=前缀标记)
    # 1 创建数据
    data = pd.Series([165,174,160,180,159,163,192,184],index=list(range(1,9)))
    # 2 手动分组
    bins = [150,165,180,195]
    sr = pd.cut(data,bins)
    print(sr.value_counts())
    # 3 one-hot编码
    pd.get_dummies(sr,prefix='身高_')
    
    (150, 165]    4
    (180, 195]    2
    (165, 180]    2
    dtype: int64
    
    身高__(150, 165] 身高__(165, 180] 身高__(180, 195]
    1 1 0 0
    2 0 1 0
    3 1 0 0
    4 0 1 0
    5 1 0 0
    6 1 0 0
    7 0 0 1
    8 0 0 1

    7.3.3 数据合并

    1. 拼接concat

    # 创建数据
    df1 = pd.DataFrame(np.ones((3,4))*0,columns=list('abcd'))
    df2 = pd.DataFrame(np.ones((3,4))*1,columns=list('bcde'))
    df3 = pd.DataFrame(np.ones((3,4))*2,columns=list('abcd'))
    print(df1)
    print(df2)
    print(df3)
    # 拼接
    res1 = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 纵向合并(axis=0),索引重新排序,缺失数据补NaN
    res1
    
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
         b    c    d    e
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
         a    b    c    d
    0  2.0  2.0  2.0  2.0
    1  2.0  2.0  2.0  2.0
    2  2.0  2.0  2.0  2.0
    
    a b c d e
    0 0.0 0.0 0.0 0.0 NaN
    1 0.0 0.0 0.0 0.0 NaN
    2 0.0 0.0 0.0 0.0 NaN
    3 NaN 1.0 1.0 1.0 1.0
    4 NaN 1.0 1.0 1.0 1.0
    5 NaN 1.0 1.0 1.0 1.0
    6 2.0 2.0 2.0 2.0 NaN
    7 2.0 2.0 2.0 2.0 NaN
    8 2.0 2.0 2.0 2.0 NaN
    '''
    join参数:默认outer
        outer:缺失数据补NaN
        inner:删除缺失数据列
    '''
    res2=pd.concat([df1,df2],join='inner',ignore_index=True)
    res2
    
    b c d
    0 0.0 0.0 0.0
    1 0.0 0.0 0.0
    2 0.0 0.0 0.0
    3 1.0 1.0 1.0
    4 1.0 1.0 1.0
    5 1.0 1.0 1.0

    2. 添加append(注:DataFrame.append 已在 pandas 2.0 中移除,新版本请改用 pd.concat)

    # 添加dataframe
    res3 = df1.append(df2,ignore_index=True)
    res3
    
    a b c d e
    0 0.0 0.0 0.0 0.0 NaN
    1 0.0 0.0 0.0 0.0 NaN
    2 0.0 0.0 0.0 0.0 NaN
    3 NaN 1.0 1.0 1.0 1.0
    4 NaN 1.0 1.0 1.0 1.0
    5 NaN 1.0 1.0 1.0 1.0
    # 添加行
    sr = pd.Series([1,2,3,4],index=list('abcd'))
    res4 = df1.append(sr,ignore_index=True)
    res4
    
    a b c d
    0 0.0 0.0 0.0 0.0
    1 0.0 0.0 0.0 0.0
    2 0.0 0.0 0.0 0.0
    3 1.0 2.0 3.0 4.0

    3. 融合Merge

    #依据一组key合并
    left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                                 'A': ['A0', 'A1', 'A2', 'A3'],
                                 'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})
    res = pd.merge(left, right, on='key')
    print(left)
    print(right)
    res
    
      key   A   B
    0  K0  A0  B0
    1  K1  A1  B1
    2  K2  A2  B2
    3  K3  A3  B3
      key   C   D
    0  K0  C0  D0
    1  K1  C1  D1
    2  K2  C2  D2
    3  K3  C3  D3
    
    key A B C D
    0 K0 A0 B0 C0 D0
    1 K1 A1 B1 C1 D1
    2 K2 A2 B2 C2 D2
    3 K3 A3 B3 C3 D3
    # 依据两组key合并
    left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                          'key2': ['K0', 'K1', 'K0', 'K1'],
                          'A': ['A0', 'A1', 'A2', 'A3'],
                          'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                           'key2': ['K0', 'K0', 'K0', 'K0'],
                           'C': ['C0', 'C1', 'C2', 'C3'],
                           'D': ['D0', 'D1', 'D2', 'D3']})
    print('left表:')
    print(left)
    print('\nright表:')
    print(right)
    #依据key1与key2 columns进行合并,并打印出四种结果['left', 'right', 'outer', 'inner']
    res = pd.merge(left, right, on=['key1', 'key2'], how='inner') #只合并两张表key都有的数据
    print('\ninner方式:')
    print(res)
    res = pd.merge(left, right, on=['key1', 'key2'], how='outer')#没有的数据补NaN
    print('\nouter方式:')
    print(res)
    res = pd.merge(left, right, on=['key1', 'key2'], how='left') #以左表为基准,右表匹配
    print('\nleft方式:')
    print(res)
    res = pd.merge(left, right, on=['key1', 'key2'], how='right') #以右表为基准,左表匹配
    print('\nright方式:')
    print(res)
    
    left表:
      key1 key2   A   B
    0   K0   K0  A0  B0
    1   K0   K1  A1  B1
    2   K1   K0  A2  B2
    3   K2   K1  A3  B3
    
    right表:
      key1 key2   C   D
    0   K0   K0  C0  D0
    1   K1   K0  C1  D1
    2   K1   K0  C2  D2
    3   K2   K0  C3  D3
    
    inner方式:
      key1 key2   A   B   C   D
    0   K0   K0  A0  B0  C0  D0
    1   K1   K0  A2  B2  C1  D1
    2   K1   K0  A2  B2  C2  D2
    
    outer方式:
      key1 key2    A    B    C    D
    0   K0   K0   A0   B0   C0   D0
    1   K0   K1   A1   B1  NaN  NaN
    2   K1   K0   A2   B2   C1   D1
    3   K1   K0   A2   B2   C2   D2
    4   K2   K1   A3   B3  NaN  NaN
    5   K2   K0  NaN  NaN   C3   D3
    
    left方式:
      key1 key2   A   B    C    D
    0   K0   K0  A0  B0   C0   D0
    1   K0   K1  A1  B1  NaN  NaN
    2   K1   K0  A2  B2   C1   D1
    3   K1   K0  A2  B2   C2   D2
    4   K2   K1  A3  B3  NaN  NaN
    
    right方式:
      key1 key2    A    B   C   D
    0   K0   K0   A0   B0  C0  D0
    1   K1   K0   A2   B2  C1  D1
    2   K1   K0   A2   B2  C2  D2
    3   K2   K0  NaN  NaN  C3  D3
    
    #依据index合并
    left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                         'B': ['B0', 'B1', 'B2']},
                         index=['K0', 'K1', 'K2'])
    right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                          'D': ['D0', 'D2', 'D3']},
                         index=['K0', 'K2', 'K3'])
    
    print(left)
    print(right)
    # outer方式
    res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
    print(res)
    # inner方式
    res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
    res
    
         A   B
    K0  A0  B0
    K1  A1  B1
    K2  A2  B2
         C   D
    K0  C0  D0
    K2  C2  D2
    K3  C3  D3
          A    B    C    D
    K0   A0   B0   C0   D0
    K1   A1   B1  NaN  NaN
    K2   A2   B2   C2   D2
    K3  NaN  NaN   C3   D3
    
    A B C D
    K0 A0 B0 C0 D0
    K2 A2 B2 C2 D2

    7.3.4 透视表(pivot table)

    df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                       'B': ['A', 'B', 'C'] * 4,
                       'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                       'D': np.random.randn(12),
                       'E': np.random.randn(12)})
    print(df)
    pivot=pd.pivot_table(df,values='D',index=['A','B'],columns='C')
    print(pivot.columns)
    pivot
    
            A  B    C         D         E
    0     one  A  foo  2.847002 -0.341067
    1     one  B  foo -0.764842  1.078190
    2     two  C  foo  0.002059  0.414781
    3   three  A  bar -0.174984  0.084828
    4     one  B  bar -2.018801 -1.122346
    5     one  C  bar  1.576535  0.551934
    6     two  A  foo -0.427333 -0.990089
    7   three  B  foo -0.907410 -0.541668
    8     one  C  foo -0.988257  2.493991
    9     one  A  bar -0.560151 -1.124036
    10    two  B  bar  1.333048 -0.620632
    11  three  C  bar  0.735043 -0.102446
    Index(['bar', 'foo'], dtype='object', name='C')
    
    C bar foo
    A B
    one A -0.560151 2.847002
    B -2.018801 -0.764842
    C 1.576535 -0.988257
    three A -0.174984 NaN
    B NaN -0.907410
    C 0.735043 NaN
    two A NaN -0.427333
    B 1.333048 NaN
    C NaN 0.002059
    bill = pd.read_csv('./data/bill.csv',encoding='gb2312')
    bill.时间 = pd.to_datetime(bill.时间).dt.normalize() #去除时间保留日期
    # 按‘分类’分组
    pivot = pd.pivot_table(bill,index='分类',columns='时间',values='支出',aggfunc='sum').reset_index()
    # 查询单天支出超过1笔的日期
    pivot.loc[:,pivot.count(axis=0)>1]
    
    时间 分类 2020-03-13 00:00:00 2020-06-17 00:00:00 2020-07-12 00:00:00 2020-08-20 00:00:00
    0 App购买 NaN 74.0 NaN NaN
    1 交通 NaN NaN NaN NaN
    2 其它 NaN NaN NaN NaN
    3 医疗 NaN NaN NaN 280.0
    4 发红包 NaN NaN NaN NaN
    5 学习 NaN NaN NaN 3900.0
    6 就诊 NaN NaN NaN NaN
    7 旅行 NaN NaN 399.0 NaN
    8 电器 NaN NaN NaN NaN
    9 电子产品 NaN NaN NaN NaN
    10 租金 NaN NaN NaN 1000.0
    11 衣服 158.0 79.0 NaN NaN
    12 话费网费 400.0 NaN 114.0 NaN
    13 请客送礼 NaN NaN NaN NaN
    
    
  • 相关阅读:
    Android studio关于点击事件后的页面跳转,选择完成后返回(onActivityResult)
    关于Android对话框简单实用方法总结
    Eclipse键盘输出文字,显示到屏幕上方法
    indexOf实际试用方法
    LiteOS裸机驱动移植01-以LED为例说明驱动移植
    LiteOS内核教程06-内存管理
    LiteOS内核教程05-互斥锁
    LiteOS内核教程04-信号量
    LiteOS内核教程03-任务管理
    LiteOS内核教程02-HelloWorld
  • 原文地址:https://www.cnblogs.com/liuwenzhen/p/13589395.html
Copyright © 2011-2022 走看看