zoukankan      html  css  js  c++  java
  • Python数据分析(二)pandas缺失值处理

    import pandas as pd
    import numpy as np
    
    # Demo script: detecting, filtering and filling missing values in pandas.
    #
    # Build a 5x3 frame of random normal values, then reindex it so that the
    # new labels 'b', 'd' and 'g' show up as rows made entirely of NaN.
    # The data is random and unseeded, so every "Sample output" listing below
    # comes from one example run; the numeric values differ from run to run.
    df = pd.DataFrame(
        np.random.randn(5, 3),
        index=['a', 'c', 'e', 'f', 'h'],
        columns=['one', 'two', 'three'],
    )

    df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
    print(df)

    # ------------------------------------------------------------------
    # Detecting missing values
    # ------------------------------------------------------------------
    print('################缺失值判断######################')
    print('--------Series的缺失值判断---------')
    # Boolean mask over a single column (Series).
    print(df['one'].isnull())
    # Sample output:
    # a    False
    # b     True
    # c    False
    # d     True
    # e    False
    # f    False
    # g     True
    # h    False
    # Name: one, dtype: bool

    print('---------输出Series缺失值和索引--------')
    # Keep only the entries of the column that are missing, with their labels.
    print(df['one'][df['one'].isnull()])
    # Sample output:
    # b   NaN
    # d   NaN
    # g   NaN
    # Name: one, dtype: float64

    print('--------dataframe的缺失值判断---------')
    # Element-wise boolean mask over the whole frame.
    print(df.isnull())
    # Sample output:
    #      one    two  three
    # a  False  False  False
    # b   True   True   True
    # c  False  False  False
    # d   True   True   True
    # e  False  False  False
    # f  False  False  False
    # g   True   True   True
    # h  False  False  False

    print('--------输出dataframe的缺失值和索引---------')
    # Select the rows that contain at least one missing cell.  A 1-D row mask
    # returns each matching row exactly once; indexing with the 2-D array
    # `df.isnull().values == True` would repeat a row once per missing cell
    # and require de-duplicating the index afterwards.
    data = df[df.isnull().any(axis=1)]
    print(data)
    # Sample output:
    #    one  two  three
    # b  NaN  NaN    NaN
    # d  NaN  NaN    NaN
    # g  NaN  NaN    NaN

    print('--------输出dataframe的有缺失值的列---------')
    # Per-column flag: True when the column holds at least one missing value.
    print(df.isnull().any())
    # Sample output:
    # one      True
    # two      True
    # three    True
    # dtype: bool

    # ------------------------------------------------------------------
    # Filtering out missing values
    # ------------------------------------------------------------------
    print('################缺失值过滤######################')
    print('--------Series的缺失值过滤---------')
    print(df['one'].isnull())
    # Sample output:
    # a    False
    # b     True
    # c    False
    # d     True
    # e    False
    # f    False
    # g     True
    # h    False
    # Name: one, dtype: bool

    print('--------使用dropna方法删除缺失数据,返回一个删除后的Series--------')
    # dropna() returns a new Series without the missing entries; the
    # original column is left untouched.
    print(df['one'].dropna())
    # Sample output:
    # a   -0.211055
    # c   -0.870090
    # e   -0.203259
    # f    0.490568
    # h    1.437819
    # Name: one, dtype: float64

    print('--------dataframe的缺失值过滤---------')
    # Default how="any": drop every row that has at least one missing value.
    print(df.dropna())
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # c -0.870090 -0.878423  1.071588
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # h  1.437819 -0.370934 -0.482307

    print('-------当行全为NaN的时候,才删除,参数how默认是any,含有缺失值就删除--------')
    # how="all": drop a row only when every value in it is missing.  In this
    # frame the missing rows are entirely NaN, so the result matches the
    # default how="any" call above.
    print(df.dropna(how="all"))
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # c -0.870090 -0.878423  1.071588
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # h  1.437819 -0.370934 -0.482307

    # ------------------------------------------------------------------
    # Filling missing values
    # ------------------------------------------------------------------
    print('################缺失值填充######################')
    print('------指定特殊值填充缺失值-------')
    # Replace every missing value with one constant.
    print(df.fillna(0))
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # b  0.000000  0.000000  0.000000
    # c -0.870090 -0.878423  1.071588
    # d  0.000000  0.000000  0.000000
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # g  0.000000  0.000000  0.000000
    # h  1.437819 -0.370934 -0.482307

    print('------不同的列用不同的值填充------')
    # A dict maps each column name to its own fill value.
    print(df.fillna({'one': 1, 'two': 2, 'three': 3}))
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # b  1.000000  2.000000  3.000000
    # c -0.870090 -0.878423  1.071588
    # d  1.000000  2.000000  3.000000
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # g  1.000000  2.000000  3.000000
    # h  1.437819 -0.370934 -0.482307

    print('------前向填充------')
    # Forward fill: propagate the last valid row downwards.  ffill() is used
    # instead of fillna(method="ffill"), whose `method` keyword is deprecated
    # in recent pandas releases.
    print(df.ffill())
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # b -0.211055 -2.869212  0.022179
    # c -0.870090 -0.878423  1.071588
    # d -0.870090 -0.878423  1.071588
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # g  0.490568 -0.968058 -0.999899
    # h  1.437819 -0.370934 -0.482307

    print('------后向填充------')
    # Backward fill: propagate the next valid row upwards (bfill() replaces
    # the deprecated fillna(method="bfill")).
    print(df.bfill())
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # b -0.870090 -0.878423  1.071588
    # c -0.870090 -0.878423  1.071588
    # d -0.203259  0.315897  0.495306
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # g  1.437819 -0.370934 -0.482307
    # h  1.437819 -0.370934 -0.482307

    print('------平均值填充------')
    # df.mean() yields one mean per column (NaN skipped by default); passing
    # that Series to fillna fills each column with its own mean.
    print(df.fillna(df.mean()))
    # Sample output:
    #         one       two     three
    # a -0.211055 -2.869212  0.022179
    # b  0.128797 -0.954146  0.021373
    # c -0.870090 -0.878423  1.071588
    # d  0.128797 -0.954146  0.021373
    # e -0.203259  0.315897  0.495306
    # f  0.490568 -0.968058 -0.999899
    # g  0.128797 -0.954146  0.021373
    # h  1.437819 -0.370934 -0.482307
  • 相关阅读:
    python开发--信息处理系统
    oracle--表空间操作
    oracle--表空间基本操作
    CPU、内存、磁盘的瓶颈(转载文)
    在pycharm中进行ORM操作
    contenttypes组件 (处理大量外键)
    Django Rest Framework 请求流程
    Python标准库--UUID
    Django REST framework基础:版本、认证、权限、限制
    Django Rest Framework 视图和路由
  • 原文地址:https://www.cnblogs.com/jinqier/p/9337562.html
Copyright © 2011-2022 走看看