zoukankan      html  css  js  c++  java
  • Pandas入门之十三:缺失值处理

    已信任
    Jupyter 服务器: 本地
    Python 3: Not Started
    [1]
    
    
    
    import pandas as pd
    import numpy as np
    [3]
    
    
    
    df = pd.DataFrame(np.random.randn(5,3),index=['a','b','e','f','h'],columns=['one','two','three'])
    df 
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [5]
    
    
    
    df = df.reindex(['a','b','c','d','e','f','h'])
    df
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    NaN    NaN    NaN
    d    NaN    NaN    NaN
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [6]
    
    
    
    # 检查数据是否为空
    df['one'].isnull()
    a    False
    b    False
    c     True
    d     True
    e    False
    f    False
    h    False
    Name: one, dtype: bool
    [7]
    
    
    
    # 检查数据是否非空
    df['one'].notnull()
    a     True
    b     True
    c    False
    d    False
    e     True
    f     True
    h     True
    Name: one, dtype: bool
    [8]
    
    
    
    # 提取空
    df[df['one'].isnull()]
    one    two    three
    c    NaN    NaN    NaN
    d    NaN    NaN    NaN
    [9]
    
    
    
    #提取非空
    df[df['one'].notnull()]
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [10]
    
    
    
    df
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    NaN    NaN    NaN
    d    NaN    NaN    NaN
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [11]
    
    
    
    # 计算第一列的和:求和时nan会被跳过(相当于按0处理);注意若该列全部为nan,默认结果是0而不是nan,如需得到nan请使用 sum(min_count=1)
    df['one'].sum()
    -1.9544119617918125
    [12]
    
    
    
    # 填充,把所有的nan填充为0
    df.fillna(0)
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    0.000000    0.000000    0.000000
    d    0.000000    0.000000    0.000000
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [16]
    
    
    
    df
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    NaN    NaN    NaN
    d    NaN    NaN    NaN
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [15]
    
    
    
    # 用每一列的平均值填充该列中的nan
    df.fillna(df.mean())
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    -0.390882    -0.090721    -0.109377
    d    -0.390882    -0.090721    -0.109377
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [17]
    
    
    
    # Forward-fill: each NaN is replaced by the last valid value above it in the same column.
    # ffill() is the equivalent of fillna(method='pad'), whose `method` keyword is
    # deprecated since pandas 2.1.
    df.ffill()
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    0.056577    -0.612873    -1.710761
    d    0.056577    -0.612873    -1.710761
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [18]
    
    
    
    # Backward-fill: each NaN is replaced by the next valid value below it in the same column.
    # bfill() is the equivalent of fillna(method='backfill'), whose `method` keyword is
    # deprecated since pandas 2.1.
    df.bfill()
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    1.000864    -0.708675    0.690998
    d    1.000864    -0.708675    0.690998
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [19]
    
    
    
    # 删除含有空值的数据:默认按行删除,只要某一行含有nan就整行删除
    df.dropna()
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [20]
    
    
    
    # 按列删除含有nan的列;由于每一列都含有nan,结果中没有任何列,只剩下索引
    df.dropna(axis=1)
    a
    b
    c
    d
    e
    f
    h
    [22]
    
    
    
    # 用replace替换缺失值(nan),也可以替换任意指定的值
    df.replace({np.nan:10})
    one    two    three
    a    -0.134914    -0.183527    1.455060
    b    0.056577    -0.612873    -1.710761
    c    10.000000    10.000000    10.000000
    d    10.000000    10.000000    10.000000
    e    1.000864    -0.708675    0.690998
    f    -2.126286    0.363740    -0.151361
    h    -0.750653    0.687731    -0.830824
    [24]
    
    
    
    df['four']=pd.Series([1,2,3,4,5,6,7],index=['a','b','c','d','e','f','h'])
    df
    one    two    three    four
    a    -0.134914    -0.183527    1.455060    1
    b    0.056577    -0.612873    -1.710761    2
    c    NaN    NaN    NaN    3
    d    NaN    NaN    NaN    4
    e    1.000864    -0.708675    0.690998    5
    f    -2.126286    0.363740    -0.151361    6
    h    -0.750653    0.687731    -0.830824    7
    [25]
    
    
    
    df.replace({np.nan:10,5:1000})
    one    two    three    four
    a    -0.134914    -0.183527    1.455060    1
    b    0.056577    -0.612873    -1.710761    2
    c    10.000000    10.000000    10.000000    3
    d    10.000000    10.000000    10.000000    4
    e    1.000864    -0.708675    0.690998    1000
    f    -2.126286    0.363740    -0.151361    6
    h    -0.750653    0.687731    -0.830824    7
    [-]
  • 相关阅读:
    AcRxClass::addX
    string.format("%s",name)
    strcmp 与 _tcscmp
    acedinitget
    判断实体的类型 相关操作
    accmcolor
    CAD类型转换
    图的存储结构及遍历
    并查集(Union/Find)
    设计模式--缺省适配器模式
  • 原文地址:https://www.cnblogs.com/vvzhang/p/15018149.html
Copyright © 2011-2022 走看看