zoukankan      html  css  js  c++  java
  • Pandas缺失值处理

    # Import libraries
    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer
    try:
        # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
        from sklearn.preprocessing import Imputer
    except ImportError:
        Imputer = None  # not available on modern scikit-learn; use SimpleImputer
    
    # Build a 6x4 frame of random normal values, then punch two holes in it
    df = pd.DataFrame(
        np.random.randn(6, 4),
        columns=['col1', 'col2', 'col3', 'col4'],
    )
    df.iloc[1, 1] = np.nan  # missing value at row 1, second column (col2)
    df.iloc[4, 3] = np.nan  # missing value at row 4, fourth column (col4)
    print(df)  # show the frame including the NaN cells
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128       NaN -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114       NaN
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Locate the missing values
    nan_all = df.isna()  # boolean mask, True wherever a cell is NaN
    print(nan_all)
        col1   col2   col3   col4
    0  False  False  False  False
    1  False   True  False  False
    2  False  False  False  False
    3  False  False  False  False
    4  False  False  False   True
    5  False  False  False  False
    
    # Flag the columns that contain at least one NA value
    nan_col1 = df.isna().any(axis=0)
    print(nan_col1)
    col1    False
    col2     True
    col3    False
    col4     True
    dtype: bool
    
    # Flag the columns in which every value is NA
    nan_col2 = df.isna().all(axis=0)
    print(nan_col2)
    col1    False
    col2    False
    col3    False
    col4    False
    dtype: bool
    
    # Drop missing values
    df2 = df.dropna(axis=0, how='any')  # discard every row that contains an NA
    print(df2)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Handle missing values with scikit-learn's imputation transformer.
    # SimpleImputer replaces preprocessing.Imputer (removed in scikit-learn 0.22):
    # the NaN marker is np.nan rather than the string 'NaN', and imputation is
    # always column-wise, so the old axis=0 argument no longer exists.
    nan_model = SimpleImputer(missing_values=np.nan, strategy='mean')  # rule: replace NaN with the column mean
    nan_result = nan_model.fit_transform(df)  # learn the column means and fill the gaps
    print(nan_result)  # print the imputed values
    [[-0.97751051 -0.56633185 -0.52993389  1.48969465]
     [-0.49112788 -0.25284792 -0.81117388 -1.10271738]
     [ 0.38577678 -0.63882219  0.32595345 -0.24077995]
     [ 0.93835121 -0.74688892  0.37519957 -0.71526484]
     [ 1.10341788  0.23895916 -0.45911413 -0.32144373]
     [ 1.00217657  0.4488442  -0.58463419 -1.03815116]]
    
    # Handle missing values with pandas
    # bfill() is the supported spelling of fillna(method='backfill'), whose
    # `method` keyword is deprecated since pandas 2.1.
    nan_result_pd1 = df.bfill()  # fill each NaN with the next valid value below it in the same column
    print(nan_result_pd1)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128 -0.638822 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114 -1.038151
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Backward-fill, but fill at most one consecutive NaN in any gap: `limit`
    # caps the run of consecutive fills, it is not a per-column quota of fills.
    # bfill(limit=...) replaces the deprecated fillna(method='bfill', limit=...).
    nan_result_pd2 = df.bfill(limit=1)
    print(nan_result_pd2)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128 -0.638822 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114 -1.038151
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Forward-fill: replace each NaN with the last valid value above it in the
    # same column. ffill() replaces the deprecated fillna(method='pad').
    nan_result_df3 = df.ffill()
    print(nan_result_df3)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128 -0.566332 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114 -0.715265
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Replace every missing value with the constant 0
    nan_result_df4 = df.fillna(value=0)
    print(nan_result_df4)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128  0.000000 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114  0.000000
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Replace missing values with a different constant for each column
    nan_result_df5 = df.fillna(value={'col2': 1.1, 'col4': 1.2})
    print(nan_result_df5)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128  1.100000 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114  1.200000
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Fill NaNs with each column's own mean; only the means of col2 through
    # col4 (label slice, inclusive) are passed as fill values.
    nan_result_df6 = df.fillna(value=df.mean(axis=0).loc['col2':'col4'])
    print(nan_result_df6)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128 -0.252848 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114 -0.321444
    5  1.002177  0.448844 -0.584634 -1.038151
    
    # Replace missing values through pandas' generic replace()
    nan_result_df7 = df.replace(to_replace=np.nan, value=0)
    print(nan_result_df7)
           col1      col2      col3      col4
    0 -0.977511 -0.566332 -0.529934  1.489695
    1 -0.491128  0.000000 -0.811174 -1.102717
    2  0.385777 -0.638822  0.325953 -0.240780
    3  0.938351 -0.746889  0.375200 -0.715265
    4  1.103418  0.238959 -0.459114  0.000000
    5  1.002177  0.448844 -0.584634 -1.038151
    
    
  • 相关阅读:
    MongoVUE破解方法(转)
    Apache和IIS共享80端口,支持多域名
    让作业飞吧,与屌丝兄弟们分享我的分布式作业调度平台 【拥抱开源,拥抱作业调度的神器Quartz.net】
    关于Nbearlite 访问PostgreSql,MySql,Sqlite的Bug
    php5.4.6/5.3.16/5.2.17安装(In windows),配置(转)
    MSSQL翻页存储过程
    话说客户端连接mongoDB的连接参数(转载)
    关于Windows频繁打开关闭端口时出现的问题(转至老赵)
    zeromq的几种模式(转)
    如何设置代理服务器上网
  • 原文地址:https://www.cnblogs.com/hankleo/p/11462830.html
Copyright © 2011-2022 走看看