zoukankan      html  css  js  c++  java
  • Spark2 Dataset DataFrame空值null,NaN判断和处理

    import org.apache.spark.sql.SparkSession 
    import org.apache.spark.sql.Dataset 
    import org.apache.spark.sql.Row 
    import org.apache.spark.sql.DataFrame 
    import org.apache.spark.sql.Column 
    import org.apache.spark.sql.DataFrameReader 
    import org.apache.spark.rdd.RDD 
    import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 
    import org.apache.spark.sql.Encoder 
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.DataFrameStatFunctions 
    import org.apache.spark.ml.linalg.Vectors 
    
    
    math.sqrt(-1.0) 
    res43: Double = NaN 
       
    math.sqrt(-1.0).isNaN() 
    res44: Boolean = true
      
      
    val data1 = data.toDF("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating") 
    data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
    data1.limit(10).show 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |      0|  male| 37|          10|      no|            3|       18|         7|     4| 
    |      0|  null| 27|        null|      no|            4|       14|         6|  null| 
    |      0|  null| 32|        null|     yes|            1|       12|         1|  null| 
    |      0|  null| 57|        null|     yes|            5|       18|         6|  null| 
    |      0|  null| 22|        null|      no|            2|       17|         6|  null| 
    |      0|  null| 32|        null|      no|            2|       17|         5|  null| 
    |      0|female| 22|        null|      no|            2|       12|         1|  null| 
    |      0|  male| 57|          15|     yes|            2|       14|         4|     4| 
    |      0|female| 32|          15|     yes|            4|       16|         1|     2| 
    |      0|  male| 22|         1.5|      no|            4|       14|         4|     5| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
       
     // 删除所有列的空值和NaN 
    val resNull=data1.na.drop() 
    resNull: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
     resNull.limit(10).show() 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |      0|  male| 37|          10|      no|            3|       18|         7|     4| 
    |      0|  male| 57|          15|     yes|            2|       14|         4|     4| 
    |      0|female| 32|          15|     yes|            4|       16|         1|     2| 
    |      0|  male| 22|         1.5|      no|            4|       14|         4|     5| 
    |      0|  male| 37|          15|     yes|            2|       20|         7|     2| 
    |      0|  male| 27|           4|     yes|            4|       18|         6|     4| 
    |      0|  male| 47|          15|     yes|            5|       17|         6|     4| 
    |      0|female| 22|         1.5|      no|            2|       17|         5|     4| 
    |      0|female| 27|           4|      no|            4|       14|         5|     4| 
    |      0|female| 37|          15|     yes|            1|       17|         5|     5| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
       
     //删除某列的空值和NaN
    val res=data1.na.drop(Array("gender","yearsmarried")) 
    
    // 删除某列的非空且非NaN的低于10的
    data1.na.drop(10,Array("gender","yearsmarried")) 
       
       
     //填充所有空值的列 
    val res123=data1.na.fill("wangxiao123") 
    res123: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
     res123.limit(10).show() 
    +-------+-----------+---+------------+--------+-------------+---------+----------+-----------+ 
    |affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|     rating| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+-----------+ 
    |      0|       male| 37|          10|      no|            3|       18|         7|          4| 
    |      0|wangxiao123| 27| wangxiao123|      no|            4|       14|         6|wangxiao123| 
    |      0|wangxiao123| 32| wangxiao123|     yes|            1|       12|         1|wangxiao123| 
    |      0|wangxiao123| 57| wangxiao123|     yes|            5|       18|         6|wangxiao123| 
    |      0|wangxiao123| 22| wangxiao123|      no|            2|       17|         6|wangxiao123| 
    |      0|wangxiao123| 32| wangxiao123|      no|            2|       17|         5|wangxiao123| 
    |      0|     female| 22| wangxiao123|      no|            2|       12|         1|wangxiao123| 
    |      0|       male| 57|          15|     yes|            2|       14|         4|          4| 
    |      0|     female| 32|          15|     yes|            4|       16|         1|          2| 
    |      0|       male| 22|         1.5|      no|            4|       14|         4|          5| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+-----------+ 
       
     //对指定的列空值填充 
     val res2=data1.na.fill(value="wangxiao111",cols=Array("gender","yearsmarried") ) 
    res2: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
     res2.limit(10).show() 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
    |affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
    |      0|       male| 37|          10|      no|            3|       18|         7|     4| 
    |      0|wangxiao111| 27| wangxiao111|      no|            4|       14|         6|  null| 
    |      0|wangxiao111| 32| wangxiao111|     yes|            1|       12|         1|  null| 
    |      0|wangxiao111| 57| wangxiao111|     yes|            5|       18|         6|  null| 
    |      0|wangxiao111| 22| wangxiao111|      no|            2|       17|         6|  null| 
    |      0|wangxiao111| 32| wangxiao111|      no|            2|       17|         5|  null| 
    |      0|     female| 22| wangxiao111|      no|            2|       12|         1|  null| 
    |      0|       male| 57|          15|     yes|            2|       14|         4|     4| 
    |      0|     female| 32|          15|     yes|            4|       16|         1|     2| 
    |      0|       male| 22|         1.5|      no|            4|       14|         4|     5| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
       
       
    val res3=data1.na.fill(Map("gender"->"wangxiao222","yearsmarried"->"wangxiao567") ) 
    res3: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
     res3.limit(10).show() 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
    |affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|rating| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
    |      0|       male| 37|          10|      no|            3|       18|         7|     4| 
    |      0|wangxiao222| 27| wangxiao567|      no|            4|       14|         6|  null| 
    |      0|wangxiao222| 32| wangxiao567|     yes|            1|       12|         1|  null| 
    |      0|wangxiao222| 57| wangxiao567|     yes|            5|       18|         6|  null| 
    |      0|wangxiao222| 22| wangxiao567|      no|            2|       17|         6|  null| 
    |      0|wangxiao222| 32| wangxiao567|      no|            2|       17|         5|  null| 
    |      0|     female| 22| wangxiao567|      no|            2|       12|         1|  null| 
    |      0|       male| 57|          15|     yes|            2|       14|         4|     4| 
    |      0|     female| 32|          15|     yes|            4|       16|         1|     2| 
    |      0|       male| 22|         1.5|      no|            4|       14|         4|     5| 
    +-------+-----------+---+------------+--------+-------------+---------+----------+------+ 
       
     //查询空值列 
    data1.filter("gender is null").select("gender").limit(10).show 
    +------+ 
    |gender| 
    +------+ 
    |  null| 
    |  null| 
    |  null| 
    |  null| 
    |  null| 
    +------+ 
       
       
     data1.filter("gender is not null").select("gender").limit(10).show 
    +------+ 
    |gender| 
    +------+ 
    |  male| 
    |female| 
    |  male| 
    |female| 
    |  male| 
    |  male| 
    |  male| 
    |  male| 
    |female| 
    |female| 
    +------+ 
       
       
     data1.filter( data1("gender").isNull ).select("gender").limit(10).show 
    +------+ 
    |gender| 
    +------+ 
    |  null| 
    |  null| 
    |  null| 
    |  null| 
    |  null| 
    +------+ 
       
       
     data1.filter("gender<>''").select("gender").limit(10).show 
    +------+ 
    |gender| 
    +------+ 
    |  male| 
    |female| 
    |  male| 
    |female| 
    |  male| 
    |  male| 
    |  male| 
    |  male| 
    |female| 
    |female| 
    +------+ 
       
       
    
     
    
  • 相关阅读:
    Django源码解析(1):启动程序
    python之importlib模块
    Django中间件:CsrfViewMiddleware
    Django的admin组件
    Linux学习之CentOS--CentOS6.4下Mysql数据库的安装与配置【转】
    C#读取Xml【转】
    在eclipse导入项目的步骤【转】
    Spring学习(一)——Spring中的依赖注入简介【转】
    Spring学习(二)——Spring中的AOP的初步理解[转]
    Spring之AOP
  • 原文地址:https://www.cnblogs.com/wwxbi/p/6011422.html
Copyright © 2011-2022 走看看