zoukankan      html  css  js  c++  java
  • Spark Scala:分组后取每组最新日期(max date)的所有记录

    // Sample employee rows, one tuple per row:
    // (emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary).
    // emp_id repeats across rows, so grouping by it yields several rows per group.
    val emp = Seq(
      (1, "Smith", -1, "2018", "10", "M", 3000),
      (2, "Rose", 1, "2010", "20", "M", 4000),
      (1, "Williams", 1, "2020", "10", "M", 1000),
      (2, "Jones", 2, "2005", "10", "F", 2000),
      (1, "Brown", 2, "2020", "40", "", -1),
      (6, "Brown", 2, "2010", "50", "", -1)
    )

    // Column names, listed in the same order as the tuple fields above.
    val empColumns = Seq(
      "emp_id", "name", "superior_emp_id", "year_joined",
      "emp_dept_id", "gender", "salary"
    )

    // Import the implicits straight from the SparkSession rather than through the
    // legacy sqlContext handle; this is what provides toDF on a local Seq.
    import spark.implicits._
    val empDF = emp.toDF(empColumns: _*)
    empDF.show(false) // false = do not truncate cell values
    
    // b is just a second name for empDF; it is used as the left side of the joins below.
    scala> val b = empDF
    scala> b.show
    +------+--------+---------------+-----------+-----------+------+------+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
    +------+--------+---------------+-----------+-----------+------+------+
    |     1|   Smith|             -1|       2018|         10|     M|  3000|
    |     2|    Rose|              1|       2010|         20|     M|  4000|
    |     1|Williams|              1|       2020|         10|     M|  1000|
    |     2|   Jones|              2|       2005|         10|     F|  2000|
    |     1|   Brown|              2|       2020|         40|      |    -1|
    |     6|   Brown|              2|       2010|         50|      |    -1|
    +------+--------+---------------+-----------+-----------+------+------+
    
    // One row per emp_id holding the largest year_joined in that group, exposed as column "max".
    // year_joined is a string column, so the result is a string too and max() compares
    // text rather than numbers (equivalent here because every year has four digits).
    scala> val a = empDF.groupBy("emp_id").agg(max("year_joined").alias("max"))
    a: org.apache.spark.sql.DataFrame = [emp_id: int, max: string]
    
    scala> a.show
    +------+----+
    |emp_id| max|
    +------+----+
    |     1|2020|
    |     6|2010|
    |     2|2010|
    +------+----+
    
    // Attach each group's maximum to every original row. Joining on Seq("emp_id")
    // keeps a single emp_id column in the result instead of one per side.
    scala> b.join(a, Seq("emp_id"), "left").show
    +------+--------+---------------+-----------+-----------+------+------+----+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary| max|
    +------+--------+---------------+-----------+-----------+------+------+----+
    |     1|   Smith|             -1|       2018|         10|     M|  3000|2020|
    |     2|    Rose|              1|       2010|         20|     M|  4000|2010|
    |     1|Williams|              1|       2020|         10|     M|  1000|2020|
    |     2|   Jones|              2|       2005|         10|     F|  2000|2010|
    |     1|   Brown|              2|       2020|         40|      |    -1|2020|
    |     6|   Brown|              2|       2010|         50|      |    -1|2010|
    +------+--------+---------------+-----------+-----------+------+------+----+
    
    // Keep only the rows whose year_joined equals the maximum of their emp_id group;
    // ties are all retained, so a group may return more than one row.
    // The columns are compared as Column objects rather than through a SQL string, so the
    // column named "max" cannot be confused with the max() aggregate function.
    scala> b.join(a, Seq("emp_id"), "left").where($"year_joined" === $"max").show
    +------+--------+---------------+-----------+-----------+------+------+----+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary| max|
    +------+--------+---------------+-----------+-----------+------+------+----+
    |     2|    Rose|              1|       2010|         20|     M|  4000|2010|
    |     1|Williams|              1|       2020|         10|     M|  1000|2020|
    |     1|   Brown|              2|       2020|         40|      |    -1|2020|
    |     6|   Brown|              2|       2010|         50|      |    -1|2010|
    +------+--------+---------------+-----------+-----------+------+------+----+
    参考:
    https://sparkbyexamples.com/spark/spark-sql-dataframe-join/
    https://stackoverflow.com/questions/39699495/spark-2-0-groupby-column-and-then-get-maxdate-on-a-datetype-column?rq=1

    如果没有一直坚持,也不会有质的飞跃,当生命有了限度,每个人的价值就会浮现。

    船长博客,期待共同交流提高!

    本文如对您有帮助,记得点击右下边小球【赞一下】,热烈期待您关注博客 n(*≧▽≦*)n

    0成本创业_月入5000被动收入

  • 相关阅读:
    laravel windows下安装 gulp 和 laravel-elixir
    php-新特性,生成器的创建和使用
    laravel 使用极验验证码
    laravel 发送邮件
    laravel安装 redis 并驱动 session
    理解HTTP协议(转载)
    iOS中Block的用法,举例,解析与底层原理
    iOS自定义结构体
    dyld环境变量
    iOS中的静态库与动态库,区别、制作和使用
  • 原文地址:https://www.cnblogs.com/v5captain/p/14332296.html
Copyright © 2011-2022 走看看