zoukankan      html  css  js  c++  java
  • Spark Scala:分组取每组最新日期(max date)的几条记录

    // Sample employee rows for the walkthrough.
    // Tuple layout: (emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary).
    // year_joined and emp_dept_id are string values; emp_id 1 and 2 each occur more than once,
    // so grouping by emp_id produces groups with several rows.
    val emp = Seq(
      (1, "Smith", -1, "2018", "10", "M", 3000),
      (2, "Rose", 1, "2010", "20", "M", 4000),
      (1, "Williams", 1, "2020", "10", "M", 1000),
      (2, "Jones", 2, "2005", "10", "F", 2000),
      (1, "Brown", 2, "2020", "40", "", -1),
      (6, "Brown", 2, "2010", "50", "", -1)
    )

    // Column names, listed in the same order as the tuple fields above.
    val empColumns = Seq(
      "emp_id",
      "name",
      "superior_emp_id",
      "year_joined",
      "emp_dept_id",
      "gender",
      "salary"
    )

    // Needed for the toDF conversion on a local Seq.
    // `spark` is expected to be defined outside this snippet (e.g. by spark-shell).
    import spark.sqlContext.implicits._

    // Build the DataFrame with the named columns and print it with untruncated cells.
    val empDF = emp.toDF(empColumns: _*)
    empDF.show(truncate = false)
    
    // Alias the employee DataFrame as b (the left side of the joins further down) and print it.
    scala> val b = empDF
    scala> b.show
    +------+--------+---------------+-----------+-----------+------+------+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
    +------+--------+---------------+-----------+-----------+------+------+
    |     1|   Smith|             -1|       2018|         10|     M|  3000|
    |     2|    Rose|              1|       2010|         20|     M|  4000|
    |     1|Williams|              1|       2020|         10|     M|  1000|
    |     2|   Jones|              2|       2005|         10|     F|  2000|
    |     1|   Brown|              2|       2020|         40|      |    -1|
    |     6|   Brown|              2|       2010|         50|      |    -1|
    +------+--------+---------------+-----------+-----------+------+------+
    
    // Per-emp_id maximum of year_joined, exposed under the column name "max".
    // The echoed result type reports max as string, so the aggregate runs on string values.
    // NOTE(review): that agrees with chronological order here only because every year_joined
    // value in the sample is a four-digit string; confirm before reusing with other formats.
    scala> val a = empDF.groupBy("emp_id").agg(max("year_joined").alias("max"))
    a: org.apache.spark.sql.DataFrame = [emp_id: int, max: string]
    
    // Print one row per emp_id with its maximum year_joined.
    scala> a.show
    +------+----+
    |emp_id| max|
    +------+----+
    |     1|2020|
    |     6|2010|
    |     2|2010|
    +------+----+
    
    // Left-join the per-emp_id maximum back onto every employee row, matching on emp_id.
    scala> b.join(a, Seq("emp_id"), "left").show
    +------+--------+---------------+-----------+-----------+------+------+----+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary| max|
    +------+--------+---------------+-----------+-----------+------+------+----+
    |     1|   Smith|             -1|       2018|         10|     M|  3000|2020|
    |     2|    Rose|              1|       2010|         20|     M|  4000|2010|
    |     1|Williams|              1|       2020|         10|     M|  1000|2020|
    |     2|   Jones|              2|       2005|         10|     F|  2000|2010|
    |     1|   Brown|              2|       2020|         40|      |    -1|2020|
    |     6|   Brown|              2|       2010|         50|      |    -1|2010|
    +------+--------+---------------+-----------+-----------+------+------+----+
    
    // Keep only the rows whose year_joined equals their emp_id group's maximum, i.e. the
    // latest record(s) per emp_id; rows that tie for the maximum are all retained.
    // The filter is a fixed SQL expression with nothing to interpolate, so it is written
    // as a plain string literal rather than with the s-interpolator.
    scala> b.join(a, Seq("emp_id"), "left").where("year_joined = max").show
    +------+--------+---------------+-----------+-----------+------+------+----+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary| max|
    +------+--------+---------------+-----------+-----------+------+------+----+
    |     2|    Rose|              1|       2010|         20|     M|  4000|2010|
    |     1|Williams|              1|       2020|         10|     M|  1000|2020|
    |     1|   Brown|              2|       2020|         40|      |    -1|2020|
    |     6|   Brown|              2|       2010|         50|      |    -1|2010|
    +------+--------+---------------+-----------+-----------+------+------+----+
    参考:
    https://sparkbyexamples.com/spark/spark-sql-dataframe-join/
    https://stackoverflow.com/questions/39699495/spark-2-0-groupby-column-and-then-get-maxdate-on-a-datetype-column?rq=1

    如果没有一直坚持,也不会有质的飞跃,当生命有了限度,每个人的价值就会浮现。

    船长博客,期待共同交流提高!

    本文如对您有帮助,记得点击右下边小球【赞一下】,热烈期待您关注博客 n(*≧▽≦*)n

    0成本创业_月入5000被动收入

  • 相关阅读:
    FileReader:读取本地图片文件并显示
    uploadfy插件结合php案例
    php 生成二维码,图片上传到又拍云
    php get/post 请求(可用于请求api)获取手机号码归属地
    php中curl的详细解说
    聊聊Web App、Hybrid App与Native App的设计差异
    我的前端之路
    使用angular.js开发的一个简易todo demo
    在线个人简历(续)
    在线个人简历
  • 原文地址:https://www.cnblogs.com/v5captain/p/14332296.html
Copyright © 2011-2022 走看看