zoukankan      html  css  js  c++  java
  • Spark DataFrame常用API

    Spark DataFrame常用API

    package com.imooc.bigdata.chapter04
    
    import org.apache.spark.sql.{DataFrame, SparkSession}
    
    /**
      * Walk-through of commonly used Spark DataFrame APIs.
      *
      * Loads two JSON data sets (`people.json` and `zips.json`) and runs a series of
      * select / filter / groupBy / orderBy examples, each shown both through the
      * DataFrame API and (where applicable) through the equivalent SQL statement.
      *
      * Usage: `DataFrameAPIApp [dataDir]` -- `dataDir` is the directory holding the two
      * JSON files; when omitted, [[DataFrameAPIApp.DefaultDataDir]] is used.
      */
    object DataFrameAPIApp {

      /**
        * Default directory containing the sample JSON files.
        *
        * Written with forward slashes: in a normal Scala string literal a single
        * backslash starts an escape sequence, and sequences such as `\S`, `\d` or `\p`
        * are not valid escapes, so a Windows path with bare backslashes does not compile.
        */
      private val DefaultDataDir: String =
        "E:/06-work/03-java/01-JavaCodeDome/SparkSqlCode/sparksql-train/data"

      /**
        * Entry point.
        *
        * @param args optional; `args(0)` overrides the data directory
        */
      def main(args: Array[String]): Unit = {
        val dataDir = args.headOption.getOrElse(DefaultDataDir)

        val spark = SparkSession.builder().master("local").appName("DataFrameAPIApp").getOrCreate()

        // Stop the session even when one of the example jobs fails.
        try {
          runPeopleExamples(spark, s"$dataDir/people.json")
          runZipsExamples(spark, s"$dataDir/zips.json")
        } finally {
          spark.stop()
        }
      }

      /** Basic select / filter / groupBy examples on the people data set. */
      private def runPeopleExamples(spark: SparkSession, path: String): Unit = {
        import spark.implicits._

        val people: DataFrame = spark.read.json(path)

        // Print the DataFrame's structure: column names, data types, nullability.
        people.printSchema()

        // Display the rows held by the DataFrame.
        people.show()

        // Keep only the name column ==> select name from people
        people.select("name").show()
        people.select($"name").show()

        // select * from people where age > 21
        people.filter($"age" > 21).show()
        people.filter("age > 21").show()

        // select age, count(1) from people group by age
        people.groupBy("age").count().show()

        // select name, age + 10 from people
        people.select($"name", ($"age" + 10).as("new_age")).show()

        // Same kind of query expressed through SQL on a temporary view.
        people.createOrReplaceTempView("people")
        spark.sql("select name from people where age > 21").show()
      }

      /** Row-retrieval, renaming and ordering examples on the zips data set. */
      private def runZipsExamples(spark: SparkSession, path: String): Unit = {
        import org.apache.spark.sql.functions.desc

        val zips: DataFrame = spark.read.json(path)
        zips.printSchema() // inspect the schema

        /*
         * With a plain show():
         *   1) long values such as loc are not displayed in full; beyond a certain
         *      length they are cut off with "..."
         *   2) only the first 20 rows are displayed
         * show() ==> show(20) ==> show(numRows, truncate = true)
         */
        zips.show(10, truncate = false)

        zips.head(3).foreach(println)
        // first() and take(n) return rows to the driver; the results are not used here,
        // the calls are kept only to demonstrate the API.
        zips.first()
        zips.take(5)

        val count: Long = zips.count()
        println(s"Total Counts: $count")

        // Keep rows with pop > 40000; withColumnRenamed renames a column.
        zips.filter(zips.col("pop") > 40000).withColumnRenamed("_id", "new_id").show(10, truncate = false)

        // The 10 California (state = "CA") entries with the largest pop, with id and city.
        // desc is a built-in function from org.apache.spark.sql.functions.
        zips
          .select("_id", "city", "pop", "state")
          .filter(zips.col("state") === "CA")
          .orderBy(desc("pop"))
          .show(10, truncate = false)

        // The same query in SQL.
        zips.createOrReplaceTempView("zips")
        spark.sql("select _id,city,pop,state from zips where state='CA' order by pop desc limit 10").show()
      }
    }
    

      

  • 相关阅读:
    使SourceInsight支持Python语言的方法
    图解Join
    EularProject 42:单词解码出来的三角形数
    android-async-http二次封装和调用
    #20 Valid Parentheses
    udev详解【转】
    linux下udev简介【转】
    Linux USB 驱动开发(一)—— USB设备基础概念【转】
    Android updater-scripts(Edify Script)各函数详细说明【转】
    OTA制作及升级过程笔记【转】
  • 原文地址:https://www.cnblogs.com/yoyo1216/p/13533892.html
Copyright © 2011-2022 走看看