zoukankan      html  css  js  c++  java
  • SparkSQL读写外部数据源--数据分区

    import com.twq.dataset.Utils._
    import org.apache.spark.sql.{SaveMode, SparkSession}
    
    /**
     * Demonstrates partitioned and bucketed writes with Spark SQL.
     *
     * Reads the `trackerSession` parquet dataset under `BASE_PATH`, rewrites it
     * partitioned by `year`/`month`/`day`, reads it back (as a whole and as a
     * single day's partition directory), and finally saves a partitioned and
     * bucketed copy as the table `session`.
     */
    object FilePartitionTest {

      /**
       * Entry point of the example.
       *
       * @param args command-line arguments (unused)
       */
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("FilePartitionTest")
          .getOrCreate()

        // Make sure the session is stopped even when one of the steps below fails.
        try {
          val sessions = spark.read.parquet(s"${BASE_PATH}/trackerSession")
          sessions.show()
          sessions.printSchema()

          // Baseline: query a single day on the non-partitioned data.
          sessions.createOrReplaceTempView("non_partition_table")
          spark.sql("select * from non_partition_table where day = 20170903").show()

          // Partition the data by year, month and day. This produces the
          // year=.../month=.../day=... directory layout that is read back below.
          sessions.write
            .mode(SaveMode.Overwrite)
            .partitionBy("year", "month", "day")
            .parquet(s"${BASE_PATH}/trackerSession_partition")

          val partitionDF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition")
          partitionDF.show()
          partitionDF.printSchema()

          // Query the data of 20170903 with SQL; filtering on a partition column
          // lets Spark skip the directories of all other days.
          partitionDF.createOrReplaceTempView("partition_table")
          spark.sql("select * from partition_table where day = 20170903").show()

          // Read the data of 20170903 directly from its partition directory.
          // NOTE(review): when a leaf partition directory is read without the
          // `basePath` option, the partition columns are not part of the
          // resulting schema -- confirm against the printed schema.
          val day03DF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition/year=2017/month=201709/day=20170903")
          day03DF.show()
          day03DF.printSchema()

          // Bucketing is only available when writing to a table (saveAsTable).
          // NOTE(review): the original author states it only applies to parquet,
          // json and orc data -- confirm for the Spark version in use.
          // No save mode is set here, so presumably a rerun fails if the table
          // `session` already exists -- verify before running twice.
          sessions.write
            .partitionBy("year")
            .bucketBy(24, "cookie")
            .saveAsTable("session")
        } finally {
          spark.stop()
        }
      }
    }
    

      

  • 相关阅读:
    毛皮立方体
    APPLE buSinEss
    #4613. problem C
    #4614. problem B
    idiots
    熊猫(i)
    圆盘自动机 cell
    小L的占卜
    有趣的数(number)
    「JOISC 2015 Day 1」卡片占卜
  • 原文地址:https://www.cnblogs.com/tesla-turing/p/11489088.html
Copyright © 2011-2022 走看看