zoukankan      html  css  js  c++  java
  • SparkSQL读写外部数据源--数据分区

    import com.twq.dataset.Utils._
    import org.apache.spark.sql.{SaveMode, SparkSession}
    
    object FilePartitionTest {
      /**
       * Demonstrates reading/writing partitioned Parquet data with Spark SQL:
       *  1. read a non-partitioned Parquet dataset and query it,
       *  2. rewrite it partitioned by year/month/day,
       *  3. query a partition via SQL and via a direct partition-path read,
       *  4. write a bucketed Hive table.
       *
       * NOTE(review): assumes the source dataset has `year`, `month`, `day`
       * and `cookie` columns — confirm against the data under BASE_PATH.
       */
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("FilePartitionTest")
          .getOrCreate()

        // Read the non-partitioned session data.
        val sessions = spark.read.parquet(s"${BASE_PATH}/trackerSession")
        sessions.show()
        sessions.printSchema()

        // Querying by day on the non-partitioned table scans all files.
        sessions.createOrReplaceTempView("non_partition_table")
        spark.sql("select * from non_partition_table where day = 20170903").show()

        // Repartition the data on disk by year/month/day.
        // (Was partitionBy("cookie"), which contradicted this intent and would
        // make the year=/month=/day= partition-path read below fail.)
        sessions.write
          .mode(SaveMode.Overwrite)
          .partitionBy("year", "month", "day")
          .parquet(s"${BASE_PATH}/trackerSession_partition")

        val partitionDF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition")
        partitionDF.show()
        partitionDF.printSchema()

        // Query the 2017-09-03 data with SQL; the day predicate lets Spark
        // prune to the matching partition directory.
        partitionDF.createOrReplaceTempView("partition_table")
        spark.sql("select * from partition_table where day = 20170903").show()

        // Read the 2017-09-03 data directly from its partition path.
        val day03DF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition/year=2017/month=201709/day=20170903")
        day03DF.show()
        day03DF.printSchema()

        // Bucketing is only supported when saving as a (Hive) table,
        // and only for file-based sources such as parquet, json and orc.
        // Overwrite so the job can be re-run without a "table exists" error.
        sessions.write
          .mode(SaveMode.Overwrite)
          .partitionBy("year")
          .bucketBy(24, "cookie")
          .saveAsTable("session")

        spark.stop()
      }
    }
    

      

  • 相关阅读:
    软件公司项目经理岗位职责
    指针和链表
    数据结构
    五子棋
    AtCoder Grand Contest 031 B
    两道dp
    博客搬迁
    [Codeforces Round #526 (Div. 2)]
    [Educational Codeforces Round 55 (Rated for Div. 2)][C. Multi-Subject Competition]
    [codeforces Mail.Ru Cup 2018 Round 3][B Divide Candies ][思维+数学]
  • 原文地址:https://www.cnblogs.com/tesla-turing/p/11489088.html
Copyright © 2011-2022 走看看