zoukankan      html  css  js  c++  java
  • SparkSQL读写外部数据源-text文件和table数据源的读写

    object ParquetFileTest {
      // Demonstrates round-tripping JSON data through Parquet and Parquet schema merging.
      // NOTE(review): assumes `BASE_PATH` and the SparkSession import are defined elsewhere in the file.
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("ParquetFileTest")
          .getOrCreate()

        // Step 1: load JSON data and persist it as Parquet.
        val jsonDF = spark.read.json(s"${BASE_PATH}/people.json")
        jsonDF.show()

        // Supported compression codecs include gzip, lzo and snappy.
        jsonDF.write.option("compression", "snappy").parquet(s"${BASE_PATH}/parquet")

        // Step 2: read the Parquet files back into a DataFrame.
        val reloadedDF = spark.read.parquet(s"${BASE_PATH}/parquet")
        reloadedDF.show()

        // Step 3: Parquet schema merging.
        // Schema merging can also be enabled globally with spark.sql.parquet.mergeSchema = true.
        // Write a second dataset whose column names differ on purpose.
        jsonDF.toDF("age", "first_name").write.parquet(s"${BASE_PATH}/parquet_schema_change")
        val renamedDF = spark.read.parquet(s"${BASE_PATH}/parquet_schema_change")
        renamedDF.show()

        // Reading both locations with mergeSchema=true unions the two schemas.
        val mergedDF = spark.read
          .option("mergeSchema", "true")
          .parquet(s"${BASE_PATH}/parquet", s"${BASE_PATH}/parquet_schema_change")
        mergedDF.show()

        spark.stop()
      }
    }
    

      

    object OrcFileTest {
      // Demonstrates converting JSON data to ORC files and reading them back.
      // NOTE(review): assumes `BASE_PATH` and the SparkSession import are defined elsewhere in the file.
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("OrcFileTest")
          .getOrCreate()

        // Step 1: load the JSON source data.
        val sourceDF = spark.read.json(s"${BASE_PATH}/people.json")
        sourceDF.show()

        // Write it out as snappy-compressed ORC.
        sourceDF.write.option("compression", "snappy").orc(s"${BASE_PATH}/orc")

        // Read the ORC files back to verify the round trip.
        val orcDF = spark.read.orc(s"${BASE_PATH}/orc")
        orcDF.show()

        spark.stop()
      }
    }
    

      

  • 相关阅读:
    如果你很忙,你一定在什么地方做错了!
    NOSQL介绍
    mysql 8.0.11 安装(windows)
    ORA-28547:(Navicat Premium连接oracle报错)
    线性筛法
    Luogu-P1020(导弹拦截)(DP,LIS ,二分优化)
    POJ
    HDU
    HDU-1024-Max Sum Plus Plus(DP)
    UVA-1625-Color Length(DP LCS变形)
  • 原文地址:https://www.cnblogs.com/tesla-turing/p/11489093.html
Copyright © 2011-2022 走看看