zoukankan      html  css  js  c++  java
  • SparkSQL读写外部数据源-text文件和table数据源的读写

    /**
     * Demonstrates Spark SQL Parquet I/O:
     *   1. converting a JSON data source to compressed Parquet,
     *   2. reading the Parquet files back,
     *   3. merging two Parquet datasets with different schemas.
     *
     * Expects `BASE_PATH` (defined elsewhere in this file/package) to point at
     * a directory containing `people.json`.
     */
    object ParquetFileTest {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("ParquetFileTest")
          .getOrCreate()

        // 1: Convert JSON source data into Parquet files.
        val df = spark.read.json(s"${BASE_PATH}/people.json")
        df.show()

        // Supported compression codecs include gzip, lzo and snappy.
        // mode("overwrite") makes the demo re-runnable: the default SaveMode
        // (ErrorIfExists) throws if the output path already exists.
        df.write.mode("overwrite").option("compression", "snappy").parquet(s"${BASE_PATH}/parquet")

        // 2: Read the Parquet files back into a DataFrame.
        val parquetDF = spark.read.parquet(s"${BASE_PATH}/parquet")
        parquetDF.show()

        // 3: Parquet schema merge across datasets with different column names.
        // Alternatively, set spark.sql.parquet.mergeSchema = true globally
        // instead of passing the per-read "mergeSchema" option below.
        df.toDF("age", "first_name").write.mode("overwrite").parquet(s"${BASE_PATH}/parquet_schema_change")
        val changedDF = spark.read.parquet(s"${BASE_PATH}/parquet_schema_change")
        changedDF.show()

        // Reading both paths with mergeSchema=true yields the union of the
        // two schemas (columns missing in one dataset come back as null).
        val schemaMergeDF = spark.read.option("mergeSchema", "true").parquet(s"${BASE_PATH}/parquet",
          s"${BASE_PATH}/parquet_schema_change")
        schemaMergeDF.show()

        spark.stop()
      }
    }
    

      

    /**
     * Demonstrates Spark SQL ORC I/O: converts a JSON data source into
     * snappy-compressed ORC files, then reads them back.
     *
     * Expects `BASE_PATH` (defined elsewhere in this file/package) to point at
     * a directory containing `people.json`.
     */
    object OrcFileTest {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder()
          .appName("OrcFileTest")
          .getOrCreate()

        // 1: Convert JSON source data into ORC files.
        val df = spark.read.json(s"${BASE_PATH}/people.json")
        df.show()

        // mode("overwrite") makes the demo re-runnable: the default SaveMode
        // (ErrorIfExists) throws if the output path already exists.
        df.write.mode("overwrite").option("compression", "snappy").orc(s"${BASE_PATH}/orc")

        // 2: Read the ORC files back into a DataFrame.
        val orcFileDF = spark.read.orc(s"${BASE_PATH}/orc")
        orcFileDF.show()

        spark.stop()
      }
    }
    

      

  • 相关阅读:
    学习Java的Day02
    学习Java的Day01
    多线程的了解
    几个MQ的区别
    HTML5——存储(cookie、localStorage、sessionStorage)的区别
    dubbo mock配置
    Springboot分布式,excel导出,运用POI导出,前端用的jsp
    oracle 添加字段和添加注释
    可以重复的Map:IdentityHashMap
    数组转list问题
  • 原文地址:https://www.cnblogs.com/tesla-turing/p/11489093.html
Copyright © 2011-2022 走看看