Kudu为Kudu表提供了一个自定义的原生数据源,可以和DataFrame API紧密集成。
使用DataFrame的好处是可以从很多数据源创建DataFrame,包括现有的RDD、Hive表或Spark数据源。
语法格式:
/**
 * Example showing how to prepare a Spark DataFrame for use with Kudu.
 *
 * The program configures a local Spark application, constructs a
 * `KuduContext` pointing at the Kudu masters, and converts an in-memory
 * collection of [[Customer]] records into a DataFrame.
 */
object DataframeKUDU {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Set the master and the Spark tuning parameters.
    val sparkConf = new SparkConf()
      .setAppName("AcctfileProcess")
      .setMaster("local")
      .set("spark.worker.timeout", "500")
      .set("spark.cores.max", "10")
      .set("spark.rpc.askTimeout", "600s")
      .set("spark.network.timeout", "600s")
      .set("spark.task.maxFailures", "1")
      // Key corrected from the misspelled "spark.speculationfalse", which Spark
      // does not recognise and therefore never applied.
      .set("spark.speculation", "false")
      .set("spark.driver.allowMultipleContexts", "true")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sparkContext = SparkContext.getOrCreate(sparkConf)
    val sqlContext = SparkSession.builder().config(sparkConf).getOrCreate().sqlContext

    // Handle used for Kudu operations from Spark. Only the context is
    // constructed here; no Kudu table is created in this snippet.
    val kuduContext = new KuduContext(
      "hadoop01:7051,hadoop02:7051,hadoop03:7051",
      sqlContext.sparkContext
    )

    // Brings the RDD -> DataFrame conversions (`toDF`) into scope.
    import sqlContext.implicits._

    // Sample data.
    val customers = Array(
      Customer("jane", 30, "new york"),
      Customer("jordan", 18, "toronto")
    )

    // Distribute the sample data as an RDD.
    val customersRDD = sparkContext.parallelize(customers)

    // Convert the RDD to a DataFrame; column names come from the case-class fields.
    val customersDF = customersRDD.toDF()
  }
}

/**
 * A customer record used as the row type of the sample DataFrame.
 *
 * @param name customer name
 * @param age  customer age
 * @param city customer city
 */
case class Customer(name: String, age: Int, city: String)