join,leftOuterJoin,rightOuterJoin,fullOuterJoin 都是transformation类别的算子
作用在K,V格式的RDD上。根据K进行连接,对(K,V)join(K,W)返回(K,(V,W))
join后结果RDD的分区数,取两个父RDD中分区数较多的那个(默认分区器行为)
join
// join: inner join on key — only keys present in BOTH RDDs appear in the result.
// Each matching (K, V) / (K, W) pair yields one (K, (V, W)) row.
val kzc = spark.sparkContext.parallelize(
  Seq("hive" -> 8, "apache" -> 8, "hive" -> 30, "hadoop" -> 18))
val bd = spark.sparkContext.parallelize(
  Seq(("hive", "test"), ("test", 2), ("spark", 20)))
val result1 = bd.join(kzc)
result1.collect().foreach(println)
结果
(hive,(test,8)) (hive,(test,30))
leftOuterJoin
val kzc=spark.sparkContext.parallelize(List(("hive",8),("apache",8),("hive",30),("hadoop",18))) val bd=spark.sparkContext.parallelize(List(("hive","test"),("test",2),("spark",20))) val result1=bd.leftOuterJoin(kzc) result1.collect().foreach(println(_))
结果,没有连接上的为None
(spark,(20,None)) (hive,(test,Some(8))) (hive,(test,Some(30))) (test,(2,None))
rightOuterJoin,以右边(作为参数传入的)RDD为准,保留其全部key
val kzc=spark.sparkContext.parallelize(List(("hive",8),("apache",8),("hive",30),("hadoop",18))) val bd=spark.sparkContext.parallelize(List(("hive","test"),("test",2),("spark",20))) val result1=bd.rightOuterJoin(kzc) result1.collect().foreach(println(_))
结果,没有连接上的是None
(hive,(Some(test),8)) (hive,(Some(test),30)) (hadoop,(None,18)) (apache,(None,8))
fullOuterJoin
val kzc=spark.sparkContext.parallelize(List(("hive",8),("apache",8),("hive",30),("hadoop",18))) val bd=spark.sparkContext.parallelize(List(("hive","test"),("test",2),("spark",20))) val result1=bd.fullOuterJoin(kzc) result1.collect().foreach(println(_))
结果
(spark,(Some(20),None)) (hive,(Some(test),Some(8))) (hive,(Some(test),Some(30))) (hadoop,(None,Some(18))) (apache,(None,Some(8))) (test,(Some(2),None))