combineByKeyWithClassTag
相当于 MapReduce 中自定义的 Combiner:数据在 map 端先做一次预聚合,减少 shuffle 传输的数据量,从而提高任务的执行效率。
// Full overload: the three combiner functions plus an explicit partitioner,
// a map-side-combine switch and a serializer.
def combineByKeyWithClassTag[C](
createCombiner: V => C, // map side: converts a value of type V into the combiner type C
mergeValue: (C, V) => C, // map side: pre-aggregation, folds a V into an existing C
mergeCombiners: (C, C) => C, // reduce side: aggregation, merges two C values
partitioner: Partitioner, // the partitioner object
mapSideCombine: Boolean = true, // whether map-side aggregation is enabled; enabled by default
serializer: Serializer = null) // serializer; defaults to null
// The overload below (three functions only) is the one used in the example at the end.
// Overload that takes only the three combiner functions.
def combineByKeyWithClassTag[C](
createCombiner: V => C, // converts a value of type V into the combiner type C
mergeValue: (C, V) => C, // folds a V into an existing C
mergeCombiners: (C, C) => C) // merges two C values
// Overload that takes the three combiner functions plus a partition count.
def combineByKeyWithClassTag[C](
createCombiner: V => C, // converts a value of type V into the combiner type C
mergeValue: (C, V) => C, // folds a V into an existing C
mergeCombiners: (C, C) => C, // merges two C values
numPartitions: Int) // presumably the number of partitions of the result; confirm against the Spark API docs
// Example call of the three-function overload; the combiner type C is mutable.Set[String].
rddA.combineByKeyWithClassTag(
line => mutable.Set(line), // createCombiner: fixes the combiner (return) type by wrapping the value in a mutable.Set
(c1: mutable.Set[String], newLine) => c1 += newLine, // mergeValue: map-side pre-aggregation, adds the value to c1 in place
(c1: mutable.Set[String], c2: mutable.Set[String]) => c1 ++= c2 // mergeCombiners: reduce-side aggregation, adds all of c2 into c1
)