Basic Functions
sc.parallelize(List(1,2,3,4,5,6)).map(_ * 2).filter(_ > 5).collect()
*** res: Array[Int] = Array(6, 8, 10, 12) ***
val rdd = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10))
rdd.reduce(_+_)
*** res: Int = 55 ***
union & intersection & join & lookup
val rdd1 = sc.parallelize(List(("a", 1), ("a", 2), ("b", 1), ("b", 3)))
val rdd2 = sc.parallelize(List(("a", 3), ("a", 4), ("b", 1), ("b", 2)))
val unionRDD = rdd1.union(rdd2)
unionRDD.collect()
*** res: Array((a,1), (a,2), (b,1), (b,3), (a,3), (a,4), (b,1), (b,2)) ***
val intersectionRDD = rdd1.intersection(rdd2)
intersectionRDD.collect()
*** res: Array[(String, Int)] = Array((b,1)) ***
val joinRDD = rdd1.join(rdd2)
joinRDD.collect()
*** res: Array[(String, (Int, Int))] = Array((a,(1,3)), (a,(1,4)), (a,(2,3)), (a,(2,4)), (b,(1,1)), (b,(1,2)), (b,(3,1)), (b,(3,2))) ***
rdd1.lookup("a")
*** res: Seq[Int] = WrappedArray(1, 2) ***
unionRDD.lookup("a")
*** res: Seq[Int] = WrappedArray(1, 2, 3, 4) ***
joinRDD.lookup("a")
*** res: Seq[(Int, Int)] = ArrayBuffer((1,3), (1,4), (2,3), (2,4)) ***
chars count example
val rdd = sc.textFile("/Users/tony/spark/spark-xiaoxiang-v1/chapter-01/char.data")
val charCount = rdd.flatMap(_.split(" "))
.map(char => (char.toLowerCase, 1))
.reduceByKey(_+_)
charCount.collect()
charCount.saveAsTextFile("/Users/tony/spark/spark-xiaoxiang-v1/chapter-01/result")
val charCountSort = rdd.flatMap(_.split(" "))
.map(char => (char.toLowerCase, 1))
.reduceByKey(_+_)
.map( p => (p._2, p._1) )
.sortByKey(false)
.map( p => (p._2, p._1) )
charCountSort.collect()