def map(): Unit = {
val conf = new SparkConf().setAppName("map").setMaster("local")
val sc = new SparkContext(conf)
val numbers = Array(1, 2, 3, 4, 5)
val numberRDD = sc.parallelize(numbers, 1)
val multipleNumberRDD = numberRDD.map(num => num * 2)
multipleNumberRDD.foreach(num => println(num))
}
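A quick way to verify map's one-to-one behavior is to pull the results back to the driver with collect (a minimal check, assuming it is added inside map() above where numberRDD is in scope):
// collect() returns the mapped elements to the driver as a local array,
// making the one-element-in, one-element-out behavior of map visible
val doubled = numberRDD.map(num => num * 2).collect()
println(doubled.mkString(", ")) // 2, 4, 6, 8, 10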
def flatMap(): Unit = {
val conf = new SparkConf().setAppName("flatMap").setMaster("local")
val sc = new SparkContext(conf)
val lineArray = Array("hello you", "hello me", "hello world")
val lines = sc.parallelize(lineArray, 1)
val words = lines.flatMap(line => line.split(" "))
words.foreach(word => println(word))
}
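To see what flatMap adds over map here: map would keep one output element per input line (an array of words), while flatMap flattens those arrays into a single RDD of words. A minimal sketch, assuming it runs inside flatMap() above where lines and words are in scope:
// map yields RDD[Array[String]] with one array per line: 3 elements
val wordArrays = lines.map(line => line.split(" "))
println(wordArrays.count()) // 3
// flatMap flattens the per-line arrays into RDD[String]: 6 elements
println(words.count()) // 6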
def groupByKey(): Unit = {
val conf = new SparkConf().setAppName("groupByKey").setMaster("local")
val sc = new SparkContext(conf)
val scoreList = Array(new Tuple2[String, Integer]("class1", 80),
new Tuple2[String, Integer]("class2", 88),
new Tuple2[String, Integer]("class1", 80),
new Tuple2[String, Integer]("class2", 90))
val scores = sc.parallelize(scoreList, 1)
val groupedScores = scores.groupByKey()
groupedScores.foreach(
score => {
println(score._1)
score._2.foreach(singleScore => println(singleScore))
println("===========")
}
)
}
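Since groupByKey returns each key with an Iterable of all its values, any per-key aggregation can be written on top of it. A minimal sketch (assuming groupedScores from above is in scope) that sums each class's scores; reduceByKey below produces the same totals more efficiently, because it combines values on the map side before the shuffle:
// Sum the Iterable of scores behind each class key
val summedScores = groupedScores.mapValues(classScores => classScores.map(_.intValue).sum)
summedScores.foreach(classScore => println(classScore._1 + ":" + classScore._2))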
def reduceByKey(): Unit = {
val conf = new SparkConf().setAppName("reduceByKey").setMaster("local")
val sc = new SparkContext(conf)
val scoreList = Array(new Tuple2[String, Integer]("class1", 80),
new Tuple2[String, Integer]("class2", 88),
new Tuple2[String, Integer]("class1", 80),
new Tuple2[String, Integer]("class2", 90))
val scores = sc.parallelize(scoreList, 1)
val totalScores = scores.reduceByKey(_ + _)
totalScores.foreach(classScore => println(classScore._1 + ":" + classScore._2))
}
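reduceByKey only takes the merge function because its output value type matches the input. When the aggregate has a different shape, a common pattern (a sketch, not part of the original, assuming the scores RDD above) is to map each score into a (sum, count) pair first and then reduce, which yields a per-class average:
// Merge (sum, count) pairs per key, then divide to get the average
val avgScores = scores
  .mapValues(score => (score.intValue, 1))
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (sum, count) => sum.toDouble / count }
avgScores.foreach(classAvg => println(classAvg._1 + ":" + classAvg._2))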
6. sortByKey: sort students by score
6.1 Java
/**
 * sortByKey operator: sort students by score
 */
private static void sortByKey() {
// Create the SparkConf
SparkConf conf = new SparkConf()
.setAppName("sortByKey")
.setMaster("local");
// Create the JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
// Build the source collection
List<Tuple2<Integer, String>> scoresList = Arrays.asList(
new Tuple2<>(65, "leo"),
new Tuple2<>(60, "tom"),
new Tuple2<>(90, "marry"),
new Tuple2<>(88, "jack"));
// Parallelize the collection into a JavaPairRDD
JavaPairRDD<Integer, String> scores = sc.parallelizePairs(scoresList);
// Apply the sortByKey operator to the scores RDD
// sortByKey simply sorts by key; ascending or descending order can be specified explicitly
// It still returns a JavaPairRDD whose elements are the same as the original RDD's;
// only their order has changed
JavaPairRDD<Integer, String> sortedScores = scores.sortByKey();
// Print the sortedScores RDD
sortedScores.foreach(new VoidFunction<Tuple2<Integer, String>>() {
private static final long serialVersionUID = 1L;
@Override
public void call(Tuple2<Integer, String> t) throws Exception {
System.out.println(t._1 + ":" + t._2);
}
});
// Close the JavaSparkContext
sc.close();
}
6.2 Scala
def sortByKey(): Unit = {
val conf = new SparkConf().setAppName("sortByKey").setMaster("local")
val sc = new SparkContext(conf)
val scoreList = Array(new Tuple2[Integer, String](90, "cat"),
new Tuple2[Integer, String](80, "leo"),
new Tuple2[Integer, String](80, "opp"),
new Tuple2[Integer, String](55, "lll"))
val scores = sc.parallelize(scoreList, 1)
val sortedScores = scores.sortByKey()
sortedScores.foreach(studentScore => println(studentScore._1 + ":" + studentScore._2))
}
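As the comment in the Java version notes, sortByKey accepts an optional ascending flag (and an optional partition count); passing false sorts the keys in descending order. A minimal sketch, assuming the scores RDD above:
// sortByKey(false) sorts keys from high to low
val descendingScores = scores.sortByKey(false)
descendingScores.foreach(studentScore => println(studentScore._1 + ":" + studentScore._2))

7. join: join each student with their scores
7.2 Scala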
def join(): Unit = {
val conf = new SparkConf().setAppName("join").setMaster("local")
val sc = new SparkContext(conf)
// Student collection
val studentList = Array(
new Tuple2[Integer, String](1, "leo"),
new Tuple2[Integer, String](2, "tom"),
new Tuple2[Integer, String](3, "marry"))
// Score collection
val scoreList = Array(
new Tuple2[Integer, Integer](1, 100), new Tuple2[Integer, Integer](2, 80),
new Tuple2[Integer, Integer](3, 50), new Tuple2[Integer, Integer](1, 70),
new Tuple2[Integer, Integer](2, 10), new Tuple2[Integer, Integer](3, 40))
val student = sc.parallelize(studentList)
val scores = sc.parallelize(scoreList)
val studentScores = student.join(scores)
studentScores.foreach(studentScore => {
println("student id:" + studentScore._1)
println("student name:" + studentScore._2._1)
println("student score:" + studentScore._2._2)
println("==============")
})
}
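Note that join is an inner join that emits one record per matching pair, not per key: each of the 3 student ids has 2 scores, so the result holds 6 records. A quick check, assuming studentScores from above is in scope:
// 3 students x 2 scores each => 6 joined (id, (name, score)) records
println(studentScores.count()) // 6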
8. cogroup: print each student's scores
8.1 Java
/**
 * cogroup operator: print each student's scores
 */
private static void cogroup() {
// Create the SparkConf
SparkConf conf = new SparkConf()
.setAppName("cogroup")
.setMaster("local");
// Create the JavaSparkContext
JavaSparkContext sc = new JavaSparkContext(conf);
// Student collection
List<Tuple2<Integer, String>> studentList = Arrays.asList(
new Tuple2<>(1, "leo"),
new Tuple2<>(2, "tom"),
new Tuple2<>(3, "marry"));
// Score collection
List<Tuple2<Integer, Integer>> scoreList = Arrays.asList(
new Tuple2<>(1, 100),
new Tuple2<>(2, 80),
new Tuple2<>(3, 50),
new Tuple2<>(1, 70),
new Tuple2<>(2, 10),
new Tuple2<>(3, 40));
// Parallelize the two collections into pair RDDs
JavaPairRDD<Integer, String> student = sc.parallelizePairs(studentList);
JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);
// cogroup differs from join:
// for each key, all of the values it joins with are gathered into one Iterable
JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> studentScores = student.cogroup(scores);
// Print the studentScores RDD
studentScores.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
private static final long serialVersionUID = 1L;
@Override
public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> t) throws Exception {
System.out.println("student id:" + t._1);
System.out.println("student name:" + t._2._1);
System.out.println("student score:" + t._2._2);
System.out.println("==============");
}
});
// Close the JavaSparkContext
sc.close();
}
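The original gives only the Java version here; the following Scala sketch of the same cogroup is an assumption that mirrors the 7.2 pattern above. It shows the contrast with join: each key appears exactly once, pairing an Iterable of all names with an Iterable of all scores for that key:
def cogroup(): Unit = {
  val conf = new SparkConf().setAppName("cogroup").setMaster("local")
  val sc = new SparkContext(conf)
  val students = sc.parallelize(Array((1, "leo"), (2, "tom"), (3, "marry")))
  val scores = sc.parallelize(Array((1, 100), (2, 80), (3, 50), (1, 70), (2, 10), (3, 40)))
  // Unlike join (6 records), cogroup yields one record per key (3 records),
  // each holding all values for that key from both RDDs
  val studentScores = students.cogroup(scores)
  studentScores.foreach(t => {
    println("student id:" + t._1)
    println("student name:" + t._2._1)
    println("student score:" + t._2._2)
    println("==============")
  })
}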