zoukankan      html  css  js  c++  java
  • Spark编程练习题

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession
    .builder()
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()

    //开启隐式转换
    import spark.implicits._

    //任务:求data的平均值
    import util.Random
    val data = for(i<- List.range(1,10)) yield Random.nextInt(100)

    //使用RDD编程实现
    val rdd = sc.parallelize(data,5)
    val mean = rdd.map(_.toDouble).reduce(_+_)/rdd.count
    println(mean)

    //使用SparkSQL编程实现
    val df = data.toDF("value")
    df.agg("value"->"avg").show

    //任务:统计file中每个词的词频
    val file = "wordcount"

    //使用RDD编程实现
    val rdd = sc.textFile(file)
    rdd.flatMap(_.trim.split(" ")).map((_,1)).reduceByKey(_+_).collect

    //使用SparkSQL编程实现
    val df = spark.read.option("header","false").csv(file).toDF("value")
    df.flatMap(row=>row(0).toString.trim.split(" ")).groupBy("value").count.show

    //任务:
    //有一批学生信息表格,包括name,age,score
    //找出score排名前3的学生
    val students = List(("LiLei",18,87),
                       ("HanMeiMei",16,77),
                       ("DaChui",16,66),
                       ("Jim",18,80),
                       ("RuHua",20,50))
    val n = 3

    //使用RDD编程实现
    val rdd = sc.parallelize(students)
    rdd.sortBy(_._3,ascending = false).take(n)

    //使用SparkSQL编程实现
    val df = students.toDF("name","age","score")
    df.orderBy(df("score").desc).limit(n).show

    //任务:求最大值最小值
    val data = List(1,7,8,5,3,18,34,23,67,53,9,0,12,8)

    //使用RDD编程实现,方案1
    val rdd = sc.parallelize(data,3)
    val max_value = rdd.reduce((a,b)=> if(a>b) a else b)
    val min_value = rdd.reduce((a,b)=> if(a>b) b else a)
    println("max_value:" + max_value)
    println("min_value:" + min_value)

    //使用RDD编程实现,方案2
    val rdd = sc.parallelize(data,3)
    val temp = rdd.mapPartitions(iterator => {
        var min = Integer.MAX_VALUE
        var max = Integer.MIN_VALUE
        for(x <- iterator){
            if(x>max) max = x
            if(x<min) min = x
        }
        Iterator((min,max))
    })
    val result = temp.reduce((a,b)=>
              {val min = if(a._1<= b._1) a._1 else b._1
               val max = if(a._2 >= b._2) a._2 else b._2
               (min,max)
              })

    //使用SparkSQL编程实现
    import org.apache.spark.sql.functions._
    val df = data.toDF("value")
    df.agg(max("value") as "max_value",min("value") as "min_value").show

    //任务:排序并返回序号
    val data = List(1,7,8,5,3,18,34,9,0,12,8)

    //使用RDD编程实现:方案1
    val rdd = sc.parallelize(data,3)
    val len = rdd.count
    val sortedrdd = rdd.map((_,1)).sortByKey().map(_._1).repartition(1)
    val index = sc.parallelize(0 to len.toInt-1,1)
    index.zip(sortedrdd).collect

    //使用RDD编程实现:方案2
    val rdd = sc.parallelize(data,3)
    val sortedrdd = rdd.map((_,1)).sortByKey().map(_._1).repartition(1)
    var idx = -1
    sortedrdd.map(value => {
        idx+=1
        (idx,value)
    }).collect

  • 相关阅读:
    activity启动模式
    Android自定义view:折线图(附带动画效果)
    支付宝开发接口 Multiple dex files define Lcom/ta/utdid2/device/UTDevice
    android 开发中应用到的单例模式
    新浪微博分享错误代码 10014
    android Model与View解耦的一个简单方式
    android studio 新建项目导入到Coding远程仓库git
    一起学习MVC(2)数据库设计
    ASP.NET MVC :MVC页面验证与授权
    ASP.net:水晶报表的5种表格设计模式
  • 原文地址:https://www.cnblogs.com/hrnn/p/13387189.html
Copyright © 2011-2022 走看看