zoukankan      html  css  js  c++  java
  • Spark编程练习题

    import org.apache.spark.sql.SparkSession

    // Build (or reuse) the SparkSession for these exercises, with Hive support enabled.
    val spark = {
      val sessionBuilder = SparkSession.builder()
        .appName("Spark SQL basic example")
        .enableHiveSupport()
      sessionBuilder.getOrCreate()
    }

    // Bring the session's implicit conversions into scope.
    import spark.implicits._

    // Task: compute the mean of `data`.
    import util.Random
    // Nine random integers, each drawn from [0, 100).
    val data = List.fill(9)(Random.nextInt(100))

    // RDD implementation.
    val rdd = sc.parallelize(data, 5)
    // mean() averages in a single pass, instead of running a reduce job plus a
    // separate count job over the same uncached RDD.
    val mean = rdd.mean()
    println(mean)

    // Spark SQL implementation.
    val df = data.toDF("value")
    df.agg("value" -> "avg").show()

    // Task: count how often each word occurs in `file`.
    val file = "wordcount"

    // RDD implementation.
    val rdd = sc.textFile(file)
    // Split on runs of whitespace and drop empty tokens, so blank lines and
    // repeated spaces are not counted as a "" word.
    rdd.flatMap(line => line.trim.split("\\s+").filter(_.nonEmpty))
      .map((_, 1))
      .reduceByKey(_ + _)
      .collect

    // Spark SQL implementation.
    // Read the file as plain text (a single "value" column, one row per line).
    // The CSV reader splits lines on commas, which yields several columns for any
    // line containing a comma and makes a single-name toDF("value") fail.
    val df = spark.read.text(file)
    df.flatMap(row => row.getString(0).trim.split("\\s+").filter(_.nonEmpty))
      .groupBy("value")
      .count
      .show

    // Task:
    // Given a table of student records with the fields name, age and score,
    // find the 3 students with the highest score.
    val students = List(
      ("LiLei", 18, 87),
      ("HanMeiMei", 16, 77),
      ("DaChui", 16, 66),
      ("Jim", 18, 80),
      ("RuHua", 20, 50)
    )
    val n = 3

    // RDD implementation: sort by score, highest first, and keep the first n.
    val rdd = sc.parallelize(students)
    rdd.sortBy({ case (_, _, score) => score }, ascending = false).take(n)

    // Spark SQL implementation: the same ordering expressed on a DataFrame.
    val df = students.toDF("name", "age", "score")
    df.sort(df.col("score").desc).limit(n).show

    // Task: find the maximum and the minimum of `data`.
    val data = List(1, 7, 8, 5, 3, 18, 34, 23, 67, 53, 9, 0, 12, 8)

    // RDD implementation, approach 1: one reduce per extreme.
    val rdd = sc.parallelize(data, 3)
    val max_value = rdd.reduce((left, right) => math.max(left, right))
    val min_value = rdd.reduce((left, right) => math.min(left, right))
    println(s"max_value:${max_value}")
    println(s"min_value:${min_value}")

    // RDD implementation, approach 2: collapse each partition to a (min, max) pair,
    // then merge the per-partition pairs into one.
    val rdd = sc.parallelize(data, 3)
    val temp = rdd.mapPartitions { values =>
      // Seed with the Int extremes so the first real value replaces both sides;
      // an empty partition simply emits the seed.
      val seed = (Integer.MAX_VALUE, Integer.MIN_VALUE)
      val extremes = values.foldLeft(seed) {
        case ((lowest, highest), x) => (math.min(lowest, x), math.max(highest, x))
      }
      Iterator(extremes)
    }
    val result = temp.reduce { (a, b) =>
      (math.min(a._1, b._1), math.max(a._2, b._2))
    }

    // Spark SQL implementation: compute both aggregates in a single agg call.
    import org.apache.spark.sql.functions._
    val df = data.toDF("value")
    df.agg(
      max("value").alias("max_value"),
      min("value").alias("min_value")
    ).show

    // Task: sort `data` and pair every value with its 0-based position.
    val data = List(1, 7, 8, 5, 3, 18, 34, 9, 0, 12, 8)

    // RDD implementation, approach 1: zip an index RDD with the sorted values.
    val rdd = sc.parallelize(data, 3)
    val len = rdd.count
    // Sort straight into a single partition. Calling repartition(1) after the sort
    // would add another shuffle, and a shuffle does not guarantee that elements
    // arrive in their previous order.
    val sortedrdd = rdd.sortBy(value => value, numPartitions = 1)
    // zip requires the same number of partitions and the same number of elements
    // per partition on both sides, hence the single-partition index RDD.
    val index = sc.parallelize(0 until len.toInt, 1)
    index.zip(sortedrdd).collect

    // RDD implementation, approach 2: let Spark number the sorted values.
    val rdd = sc.parallelize(data, 3)
    val sortedrdd = rdd.sortBy(value => value)
    // zipWithIndex numbers elements by partition order and position within each
    // partition. Incrementing a driver-side `var` inside map is unreliable: the
    // closure is serialized to each task, so every task updates its own copy.
    sortedrdd.zipWithIndex.map { case (value, position) => (position.toInt, value) }.collect

  • 相关阅读:
    seo 优化 仅针对 来拍呀www.laipaiya.com(一)
    mac 下 配置 xhprof
    mac 下 sphinx + mysql + php 实现全文搜索(xampp)(4)php api 解析
    mac 下 sphinx + mysql + php 实现全文搜索(xampp)(3)sphinx 的配置项解析
    php + mysql + sphinx 的全文检索(2)
    mac 下 sphinx + mysql + php 实现全文搜索(xampp)(1)
    mysql 的 存储结构(储存引擎)
    [php] yii debug设置
    [mysql] 查看mysql执行时间
    [javascript] 对象拷贝
  • 原文地址:https://www.cnblogs.com/hrnn/p/13387189.html
Copyright © 2011-2022 走看看