zoukankan      html  css  js  c++  java
  • 寒假学习进度

    spark中wordcount的实现

    (1)//aggregateByKey
    def wordcount3(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.aggregateByKey(0)(_ + _,_ + _)

    group.collect().foreach(println)
    }

    //foldByKey
    def wordcount4(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.foldByKey(0)(_ + _)

    group.collect().foreach(println)
    }
    //combineByKey
    def wordcount5(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.combineByKey(
    v => v,
    (x: Int, y) => x + y,
    (x: Int, y: Int) => x + y
    )

    group.collect().foreach(println)
    }

    //countByKey
    def wordcount6(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: collection.Map[String, Long] = wordmap.countByKey()
    println(group)


    }

    //countByValue
    def wordcount7(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )

    val group: collection.Map[String, Long] = wordRDD.countByValue()
    println(group)


    }

    //reduce
    def wordcount8(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[mutable.Map[String, Long]] = wordRDD.map(
    word => {
    mutable.Map[String, Long]((word, 1))
    }
    )

    val stringToLong: mutable.Map[String, Long] = wordmap.reduce(
    (map1, map2) => {
    map2.foreach{
    case (word,count)=>{
    val newCount=map1.getOrElse(word,0L)+count
    map1.update(word,newCount)
    }
    }
    map1
    }
    )
    println(stringToLong)
    }

     

  • 相关阅读:
    并不对劲的辛普森积分
    并不对劲的概率与期望
    并不对劲的cdq分治解三维偏序
    68.机器人的运动范围
    67.矩阵中的路径
    66.滑动窗口最大值
    65.数据流的中位数
    64.二叉搜索树的第K个节点
    63.序列化二叉树
    62.把二叉树打印成多行
  • 原文地址:https://www.cnblogs.com/chenghaixiang/p/15795733.html
Copyright © 2011-2022 走看看