zoukankan      html  css  js  c++  java
  • 寒假学习进度

    spark中wordcount的实现

    (1)//aggregateByKey
    def wordcount3(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.aggregateByKey(0)(_ + _,_ + _)

    group.collect().foreach(println)
    }

    //foldByKey
    def wordcount4(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.foldByKey(0)(_ + _)

    group.collect().foreach(println)
    }
    //combineByKey
    def wordcount5(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: RDD[(String, Int)] = wordmap.combineByKey(
    v => v,
    (x: Int, y) => x + y,
    (x: Int, y: Int) => x + y
    )

    group.collect().foreach(println)
    }

    //countByKey
    def wordcount6(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
    val group: collection.Map[String, Long] = wordmap.countByKey()
    println(group)


    }

    //countByValue
    def wordcount7(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )

    val group: collection.Map[String, Long] = wordRDD.countByValue()
    println(group)


    }

    //reduce
    def wordcount8(sc:SparkContext)={

    val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
    // 将文件中的数据进行分词
    val wordRDD: RDD[String] = fileRDD.flatMap( _.split(" ") )
    val wordmap: RDD[mutable.Map[String, Long]] = wordRDD.map(
    word => {
    mutable.Map[String, Long]((word, 1))
    }
    )

    val stringToLong: mutable.Map[String, Long] = wordmap.reduce(
    (map1, map2) => {
    map2.foreach{
    case (word,count)=>{
    val newCount=map1.getOrElse(word,0L)+count
    map1.update(word,newCount)
    }
    }
    map1
    }
    )
    println(stringToLong)
    }

     

  • 相关阅读:
    python 装饰器
    函数嵌套,函数对象
    函数的形参和实参
    函数
    周末复习知识1
    文件修改 文件处理
    字符编码,文件管理
    安卓CTS官方文档之兼容性测试套件简介-attach
    安卓CTS官方文档之兼容性测试套件简介
    MPEG-DASH on IIS Practice in Action-attach
  • 原文地址:https://www.cnblogs.com/chenghaixiang/p/15795733.html
Copyright © 2011-2022 走看看