  • Winter Vacation Study Progress

Implementations of WordCount in Spark using different RDD operators

    // Imports the snippets below rely on
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import scala.collection.mutable

    // aggregateByKey
    def wordcount3(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
      // 0 is the initial value; the first _ + _ runs within a partition, the second across partitions
      val group: RDD[(String, Int)] = wordmap.aggregateByKey(0)(_ + _, _ + _)

      group.collect().foreach(println)
    }
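
    aggregateByKey takes an initial value plus two separate functions: the first merges values into the accumulator within a partition, the second merges accumulators across partitions. For WordCount both are simply _ + _, but they need not be the same; a minimal sketch of the difference (the sample data is made up for illustration), taking the per-key maximum within each partition and then summing the maxima:

    val pairs = sc.makeRDD(List(("a", 1), ("a", 3), ("a", 2), ("a", 5)), 2)
    // seqOp: max within each partition; combOp: sum the partition maxima
    val maxThenSum = pairs.aggregateByKey(0)(math.max, _ + _)
    maxThenSum.collect().foreach(println) // (a,8): max(1,3) + max(2,5)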

    // foldByKey
    def wordcount4(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
      // 0 is the initial value; _ + _ is used both within and across partitions
      val group: RDD[(String, Int)] = wordmap.foldByKey(0)(_ + _)

      group.collect().foreach(println)
    }
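
    foldByKey is the special case of aggregateByKey in which the intra-partition and inter-partition functions coincide, so for this job the two calls below should produce the same result (a sketch, reusing the wordmap RDD from above):

    // Equivalent when seqOp and combOp are the same function
    wordmap.foldByKey(0)(_ + _)
    wordmap.aggregateByKey(0)(_ + _, _ + _)
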
    // combineByKey
    def wordcount5(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
      val group: RDD[(String, Int)] = wordmap.combineByKey(
        v => v,                    // createCombiner: the first value for a key becomes the accumulator
        (x: Int, y: Int) => x + y, // mergeValue: fold further values in within a partition
        (x: Int, y: Int) => x + y  // mergeCombiners: merge accumulators across partitions
      )

      group.collect().foreach(println)
    }
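
    combineByKey is the most general of the three: the first function builds the initial accumulator from the first value seen for a key, the second folds further values in within a partition, and the third merges accumulators across partitions. In the WordCount above all three just add, but a per-key average shows why they can differ; a minimal sketch (the scores data is made up for illustration):

    val scores = sc.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("b", 93)), 2)
    // Accumulator is (sum, count); divide at the end to get the average
    val avg = scores.combineByKey(
      (v: Int) => (v, 1),
      (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)
    ).mapValues { case (sum, count) => sum.toDouble / count }
    avg.collect().foreach(println) // (a,89.5), (b,94.0)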

    // countByKey
    def wordcount6(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      val wordmap: RDD[(String, Int)] = wordRDD.map((_, 1))
      // Action: returns the per-key pair counts to the driver as a Map
      val group: collection.Map[String, Long] = wordmap.countByKey()
      println(group)
    }
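
    Note that countByKey is an action rather than a transformation: it counts the (key, value) pairs per key, ignoring the values, and brings the whole result map back to the driver, so it only suits jobs where the number of distinct keys is small. A one-line illustration (made-up data):

    // The values are ignored; only the number of pairs per key is counted
    sc.makeRDD(List(("a", 99), ("a", 1))).countByKey() // Map(a -> 2)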

    // countByValue
    def wordcount7(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      // Action: counts each distinct word directly, no (word, 1) mapping needed
      val group: collection.Map[String, Long] = wordRDD.countByValue()
      println(group)
    }
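
    countByValue counts each distinct element of the RDD directly, which is why the (word, 1) mapping step disappears here; internally it boils down to pairing each element with a placeholder and calling countByKey. A one-line illustration (made-up data):

    sc.makeRDD(List("a", "b", "a")).countByValue() // Map(a -> 2, b -> 1)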

    // reduce
    def wordcount8(sc: SparkContext): Unit = {
      val fileRDD: RDD[String] = sc.textFile("D:\\qq text\\1791028291\\FileRecv\\《飘》英文版.txt")
      // Split the file's contents into words
      val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
      // Wrap every word in a single-entry mutable Map so the maps can be merged pairwise
      val wordmap: RDD[mutable.Map[String, Long]] = wordRDD.map(
        word => mutable.Map[String, Long]((word, 1))
      )

      // Merge the maps two at a time, accumulating counts into map1
      val stringToLong: mutable.Map[String, Long] = wordmap.reduce(
        (map1, map2) => {
          map2.foreach {
            case (word, count) =>
              val newCount = map1.getOrElse(word, 0L) + count
              map1.update(word, newCount)
          }
          map1
        }
      )
      println(stringToLong)
    }
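
    None of the functions above create the SparkContext themselves. A minimal local driver to run them might look like this (the object name, app name, and master URL are assumptions, and the wordcount functions are assumed to be defined in the same object):

    import org.apache.spark.{SparkConf, SparkContext}

    object WordCountDemo {
      def main(args: Array[String]): Unit = {
        // Local mode using all cores; point the master at a real cluster as needed
        val conf = new SparkConf().setAppName("wordcount").setMaster("local[*]")
        val sc = new SparkContext(conf)
        wordcount3(sc)
        sc.stop()
      }
    }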


  • Original article: https://www.cnblogs.com/chenghaixiang/p/15795733.html