zoukankan      html  css  js  c++  java
  • 三、spark入门:文本中发现5个最常用的word,排除常用停用词

    package com.yl.wordcount

    import java.io.File

    import org.apache.spark.{SparkConf, SparkContext}

    import scala.collection.Iterator
    import scala.io.Source

    /**
     * Word count with stop-word removal and sorting.
     *
     * Reads a text file, splits each line on single spaces, discards empty
     * tokens and stop words, counts the remaining words, and saves
     * `(count, word)` pairs ordered by descending count.
     */
    object WordCountStopWords {

      /**
       * Loads the stop words, one per line, from `file`.
       *
       * @param file the stop-word file; it may be absent
       * @return the set of lines in the file, or an empty set when the file
       *         does not exist (so a missing file means "filter nothing"
       *         rather than a crash)
       */
      private def loadStopWords(file: File): Set[String] =
        if (file.exists()) {
          val source = Source.fromFile(file)
          // `toSet` consumes the lazy line iterator before the source is closed.
          try source.getLines().toSet
          finally source.close()
        } else {
          Set.empty[String]
        }

      /**
       * Entry point: runs the word-count job against the hard-coded input,
       * stop-word and output paths.
       *
       * @param args command-line arguments (unused)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
        val sc = new SparkContext(conf)

        try {
          val outFile = "/Users/admin/spark/sparkoutput"
          val stopWordsFile = new File("/Users/admin/src" + "/tingyongci.txt")
          // A Set gives constant-time membership checks inside the filter below.
          val stopWords = loadStopWords(stopWordsFile)

          val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")
          val result = textFile
            .flatMap(_.split(" "))
            .filter(_.nonEmpty)
            .filter(word => !stopWords.contains(word))
            .map((_, 1))
            .reduceByKey(_ + _)
            // Swap to (count, word) so that sortByKey orders by frequency.
            .map { case (word, count) => (count, word) }
            .sortByKey(ascending = false)

          result.saveAsTextFile(outFile)
        } finally {
          // Release the context even when the job fails.
          sc.stop()
        }
      }

    }
    http://www.cnblogs.com/ylcoder/
  • 相关阅读:
    课堂练习求环整数组中最大子数组之和
    学习进度第7周
    声明
    最大数
    学习进度02
    构建之法阅读笔记02
    学习进度01
    课堂练习之《哈利波特》
    《构建之法》阅读笔记06
    寻找水龙王2
  • 原文地址:https://www.cnblogs.com/ylcoder/p/5730947.html
Copyright © 2011-2022 走看看