zoukankan      html  css  js  c++  java
  • 三、Spark入门:找出文本中最常用的5个单词,并排除常用停用词

    package com.yl.wordcount

    import java.io.File

    import org.apache.spark.{SparkConf, SparkContext}

    import scala.collection.Iterator
    import scala.io.Source

    /**
     * Word count over a text file with stop words removed, ordered by descending frequency.
     *
     * The job writes the complete `(count, word)` listing to the output directory; it does
     * not truncate the result, so the most frequent words are simply the first entries.
     */
    object WordCountStopWords {

      /**
       * Loads the stop words from `file`, one word per line.
       *
       * @param file stop-word file; it is allowed to be missing
       * @return the set of stop words, or an empty set when the file does not exist
       */
      private def loadStopWords(file: File): Set[String] =
        if (file.exists()) {
          val source = Source.fromFile(file)
          // Materialise the lines before closing: getLines() is lazy and reads from the handle.
          try source.getLines().toSet
          finally source.close()
        } else {
          // A missing file means "no stop words" rather than a NullPointerException later on.
          Set.empty[String]
        }

      /**
       * Entry point: counts the words of the Spark README, drops empty tokens and stop words,
       * and saves the counts sorted from most to least frequent.
       *
       * @param args command-line arguments (unused)
       */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
        val sc = new SparkContext(conf)

        try {
          val outFile = "/Users/admin/spark/sparkoutput"
          // A Set gives constant-time membership checks inside the per-word filter.
          val stopWords = loadStopWords(new File("/Users/admin/src" + "/tingyongci.txt"))

          val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")
          val result = textFile
            .flatMap(_.split(" "))
            .filter(word => !word.isEmpty && !stopWords.contains(word))
            .map((_, 1))
            .reduceByKey(_ + _)
            // Swap to (count, word) so the count becomes the sort key.
            .map { case (word, count) => (count, word) }
            .sortByKey(ascending = false)

          result.saveAsTextFile(outFile)
        } finally {
          // Release the driver's resources even when the job fails.
          sc.stop()
        }
      }

    }
    http://www.cnblogs.com/ylcoder/
  • 相关阅读:
    通过使用 SQL,可以为列名称和表名称指定别名(Alias)
    BETWEEN 操作符
    IN 操作符
    SQL 通配符
    LIKE 操作符
    TOP 子句
    DELETE 语句
    Update 语句
    INSERT INTO 语句
    IOS SWIFT 网络请求JSON解析 基础一
  • 原文地址:https://www.cnblogs.com/ylcoder/p/5730947.html
Copyright © 2011-2022 走看看