zoukankan      html  css  js  c++  java
  • 【spark】jieba + wordcount

    import sys
    # Python 2-only workaround: reload(sys) is called first so that
    # sys.setdefaultencoding is available again, then the process-wide default
    # string encoding is switched to UTF-8 (the script handles Chinese text).
    # NOTE(review): `reload` is not a builtin on Python 3 — confirm the target
    # interpreter before porting.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    from os import path
    import jieba
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    #from operator import add
    
    # Local Spark context using the "local[1]" master; the app is named "wordCount".
    sc = SparkContext(master="local[1]", appName="wordCount")
    # Restrict Spark's own logging to ERROR so the script's prints stay visible.
    sc.setLogLevel("ERROR")
    # SQL entry point built on the same context; used later to create a DataFrame.
    sqc = SQLContext(sc)

    # Directory containing this script; the input file is looked up relative to it.
    thisDir = path.dirname(__file__)
    
    def wordCut(strings):
        """Segment one line of text into a list of tokens using jieba.

        Leading and trailing whitespace is stripped before segmentation.

        Args:
            strings: a single line of text.

        Returns:
            A list holding every token produced by ``jieba.cut``, in order.
        """
        # list() materializes the tokens directly; no manual append loop needed.
        return list(jieba.cut(strings.strip()))
    
    # Input file, looked up in the directory stored in thisDir.
    fileName = 'words.txt'
    # cache(): this RDD is consumed by several actions below (line count,
    # character count, word segmentation), so avoid re-reading the file each time.
    file_in = sc.textFile(path.join(thisDir, fileName)).cache()

    linesNum = file_in.count()
    print('[INFO]number of lines in file %s : %d' % (fileName, linesNum))

    # sum() yields 0 for an empty file, whereas reduce() raises ValueError on an
    # empty RDD.
    charsNum = file_in.map(len).sum()
    print('[INFO]number of chars in file %s : %d' % (fileName, charsNum))

    # Segment every line into words. The result is cached because it feeds both
    # the long-word count and the frequency table, and segmentation is the
    # expensive step.
    words = file_in.flatMap(wordCut).cache()
    termBigger3 = words.filter(lambda word: len(word) > 3)
    print('[INFO]number of words bigger than 3 in file %s : %d' % (fileName, termBigger3.count()))

    # Classic word count: pair each word with 1 and add the ones up per word.
    wordCount = words.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y)
    # Show the 20 most frequent words, highest count first.
    sqc.createDataFrame(wordCount, ['word', 'count']).sort('count', ascending=False).show(20)
  • 相关阅读:
    mock数据
    Vuex
    React生命周期
    Vue基础知识
    前端面试题
    NodeJS巅峰之作
    Oracle数据库
    CSS Bootstrap jsp开发 前端遇到的一些问题。
    如何寻找node.js 与win7兼容的版本?eclipse中引入bootstrap。
    Window 常用命令
  • 原文地址:https://www.cnblogs.com/colipso/p/6841169.html
Copyright © 2011-2022 走看看