zoukankan      html  css  js  c++  java
  • 【spark】jieba + wordcount

    import sys
    # Python 2 only: sys.setdefaultencoding is removed from the sys namespace
    # at interpreter start-up, so the module is reloaded to get it back.
    # The default codec for implicit str <-> unicode conversion is then set
    # to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    from os import path
    import jieba
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    #from operator import add
    
    # Directory containing this script; the input file path is built from it.
    thisDir = path.dirname(__file__)

    # Local Spark context (master "local[1]") for the "wordCount" application,
    # with log output restricted to ERROR level.
    sc = SparkContext(master="local[1]", appName="wordCount")
    sc.setLogLevel("ERROR")

    # SQL entry point built on the context above; used to create DataFrames.
    sqc = SQLContext(sc)
    
    def wordCut(strings):
        """Split one piece of text into a list of tokens with jieba.

        Leading and trailing whitespace is stripped from ``strings`` before
        segmentation. The tokens yielded by ``jieba.cut`` are returned as a
        list, in the order they were produced.
        """
        return list(jieba.cut(strings.strip()))
    
    fileName = 'words.txt'
    file_in = sc.textFile(path.join(thisDir,fileName))
    
    linesNum = file_in.count()
    print '[INFO]number of lines in file %s : %d' % (fileName , linesNum)
    
    charsNum = file_in.map(lambda x : len(x)).reduce(lambda x,y : x+y)
    print '[INFO]number of charts in file %s : %d' % (fileName , charsNum)
    
    words = file_in.flatMap(lambda line : wordCut(line))
    termBigger3 = words.filter(lambda word : len(word) > 3)
    print '[INFO]number of words bigger than 3 in file %s : %d' % (fileName , termBigger3.count())
    
    wordCount = words.map(lambda w : (w,1)).reduceByKey(lambda x,y:x+y)
    sqc.createDataFrame(wordCount,['word','count']).sort('count',ascending = False).show(20)
  • 相关阅读:
    jQuery:提交表单前判断表单是否被修改过
    jQuery multiselect的使用
    input[file]标签的accept=”image/*”属性响应很慢的解决办法
    Linux-read命令
    shell编程学习
    优化网站加载速度
    select下拉框选中问题
    QTableWidget class
    QLabel class
    QMainWindow class
  • 原文地址:https://www.cnblogs.com/colipso/p/6841169.html
Copyright © 2011-2022 走看看