WordCount.py
# coding:utf-8
from pyspark import SparkContext
from pyspark import SparkConf

def SetLogger(sc):
    """Suppress verbose log4j output"""
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

def CreateSparkContext():
    sparkConf = SparkConf().setAppName("WordCounts").set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=", sc.master)
    SetLogger(sc)
    return sc

def main():
    print("Starting execution")
    sc = CreateSparkContext()
    textFile = sc.textFile("file:/root/ipynotebook/test.txt")    # local file
    # textFile = sc.textFile("hdfs://master:9000/user/hadoop/test.txt")    # HDFS file
    stringRDD = textFile.flatMap(lambda x: x.split(" "))    # split each line into words
    # print(stringRDD.collect())
    countsRDD = stringRDD.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)    # count occurrences per word
    print("Saving results")
    countsRDD.saveAsTextFile("file:/root/ipynotebook/output")
    # countsRDD.saveAsTextFile("hdfs://master:9000/user/hadoop/output")
    sc.stop()

if __name__ == "__main__":
    main()
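To see what each transformation in the pipeline produces, the same steps can be traced on a small in-memory dataset. The following is a minimal sketch, not part of the WordCount.py script above; the sample sentences are made-up illustration data.

# Minimal sketch: trace the WordCount pipeline on an in-memory sample.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("WordCountDemo").setMaster("local"))

linesRDD = sc.parallelize(["apple banana apple", "banana cherry"])    # made-up sample lines
wordsRDD = linesRDD.flatMap(lambda line: line.split(" "))    # one element per word
pairsRDD = wordsRDD.map(lambda w: (w, 1))                    # (word, 1) pairs
countsRDD = pairsRDD.reduceByKey(lambda x, y: x + y)         # sum the 1s per word

print(countsRDD.collect())    # e.g. [('apple', 2), ('banana', 2), ('cherry', 1)] (order not guaranteed)
sc.stop()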
Run the job with spark-submit
# Local mode
$ spark-submit --master local WordCount.py
$ cat /root/ipynotebook/output/part-00000    # the number of part files depends on the number of RDD partitions

# YARN mode (first switch the script to the commented-out HDFS input/output paths)
$ spark-submit --master yarn WordCount.py
$ hadoop fs -cat /user/hadoop/output/part-00000
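saveAsTextFile writes each element's string representation, so every line of a part file looks like "('word', count)". As a quick check after the job finishes, the output directory can be read back with Spark itself; this is a minimal sketch assuming the local run above already completed, with the paths mirroring those used in WordCount.py.

# Minimal sketch: read the saved word counts back for inspection.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("ReadCounts").setMaster("local"))

# Use the HDFS path instead when the job ran on YARN:
# savedRDD = sc.textFile("hdfs://master:9000/user/hadoop/output")
savedRDD = sc.textFile("file:/root/ipynotebook/output")    # reads all part files in the directory
print(savedRDD.take(5))    # each element is a line such as "('word', count)"
sc.stop()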
Hadoop Web UI
When the job is submitted with --master yarn, its progress appears as an application in the YARN ResourceManager web UI:
http://master:8088/