  • Using Hadoop to count the number of occurrences of each word across multiple text files

    Program source

    import java.io.IOException;
    import java.util.StringTokenizer;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    public class WordCount {
        public static class WordCountMap extends
                Mapper<LongWritable, Text, Text, IntWritable> {
            private final IntWritable one = new IntWritable(1); // output value: always 1
            private Text word = new Text(); // output key: the word itself

            // Consumes the <k1,v1> pairs produced by TextInputFormat and emits <k2,v2>.
            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString(); // the current line of text
                StringTokenizer token = new StringTokenizer(line); // split the line into words on whitespace
                while (token.hasMoreTokens()) {
                    word.set(token.nextToken()); // the word becomes the key
                    context.write(word, one); // hand the intermediate pair (word, 1) to the reducer
                }
            }
        }

        public static class WordCountReduce extends
                Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values,
                    Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = new Job(conf);
            job.setJarByClass(WordCount.class);
            job.setJobName("wordcount");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(WordCountMap.class);
            job.setReducerClass(WordCountReduce.class);
            job.setInputFormatClass(TextInputFormat.class); // turns the input into key/value pairs the mapper can consume
            job.setOutputFormatClass(TextOutputFormat.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            job.waitForCompletion(true);
        }
    }
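    One optional improvement, not in the original listing: because the reduce function just sums, the same class can also serve as a combiner, pre-aggregating counts on the map side so less data crosses the network during the shuffle. One extra line in main() would enable it:

    job.setCombinerClass(WordCountReduce.class); // safe here because summing is associative and commutative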

    1 Compile the source

    javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
    This compiles the source into class files under the word_count_class directory in the current folder; that directory must exist before javac runs.
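    A quick sketch of the full step (assuming the Hadoop jar paths shown above; run mkdir before javac, and note that each nested class gets its own class file):

    mkdir word_count_class
    javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
    ls word_count_class
    # WordCount$WordCountMap.class  WordCount$WordCountReduce.class  WordCount.class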

    2 Package the classes into a jar

    Change into the word_count_class directory (so the jar picks up the class files, not the source) and run:

    jar -cvf wordcount.jar  *
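    To verify the packaging, list the jar's contents; the class files from the previous step should appear among the entries:

    jar -tf wordcount.jar
    # WordCount.class
    # WordCount$WordCountMap.class
    # WordCount$WordCountReduce.class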

    3 Upload the input files

    First create a directory in HDFS to hold this job's input files (a relative path like this resolves under /user/<current-user>):

    hadoop fs -mkdir input_wordcount

    Upload every text file in the local input directory to input_wordcount in HDFS:

    hadoop fs -put input/* input_wordcount/
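    To confirm the files landed:

    hadoop fs -ls input_wordcount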

    Note: do not create the output directory before running the job; Hadoop refuses to start if the output path already exists.
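    If a leftover output directory from an earlier run is in the way, delete it first (Hadoop 1.x syntax):

    hadoop fs -rmr output_wordcount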

    4 Upload the jar and run it

    hadoop jar word_count_class/wordcount.jar input_wordcount output_wordcount

    5 View the results

    List the job's output directory:

    hadoop fs -ls output_wordcount

    Print the job's output (one word<TAB>count pair per line; part-r-00000 is the file written by the single reducer):

    hadoop fs -cat output_wordcount/part-r-00000



    Version 2: the program as I actually ran it

    Map program

    package com.zln.chapter03;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;

    import java.io.IOException;
    import java.util.StringTokenizer;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCountMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1); // each occurrence of a word counts 1
        private Text word = new Text();

        @Override
        public void map(LongWritable longWritable, Text text, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
            String line = text.toString();
            StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                outputCollector.collect(word, one);
            }
        }
    }

    Reduce program

    package com.zln.chapter03;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;

    import java.io.IOException;
    import java.util.Iterator;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCountReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text text, Iterator<IntWritable> iterator, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
            int sum = 0; // total occurrences of this word
            while (iterator.hasNext()) {
                sum += iterator.next().get();
            }
            outputCollector.collect(text, new IntWritable(sum));
        }
    }


    Main function

    package com.zln.chapter03;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.*;

    import java.io.IOException;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCount {
        public static void main(String[] args) throws IOException {
            JobConf conf = new JobConf(WordCount.class);
            conf.setJobName("wordCount");

            // set the output key/value types
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);

            // set the Mapper and Reducer classes
            conf.setMapperClass(WordCountMap.class);
            conf.setReducerClass(WordCountReduce.class);

            // set the input format
            conf.setInputFormat(TextInputFormat.class);
            // set the output format
            conf.setOutputFormat(TextOutputFormat.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            FileOutputFormat.setOutputPath(conf, new Path(args[1]));

            JobClient.runJob(conf);
        }
    }

    Prepare the input files

    file1

    Hello Word By Word
    Hello Word By zln

    file2

    Hello Hadoop
    Hello GoodBye

    Both files sit in the same local directory: /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备

    Compile the classes and package them into a jar

    I build with IDEA. Be careful not to forget to specify the main class when defining the jar artifact.
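    In manifest terms, that IDEA setting makes the jar's META-INF/MANIFEST.MF name the entry point, which for this project is:

    Manifest-Version: 1.0
    Main-Class: com.zln.chapter03.WordCount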

    Upload the input files

    root@sherry:/opt/hadoop-1.2.1# hadoop fs -mkdir /user/root/zln/WordCount/InputFiles
    root@sherry:/opt/hadoop-1.2.1# hadoop fs -put /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备/* /user/root/zln/WordCount/InputFiles

    Upload the jar and run it

    root@sherry:/opt/hadoop-1.2.1# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles

    View the results

    root@sherry:/opt/hadoop-1.2.1# hadoop fs -ls /user/root/zln/WordCount/OutputFiles
    root@sherry:/opt/hadoop-1.2.1# hadoop fs -text /user/root/zln/WordCount/OutputFiles/part-00000
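    With the two sample files above, the result is fully determined. The old mapred API names the reducer's output part-00000 (the new API uses part-r-00000), and keys come out in Text's byte order, so part-00000 should read:

    By	2
    GoodBye	1
    Hadoop	1
    Hello	4
    Word	3
    zln	1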


    Version 3: rewriting the Map, Reduce, and main functions against the new API

    Map

    package com.zln.chapter03;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;
    import java.util.StringTokenizer;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1); // each occurrence of a word counts 1
        private Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    Reduce

    package com.zln.chapter03;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0; // sum all the 1s emitted for this word
            for (IntWritable intWritable : values) {
                sum += intWritable.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    Main

    package com.zln.chapter03;

    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * Created by sherry on 15-7-12.
     */
    public class WordCount extends Configured implements Tool {

        public int run(String[] args) throws Exception {
            Job job = new Job(getConf());
            job.setJarByClass(WordCount.class);
            job.setJobName("WordCount");

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            job.setMapperClass(WordCountMap.class);
            job.setReducerClass(WordCountReduce.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            int ret = ToolRunner.run(new WordCount(), args);
            System.exit(ret);
        }
    }
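    A note on what Tool/ToolRunner buys here: ToolRunner runs GenericOptionsParser over the arguments before run() sees them, so standard Hadoop options such as -D can override configuration per run without recompiling. A sketch (the property name is the Hadoop 1.x one):

    hadoop jar WordCount.jar -D mapred.reduce.tasks=2 /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles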
  • Original article: https://www.cnblogs.com/sherrykid/p/4604717.html