zoukankan      html  css  js  c++  java
  • 单词计数示例

    一、代码

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class WordCount extends Configured implements Tool{

        public static void main(String[] args) throws Exception {
            // TODO Auto-generated method stub
            int exitcode=ToolRunner.run(new WordCount(), args);
            System.exit(exitcode);

        }

        // TODO Auto-generated method stub
        private final static IntWritable one = new IntWritable(1);
        private static Text word = new Text();

        public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

            @Override
            public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                    throws IOException {
                String line = value.toString();
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    output.collect(word, one);
                }
            }

        }

        public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

            @Override
            public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                    Reporter reporter) throws IOException {
                int sum = 0;
                while (values.hasNext()) {
                    sum += values.next().get();
                }
                output.collect(key, new IntWritable(sum));
            }

        }

        @Override
        public int run(String[] args) throws Exception {
            // TODO Auto-generated method stub
            if(args.length!=2) {
                System.err.printf("Usage %s need <input> <output> ", getClass().getSimpleName());
                ToolRunner.printGenericCommandUsage(System.err);
                return -1;
            }
            System.out.print(args[0]);
            JobConf job=new JobConf(getConf());
            job.setJarByClass(getClass());
            job.setJobName("wordcount");
            
            job.setInputFormat(TextInputFormat.class);    //为map-reduce任务设置InputFormat实现类
            job.setOutputFormat(TextOutputFormat.class);  
            
            FileInputFormat.addInputPath(job, new Path(args[0]));
           FileOutputFormat.setOutputPath(job,  new Path(args[1]));

           
           job.setOutputKeyClass(Text.class);    //为job的输出数据设置Key类
           job.setOutputValueClass(IntWritable.class);

            job.setMapperClass(WordCount.Map.class);
            job.setCombinerClass(WordCount.Reduce.class);
            job.setReducerClass(WordCount.Reduce.class);
            JobClient.runJob(job);
            return 0;
        }

    }
    二、执行

    1、本地执行

    export HADOOP_CONF_DIR=/root/soft/hdp312/localconf/hadoop
    hadoop jar wordcount.jar WordCount testinput testoutput

    三、流模式运行

    --mapper

    --reducer

    以上二者均可指定为脚本命令或 Java 类

    脚本可以对基本输入和输出进行读写

    hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar   -input testinput/ -output testoutput/  -mapper /bin/cat

  • 相关阅读:
    c++-面向对象:类和对象
    c++-内联函数和函数重载和默认参数和函数指针
    c++-引用
    c++-const
    c++--语言本身
    排序-基数排序
    排序-归并排序
    排序-堆排序
    常用Java API: ArrayList(Vector) 和 LinkedList
    常用Java API:Calendar日期类
  • 原文地址:https://www.cnblogs.com/justart/p/11631235.html
Copyright © 2011-2022 走看看