zoukankan      html  css  js  c++  java
  • MR案例:WordCount改写

    请参照wordcount实现一个自己的MapReduce,需求为:
        a. 输入文件格式:
           xxx,xxx,xxx,xxx,xxx,xxx,xxx
        b. 输出文件格式:
           xxx,20
           xxx,30
           xxx,40
        c. 功能:根据命令行参数统计输入文件中指定关键字出现的次数,并展示出来
           例如:hadoop jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx(四个关键字)
    package demo0830;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.Locale;
    import java.util.Set;
    import java.util.regex.Pattern;
    
    public class Demo0902 {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
    
            if (args.length < 3) {
                System.out.println("Usage: wordcount <input_path> <output_path> <keyword_list>");
                return;
            }
    
            //Add to target(静态方法)
            String[] target_words = args[2].split(",");
            for (String word : target_words) {
                WCMap.addTargetWord(word.toLowerCase());
            }
    
            Job job = Job.getInstance(conf);
            job.setJarByClass(Demo0902.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
    
            job.setMapperClass(WCMap.class);
            job.setReducerClass(WCReduce.class);
    
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
    
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            job.waitForCompletion(true);
        }
        public static class WCMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();
            private final static ArrayList<String> target_words = new ArrayList<String>();
    
            public static void addTargetWord(String word) {
                target_words.add(word);
            }
    
            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String[] items = value.toString().toLowerCase().split(" ");
                for (String item : items) {
                    
                    //filter keyword
                    if (target_words.contains(item)) {
                        word.set(item);
                        context.write(word, one);
                    }
                }
            }
        }
    
        public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }
    }
  • 相关阅读:
    sql数据黑马程序员——SQL入门
    函数sql黑马程序员——SQL常用函数
    schemaeasyui实例:SSh结合Easyui实现Datagrid的分页显示
    代码用于脚本语言开发平台Script.NET即将开源
    javadataAbout stack and heap in JAVA(2)
    方法object面试题分析:7JAVA中Object的clone方法详解-克隆-深克隆
    集合元素最近的学习心得
    产品经理能力产品经理工作积累(3)
    schemamvcSpringMVC+Spring3+Hibernate4开发环境搭建
    降低FFmpeg的解码延时
  • 原文地址:https://www.cnblogs.com/skyl/p/4779512.html
Copyright © 2011-2022 走看看