  • Hadoop 2.4.1 MapReduce: deduplicating data in the map-side shuffle (Combiner)
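
    The mapper below emits each input line as the key with a NullWritable value, so identical
    lines collapse onto a single key during the shuffle; registering the reducer as the combiner
    (Job.setCombinerClass) drops duplicates on the map side before any data crosses the network,
    and the reducer then removes whatever duplicates survive across map tasks. As a hypothetical
    illustration (the source does not show the real record format), an input of

        record-A
        record-A
        record-B

    comes out of the job as

        record-A
        record-B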

    package com.bank.service;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * Removes duplicate records from the cleaned data via the map-side shuffle (Job.setCombinerClass)
     * @author mengyao
     *
     */
    public class CnyDataFormatReplition extends Configured implements Tool {

        /**
         * The map side emits each input line as the key, so the shuffle sorts the keys in
         * dictionary order; the value is empty, expressed with Hadoop's NullWritable, the
         * serializable null type that Hadoop provides
         * @author mengyao
         *
         */
        static class CnyDataFormatReplitionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                context.write(value, NullWritable.get());
            }
        }

        /**
         * The keys arriving at the reducer are those left after the map-side combine; the
         * reducer writes each key to HDFS and emits no value, which NullWritable expresses
         * @author mengyao
         *
         */
        static class CnyDataFormatReplitionReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
            @Override
            protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                    throws IOException, InterruptedException {
                context.write(key, NullWritable.get());
            }
        }
        
        @Override
        public int run(String[] args) throws Exception {
            Job job = Job.getInstance(getConf(), CnyDataFormatReplition.class.getSimpleName());
            // specify the driver class that serves as the job's entry point
            job.setJarByClass(CnyDataFormatReplition.class);
            
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            job.setMapperClass(CnyDataFormatReplitionMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            
            // Combine during the map-side shuffle: map output is first pre-sorted in a memory buffer
            // (once the buffer reaches its default 100 MB, a background thread spills it to local disk,
            // quick-sorting it again before the write), which reduces the network traffic to the reducers
            job.setCombinerClass(CnyDataFormatReplitionReduce.class);
            
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            job.setReducerClass(CnyDataFormatReplitionReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            
            // submit the job and report its progress; true prints details, false stays silent
            return job.waitForCompletion(true) ? 0 : 1;
        }
        
        
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: <inputDir> <outputDir>");
                System.exit(2);
            }
            // pass conf to ToolRunner so that the generic options parsed above actually reach the job
            int status = ToolRunner.run(conf, new CnyDataFormatReplition(), otherArgs);
            System.exit(status);
        }
        }


    }
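
    Assuming the class is packaged into a jar (the jar name and paths below are hypothetical), the
    job takes two arguments, which become args[0] and args[1] in run():

        hadoop jar bank-service.jar com.bank.service.CnyDataFormatReplition /data/cleaned /data/deduped

    Because the parsed Configuration is handed to ToolRunner, generic options also take effect, e.g.
    -D mapreduce.task.io.sort.mb=200 to enlarge the map-side sort buffer mentioned in the comments.
    Reusing the reducer as the combiner is valid here because its input and output types are identical
    (Text / NullWritable) and the operation is idempotent; both properties are required of a combiner,
    since Hadoop may run it zero, one, or several times on each map's output.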

  • Original article: https://www.cnblogs.com/mengyao/p/4230062.html