zoukankan      html  css  js  c++  java
  • MapReduce_dedup

     1 package MapReduce;
     2 
     3 import java.io.IOException;
     4 import java.net.URI;
     5 
     6 import org.apache.hadoop.conf.Configuration;
     7 import org.apache.hadoop.fs.FileSystem;
     8 import org.apache.hadoop.fs.Path;
     9 //import org.apache.hadoop.io.IntWritable;
    10 import org.apache.hadoop.io.Text;
    11 import org.apache.hadoop.mapreduce.Job;
    12 import org.apache.hadoop.mapreduce.Mapper;
    13 import org.apache.hadoop.mapreduce.Reducer;
    14 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    16 //import org.apache.hadoop.util.GenericOptionsParser;
    17  
    18 public class Dedup {
    19     private static final String INPUT_PATH = "hdfs://h201:9000/user/hadoop/input";
    20     private static final String OUTPUT_PATH = "hdfs://h201:9000/user/hadoop/output";
    21     //map将输入中的value复制到输出数据的key上,并直接输出
    22     public static class Map extends Mapper<Object,Text,Text,Text>{
    23         private static Text line=new Text();//每行数据     
    24         //实现map函数
    25         public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
    26             line=value;
    27             context.write(line, new Text(""));
    28         }       
    29     }
    30     //reduce将输入中的key复制到输出数据的key上,并直接输出
    31     public static class Reduce extends Reducer<Text,Text,Text,Text>{
    32         //实现reduce函数
    33         public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
    34             context.write(key, new Text(""));//行去重
    35         }       
    36     }
    37     public static void main(String[] args) throws Exception{
    38         Configuration conf = new Configuration();
    39         //这句话很关键
    40         conf.set("mapred.jar","Dedup.jar");
    41         final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);//读路径信息
    42         fileSystem.delete(new Path(OUTPUT_PATH), true);//删除路径信息 输出路径不能存在
    43         //String[] ioArgs=new String[]{"dedup_in","dedup_out"};//dedup_in在/user/hadoop下ioArgs对应的是括号内的
    44         //String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();//把ioArgs赋给oherArgs,引用Generic这个类,调用getRemain这个方法 conf代表的是初始化
    45         //if (otherArgs.length != 2) {
    46             //System.err.println("Usage: Data Deduplication <in> <out>");
    47             //System.exit(2);
    48             //}
    49         final Job job = new Job(conf, WordCountApp.class.getSimpleName());
    50         job.setJarByClass(Dedup.class); //启动job
    51         FileInputFormat.setInputPaths(job, INPUT_PATH);
    52         //设置Map、Combine和Reduce处理类
    53         job.setMapperClass(Map.class);
    54         job.setCombinerClass(Reduce.class);
    55         job.setReducerClass(Reduce.class);    
    56          //设置输出类型
    57         job.setOutputKeyClass(Text.class);
    58         job.setOutputValueClass(Text.class);    
    59          //设置输入和输出目录
    60         FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
    61         System.exit(job.waitForCompletion(true) ? 0 : 1);
    62         }
    63 }

    结果

    [hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/output/part-r-00000
    18/03/18 22:02:59 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    ayi mama
    ayi shushu
    cai wen wei
    didi mama
    gege jiejie didi
    hello baba
    hello mama
    hello word
    jiejie hello
    mama baba jiejie gege
    mama jiejie
    meimei jiejie

    原始数据

    [hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/input/counttext1.txt
    18/03/18 22:07:08 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    hello baba
    hello mama
    mama jiejie
    jiejie hello
    [hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/input/counttext.txt
    18/03/18 22:07:18 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    hello mama
    hello baba
    hello word
    cai wen wei
    mama baba jiejie gege
    gege jiejie didi
    meimei jiejie
    didi mama
    ayi shushu
    ayi mama
    hello mama
    hello baba
    hello word
    cai wen wei
    mama baba jiejie gege
    gege jiejie didi
    meimei jiejie
    didi mama
    ayi shushu
    ayi mama
    hello mama
    hello baba
    hello word
    cai wen wei
    mama baba jiejie gege
    gege jiejie didi
    meimei jiejie
    didi mama
    ayi shushu
    ayi mama
    hello mama
    hello baba
    hello word
    cai wen wei
    mama baba jiejie gege
    gege jiejie didi
    meimei jiejie
    didi mama
    ayi shushu
    ayi mama
    hello mama
    hello baba
    hello word
    cai wen wei
    mama baba jiejie gege
    gege jiejie didi
    meimei jiejie
    didi mama
    ayi shushu
    ayi mama

  • 相关阅读:
    我在项目内使用了设计模式后,同事直呼看不懂
    pom文件中依赖找不到的根本解决方法
    基于session的传统认证授权详解
    python中2个字典比较
    编码设计应遵循的规则
    yarn任务执行流程
    python3 中print 显示不全问题
    pandas 可视化
    python时间大小判断,相差天数秒数计算
    Impala任务程序cancle
  • 原文地址:https://www.cnblogs.com/jieran/p/8597857.html
Copyright © 2011-2022 走看看