hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹

zoukankan html css js c++ java

hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹
hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹
博客分类：http://tydldd.iteye.com/blog/2053867

hadoop
hadoop1.2.1中使用MultipleOutputs将结果输出到多个文件或文件夹

使用步骤主要有三步：

1、在reduce或map类中创建MultipleOutputs对象，将结果输出

Java代码

class reduceStatistics extends Reducer<Text, IntWritable, Text, IntWritable>{



    //将结果输出到多个文件或多个文件夹

    private MultipleOutputs<Text,IntWritable> mos;

    //创建对象

    protected void setup(Context context) throws IOException,InterruptedException {

        mos = new MultipleOutputs<Text, IntWritable>(context);

     }



        //关闭对象

    protected void cleanup(Context context) throws IOException,InterruptedException {

        mos.close();

    }

}

2、在map或reduce方法中使用MultipleOutputs对象输出数据，代替context.write()

Java代码

/**
 * Sums the counts received for one word and writes the total through the
 * {@code mos} MultipleOutputs object instead of {@code context.write()}.
 *
 * Only the keys "hello", "world" and "hadoop" are written; any other key is
 * summed and printed but not written to any output.
 *
 * @param key     the word
 * @param values  the counts emitted for this word
 * @param context the reducer context (not written to directly)
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable value : values) {
        sum += value.get();
    }

    String word = key.toString();
    System.out.println("word:" + word + "     sum = " + sum);

    IntWritable V = new IntWritable(sum);

    // Route the record to the named output that matches the word.
    if (word.equals("hello")) {
        mos.write("hello", key, V);
    } else if (word.equals("world")) {
        mos.write("world", key, V);
    } else if (word.equals("hadoop")) {
        // Write to the file hadoop/hadoopfile-r-00000.
        // The framework appends "-r-<partition>" to the base output path, so the
        // base path must carry the file-name prefix; a bare "hadoop/" would
        // produce hadoop/-r-00000 instead.
        mos.write("hadoopfile", key, V, "hadoop/hadoopfile");
    }
}

3、在创建job时，定义附加的输出文件，这里的命名输出（named output）名称必须与第二步中mos.write()使用的名称一致

Java代码

//定义附加的输出文件

            MultipleOutputs.addNamedOutput(job,"hello",TextOutputFormat.class,Text.class,IntWritable.class);

            MultipleOutputs.addNamedOutput(job,"world",TextOutputFormat.class,Text.class,IntWritable.class);

            MultipleOutputs.addNamedOutput(job,"hadoopfile",TextOutputFormat.class,Text.class,IntWritable.class);

完整代码：

Java代码

package com.ru.hadoop.wordcount;



import java.io.IOException;

import java.net.URI;

import java.net.URISyntaxException;

import java.util.regex.Pattern;



import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapred.RecordWriter;

import org.apache.hadoop.mapred.lib.MultipleOutputFormat;

import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.Progressable;



/**
 * Driver for the word-count job that writes its results through
 * {@link MultipleOutputs} named outputs.
 *
 * The input and output locations are hard-coded in {@link #main(String[])}.
 */
public class WordCount2 extends Configured{

    /**
     * Deletes the previous output directory, configures the job and runs it.
     * The process exits with status 1 if the job fails or an exception is thrown.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String in = "/home/nange/work/test/word/";
        String out = "hdfs://localhost:9000/hdfs/test/wordcount/out/";

        try {
            // Delete the HDFS output directory before submitting the job.
            WordCount2 wc2 = new WordCount2();
            wc2.removeDir(out);

            Job job = new Job(new Configuration(), "wordcount Job");
            // Tell Hadoop which jar holds the mapper/reducer classes so that
            // tasks running on a cluster can load them.
            job.setJarByClass(WordCount2.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(mapperString.class);
            // NOTE: reduceStatistics is deliberately not set as a combiner; it never
            // calls context.write(), so as a combiner it would forward nothing.
            job.setReducerClass(reduceStatistics.class);

            // Register the additional named outputs used by the reducer.
            MultipleOutputs.addNamedOutput(job,"hello",TextOutputFormat.class,Text.class,IntWritable.class);
            MultipleOutputs.addNamedOutput(job,"world",TextOutputFormat.class,Text.class,IntWritable.class);
            MultipleOutputs.addNamedOutput(job,"hadoopfile",TextOutputFormat.class,Text.class,IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(in));
            FileOutputFormat.setOutputPath(job, new Path(out));

            // Surface a failed job to the caller instead of exiting with status 0.
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        } catch (IOException e) {
            fail(e);
        } catch (URISyntaxException e) {
            fail(e);
        } catch (ClassNotFoundException e) {
            fail(e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before terminating.
            Thread.currentThread().interrupt();
            fail(e);
        }
    }

    /** Prints the stack trace of {@code e} and terminates the process with status 1. */
    private static void fail(Exception e) {
        e.printStackTrace();
        System.exit(1);
    }

    /**
     * Recursively deletes the given path on the HDFS instance at hdfs://localhost:9000.
     * The result of the delete call is ignored.
     *
     * @param filePath path of the file or directory to delete
     * @throws IOException        if the file system cannot be reached or the delete fails
     * @throws URISyntaxException if the file system URI cannot be parsed
     */
    public void removeDir(String filePath) throws IOException, URISyntaxException{
        String url = "hdfs://localhost:9000";
        // NOTE(review): the FileSystem is not closed here; FileSystem.get presumably
        // returns a shared cached instance that the job still needs - confirm.
        FileSystem fs  = FileSystem.get(new URI(url), new Configuration());
        // Explicit recursive delete replaces the deprecated single-argument overload.
        fs.delete(new Path(filePath), true);
    }
}





/**
 * Mapper that overrides the map method used by the map task: it splits each
 * input line on single spaces and emits (word, 1) for every non-empty token.
 *
 * @author nange
 */
class mapperString extends Mapper<LongWritable, Text, Text, IntWritable>{

    // Pre-compiled form of the delimiter regular expression.
    public static final Pattern PATTERN = Pattern.compile(" ");

    // Reused output key; overwritten for every emitted word.
    private final Text K = new Text();

    // Constant output value: each occurrence counts as 1.
    private final IntWritable V = new IntWritable(1);

    /**
     * Emits (word, 1) for each non-empty space-separated token of the line.
     *
     * @param key     the input key (unused)
     * @param value   the input line
     * @param context the mapper context that receives the (word, 1) pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        System.out.println("********" + line);

        for (String word : PATTERN.split(line)) {
            // Consecutive or leading spaces and blank lines yield empty tokens;
            // they are not words and must not be counted.
            if (word.isEmpty()) {
                continue;
            }
            K.set(word);
            context.write(K, V);
        }
    }
}



/**
 * Reducer that computes per-word totals and writes them through
 * {@link MultipleOutputs} instead of {@code context.write()}.
 *
 * Only the keys "hello", "world" and "hadoop" are written; any other key is
 * summed and printed but not written to any output.
 *
 * @author nange
 */
class reduceStatistics extends Reducer<Text, IntWritable, Text, IntWritable>{

    // Used to write results to multiple files or multiple directories.
    private MultipleOutputs<Text,IntWritable> mos;

    /**
     * Creates the MultipleOutputs object from the task context.
     *
     * @param context the reducer context passed in by the framework
     */
    @Override
    protected void setup(Context context) throws IOException,InterruptedException {
        mos = new MultipleOutputs<Text, IntWritable>(context);
    }

    /**
     * Sums the counts received for one word and routes the total to the
     * named output that matches the word.
     *
     * @param key     the word
     * @param values  the counts emitted for this word
     * @param context the reducer context (not written to directly)
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }

        String word = key.toString();
        System.out.println("word:" + word + "     sum = " + sum);

        IntWritable V = new IntWritable(sum);

        // Write the record through the MultipleOutputs object.
        if (word.equals("hello")) {
            mos.write("hello", key, V);
        } else if (word.equals("world")) {
            mos.write("world", key, V);
        } else if (word.equals("hadoop")) {
            // Write to the file hadoop/hadoopfile-r-00000.
            // The framework appends "-r-<partition>" to the base output path, so
            // the base path must carry the file-name prefix; a bare "hadoop/"
            // would produce hadoop/-r-00000 instead.
            mos.write("hadoopfile", key, V, "hadoop/hadoopfile");
        }
    }

    /**
     * Closes the MultipleOutputs object.
     *
     * @param context the reducer context passed in by the framework
     */
    @Override
    protected void cleanup(Context context) throws IOException,InterruptedException {
        mos.close();
    }
}
查看全文

相关阅读:
RAID10磁盘阵列损坏的修复
 Linux系统中物理劵增加、删除；卷组的扩容、缩容；逻辑卷的增加与删除
 Ubuntu alternate和desktop区别 zz
freecommander 快捷键列表 zz
调试小技巧
 Java框架
 获取url的文件名（动态改变css）
Urlrewrite方法集
 NVelocity模板引擎,初级体验,非常有用的东东.（转）
CodeSmith&NetTiers Step by Step[转]

原文地址：https://www.cnblogs.com/luolizhi/p/4931561.html