zoukankan      html  css  js  c++  java
  • Data deduplication

    题目:Data deduplication

    描述

    你的程序要求读入输入文件,在去掉所有数据中的重复数据后输出结果。在输入文件中每一行是一个元数据。

    输入

    输入是一组文本文件,在每个输入文件中每一行是一个数据。每一个元数据都是一个字符串。

    输出文件

    输出文件的每一行都是在输入文件中出现过的一个数据,并且输出文件中的每一行都不相同。

    输入样例

    input1:
    2006-6-9 a
    2006-6-10 b
    2006-6-11 c
    2006-6-12 d
    2006-6-13 a
    2006-6-14 b
    2006-6-15 c
    2006-6-11 c
    input2:
    2006-6-9 b
    2006-6-10 a
    2006-6-11 b
    2006-6-12 d
    2006-6-13 a
    2006-6-14 c
    2006-6-15 d
    2006-6-11 c

    输出样例:
    2006-6-10 a 
    2006-6-10 b 
    2006-6-11 b 
    2006-6-11 c 
    2006-6-12 d 
    2006-6-13 a 
    2006-6-14 b 
    2006-6-14 c 
    2006-6-15 c 
    2006-6-15 d 
    2006-6-9 a 
    2006-6-9 b

    注意:
    1 输出结果是按照字典顺序排序的;
    2 每一行都是一个元数据;
    3 重复数据在输出文件中只输出一次。

    整整干了一下午加晚上,悲催的。。。。

    程序应该有漏洞,只是在单机下小数据文件可以测试通过。欢迎指正~~

    代码如下:

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;


    /**
     * A composite Writable key holding two {@link Text} fields. Sorting compares
     * the first field and breaks ties on the second, which lets the job use the
     * full pair for sorting while grouping reducers on the first field only.
     */
    public class TextPair implements WritableComparable<TextPair> {

        public Text first;
        public Text second;

        /** Creates a pair with two empty Text fields (required by Hadoop reflection). */
        public TextPair() {
            this.first = new Text();
            this.second = new Text();
        }

        /** Wraps the two given Text objects directly (no defensive copy). */
        public TextPair(Text first, Text second) {
            this.first = first;
            this.second = second;
        }

        /** Convenience constructor building the Text fields from Strings. */
        public TextPair(String first, String second) {
            this(new Text(first), new Text(second));
        }

        public Text getFirst() {
            return first;
        }

        public void setFirst(Text first) {
            this.first = first;
        }

        public Text getSecond() {
            return second;
        }

        public void setSecond(Text second) {
            this.second = second;
        }

        /** Replaces both fields at once (stores the references, no copy). */
        public void set(Text first, Text second) {
            this.first = first;
            this.second = second;
        }

        @Override
        public int hashCode() {
            // 163 is an arbitrary odd multiplier spreading the two component hashes.
            return first.hashCode() * 163 + second.hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof TextPair)) {
                return false;
            }
            TextPair other = (TextPair) obj;
            return first.equals(other.getFirst()) && second.equals(other.getSecond());
        }

        @Override
        public String toString() {
            // Tab-separated, matching the default TextOutputFormat separator.
            return first + "\t" + second;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // Deserialize in the same field order as write().
            first.readFields(in);
            second.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
        }

        @Override
        public int compareTo(TextPair tp) {
            // Primary order on first, tie-break on second.
            int byFirst = first.compareTo(tp.getFirst());
            return byFirst != 0 ? byFirst : second.compareTo(tp.getSecond());
        }
    }

    import org.apache.hadoop.io.Text;

    import org.apache.hadoop.mapreduce.Partitioner;


    /**
     * Partitions map output by the first half of the {@link TextPair} key only,
     * so every record sharing the same first field lands on the same reducer
     * (the grouping comparator then merges them into one reduce call).
     */
    public class FirstPartitioner extends Partitioner<TextPair, Text> {

        /**
         * @param key           composite map-output key; only its first field matters
         * @param value         map-output value (unused)
         * @param numPartitions number of reduce tasks
         * @return a partition index in [0, numPartitions)
         */
        @Override
        public int getPartition(TextPair key, Text value, int numPartitions) {
            // Masking with Integer.MAX_VALUE clears the sign bit, so the result is
            // already non-negative; the original Math.abs wrapper was redundant
            // (and Math.abs(Integer.MIN_VALUE) would itself be negative anyway).
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;


    /**
     * Grouping comparator that considers two {@link TextPair} keys equal when
     * their first fields match, so all values for one first-field reach a single
     * reduce() invocation regardless of the second field.
     */
    public class GroupComparator extends WritableComparator {

        public GroupComparator() {
            // 'true' asks the parent to instantiate key objects, so compare()
            // below receives fully deserialized TextPair instances.
            super(TextPair.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Group solely on the first half of each pair.
            Text left = ((TextPair) a).getFirst();
            Text right = ((TextPair) b).getFirst();
            return left.compareTo(right);
        }
    }

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.fs.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.Mapper;

    /**
     * Splits each input line into "&lt;first&gt; &lt;second&gt;" tokens and emits a
     * composite {@code TextPair(first, second)} key together with the second
     * token as the value. The secondary-sort key lets the shuffle order values
     * before the reducer deduplicates them.
     */
    public class DatadeduplicationMapper extends Mapper<LongWritable, Text, TextPair, Text> {

        /**
         * @param key     byte offset of the line in the input split (unused)
         * @param value   one raw input line
         * @param context typed map context for emitting (TextPair, Text) records
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // NOTE: the parameter is now the typed inner Context; the original used
            // the raw org.apache.hadoop.mapreduce.Mapper.Context, discarding generics.
            StringTokenizer tokens = new StringTokenizer(value.toString());

            // Skip blank or malformed lines instead of crashing: the original
            // filled a fixed String[2] and threw ArrayIndexOutOfBoundsException
            // on any line with more than two tokens, and NPE'd on empty lines.
            if (tokens.countTokens() < 2) {
                return;
            }
            String first = tokens.nextToken();
            String second = tokens.nextToken();

            context.write(new TextPair(first, second), new Text(second));
        }
    }

    import java.awt.List;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.LinkedHashSet;
    import java.util.Set;
    import java.util.Vector;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;


    /**
     * For each grouping key (grouped on the pair's first field by
     * {@code GroupComparator}) emits every distinct value exactly once,
     * keyed by the first field of the composite key.
     */
    public class DatadeduplicationReducer extends
            Reducer<TextPair, Text, Text, Text> {

        /**
         * @param key     composite key; only its first field is written out
         * @param values  all values sharing the same first field
         * @param context reduce context receiving (first, value) output records
         */
        @Override
        protected void reduce(TextPair key, Iterable<Text> values,
                Context context)
                throws IOException, InterruptedException {
            // LinkedHashSet gives O(1) duplicate checks — the original scanned a
            // Vector with contains() on every value (O(n^2) per group) and also
            // allocated an unused raw HashSet. Insertion order is preserved, so
            // the emitted lines are identical to the original implementation's.
            Set<String> distinct = new LinkedHashSet<String>();
            for (Text value : values) {
                distinct.add(value.toString());
            }
            for (String datum : distinct) {
                context.write(key.getFirst(), new Text(datum));
            }
        }
    }

    import java.io.IOException;



    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


    /**
     * Job driver for the deduplication MapReduce: wires the mapper, partitioner,
     * grouping comparator and reducer, then runs the job.
     */
    public class Datadeduplication {

        /**
         * @param args args[0] = input path, args[1] = output path
         * @throws IOException            on job-submission I/O errors
         * @throws ClassNotFoundException if a configured class cannot be loaded
         * @throws InterruptedException   if the waiting thread is interrupted
         */
        public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
            // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
            if (args.length < 2) {
                System.err.println("Usage: Datadeduplication <input path> <output path>");
                System.exit(2);
            }

            // Job.getInstance() replaces the deprecated 'new Job()' constructor.
            Job job = Job.getInstance();
            // Without setJarByClass the job jar is not shipped to the cluster and
            // the tasks fail with ClassNotFoundException outside local mode.
            job.setJarByClass(Datadeduplication.class);
            job.setJobName("Datadeduplication");

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            job.setMapperClass(DatadeduplicationMapper.class);
            job.setMapOutputKeyClass(TextPair.class);
            job.setMapOutputValueClass(Text.class);

            // Partition on the first field, group reduce calls on the first field;
            // the default sort on the full TextPair orders values within a group.
            job.setPartitionerClass(FirstPartitioner.class);
            job.setGroupingComparatorClass(GroupComparator.class);

            job.setReducerClass(DatadeduplicationReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // Report progress (verbose=true) and propagate success/failure to the
            // shell; the original always exited 0 even when the job failed.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }



              



  • 相关阅读:
    spring 09-Spring框架基于QuartZ实现调度任务
    spring 08-Spring框架Resource资源注入
    spring 07-Spring框架Resource读取不同资源
    spring 06-Spring框架基于Annotation的依赖注入配置
    html 默认跳转
    poi 设置样式
    支付宝扫码支付回调验证签名
    构造器初始化
    cxf webservice
    CSS3 border-image 属性
  • 原文地址:https://www.cnblogs.com/dlutxm/p/2192154.html
Copyright © 2011-2022 走看看