zoukankan      html  css  js  c++  java
  • MR案例:单表关联查询

    "单表关联"这个实例要求从给出的数据中寻找所关心的数据,它是对原始数据所包含信息的挖掘。

    需求:实例中给出 child-parent(孩子—父母)表,要求输出 grandchild-grandparent(孙子—爷奶)表。

    package test;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * 输入:
     * child        parent
     * 张三            张三的爸爸
     * 张三的爸爸        张三的爷爷
     * 
     * 输出:
     * grandChiled    grandFather
     * 张三            张三的爷爷 
     */
    public class MySingle {
    
        public static void main(String[] args) throws Exception {
            
            //配置环境变量
            System.setProperty("hadoop.home.dir", "F:\JAVA\hadoop-2.2.0");
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(MySingle.class);
    
            job.setMapperClass(STMapper.class);
            job.setReducerClass(STReducer.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
    
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            System.exit(job.waitForCompletion(true) ? 0 : -1);
        }
    
        public static class STMapper extends Mapper<LongWritable, Text, Text, Text>{
            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
    
                String[] splited = value.toString().split(" ");
                if(splited.length >= 2){
    
                    //正向输出,value即 父亲前加符号"-"            
                    context.write(new Text(splited[0]), new Text("-"+splited[1]));
    
                    //反向输出
                    context.write(new Text(splited[1]), new Text(splited[0]));
                }
            }
        }
    
        public static class STReducer extends Reducer<Text, Text, Text, Text>{
            @Override
            protected void reduce(Text key, Iterable<Text> v2s,Context context)
                    throws IOException, InterruptedException {
    
                List<String> grandChild=new ArrayList<String>();
                List<String> grandParent=new ArrayList<String>();
    
                for(Text text : v2s){
    
                    //以"-"开始则是key的父亲
                    if(text.toString().startsWith("-")){
                        
                        //将可能成为爷爷的变量存储到grandParent集合中去
                        grandParent.add(text.toString().substring(1));            
                    }else {
    
                        grandChild.add(text.toString());                    
                    }
                }
                /**
                 * 【关键的判断】
                 * 当前输入的key既有儿子又有父亲
                 */
                if(grandChild.size()!=0 && grandParent.size()!=0){
    
                    for(int i=0;i<grandChild.size();i++){
                        for(int j=0;j<grandParent.size();j++){
                            
                            //key:孙子 value:爷爷
                            context.write(new Text(grandChild.get(i)), new Text(grandParent.get(j)));
                        }
                    }                
                }
            }
        }
    }
    • 在reduce阶段,将两种Value分别存储到两个集合中:key的孩子存入grandchild集合,key的父母存入grandparent集合
    • 对于reduce阶段的key,只有当他既有儿子又有父亲时,他才可以使得grandchild和grandparent两集合都不为空
  • 相关阅读:
    Forword: ssh server for windows
    多浏览器测试
    GMT PST Beijing 时间
    Localization process
    【转】CentOS下配置PXE+Kickstart无人值守安装(Howto install CentOS through PXE+KickStart)
    CI hudson 远程部署
    s3cmd 安装使用指南
    Git
    关于Request.Form获取listbox所有项的问题
    Assembly.LoadFrom 与Assembly.Load 与 Assembly.LoadFile
  • 原文地址:https://www.cnblogs.com/skyl/p/4732327.html
Copyright © 2011-2022 走看看