  • A MapReduce implementation of the classic "people who viewed this item also viewed" application

     

    Input:

    date    cookie id    item id
    xx      xx           xx

    Output:

    item id    item id list (sorted by relevance, comma-separated)
    xx         xx

    For example:

    id1    id3,id0,id4,id2
    id2    id0,id5

    The whole computation takes four steps.

    1. Extract the date, cookie id, and item id from the raw logs, computed per day. The output format is:

    item id-0    item id-1
    xx           xx

    One optimization here: item id-0 is always smaller than item id-1, which halves the storage; the pairs are transposed back when the final result is assembled. The reducer also does local sorting and deduplication. (A minimal sketch of this pair generation follows the list.)

    2. Aggregate the previous step's output, again per day:

    item id-0    item id-1    association score (the number of users who viewed both items)
    xx           xx           xx

    3. Aggregate the most recent three months of data with time decay: the older a record, the less it contributes to the association score. Output the score for every pair of items (including the transposed pair).

    4. Pivot rows to columns to produce the final recommendation data, with each item's related items sorted by association score.
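
    As a rough sketch of step 1's pair generation (plain Java, no Hadoop; the item ids are invented for illustration):

    import java.util.Arrays;
    import java.util.TreeSet;

    public class PairDemo {
        public static void main(String[] args) {
            // One cookie's item views for one day, with a duplicate.
            TreeSet<String> items = new TreeSet<String>(
                    Arrays.asList("id3", "id1", "id3", "id2"));
            // Sorting and deduplicating guarantees the smaller id always
            // comes first in each emitted pair, mirroring the reducer.
            String[] a = items.toArray(new String[0]);
            for (int i = 0; i < a.length - 1; i++)
                for (int j = i + 1; j < a.length; j++)
                    System.out.println(a[i] + " " + a[j]);
            // Prints: id1 id2, id1 id3, id2 id3
        }
    }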

    The first MR

    import java.io.IOException;
    import java.util.ArrayList;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.log4j.Logger;

    /*
     * Input: raw data, may contain duplicates
     *   date  cookie  house id
     *
     * Output:
     *   date  house id1  house id2   // house id1 is always smaller than house id2;
     *                                // records are grouped by (date, cookie)
     */
    public class HouseMergeAndSplit {

        public static class Partitioner1 extends Partitioner<TextPair, Text> {
            @Override
            public int getPartition(TextPair key, Text value, int numPartition) {
                // Partition on (date, cookie) only, so every record of one
                // cookie on one day reaches the same reducer.
                return Math.abs((new Text(key.getFirst().toString() + key.getSecond().toString())).hashCode() * 127) % numPartition;
            }
        }

        // Grouping comparator: group on (date, cookie); the third field is
        // deliberately left out so one reduce() call sees all of a cookie's houses.
        public static class Comp1 extends WritableComparator {
            public Comp1() {
                super(TextPair.class, true);
            }
            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPair t1 = (TextPair) a;
                TextPair t2 = (TextPair) b;
                int comp = t1.getFirst().compareTo(t2.getFirst());
                if (comp != 0)
                    return comp;
                return t1.getSecond().compareTo(t2.getSecond());
            }
        }

        public static class TokenizerMapper
                extends Mapper<LongWritable, Text, TextPair, Text> {
            Text val = new Text("test");
            public void map(LongWritable key, Text value, Context context
                            ) throws IOException, InterruptedException {
                String s[] = value.toString().split("\001"); // assumes \001-separated fields (Hive's default delimiter)
                TextPair tp = new TextPair(s[0], s[1], s[4] + s[3]); // date, cookie, city+houseid
                context.write(tp, val);
            }
        }

        public static class IntSumReducer
                extends Reducer<TextPair, Text, Text, Text> {
            private static String comparedColumn[] = new String[3];
            ArrayList<String> houselist = new ArrayList<String>();
            private static Text keyv = new Text();
            private static Text valuev = new Text();
            static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());

            public void reduce(TextPair key, Iterable<Text> values,
                               Context context
                               ) throws IOException, InterruptedException {
                houselist.clear();
                String thedate = key.getFirst().toString();
                String cookie = key.getSecond().toString();

                for (int i = 0; i < 3; i++)
                    comparedColumn[i] = "";

                // (first, second) is the grouping key, so reduce() is invoked
                // once per (date, cookie). Iterating the values advances the
                // full key, so key.getThree() changes inside this loop.
                for (Text val : values) {
                    if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])
                            && !key.getThree().toString().equals(comparedColumn[2])) {
                        // Same (date, cookie) but a house id we have not seen yet.
                        houselist.add(key.getThree().toString());
                        comparedColumn[0] = key.getFirst().toString();
                        comparedColumn[1] = key.getSecond().toString();
                        comparedColumn[2] = key.getThree().toString();
                    }
                    if (!thedate.equals(comparedColumn[0]) || !cookie.equals(comparedColumn[1])) {
                        // First record of this group.
                        houselist.add(key.getThree().toString());
                        comparedColumn[0] = key.getFirst().toString();
                        comparedColumn[1] = key.getSecond().toString();
                        comparedColumn[2] = key.getThree().toString();
                    }
                }

                keyv.set(comparedColumn[0]); // the date

                // Emit every ordered pair of houses this cookie viewed that day.
                for (int i = 0; i < houselist.size() - 1; i++) {
                    for (int j = i + 1; j < houselist.size(); j++) {
                        valuev.set(houselist.get(i) + " " + houselist.get(j)); // related houses, single-space separated for the jobs downstream
                        context.write(keyv, valuev);
                    }
                }
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseMergeAndSplit <in> <out>");
                System.exit(2);
            }

            // Delete the output directory if it already exists.
            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseMergeAndSplit");
            job.setNumReduceTasks(4);
            job.setJarByClass(HouseMergeAndSplit.class);
            job.setMapperClass(TokenizerMapper.class);

            job.setMapOutputKeyClass(TextPair.class);
            job.setMapOutputValueClass(Text.class);
            // custom partitioner
            job.setPartitionerClass(Partitioner1.class);
            // group by (date, cookie) after partitioning
            job.setGroupingComparatorClass(Comp1.class);
            // reducer and its output types
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

    TextPair

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;

    public class TextPair implements WritableComparable<TextPair> {
        private Text first;
        private Text second;
        private Text three;

        public TextPair() {
            set(new Text(), new Text(), new Text());
        }
        public TextPair(String first, String second, String three) {
            set(new Text(first), new Text(second), new Text(three));
        }
        // Note: the parameter must be named "three"; the original code named it
        // "Three" and so passed the still-null field into set().
        public TextPair(Text first, Text second, Text three) {
            set(first, second, three);
        }
        public void set(Text first, Text second, Text three) {
            this.first = first;
            this.second = second;
            this.three = three;
        }
        public Text getFirst() {
            return first;
        }
        public Text getSecond() {
            return second;
        }
        public Text getThree() {
            return three;
        }
        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
            three.write(out);
        }
        public void readFields(DataInput in) throws IOException {
            first.readFields(in);
            second.readFields(in);
            three.readFields(in);
        }
        public int compareTo(TextPair tp) {
            int cmp = first.compareTo(tp.first);
            if (cmp != 0) {
                return cmp;
            }
            cmp = second.compareTo(tp.second);
            if (cmp != 0) {
                return cmp;
            }
            return three.compareTo(tp.three);
        }
    }
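
    One caveat not addressed in the original class: Writable keys that go through Hadoop's default HashPartitioner should override hashCode() and equals(). This pipeline supplies its own partitioner, so it works without them, but adding them is cheap insurance. A sketch of what that could look like (an assumption, not part of the original code):

    // Hypothetical additions to TextPair; not required by this pipeline's
    // custom partitioner, but needed if the default HashPartitioner is used.
    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode() * 31 + three.hashCode();
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof TextPair) {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second) && three.equals(tp.three);
        }
        return false;
    }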


    TextPairSecond

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;

    public class TextPairSecond implements WritableComparable<TextPairSecond> {
        private Text first;
        private FloatWritable second;

        public TextPairSecond() {
            set(new Text(), new FloatWritable());
        }
        public TextPairSecond(String first, float second) {
            set(new Text(first), new FloatWritable(second));
        }
        public TextPairSecond(Text first, FloatWritable second) {
            set(first, second);
        }
        public void set(Text first, FloatWritable second) {
            this.first = first;
            this.second = second;
        }
        public Text getFirst() {
            return first;
        }
        public FloatWritable getSecond() {
            return second;
        }
        public void write(DataOutput out) throws IOException {
            first.write(out);
            second.write(out);
        }
        public void readFields(DataInput in) throws IOException {
            first.readFields(in);
            second.readFields(in);
        }
        public int compareTo(TextPairSecond tp) {
            int cmp = first.compareTo(tp.first);
            if (cmp != 0) {
                return cmp;
            }
            return second.compareTo(tp.second);
        }
    }

    The second MR

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Counts how many times two houses are viewed together.
     * Input:
     *   date  house1  house2
     *
     * Output:
     *   date  house1  house2  co-occurrence count
     */
    public class HouseCount {

        public static class TokenizerMapper
                extends Mapper<LongWritable, Text, Text, IntWritable> {

            IntWritable iw = new IntWritable(1);
            public void map(LongWritable key, Text value, Context context
                            ) throws IOException, InterruptedException {
                // The whole input line "date house1 house2" serves as the key.
                context.write(value, iw);
            }
        }

        public static class IntSumReducer
                extends Reducer<Text, IntWritable, Text, IntWritable> {

            IntWritable result = new IntWritable();
            public void reduce(Text key, Iterable<IntWritable> values,
                               Context context
                               ) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable iw : values) {
                    sum += iw.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseCount <in> <out>");
                System.exit(2);
            }

            // Delete the output directory if it already exists.
            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseCount");
            job.setNumReduceTasks(2);
            job.setJarByClass(HouseCount.class);
            job.setMapperClass(TokenizerMapper.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // reducer and its output types
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
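
    Since this reducer is a plain associative sum (the classic word-count pattern), it could plausibly double as a combiner to shrink the shuffle. A one-line addition to main, assuming map-side pre-aggregation is acceptable here (this is not in the original code):

    // Hypothetical optimization: run the sum reducer map-side as a combiner.
    job.setCombinerClass(IntSumReducer.class);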


    The third MR

    import java.io.IOException;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Sums the last three months of co-occurrence counts with a decay factor,
     * then emits each pair (a, b) a second time transposed as (b, a).
     * Input:
     *   date  house1  house2  co-occurrence count
     *
     * Output:
     *   house1  house2  co-occurrence score (decayed; the factor differs per day)
     */
    public class HouseCountHz {

        public static class HouseCountHzMapper
                extends Mapper<LongWritable, Text, Text, FloatWritable> {

            Text keyv = new Text();
            FloatWritable valuev = new FloatWritable();

            public void map(LongWritable key, Text value, Context context
                            ) throws IOException, InterruptedException {
                String[] s = value.toString().split(" ");
                keyv.set(s[1] + " " + s[2]); // house1, house2

                Calendar date1 = Calendar.getInstance();
                Calendar d2 = Calendar.getInstance();
                Date b = null;
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
                try {
                    b = sdf.parse(s[0]);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
                d2.setTime(b);
                long n = date1.getTimeInMillis();
                long birth = d2.getTimeInMillis();
                long sss = n - birth;
                int day = (int) (sss / (3600 * 24 * 1000)); // days between the record's date and today
                float factor = 1 / (1 + (float) (day - 1) / 10); // decay factor
                valuev.set(Float.parseFloat(s[3]) * factor);

                context.write(keyv, valuev);
            }
        }

        public static class HouseCountHzReducer
                extends Reducer<Text, FloatWritable, Text, FloatWritable> {

            FloatWritable result = new FloatWritable();
            Text keyreverse = new Text();

            public void reduce(Text key, Iterable<FloatWritable> values,
                               Context context
                               ) throws IOException, InterruptedException {
                float sum = 0;
                for (FloatWritable iw : values) {
                    sum += iw.get();
                }
                result.set(sum);
                // Also emit the transposed pair, single-space separated so the
                // next job's split(" ") parses it correctly.
                String[] keys = key.toString().split(" ");
                keyreverse.set(keys[1] + " " + keys[0]);
                context.write(key, result);
                context.write(keyreverse, result);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseCountHz <in> <out>");
                System.exit(2);
            }

            // Delete the output directory if it already exists.
            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseCountHz");
            job.setNumReduceTasks(2);
            job.setJarByClass(HouseCountHz.class);
            job.setMapperClass(HouseCountHzMapper.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(FloatWritable.class);

            // reducer and its output types
            job.setReducerClass(HouseCountHzReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(FloatWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
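
    For intuition about the decay curve, a standalone sketch of the factor 1 / (1 + (day - 1) / 10) used in the mapper above (the class name DecayDemo is invented for illustration):

    public class DecayDemo {
        public static void main(String[] args) {
            // Print the decay factor for a few record ages (in days).
            for (int day : new int[] {1, 11, 31, 91}) {
                float factor = 1 / (1 + (float) (day - 1) / 10);
                System.out.println(day + " day(s) old -> factor " + factor);
            }
            // day 1 -> 1.0, day 11 -> 0.5, day 31 -> 0.25, day 91 -> 0.1
        }
    }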


    The fourth MR

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.FloatWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Partitioner;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    /*
     * Input:
     *   house1  house2  co-occurrence score
     *
     * Output:
     *   house1  house2,house3,house4  (sorted by score)
     */
    public class HouseRowToCol {

        // Partitioning: must hash only the house id, so that all scores for one
        // house id land in the same reducer (the grouping key is the id alone;
        // the original hashed id+score, which can split a group across reducers).
        public static class Partitioner1 extends Partitioner<TextPairSecond, Text> {
            @Override
            public int getPartition(TextPairSecond key, Text value, int numPartition) {
                return Math.abs(key.getFirst().hashCode() * 127) % numPartition;
            }
        }

        // Grouping: group on the house id alone, ignoring the score.
        public static class Comp1 extends WritableComparator {
            public Comp1() {
                super(TextPairSecond.class, true);
            }
            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPairSecond t1 = (TextPairSecond) a;
                TextPairSecond t2 = (TextPairSecond) b;
                return t1.getFirst().compareTo(t2.getFirst());
            }
        }

        // Sorting: ascending by house id, then descending by score.
        public static class KeyComp extends WritableComparator {
            public KeyComp() {
                super(TextPairSecond.class, true);
            }
            @SuppressWarnings("unchecked")
            public int compare(WritableComparable a, WritableComparable b) {
                TextPairSecond t1 = (TextPairSecond) a;
                TextPairSecond t2 = (TextPairSecond) b;
                int comp = t1.getFirst().compareTo(t2.getFirst());
                if (comp != 0)
                    return comp;
                return -t1.getSecond().compareTo(t2.getSecond());
            }
        }

        public static class HouseRowToColMapper
                extends Mapper<LongWritable, Text, TextPairSecond, Text> {

            Text houseid1 = new Text();
            Text houseid2 = new Text();
            FloatWritable weight = new FloatWritable();

            public void map(LongWritable key, Text value, Context context
                            ) throws IOException, InterruptedException {
                String s[] = value.toString().split(" ");
                weight.set(Float.parseFloat(s[2]));
                houseid1.set(s[0]);
                houseid2.set(s[1]);
                TextPairSecond tp = new TextPairSecond(houseid1, weight);
                context.write(tp, houseid2);
            }
        }

        public static class HouseRowToColReducer
                extends Reducer<TextPairSecond, Text, Text, Text> {

            Text valuev = new Text();

            public void reduce(TextPairSecond key, Iterable<Text> values,
                               Context context
                               ) throws IOException, InterruptedException {
                Text keyv = key.getFirst();
                // The values arrive already sorted by descending score, so
                // joining them yields the ranked recommendation list.
                Iterator<Text> it = values.iterator();
                StringBuilder sb = new StringBuilder(it.next().toString());
                while (it.hasNext()) {
                    sb.append(",").append(it.next().toString());
                }
                valuev.set(sb.toString());
                context.write(keyv, valuev);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: HouseRowToCol <in> <out>");
                System.exit(2);
            }

            // Delete the output directory if it already exists.
            FileSystem fstm = FileSystem.get(conf);
            Path outDir = new Path(otherArgs[1]);
            fstm.delete(outDir, true);

            conf.set("mapred.textoutputformat.separator", " "); // separator between key and value in the reduce output
            Job job = new Job(conf, "HouseRowToCol");
            job.setNumReduceTasks(4);
            job.setJarByClass(HouseRowToCol.class);
            job.setMapperClass(HouseRowToColMapper.class);

            job.setMapOutputKeyClass(TextPairSecond.class);
            job.setMapOutputValueClass(Text.class);
            // custom partitioner
            job.setPartitionerClass(Partitioner1.class);
            // group on the house id after partitioning
            job.setGroupingComparatorClass(Comp1.class);
            // secondary sort: scores descending within each house id
            job.setSortComparatorClass(KeyComp.class);
            // reducer and its output types
            job.setReducerClass(HouseRowToColReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
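
    To see the secondary sort at work outside Hadoop, a small local sketch (the class name KeyCompDemo is made up) that orders a few keys with KeyComp: each house id groups together with its scores descending, which is exactly the order the reducer consumes:

    import java.util.Arrays;

    public class KeyCompDemo {
        public static void main(String[] args) {
            TextPairSecond[] keys = {
                new TextPairSecond("id1", 0.5f),
                new TextPairSecond("id2", 1.0f),
                new TextPairSecond("id1", 2.0f),
            };
            // KeyComp sorts ascending by house id, then descending by score.
            Arrays.sort(keys, new HouseRowToCol.KeyComp());
            for (TextPairSecond k : keys)
                System.out.println(k.getFirst() + " " + k.getSecond());
            // Expected output: id1 2.0 / id1 0.5 / id2 1.0
        }
    }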
    Source: http://blog.csdn.net/u011750989/article/details/12004065