zoukankan      html  css  js  c++  java
  • 求亿级记录中搜索次数Top N的搜索词(MapReduce实现)

    程序示例:

    日志信息(每行格式: 搜索词<Tab>搜索次数):
    二手车	1345
    二手房	3416
    洗衣机	2789
    输入: N=2
    输出: 二手房 洗衣机

    map函数如下:

    import java.io.IOException;
    import java.util.Map;
    import java.util.TreeMap;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    /**
     * Mapper that keeps the {@code N} highest-count search words seen by this
     * map task and emits them once, from {@link #cleanup}.
     *
     * <p>Each input line is expected to be {@code word<TAB>count}. Each emitted
     * value is the text {@code "count word"} under a {@link NullWritable} key.
     *
     * <p>Candidates are ordered by count and then by word, so words that share
     * a count are kept as separate candidates instead of overwriting each
     * other. An identical (count, word) pair seen twice is kept only once.
     */
    public class TopNMapper extends Mapper<Object, Text, NullWritable, Text> {
        /** Maximum number of candidates retained by this map task. */
        private static final int N = 10;

        /** Current candidates, smallest first; the value is the search word. */
        private final TreeMap<RankKey, String> top = new TreeMap<RankKey, String>();

        /**
         * Parses one {@code word<TAB>count} line and offers it as a candidate.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one input line
         * @param context task context (unused here; output happens in cleanup)
         * @throws ArrayIndexOutOfBoundsException if the line has no tab-separated second field
         * @throws NumberFormatException          if the second field is not an integer
         */
        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            String word = fields[0];
            int count = Integer.parseInt(fields[1]);

            // A fresh immutable key per record: reusing one mutable Writable as
            // the map key would make every entry alias the same object.
            top.put(new RankKey(count, word), word);
            if (top.size() > N) {
                // Evict the current smallest candidate.
                top.pollFirstEntry();
            }
        }

        /**
         * Emits the retained candidates as {@code "count word"}, in ascending order.
         *
         * @param context task context used to write the output records
         */
        @Override
        protected void cleanup(
                Mapper<Object, Text, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            for (Map.Entry<RankKey, String> entry : top.entrySet()) {
                Text out = new Text(entry.getKey().count + " " + entry.getValue());
                context.write(NullWritable.get(), out);
            }
        }

        /** Immutable sort key: ascending by count, ties broken by word. */
        private static final class RankKey implements Comparable<RankKey> {
            private final int count;
            private final String word;

            RankKey(int count, String word) {
                this.count = count;
                this.word = word;
            }

            @Override
            public int compareTo(RankKey other) {
                if (count != other.count) {
                    return count < other.count ? -1 : 1;
                }
                return word.compareTo(other.word);
            }

            @Override
            public boolean equals(Object obj) {
                if (this == obj) {
                    return true;
                }
                if (!(obj instanceof RankKey)) {
                    return false;
                }
                RankKey other = (RankKey) obj;
                return count == other.count && word.equals(other.word);
            }

            @Override
            public int hashCode() {
                return 31 * count + word.hashCode();
            }
        }
    }

    Reduce函数如下:

    import java.io.IOException;
    import java.util.TreeMap;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    /**
     * Reducer that merges {@code "count word"} candidate records and writes the
     * {@code N} words with the highest counts, highest count first.
     *
     * <p>Candidates are ordered by count and then by word, so words that share
     * a count are kept as separate candidates instead of overwriting each
     * other. An identical (count, word) pair received twice is kept only once.
     */
    public class TopNReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
        /** Maximum number of words written per reduce call. */
        private static final int N = 10;

        /**
         * Selects the top {@code N} candidates from {@code values} and writes
         * only their words, in descending order of count (ties: descending word).
         *
         * @param key     the shared {@link NullWritable} key
         * @param values  candidate records, each formatted as {@code "count word"}
         * @param context task context used to write the output records
         * @throws ArrayIndexOutOfBoundsException if a record contains no space separator
         * @throws NumberFormatException          if the leading field is not an integer
         */
        @Override
        protected void reduce(NullWritable key, Iterable<Text> values,
                Reducer<NullWritable, Text, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // Smallest candidate first; the value is the search word.
            TreeMap<RankKey, String> top = new TreeMap<RankKey, String>();

            for (Text val : values) {
                // Limit of 2: only the first space separates count from word,
                // so a search word that itself contains spaces stays intact.
                String[] parts = val.toString().split(" ", 2);
                int count = Integer.parseInt(parts[0]);
                String word = parts[1];

                // A fresh immutable key per record: reusing one mutable Writable
                // as the map key would make every entry alias the same object.
                top.put(new RankKey(count, word), word);
                if (top.size() > N) {
                    // Evict the current smallest candidate.
                    top.pollFirstEntry();
                }
            }

            for (String word : top.descendingMap().values()) {
                context.write(NullWritable.get(), new Text(word));
            }
        }

        /** Immutable sort key: ascending by count, ties broken by word. */
        private static final class RankKey implements Comparable<RankKey> {
            private final int count;
            private final String word;

            RankKey(int count, String word) {
                this.count = count;
                this.word = word;
            }

            @Override
            public int compareTo(RankKey other) {
                if (count != other.count) {
                    return count < other.count ? -1 : 1;
                }
                return word.compareTo(other.word);
            }

            @Override
            public boolean equals(Object obj) {
                if (this == obj) {
                    return true;
                }
                if (!(obj instanceof RankKey)) {
                    return false;
                }
                RankKey other = (RankKey) obj;
                return count == other.count && word.equals(other.word);
            }

            @Override
            public int hashCode() {
                return 31 * count + word.hashCode();
            }
        }
    }
  • 相关阅读:
    Zookeeper ZAB 协议分析
    Docker技术快速精通指南
    Oracle闪回技术详解
    怎样打造一个分布式数据库
    使用js冒泡实现点击空白处关闭弹窗
    也谈谈我对Docker的简单理解
    Docker技术快速精通指南
    Oracle优化网上常见的5个错误观点
    使用Spring AOP实现MySQL读写分离
    RESTEASY ,从学会使用到了解原理。
  • 原文地址:https://www.cnblogs.com/lasclocker/p/4819655.html
Copyright © 2011-2022 走看看