  • Building an Inverted Index on Hadoop

    0. References:

    Code reference 1: http://www.pudn.com/downloads212/sourcecode/unix_linux/detail999273.html

    Theory reference 2: http://zhangyu8374.javaeye.com/blog/86307, http://nything.javaeye.com/blog/411787

    1. Analysis

    Suppose we have three files, file0, file1, and file2, each holding some text; for example, file0 contains the single sentence "we are happy". An ordinary (forward) index records the position of every word within a single file, so an index for file0 could be (we, 0), (are, 1), (happy, 2): in each key-value pair, the key is a word and the value is that word's position in the file. An inverted index is exactly the reverse: given multiple files, for each word we want every position at which it appears across all of the files. The sketch below illustrates the idea on a single machine; we can then reproduce it on Hadoop with the following experiment.
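    To make this concrete before turning to MapReduce, here is a minimal single-machine sketch in plain Java (illustrative only, not part of the original program) that builds an inverted index over the three sample sentences used below:

    import java.util.*;

    public class SimpleInvertedIndex {
        public static void main(String[] args) {
            Map<String, String> docs = new LinkedHashMap<>();
            docs.put("T0", "it is what it is");
            docs.put("T1", "what is it");
            docs.put("T2", "it is a banana");

            // word -> postings list of (docId, position); positions start at 1,
            // matching the MapReduce output shown later
            Map<String, List<String>> index = new TreeMap<>();
            for (Map.Entry<String, String> doc : docs.entrySet()) {
                String[] words = doc.getValue().split("\\s+");
                for (int pos = 0; pos < words.length; pos++) {
                    index.computeIfAbsent(words[pos].toLowerCase(), k -> new ArrayList<>())
                         .add("(" + doc.getKey() + ", " + (pos + 1) + ")");
                }
            }
            index.forEach((word, postings) -> System.out.println(word + "\t" + postings));
        }
    }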

    Create a local folder named IndexTest and put three files in it, with the contents shown below.

        * T0 = "it is what it is"
        * T1 = "what is it"
        * T2 = "it is a banana"

    Here T0, T1, and T2 are the file names and the quoted strings are the file contents. Upload the IndexTest folder to DFS (the sketch after this paragraph shows one programmatic way to do this), then run the inverted index program given in the code examples in section 2.
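    The upload is normally done with the hadoop fs -put command; as an alternative, a minimal Java sketch using the HDFS FileSystem API might look like the following (the destination path here is a placeholder, not taken from the original post):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class UploadIndexTest {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml
            FileSystem fs = FileSystem.get(conf);
            // copy the local IndexTest folder into HDFS; adjust both paths as needed
            fs.copyFromLocalFile(new Path("IndexTest"), new Path("/user/hadoop/IndexTest"));
            fs.close();
        }
    }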

    The final output is:

    a     (T2, 3)
    banana     (T2, 4)
    is     (T2, 2) (T0, 2) (T0, 5) (T1, 2)
    it     (T1, 3) (T2, 1) (T0, 1) (T0, 4)
    what     (T0, 3) (T1, 1)

    2. Code Examples

    InvertedIndex.java

    package pa4;
    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    /**
     *
     * @author Ming
     */
    public class InvertedIndex {
        public static class TokenizerMapper
            extends Mapper<Text, ValuePair, Text, ValuePair> {
        @Override
        public void map(Text key, ValuePair value, Context context) throws IOException, InterruptedException {
        // TokenInputFormat has already generated (word, (fileID, wordPosition)) pairs,
        // so the mapper just lowercases the word and passes the pair through to the reducer
            key.set(key.toString().toLowerCase());
            context.write(key, value);
        }
        }
        public static class IndexReducer
            extends Reducer<Text, ValuePair, Text, Text> {
        private Text postings = new Text();
        @Override
        public void reduce(Text key, Iterable<ValuePair> values,
            Context context) throws IOException, InterruptedException {
            StringBuilder list = new StringBuilder();
            for (ValuePair val : values) {
                list.append(' ').append(val);  // postings line: " (fileID, pos) (fileID, pos) ..."
            }
            postings.set(list.toString());
            context.write(key, postings);
        }
        }
        public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: InvertedIndex <in-dir> <out-dir>");
            System.exit(2);
        }
        // remove the old output dir
        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);
        Job job = new Job(conf, "Inverted Indexer");
        job.setJarByClass(InvertedIndex.class);
        job.setInputFormatClass(TokenInputFormat.class);
        job.setMapperClass(InvertedIndex.TokenizerMapper.class);
        // a combiner cannot simply reuse IndexReducer: its output value type (Text)
        // differs from the map output value type (ValuePair)
        //job.setCombinerClass(InvertedIndex.IndexReducer.class);
        job.setReducerClass(InvertedIndex.IndexReducer.class);
        
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ValuePair.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

    TokenInputFormat.java

    package pa4;
    import java.io.IOException;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.util.LineReader;
    import java.util.StringTokenizer;
    public class TokenInputFormat extends FileInputFormat<Text, ValuePair> {
        /**
         * Don't allow the files to be split!
         */
        @Override
        protected boolean isSplitable(JobContext ctx, Path filename) {
        // ensure the input files are not splittable!
        return false;
        }
        /**
         * Just return the record reader
         * key is the docno
         */
        public RecordReader<Text, ValuePair> createRecordReader(InputSplit split,
            TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        return new TokenRecordReader();
        }
        public static class TokenRecordReader extends RecordReader<Text, ValuePair> {
        private long start;
        private long pos;
        private long end;
        private LineReader in;
        private int maxLineLength;
        private Text line;
        private Text key = null;
        private ValuePair value = null;
        private StringTokenizer tokens = null;
        private int tokenPos = 0;
        private String fileID = "0";    // input file id that appears in inverted index
        public void initialize(InputSplit genericSplit,
            TaskAttemptContext context) throws IOException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                Integer.MAX_VALUE);
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            // Assume file name is an integer of file ID
            fileID = file.getName();
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            in = new LineReader(fileIn, job);
            this.pos = start;
            line = new Text();
            key = new Text();
            value = new ValuePair();
        }
        public boolean nextKeyValue() throws IOException {
            boolean splitEnds = false;
            while (tokens == null || !tokens.hasMoreTokens()) {
            int lineSize = in.readLine(line, maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
                maxLineLength));
            if (lineSize == 0) {
                splitEnds = true;
                break;
            }
            pos += lineSize;
            tokens = new StringTokenizer(line.toString(), " \t\n\r\f,.;<>-?/\\!'\":=*{}()$[]");
            }
            if (splitEnds) {
            key = null;
            value = null;
            line = null;
            tokens = null;
            return false;
            } else
            return true;
        }
        @Override
        public Text getCurrentKey() {
            // consumes the next token; assumes the framework calls getCurrentKey()
            // exactly once after each successful nextKeyValue()
            key.set(tokens.nextToken());
            tokenPos++;
            return key;
        }
        @Override
        public ValuePair getCurrentValue() {
            value.set(fileID, tokenPos);
            return value;
        }
        /**
         * Get the progress within the split
         */
        public float getProgress() {
            if (start == end) {
            return 0.0f;
            } else {
            return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }
        public synchronized void close() throws IOException {
            if (in != null) {
            in.close();
            }
        }
        }
        // simple local driver that exercises TokenRecordReader on a single file
        public static void main(String[] args)
            throws IOException {
        String fn = args[0];
        Configuration conf = new Configuration();
        FileSplit split = new FileSplit(new Path(fn), 0, 10000000, null);
        TokenRecordReader irr = new TokenRecordReader();
        TaskAttemptContext ctx = new TaskAttemptContext(conf,
            new TaskAttemptID("hello", 12, true, 12, 12));
        irr.initialize(split, ctx);
        while (irr.nextKeyValue()) {
            System.out.println(irr.getCurrentKey() + ": " + irr.getCurrentValue());
        }
        }
    }

    ValuePair.java

    package pa4;
    import java.io.*;
    import org.apache.hadoop.io.*;
    /**
     *
     * @author Ming
     */
    public class ValuePair implements WritableComparable<ValuePair> {
        private Text one;
        private IntWritable two;
        public void set(Text first, IntWritable second) {
        one = first;
        two = second;
        }
        public void set(String first, int second) {
        one.set(first);
        two.set(second);
        }
        public ValuePair() {
        set(new Text(), new IntWritable());
        }
        public ValuePair(Text first, IntWritable second) {
        set(first, second);
        }
        public ValuePair(String first, int second) {
            // initialize the fields directly; calling set(String, int) here would
            // throw a NullPointerException because one/two have not been created yet
            set(new Text(first), new IntWritable(second));
        }
        public Text getFirst() {
        return one;
        }
        public IntWritable getSecond() {
        return two;
        }
        @Override
        public void write(DataOutput out) throws IOException {
        one.write(out);
        two.write(out);
        }
        @Override
        public void readFields(DataInput in) throws IOException {
        one.readFields(in);
        two.readFields(in);
        }
        @Override
        public int hashCode() {
        return one.hashCode();
        }
        @Override
        public boolean equals(Object o) {
        if (o instanceof ValuePair) {
            ValuePair tp = (ValuePair) o;
            // compare both fields so equals() is consistent with compareTo()
            return one.equals(tp.one) && two.equals(tp.two);
        }
        return false;
        }
        @Override
        public String toString() {
        return "(" + one + ", " + two + ")";
        }
        @Override
        public int compareTo(ValuePair tp) {
        int cmp = one.compareTo(tp.one);
        if (cmp != 0) {
            return cmp;
        }
        return two.compareTo(tp.two);
        }
        public static class Comparator extends WritableComparator {
        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
        private static final IntWritable.Comparator INT_COMPARATOR = new IntWritable.Comparator();
        public Comparator() {
            super(ValuePair.class);
        }
        @Override
        public int compare(byte[] b1, int s1, int l1,
                    byte[] b2, int s2, int l2) {
            try {
            int oneL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
            int oneL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
            int cmp = TEXT_COMPARATOR.compare(b1, s1, oneL1, b2, s2, oneL2);
            if (cmp != 0) {
                return cmp;
            }
            return INT_COMPARATOR.compare(b1, s1+oneL1, l1-oneL1,
                            b2, s2+oneL2, l2-oneL2);
            } catch (IOException e) {
            throw new IllegalArgumentException(e);
            }
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            if (a instanceof ValuePair && b instanceof ValuePair) {
            return ((ValuePair) a).compareTo((ValuePair) b);
            }
            return super.compare(a, b);
        }
        }
        static {
        WritableComparator.define(ValuePair.class, new Comparator());
        }
    }
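    As a quick sanity check of ValuePair's Writable round trip, a small standalone driver (an illustrative sketch, not part of the original post) could look like this:

    package pa4;

    import java.io.*;

    public class ValuePairRoundTrip {
        public static void main(String[] args) throws IOException {
            ValuePair original = new ValuePair("T0", 4);

            // serialize to an in-memory buffer
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            original.write(new DataOutputStream(bytes));

            // deserialize into a fresh instance
            ValuePair copy = new ValuePair();
            copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

            System.out.println(copy);                      // expected: (T0, 4)
            System.out.println(original.compareTo(copy));  // expected: 0
        }
    }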

    P.S. 2012-05-20

    The use of the ValuePair key-value class here reminds me of the notes on the HashMap implementation I wrote a few days ago: HashMap's implementation likewise relies on a key-value pair class, Entry. The two have a lot in common, and when I have time I may revise those HashMap notes accordingly.

    Author: xwdreamer
    Reproduction in any form is welcome, but please credit the source.
  • Original post: https://www.cnblogs.com/xwdreamer/p/2297043.html