倒排索引 - 走看看

zoukankan html css js c++ java

倒排索引

"倒排索引"是文档检索系统中最常用的数据结构，被广泛地应用于全文搜索引擎。它主要是用来存储某个单词（或词组）在一个文档或一组文档中的存储位置的映射，即提供了一种根据内容来查找文档的方式。由于不是根据文档来确定文档所包含的内容，而是进行相反的操作，因而称为倒排索引（Inverted Index）。

实例描述：通常情况下，倒排索引由一个单词（或词组）以及相关的文档列表组成，文档列表中的文档或者是标识文档的ID号，或者是指文档所在位置的URL。在实际应用中，还需要给每个文档添加一个权值，用来指出每个文档与搜索内容的相关度。

样例输入：1）file1： MapReduce is simple　　　　　　　　　　　　　　　　

　　　　　2）file2： MapReduce is powerful is simple 　　　　　　　　　　　　　　

　　　　 3）file3： Hello MapReduce bye MapReduce　　　　　　　

样例输出：　　　

　　　　

思路：

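Map过程：key：word+url，value：词频（初始设置为1）。

Combine阶段：key：word，value：url+词频（将Map阶段中key相同的各个value（均为1）相加得到词频）。

Reduce阶段：key：word，value：将Combine阶段输出的各个“url+词频”合并成文档列表。

注意：Hadoop并不保证Combiner一定会被执行（可能执行零次、一次或多次），因此在Combiner中改写key的做法并不可靠；更稳妥的做法是让Map直接以单词作为key、以“url:词频”作为value，使Combiner的输入与输出格式保持一致。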

代码：

package mapreduce01;

import java.io.IOException;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class daopai {

static String INPUT_PATH = "hdfs://master:9000/qp";

static String OUTPUT_PATH="hdfs://master:9000/output";

static class MyMapper extends Mapper<Object,Object,Text,Text> {

Text output_key=new Text();

Text output_value=new Text();

FileSplit split;

protected void map(Object key,Object value,Context context)throws IOException, InterruptedException{

//获得<key,value>对所属的FileSplit对象。

split = (FileSplit)context.getInputSplit();

System.out.println(split);

//StringTokenizer是用来把字符串截取成一个个标记或单词的，默认是空格或多个空格( 等等)截取

StringTokenizer itr = new StringTokenizer( value.toString());

while(itr.hasMoreTokens()){

// key值由单词和URI组成。

output_key.set(itr.nextToken()+":"+split.getPath().toString());

output_value.set("1");

context.write(output_key, output_value);

}

}

}

public static class MyCombiner extends  Reducer<Text,Text,Text,Text> {

Text output_value= new Text();

Text output_key = new Text();

        protected void reduce(Text key, Iterable<Text> values,Reducer<Text, Text, Text, Text>.Context context) throws java.io.IOException, InterruptedException {

         //统计词频

         int sum=0;

         for(Text value:values){

         sum += Integer.parseInt(value.toString() );   //parseInt解析字符串

         }

         System.out.println(sum);

         int splitIndex = key.toString().indexOf(":");//找：的位置

         //重新设置value值由URI和词频组成

          output_value.set( key.toString().substring( splitIndex + 1) +":"+sum );

        //重新设置key值为单词

            output_key.set( key.toString().substring(0,splitIndex));

            context.write(output_key,output_value);

         }

}

  public static class MyReduce extends Reducer<Text,Text,Text,Text>{

         Text output_value = new Text();

         protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)

                    throws IOException, InterruptedException {

         //生成文档列表

                String fileList = new String();

                for (Text value : values) {

                    fileList += value.toString()+";";

                }

                output_value.set(fileList);

                context.write(key, output_value);

         }

        }

public static void main(String[] args) throws Exception{

Path outputpath=new Path(OUTPUT_PATH);

Configuration conf=new Configuration();

FileSystem fs = outputpath.getFileSystem(conf);

if(fs.exists(outputpath)){

fs.delete(outputpath,true);

}

//wordCount

Job job = Job.getInstance(conf);

FileInputFormat.setInputPaths(job, INPUT_PATH);

FileOutputFormat.setOutputPath(job, outputpath);

job.setMapperClass(MyMapper.class);   //map

job.setCombinerClass( MyCombiner.class);

job.setReducerClass(MyReduce.class);   //reduce

// job.setMapOutputKeyClass(LongWritable.class);

// job.setMapOutputValueClass(LongWritable.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

job.waitForCompletion(true);

}

}

输出结果：

Never Give up；

查看全文

相关阅读:
[Angular 2] Share a Service Across Angular 2 Components and Modules
[Angular 2] How To Debug An Angular 2 Application
[Angular 2] Create Shareable Angular 2 Components
[Angular 2] Import custom module
[Angular 2] Understanding Pure & Impure pipe
[Javascript] Manipulate the DOM with the classList API
[Angular 2] Understanding OpaqueToken
[Angular 2] Value Providers & @Inject
[Angular 2] Understanding @Injectable
[Angular 2] Factory Provider with dependencies

原文地址：https://www.cnblogs.com/luminous1/p/8383777.html