zoukankan html css js c++ java

lucene构建同义词分词器

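Lucene 4.0 版本以后，Analyzer 已经用 TokenStreamComponents 代替了直接返回 TokenStream 流的方式，TokenStreamComponents 里面包含了 filter 和 tokenizer。注意：下面的示例代码仍然使用 4.0 之前的 tokenStream(String, Reader) 接口。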

在较复杂的 Lucene 搜索业务场景下，直接从网上下载一个分词器用在项目里是不够的。那么怎么去评定一个中文分词器的好与差？一般来讲有两点：词库和搜索效率（也就是算法）。

Lucene 的倒排列表中，每个分词单元都带有位置增量信息（PositionIncrementAttribute）。如果一个词相对于前一个词的位置增量为 0，则两个词处于同一位置，可以视为同义词。比如：我定义“美国”和“中国”这两个词在倒排列表中处于同一个位置（即位置增量为 0），那么搜索“美国”的话，包含“中国”的文档也能被搜出来。

这就是同义词搜索原理。

下面是代码（先用 mmseg4j 的 Tokenizer 切词，然后再做同义词处理）：

先自己定义分词器：

package hhc;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that segments text with the mmseg4j max-word tokenizer and then
 * passes the resulting tokens through {@link MySameTokenFilter} so that
 * synonyms can be added to the stream.
 * <p>
 * When writing a custom analyzer it usually helps to look at how an existing
 * analyzer is put together and follow the same structure.
 *
 * @author hhc
 */
public class MySameAnalyzer extends Analyzer {
	// Synonym lookup handed to every token filter this analyzer creates.
	private SamewordContext samewordContext;

	/**
	 * @param samewordContext synonym source passed on to each
	 *                        {@link MySameTokenFilter} built by this analyzer
	 */
	public MySameAnalyzer(SamewordContext samewordContext) {
		this.samewordContext = samewordContext;
	}

	/**
	 * Builds the token stream for one field value: an {@code MMSegTokenizer}
	 * using {@code MaxWordSeg} over the shared dictionary instance, wrapped in
	 * the synonym filter.
	 *
	 * @param fieldName name of the field being analyzed (not used here)
	 * @param reader    source of the text to tokenize
	 * @return the tokenizer output wrapped in a {@link MySameTokenFilter}
	 */
	@Override
	public TokenStream tokenStream(String fieldName, Reader reader) {
		MaxWordSeg segmenter = new MaxWordSeg(Dictionary.getInstance());
		TokenStream segmented = new MMSegTokenizer(segmenter, reader);
		return new MySameTokenFilter(segmented, samewordContext);
	}

}

然后再对TokenStream流做同义词处理

package hhc;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that emits, after each token from the wrapped stream, every
 * synonym that the {@link SamewordContext} returns for that token's text.
 * <p>
 * Each synonym is emitted with a position increment of 0, i.e. at the same
 * position as the token it belongs to.
 */
public class MySameTokenFilter extends TokenFilter {
	// Term text of the token currently exposed by the stream.
	private final CharTermAttribute cta;
	// Position increment of the current token; forced to 0 for synonyms.
	private final PositionIncrementAttribute pia;
	// Snapshot of the attributes of the last token that had synonyms.
	private AttributeSource.State current;
	// Synonyms of that token which have not been emitted yet.
	private final Stack<String> sames;
	// Synonym lookup; may be null, in which case no synonyms are added.
	private final SamewordContext samewordContext;

	/**
	 * @param input           stream whose tokens are looked up for synonyms
	 * @param samewordContext synonym source queried with each token's text
	 */
	protected MySameTokenFilter(TokenStream input,SamewordContext samewordContext) {
		super(input);
		cta = input.addAttribute(CharTermAttribute.class);
		pia = input.addAttribute(PositionIncrementAttribute.class);
		sames = new Stack<String>();
		this.samewordContext = samewordContext;
	}

	/**
	 * Advances to the next token: a pending synonym if there is one,
	 * otherwise the next token of the wrapped stream.
	 *
	 * @return {@code true} if a token is available, {@code false} once the
	 *         wrapped stream is exhausted and no synonym is pending
	 * @throws IOException if the wrapped stream fails; the error is propagated
	 *                     instead of being reported as a valid token
	 */
	@Override
	public boolean incrementToken() throws IOException {
		if (!sames.isEmpty()) {
			// Take the next pending synonym off the stack.
			String synonym = sames.pop();
			// Start from the attributes captured for the original token,
			// then replace only the term text and the position increment.
			restoreState(current);
			cta.setEmpty();
			cta.append(synonym);
			pia.setPositionIncrement(0);
			return true;
		}
		// No more tokens upstream.
		if (!input.incrementToken()) {
			return false;
		}
		// Queue the synonyms of the token just read, if it has any, and
		// remember its attributes so they can be restored for each synonym.
		if (existAddSameword(cta.toString())) {
			current = captureState();
		}
		return true;
	}

	/**
	 * Resets the wrapped stream and discards any synonyms and captured state
	 * left over from the previous pass, so they cannot appear in the next one.
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		sames.clear();
		current = null;
	}

	/**
	 * Looks up the synonyms of {@code word} and pushes them onto the pending
	 * stack.
	 *
	 * @param word term text of the current token
	 * @return {@code true} if the lookup returned a non-null array,
	 *         {@code false} if it returned null or no context is configured
	 */
	private boolean existAddSameword(String word) {
		if (samewordContext == null) {
			// Without a synonym source tokens pass through unchanged.
			return false;
		}
		String[] words = samewordContext.getSameword(word);
		if (words == null) {
			return false;
		}
		for (String s : words) {
			sames.push(s);
		}
		return true;
	}

}

查看全文

相关阅读:
mysql数据库表导入导出
 Java爬虫
 oracle 表空间统计、自动扩展修改
 dba_segements 没有所有的表的信息
 html&css基础框架
 javascript-ajax之json学习笔记
 符合BME风格的弹窗菜单表格文件上传控件
 iframe元素获取
 文件上传与下载
 JSON.parse 函数

原文地址：https://www.cnblogs.com/zsychanpin/p/6789050.html