zoukankan      html  css  js  c++  java
  • 全文检索(二)-基于lucene4.10的增删改查

    今天用lucene完成了一个简单的web应用,提取了早期编写的一个测试类。首先简单介绍下lucene几个常用包;

    lucene 包的组成结构:对于外部应用来说,索引模块(index)和检索模块(search)是主要的外部应用入口

    org.apache.Lucene.search/ 搜索入口 
    org.apache.Lucene.index/ 索引入口 
    org.apache.Lucene.analysis/ 语言分析器 
    org.apache.Lucene.queryParser/ 查询分析器 
    org.apache.Lucene.document/ 存储结构 
    org.apache.Lucene.store/  底层IO/存储结构 
    org.apache.Lucene.util/ 一些公用的数据结构 


    话不多说,直接上代码(这是早期封装的一个测试类,封装得还算比较完善,有兴趣的朋友可以在此基础上继续完善):

    package com.lucene.util;
    
    import java.io.File;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.List;
    
    import org.apache.log4j.Logger;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.LogDocMergePolicy;
    import org.apache.lucene.index.LogMergePolicy;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleFragmenter;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.NumericUtils;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import com.message.base.search.SearchBean;
    
    /**
     * lucene 4.10.1
     * 
     * @creatTime 2014-10-28
     * @author 胡慧超
     * 
     */
    /**
     * Demo/utility class for Lucene 4.10.1 showing basic index CRUD:
     * creating an on-disk index, keyword search, paginated search,
     * highlighted search, delete/rollback/expunge, and document update.
     * Uses the IK Chinese analyzer for tokenization.
     *
     * @creatTime 2014-10-28
     * @author 胡慧超
     */
    public class HhcIndexTools {

        private final static Logger logger = Logger.getLogger(HhcIndexTools.class);
        // Filesystem location of the demo index.
        private static String indexPath = "E://lucene//index";

        public static void main(String[] args) {
            try {
                // createIndex();
                // searchIndex("码农");
                // query();
                // deleteIndex(null);
                forceDeleteIndex();
                query();
                highlighterSearch();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Creates the index (if needed) and adds two sample documents.
         */
        public static void createIndex() {
            // false = finest-grained segmentation; true would enable smart segmentation.
            Analyzer analyzer = new IKAnalyzer(false);
            IndexWriter indexWriter = null;
            try {
                indexWriter = getIndexWriter(analyzer);
                // Sample document 1: "id" is a non-tokenized StringField so it can
                // be matched exactly by Term queries (see deleteIndex/update).
                Document doc = new Document();
                doc.add(new StringField("id", "1", Store.YES));
                doc.add(new TextField("title", "标题:開始", Store.YES));
                doc.add(new TextField("content", "内容:我如今是个码农", Store.YES));
                indexWriter.addDocument(doc);
                // Sample document 2.
                doc = new Document();
                doc.add(new StringField("id", "2", Store.YES));
                doc.add(new TextField("title", "标题:结束", Store.YES));
                doc.add(new TextField("content", "内容:我如今是个lucene开发project师的专家",
                        Store.YES));
                indexWriter.addDocument(doc);
                indexWriter.commit();
            } catch (IOException e) {
                e.printStackTrace();
                logger.info("索引器发生异常"); // fixed typo: 发送 -> 发生
            } finally {
                try {
                    destroyWriter(indexWriter);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        /**
         * Searches the "content" field for {@code keyword} and prints the
         * title/content of every hit (up to 1000).
         *
         * @param keyword query string, parsed with QueryParser + IK analyzer
         */
        @SuppressWarnings("deprecation")
        public static void searchIndex(String keyword) {
            IndexReader indexReader = null;
            try {
                // Open the on-disk index directory.
                Directory dir = FSDirectory.open(new File(indexPath));
                indexReader = IndexReader.open(dir);
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);

                // Build the query against the "content" field.
                QueryParser parse = new QueryParser(Version.LUCENE_4_10_1,
                        "content", new IKAnalyzer(false));
                Query query = parse.parse(keyword.trim());

                // NOTE: to query several fields in one shot (with per-field boosts),
                // use MultiFieldQueryParser, e.g.:
                //   new MultiFieldQueryParser(Version.LATEST,
                //           new String[] { "phoneType", "name", "category", "price" },
                //           new IKAnalyzer(false)).parse(keyword.trim());

                TopDocs results = indexSearcher.search(query, 1000);
                ScoreDoc[] score = results.scoreDocs;
                if (score.length > 0) {
                    logger.info("查询结果数:" + score.length);
                    System.out.println("查询结果数:" + score.length);
                    for (int i = 0; i < score.length; i++) {
                        // Resolve each hit back to its stored Document.
                        Document doc = indexSearcher.doc(score[i].doc);
                        System.out.println(doc.toString());
                        System.out.println(doc.get("title") + "["
                                + doc.get("content") + "]");
                    }
                }
            } catch (Exception e) {
                logger.info("查询结果为空!");
            } finally {
                if (indexReader != null) {
                    try {
                        indexReader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * Pages through the first 100 hits of a multi-field search and prints
         * the title of each document on the requested page.
         *
         * @param keyWord     query keyword
         * @param pageSize    records per page
         * @param currentPage 1-based page number
         * @throws ParseException if the keyword cannot be parsed
         */
        @SuppressWarnings("deprecation")
        public void paginationQuery(String keyWord, int pageSize, int currentPage)
                throws IOException, ParseException {
            String[] fields = { "title", "content" };
            QueryParser queryParser = new MultiFieldQueryParser(Version.LATEST,
                    fields, new IKAnalyzer());
            Query query = queryParser.parse(keyWord.trim());

            IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(
                    indexPath)));
            try { // ensure the reader is closed even if search/doc throws
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);

                // Only the first 100 hits are considered for pagination.
                TopDocs topDocs = indexSearcher.search(query, 100);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;

                // First record index on this page.
                int begin = pageSize * (currentPage - 1);
                // One past the last record index (clamped to the result size).
                int end = Math.min(begin + pageSize, scoreDocs.length);

                for (int i = begin; i < end; i++) {
                    int docID = scoreDocs[i].doc;
                    System.out.println("docID=" + docID);
                    Document doc = indexSearcher.doc(docID);
                    String title = doc.get("title");
                    System.out.println("title is : " + title);
                }
            } finally {
                indexReader.close();
            }
        }

        /**
         * Runs a TermQuery for content:"lucene" and prints every hit with the
         * matched terms wrapped in a red &lt;font&gt; tag.
         */
        @SuppressWarnings("deprecation")
        public static void highlighterSearch() throws IOException, ParseException,
                InvalidTokenOffsetsException {
            IndexReader reader = IndexReader.open(FSDirectory.open(new File(
                    indexPath)));
            try { // ensure the reader is closed even on exception
                IndexSearcher searcher = new IndexSearcher(reader);

                // Exact term match on the "content" field.
                Term term = new Term("content", "lucene");
                TermQuery query = new TermQuery(term);

                TopDocs topdocs = searcher.search(query, Integer.MAX_VALUE);
                ScoreDoc[] scoreDoc = topdocs.scoreDocs;
                System.out.println("查询结果总数:" + topdocs.totalHits);
                System.out.println("最大的评分:" + topdocs.getMaxScore());

                for (int i = 0; i < scoreDoc.length; i++) {
                    int docid = scoreDoc[i].doc;
                    Document document = searcher.doc(docid);
                    System.out.println("============文件【" + (i + 1) + "】=========");
                    System.out.println("检索关键字:" + term.toString());
                    String content = document.get("content");

                    // Guard BEFORE touching content.length(): a doc without a stored
                    // "content" field would otherwise NPE.
                    if (content != null && !"".equals(content)) {
                        // Wrap each query hit in <font color='red'>...</font>.
                        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(
                                "<font color='red'>", "</font>");
                        Highlighter highlighter = new Highlighter(formatter,
                                new QueryScorer(query));
                        // Single fragment large enough to cover the whole value.
                        highlighter.setTextFragmenter(new SimpleFragmenter(content
                                .length()));

                        // BUG FIX: tokenStream()'s first argument is the FIELD NAME;
                        // the original passed the field value here.
                        TokenStream tokenstream = new IKAnalyzer().tokenStream(
                                "content", new StringReader(content));
                        String highLightText = highlighter.getBestFragment(
                                tokenstream, content);
                        System.out.println("高亮显示第 " + (i + 1) + " 条检索结果例如以下所看到的:");
                        System.out.println(highLightText);
                        System.out.println("文件内容:" + content);
                        System.out.println("匹配相关度:" + scoreDoc[i].score);
                    }
                }
            } finally {
                reader.close();
            }
        }

        /**
         * Opens (creating if necessary) the filesystem index and returns a
         * configured IndexWriter.
         *
         * @param analyzer analyzer used when adding documents
         * @return a ready-to-use IndexWriter (caller must close via destroyWriter)
         * @throws IOException if the directory cannot be opened
         */
        private static IndexWriter getIndexWriter(Analyzer analyzer)
                throws IOException {
            File indexFile = new File(indexPath);
            if (!indexFile.exists()) {
                // mkdirs (not mkdir) so missing parent directories are created too.
                indexFile.mkdirs();
            }
            Directory directory = FSDirectory.open(indexFile);
            // Directory directory = new RAMDirectory(); // in-memory alternative

            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_4_10_1,
                    analyzer);
            LogMergePolicy mergePolicy = new LogDocMergePolicy();
            // Segment merge frequency: small = slower indexing;
            // large (>10) suits bulk indexing.
            mergePolicy.setMergeFactor(30);
            // Max documents merged per segment: small favors incremental appends,
            // large favors bulk indexing and faster searches.
            mergePolicy.setMaxMergeDocs(5000);
            conf.setMaxBufferedDocs(10000);
            conf.setMergePolicy(mergePolicy);
            conf.setRAMBufferSizeMB(64);

            conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
            // Clear a stale write.lock left behind by a crashed process.
            if (IndexWriter.isLocked(directory)) {
                IndexWriter.unlock(directory);
            }
            return new IndexWriter(directory, conf);
        }

        /**
         * Closes the writer if it was successfully created.
         *
         * @param indexWriter writer to close; may be null
         */
        private static void destroyWriter(IndexWriter indexWriter)
                throws IOException {
            if (indexWriter != null) {
                indexWriter.close();
            }
        }

        /**
         * Deletes the documents matched by each bean in the list.
         *
         * @param list beans identifying documents to delete
         */
        public static void deleteIndexs(List<SearchBean> list) throws IOException {
            // BUG FIX: the original tested "list.size() > 0" here, which returned
            // early for every NON-empty list, so the loop below never ran.
            if (list == null || list.isEmpty()) {
                logger.debug("beans is null");
                return;
            }
            for (SearchBean bean : list) {
                deleteIndex(bean);
            }
        }

        /**
         * Deletes a single document. The delete is not applied immediately:
         * Lucene only records it in a .del file until segments are merged.
         *
         * NOTE(review): {@code bean} is currently ignored and the document with
         * id=1 is always deleted — the Term should be built from the bean's id.
         *
         * @param bean bean identifying the document (currently unused)
         */
        private static void deleteIndex(SearchBean bean) throws IOException {
            IndexWriter indexWriter = getIndexWriter(new IKAnalyzer());
            try {
                // deleteDocuments accepts a Query or an exact-match Term; the
                // deleted doc stays in the ".del recycle bin" until merged away.
                indexWriter.deleteDocuments(new Term("id", "1"));
            } finally {
                destroyWriter(indexWriter);
            }
        }

        /**
         * Prints index statistics: live documents, max doc count, deleted docs.
         */
        @SuppressWarnings("deprecation")
        public static void query() {
            IndexReader indexReader = null;
            try {
                indexReader = IndexReader.open(FSDirectory
                        .open(new File(indexPath)));
                System.out.println("存储的文档数:" + indexReader.numDocs());
                System.out.println("总存储量:" + indexReader.maxDoc());
                System.out.println("被删除的文档:" + indexReader.numDeletedDocs());
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // BUG FIX: the original never closed this reader.
                if (indexReader != null) {
                    try {
                        indexReader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * "Restores the recycle bin": rolls back all changes made since the
         * last commit, including pending deletes.
         */
        public void recoveryIndexByIsDelete() throws IOException {
            IndexWriter indexWriter = getIndexWriter(new IKAnalyzer());
            indexWriter.rollback();
            destroyWriter(indexWriter);
        }

        /**
         * "Empties the recycle bin": permanently expunges deleted documents.
         * Since Lucene 3.6 there is no undeleteAll(), so forceMergeDeletes()
         * is the way to purge .del entries.
         */
        public static void forceDeleteIndex() throws IOException {
            IndexWriter indexWriter = getIndexWriter(new IKAnalyzer());
            try {
                indexWriter.forceMergeDeletes();
            } finally {
                destroyWriter(indexWriter);
            }
        }

        /**
         * Updates the document matching id=1. updateDocument is really a
         * delete-then-add: docs matching the term are removed and the new
         * document is appended.
         */
        public void update() throws IOException {
            IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(
                    indexPath)), new IndexWriterConfig(Version.LATEST,
                    new IKAnalyzer(true)));
            try {
                Document document = new Document();

                document.add(new Field("id", "10", Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                document.add(new Field("email", "9481629991", Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                document.add(new Field("name", "小米", Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                document.add(new Field("content", "小米好", Field.Store.NO,
                        Field.Index.ANALYZED));

                indexWriter.updateDocument(new Term("id", "1"), document);
            } finally {
                // BUG FIX: close in finally so the write lock is always released.
                indexWriter.close();
            }
        }
    }
    

    另外附带高亮的项目代码

    	/**
    	 * 高亮的公共方法
    	 * @param text  --查询内容
    	 * @param query --查询query
    	 * @param field --查询域
    	 * @return
    	 */
    	/**
    	 * Shared highlighting helper: wraps the query's hits inside {@code text}
    	 * with a styled &lt;span&gt; tag.
    	 *
    	 * @param text  raw field value to highlight
    	 * @param query query whose terms should be highlighted
    	 * @param field indexed field the text came from
    	 * @return the trimmed highlighted fragment; when nothing matched, the
    	 *         original text truncated to 200 chars (with "...." appended);
    	 *         on highlighter failure, the original text unchanged
    	 */
    	private String highligher(String text, Query query, String field) {
    		try {
    			QueryScorer queryScorer = new QueryScorer(query);
    			Highlighter highlighter = new Highlighter(
    					new SimpleHTMLFormatter("<span class='lighter'>", "</span>"),
    					queryScorer);
    			highlighter.setTextFragmenter(new SimpleSpanFragmenter(queryScorer));
    			String fragment = highlighter.getBestFragment(
    					LuceneContext.getInstance().getAnalyzer(), field, text);
    			if (fragment != null) {
    				return fragment.trim();
    			}
    			// No match: fall back to the raw text, truncated for display.
    			if (text.length() >= 200) {
    				return text.substring(0, 200) + "....";
    			}
    			return text;
    		} catch (IOException e) {
    			e.printStackTrace();
    		} catch (InvalidTokenOffsetsException e) {
    			e.printStackTrace();
    		}
    		return text;
    	}




  • 相关阅读:
    C++多态
    C++和C#实现剪切板数据交互
    通过CLR API实现C++调用C#代码交互
    COM方式实现C++调用C#代码的一些总结
    输入LPCWSTR类型字符串
    取得COM对象的UUID并以string输出
    springmvc xml文件配置中使用系统环境变量
    SpringMVC,SpringBoot上传文件简洁代码
    c语言实行泛型hashmap
    java使用nio(Paths,Files)遍历文件目录,转成java.io.File
  • 原文地址:https://www.cnblogs.com/gcczhongduan/p/5246570.html
Copyright © 2011-2022 走看看