用Lucene实现了一个简单的文件检索功能,作为最近学习Lucene的成果。
定义常量类:
/**
 * Names of the Lucene document fields shared by the indexing and searching code.
 *
 * <p>The values are declared {@code final} so that the field names written at
 * index time can never drift from the ones used at query time.
 */
public class Constant {
    /** Field holding the file's simple name. */
    public static final String FILE_NAME = "fileName";

    /** Field holding the file's textual content. */
    public static final String FILE_CONTENT = "fileContent";

    /** Field holding the file's path. */
    public static final String FILE_PATH = "filePath";
}
索引创建类:
import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Queue; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class IndexGenerator { /** * * @Title: generatorIndex * @Description:将该目录下支持的文件生成索引,用于搜索 * @param @param dir 需要生成索引的目录 * @return void 返回类型 */ public static int generatorIndex(String fileDir){ //遍历文件 List<File> fileList = new ArrayList<File>(); Queue<File> fileQueue = new LinkedList<File>(); File file = new File(fileDir); if(file.isDirectory()){ fileQueue.add(file); }else{ fileList.add(file); } while(!fileQueue.isEmpty()){ File f = fileQueue.poll(); File[] files = f.listFiles(); for(File subFile : files){ if(subFile.isDirectory()){ fileQueue.add(subFile); }else{ fileList.add(subFile); } } } //为文件创建索引 Directory dir = null; IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)); IndexWriter indexWriter = null; int num = 0; try { dir = FSDirectory.open(new File("E:\exercise\luceneFS")); indexWriter = new IndexWriter(dir, config); for(File f : fileList){ Document doc = new Document(); //解析文件名 doc.add(new Field(Constant.FILE_NAME, f.getName(), Store.YES, Index.NOT_ANALYZED)); //解析文件路径并保存 doc.add(new Field(Constant.FILE_PATH, file.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED)); doc.add(new Field(Constant.FILE_CONTENT, new FileReader(f))); indexWriter.addDocument(doc); } num = indexWriter.numDocs(); System.out.println("共对" + num + "个文件生成了索引"); } 
catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if(indexWriter != null){ try { indexWriter.close(); } catch (IOException e) { e.printStackTrace(); } } if(dir != null){ try { dir.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } return num; } }
搜索类:
package com.insaneXs.learnLucene;

import java.io.File;
import java.io.IOException;
import java.util.Locale;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a keyword search against the file index and prints the hits.
 */
public class LuceneSearcher {

    /**
     * Searches the index for {@code str} and prints the total hit count, the
     * number of hits returned (at most 10) and, for each hit, the stored file
     * name and path. I/O errors are printed, not propagated.
     *
     * @param str keyword to look for; a {@code null} keyword is ignored
     */
    public static void search(String str) {
        if (str == null) {
            // A Term cannot be built from a null keyword; nothing to search for.
            return;
        }
        // TermQuery bypasses the analyzer, so the keyword is lowercased by hand.
        // A fixed locale keeps the result independent of the JVM default
        // (e.g. the Turkish dotless i).
        String keyword = str.toLowerCase(Locale.ENGLISH);

        Directory dir = null;
        SearcherManager searcherManager = null;
        IndexSearcher indexSearcher = null;
        IndexWriter indexWriter = null;
        SearcherFactory factory = new SearcherFactory();
        try {
            // An IndexSearcher could also be constructed directly; here it is
            // obtained from a SearcherManager built on top of an IndexWriter.
            dir = FSDirectory.open(new File("E:\\exercise\\luceneFS"));
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
                    new StandardAnalyzer(Version.LUCENE_36));
            indexWriter = new IndexWriter(dir, config);
            searcherManager = new SearcherManager(indexWriter, true, factory);
            indexSearcher = searcherManager.acquire();

            // Alternative: build the Query with a QueryParser, which applies the
            // analyzer (including lowercasing) itself but throws ParseException.
            Query titleQ = new TermQuery(new Term(Constant.FILE_NAME, keyword));
            Query contentQ = new TermQuery(new Term(Constant.FILE_CONTENT, keyword));

            // From Lucene 5.3 on, BooleanQuery is built via BooleanQuery.Builder
            // instead of this constructor.
            // NOTE(review): MUST on the file name means only documents whose name
            // term equals the lowercased keyword can match; the content clause
            // only influences scoring. Confirm this is the intended semantics.
            BooleanQuery query = new BooleanQuery();
            query.add(titleQ, Occur.MUST);
            query.add(contentQ, Occur.SHOULD);

            // SortField takes the field to sort on and the sort type.
            // NOTE(review): SortField.DOC presumably orders by document number
            // regardless of the field name given — confirm whether a string sort
            // on the file name was intended.
            Sort sort = new Sort(new SortField(Constant.FILE_NAME, SortField.DOC));

            // Fetch at most ten hits.
            TopDocs res = indexSearcher.search(query, 10, sort);
            int totalLength = res.totalHits;
            System.out.println("一共有" + totalLength + "条记录符合条件");
            int length = res.scoreDocs.length;
            System.out.println("本次查询" + length + "条记录");
            for (ScoreDoc scoreDoc : res.scoreDocs) {
                // Load the stored document for this hit and print its stored fields.
                Document doc = indexSearcher.doc(scoreDoc.doc);
                System.out.println("文件名:" + doc.get(Constant.FILE_NAME) + "; 路径:" + doc.get(Constant.FILE_PATH));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (searcherManager != null) {
                // Only release a searcher that was actually acquired.
                if (indexSearcher != null) {
                    try {
                        searcherManager.release(indexSearcher);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // Close the manager itself so its resources are not leaked.
                try {
                    searcherManager.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
测试类:
public class LuceneTest { public static void main(String[] args) { IndexGenerator.generatorIndex("E:\exercise\testData"); // LuceneSearcher.search("happy"); } }
过程中使用TermQuery的时候踩了不小的坑,主要是索引关键字大小写的问题:经过StandardAnalyzer分词的域(例如文件内容),其关键字会被自动转成小写后再存入索引,因此用TermQuery查询这类域时要先把关键字转成小写;而以NOT_ANALYZED方式索引的域(例如文件名)会原样保存,查询时必须与原值完全一致。
而使用QueryParser会在底层自动处理成小写。所以不用关心。
另外,Lucene版本变动对API的影响也较大,不少接口已经被废弃。
上述代码使用的是Lucene 3.6版本。
参考资料: