zoukankan      html  css  js  c++  java
  • Lucene学习笔记

    用Lucene实现了一个简单的文件检索功能,作为最近学习Lucene的成果。

    定义常量类:

    /**
     * Names of the Lucene document fields used when indexing and searching files.
     * Pure constants holder; it is not meant to be instantiated.
     */
    public final class Constant {
        /** Field that holds the file's simple name. */
        public static final String FILE_NAME = "fileName";

        /** Field that holds the file's text content. */
        public static final String FILE_CONTENT = "fileContent";

        /** Field that holds the file's path. */
        public static final String FILE_PATH = "filePath";

        private Constant() {
            // no instances: this class only exposes constants
        }
    }

    索引创建类:

    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.LinkedList;
    import java.util.List;
    import java.util.Queue;
    
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.Field.Index;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class IndexGenerator {
        
        /**
         * 
         * @Title: generatorIndex 
         * @Description:将该目录下支持的文件生成索引,用于搜索
         * @param @param dir 需要生成索引的目录 
         * @return void    返回类型 
         */
        public static int generatorIndex(String fileDir){
            //遍历文件
            List<File> fileList = new ArrayList<File>();
            Queue<File> fileQueue = new LinkedList<File>();
            
            File file = new File(fileDir);
            if(file.isDirectory()){
                fileQueue.add(file);
            }else{
                fileList.add(file);
            }
            
            while(!fileQueue.isEmpty()){
                File f = fileQueue.poll();
                
                File[] files = f.listFiles();
                
                for(File subFile : files){
                    if(subFile.isDirectory()){
                        fileQueue.add(subFile);
                    }else{
                        fileList.add(subFile);
                    }
                }
            }
            
            //为文件创建索引
            Directory dir = null;
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            IndexWriter indexWriter = null;
            int num = 0;
            try {
                dir = FSDirectory.open(new File("E:\exercise\luceneFS"));
                indexWriter = new IndexWriter(dir, config);
                for(File f : fileList){
                    Document doc = new Document();
                    
                    //解析文件名
                    doc.add(new Field(Constant.FILE_NAME, f.getName(), Store.YES, Index.NOT_ANALYZED));
                    //解析文件路径并保存
                    doc.add(new Field(Constant.FILE_PATH, file.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED));
                    
                    doc.add(new Field(Constant.FILE_CONTENT, new FileReader(f)));
                    indexWriter.addDocument(doc);
                    
                }
                
                num = indexWriter.numDocs();
                System.out.println("共对" + num + "个文件生成了索引");
                
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if(indexWriter != null){
                    try {
                        indexWriter.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if(dir != null){
                    try {
                        dir.close();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
            
            return num;
        }
    }

    搜索类:

    package com.insaneXs.learnLucene;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.SearcherFactory;
    import org.apache.lucene.search.SearcherManager;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    public class LuceneSearcher {
        public static void search(String str){
            Directory dir = null;
            SearcherManager searcherManager = null;
            IndexSearcher indexSearcher = null;
            IndexWriter indexWriter = null;
            SearcherFactory factory = new SearcherFactory();
            try {
                //可以直接构造indexSearcher
    //            indexSearcher = new IndexSearcher(dir);
                
                dir = FSDirectory.open(new File("E:\exercise\luceneFS"));
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
                indexWriter = new IndexWriter(dir, config);
                searcherManager = new SearcherManager(indexWriter, true, factory);
    //            
                searcherManager.maybeRefresh();
                indexSearcher = searcherManager.acquire();
                
                //用queryparser对象创建Query,但是需要处理ParseException
    //            QueryParser queryParser = new QueryParser(Version.LUCENE_36, Constant.FILE_CONTENT, new StandardAnalyzer(Version.LUCENE_36));
    //            Query q = queryParser.parse(str);
                
                //注意检索关键字的大小写问题  在检索文件中 关键字全部小写化 
                //用QueryParser指定 StandardAnalyzer时 StandardAnalyzer底层会将关键字小写化
                if(str != null){
                    str = str.toLowerCase();
                }
                Query titleQ = new TermQuery(new Term(Constant.FILE_NAME, str));
                Query contentQ = new TermQuery(new Term(Constant.FILE_CONTENT, str));
                
                //5.3版本不再直接使用构造函数创建BooleanQuery对象,而是通过Builder对象的build方法取代
    //            Builder queryBuilder = new BooleanQuery.Builder();
    //            queryBuilder.add(titleQ, Occur.MUST);
    //            queryBuilder.add(contentQ, Occur.SHOULD);
                BooleanQuery query = new BooleanQuery();
                query.add(titleQ, Occur.MUST);
                query.add(contentQ, Occur.SHOULD);
                
                
                //SortFiled构造函数的两个参数分别指定了Field(进行排序的域)和Type(指定排序的类型)
                Sort sort = new Sort(new SortField(Constant.FILE_NAME, SortField.DOC));
                //只查询十条记录
                TopDocs res = indexSearcher.search(query, 10, sort);
                //符合条件的总记录数
                int totalLength = res.totalHits;
                System.out.println("一共有" + totalLength + "条记录符合条件");
                //本次查询的记录数
                int length = res.scoreDocs.length;
                System.out.println("本次查询" + length + "条记录");
                for(ScoreDoc scoreDoc : res.scoreDocs){
                    //根据ScoreDoc取对应Document记录
                    Document doc = indexSearcher.doc(scoreDoc.doc);
                    //取记录对应的Field用于显示
                    System.out.println("文件名:" + doc.get(Constant.FILE_NAME) + "; 路径:" + doc.get(Constant.FILE_PATH));
                }
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }/* catch (ParseException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }*/ finally {
                if(searcherManager != null){
                    try {
                        searcherManager.release(indexSearcher);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if(indexWriter != null){
                    try {
                        indexWriter.close();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
                if(dir != null){
                    try {
                        dir.close();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    测试类:

    public class LuceneTest {
    
        public static void main(String[] args) {
            IndexGenerator.generatorIndex("E:\exercise\testData");
    //        
            
            LuceneSearcher.search("happy");
        }
    
    }

    过程中使用TermQuery的时候踩了不小的坑,主要是索引关键字大小写的问题:经过StandardAnalyzer分词的域(如文件内容)在建立索引时,关键字会被自动转成小写;而TermQuery不会对查询词做任何分析,因此用TermQuery查询的时候要先手动把查询词转成小写。

    而使用QueryParser时,查询字符串会先经过所指定的Analyzer(如StandardAnalyzer)处理,自动转成小写,所以不需要手动转换。

    另外,Lucene版本变动对API的影响也较大,不少接口在新版本中已被废弃。

    上述代码用的3.6的版本。

    参考资料:

    Lucene主要API介绍

    luceneapi.com

    lucene易佰教程

  • 相关阅读:
    [转载]PHP中PSR-[0-4]规范
    Git忽略规则及.gitignore规则不生效的解决办法
    nginx配置tp5的pathinfo模式并隐藏后台入口文件
    php过滤&nbsp;字符
    使用ajax的post方式下载excel
    scws简单中文分词
    php的api及登录的权限验证
    对钩子的理解
    基于角色的权限控制
    微信开发之SVN提交代码与FTP同步到apache的根目录
  • 原文地址:https://www.cnblogs.com/insaneXs/p/7347782.html
Copyright © 2011-2022 走看看