  • Lucene code reading guide and test examples

    Reading guide

    Lucene 原理与代码分析 (Lucene: Principles and Code Analysis), complete edition -- strongly recommended

    Lucene introduction and source code analysis: http://javenstudio.org/blog/annotated-lucene -- centered on the core IndexWriter

    Download: Annotated+Lucene+.pdf: http://ishare.iask.sina.com.cn/f/24103589.html

    Reading steps:

    1. Understand the basic principles and concepts of full-text retrieval.

    2. Learn Lucene's basic concepts.

    3. Get familiar with Lucene's index file format -- this is the key step.

    4. Get familiar with Lucene's indexing flow. The class hierarchy is deep, and unnecessary design patterns make the code relatively hard to read. The basic idea: a controller + model pair encapsulates the indexing chain, enabling multi-threaded concurrent processing (no data is shared between threads).

    5. Get familiar with Lucene's search flow.

    6. Understand Lucene's query parser and get familiar with analysis (word segmentation); see the sketch right after this list.
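
    For steps 5 and 6 it helps to drive an analyzer by hand and inspect the tokens it emits, since those tokens are exactly what the query parser and the indexing chain consume. Below is a minimal sketch (the class name and sample text are ours; it assumes lucene-core-2.9.1.jar, the same version the test programs below use) based on the attribute-based TokenStream API introduced in Lucene 2.9:

    AnalyzerDemo.java

    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    
    public class AnalyzerDemo {
        public static void main(String[] args) throws Exception {
            Analyzer analyzer = new StandardAnalyzer();
            // Ask the analyzer for the token stream it would produce for this field.
            TokenStream ts = analyzer.tokenStream("name",
                    new StringReader("This is the text to be indexed."));
            // Lucene 2.9 attribute API: the term text is exposed via TermAttribute.
            TermAttribute term = ts.addAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                System.out.println(term.term());   // prints: text, indexed
            }
            ts.close();
        }
    }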

    The recommended materials dissect Lucene's source code in depth and are very valuable. The documentation alone is not concrete enough: after a first pass over the docs, read them again alongside the source. The source gives you a rough grasp of the basic concepts, but the documentation's explanations of source-level details can easily leave you "seeing the branches but not the forest" and struggling to understand. Following the overall roadmap the documentation's authors provide while reading the actual source makes everything much easier.

    Testing

    Test programs are extremely helpful for understanding how Lucene works and how the code executes; they are an important aid when reading the source.

    IndexerExample.java

    /*
     * Compile : javac -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar  IndexerExample.java
     * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar  IndexerExample
     *
     */
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.cn.ChineseAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.DateTools;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    
    public class IndexerExample {
        
        private static void EnExample() throws Exception {
    
            // Store the index on disk
            Directory directory = FSDirectory.getDirectory("/tmp/testindex");
            // Use standard analyzer
            Analyzer analyzer = new StandardAnalyzer();
            // Create IndexWriter object
            IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
            iwriter.setMaxFieldLength(25000);
            // make a new, empty document
            Document doc = new Document();
            File f = new File("/tmp/test.txt");
            
            // Add the path of the file as a field named "path".  Use a field that is
            // indexed (i.e. searchable), but don't tokenize the field into words.
            doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            
            String text = "This is the text to be indexed.";
            doc.add(new Field("fieldname", text, Field.Store.YES,      Field.Index.TOKENIZED));
            doc.add(new Field("name", text, Field.Store.YES,      Field.Index.TOKENIZED));
            
        // Add the last modified date of the file as a field named "modified".  Use
            // a field that is indexed (i.e. searchable), but don't tokenize the field
            // into words.
            doc.add(new Field("modified",
                        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Add the contents of the file to a field named "contents".  Specify a Reader,
            // so that the text of the file is tokenized and indexed, but not stored.
            // Note that FileReader expects the file to be in the system's default encoding.
            // If that's not the case searching for special characters will fail.
            doc.add(new Field("contents", new FileReader(f)));
            
            iwriter.addDocument(doc);
            iwriter.optimize();
            iwriter.close();
    
        }
     
        private static void CnExample() throws Exception {
    
            // Store the index on disk
            Directory directory = FSDirectory.getDirectory("/tmp/testindex");
            // Use chinese analyzer
            Analyzer analyzer = new ChineseAnalyzer();
            PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
            wrapper.addAnalyzer("name", analyzer);
            
            // Create IndexWriter object
            IndexWriter iwriter = new IndexWriter(directory, wrapper, true);
            iwriter.setMaxFieldLength(25000);
            // make a new, empty document
            Document doc = new Document();
            File f = new File("/tmp/test.txt");
            
            // Add the path of the file as a field named "path".  Use a field that is
            // indexed (i.e. searchable), but don't tokenize the field into words.
            doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            
            String text = "This is the text to be indexed.";
            doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED));
            
            String name = "2013春装新款女气质修身风衣大翻领双层大摆长款外套 系腰带";
            doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
            
        // Add the last modified date of the file as a field named "modified".  Use
            // a field that is indexed (i.e. searchable), but don't tokenize the field
            // into words.
            doc.add(new Field("modified",
                        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Add the contents of the file to a field named "contents".  Specify a Reader,
            // so that the text of the file is tokenized and indexed, but not stored.
            // Note that FileReader expects the file to be in the system's default encoding.
            // If that's not the case searching for special characters will fail.
            doc.add(new Field("contents", new FileReader(f)));
            
            iwriter.addDocument(doc);
            iwriter.optimize();
            iwriter.close();
        }
    
        public static void main(String[] args) throws Exception {
            System.out.println("Start test: ");
    
            if( args.length > 0){
                CnExample();
            }
            else{
                EnExample();
            }
    
            System.out.println("Index dir: /tmp/testindex");
        }
    }

    SearcherExample.java

    /*
     * Compile : javac -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar  SearcherExample.java
     * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar  SearcherExample
     * 
     */
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.StringReader;
    import java.util.Date;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.cn.ChineseAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.DateTools;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Searcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.queryParser.QueryParser;
    
    
    public class SearcherExample { 
    
        public static void main(String[] args) throws Exception { 
            if (args.length < 2) { 
                throw new Exception("Usage: java " + Searcher.class.getName() 
                        + "<index dir> <query> [cn]"); 
            } 
            File indexDir = new File(args[0]);
            String q = args[1]; 
        boolean bCn = args.length > 2;
    
            if (!indexDir.exists() || !indexDir.isDirectory()) { 
                throw new Exception(indexDir + 
                        " does not exist or is not a directory."); 
            } 
            search(indexDir, q, bCn); 
        } 
    
        public static void search(File indexDir, String q, boolean bCn) 
            throws Exception { 
            Directory fsDir = FSDirectory.getDirectory(indexDir, false); 
            IndexSearcher is = new IndexSearcher(fsDir);
    
            Analyzer analyzer = new StandardAnalyzer();
            if( bCn ){
                analyzer = new ChineseAnalyzer();
            }
    
        QueryParser parser = new QueryParser("name", analyzer);
            Query query = parser.parse(q); 
            
            System.out.println("Query: " + query.toString());
            long start = new Date().getTime(); 
            Hits hits = is.search(query);
            long end = new Date().getTime(); 
    
            System.err.println("Found " + hits.length() + 
                    " document(s) (in " + (end - start) + 
                    " milliseconds) that matched query '" + 
                    q + "'"); 
    
            for (int i = 0; i < hits.length(); i++) { 
                Document doc = hits.doc(i); 
                System.out.println( "HIT " + i + " :" + doc.get("name")); 
            } 
        } 
    } 
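
    A typical end-to-end test then looks like this (classpath as in the comments above; "indexed" is just a sample term that the English test document contains):

    java -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar IndexerExample
    java -classpath .:../lucene-core-2.9.1.jar:../ChineseSegmenter/chineseSegmenter.jar SearcherExample /tmp/testindex indexed

    The first command builds the English index under /tmp/testindex; the second parses the query against the "name" field and prints each hit.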

    For Chinese word segmentation you can use Lucene's built-in analyzer (the quality is poor) or wrap your own segmenter; the core of doing so is wrapping the segmenter as a Tokenizer, as in the class below.

    package org.apache.lucene.analysis.cn;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.commons.lang.StringUtils;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.Tokenizer;
    
    public class SnippetTermTokenizer extends Tokenizer {
            private StringBuffer buffer = new StringBuffer();
            private BufferedReader inputBuffer;
            private JNISelecter selecter;     // core Chinese word-segmentation class
            private List<Token> tokenList = null;
            private List<String> phraseTokenList = null;
            private Iterator<Token> tokenIter = null;
    
            public SnippetTermTokenizer(Reader reader, JNISelecter s) {
                    inputBuffer = new BufferedReader(reader, 2048);
                    selecter = s;
            }
    
            public Token next() throws IOException {
                    if (tokenIter != null) {
                            if (tokenIter.hasNext()) {
                                    return tokenIter.next();
                            } else {
                                    // finish read input
                                    return null;
                            }
                    }
                    // need to read content
                    readContent();
                if (segment() && tokenIter.hasNext()) {
                        // segmentation succeeded and produced at least one token
                        return tokenIter.next();
                }
                    return null;
            }
    
            public void close() throws IOException {
                    inputBuffer.close();
            }
           
            // readContent(), segment(), and the rest of the segmentation logic are omitted
    }
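
    Using this tokenizer from IndexWriter or QueryParser requires an Analyzer around it. Below is a minimal sketch (SnippetTermAnalyzer is an illustrative name of ours; how the JNISelecter instance is obtained is assumed to be handled by the segmenter library, and real code would likely append TokenFilters after the tokenizer):

    package org.apache.lucene.analysis.cn;
    
    import java.io.Reader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    
    // Minimal Analyzer wrapper: every field's text is fed to SnippetTermTokenizer.
    public class SnippetTermAnalyzer extends Analyzer {
            private final JNISelecter selecter;   // assumed to be constructed elsewhere
    
            public SnippetTermAnalyzer(JNISelecter s) {
                    selecter = s;
            }
    
            public TokenStream tokenStream(String fieldName, Reader reader) {
                    return new SnippetTermTokenizer(reader, selecter);
            }
    }

    An instance of this analyzer can then be passed to IndexWriter or QueryParser in place of ChineseAnalyzer in the examples above, or registered for a single field via PerFieldAnalyzerWrapper.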