zoukankan      html  css  js  c++  java
  • Lucene学习笔记(1)初步了解Lucene

          Lucene是一个强大的Java搜索库,它能让你很轻易地将搜索功能加入到任何程序中。刚开始学习Lucene,首先要了解Lucene的整体架构,这样就能清晰地理解程序中由Lucene完成的内容,以及其他需要你自行完成的内容。

          搜索程序首先需要实现的功能是索引链,这需要按照几个独立的步骤依次来完成:1、检索原始内容;2、根据原始内容来创建对应的文档;3、对创建的文档进行索引。一旦建立起索引,用于搜索的组件也就出来了,这些搜索组件包括:用户接口、构建可编程查询语句的方法、执行查询语句(或者检索匹配文档)、展现查询结果等。

           根据以上的说明,我们先来创建一个Lucene的示例程序,通过这个示例来进一步了解Lucene的易用性和强大功能。

           1、建立索引

           

    package com.lucene.demo;
    
    import java.io.File;
    import java.io.FileFilter;
    import java.io.FileReader;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Minimal Lucene indexing demo: adds one document per eligible file found
     * directly inside a data directory (sub-directories are not descended into).
     *
     * <p>Each document carries three fields: {@code contents} (read from the file
     * through a {@link FileReader}), {@code filename} and {@code fullpath}.
     */
    public class Indexer {

        /** Writer through which every document is added to the index. */
        private final IndexWriter writer;

        /**
         * Opens an {@link IndexWriter} on the given index directory.
         *
         * @param indexDir path of the directory that holds the Lucene index
         * @throws IOException if the directory or the writer cannot be opened
         */
        public Indexer(String indexDir) throws IOException {
            Directory dir = FSDirectory.open(new File(indexDir));
            // The third argument (create = true) asks Lucene to create the index,
            // overwriting one that may already exist at this location.
            writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
        }

        /**
         * Closes the underlying {@link IndexWriter}.
         *
         * @throws IOException if closing the writer fails
         */
        public void close() throws IOException {
            writer.close();
        }

        /**
         * Indexes every eligible file located directly inside {@code dataDir}.
         *
         * @param dataDir directory whose files are to be indexed
         * @param filter  optional filter; {@code null} accepts every file
         * @return the value of {@code writer.numDocs()} after indexing
         * @throws IOException if {@code dataDir} cannot be listed (for example it
         *                     does not exist or is not a directory)
         * @throws Exception   if reading or indexing a file fails
         */
        public int index(String dataDir, FileFilter filter) throws Exception {
            File[] files = new File(dataDir).listFiles();
            // listFiles() returns null rather than an empty array when the path is
            // not a directory or an I/O error occurs; fail with a clear message
            // instead of a NullPointerException in the loop below.
            if (files == null) {
                throw new IOException("Cannot list files in data directory: " + dataDir);
            }

            for (File f : files) {
                if (isIndexable(f, filter)) {
                    indexFile(f);
                }
            }
            return writer.numDocs();
        }

        /** Tells whether {@code f} is a visible, readable regular entry accepted by {@code filter}. */
        private static boolean isIndexable(File f, FileFilter filter) {
            return !f.isDirectory()
                    && !f.isHidden()
                    && f.exists()
                    && f.canRead()
                    && (filter == null || filter.accept(f));
        }

        /** Accepts only files whose name ends with {@code .txt} (case-insensitive). */
        private static class TextFilesFilter implements FileFilter {

            @Override
            public boolean accept(File pathname) {
                return pathname.getName().toLowerCase().endsWith(".txt");
            }

        }

        /**
         * Builds the Lucene document describing one file.
         *
         * @param f file to describe
         * @return a document with the {@code contents}, {@code filename} and
         *         {@code fullpath} fields
         * @throws Exception if the file cannot be opened or its canonical path
         *                   cannot be resolved
         */
        protected Document getDocument(File f) throws Exception {
            // Resolve the path before opening the reader so that a failure here
            // cannot leave an open FileReader behind.
            String fullPath = f.getCanonicalPath();

            Document doc = new Document();
            // NOTE(review): FileReader decodes with the platform default charset;
            // confirm that this matches the encoding of the indexed files.
            doc.add(new Field("contents", new FileReader(f)));
            doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("fullpath", fullPath, Field.Store.YES, Field.Index.NOT_ANALYZED));
            return doc;
        }

        /** Logs the file's canonical path and adds its document to the index. */
        private void indexFile(File f) throws Exception {
            System.out.println("Indexing " + f.getCanonicalPath());
            Document doc = getDocument(f);
            writer.addDocument(doc);
        }

        /**
         * Indexes the {@code .txt} files of a fixed data directory and prints how
         * long it took.
         */
        public static void main(String args[]) throws Exception {
            // Directory in which the Lucene index is stored.
            String indexDir = "E:\\luceneDir\\indexDir";

            // Directory containing the files to be indexed.
            String dataDir = "E:\\luceneDir\\dataDir";

            long start = System.currentTimeMillis();
            Indexer indexer = new Indexer(indexDir);
            int numIndexed;
            try {
                numIndexed = indexer.index(dataDir, new TextFilesFilter());
            } finally {
                // Always release the writer, even when indexing fails.
                indexer.close();
            }
            long end = System.currentTimeMillis();

            System.out.println(" Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
        }

    }

    2、搜索索引

    package com.lucene.demo;
    
    import java.io.File;
    import java.io.IOException;
    
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Minimal Lucene search demo: parses a query string against the
     * {@code contents} field and prints the {@code fullpath} of the top hits.
     */
    public class Searcher {

        /**
         * Runs a query against an existing index and prints the matches.
         *
         * <p>A summary line goes to {@code System.err}; the {@code fullpath} field
         * of each of the (at most 10) returned hits goes to {@code System.out}.
         *
         * @param indexDir path of the directory that holds the Lucene index
         * @param q        query string, parsed by {@link QueryParser}
         * @throws Exception if the index cannot be opened or read, or if the
         *                   query string cannot be parsed
         */
        public static void search(String indexDir, String q) throws Exception {
            // Open the index.
            Directory dir = FSDirectory.open(new File(indexDir));
            try {
                IndexSearcher is = new IndexSearcher(dir);
                try {
                    // Parse the query string.
                    QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                            new StandardAnalyzer(Version.LUCENE_30));
                    Query query = parser.parse(q);

                    long start = System.currentTimeMillis();
                    // Search the index.
                    TopDocs hits = is.search(query, 10);
                    long end = System.currentTimeMillis();

                    System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start)
                            + " milliseconds) that matched query '" + q + "';");

                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = is.doc(scoreDoc.doc);
                        System.out.println(doc.get("fullpath"));
                    }
                } finally {
                    // Release the searcher even when parsing or searching fails.
                    is.close();
                }
            } finally {
                dir.close();
            }
        }

        /** Searches a fixed index directory for the term {@code lucene}. */
        public static void main(String[] args) throws Exception {
            String indexDir = "E:\\luceneDir\\indexDir";
            String queryStr = "lucene";
            search(indexDir, queryStr);
        }

    }

    通过以上代码,我们初步了解了Lucene的功能,但不要因为这个例子简单就感到满足,Lucene包含的内容还有很多。

  • 相关阅读:
    [kuangbin带你飞]专题十六 KMP & 扩展KMP & ManacherK
    [kuangbin带你飞]专题十六 KMP & 扩展KMP & Manacher J
    [kuangbin带你飞]专题十六 KMP & 扩展KMP & Manacher I
    pat 1065 A+B and C (64bit)(20 分)(大数, Java)
    pat 1069 The Black Hole of Numbers(20 分)
    pat 1077 Kuchiguse(20 分) (字典树)
    pat 1084 Broken Keyboard(20 分)
    pat 1092 To Buy or Not to Buy(20 分)
    pat 1046 Shortest Distance(20 分) (线段树)
    pat 1042 Shuffling Machine(20 分)
  • 原文地址:https://www.cnblogs.com/angryprogrammer/p/3056183.html
Copyright © 2011-2022 走看看