zoukankan      html  css  js  c++  java
  • Lucene实战构建索引

    搭建lucene的步骤这里就不详细介绍了,无外乎就是下载相关jar包,在eclipse中新建java工程,引入相关的jar包即可

    本文主要在没有剖析lucene的源码之前实战一下,通过实战来促进研究

    建立索引

    下面的程序展示了indexer的使用

    package com.wuyudong.mylucene;
    
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.FileFilter;
    import java.io.IOException;
    import java.io.FileReader;
    
    /**
     * Command-line tool that builds a Lucene index from the files found directly
     * inside a data directory (sub-directories are skipped, not descended into).
     *
     * <p>Usage: {@code java IndexerTest <index dir> <data dir>}
     */
    public class IndexerTest {

      /**
       * Indexes every {@code *.txt} file of the data directory and prints how many
       * documents the index holds and how long the run took.
       *
       * @param args {@code args[0]} is the index directory, {@code args[1]} the data directory
       * @throws IllegalArgumentException if exactly two arguments are not supplied
       * @throws Exception if opening the index or indexing a file fails
       */
      public static void main(String[] args) throws Exception {
        if (args.length != 2) {
          throw new IllegalArgumentException("Usage: java " + IndexerTest.class.getName()
            + " <index dir> <data dir>");
        }
        String indexDir = args[0];         // 1 directory in which the index is created
        String dataDir = args[1];          // 2 directory whose *.txt files get indexed

        long start = System.currentTimeMillis();
        IndexerTest indexer = new IndexerTest(indexDir);
        int numIndexed;
        try {
          numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
          indexer.close();                 // close the writer even when indexing fails
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
          + (end - start) + " milliseconds");
      }

      /** Writer through which all documents are added to the index. */
      private final IndexWriter writer;

      /**
       * Opens an {@link IndexWriter} on the given directory.
       *
       * @param indexDir path of the directory that holds the index
       * @throws IOException if the directory or the writer cannot be opened
       */
      public IndexerTest(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        // NOTE(review): the boolean is the constructor's "create" flag in the
        // Lucene 3.0 API, i.e. an existing index is presumably replaced - confirm.
        writer = new IndexWriter(dir,                           // 3 create the IndexWriter
                     new StandardAnalyzer(Version.LUCENE_30),   // 3
                     true,                                      // 3
                     IndexWriter.MaxFieldLength.UNLIMITED);     // 3
      }

      /**
       * Closes the underlying {@link IndexWriter}.
       *
       * @throws IOException if closing the writer fails
       */
      public void close() throws IOException {
        writer.close();                             // 4 close the IndexWriter
      }

      /**
       * Indexes every regular, visible, readable file directly inside
       * {@code dataDir} that is accepted by {@code filter}.
       *
       * @param dataDir directory whose files are indexed
       * @param filter  decides which files are indexed; {@code null} accepts all files
       * @return the value of {@code writer.numDocs()} after indexing
       * @throws IOException if {@code dataDir} cannot be listed (not a directory or unreadable)
       * @throws Exception if indexing one of the files fails
       */
      public int index(String dataDir, FileFilter filter)
        throws Exception {

        File[] files = new File(dataDir).listFiles();
        if (files == null) {
          // listFiles() yields null instead of throwing when the path is not a
          // directory or an I/O error occurs; fail with a clear message rather
          // than a NullPointerException in the loop below.
          throw new IOException("Cannot list files of data directory: " + dataDir);
        }

        for (File f : files) {
          if (!f.isDirectory() &&
              !f.isHidden() &&
              f.exists() &&
              f.canRead() &&
              (filter == null || filter.accept(f))) {
            indexFile(f);
          }
        }

        return writer.numDocs();                     // 5 number of documents in the index
      }

      /** Accepts files whose name ends with {@code .txt}, ignoring case. */
      private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
          return path.getName().toLowerCase()        // 6 only *.txt files are indexed
                 .endsWith(".txt");                  // 6
        }
      }

      /**
       * Builds the Lucene document that represents one file.
       *
       * @param f file to convert
       * @return a document with the fields {@code contents}, {@code filename} and {@code fullpath}
       * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
       */
      protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // NOTE(review): the reader is not closed here; presumably the writer
        // consumes and closes it while adding the document - confirm.
        doc.add(new Field("contents", new FileReader(f)));      // 7 index the file contents
        doc.add(new Field("filename", f.getName(),              // 8 file name: stored, not analyzed
                    Field.Store.YES, Field.Index.NOT_ANALYZED));// 8
        doc.add(new Field("fullpath", f.getCanonicalPath(),     // 9 canonical path: stored, not analyzed
                    Field.Store.YES, Field.Index.NOT_ANALYZED));// 9
        return doc;
      }

      /**
       * Logs the file's canonical path and adds its document to the index.
       *
       * @param f file to index
       * @throws Exception if the document cannot be built or added
       */
      private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);                              // 10 add the document to the Lucene index
      }
    }

    在eclipse中配置好参数:

    E:\luceneinaction\index E:\luceneinaction\lia2e\src\lia\meetlucene\data

    运行结果如下:

    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\apache1.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\apache1.1.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\apache2.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\cpl1.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\epl1.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\freebsd.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\gpl1.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\gpl2.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\gpl3.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\lgpl2.1.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\lgpl3.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\lpgl2.0.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\mit.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\mozilla1.1.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\mozilla_eula_firefox3.txt
    Indexing E:\luceneinaction\lia2e\src\lia\meetlucene\data\mozilla_eula_thunderbird2.txt
    Indexing 16 files took 888 milliseconds

    在index文件内会产生索引文件:

    虽然被索引的文件都很小,数量也不多(如下图),但建立索引仍然花费了888ms,还是很让人不安

    总体说来,搜索索引比建立索引重要,因为搜索很多次,而索引只是建立一次

    搜索索引

    接下来将创建一个程序 来对上面创建的索引进行搜索:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.IOException;
    
    /**
     * Command-line tool that runs a query against the {@code contents} field of an
     * existing Lucene index and prints the stored {@code fullpath} of each hit.
     *
     * <p>Usage: {@code java SearcherTest <index dir> <query>}
     */
    public class SearcherTest {

      /**
       * Entry point.
       *
       * @param args {@code args[0]} is the index directory, {@code args[1]} the query string
       * @throws IllegalArgumentException if exactly two arguments are not supplied
       * @throws IOException if the index cannot be opened or read
       * @throws ParseException if the query string cannot be parsed
       */
      public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        if (args.length != 2) {
          throw new IllegalArgumentException("Usage: java " + SearcherTest.class.getName()
            + " <index dir> <query>");
        }

        String indexDir = args[0];               // 1 index directory from the command line
        String q = args[1];                      // 2 query string from the command line

        search(indexDir, q);
      }

      /**
       * Searches the index for {@code q}, reports the hit count and elapsed time on
       * standard error, and prints the {@code fullpath} field of up to ten hits on
       * standard output.
       *
       * @param indexDir path of the directory that holds the index
       * @param q        query string, parsed against the {@code contents} field
       * @throws IOException if the index cannot be opened, searched or closed
       * @throws ParseException if {@code q} is not a valid query
       */
      public static void search(String indexDir, String q)
        throws IOException, ParseException {

        Directory dir = FSDirectory.open(new File(indexDir)); // 3 open the index directory
        try {
          IndexSearcher is = new IndexSearcher(dir);          // 3
          try {
            QueryParser parser = new QueryParser(Version.LUCENE_30,   // 4 parse the query string
                                                 "contents",          // 4
                                 new StandardAnalyzer(                // 4
                                   Version.LUCENE_30));               // 4
            Query query = parser.parse(q);                            // 4
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);                      // 5 search the index
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits +   // 6 report search statistics
              " document(s) (in " + (end - start) +          // 6
              " milliseconds) that matched query '" +        // 6
              q + "':");                                     // 6

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
              Document doc = is.doc(scoreDoc.doc);           // 7 load the matching document
              System.out.println(doc.get("fullpath"));       // 8 print its stored full path
            }
          } finally {
            // Runs on every path, so a ParseException or a failed search no
            // longer leaves the searcher open.
            is.close();                                      // 9 close the IndexSearcher
          }
        } finally {
          dir.close();   // the directory was opened here, so it is closed here as well
        }
      }
    }

    设置好参数:E:\luceneinaction\index patent

    运行结果如下:

    Found 8 document(s) (in 12 milliseconds) that matched query 'patent':
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\cpl1.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\mozilla1.1.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\epl1.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\gpl3.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\apache2.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\gpl2.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\lpgl2.0.txt
    E:\luceneinaction\lia2e\src\lia\meetlucene\data\lgpl2.1.txt

    可以看到速度很快(12ms),打印的是文件的绝对路径,这是因为indexer存储的是文件的绝对路径

  • 相关阅读:
    关于zabbix 的lld的web界面的配置
    fastdfs+nginx的安装部署
    mybatis 模糊查询写法
    mybatis-generator:generate failed: Exception getting JDBC Driver: com.mysql.jdbc.Driver
    mybatis判断不为空,不为null等
    mybatis错误:Parameter 'companyName' not found. Available parameters are [arg3, arg2, arg1, arg0,..]
    oracle数据库 ORA-01810: 格式代码出现两次
    Shell学习(一)
    本地计算机上的OracleOraDb11g_home1TNSListener服务启动后停止
    net share c$=c: 发生系统错误
  • 原文地址:https://www.cnblogs.com/wuyudong/p/5391000.html
Copyright © 2011-2022 走看看