zoukankan      html  css  js  c++  java
  • lucene创建索引的几种方式(一)

    什么是索引:

    根据你输入的值去找,这个值就是索引

    第一种创建索引的方式:

    根据文件来生成索引,如后缀为.txt等的文件

    步骤:

    第一步:FSDirectory.open(Paths.get(url));根据路径获取存储索引的目录。

    FSDirectory:表示对文件系统目录的操作。RAMDirectory :内存中的目录操作。

    Paths为NIO(new io)的一个类;Path 类是 java.io.File 类的升级版,File file = new File("index.html") 而 Path path = Paths.get("index.html");由于 Path 类基于字符串创建,因此它引用的资源也有可能不存在。

    关于nio:传统的io流都是通过字节的移动来处理的,也就是说输入/输出流一次只能处理一个字节,因此面向流的输入/输出系统通常效率不高;因此引进了新IO(new IO),NIO采用内存映射文件的方式来处理输入/输出,NIO将文件或文件的一段区域映射到内存中,这样就可以像访问内存一样来访问文件了(这种方式模拟了操作系统上的虚拟内存的概念),所以NIO的效率很高。

    第二步:new IndexWriter(Directory,IndexWriterConfig)创建索引

    第三步:索引指定目录的文件

    第四步:将文件写入lucene中的文档(Document)

    package com.wp.util;
    
    import java.io.File;
    import java.io.FileReader;
    import java.nio.file.Paths;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    public class Indexer {
    
        private IndexWriter writer; // 写索引实例
    
        /**
         * 构造方法 实例化IndexWriter
         * 
         * @param indexDir
         * @throws Exception
         */
        public Indexer(String indexDir) throws Exception {
            Directory dir = FSDirectory.open(Paths.get(indexDir));// 根据路径获取存储索引的目录
            Analyzer analyzer = new StandardAnalyzer(); // 这里用了多态,StandardAnalyzer是标准分词器,Analyzer是一个分词器
     IndexWriterConfig iwc = new IndexWriterConfig(analyzer); writer = new IndexWriter(dir, iwc); } /** * 关闭写索引 * * @throws Exception */ public void close() throws Exception { writer.close(); } /** * 索引指定目录的所有文件 * * @param dataDir * @throws Exception */ public int index(String dataDir) throws Exception { File[] files = new File(dataDir).listFiles(); for (File f : files) { indexFile(f); } return writer.numDocs(); } /** * 索引指定文件 * * @param f */ private void indexFile(File f) throws Exception { // 关于f.getCanonicalPath()查看http://www.blogjava.net/dreamstone/archive/2007/08/08/134968.html System.out.println("索引文件:" + f.getCanonicalPath()); Document doc = getDocument(f); writer.addDocument(doc); } /** * 获取文档,文档里再设置每个字段 * * @param f */ private Document getDocument(File f) throws Exception { Document doc = new Document(); doc.add(new TextField("contents", new FileReader(f))); doc.add(new TextField("fileName", f.getName(), Field.Store.YES)); doc .add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES)); return doc; } public static void main(String[] args) { String indexDir = "D:\lucene4"; String dataDir = "D:\lucene4\data"; Indexer indexer = null; int numIndexed = 0; long start = System.currentTimeMillis(); try { indexer = new Indexer(indexDir); numIndexed = indexer.index(dataDir); } catch (Exception e) { e.printStackTrace(); } finally { try { indexer.close(); } catch (Exception e) { e.printStackTrace(); } } long end = System.currentTimeMillis(); System.out.println("索引:" + numIndexed + " 个文件 花费了" + (end - start) + " 毫秒"); } }

    第二种创建索引的方式:

    根据字段来生成索引,我用的是数组

    第一步:创建索引

    第二步:将字段添加到文档中

    package com.wp.util;
    
    import java.nio.file.Paths;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.junit.Before;
    import org.junit.Test;
    
    public class IndexIngTest {
    
        private String ids[] = { "1", "2", "3" };
        private String citys[] = { "qingdao", "nanjing", "shanghai" };
        private String descs[] = { "Qingdao is a beautiful city.",
                "Nanjing is a city of culture.", "Shanghai is a bustling city." };
    
        private Directory dir;// 目录
    
        /**
         * 获取IndexWriter实例
         * 
         * @return
         * @throws Exception
         */
        private IndexWriter getWriter() throws Exception {
            Analyzer analyzer = new StandardAnalyzer(); // 标准分词器
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            IndexWriter writer = new IndexWriter(dir, iwc);
            return writer;
        }
    
        /**
         * 添加文档
         * 
         * @throws Exception
         */
        @Before
        public void setUp() throws Exception {
            dir = FSDirectory.open(Paths.get("D:\lucene\luceneIndex"));// 得到luceneIndex目录
            IndexWriter writer = getWriter();// 得到索引
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();// 创建文档
                doc.add(new StringField("id", ids[i], Field.Store.YES));// 将id属性存入内存中
                doc.add(new StringField("city", citys[i], Field.Store.YES));
                doc.add(new TextField("desc", descs[i], Field.Store.NO));
                writer.addDocument(doc); // 添加文档
            }
            writer.close();
        }
    
        /**
         * 测试写了几个文档
         * 
         * @throws Exception
         */
        @Test
        public void testIndexWriter() throws Exception {
            IndexWriter writer = getWriter();
            System.out.println("写入了" + writer.numDocs() + "个文档");
            writer.close();
        }
    
        /**
         * 测试读取文档
         * 
         * @throws Exception
         */
        @Test
        public void testIndexReader() throws Exception {
            IndexReader reader = DirectoryReader.open(dir);
            System.out.println("最大文档数:" + reader.maxDoc());
            System.out.println("实际文档数:" + reader.numDocs());
            reader.close();
        }
    
        /**
         * 测试删除 在合并前
         * 
         * @throws Exception
         */
        @Test
        public void testDeleteBeforeMerge() throws Exception {
            IndexWriter writer = getWriter();
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));// term:根据id找到为1的
            writer.commit();
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
            writer.close();
        }
    
        /**
         * 测试删除 在合并后
         * 
         * @throws Exception
         */
        @Test
        public void testDeleteAfterMerge() throws Exception {
            IndexWriter writer = getWriter();
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.forceMergeDeletes(); // 强制删除
            writer.commit();
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
            writer.close();
        }
    
        /**
         * 测试更新
         * 
         * @throws Exception
         */
        @Test
        public void testUpdate() throws Exception {
            IndexWriter writer = getWriter();
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));
            doc.add(new StringField("city", "qingdao", Field.Store.YES));
            doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
            writer.updateDocument(new Term("id", "1"), doc);
            writer.close();
        }
    }

    生成的索引文件如下:

    关于索引的搜索:

    这里有一个要注意的地方:一定要先创建出索引后才能去进行查找,否则会报

    org.apache.lucene.index.IndexNotFoundException:
    no segments* file found in MMapDirectory@D:\lucene lockFactory=org.apache.lucene.store.NativeFSLockFactory@753f67a9: files: [data, lucene-5.3.1, lucene-5.3.1.zip]
    package com.wp.lucene;
    
    import java.nio.file.Paths;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    
    public class Searcher {
        /**
         * 
         * @param indexDir
         *            哪个目录
         * @param q
         *            要查询的字段
         * @throws Exception
         */
        public static void search(String indexDir, String q) throws Exception {
            Directory dir = FSDirectory.open(Paths.get(indexDir));// 打开目录
            IndexReader reader = DirectoryReader.open(dir);// 进行读取
            IndexSearcher is = new IndexSearcher(reader);// 索引查询器
            Analyzer analyzer = new StandardAnalyzer(); // 标准分词器
            QueryParser parser = new QueryParser("contents", analyzer);// 在哪查询,第一个参数为查询的Document,在Indexer中创建了
            Query query = parser.parse(q);// 对字段进行解析后返回给查询
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);// 开始查询,10代表前10条数据;返回一个文档
            long end = System.currentTimeMillis();
            System.out.println("匹配 " + q + " ,总共花费" + (end - start) + "毫秒" + "查询到"
                    + hits.totalHits + "个记录");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);// 根据文档的标识获取文档
                System.out.println(doc.get("fullPath"));
            }
            reader.close();
        }
    
        /**
         * 执行这个main方法进行查询之前,必须要有索引,即先执行Indexer这个类
         * 
         * @param args
         */
        public static void main(String[] args) {
            String indexDir = "D:\lucene";
            String q = "ADD";
            try {
                search(indexDir, q);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    Java小生店铺:

    Pc端:http://shop125970977.taobao.com/index.htm

    手机端:搜索 java小生店铺

    希望店铺的资料能帮助到你!!!

     

  • 相关阅读:
    2018 ICPC南京网络赛 A An Olympian Math Problem(数论题)
    算法竞赛模板 素数测试(Miller-Rabin测试)
    算法竞赛模板 tarjan算法
    2018 CCPC网络赛 1004 Find Integer(勾股数+费马大定理)
    算法竞赛模板 概率dp
    算法竞赛模板 重载运算符
    算法竞赛模板 矩阵快速幂
    算法竞赛模板 回文素数
    算法竞赛模板 AC自动机
    算法竞赛模板 拓扑排序
  • 原文地址:https://www.cnblogs.com/lirenzhujiu/p/5912243.html
Copyright © 2011-2022 走看看