package com.lucene.test;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
public class IndexUtil {
private static final Logger LOGGER = Logger.getLogger(IndexUtil.class);
private Directory directory = null;
private DirectoryReader reader = null;
private IndexWriterConfig config = null;
private IndexWriter writer = null;
public static final IndexUtil Instance = new IndexUtil();
private IndexUtil() {
try {
directory = FSDirectory.open(new File("D:/lucene/index"));
config = new IndexWriterConfig(Version.LUCENE_44,
new StandardAnalyzer(Version.LUCENE_44));
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Adds documents to the index.
 * @throws IOException
 */
public void index() throws IOException {
writer = new IndexWriter(directory, config);
File file = new File("D:/lucene/example");
Document document = null;
int id = 0;
long start = new Date().getTime();
LOGGER.info("添加索引…………………………");
for (File f : file.listFiles()) {
document = new Document();
document.add(new StringField("name", f.getName(), Store.YES));
document.add(new IntField("id", id++, Store.YES));
document.add(new StringField("path", f.getAbsolutePath(), Store.YES));
document.add(new TextField("context", new FileReader(f)));
writer.addDocument(document);
}
long end = new Date().getTime();
LOGGER.info("添加索引完成,用时:" + (end - start) / 1000.0 + "s…………………………");
writer.close();
}
/**
 * Searches the index.
 * @throws IOException
 * @throws ParseException
 */
public void search() throws IOException, ParseException {
reader = DirectoryReader.open(directory);
QueryParser parser = new QueryParser(Version.LUCENE_44, "context",
new StandardAnalyzer(Version.LUCENE_44));
Query query = parser.parse("lucene");
IndexSearcher searcher = new IndexSearcher(reader);
TopDocs docs = searcher.search(query, 100);
/*
 * reader.maxDoc(): total number of documents in the index, including both live and deleted ones
 * reader.numDocs(): number of live documents, excluding deleted ones
 * reader.numDeletedDocs(): number of deleted documents
 */
LOGGER.info("总记录:" + docs.totalHits + " 命中文档数:" + docs.scoreDocs.length
+ " 最大的文档数maxDoc:" + reader.maxDoc() + " 删除文件数numDeletedDocs:"
+ reader.numDeletedDocs() + " numDocs" + reader.numDocs());
for (ScoreDoc doc : docs.scoreDocs) {
Document document = reader.document(doc.doc);
LOGGER.info("id:" +document.get("id") + " name:"
+ document.get("name") + " path:" + document.get("path"));
}
reader.close();
}
/**
 * Updates a document in the index.
 * @throws IOException
 */
public void update() throws IOException {
writer = new IndexWriter(directory, config);
Document document = new Document();
document.add(new StringField("name", "newfile", Store.YES));
document.add(new IntField("id", 12, Store.YES));
document.add(new StringField("path", "D:/lucene/example/newfile.txt", Store.YES));
// Note: an IntField is indexed as numeric trie terms, so Term("id", "2") will not match it;
// in that case updateDocument behaves like addDocument and nothing is deleted.
writer.updateDocument(new Term("id", "2"), document);
writer.commit();
writer.close();
}
/**
 * Deletes documents from the index. Deleted documents are only marked as deleted
 * (recorded in a .del file), much like moving them to a recycle bin.
 * @throws IOException
 */
public void delete() throws IOException {
writer = new IndexWriter(directory, config);
writer.deleteDocuments(new Term("name", "11.txt"));
writer.close();
}
/**
 * Deletes all documents from the index. As above, the documents are only marked as
 * deleted (recorded in a .del file), much like moving them to a recycle bin.
 * @throws IOException
 */
public void deleteAll() throws IOException {
writer = new IndexWriter(directory, config);
writer.deleteAll();
writer.close();
}
/**
 * Permanently removes documents that have been marked as deleted, i.e. empties the
 * "recycle bin" left behind by the delete methods above.
 * @throws IOException
 */
public void forceMergeDeletes() throws IOException {
writer = new IndexWriter(directory, config);
writer.forceMergeDeletes(); // purge documents previously marked as deleted
writer.close();
}
/**
 * Dumps the fields and terms stored in the index.
 * @throws IOException
 */
public void showIndex() throws IOException {
reader = DirectoryReader.open(directory);
Fields fields = MultiFields.getFields(reader); // all fields in the index
for (String field : fields) {
LOGGER.info(field);
}
// list all terms of the "context" field
Terms terms = fields.terms("context");
TermsEnum termsEnum = terms.iterator(null);
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
System.out.print(term.utf8ToString() + " "); // term text
System.out.print(termsEnum.docFreq() + " "); // number of documents containing the term
System.out.print(termsEnum.totalTermFreq() + " "); // total occurrences of the term across all documents
DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
// if the field was indexed without positions, docsAndPositionsEnum is null; skip it
if (docsAndPositionsEnum == null) {
continue;
}
int docId;
while ((docId = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
Document document = reader.document(docId); // fetch the stored document
System.out.print(docId + " "); // document id
System.out.print(document.get("name") + " "); // stored value of the "name" field
int freq = docsAndPositionsEnum.freq(); // number of occurrences of the term in this document
for (int i = 0; i < freq; i++) {
System.out.print(docsAndPositionsEnum.nextPosition() + ":"); // term position
System.out.print("[" + docsAndPositionsEnum.startOffset() + ","); // start offset of the term
System.out.print(docsAndPositionsEnum.endOffset() + "],"); // end offset of the term
System.out.print(docsAndPositionsEnum.getPayload() + " "); // payload, if any
}
}
System.out.println();
}
reader.close();
}
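// A minimal usage sketch: drives the methods above through the Instance singleton.
// It assumes the hard-coded D:/lucene paths exist and that log4j is configured; adjust before running.
public static void main(String[] args) {
try {
IndexUtil util = IndexUtil.Instance;
util.index(); // build the index from the files under D:/lucene/example
util.search(); // run the "lucene" query against the "context" field
util.delete(); // mark the document named "11.txt" as deleted
util.forceMergeDeletes(); // physically purge documents marked as deleted
} catch (IOException | ParseException e) {
LOGGER.error("IndexUtil demo failed", e);
}
}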
}