zoukankan      html  css  js  c++  java
  • lucene的增量更新

    对于每天更新的索引,可以采用增量更新,例子如下:

    第一天:
    pid mondayCv
    123   23000
    
    第二天
    pid  mondayCv   tuesdayCv
    123   23000        45000
    package com.sachie.lucene.test;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.Fieldable;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.SimpleFSDirectory;
    import org.apache.lucene.util.Version;
    import com.sachie.lucene.model.TestObject;
    
    /**
     * Demonstrates incremental index updates with Lucene 3.6.
     *
     * <p>Every file in the source directory holds one day's tab-separated records
     * ({@code pid}, {@code cvOne}, {@code cvAll}). Each record is indexed under
     * field names prefixed with the file name, so the document for a given
     * {@code pid} accumulates one pair of fields per processed file.
     */
    public class CreateTeest {

        /**
         * Writer configuration shared by the class: standard analyzer,
         * create-or-append open mode and a 512 MB RAM buffer.
         */
        static IndexWriterConfig conf = null;
        static {
            Analyzer analysis = new StandardAnalyzer(Version.LUCENE_36);
            conf = new IndexWriterConfig(Version.LUCENE_36, analysis);
            conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
            conf.setRAMBufferSizeMB(512.00);
        }

        /**
         * Adds a stored, non-analyzed field to the given document.
         *
         * @param doc   document that receives the field
         * @param name  field name
         * @param value field value, indexed verbatim as a single term
         */
        public void addDoc(Document doc, String name, String value) {
            doc.add(new Field(name, value, Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
        }

        /**
         * Parses a tab-separated data file into {@link TestObject} records.
         * Blank lines are skipped; the first three columns of every other line
         * are passed to the {@code TestObject} constructor.
         *
         * @param file data file to read
         * @return the parsed records, in file order
         * @throws FileNotFoundException if the file does not exist
         * @throws IOException if reading fails or a non-blank line has fewer
         *         than three tab-separated columns
         */
        public List<TestObject> getIndexObject(File file) throws IOException {
            if (!file.exists()) {
                throw new FileNotFoundException(file.getPath());
            }
            List<TestObject> objectList = new ArrayList<TestObject>();
            BufferedReader br = new BufferedReader(new FileReader(file));
            try {
                String line;
                int lineNo = 0;
                while ((line = br.readLine()) != null) {
                    lineNo++;
                    if (line.trim().length() == 0) {
                        continue; // tolerate blank / trailing empty lines
                    }
                    String[] columns = line.split("\t");
                    if (columns.length < 3) {
                        throw new IOException("Malformed record at " + file.getPath()
                                + ":" + lineNo + " - expected 3 tab-separated columns, found "
                                + columns.length);
                    }
                    objectList.add(new TestObject(columns[0], columns[1],
                            columns[2]));
                }
            } finally {
                // Release the file handle even when reading or parsing fails.
                br.close();
            }
            return objectList;

        }

        /**
         * Indexes every regular file found in {@code d:\data} into the index at
         * {@code d:\testIndex}, committing and merging after each file.
         *
         * @throws FileNotFoundException if the source directory cannot be listed
         * @throws IOException if a data file cannot be read or the index cannot
         *         be opened, committed or closed
         */
        public void createIndex() throws IOException {
            String sourcePath = "d:\\data";
            String target = "d:\\testIndex";
            // List once: listFiles() hits the file system on every call and
            // returns null when the path is missing or not a directory.
            File[] dataFiles = new File(sourcePath).listFiles();
            if (dataFiles == null) {
                throw new FileNotFoundException(
                        "Source directory not found or unreadable: " + sourcePath);
            }
            Directory directory = new SimpleFSDirectory(new File(target));
            try {
                IndexWriter indexWriter = new IndexWriter(directory, conf);
                try {
                    for (File file : dataFiles) {
                        if (!file.isFile()) {
                            continue; // sub-directories cannot be parsed as data
                        }
                        indexFile(indexWriter, directory, file);
                        // Commit so the reader opened for the next file sees
                        // the documents written for this one.
                        indexWriter.commit();
                        indexWriter.forceMerge(1);
                    }
                } finally {
                    // Always release the index write lock.
                    indexWriter.close();
                }
            } finally {
                directory.close();
            }

        }

        /**
         * Indexes the records of one data file, merging them into already
         * committed documents when an index exists.
         */
        private void indexFile(IndexWriter indexWriter, Directory directory,
                File file) throws IOException {
            String date = file.getName();
            List<TestObject> list = this.getIndexObject(file);

            // Decide between "add" and "merge" from the state of the index, not
            // from the position of the file: with CREATE_OR_APPEND a re-run
            // starts on a populated index, and blindly adding would duplicate
            // every pid.
            IndexReader reader = null;
            IndexSearcher searcher = null;
            if (IndexReader.indexExists(directory)) {
                reader = IndexReader.open(directory);
                searcher = new IndexSearcher(reader);
            }

            try {
                for (TestObject tmp : list) {
                    Document doc = new Document();
                    addDoc(doc, "pid", tmp.getPid());
                    addDoc(doc, date + "cvOne", tmp.getCvOne());
                    addDoc(doc, date + "cvAll", tmp.getCvAll());
                    if (searcher == null) {
                        indexWriter.addDocument(doc);
                    } else {
                        this.searchAndUpdateDocument(indexWriter, searcher,
                                doc, new Term("pid", tmp.getPid()));
                    }
                }

            } catch (Exception e) {
                // Best effort per file: report the failure and let the caller
                // commit what was written and continue with the next file.
                e.printStackTrace();
            } finally {
                // A searcher built on a reader does not own it; close both.
                if (searcher != null) {
                    searcher.close();
                }
                if (reader != null) {
                    reader.close();
                }
            }
        }

        /**
         * Merges the fields of {@code updateDoc} into the single indexed
         * document matching {@code term}, or adds {@code updateDoc} as a new
         * document when nothing matches.
         *
         * @param writer    writer used to add or replace the document
         * @param searcher  searcher used to look up the existing document
         * @param updateDoc document carrying the new or replacement fields
         * @param term      term identifying the document to update
         * @throws IOException if searching or writing the index fails
         * @throws IllegalArgumentException if {@code term} matches more than
         *         one document
         */
        public void searchAndUpdateDocument(IndexWriter writer,
                IndexSearcher searcher, Document updateDoc, Term term)
                throws IOException {
            TermQuery query = new TermQuery(term);
            TopDocs hits = searcher.search(query, 10);
            if (hits.scoreDocs.length == 0) {
                writer.addDocument(updateDoc);
            } else if (hits.scoreDocs.length > 1) {
                throw new IllegalArgumentException(
                        "Given Term matches more than 1 document in the index.");
            } else {
                int docId = hits.scoreDocs[0].doc;
                // NOTE(review): the merged document is rebuilt from stored
                // fields only; this works here because addDoc stores every
                // field - confirm before indexing unstored fields.
                Document doc = searcher.doc(docId);
                List<Fieldable> replacementFields = updateDoc.getFields();
                for (Fieldable field : replacementFields) {
                    // Drop any previous value so the new field replaces it;
                    // removeFields is a no-op when the name is absent.
                    doc.removeFields(field.name());
                    doc.add(field);
                }
                writer.updateDocument(term, doc);
            }

        }

        /**
         * Runs the indexing demo.
         *
         * @param args unused
         * @throws IOException if indexing fails
         */
        public static void main(String args[]) throws IOException {
            CreateTeest ic = new CreateTeest();
            ic.createIndex();
        }

    }
    I believe,I can. Sachie.Dong
  • 相关阅读:
    Python爬虫实例:爬取豆瓣Top250
    爬虫协议 Robots
    【Python 库】bs4的使用
    【Python 库】Selenium 的使用
    【Python 库】Selenium 浏览器驱动
    【Python 库】机器学习三剑客之 NumPy
    【Python】zip 函数的用法
    面试(一)-HashMap
    由树到数据库索引
    Win10下安装RabbitMQ以及基本知识学习
  • 原文地址:https://www.cnblogs.com/sachie/p/3108736.html
Copyright © 2011-2022 走看看