zoukankan      html  css  js  c++  java
  • 关于使用Filter降低Lucene tf idf打分计算的调研

    将query改成filter,lucene中有个QueryWrapperFilter性能比较差,所以基本上都需要自己写filter。包括TermFilter,ExactPhraseFilter,ConjunctionFilter,DisjunctionFilter。

    这几天验证下来,还是or改善最明显,4个termfilter,4508个返回结果,在我本机上性能提高1/3。ExactPhraseFilter也有小幅提升(5%-10%)。

    最令人不解的是and,原来以为跟结果数和子查询数相关,但几次测试下来性能基本都是下降。

    附ExactPhraseFilter和ut代码:

    import java.io.IOException;
    import java.util.ArrayList;
    
    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermContext;
    import org.apache.lucene.index.TermState;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSet;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.Bits;
    
    // A fake to lucene phrase query, but far simplified.
    /**
     * A {@link Filter} that accepts the documents in which all added terms occur
     * consecutively, in the order they were added, within a single field.
     *
     * <p>No scoring is performed; the filter only decides whether a document
     * contains the phrase. Terms are registered with {@link #add(Term)}; the i-th
     * added term is expected at phrase offset i.
     */
    public class ExactPhraseFilter extends Filter {
        /** Phrase terms, in the order they were added. */
        protected final ArrayList<Term> terms = new ArrayList<Term>();
        /** Phrase offset of each term; entry i belongs to {@code terms.get(i)}. */
        protected final ArrayList<Integer> positions = new ArrayList<Integer>();
        
        /** Field shared by all phrase terms; {@code null} until the first term is added. */
        protected String fieldName;
        
        /**
         * Appends a term to the end of the phrase.
         *
         * @param term the next phrase term; must be in the same field as the terms
         *             added before it
         * @throws IllegalArgumentException if the term's field differs from the
         *             field of the first term
         */
        public void add(Term term) {
            if (terms.isEmpty()) {
                fieldName = term.field();
            } else if (!fieldName.equals(term.field())) {
                // Compare by value, not identity: equal field names need not be the same String instance.
                throw new IllegalArgumentException(
                        "All phrase terms must be in the same field (" + fieldName + "): " + term);
            }
            positions.add(Integer.valueOf(terms.size()));
            terms.add(term);
        }
        
        /**
         * Builds the set of documents of one index segment that contain the phrase.
         *
         * @param context    the segment to search
         * @param acceptDocs documents allowed to match; passed through to the postings enumerators
         * @return a doc id set for the segment
         * @throws IOException if reading the index fails
         */
        @Override
        public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException
        {
            return new ExactPhraseDocIdSet(context, acceptDocs);
        }
        
        /**
         * Postings of one phrase term together with the bookkeeping used while
         * matching. The natural order is ascending document frequency, then
         * ascending phrase offset.
         */
        static class PostingAndFreq implements Comparable<PostingAndFreq> {
            DocsAndPositionsEnum posEnum;
            int docFreq;
            /** Offset of the term inside the phrase. */
            int position;
            /** Whether to skip with advance() right away instead of stepping with nextDoc(). */
            boolean useAdvance;
            /** Number of positions of the term in the current document. */
            int posFreq = 0;
            /** Last position read in the current document, minus {@link #position}. */
            int pos = -1;
            /** Number of positions already read in the current document. */
            int posTime = 0;
            
            public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) {
                this.posEnum = posEnum;
                this.docFreq = docFreq;
                this.position = position;
                this.useAdvance = useAdvance;
            }
         
            @Override
            public int compareTo(PostingAndFreq other) {
                // Explicit comparisons rather than subtraction, so the result can never overflow.
                if (docFreq != other.docFreq) {
                    return docFreq < other.docFreq ? -1 : 1;
                }
                if (position != other.position) {
                    return position < other.position ? -1 : 1;
                }
                return 0;
            }
        }
        
        /**
         * Doc id set of one segment.
         *
         * <p>NOTE: the postings enumerators are created once, in the constructor,
         * and are shared by every iterator returned from {@link #iterator()}.
         * Only one of those iterators can therefore be consumed.
         */
        protected class ExactPhraseDocIdSet extends DocIdSet {
            protected final AtomicReaderContext context;
            protected final Bits acceptDocs;
            /** One entry per phrase term, sorted by ascending docFreq once fully populated. */
            protected final PostingAndFreq[] postings;
            /** Set when it is already known that no document of the segment can match. */
            protected boolean noDocs = false;
            
            /**
             * Looks up every phrase term in the segment and positions the postings
             * of all but the leading (lowest docFreq) term on their first document.
             *
             * @throws IOException           if reading the index fails
             * @throws IllegalStateException if the field has no position data
             */
            public ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
                this.context = context;
                this.acceptDocs = acceptDocs;
                this.postings = new PostingAndFreq[terms.size()];
                
                // An empty phrase matches nothing; it also has no field to look up.
                if (terms.isEmpty()) {
                    noDocs = true;
                    return;
                }
                
                // null means the segment does not contain the field at all.
                final Terms fieldTerms = context.reader().terms(fieldName);
                if (fieldTerms == null) {
                    noDocs = true;
                    return;
                }
                
                final TermsEnum te = fieldTerms.iterator(null);
                for (int i = 0; i < postings.length; ++i) {
                    final Term t = terms.get(i);
                    if (!te.seekExact(t.bytes(), true)) {
                        // A term missing from the segment rules out every document.
                        noDocs = true;
                        return;
                    }
                    final DocsAndPositionsEnum posEnum = te.docsAndPositions(acceptDocs, null, 0);
                    if (posEnum == null) {
                        throw new IllegalStateException("field \"" + fieldName
                                + "\" was indexed without position data; cannot run ExactPhraseFilter (term="
                                + t.text() + ")");
                    }
                    postings[i] = new PostingAndFreq(posEnum, te.docFreq(), positions.get(i), false);
                }
                
                ArrayUtil.mergeSort(postings);
                for (int i = 1; i < postings.length; ++i) {
                    // Decide the skipping strategy relative to the leading term, i.e. after sorting.
                    postings[i].useAdvance = postings[i].docFreq > 5L * postings[0].docFreq;
                    if (postings[i].posEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                        // This term has no acceptable document, so the phrase cannot match.
                        noDocs = true;
                        return;
                    }
                }
            }
            
            @Override
            public DocIdSetIterator iterator() throws IOException
            {
                if (noDocs) {
                    return EMPTY_DOCIDSET.iterator();
                } else {
                    return new ExactPhraseDocIdSetIterator(context, acceptDocs);
                }
            }
            
            /**
             * Iterates over the documents containing the phrase. The leading term
             * proposes candidate documents; a candidate is returned when every other
             * term also occurs in it and the positions line up.
             */
            protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator {
                protected int docID = -1;
                
                /** Both parameters are unused; state is read from the enclosing doc id set. */
                public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException {
                }
                
                @Override
                public int nextDoc() throws IOException {
                    return docID = nextMatch(postings[0].posEnum.nextDoc());
                }
                
                @Override
                public int advance(int target) throws IOException {
                    return docID = nextMatch(postings[0].posEnum.advance(target));
                }
                
                /**
                 * Returns the first matching document at or after {@code doc}, where
                 * {@code doc} is the document the leading term is positioned on.
                 */
                private int nextMatch(int doc) throws IOException {
                    final DocsAndPositionsEnum lead = postings[0].posEnum;
                    while (doc != NO_MORE_DOCS) {
                        if (othersContain(doc) && containsPhrase()) {
                            return doc;
                        }
                        doc = lead.nextDoc();
                    }
                    return NO_MORE_DOCS;
                }
                
                /**
                 * Moves the non-leading terms up to {@code doc}; returns whether all
                 * of them occur in it. Stops at the first term that skipped past it.
                 */
                private boolean othersContain(int doc) throws IOException {
                    for (int i = 1; i < postings.length; ++i) {
                        final PostingAndFreq pf = postings[i];
                        int other = pf.posEnum.docID();
                        if (pf.useAdvance) {
                            if (other < doc) {
                                other = pf.posEnum.advance(doc);
                            }
                        } else {
                            // Step linearly at first; fall back to advance() on the 50th step.
                            int iter = 0;
                            while (other < doc) {
                                if (++iter == 50) {
                                    other = pf.posEnum.advance(doc);
                                } else {
                                    other = pf.posEnum.nextDoc();
                                }
                            }
                        }
                        if (other > doc) {
                            return false;
                        }
                    }
                    return true;
                }
                
                /**
                 * Checks the positions of the current document, on which every
                 * postings enumerator must be positioned. Each position is shifted by
                 * the term's phrase offset, so the phrase occurs exactly when all
                 * terms share one shifted position.
                 */
                private boolean containsPhrase() throws IOException {
                    int index = -1;
                    int i = 0;
                    PostingAndFreq pf;
                    
                    // Read the first position of every term.
                    for (i = 0; i < postings.length; ++i) {
                        postings[i].posFreq = postings[i].posEnum.freq();
                        postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position;
                        postings[i].posTime = 1;
                    }
                    
                    while (true) {
                        pf = postings[0];
                        
                        // Leading term: catch up with the current candidate position.
                        while (pf.pos < index && pf.posTime < pf.posFreq) {
                            pf.pos = pf.posEnum.nextPosition() - pf.position;
                            ++pf.posTime;
                        }
                        if (pf.pos >= index) {
                            index = pf.pos;
                        } else if (pf.posTime == pf.posFreq) {
                            // Out of positions before reaching the candidate.
                            return false;
                        }
                        
                        // Other terms: each must land exactly on the candidate.
                        for (i = 1; i < postings.length; ++i) {
                            pf = postings[i];
                            while (pf.pos < index && pf.posTime < pf.posFreq) {
                                pf.pos = pf.posEnum.nextPosition() - pf.position;
                                ++pf.posTime;
                            }
                            if (pf.pos > index) {
                                // Overshot: retry from the leading term with the larger candidate.
                                index = pf.pos;
                                break;
                            }
                            if (pf.pos == index) {
                                continue;
                            }
                            if (pf.posTime == pf.posFreq) {
                                return false;
                            }
                        }
                        if (i == postings.length) {
                            return true;
                        }
                    }
                }
    
                @Override
                public int docID()
                {
                    return docID;
                }
            }
    
        }
        
    }


    UT:

    import java.io.IOException;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    
    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.search.ConstantScoreQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;
    import org.testng.annotations.AfterTest;
    import org.testng.annotations.BeforeTest;
    import org.testng.annotations.Test;
    
    import com.dp.arts.lucenex.codec.Dp10Codec;
    
    public class ExactPhraseFilterTest
    {
        final Directory dir = new RAMDirectory();
        
        @BeforeTest
        public void setUp() throws IOException {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
            iwc.setOpenMode(OpenMode.CREATE);
            iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME));
            
            IndexWriter writer = new IndexWriter(dir, iwc); 
            addDocument(writer, "新疆烧烤");  // 0
            addDocument(writer, "啤酒");  // 1
            addDocument(writer, "烤烧");  // 2
            addDocument(writer, "烧烧烧");  // 3
            addDocument(writer, "烤烧中华烧烤"); // 4
            writer.close();
        }
        
        private void addDocument(IndexWriter writer, String str) throws IOException {
            Document doc = new Document();
            doc.add(new TextField("searchkeywords", str, Store.YES));
            writer.addDocument(doc, new StandardAnalyzer(Version.LUCENE_40));
        }
        
        @AfterTest
        public void tearDown() throws IOException
        {
            this.dir.close();
        }
        
        @Test
        public void test1() throws IOException
        {
            IndexReader reader = DirectoryReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            
            ExactPhraseFilter pf = new ExactPhraseFilter();
            pf.add(new Term("searchkeywords", "烧"));
            pf.add(new Term("searchkeywords", "烤"));
            Query query = new ConstantScoreQuery(pf);
            TopDocs results = searcher.search(query, 20);
            
            assert results.totalHits == 2;
            assert results.scoreDocs[0].doc == 0;
            assert results.scoreDocs[1].doc == 4;
            
            searcher.getIndexReader().close();
        }
    }
    


  • 相关阅读:
    jQuery入门和DOM对象
    jQuery事件
    基础,层次,选择器
    MarkDown快速入门(typora)
    source是读入环境配置文件的命令,不能读入vimrc
    vi中将tab键转化为空格
    django-rest-framework学习之Quickstart和Serializer--2017年4月10日至12日
    Flask-RESTful插件介绍--2017年4月7日
    python restful api 编程--2017年4月6日
    一个验证登录的程序:python编写flask架构restful风格--2017年4月6日
  • 原文地址:https://www.cnblogs.com/mqxnongmin/p/10695682.html
Copyright © 2011-2022 走看看