lucene的多种搜索2-SpanQuery

zoukankan html css js c++ java

lucene的多种搜索2-SpanQuery
SpanQuery（跨度查询）根据词在文档中的位置以及词与词之间的距离进行查询，例如查找彼此相邻或相距不超过指定距离的几个词。

SpanQuery包括以下几种：

SpanTermQuery：词距查询的基础，结果和TermQuery相似，只不过是增加了查询结果中单词的距离信息。

SpanFirstQuery：从字段内容的起始位置开始，在指定的距离（位置数）之内查找指定词的查询。

SpanNearQuery：要求所查询的几个词之间保持着一定的距离（不超过指定的间隔）。

SpanOrQuery：把多个词距查询的结果合并起来，作为最终的查询结果。

SpanNotQuery：从一个词距查询结果中，去除一个词距查询。

下面一个简单例子介绍
Java代码

package com;



//SpanQuery：跨度查询。此类为抽象类。



import java.io.IOException;

import java.io.StringReader;

import java.util.ArrayList;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.Token;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.Field.Index;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.spans.SpanFirstQuery;

import org.apache.lucene.search.spans.SpanNearQuery;

import org.apache.lucene.search.spans.SpanNotQuery;

import org.apache.lucene.search.spans.SpanOrQuery;

import org.apache.lucene.search.spans.SpanQuery;

import org.apache.lucene.search.spans.SpanTermQuery;

import org.apache.lucene.search.spans.Spans;

import org.apache.lucene.store.RAMDirectory;



/**
 * Small demonstration of the Lucene span queries ({@link SpanTermQuery},
 * {@link SpanFirstQuery}, {@link SpanNearQuery}, {@link SpanNotQuery} and
 * {@link SpanOrQuery}) against a two-document in-memory index.
 *
 * <p>Typical usage: call {@link #index()} once, run any of the
 * {@code span*QueryTest} methods, then call {@link #close()}.
 *
 * <p>The two indexed documents are:
 * <pre>
 *   doc 0: the quick brown fox jumps over the lazy dog
 *   doc 1: the quick red fox jumps over the sleepy cat
 * </pre>
 */
public class SpanQueryTest {

    /** Name of the single field every document is indexed under. */
    private static final String FIELD = "field";

    private RAMDirectory directory;

    private IndexSearcher indexSearcher;

    private IndexReader reader;

    private SpanTermQuery quick;

    private SpanTermQuery brown;

    private SpanTermQuery red;

    private SpanTermQuery fox;

    private SpanTermQuery lazy;

    private SpanTermQuery sleepy;

    private SpanTermQuery dog;

    private SpanTermQuery cat;

    private Analyzer analyzer;

    /**
     * Builds the in-memory index, creates the reusable single-term span
     * queries and opens the searcher and reader used by the demos.
     *
     * @throws IOException if the index cannot be written or opened
     */
    public void index() throws IOException {
        directory = new RAMDirectory();
        analyzer = new WhitespaceAnalyzer();

        // Third argument "true" asks the writer to create a fresh index.
        IndexWriter writer = new IndexWriter(directory, analyzer, true);
        try {
            Document doc1 = new Document();
            doc1.add(new Field(FIELD,
                    "the quick brown fox jumps over the lazy dog", Store.YES,
                    Index.TOKENIZED));

            Document doc2 = new Document();
            doc2.add(new Field(FIELD,
                    "the quick red fox jumps over the sleepy cat", Store.YES,
                    Index.TOKENIZED));

            writer.addDocument(doc1);
            writer.addDocument(doc2);
            writer.optimize();
        } finally {
            // Always release the writer, even if adding a document failed.
            writer.close();
        }

        quick = new SpanTermQuery(new Term(FIELD, "quick"));
        brown = new SpanTermQuery(new Term(FIELD, "brown"));
        red = new SpanTermQuery(new Term(FIELD, "red"));
        fox = new SpanTermQuery(new Term(FIELD, "fox"));
        lazy = new SpanTermQuery(new Term(FIELD, "lazy"));
        sleepy = new SpanTermQuery(new Term(FIELD, "sleepy"));
        dog = new SpanTermQuery(new Term(FIELD, "dog"));
        cat = new SpanTermQuery(new Term(FIELD, "cat"));

        indexSearcher = new IndexSearcher(directory);
        reader = IndexReader.open(directory);
    }

    /**
     * Releases the searcher and reader opened by {@link #index()}.
     * Safe to call when {@link #index()} was never invoked.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            if (indexSearcher != null) {
                indexSearcher.close();
                indexSearcher = null;
            }
        } finally {
            if (reader != null) {
                reader.close();
                reader = null;
            }
        }
    }

    /**
     * Runs {@code query} and prints one line per matching span: the stored
     * text of the document with the matched span wrapped in {@code <...>},
     * followed by the document's score in parentheses.
     *
     * @param query the span query to execute
     * @throws IOException if searching or reading the index fails
     */
    private void dumpSpans(SpanQuery query) throws IOException {
        // A span query can be searched like any other query; this pass is
        // only used to collect the score of every matching document.
        Hits hits = indexSearcher.search(query);

        // Indexed by document id, so size it from the reader instead of
        // hard-coding the number of documents in the demo index.
        float[] scores = new float[reader.maxDoc()];
        for (int i = 0; i < hits.length(); i++) {
            scores[hits.id(i)] = hits.score(i);
        }

        // The spans expose the positional information of each match.
        Spans spans = query.getSpans(reader);
        while (spans.next()) {
            int id = spans.doc();
            Document doc = reader.document(id);
            Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer,
                    doc.get(FIELD));

            StringBuilder buffer = new StringBuilder();
            for (int i = 0; i < tokens.length; i++) {
                // Example: "brown" is the third token of doc 0, so its span
                // has start() == 2 and end() == 3. "<" is emitted before the
                // token at start() and ">" after the token at end() - 1,
                // giving "<brown>". The token index is used as the position,
                // which assumes one position per token.
                if (i == spans.start()) {
                    buffer.append("<");
                }
                buffer.append(tokens[i].termText());
                if (i + 1 == spans.end()) {
                    buffer.append(">");
                }
                buffer.append(" ");
            }
            buffer.append("(").append(scores[id]).append(") ");

            System.out.println(buffer);
        }
    }

    /**
     * SpanTermQuery: finds the same documents as a TermQuery but also records
     * match positions; it is the building block of the other span queries.
     *
     * @throws IOException if searching fails
     */
    public void spanTermQueryTest() throws IOException {
        dumpSpans(brown);

        // Output recorded by the original author:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanFirstQuery: matches spans of the wrapped query that end within a
     * fixed number of positions from the start of the field.
     *
     * @throws IOException if searching fails
     */
    public void spanFirstQueryTest() throws IOException {
        // the quick brown fox jumps over the lazy dog
        // "brown" is the third token of doc 0, so the end limit must be >= 3
        // for it to be found; with a limit of 2 only "the quick" is covered.
        SpanFirstQuery firstQuery = new SpanFirstQuery(brown, 3);
        dumpSpans(firstQuery);

        // Output recorded by the original author:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanNearQuery: similar to PhraseQuery, but its clauses may themselves
     * be span queries, so matches can be nested and treated as a unit.
     *
     * @throws IOException if searching fails
     */
    public void spanNearQueryTest() throws IOException {
        // the quick brown fox jumps over the lazy dog

        // Second argument (slop): maximum number of unmatched positions
        // allowed between the clauses. Third argument: true means the clauses
        // must occur in the given order.
        // "quick brown fox" is adjacent and in order in doc 0, so a slop of 0
        // would already match; 5 is just a generous allowance.
        SpanNearQuery nearQuery = new SpanNearQuery(new SpanQuery[] { quick,
                brown, fox }, 5, true);

        dumpSpans(nearQuery);

        // Unordered search for quick, dog, brown. The matching span runs from
        // "quick" to "dog" (8 positions) and contains 3 matched terms, which
        // leaves 5 unmatched positions, so slop must be >= 5 here.
        nearQuery = new SpanNearQuery(new SpanQuery[] { quick, dog, brown }, 5,
                false);

        dumpSpans(nearQuery);

        // Output recorded by the original author:
        // first dumpSpans:  the <quick brown fox> jumps over the lazy dog (0.34204215)
        // second dumpSpans: the <quick brown fox jumps over the lazy dog> (0.27026406)
    }

    /**
     * SpanNotQuery: keeps the spans of the first query that do not overlap
     * any span of the second query.
     *
     * @throws IOException if searching fails
     */
    public void spanNotQueryTest() throws IOException {
        // the quick brown fox jumps over the lazy dog

        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick,
                fox }, 1, true);

        // Matches both "quick brown fox" and "quick red fox".
        dumpSpans(quick_fox);

        // Remove the matches that contain "red", leaving "quick brown fox".
        SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red);

        dumpSpans(no_quick_red_fox);

        // Output recorded by the original author (the first dumpSpans prints
        // the first two lines, the second dumpSpans prints the third):
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
        // the <quick red fox> jumps over the sleepy cat (0.18579213)
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
    }

    /**
     * SpanOrQuery: the union of the spans matched by all of its clauses.
     *
     * @throws IOException if searching fails
     */
    public void spanOrQueryTest() throws IOException {
        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick,
                fox }, 1, true);

        SpanNearQuery lazy_dog = new SpanNearQuery(
                new SpanQuery[] { lazy, dog }, 0, true);

        SpanNearQuery sleepy_cat = new SpanNearQuery(new SpanQuery[] { sleepy,
                cat }, 0, true);

        // "quick ... fox" followed by "lazy dog" within 3 positions (doc 0).
        SpanNearQuery qf_near_ld = new SpanNearQuery(new SpanQuery[] {
                quick_fox, lazy_dog }, 3, true);

        dumpSpans(qf_near_ld);

        // "quick ... fox" followed by "sleepy cat" within 3 positions (doc 1).
        SpanNearQuery qf_near_sc = new SpanNearQuery(new SpanQuery[] {
                quick_fox, sleepy_cat }, 3, true);

        dumpSpans(qf_near_sc);

        SpanOrQuery or = new SpanOrQuery(new SpanQuery[] { qf_near_ld,
                qf_near_sc });

        dumpSpans(or);

        // Output recorded by the original author (first dumpSpans prints line
        // one, the second prints line two, the third prints lines three and
        // four):
        // the <quick brown fox jumps over the lazy dog> (0.3321948)
        // the <quick red fox jumps over the sleepy cat> (0.3321948)
        // the <quick brown fox jumps over the lazy dog> (0.5405281)
        // the <quick red fox jumps over the sleepy cat> (0.5405281)
    }

    /**
     * Builds the demo index and runs the SpanOrQuery demonstration.
     *
     * @param args unused
     * @throws IOException if indexing or searching fails
     */
    public static void main(String[] args) throws IOException {
        SpanQueryTest test = new SpanQueryTest();
        try {
            test.index();
            test.spanOrQueryTest();
        } finally {
            // Release the searcher and reader even when a demo throws.
            test.close();
        }
    }

}



/**
 * Helper for turning a piece of text into the tokens an analyzer produces
 * for it.
 */
class AnalyzerUtils {

    /**
     * Runs {@code text} through {@code analyzer} and collects every token the
     * resulting stream yields, in order.
     *
     * @param analyzer the analyzer used to tokenize the text
     * @param text     the text to analyze
     * @return the tokens produced for {@code text}; empty if there are none
     * @throws IOException if reading from or closing the token stream fails
     */
    public static Token[] tokensFromAnalysis(Analyzer analyzer, String text)
            throws IOException {
        // NOTE(review): the field name "contents" is passed through as-is;
        // presumably the analyzers used here ignore it - confirm before
        // using this helper with a per-field analyzer.
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(
                text));
        List<Token> tokens = new ArrayList<Token>();
        try {
            // next() returns null once the stream is exhausted.
            for (Token token = stream.next(); token != null; token = stream
                    .next()) {
                tokens.add(token);
            }
        } finally {
            // Release the stream (and the underlying reader) in all cases.
            stream.close();
        }
        return tokens.toArray(new Token[tokens.size()]);
    }

}
查看全文

相关阅读:
DBUtils温习2
DBUtils温习1
C3P0连接池温习1
JDBC复习2
JDBC复习1
Spring的AOP基于AspectJ的注解方式开发3
Spring的AOP基于AspectJ的注解方式开发2
Spring的AOP基于AspectJ的注解方式开发1
高血压认知3
pandas cookbook

原文地址：https://www.cnblogs.com/1130136248wlxk/p/5031130.html