zoukankan      html  css  js  c++  java
  • lucene3.6.0的高亮显示

    需要引入

    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-core</artifactId>
    			<version>3.6.0</version>
    		</dependency>
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-highlighter</artifactId>
    			<version>3.6.0</version>
    		</dependency>

    示例代码:

    import java.io.File;
    import java.io.IOException;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Command-line demo that runs a query against an existing Lucene 3.6 index and
     * prints, for every hit, the best fragment of the {@code context} field with
     * the matching terms highlighted.
     */
    public class DocSearch {

        /** File-system location of the index that is searched. */
        private static final String INDEX_PATH = "E:\\output\\lucence\\index";

        /** Name of the stored, analyzed field that is queried and highlighted. */
        private static final String CONTENT_FIELD = "context";

        /** Upper bound on the number of hits fetched for one query. */
        private static final int MAX_HITS = 1000;

        /**
         * Parses {@code key} as a query on the {@code context} field, prints the
         * parsed query, then prints one highlighted fragment per hit.
         *
         * <p>The directory, reader and searcher are opened here and are always
         * closed again before the method returns, including when it throws.
         *
         * @param key the query string, in QueryParser syntax
         * @throws IOException if the index cannot be opened or read
         * @throws ParseException if {@code key} is not a valid query
         * @throws InvalidTokenOffsetsException if the highlighter meets a token
         *         whose offsets lie outside the text being highlighted
         */
        public static void search(String key) throws IOException, ParseException, InvalidTokenOffsetsException {
            Directory directory = FSDirectory.open(new File(INDEX_PATH));
            try {
                IndexReader ireader = IndexReader.open(directory); // read-only=true
                try {
                    IndexSearcher isearcher = new IndexSearcher(ireader);
                    try {
                        printHighlightedHits(isearcher, key);
                    } finally {
                        // Closing the searcher does not close a reader that was passed in,
                        // so the reader and the directory are closed explicitly below.
                        isearcher.close();
                    }
                } finally {
                    ireader.close();
                }
            } finally {
                directory.close();
            }
        }

        /** Runs the query for {@code key} on {@code isearcher} and prints one highlighted line per hit. */
        private static void printHighlightedHits(IndexSearcher isearcher, String key)
                throws IOException, ParseException, InvalidTokenOffsetsException {
            // Pin the version explicitly; LUCENE_CURRENT is deprecated.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            QueryParser parser = new QueryParser(Version.LUCENE_36, CONTENT_FIELD, analyzer);
            Query query = parser.parse(key);
            ScoreDoc[] hits = isearcher.search(query, null, MAX_HITS).scoreDocs;

            Highlighter hl = new Highlighter(new QueryScorer(query));

            System.out.println(query.toString());
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = isearcher.doc(hits[i].doc);
                String text = hitDoc.get(CONTENT_FIELD);
                if (text == null) {
                    // The field is not stored for this document: nothing to highlight.
                    continue;
                }
                TokenStream ts = analyzer.tokenStream(CONTENT_FIELD, new StringReader(text));
                String fragment = hl.getBestFragment(ts, text);
                // getBestFragment returns null when no fragment scores; show the raw text instead.
                System.out.println(fragment != null ? fragment : text);
            }
        }

        /**
         * Runs the demo with a fixed query.
         *
         * @param args ignored
         */
        public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
            search("旧水泥袋");
        }

    }

    索引的建立和测试数据请参考: http://zhwj184.iteye.com/admin/blogs/1522709

     

    输出结果:

    context:旧 context:水 context:泥 context:袋
    采购<B>旧</B>编织<B>袋</B>、<B>旧</B><B>水</B><B>泥</B><B>袋</B>
    <B>水</B><B>泥</B>
    采购<B>水</B><B>泥</B>电阻
    求购<B>水</B><B>泥</B>输送链条和提升机
    1万5 潜<B>水</B>料啤酒手提包 手提<B>袋</B>
    大量采购包装用的编织<B>袋</B>(新的<B>旧</B>的,有无商标皆可)
    铁<B>泥</B> 铁灰
    废<B>旧</B>砂轮
    软陶<B>泥</B>,超轻粘土
    <B>水</B>泵
    手<B>袋</B>
    <B>水</B>锈石 上<B>水</B>石  吸<B>水</B>石
    足浴<B>袋</B>  泡脚<B>袋</B> 异形<B>袋</B>
    手提<B>袋</B>制<B>袋</B>机
    回收库存废<B>旧</B>油墨油漆
    回收库存<B>旧</B>油漆13463048572
    求购废<B>旧</B>油漆油墨13463048572
    求购库存<B>旧</B>化工树脂

    highlighter类的分析
    /**
     * Class used to markup highlighted terms found in the best sections of a
     * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
     * {@link Encoder} and tokenizers.
     */
    public class Highlighter
    {
      public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
    
      private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
    	private Formatter formatter;
    	private Encoder encoder;
    	private Fragmenter textFragmenter=new SimpleFragmenter();
    	private Scorer fragmentScorer=null;
    
    	/**
    	 * Creates a highlighter that formats terms with a {@link SimpleHTMLFormatter}
    	 * built by its no-argument constructor.
    	 *
    	 * @param fragmentScorer scorer passed on to the two-argument constructor
    	 */
    	public Highlighter(Scorer fragmentScorer)
    	{
    		this(new SimpleHTMLFormatter(),fragmentScorer);
    	}
    
    
     	/**
     	 * Creates a highlighter with the given formatter and a new {@link DefaultEncoder}.
     	 *
     	 * @param formatter formatter passed on to the three-argument constructor
     	 * @param fragmentScorer scorer passed on to the three-argument constructor
     	 */
     	public Highlighter(Formatter formatter, Scorer fragmentScorer)
     	{
    		this(formatter,new DefaultEncoder(),fragmentScorer);
    	}
    
    
    	/**
    	 * Creates a highlighter from explicitly supplied collaborators; the other
    	 * constructors delegate here.
    	 *
    	 * @param formatter stored as this highlighter's formatter
    	 * @param encoder stored as this highlighter's encoder
    	 * @param fragmentScorer stored as this highlighter's fragment scorer
    	 */
    	public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
    	{
    		// Plain field stores; none of them depends on another.
    		this.fragmentScorer = fragmentScorer;
    		this.encoder = encoder;
    		this.formatter = formatter;
    	}

    这里有两个扩展点:formatter 和 encoder。formatter 负责对高亮部分的显示逻辑,比如默认实现是直接在命中的词两侧加上 <B></B>;encoder 负责对输入的文本进行编码处理,默认实现(DefaultEncoder)不做任何处理。

     

    下面是 highlighter 自带的另一个 encoder 实现 SimpleHTMLEncoder,它会对文本做 HTML 转义(注意:默认使用的是 DefaultEncoder,而不是它):

    package org.apache.lucene.search.highlight;
    /**
     * Copyright 2005 The Apache Software Foundation
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    /**
     * Simple {@link Encoder} implementation to escape text for HTML output
     *
     */
    public class SimpleHTMLEncoder implements Encoder
    {
    	public SimpleHTMLEncoder()
    	{
    	}
    
    	public String encodeText(String originalText)
    	{
    		return htmlEncode(originalText);
    	}
    	
    	/**
    	 * Encode string into HTML
    	 */
    	public final static String htmlEncode(String plainText) 
    	{
    		if (plainText == null || plainText.length() == 0)
    		{
    			return "";
    		}
    
    		StringBuilder result = new StringBuilder(plainText.length());
    
    		for (int index=0; index<plainText.length(); index++) 
    		{
    			char ch = plainText.charAt(index);
    
    			switch (ch) 
    			{
    			case '"':
    				result.append(""");
    				break;
    
    			case '&':
    				result.append("&");
    				break;
    
    			case '<':
    				result.append("<");
    				break;
    
    			case '>':
    				result.append(">");
    				break;
    
    			default:
    				   if (ch < 128) 
    				   {
    			           result.append(ch);
    			       } 
    				   else 
    			       {
    			           result.append("&#").append((int)ch).append(";");
    			       }
    			}
    		}
    
    		return result.toString();
    	}
    }

    formatter的默认实现

    package org.apache.lucene.search.highlight;
    
    /**
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    /**
     * {@link Formatter} that surrounds each scoring term with an opening and a
     * closing tag.
     */
    public class SimpleHTMLFormatter implements Formatter {

      private static final String DEFAULT_PRE_TAG = "<B>";
      private static final String DEFAULT_POST_TAG = "</B>";

      private String preTag;
      private String postTag;

      /** Default constructor uses HTML: {@code <B>} tags to markup terms. */
      public SimpleHTMLFormatter() {
        this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
      }

      /**
       * @param preTag text written before each highlighted term
       * @param postTag text written after each highlighted term
       */
      public SimpleHTMLFormatter(String preTag, String postTag) {
        this.postTag = postTag;
        this.preTag = preTag;
      }

      /**
       * Returns {@code originalText} unchanged when the token group's total score
       * is zero or negative, and wrapped in the pre/post tags otherwise.
       *
       * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
       */
      public String highlightTerm(String originalText, TokenGroup tokenGroup) {
        final boolean unscored = tokenGroup.getTotalScore() <= 0;
        if (unscored) {
          return originalText;
        }

        // Size the builder up front so the three appends never reallocate.
        final int capacity = preTag.length() + originalText.length() + postTag.length();
        return new StringBuilder(capacity)
            .append(preTag)
            .append(originalText)
            .append(postTag)
            .toString();
      }

    }
    


  • 相关阅读:
    6. Flask请求和响应
    5. Flask模板
    FW:Software Testing
    What is the difference between modified duration, effective duration and duration?
    How to push master to QA branch in GIT
    FTPS Firewall
    Query performance optimization of Vertica
    (Forward)5 Public Speaking Tips That'll Prepare You for Any Interview
    (转)The remote certificate is invalid according to the validation procedure
    Change
  • 原文地址:https://www.cnblogs.com/secbook/p/2655174.html
Copyright © 2011-2022 走看看