zoukankan      html  css  js  c++  java
  • Lucene 自定义分词器:将文本切分成单个字符

    问题描述:将一句话拆分成单个字符,并且去掉空格。


    package com.mylucene;
    
    import java.io.IOException;
    import java.io.Reader;
    
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.util.AttributeSource.AttributeFactory;
    
    /**
     * Tokenizer that emits every digit or letter of the input as its own
     * single-character token.
     *
     * <p>A character becomes a token when {@link Character#getType(char)} reports
     * {@code DECIMAL_DIGIT_NUMBER}, {@code LOWERCASE_LETTER},
     * {@code UPPERCASE_LETTER} or {@code OTHER_LETTER}. Every other character
     * (whitespace, punctuation, surrogates, ...) is skipped but still counted
     * towards the offsets. Token text is lower-cased with
     * {@link Character#toLowerCase(char)}.
     */
    public class SpiltChar extends Tokenizer {

        /** Number of chars requested from the reader per {@code read} call. */
        private static final int IO_BUFFER_SIZE = 1024;

        /** Chunk of input most recently read from {@code input}. */
        private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
        /** Holds the single (lower-cased) char of the token being emitted. */
        private final char[] termBuffer = new char[1];

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        /** Count of chars consumed from the reader so far. */
        private int offset = 0;
        /** Index of the next unread char in {@link #ioBuffer}. */
        private int bufferIndex = 0;
        /** Number of valid chars in {@link #ioBuffer}; {@code -1} once the reader is exhausted. */
        private int dataLen = 0;

        /**
         * Creates a tokenizer that reads from {@code input} and builds its
         * attributes with {@code factory}.
         *
         * @param factory attribute factory passed to the superclass
         * @param input   source of the characters to tokenize
         */
        public SpiltChar(AttributeFactory factory, Reader input) {
            super(factory, input);
        }

        /**
         * Creates a tokenizer that reads from {@code input}.
         *
         * @param input source of the characters to tokenize
         */
        public SpiltChar(Reader input) {
            super(input);
        }

        /**
         * Reports whether {@code c} is emitted as a token: decimal digits and
         * upper-case, lower-case and "other" letters qualify.
         */
        private static boolean isTokenChar(char c) {
            switch (Character.getType(c)) {
                case Character.DECIMAL_DIGIT_NUMBER:
                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                case Character.OTHER_LETTER:
                    return true;
                default:
                    return false;
            }
        }

        /**
         * Advances to the next single-character token.
         *
         * @return {@code true} if a token was produced, {@code false} once the
         *         reader has no more characters
         * @throws IOException if reading from the underlying reader fails
         */
        @Override
        public boolean incrementToken() throws IOException {
            clearAttributes();

            while (true) {
                if (bufferIndex >= dataLen) {
                    // Current chunk is used up (or nothing was read yet): fetch the next one.
                    dataLen = input.read(ioBuffer);
                    bufferIndex = 0;
                }
                if (dataLen == -1) {
                    return false; // end of input, nothing pending
                }

                final char c = ioBuffer[bufferIndex++];
                final int start = offset++; // skipped chars advance the offset too

                if (isTokenChar(c)) {
                    termBuffer[0] = Character.toLowerCase(c);
                    termAtt.copyBuffer(termBuffer, 0, 1);
                    offsetAtt.setOffset(correctOffset(start), correctOffset(start + 1));
                    return true;
                }
            }
        }

        /**
         * Sets the final offset to the total number of chars consumed.
         *
         * @throws IOException declared because {@code super.end()} declares it
         */
        @Override
        public final void end() throws IOException {
            // Lucene's TokenStream contract asks overriders to always call super.end().
            super.end();
            final int finalOffset = correctOffset(offset);
            offsetAtt.setOffset(finalOffset, finalOffset);
        }

        /**
         * Rewinds the internal read position and offset counters so the
         * tokenizer can be reused.
         *
         * @throws IOException if the superclass reset fails
         */
        @Override
        public void reset() throws IOException {
            super.reset();
            offset = 0;
            bufferIndex = 0;
            dataLen = 0;
        }

    }
    

    定义使用上述分词器的分析器(Analyzer)类:

    package com.mylucene;
    
    import java.io.Reader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    
    /**
     * Analyzer that splits text into single-character tokens by using
     * {@link SpiltChar} as its only component.
     */
    public class SpiltCharAnalyzer extends Analyzer {

        /**
         * Builds the analysis chain: a lone {@link SpiltChar} tokenizer with no
         * token filters.
         *
         * @param arg0 field name; not used by this analyzer
         * @param arg1 reader supplying the text to tokenize
         * @return components wrapping a new {@link SpiltChar} over {@code arg1}
         */
        @Override
        protected TokenStreamComponents createComponents(String arg0, Reader arg1) {
            return new TokenStreamComponents(new SpiltChar(arg1));
        }

    }
    
    
    

  • 相关阅读:
    漫游Kafka介绍章节简介
    poj 2309 BST 使用树阵lowbit
    华为-on练习--小写字符数的统计显示
    OpenMp高速分拣
    eclipse 于 Tomcat于 热部署 project
    2015第49周二
    2015第49周一
    2015第48周六
    2015第48周五
    2015第48周四
  • 原文地址:https://www.cnblogs.com/yxwkf/p/5207541.html
Copyright © 2011-2022 走看看