zoukankan      html  css  js  c++  java
  • Lucene实现自己的英文空格小写分词器

    看一下继承图,Tokenizer和TokenFilter都是继承于TokenStream,TokenStream继承了AttributeSource

    package com.lucene.demo.analizer;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.util.Attribute;
    import org.apache.lucene.util.AttributeImpl;
    import org.apache.lucene.util.AttributeReflector;
    
    import java.io.IOException;
    
    public class SansamAnalyzer extends Analyzer{
        /**
         * Builds the analysis chain used for a field: a {@link MyTokenizer} whose
         * output is wrapped by a {@link MyLowerCaseTokenFilter}.
         *
         * @param fieldName name of the field being analyzed; not consulted here
         * @return components pairing the tokenizer (source) with the filter (end of the chain)
         */
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Decorator pattern: the filter wraps the tokenizer and post-processes each
            // term it produces; more filters can be chained around it the same way.
            final MyTokenizer source = new MyTokenizer();
            final MyLowerCaseTokenFilter lowerCased = new MyLowerCaseTokenFilter(source);
            return new TokenStreamComponents(source, lowerCased);
        }
    
        /**
         * Tokenizer that splits the input on whitespace and publishes each term
         * through {@link MyAttribute}.
         *
         * <p>Whitespace only separates terms: leading, trailing and consecutive
         * whitespace is skipped and never becomes part of a term. A run of
         * non-whitespace characters longer than the internal buffer is emitted in
         * buffer-sized pieces instead of overrunning the buffer.
         */
        public static class MyTokenizer extends Tokenizer{
            // addAttribute is inherited from AttributeSource, which keeps the
            // registered attributes in maps so that instances can be reused:
            //private final Map<Class<? extends Attribute>, AttributeImpl> attributes;
            //private final Map<Class<? extends AttributeImpl>, AttributeImpl> attributeImpls;

            MyAttribute attribute = this.addAttribute(MyAttribute.class);

            // Scratch space for the term currently being assembled.
            char[] buffer = new char[255];
            // Number of valid chars in buffer for the current term.
            int length = 0;
            // Last value returned by input.read(); -1 means end of input.
            int c;

            /**
             * Reads the next whitespace-delimited term from the input and stores
             * it in the {@link MyAttribute}.
             *
             * @return {@code true} if a term was produced, {@code false} once the
             *         input is exhausted
             * @throws IOException if reading the underlying input fails
             */
            @Override
            public boolean incrementToken() throws IOException {
                clearAttributes();
                length = 0;
                while (true) {
                    c = this.input.read();
                    if (c == -1) {
                        // End of input: flush a pending term, otherwise we are done.
                        if (length == 0) {
                            return false;
                        }
                        this.attribute.setChar(buffer, length);
                        return true;
                    }

                    if (Character.isWhitespace(c)) {
                        if (length > 0) {
                            this.attribute.setChar(buffer, length);
                            return true;
                        }
                        // No term started yet: this is separator whitespace, so
                        // skip it rather than storing it as part of the next term.
                        continue;
                    }

                    buffer[length++] = (char) c;
                    if (length == buffer.length) {
                        // Buffer is full: emit what we have; the remainder of the
                        // run is picked up by the next call.
                        this.attribute.setChar(buffer, length);
                        return true;
                    }
                }
            }
        }
    
        /**
         * Token filter that lower-cases the characters of every term delivered by
         * the wrapped stream, writing the result back into the array returned by
         * {@link MyAttribute#getChar()}.
         */
        public static class MyLowerCaseTokenFilter extends TokenFilter{
            MyAttribute attribute = this.addAttribute(MyAttribute.class);

            /**
             * @param s the stream whose terms are lower-cased
             */
            public MyLowerCaseTokenFilter(TokenStream s){
                super(s);
            }

            /**
             * Advances the wrapped stream by one term and lower-cases that term.
             *
             * @return whatever the wrapped stream returned: {@code true} while a
             *         term is available, {@code false} at the end
             * @throws IOException if the wrapped stream fails
             */
            @Override
            public boolean incrementToken() throws IOException {
                if (!this.input.incrementToken()) {
                    return false;
                }

                // Only the first getLength() chars belong to the current term.
                final char[] termChars = this.attribute.getChar();
                final int end = this.attribute.getLength();
                int pos = 0;
                while (pos < end) {
                    termChars[pos] = Character.toLowerCase(termChars[pos]);
                    pos++;
                }
                return true;
            }
        }
    
    
        /**
         * 自定义Attribute属性接口 继承Attribute
         */
        public static interface MyAttribute extends Attribute {
    
            void setChar(char [] c, int length);
    
            char [] getChar();
    
            int getLength();
    
            String getString();
        }
    
        /**
         * Implementation of {@link MyAttribute}. Custom attributes have to be
         * declared as an interface plus an implementation class that extends
         * {@link AttributeImpl}.
         *
         * <p>The term is kept in a reusable char array of which only the first
         * {@code length} entries are valid.
         */
        public static class MyAttributeImpl extends AttributeImpl implements MyAttribute {

            // Backing storage for the term; grown on demand, never shrunk.
            char [] term = new char[255];
            // Number of valid chars at the start of term.
            int length = 0;

            /**
             * Copies the first {@code length} chars of {@code c} into this attribute.
             *
             * @param c      source characters
             * @param length number of chars to copy from the start of {@code c}
             */
            @Override
            public void setChar(char[] c, int length) {
                if (length > term.length) {
                    // Old contents are about to be overwritten, so no need to keep them.
                    term = new char[length];
                }
                System.arraycopy(c, 0, term, 0, length);
                // Assigned last so a failed copy does not leave a bogus length behind.
                this.length = length;
            }

            /**
             * @return the backing array; entries at index {@link #getLength()} and
             *         beyond are stale and must be ignored
             */
            @Override
            public char[] getChar() {
                return term;
            }

            /**
             * @return number of valid chars in the backing array
             */
            @Override
            public int getLength() {
                return length;
            }

            /**
             * @return the term as a string, or {@code null} when the term is empty
             */
            @Override
            public String getString() {
                if (length > 0) {
                    // Build from the valid prefix only: the backing array is larger
                    // than the term (255 chars by default).
                    return new String(term, 0, length);
                }
                return null;
            }

            /**
             * Resets the attribute to the empty term. The backing array is kept so
             * no allocation happens per token.
             */
            @Override
            public void clear() {
                this.length = 0;
            }

            /**
             * Reports the current term under the key {@code "term"}.
             *
             * @param reflector receiver of the attribute's key/value pairs
             */
            @Override
            public void reflectWith(AttributeReflector reflector) {
                reflector.reflect(MyAttribute.class, "term", getString());
            }

            /**
             * Copies the current term into {@code target}.
             *
             * @param target attribute to receive the term; must implement {@link MyAttribute}
             */
            @Override
            public void copyTo(AttributeImpl target) {
                ((MyAttribute) target).setChar(term, length);
            }

            /**
             * Returns a copy with its own backing array, so later changes to this
             * attribute do not show through in the copy.
             */
            @Override
            public MyAttributeImpl clone() {
                MyAttributeImpl copy = (MyAttributeImpl) super.clone();
                copy.term = term.clone();
                return copy;
            }
        }
    
        /**
         * Demo: runs the analyzer over a fixed sentence and prints every term
         * followed by {@code " | "}.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            final String sample = "Hello World A b C";
            // Original author's note: tokenStream() first obtains the
            // TokenStreamComponents, so the stream handed back is the
            // MyLowerCaseTokenFilter; see its constructor and this method's return value.
            try (SansamAnalyzer analyzer = new SansamAnalyzer();
                 TokenStream tokens = analyzer.tokenStream("title", sample)) {
                final MyAttribute term = tokens.getAttribute(MyAttribute.class);
                tokens.reset();
                for (boolean more = tokens.incrementToken(); more; more = tokens.incrementToken()) {
                    System.out.print(term.getString() + " | ");
                }
                tokens.end();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    
    
  • 相关阅读:
    mongodb笔记一
    mysql的备份和恢复
    explain的type列
    Debian下apache2设置并发
    nginx基本调优
    c语言struct
    Centos 安装nginx + php + mysql
    Debian下系统启动时执行脚本
    centos5.5服务器基本篇
    分治法
  • 原文地址:https://www.cnblogs.com/sansamh/p/9030783.html
Copyright © 2011-2022 走看看