zoukankan      html  css  js  c++  java
  • Lucene实现自己的英文空格小写分词器

    看一下继承图,Tokenizer和TokenFilter都是继承于TokenStream,TokenStream继承了AttributeSource

    package com.lucene.demo.analizer;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.util.Attribute;
    import org.apache.lucene.util.AttributeImpl;
    import org.apache.lucene.util.AttributeReflector;
    
    import java.io.IOException;
    
    public class SansamAnalyzer extends Analyzer{
        /**
         * Builds the analysis chain for a field: a {@link MyTokenizer} as the token source,
         * wrapped by a {@link MyLowerCaseTokenFilter}.
         *
         * @param fieldName name of the field being analyzed; not used when building the chain
         * @return components whose source is the tokenizer and whose result is the filter
         */
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Decorator pattern: the filter wraps the tokenizer and post-processes each term it
            // produces. Further filters can be chained the same way by wrapping the previous one.
            final Tokenizer source = new MyTokenizer();
            final TokenStream result = new MyLowerCaseTokenFilter(source);
            return new TokenStreamComponents(source, result);
        }
    
        /**
         * Tokenizer that splits its input on whitespace (as defined by
         * {@link Character#isWhitespace(int)}) and publishes each term through
         * {@link MyAttribute}.
         *
         * <p>Runs of whitespace are skipped, so a token never contains whitespace. A run of
         * non-whitespace chars longer than {@link #MAX_TOKEN_LENGTH} is emitted as several
         * consecutive tokens instead of overflowing the buffer.
         */
        public static class MyTokenizer extends Tokenizer{
            /** Maximum number of chars in a single token; also the size of the scratch buffer. */
            private static final int MAX_TOKEN_LENGTH = 255;

            // addAttribute is inherited from AttributeSource. According to the original author's
            // notes, AttributeSource keeps maps from attribute interface / impl class to the
            // AttributeImpl instance, so every stream in the chain that asks for MyAttribute
            // shares the same instance.
            private final MyAttribute attribute = this.addAttribute(MyAttribute.class);

            /** Scratch buffer collecting the chars of the token currently being read. */
            private final char[] buffer = new char[MAX_TOKEN_LENGTH];

            /**
             * Reads chars from the input until the next token is complete.
             *
             * @return {@code true} if a token was stored in the attribute, {@code false} when
             *         the input is exhausted and no chars are pending
             * @throws IOException if reading from the underlying input fails
             */
            @Override
            public boolean incrementToken() throws IOException {
                clearAttributes();
                int length = 0;
                while (true) {
                    final int c = this.input.read();
                    final boolean endOfInput = (c == -1);

                    if (endOfInput || Character.isWhitespace(c)) {
                        if (length > 0) {
                            // A delimiter (or end of input) terminates the pending token.
                            this.attribute.setChar(buffer, length);
                            return true;
                        }
                        if (endOfInput) {
                            return false;
                        }
                        // Leading or repeated whitespace: nothing collected yet, so skip it
                        // rather than storing it as part of the next token.
                        continue;
                    }

                    buffer[length++] = (char) c;
                    if (length == buffer.length) {
                        // Buffer is full: emit what we have; the rest of the run becomes the
                        // next token on the following call.
                        this.attribute.setChar(buffer, length);
                        return true;
                    }
                }
            }
        }
    
        /**
         * Token filter that lowercases, in place, the term held in {@link MyAttribute} for every
         * token produced by the wrapped stream.
         */
        public static class MyLowerCaseTokenFilter extends TokenFilter{
            // Same attribute type the upstream tokenizer registers, so this filter reads and
            // rewrites the term the tokenizer stored.
            MyAttribute attribute = this.addAttribute(MyAttribute.class);

            /**
             * @param s the upstream token stream to wrap
             */
            public MyLowerCaseTokenFilter(TokenStream s){
                super(s);
            }

            /**
             * Advances the wrapped stream by one token and lowercases that token's chars.
             *
             * @return whatever the wrapped stream's {@code incrementToken()} returned
             * @throws IOException if the wrapped stream fails
             */
            @Override
            public boolean incrementToken() throws IOException {
                if (!this.input.incrementToken()) {
                    return false;
                }

                // getChar() hands back the attribute's own array, so writing into it updates
                // the term that downstream consumers will see.
                final char[] term = this.attribute.getChar();
                final int end = this.attribute.getLength();
                for (int pos = 0; pos < end; pos++) {
                    term[pos] = Character.toLowerCase(term[pos]);
                }
                return true;
            }
        }
    
    
        /**
         * 自定义Attribute属性接口 继承Attribute
         */
        public static interface MyAttribute extends Attribute {
    
            void setChar(char [] c, int length);
    
            char [] getChar();
    
            int getLength();
    
            String getString();
        }
    
        /**
         * Implementation of {@link MyAttribute}. Attributes must be written as an interface plus
         * an {@code Impl} class that extends {@link AttributeImpl}.
         *
         * <p>The term is kept in a reusable char buffer together with the count of valid chars.
         */
        public static class MyAttributeImpl extends AttributeImpl implements MyAttribute {

            /** Initial capacity of the term buffer, in chars. */
            private static final int INITIAL_CAPACITY = 255;

            /** Term buffer; only the first {@code length} chars are valid. */
            private char[] term = new char[INITIAL_CAPACITY];
            private int length = 0;

            /**
             * Copies the first {@code length} chars of {@code c} into the term buffer, growing
             * the buffer when the term does not fit.
             *
             * @param c      source chars
             * @param length number of chars to copy from the start of {@code c}
             * @throws IndexOutOfBoundsException if {@code length} is negative or exceeds
             *         {@code c.length}; the stored term is left unchanged in that case
             */
            @Override
            public void setChar(char[] c, int length) {
                if (length > term.length) {
                    // Old contents are about to be overwritten, so there is nothing to preserve.
                    term = new char[length];
                }
                System.arraycopy(c, 0, term, 0, length);
                // Only commit the new length once the copy has succeeded.
                this.length = length;
            }

            /**
             * @return the internal term buffer (not a copy); it is usually longer than
             *         {@link #getLength()}
             */
            @Override
            public char[] getChar() {
                return term;
            }

            /**
             * @return number of valid chars in the term buffer
             */
            @Override
            public int getLength() {
                return length;
            }

            /**
             * @return the term as a string, or {@code null} when the term is empty
             */
            @Override
            public String getString() {
                if (length > 0) {
                    // Must be bounded by length: the buffer is larger than the term, so
                    // new String(term) would also include the unused tail.
                    return new String(term, 0, length);
                }
                return null;
            }

            /**
             * Resets the term to empty. The buffer is kept for reuse; chars beyond
             * {@code length} are never exposed as part of the term.
             */
            @Override
            public void clear() {
                this.length = 0;
            }

            /**
             * Reports the current term under the key {@code "term"}.
             */
            @Override
            public void reflectWith(AttributeReflector reflector) {
                reflector.reflect(MyAttribute.class, "term", getString());
            }

            /**
             * Copies the current term into {@code target}.
             *
             * @param target attribute to receive the term; must implement {@link MyAttribute}
             */
            @Override
            public void copyTo(AttributeImpl target) {
                ((MyAttribute) target).setChar(term, length);
            }

            /**
             * Returns a copy with its own term buffer, so that a captured copy is not
             * overwritten when this instance's buffer is reused for the next token.
             */
            @Override
            public MyAttributeImpl clone() {
                final MyAttributeImpl copy = (MyAttributeImpl) super.clone();
                copy.term = term.clone();
                return copy;
            }
        }
    
        /**
         * Demo entry point: analyzes a fixed sample sentence and prints every resulting term
         * followed by {@code " | "}.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            final String text = "Hello World A b C";
            // tokenStream() first obtains the TokenStreamComponents (see createComponents) and
            // hands back its result stream, i.e. the MyLowerCaseTokenFilter wrapping the tokenizer.
            try (SansamAnalyzer analyzer = new SansamAnalyzer();
                 TokenStream stream = analyzer.tokenStream("title", text)) {
                final MyAttribute term = stream.getAttribute(MyAttribute.class);
                stream.reset();
                for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
                    System.out.print(term.getString() + " | ");
                }
                stream.end();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    
    
  • 相关阅读:
    fedora上部署ASP.NET——(卡带式电脑跑.NET WEB服务器)
    SQL Server 请求失败或服务未及时响应。有关详细信息,请参见事件日志或其它适合的错误日志
    8086CPU的出栈(pop)和入栈(push) 都是以字为单位进行的
    FTP 服务搭建后不能访问问题解决
    指定的 DSN 中,驱动程序和应用程序之间的体系结构不匹配
    Linux 安装MongoDB 并设置防火墙,使用远程客户端访问
    svn Please execute the 'Cleanup' command. 问题解决
    .net 操作MongoDB 基础
    oracle 使用绑定变量极大的提升性能
    尝试加载 Oracle 客户端库时引发 BadImageFormatException。如果在安装 32 位 Oracle 客户端组件的情况下以 64 位模式运行,将出现此问题。
  • 原文地址:https://www.cnblogs.com/sansamh/p/9030783.html
Copyright © 2011-2022 走看看