zoukankan      html  css  js  c++  java
  • Lucene:如何写一个自定义的分词器

    实现一个自定义分词器

    实现一个简单的英文分词器,主要分为以下几个步骤:

    1.建立自己的Attribute接口MyCharAttribute

     1 /**
     2 * MyCharAttribute
     3 *
     4 * @author limingcheng
     5 * @Date 2019/11/28
     6 */
     7 public interface MyCharAttribute extends Attribute {
     8 void setChars(char[] buffer, int length);
     9 
    10 char[] getChars();
    11 
    12 int getLength();
    13 
    14 String getString();
    15 }

    2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl

      1 /**
      2 * MyCharAttributeImpl
      3 * 2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl
      4 * 注意:MyCharAttributeImpl一定要和MyCharAttribute放在一个包下,否则会出现没有MyCharAttribute的实现类,
      5 * 这是由org.apache.lucene.util.AttributeFactory.DefaultAttributeFactory.findImplClass(Class<? extends Attribute>)这个方法决定的
      6 * @author limingcheng
      7 * @Date 2019/11/28
      8 */
      9 public class MyCharAttributeImpl extends AttributeImpl implements MyCharAttribute {
     10 
     11 private char[] chatTerm = new char[255];
     12 private int length = 0;
     13 
     14 @Override
     15 public void setChars(char[] buffer, int length) {
     16 this.length = length;
     17 if (length > 0) {
     18 System.arraycopy(buffer, 0, this.chatTerm, 0, length);
     19 }
     20 }
     21 
     22 @Override
     23 public char[] getChars() {
     24 return this.chatTerm;
     25 }
     26 
     27 @Override
     28 public int getLength() {
     29 return this.length;
     30 }
     31 
     32 @Override
     33 public String getString() {
     34 if (this.length > 0) {
     35 return new String(this.chatTerm, 0, length);
     36 }
     37 return null;
     38 }
     39 
     40 @Override
     41 public void clear() {
     42 this.length = 0;
     43 }
     44 
     45 @Override
     46 public void reflectWith(AttributeReflector reflector) {
     47 
     48 }
     49 
     50 @Override
     51 public void copyTo(AttributeImpl target) {
     52 
     53 }
     54 }

    3.建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词

     56 /**
     57 * MyWhitespaceTokenizer
     58 *
     59 * 3. 建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词
     60 * @author limingcheng
     61 * @Date 2019/11/28
     62 */
     63 public class MyWhitespaceTokenizer extends Tokenizer {
     64 
     65 // 需要记录的属性
     66 //
     67 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class);
     68 
     69 // 存词的出现位置
     70 
     71 // 存放词的偏移
     72 
     73 //
     74 char[] buffer = new char[255];
     75 int length = 0;
     76 int c;
     77 
     78 @Override
     79 public boolean incrementToken() throws IOException {
     80 // 清除所有的词项属性
     81 clearAttributes();
     82 length = 0;
     83 while (true) {
     84 c = this.input.read();
     85 
     86 if (c == -1) {
     87 if (length > 0) {
     88 // 复制到charAttr
     89 this.charAttr.setChars(buffer, length);
     90 return true;
     91 } else {
     92 return false;
     93 }
     94 }
     95 
     96 if (Character.isWhitespace(c)) {
     97 if (length > 0) {
     98 // 复制到charAttr
     99 this.charAttr.setChars(buffer, length);
    100 return true;
    101 }
    102 }
    103 
    104 buffer[length++] = (char) c;
    105 }
    106 }
    107 
    108 }

    4.建立分项过滤器:把大写字母转换为小写字母

     1 /**
     2 * MyLowerCaseTokenFilter
     3 *
     4 * 4.建立分项过滤器:把大写字母转换为小写字母
     5 * @author limingcheng
     6 * @Date 2019/11/28
     7 */
     8 public class MyLowerCaseTokenFilter extends TokenFilter {
     9 public MyLowerCaseTokenFilter(TokenStream input) {
    10 super(input);
    11 }
    12 
    13 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class);
    14 
    15 @Override
    16 public boolean incrementToken() throws IOException {
    17 boolean res = this.input.incrementToken();
    18 if (res) {
    19 char[] chars = charAttr.getChars();
    20 int length = charAttr.getLength();
    21 if (length > 0) {
    22 for (int i = 0; i < length; i++) {
    23 chars[i] = Character.toLowerCase(chars[i]);
    24 }
    25 }
    26 }
    27 return res;
    28 }
    29 }

    5.建立分析器

     1 /**
     2 * MyWhitespaceAnalyzer
     3 *
     4 * 5. 建立分析器
     5 * @author limingcheng
     6 * @Date 2019/11/28
     7 */
     8 public class MyWhitespaceAnalyzer extends Analyzer {
     9 @Override
    10 protected TokenStreamComponents createComponents(String fieldName) {
    11 Tokenizer source = new MyWhitespaceTokenizer();
    12 TokenStream filter = new MyLowerCaseTokenFilter(source);
    13 return new TokenStreamComponents(source, filter);
    14 }
    15 
    16 public static void main(String[] args) {
    17 
    18 String text = "广州华为有限公司 An AttributeSource contains a list of different AttributeImpls, and methods to add and get them. ";
    19 
    20 try {
    21 Analyzer ana = new MyWhitespaceAnalyzer();
    22 TokenStream ts = ana.tokenStream("aa", text);
    23 MyCharAttribute ca = ts.getAttribute(MyCharAttribute.class);
    24 ts.reset();
    25 while (ts.incrementToken()) {
    26 System.out.print(ca.getString() + "|");
    27 }
    28 ts.end();
    29 ana.close();
    30 System.out.println();
    31 } catch (IOException e) {
    32 e.printStackTrace();
    33 }
    34 
    35 }
    36 }

    一个简单的分词器可以这样实现,但是要实现一个可以对中文分词的分词器就需要算法方面的知识了。

    本文参考:https://www.cnblogs.com/leeSmall/p/8993185.html

  • 相关阅读:
    【C++】基础及引用
    gradle打包分编译环境
    gradle
    MediaPlayer滑动不准的问题
    python初步入门
    音频播放服务
    《深入理解Android2》读书笔记(二)
    缓存(LruCache)机制
    handler机制
    监听网络状态
  • 原文地址:https://www.cnblogs.com/bestlmc/p/12304665.html
Copyright © 2011-2022 走看看