实现一个自定义分词器
实现一个简单的英文分词器,主要分为以下几个步骤:
1.建立自己的Attribute接口MyCharAttribute
1 /** 2 * MyCharAttribute 3 * 4 * @author limingcheng 5 * @Date 2019/11/28 6 */ 7 public interface MyCharAttribute extends Attribute { 8 void setChars(char[] buffer, int length); 9 10 char[] getChars(); 11 12 int getLength(); 13 14 String getString(); 15 }
2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl
1 /** 2 * MyCharAttributeImpl 3 * 2.建立自定义attribute接口MyCharAttribute的实现类MyCharAttributeImpl 4 * 注意:MyCharAttributeImpl一定要和MyCharAttribute放在一个包下,否则会出现没有MyCharAttribute的实现类, 5 * 这是由org.apache.lucene.util.AttributeFactory.DefaultAttributeFactory.findImplClass(Class<? extends Attribute>)这个方法决定的 6 * @author limingcheng 7 * @Date 2019/11/28 8 */ 9 public class MyCharAttributeImpl extends AttributeImpl implements MyCharAttribute { 10 11 private char[] chatTerm = new char[255]; 12 private int length = 0; 13 14 @Override 15 public void setChars(char[] buffer, int length) { 16 this.length = length; 17 if (length > 0) { 18 System.arraycopy(buffer, 0, this.chatTerm, 0, length); 19 } 20 } 21 22 @Override 23 public char[] getChars() { 24 return this.chatTerm; 25 } 26 27 @Override 28 public int getLength() { 29 return this.length; 30 } 31 32 @Override 33 public String getString() { 34 if (this.length > 0) { 35 return new String(this.chatTerm, 0, length); 36 } 37 return null; 38 } 39 40 @Override 41 public void clear() { 42 this.length = 0; 43 } 44 45 @Override 46 public void reflectWith(AttributeReflector reflector) { 47 48 } 49 50 @Override 51 public void copyTo(AttributeImpl target) { 52 53 } 54 } 55 3.建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词 56 /** 57 * MyWhitespaceTokenizer 58 * 59 * 3. 
建立分词器MyWhitespaceTokenizer:实现对英文按空白字符进行分词 60 * @author limingcheng 61 * @Date 2019/11/28 62 */ 63 public class MyWhitespaceTokenizer extends Tokenizer { 64 65 // 需要记录的属性 66 // 词 67 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class); 68 69 // 存词的出现位置 70 71 // 存放词的偏移 72 73 // 74 char[] buffer = new char[255]; 75 int length = 0; 76 int c; 77 78 @Override 79 public boolean incrementToken() throws IOException { 80 // 清除所有的词项属性 81 clearAttributes(); 82 length = 0; 83 while (true) { 84 c = this.input.read(); 85 86 if (c == -1) { 87 if (length > 0) { 88 // 复制到charAttr 89 this.charAttr.setChars(buffer, length); 90 return true; 91 } else { 92 return false; 93 } 94 } 95 96 if (Character.isWhitespace(c)) { 97 if (length > 0) { 98 // 复制到charAttr 99 this.charAttr.setChars(buffer, length); 100 return true; 101 } 102 } 103 104 buffer[length++] = (char) c; 105 } 106 } 107 108 }
4.建立词项过滤器:把大写字母转换为小写字母
1 /** 2 * MyLowerCaseTokenFilter 3 * 4 * 4.建立分项过滤器:把大写字母转换为小写字母 5 * @author limingcheng 6 * @Date 2019/11/28 7 */ 8 public class MyLowerCaseTokenFilter extends TokenFilter { 9 public MyLowerCaseTokenFilter(TokenStream input) { 10 super(input); 11 } 12 13 MyCharAttribute charAttr = this.addAttribute(MyCharAttribute.class); 14 15 @Override 16 public boolean incrementToken() throws IOException { 17 boolean res = this.input.incrementToken(); 18 if (res) { 19 char[] chars = charAttr.getChars(); 20 int length = charAttr.getLength(); 21 if (length > 0) { 22 for (int i = 0; i < length; i++) { 23 chars[i] = Character.toLowerCase(chars[i]); 24 } 25 } 26 } 27 return res; 28 } 29 }
5.建立分析器
1 /** 2 * MyWhitespaceAnalyzer 3 * 4 * 5. 建立分析器 5 * @author limingcheng 6 * @Date 2019/11/28 7 */ 8 public class MyWhitespaceAnalyzer extends Analyzer { 9 @Override 10 protected TokenStreamComponents createComponents(String fieldName) { 11 Tokenizer source = new MyWhitespaceTokenizer(); 12 TokenStream filter = new MyLowerCaseTokenFilter(source); 13 return new TokenStreamComponents(source, filter); 14 } 15 16 public static void main(String[] args) { 17 18 String text = "广州华为有限公司 An AttributeSource contains a list of different AttributeImpls, and methods to add and get them. "; 19 20 try { 21 Analyzer ana = new MyWhitespaceAnalyzer(); 22 TokenStream ts = ana.tokenStream("aa", text); 23 MyCharAttribute ca = ts.getAttribute(MyCharAttribute.class); 24 ts.reset(); 25 while (ts.incrementToken()) { 26 System.out.print(ca.getString() + "|"); 27 } 28 ts.end(); 29 ana.close(); 30 System.out.println(); 31 } catch (IOException e) { 32 e.printStackTrace(); 33 } 34 35 } 36 }
一个简单的分词器可以这样实现,但是要实现一个可以对中文分词的分词器就需要算法方面的知识了。
本文参考:https://www.cnblogs.com/leeSmall/p/8993185.html