zoukankan      html  css  js  c++  java
  • lucene中分词器的用法

    package com.ljq.analyzer;

    import java.io.StringReader;

    import jeasy.analysis.MMAnalyzer;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.junit.Test;

    /**
     * Demonstrates how different Lucene analyzers tokenize English and Chinese text.
     *
     * <p>Each sample text below is paired with one analyzer. {@link #test()} runs one
     * pairing at a time; the token dumps recorded in its comments are the outputs noted
     * by the original author for each pairing.
     */
    public class AnalyzerTest {

        /** English sample for {@link #ensa}. */
        String ensaText
                = "IndexWriter addDocument's a javadoc.txt";
        /** Chinese sample for {@link #ensa2}. */
        String ensa2Text
                = "我们是中国人";
        /** Mixed Chinese/English sample for {@link #zhcjka}. */
        String zhcjkaText
                = "小笑话_总统的房间 Room .txt";
        /** Chinese sample for {@link #zhmn}. */
        String zhmnText
                = "一位绅士到旅游胜地的一家饭店要开个房间";

        /** Single-character tokenization (as labelled by the original author). */
        Analyzer ensa
                = new StandardAnalyzer();
        Analyzer ensa2 = new SimpleAnalyzer();
        /** Bigram (two-character overlapping) tokenization. */
        Analyzer zhcjka
                = new CJKAnalyzer();
        /** Dictionary-based tokenization. */
        Analyzer zhmn = new MMAnalyzer();

        /**
         * Runs the dictionary-based analyzer over its sample text and prints the tokens.
         *
         * <p>The other analyzer/text pairings are kept as commented-out calls, each
         * preceded by the output recorded for it; uncomment a call to try that analyzer.
         *
         * @throws Exception if tokenization fails
         */
        @Test
        public void test() throws Exception {
            // Single-character tokenization; recorded output:
            /*(indexwriter,0,11,type=<ALPHANUM>)
            (adddocument,12,25,type=<APOSTROPHE>)
            (javadoc.txt,28,39,type=<HOST>)
            */
            //analyze(ensa, ensaText);

            // SimpleAnalyzer on Chinese text; recorded output (the whole string is one token):
            //(我们是中国人,0,6)
            //analyze(ensa2, ensa2Text);

            // Bigram tokenization; recorded output:
            /*(小笑,0,2,type=double)
            (笑话,1,3,type=double)
            (_,3,4,type=single)
            (总统,4,6,type=double)
            (统的,5,7,type=double)
            (的房,6,8,type=double)
            (房间,7,9,type=double)
            (room,10,14,type=single)
            (txt,16,19,type=single)
            */
            //analyze(zhcjka, zhcjkaText);

            // Dictionary-based tokenization; recorded output:
            /*(一位,0,2)
            (绅士,2,4)
            (旅游胜地,5,9)
            (一家,10,12)
            (饭店,12,14)
            (要,14,15)
            (开个,15,17)
            (房间,17,19)
            */
            analyze(zhmn, zhmnText);
        }

        /**
         * Tokenizes {@code text} with {@code analyzer} and prints every token to standard
         * output, one per line.
         *
         * <p>The text is analyzed under the field name {@code "content"}. The token stream
         * is always closed before this method returns, including when iteration fails.
         *
         * @param analyzer the analyzer used to split the text into tokens
         * @param text the text to tokenize
         * @throws Exception if reading from or closing the token stream fails
         */
        public void analyze(Analyzer analyzer, String text) throws Exception {
            TokenStream tokenStream
                    = analyzer.tokenStream("content", new StringReader(text));
            try {
                // Hand the previously returned token back in so the stream may reuse it.
                for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
                    System.out.println(token);
                }
            } finally {
                // Release whatever the stream holds even if iteration throws.
                tokenStream.close();
            }
        }
    }
  • 相关阅读:
    Net 下安装、调试的常见问题与错误
    解决在网页框架中,页面的样式表失效的方法
    C#.NET Show Text Info
    C#.NET 部署应用程序之ClickOnce
    VS2005 数据库间转移数据(SSIS)
    C#.NET ClickOnce
    SQL2005 还原备份数据
    C#.NET TreeView.cs
    C#.NET SetComboBox Class
    C#.NET GetLocalMachineInfo.cs
  • 原文地址:https://www.cnblogs.com/linjiqin/p/2001594.html
Copyright © 2011-2022 走看看