zoukankan      html  css  js  c++  java
  • lucene中分词器的用法

    package com.ljq.analyzer;

    import java.io.StringReader;

    import jeasy.analysis.MMAnalyzer;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.junit.Test;

    /**
     * Demonstrates how different Lucene analyzers tokenize English and Chinese text.
     *
     * <p>Each analyzer is paired with a sample text; {@link #test()} runs one pairing
     * and prints the resulting tokens. The block comments inside {@link #test()} record
     * the output the original author observed for each pairing.
     */
    public class AnalyzerTest {

        // Sample texts, one per analyzer below.
        String ensaText = "IndexWriter addDocument's a javadoc.txt";
        String ensa2Text = "我们是中国人";
        String zhcjkaText = "小笑话_总统的房间 Room .txt";
        String zhmnText = "一位绅士到旅游胜地的一家饭店要开个房间";

        Analyzer ensa = new StandardAnalyzer(); // single-character segmentation
        Analyzer ensa2 = new SimpleAnalyzer();
        Analyzer zhcjka = new CJKAnalyzer(); // bigram (two-character) segmentation
        Analyzer zhmn = new MMAnalyzer(); // dictionary-based segmentation

        /**
         * Runs the dictionary-based analyzer over its sample text and prints the tokens.
         *
         * <p>The other analyzer/text pairings are left commented out next to their
         * recorded output so they can be enabled one at a time.
         *
         * @throws Exception if analysis fails
         */
        @Test
        public void test() throws Exception {
            // Single-character segmentation
            /*(indexwriter,0,11,type=<ALPHANUM>)
            (adddocument,12,25,type=<APOSTROPHE>)
            (javadoc.txt,28,39,type=<HOST>)
            */
            //analyze(ensa, ensaText);

            //(我们是中国人,0,6)
            //analyze(ensa2, ensa2Text);

            // Bigram segmentation
            /*(小笑,0,2,type=double)
            (笑话,1,3,type=double)
            (_,3,4,type=single)
            (总统,4,6,type=double)
            (统的,5,7,type=double)
            (的房,6,8,type=double)
            (房间,7,9,type=double)
            (room,10,14,type=single)
            (txt,16,19,type=single)
            */
            //analyze(zhcjka, zhcjkaText);

            // Dictionary-based segmentation
            /*(一位,0,2)
            (绅士,2,4)
            (旅游胜地,5,9)
            (一家,10,12)
            (饭店,12,14)
            (要,14,15)
            (开个,15,17)
            (房间,17,19)
            */
            analyze(zhmn, zhmnText);
        }

        /**
         * Tokenizes {@code text} with {@code analyzer} and prints every token to
         * standard output, one per line.
         *
         * @param analyzer the analyzer used to tokenize the text
         * @param text the text to tokenize
         * @throws Exception if creating, reading or closing the token stream fails
         */
        public void analyze(Analyzer analyzer, String text) throws Exception {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
            try {
                // A single Token instance is handed back to next() on every call so the
                // stream may reuse it instead of allocating a new one per token.
                Token token = new Token();
                while ((token = tokenStream.next(token)) != null) {
                    System.out.println(token);
                }
            } finally {
                // Release the stream (and the reader it wraps) even if tokenization fails.
                tokenStream.close();
            }
        }
    }
  • 相关阅读:
    zookeeper使用场景
    zookeeper安装配置
    hadoop 远程调试
    deep learning笔记
    Sentiment Analysis(1)-Dependency Tree-based Sentiment Classification using CRFs with Hidden Variables
    PRML阅读笔记 introduction
    Python 学习笔记(2)
    python nltk 学习笔记(5) Learning to Classify Text
    python nltk 学习笔记(4) Writing Structured Programs
    python nltk 学习笔记(3) processing raw text
  • 原文地址:https://www.cnblogs.com/linjiqin/p/2001594.html
Copyright © 2011-2022 走看看