zoukankan      html  css  js  c++  java
  • Lucene中Analyzer语句分析

        本文利用 Lucene 自带的词法分析工具 Analyzer 对句子进行分词分析。

    源代码如下:

     1 package com.test;
     2 
     3 import java.io.IOException;
     4 import java.io.StringReader;
     5 import java.util.List;
     6 
     7 import org.apache.lucene.analysis.Analyzer;
     8 import org.apache.lucene.analysis.SimpleAnalyzer;
     9 import org.apache.lucene.analysis.StopAnalyzer;
    10 import org.apache.lucene.analysis.Token;
    11 import org.apache.lucene.analysis.TokenStream;
    12 import org.apache.lucene.analysis.WhitespaceAnalyzer;
    13 import org.apache.lucene.analysis.standard.StandardAnalyzer;
    14 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
    15 import org.apache.lucene.util.Version;
    16 
    17 import com.bean.mashupDerscriptionTest;
    18 import com.daoImpl.MashupDaoImpl;
    19 import com.gargoylesoftware.htmlunit.javascript.host.Comment;
    20 
    21 public class KeyWordsTest {
    22 
    23     /**
    24      * @param args
    25      */
    26     public static void main(String[] args) {
    27         MashupDaoImpl mashupDao = new MashupDaoImpl();
    28         List<mashupDerscriptionTest> list = mashupDao
    29                 .findAllmashupDescripteonTest();
    30         int i = 1;
    31         String comment = null;
    32         for (mashupDerscriptionTest mashup : list) {
    33             // 描述为空去名字作为描述
    34             if (mashup.getComments().equals("")) {
    35                 comment = mashup.getName();
    36             } else {
    37                 comment = mashup.getComments();
    38             }
    39 //            System.out.println(comment);
    40             //对读取的描述利用Lucene中的Analyzer进行句子分析产生
    41             //空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on,the等无实际意义的词  
    42             StringReader reader = new StringReader(comment);
    43             Analyzer analyzer = new StopAnalyzer();
    44             TokenStream tStream = analyzer.tokenStream("", reader);
    45             Token t;
    46             try {
    47                 while ((t = tStream.next()) != null) {
    48                     //对每个单词采用
    49                     System.out.print(t.termText()+" ");
    50                 }
    51                 System.out.println((i++)+"条描述分词结束!");
    52             } catch (IOException e) {
    53                 e.printStackTrace();
    54             }    
    55         }
    56     }
    57 }

      注:待分词的描述数据来自数据库,由 MashupDaoImpl 读取。

  • 相关阅读:
    「USACO 2020.12 Platinum」Sleeping Cows
    拉格朗日反演 (Lagrange Inversion)
    「ROI 2016 Day1」人烟之山
    「ROI 2016 Day2」二指禅
    ZJOI2016 大森林
    CF1119H Triple
    [ZJOI2016]线段树
    CF1237F
    NOI2018 情报中心
    CF1270I Xor on Figures
  • 原文地址:https://www.cnblogs.com/rememberme/p/Lucene_Analyzer.html
Copyright © 2011-2022 走看看