Lucene 中的 Analyzer 语句分析:利用 Lucene 自带的词法分析工具 Analyzer 对句子进行分词分析。
源代码如下:
package com.test;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.Version;

import com.bean.mashupDerscriptionTest;
import com.daoImpl.MashupDaoImpl;
import com.gargoylesoftware.htmlunit.javascript.host.Comment;

/**
 * Small driver that loads mashup records through {@link MashupDaoImpl},
 * tokenizes each record's description with a Lucene {@link StopAnalyzer}
 * and prints the resulting tokens to standard output.
 */
public class KeyWordsTest {

    /**
     * Tokenizes the description of every record returned by
     * {@code MashupDaoImpl.findAllmashupDescripteonTest()} and prints the
     * tokens of each record on one line, followed by a running counter.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        MashupDaoImpl mashupDao = new MashupDaoImpl();
        List<mashupDerscriptionTest> list = mashupDao
                .findAllmashupDescripteonTest();

        // The analyzer does not depend on the record being processed, so it
        // is created once instead of once per loop iteration.
        // Per the original author's note, it splits on whitespace and
        // punctuation and drops stop words such as "is", "are", "in", "on",
        // "the" that carry no real meaning.
        Analyzer analyzer = new StopAnalyzer();

        int i = 1;
        for (mashupDerscriptionTest mashup : list) {
            // When the description is missing (null or empty), fall back to
            // the record's name as the text to analyze.
            String comment = mashup.getComments();
            if (comment == null || comment.equals("")) {
                comment = mashup.getName();
            }
            // StringReader rejects null, so a record with neither a
            // description nor a name is analyzed as empty text.
            if (comment == null) {
                comment = "";
            }

            TokenStream tStream = analyzer.tokenStream("",
                    new StringReader(comment));
            try {
                Token t;
                while ((t = tStream.next()) != null) {
                    // Print every token, separated by a single space.
                    System.out.print(t.termText() + " ");
                }
                System.out.println((i++) + "条描述分词结束!");
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Release the stream even when tokenizing failed.
                // NOTE(review): presumably this also closes the wrapped
                // reader - confirm against the Lucene version in use.
                try {
                    tStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
注:数据来源于数据库中......