zoukankan      html  css  js  c++  java
  • lucene 自定义分词器小程序

    自定义停用词分词器类
    package LuceneUtil;
    
    import java.io.Reader;
    import java.util.Set;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LetterTokenizer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.Version;
    //自定义过滤分词器
    
    public class MyStopAnalyzer extends Analyzer {
    	private Set stops;
    	public MyStopAnalyzer(String [] sws)//形参为 字符串数组
    	{
    		//会自动将字符串数组转换为Set
    		stops=StopFilter.makeStopSet(Version.LUCENE_35, sws,true);
    		//将原有的停用词加入到现在的停用词中
    		stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    	}
    	
    	public MyStopAnalyzer()
    	{
    		stops=StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    	}
    	
    	public TokenStream tokenStream(String FileName,Reader reader)
    	{
    		return new StopFilter(Version.LUCENE_35, 
    				new LowerCaseFilter(Version.LUCENE_35,
    				new LetterTokenizer(Version.LUCENE_35, reader)),  stops);
    		
    	}
    
    }
    

    
    
     
    package LuceneTest;
    
    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
    
    import LuceneUtil.AnalyzerUtils;
    import LuceneUtil.MyStopAnalyzer;
    
    public class TestAnalyzer {
    	// Not read by any active method visible in this file; the commented-out
    	// addNewWord helper declares its own local variable of the same name.
    	static ArrayList<String> list=null;
    	
    	
    	/**
    	 * Program entry point. Runs only the stop-word analyzer comparison (test01);
    	 * the MMSeg demo (test) and the dictionary helper (addNewWord) are not invoked.
    	 *
    	 * @param args command-line arguments; ignored
    	 * @throws IOException declared for the disabled dictionary helper; not thrown by the active code
    	 */
    	public static void main(String[] args) throws IOException {
    		test01();
    	}
    	
    	/** Tokenizes a fixed Chinese sample sentence with MMSegAnalyzer and prints the tokens. */
    	public static void test() {
    		Analyzer mmseg = new MMSegAnalyzer();
    		final String sample = "我是一名大学生,我来自菏*,我现在烟台大学。";
    		AnalyzerUtils.displayToken(sample, mmseg);
    	}
    	
    	
    	/**
    	 * Prints the tokens of one English sample sentence twice: first with the custom
    	 * MyStopAnalyzer (extra stop words "you", "meet" and "***"), then with Lucene's
    	 * built-in StopAnalyzer, so the two outputs can be compared line by line.
    	 */
    	public static void test01() {
    		// Custom analyzer first, stock analyzer second; output order follows array order.
    		Analyzer[] analyzers = {
    			new MyStopAnalyzer(new String[] {"you", "meet", "***"}),
    			new StopAnalyzer(Version.LUCENE_35)
    		};
    		final String sample = " i say :how are You,nice to meet you. ***";
    		for (Analyzer analyzer : analyzers) {
    			AnalyzerUtils.displayToken(sample, analyzer);
    		}
    	}
    
    package LuceneUtil;
    import java.io.IOException;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.omg.CORBA.portable.Streamable;
    /** Helper for printing the tokens an analyzer produces for a piece of text. */
    public class AnalyzerUtils {

    	/**
    	 * Analyzes {@code str} with {@code a} and prints every token to standard output
    	 * as {@code [token]}, all on one line, followed by a line break.
    	 *
    	 * <p>An {@link IOException} raised while reading the stream is not propagated;
    	 * its stack trace is printed instead.
    	 *
    	 * @param str text to analyze
    	 * @param a   analyzer used to tokenize the text
    	 */
    	public static void displayToken(String str, Analyzer a)
    	{
    		try {
    			TokenStream ts = a.tokenStream("cotents", new StringReader(str));
    			try {
    				// The attribute is registered once on the stream; its content is
    				// updated in place on every successful incrementToken() call.
    				CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);

    				// Follow the documented consumer workflow: reset, consume, end, close.
    				ts.reset();
    				while (ts.incrementToken()) {
    					System.out.print("[" + cta + "]");
    				}
    				ts.end();
    				System.out.println();
    			} finally {
    				// Release the stream (and the underlying reader) even if consumption fails.
    				ts.close();
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    	}

    }
    
    
    

    /*public static void addNewWord(String newWord) throws IOException{BufferedWriter bw=new BufferedWriter(new FileWriter("G:\\mmseg\\data\\words-my.dic"));ArrayList<String> list=new ArrayList<String>();list.add(newWord);Iterator<String> iterator=list.iterator();while (iterator.hasNext()){bw.write(iterator.next());bw.flush();bw.newLine();}bw.close();System.out.println("添加成功");}*/}
    
    

    测试结果如下:

    可见 我想和谐掉的 那几个字已被和谐

    第一行为执行和谐后的结果

    第二行为未被和谐的
    [i][say][how][nice]
    [i][say][how][you][nice][meet][you][***]

    不足:目前还不能屏蔽单个汉语词语,汉语只能整句屏蔽,而英语可以按单词屏蔽。原因是 LetterTokenizer 只在非字母字符处切分,两个标点之间连续的汉字会被当作一个词元。

  • 相关阅读:
    HDU 2236 无题Ⅱ
    Golden Tiger Claw(二分图)
    HDU 5969 最大的位或 (思维,贪心)
    HDU 3686 Traffic Real Time Query System (图论)
    SCOI 2016 萌萌哒
    Spring Boot支持控制台Banner定制
    构建第一个Spring Boot程序
    Spring Boot重要模块
    Java fastjson JSON和String互相转换
    BCompare 4 Windows激活方法【试用期30天重置】
  • 原文地址:https://www.cnblogs.com/lixingle/p/3313037.html
Copyright © 2011-2022 走看看