zoukankan      html  css  js  c++  java
  • IK 用java 代码实现分词

    需要导入 IK 分词器对应的 jar 包:


    IKAnalyzer2012.jar

    lucene-core-4.10.jar

    /**
     * Demo entry point: tokenizes a fixed Chinese sample sentence with
     * {@code IKAnalyzer} and prints every term on a single line, each term
     * followed by {@code |}.
     *
     * @param args unused
     * @throws IOException if the token stream fails while being consumed
     */
    public static void main(String[] args) throws IOException {
        // Alternative sample: String text="基于java语言开发的轻量级的中文分词工具包";
        String text = "宋祖英语培训班、周渝民政服务中心、容祖儿童医院、吴奇隆胸医院、苏永康复中心、梁朝伟哥专卖、陈冠希望小学、吴彦祖传中医坊、林书豪华酒店";

        // try-with-resources releases the stream, reader and analyzer even when
        // tokenization throws. It needs Java 7, which Lucene 4.10 already requires.
        // NOTE(review): the `true` flag is IK Analyzer's "smart" segmentation switch
        // per its documentation - confirm against the jar in use.
        try (Analyzer anal = new IKAnalyzer(true);
             StringReader reader = new StringReader(text);
             TokenStream ts = anal.tokenStream("", reader)) {
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            // Lucene consumer workflow: reset -> incrementToken* -> end -> close.
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print(term.toString() + "|");
            }
            ts.end();
        }
        System.out.println();
    }

    分词结果:

    宋祖英|语|培训班|周渝民|政|服务中心|容祖儿|童|医院|吴奇隆|胸|医院|苏永康|复|中心|梁朝伟|哥|专卖|陈冠希|望|小学|吴彦祖|传|中医|坊|林|书|豪华酒店|

             // Sample input mixing Chinese text, Latin letters, digits and punctuation;
             // the tokenization result listed further below corresponds to this string.
             String text = "据说WWDC要推出iPhone6要出了?与iPhone5s相比怎样呢?@2014巴西世界杯";

    public static  String spiltword(String str) {
            String datas = "";
            Analyzer anal = new IKAnalyzer(true);
            StringReader reader = new StringReader(str);
            TokenStream ts ;
            try {
                ts = anal.tokenStream("", reader);
                CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                // 遍历分词数据
                ts.reset();
                while (ts.incrementToken()) {
                    datas += ts.toString() +"
    ";
                }
                ts.close();
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
    
            return datas;
        }

    分词结果:

    IKTokenizer@23134c4b term=据说,bytes=[e6 8d ae e8 af b4],startOffset=0,endOffset=2,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=wwdc,bytes=[77 77 64 63],startOffset=2,endOffset=6,positionIncrement=1,positionLength=1,type=ENGLISH
    IKTokenizer@23134c4b term=要,bytes=[e8 a6 81],startOffset=6,endOffset=7,positionIncrement=1,positionLength=1,type=CN_CHAR
    IKTokenizer@23134c4b term=推出,bytes=[e6 8e a8 e5 87 ba],startOffset=7,endOffset=9,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=iphone6,bytes=[69 70 68 6f 6e 65 36],startOffset=9,endOffset=16,positionIncrement=1,positionLength=1,type=LETTER
    IKTokenizer@23134c4b term=要,bytes=[e8 a6 81],startOffset=16,endOffset=17,positionIncrement=1,positionLength=1,type=CN_CHAR
    IKTokenizer@23134c4b term=出了,bytes=[e5 87 ba e4 ba 86],startOffset=17,endOffset=19,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=与,bytes=[e4 b8 8e],startOffset=20,endOffset=21,positionIncrement=1,positionLength=1,type=CN_CHAR
    IKTokenizer@23134c4b term=iphone5s,bytes=[69 70 68 6f 6e 65 35 73],startOffset=21,endOffset=29,positionIncrement=1,positionLength=1,type=LETTER
    IKTokenizer@23134c4b term=相比,bytes=[e7 9b b8 e6 af 94],startOffset=29,endOffset=31,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=怎样,bytes=[e6 80 8e e6 a0 b7],startOffset=31,endOffset=33,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=呢,bytes=[e5 91 a2],startOffset=33,endOffset=34,positionIncrement=1,positionLength=1,type=CN_CHAR
    IKTokenizer@23134c4b term=2014,bytes=[32 30 31 34],startOffset=36,endOffset=40,positionIncrement=1,positionLength=1,type=ARABIC
    IKTokenizer@23134c4b term=巴西,bytes=[e5 b7 b4 e8 a5 bf],startOffset=40,endOffset=42,positionIncrement=1,positionLength=1,type=CN_WORD
    IKTokenizer@23134c4b term=世界杯,bytes=[e4 b8 96 e7 95 8c e6 9d af],startOffset=42,endOffset=45,positionIncrement=1,positionLength=1,type=CN_WORD

    	/**
    	 * Demo entry point: runs {@code testAanlyzer} on a fixed sample sentence.
    	 * The analyzer method prints the tokens itself, so its return value is
    	 * not used here.
    	 *
    	 * @param args unused
    	 * @throws Exception propagated from {@code testAanlyzer}
    	 */
    	public static void main(String[] args)throws Exception{
    		String text2="我们是中国人举行了2008年8月8日北京奥林匹克运动会";
    		testAanlyzer(text2);
    	}
    		 /**
    		  * Tokenizes {@code text} with {@code IKAnalyzer}, prints each term on its
    		  * own line, then prints the token count and the elapsed milliseconds.
    		  *
    		  * @param text text to tokenize
    		  * @return a single space followed by the concatenated {@code toString()}
    		  *         dumps of the token stream, one dump per token, with no separator
    		  * @throws Exception if the token stream fails while being consumed
    		  */
    		 public static String  testAanlyzer (String text)throws Exception{
    			// Starts with one space, as the original accumulator did.
    			StringBuilder datas = new StringBuilder(" ");
    			// try-with-resources closes the analyzer, reader and stream on every path.
    			try (Analyzer anal = new IKAnalyzer(true);
    					StringReader reader = new StringReader(text)) {
    				// Timer starts after analyzer/reader construction, as before.
    				long start=System.currentTimeMillis();
    				int i=0;
    				try (TokenStream ts=anal.tokenStream(" ",reader)) {
    					CharTermAttribute term=ts.getAttribute(CharTermAttribute.class);
    					System.out.println("分词效果如下:");
    					// Lucene consumer workflow: reset -> incrementToken* -> end -> close.
    					ts.reset();
    					while(ts.incrementToken()){
    						datas.append(ts.toString());
    						i++;
    						System.out.println(term.toString());
    					}
    					ts.end();
    				}
    				long usetime=System.currentTimeMillis()-start;
    				System.out.println("共分词="+i+",共耗时="+usetime+"毫秒。");
    			}
    			return datas.toString();
    		 }
    

     打印结果:

    分词效果如下:
    我们
    是
    中国人
    举
    行了
    2008年
    8月
    8日
    北京
    奥林匹克运动会
    共分词=10,共耗时=431毫秒。

    /**
     * Tokenizes {@code text} with {@code IKAnalyzer}, then looks every token up
     * through {@code SolrUtil.searchByskey} and prints each returned key/value pair.
     *
     * <p>The analyzer, reader, token stream and term attribute are stored in
     * variables declared outside this method.
     * NOTE(review): the analyzer is not closed here because its owner is not
     * visible from this method - confirm who is responsible for closing it.
     *
     * @param text text to tokenize
     * @return the tokens in the order the analyzer produced them
     * @throws Exception if tokenization or the Solr lookup fails
     */
    public static List<String> test02(String text) throws Exception{
        List<String> datasList = new ArrayList<String>();
        anal = new IKAnalyzer(true);
        reader = new StringReader(text);
        long start=System.currentTimeMillis();
        ts = anal.tokenStream(" ",reader);
        try {
            term = ts.getAttribute(CharTermAttribute.class);
            System.out.println("分词效果如下:");
            // Lucene consumer workflow: reset -> incrementToken* -> end -> close.
            ts.reset();
            while (ts.incrementToken()) {
                // Copy the term out: the attribute's buffer is reused for the next token.
                datasList.add(term.toString());
            }
            ts.end();
        } finally {
            // Close before the lookups, and on failure: the tokens are already
            // copied, so the stream is no longer needed.
            ts.close();
        }

        for (String word : datasList) {
            System.out.println(word);
            Map<String, Object> searchByskey = SolrUtil.searchByskey(word, solrUrl, 0, 200);
            // A typed entry replaces the raw Entry and its casts.
            for (Entry<String, Object> entry : searchByskey.entrySet()) {
                System.out.println(entry.getKey() + "....=" + entry.getValue());
            }
        }

        long usetime=System.currentTimeMillis()-start;
        System.out.println("共耗时="+usetime+"毫秒。");
        return datasList;
    }

    返回的数据为 List<String> 类型

    参考地址:http://blog.csdn.net/wangxiaojing123/article/details/7397951

  • 相关阅读:
    Get distinct count of rows in the DataSet
    单引号双引号的html转义符
    PETS Public English Test System
    Code 39 basics (39条形码原理)
    Index was outside the bounds of the array ,LocalReport.Render
    Thread was being aborted Errors
    Reportviewer Error: ASP.NET session has expired
    ReportDataSource 值不在预期的范围内
    .NET/FCL 2.0在Serialization方面的增强
    Perl像C一样强大,像awk、sed等脚本描述语言一样方便。
  • 原文地址:https://www.cnblogs.com/zhanggl/p/4817145.html
Copyright © 2011-2022 走看看