zoukankan      html  css  js  c++  java
  • Java 实现字符串词频统计

    package com.gpdi.action;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    /**
     * Counts how often each word occurs in a piece of text.
     *
     * <p>The text is lower-cased, possessive {@code 's} suffixes and a small set
     * of punctuation characters ({@code , - . ' : !}) are removed, and the
     * remainder is split on whitespace. The result is a list of
     * {@code WordCount} entries ordered by descending count, with ties ordered
     * alphabetically (case-insensitively).
     */
    public class WordsStatistics {
    	
    	/** Mutable counter used as the map value so a hit can be incremented in place. */
    	class Obj {
    		int count ;
    		Obj(int count){
    			this.count = count;
    		}
    	}
    	
    	/**
    	 * Computes the word frequencies of the given text.
    	 *
    	 * @param word the text to analyse; may be {@code null}
    	 * @return one {@code WordCount} per distinct word, sorted by descending
    	 *         count and then alphabetically; an empty list when the text
    	 *         contains no words; {@code null} when {@code word} is {@code null}
    	 */
    	public List<WordCount> statistics(String word) {
    		if (word == null) {
    			// Kept for backward compatibility: callers may test for null.
    			return null;
    		}
    		
    		Map<String, Obj> counts = new HashMap<String, Obj>();
    		// Split on any whitespace run so that newlines and tabs separate words
    		// instead of gluing the neighbouring words together.
    		for (String token : normalize(word).split("\\s+")) {
    			String simpleWord = token.trim();
    			if (simpleWord.isEmpty()) {
    				// Leading whitespace produces one empty first token.
    				continue;
    			}
    			Obj cnt = counts.get(simpleWord);
    			if (cnt != null) {
    				cnt.count++;
    			} else {
    				counts.put(simpleWord, new Obj(1));
    			}
    		}
    		
    		List<WordCount> rs = new ArrayList<WordCount>(counts.size());
    		for (Map.Entry<String, Obj> entry : counts.entrySet()) {
    			rs.add(new WordCount(entry.getKey(), entry.getValue().count));
    		}
    		
    		Collections.sort(rs, new java.util.Comparator<WordCount>() {
    			@Override
    			public int compare(WordCount o1, WordCount o2) {
    				// Higher counts sort first.
    				if (o1.getCount() > o2.getCount()) {
    					return -1;
    				}
    				if (o1.getCount() < o2.getCount()) {
    					return 1;
    				}
    				// Tie-break alphabetically. Returning the comparison result
    				// directly yields 0 for equal words, which the Comparator
    				// contract requires (compare(x, x) must be 0).
    				return o1.getWord().compareToIgnoreCase(o2.getWord());
    			}
    		});
    		return rs;
    	}
    	
    	/**
    	 * Lower-cases the text and strips the possessive suffix and punctuation.
    	 *
    	 * @param text non-null raw input
    	 * @return the cleaned text, still containing its whitespace separators
    	 */
    	private static String normalize(String text) {
    		String cleaned = text.toLowerCase();
    		// Must run before the apostrophe itself is removed below.
    		cleaned = cleaned.replace("'s", "");
    		// A hyphen at the end of a line joins the two halves of a word, the
    		// same way an in-line hyphen is dropped ("well-known" -> "wellknown").
    		cleaned = cleaned.replaceAll("-\\r?\\n", "");
    		// Remove the remaining punctuation in a single pass.
    		cleaned = cleaned.replaceAll("[,\\-.':!]", "");
    		return cleaned;
    	}
    	
    	/** Small demo: prints each word and its count as {@code word*count}. */
    	public static void main(String[] args) {
    		String word = "Pinterest is might be aa ab aa ab marketer's dream  - ths site is largely used to curate products " ;
    		WordsStatistics s = new WordsStatistics();
    		List<WordCount> rs = s.statistics(word);
    		for (WordCount word1 : rs) {
    			System.out.println(word1.getWord()+"*"+word1.getCount());
    		}
    	}
    	
    }
    
  • 相关阅读:
    c++:函数模板
    1084 外观数列
    1083 是否存在相等的差
    1082 射击比赛
    1081 检查密码
    1080 MOOC期终成绩
    1079 延迟的回文数
    1078 字符串压缩与解压
    1077 互评成绩计算
    1076 Wifi密码
  • 原文地址:https://www.cnblogs.com/treemanfm/p/2989924.html
Copyright © 2011-2022 走看看