zoukankan      html  css  js  c++  java
  • 统计文章中字母、单词出现的频率

    package 统计英文字母出现频率;
    import java.io.BufferedReader;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;

    /*
     * 统计一片文章中各字母出现的频率
     */
    /**
     * Simple mutable holder that pairs a piece of text ({@code zimu}) with an
     * occurrence count ({@code cishu}).
     */
    class entity {
        /** The text this entry stands for. */
        String zimu;
        /** How many times {@link #zimu} was counted. */
        int cishu;

        /**
         * Creates an entry.
         *
         * @param zimu  the text this entry stands for
         * @param cishu the occurrence count to store
         */
        public entity(String zimu, int cishu) {
            this.cishu = cishu;
            this.zimu = zimu;
        }

        /** @return the stored text */
        public String getZimu() {
            return this.zimu;
        }

        /** @return the stored occurrence count */
        public int getCishu() {
            return this.cishu;
        }

        /** @param zimu the new text for this entry */
        public void setZimu(String zimu) {
            this.zimu = zimu;
        }

        /** @param cishu the new occurrence count for this entry */
        public void setCishu(int cishu) {
            this.cishu = cishu;
        }
    }
    public class ZimuCollect {
     
    // public static Map.Entry[] getSortedHashtableByValue(Map map) {
    //  Set set = map.entrySet();
    //  Map.Entry[] entries = (Map.Entry[]) set.toArray(new Map.Entry[set.size()]);
    //  Arrays.sort(entries, new Comparator() {
    //  public int compare(Object arg0, Object arg1) {
    //  Long key1 = Long.valueOf(((Map.Entry) arg0).getValue().toString());
    //  Long key2 = Long.valueOf(((Map.Entry) arg1).getValue().toString());
    //  return key2.compareTo(key1);
    //  }
    //  });
    //  return entries;
    //  }
     public static void collect() throws IOException {
      try {
       //IO操作读取文件内容
       FileReader fr = new FileReader("file.txt");
       BufferedReader br = new BufferedReader(fr);
       HashMap<String, Integer> map = new HashMap<String, Integer>();
       String string =null;
       Integer count = 0;//每个字母的次数
       Integer total = 0;//总共多少个字母
       while ((string=br.readLine())!=null) {
        char[] ch = string.toCharArray();//将获取的string分成字符数组
        total = total + ch.length;
        for (int i = 0; i < ch.length; i++) {
         ch[i] = Character.toLowerCase(ch[i]);//将所有的字母变成小写的
         count = map.get(ch[i]+"");
         if (count == null) {//字母没有出现重复;
          count = 1;
         }else {//字母出现重复,count+1;
          count++;
         }
         map.put(ch[i]+"", count);
        }
       }
    //   Map.Entry[] finmap = getSortedHashtableByValue(map);
    //   for(int i=0;i<finmap.length;i++)
    //   {
    //   System.out.println("字母"+finmap[i]+"其频率为:");
    //   }
       List<entity> result = new ArrayList<>();
       
       entity e = null;
       for (String str : map.keySet()) {
        e = new entity(str,map.get(str));
        result.add(e);
       }
       result.sort((entity e1,entity e2)->{
        return e2.getCishu()-e1.getCishu();});
       
       for(entity ee : result) {
        System.out.println("字母"+ee.getZimu()+"在文章中出现"+ee.getCishu()+"次,其频率为"+String.format("%.2f",ee.getCishu()*1.0/total));
       }
       

       
    //   for (String str : map.keySet()) {
    //    System.out.println("字母"+str+"出现"+map.get(str)+"次,其频率为:"+String.format("%.2f",map.get(str)*1.0/total));
    //   }
      } catch (FileNotFoundException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
      }
     }
     public static void main(String[] args) throws IOException {
      try{
       ZimuCollect zimucollect = new ZimuCollect();
       ZimuCollect.collect();
      
      
         } catch (FileNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
         }
      }
    }

      

    统计文章中字母出现频率的思路:先用 FileReader fr = new FileReader("file.txt"); BufferedReader br = new BufferedReader(fr); 按行读取文件,每读到一行就对其中的字符进行统计。统计时使用 HashMap<String, Integer> map = new HashMap<String, Integer>();,其中 String 表示出现的字母,Integer 表示该字母出现的次数:先用 map.get() 判断该字母是否已经出现过,没有出现过则次数记为 1,已经出现过则次数加 1;最后遍历 map 输出结果即可。

    package 统计英文字母出现频率;
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Scanner;
    import java.util.TreeMap;
    
    /**
     * Word entry: a word together with the number of times it was counted.
     */
    class entity11 {
        /** The word that occurred. */
        String danci;
        /** Number of occurrences recorded for {@link #danci}. */
        int cishu;

        /**
         * Creates a word entry.
         *
         * @param danci the word
         * @param cishu the occurrence count of the word
         */
        public entity11(String danci, int cishu) {
            this.cishu = cishu;
            this.danci = danci;
        }

        /** @return the stored occurrence count */
        public int getCishu() {
            return this.cishu;
        }

        /** @return the stored word */
        public String getDanci() {
            return this.danci;
        }
    }
    /**
     * Counts word frequencies in {@code piao.txt}, restricted to the words that
     * appear as lines of {@code judge.txt}, and prints them from most to least
     * frequent.
     */
    public class DanciCollect {

        /** Punctuation marks that are replaced by a space before the text is split into words. */
        private static final String[] SEPARATORS =
                {".",",","?","!",":","‘","’","“","”","—",";","-"};

        /**
         * Tells whether {@code str} is absent from {@code judge.txt}.
         *
         * <p>The file is opened and scanned on every call; each line is compared
         * with {@code str} using {@code equals}, without trimming.
         *
         * @param str the word to look up
         * @return {@code false} if some line of {@code judge.txt} equals
         *         {@code str}, {@code true} otherwise
         * @throws IOException if {@code judge.txt} cannot be opened or read
         */
        public static boolean judgeNouse(String str) throws IOException {
            // try-with-resources closes the reader on every path, including errors.
            try (BufferedReader reader = new BufferedReader(new FileReader("judge.txt"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (str.equals(line)) {
                        return false;
                    }
                }
            }
            return true;
        }

        /**
         * Lower-cases {@code str} one {@code char} at a time with
         * {@link Character#toLowerCase(char)}.
         *
         * @param str the text to convert
         * @return a new string of the same length with every char lower-cased
         */
        public static String toLowerCase(String str) {
            char[] chars = str.toCharArray();
            for (int i = 0; i < chars.length; i++) {
                chars[i] = Character.toLowerCase(chars[i]);
            }
            return new String(chars);
        }

        /**
         * Reads {@code piao.txt}, counts the words that are listed in
         * {@code judge.txt} and prints the total followed by one line per
         * distinct word, ordered by descending count (ties alphabetically).
         *
         * <p>If either file cannot be opened, the stack trace is printed and the
         * method returns without printing any statistics.
         *
         * @throws IOException if reading from an opened file fails
         */
        public static void collect1() throws IOException {
            Map<String, Integer> counts;
            try {
                counts = countListedWords(readArticle(), loadJudgeWords());
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                return;
            }

            int total = 0; // number of counted word occurrences
            List<entity11> result = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                result.add(new entity11(entry.getKey(), entry.getValue()));
                total += entry.getValue();
            }
            // List.sort is stable and the TreeMap iterates alphabetically, so words
            // with the same count stay in alphabetical order.
            result.sort((left, right) -> Integer.compare(right.getCishu(), left.getCishu()));

            System.out.println("文章共计"+total+"个单词");
            for (entity11 item : result) {
                System.out.println(item.getDanci()+"在文章中出现"+item.getCishu()+"次,其频率为"+String.format("%.2f",item.getCishu()*1.0/total));
            }
        }

        /** Returns the content of {@code piao.txt} with every line followed by a single space. */
        private static String readArticle() throws IOException {
            StringBuilder text = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new FileReader(new File("piao.txt")))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator; the space keeps the last
                    // word of one line from fusing with the first word of the next.
                    text.append(line).append(' ');
                }
            }
            return text.toString();
        }

        /**
         * Loads every line of {@code judge.txt} into a set, so the file is read once
         * instead of once per word. Types are fully qualified so the file's import
         * list does not have to change.
         */
        private static java.util.Set<String> loadJudgeWords() throws IOException {
            java.util.Set<String> words = new java.util.HashSet<>();
            try (BufferedReader reader = new BufferedReader(new FileReader("judge.txt"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    words.add(line);
                }
            }
            return words;
        }

        /** Splits {@code text} into lower-cased words and counts those contained in {@code judgeWords}. */
        private static Map<String, Integer> countListedWords(String text, java.util.Set<String> judgeWords) {
            for (String separator : SEPARATORS) {
                text = text.replace(separator, " ");
            }
            Map<String, Integer> counts = new TreeMap<String, Integer>();
            for (String token : text.split("\\s+")) {
                if (token.isEmpty()) {
                    continue; // leading whitespace yields one empty token; it is not a word
                }
                String word = toLowerCase(token);
                // Same condition as !judgeNouse(word): only words listed in judge.txt
                // are counted.
                // NOTE(review): if judge.txt is meant to be a list of common words to
                // drop, this condition is inverted - confirm the file's meaning.
                if (judgeWords.contains(word)) {
                    counts.merge(word, 1, Integer::sum);
                }
            }
            return counts;
        }

        /**
         * Program entry point; runs {@link #collect1()}.
         *
         * @param args ignored
         * @throws IOException if reading an input file fails
         */
        public static void main(String args[]) throws IOException {
            // collect1() is static and already reports missing files itself.
            DanciCollect.collect1();
        }
    }
    

      统计文章中单词出现频率的思路:
       File file1 = new File("piao.txt");//定义一个File对象,用来初始化FileReader
       FileReader reader1 = new FileReader(file1);//定义一个FileReader对象,用来初始化BufferedReader
       BufferedReader bReader1 = new BufferedReader(reader1);//定义一个BufferedReader对象,带缓冲地读取文件内容
       按行读取文件中的内容,用 sb1.append(s1); 将每次读取到的一行追加到缓存中,这样文件的全部内容都存放进了缓存,再用 String text = sb1.toString(); 将缓存内容转换成字符串。接着把 String[] array = {".",",","?","!",":","‘","’","“","”","—",";","-"}; 中列出的特殊符号全部用空格代替,这样文本就变成了以空格间隔的一个个单词,用 text.split(" "); 将其切分成单词存进字符串数组,最后同样用 map 从头遍历数组,统计每个单词出现的次数。

    package 统计英文字母出现频率;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Scanner;
    import java.util.TreeMap;
    
    /**
     * Word entry: a word together with the number of times it was counted.
     */
    class entity111 {
        /** The word that occurred. */
        String danci;
        /** Number of occurrences recorded for {@link #danci}. */
        int cishu;

        /**
         * Creates a word entry.
         *
         * @param danci the word
         * @param cishu the occurrence count of the word
         */
        public entity111(String danci, int cishu) {
            this.cishu = cishu;
            this.danci = danci;
        }

        /** @return the stored occurrence count */
        public int getCishu() {
            return this.cishu;
        }

        /** @return the stored word */
        public String getDanci() {
            return this.danci;
        }
    }
    /**
     * Counts word frequencies in {@code piao.txt}, restricted to the words that
     * appear as lines of {@code judge.txt}, and prints the N most frequent ones,
     * where N is read from standard input.
     */
    public class NDanciCollect {

        /** Punctuation marks that are replaced by a space before the text is split into words. */
        private static final String[] SEPARATORS =
                {".",",","?","!",":","‘","’","“","”","—",";","-"};

        /**
         * Tells whether {@code str} is absent from {@code judge.txt}.
         *
         * <p>The file is opened and scanned on every call; each line is compared
         * with {@code str} using {@code equals}, without trimming.
         *
         * @param str the word to look up
         * @return {@code false} if some line of {@code judge.txt} equals
         *         {@code str}, {@code true} otherwise
         * @throws IOException if {@code judge.txt} cannot be opened or read
         */
        public static boolean judgeNouse(String str) throws IOException {
            // try-with-resources closes the reader on every path, including errors.
            try (BufferedReader reader = new BufferedReader(new FileReader("judge.txt"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (str.equals(line)) {
                        return false;
                    }
                }
            }
            return true;
        }

        /**
         * Lower-cases {@code str} one {@code char} at a time with
         * {@link Character#toLowerCase(char)}.
         *
         * @param str the text to convert
         * @return a new string of the same length with every char lower-cased
         */
        public static String toLowerCase(String str) {
            char[] chars = str.toCharArray();
            for (int i = 0; i < chars.length; i++) {
                chars[i] = Character.toLowerCase(chars[i]);
            }
            return new String(chars);
        }

        /**
         * Reads {@code piao.txt}, counts the words that are listed in
         * {@code judge.txt}, prints the total, asks for N on standard input and
         * prints the N most frequent words (ties alphabetically).
         *
         * <p>If N is larger than the number of distinct counted words, all of them
         * are printed. If either file cannot be opened, the stack trace is printed
         * and the method returns without prompting.
         *
         * @throws IOException if reading from an opened file fails
         */
        public static void collect11() throws IOException {
            Map<String, Integer> counts;
            try {
                counts = countListedWords(readArticle(), loadJudgeWords());
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                return;
            }

            int total = 0; // number of counted word occurrences
            List<entity111> result = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                result.add(new entity111(entry.getKey(), entry.getValue()));
                total += entry.getValue();
            }
            // List.sort is stable and the TreeMap iterates alphabetically, so words
            // with the same count stay in alphabetical order.
            result.sort((left, right) -> Integer.compare(right.getCishu(), left.getCishu()));

            System.out.println("文章共计"+total+"个单词");
            System.out.println("请输入要输出出现频率最高的前N个单词------请输入N的值:");
            // The Scanner is intentionally not closed: closing it would close System.in.
            Scanner scan = new Scanner(System.in);
            // Clamp N so that asking for more words than exist cannot index past the list.
            int top = Math.min(scan.nextInt(), result.size());
            for (int i = 0; i < top; i++) {
                entity111 item = result.get(i);
                System.out.println(item.getDanci()+"在文章中出现"+item.getCishu()+"次,其频率为"+String.format("%.2f",item.getCishu()*1.0/total));
            }
        }

        /** Returns the content of {@code piao.txt} with every line followed by a single space. */
        private static String readArticle() throws IOException {
            StringBuilder text = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new FileReader(new File("piao.txt")))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator; the space keeps the last
                    // word of one line from fusing with the first word of the next.
                    text.append(line).append(' ');
                }
            }
            return text.toString();
        }

        /**
         * Loads every line of {@code judge.txt} into a set, so the file is read once
         * instead of once per word. Types are fully qualified so the file's import
         * list does not have to change.
         */
        private static java.util.Set<String> loadJudgeWords() throws IOException {
            java.util.Set<String> words = new java.util.HashSet<>();
            try (BufferedReader reader = new BufferedReader(new FileReader("judge.txt"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    words.add(line);
                }
            }
            return words;
        }

        /** Splits {@code text} into lower-cased words and counts those contained in {@code judgeWords}. */
        private static Map<String, Integer> countListedWords(String text, java.util.Set<String> judgeWords) {
            for (String separator : SEPARATORS) {
                text = text.replace(separator, " ");
            }
            Map<String, Integer> counts = new TreeMap<String, Integer>();
            for (String token : text.split("\\s+")) {
                if (token.isEmpty()) {
                    continue; // leading whitespace yields one empty token; it is not a word
                }
                String word = toLowerCase(token);
                // Same condition as !judgeNouse(word): only words listed in judge.txt
                // are counted.
                // NOTE(review): if judge.txt is meant to be a list of common words to
                // drop, this condition is inverted - confirm the file's meaning.
                if (judgeWords.contains(word)) {
                    counts.merge(word, 1, Integer::sum);
                }
            }
            return counts;
        }

        /**
         * Program entry point; runs {@link #collect11()}.
         *
         * @param args ignored
         * @throws IOException if reading an input file fails
         */
        public static void main(String args[]) throws IOException {
            // collect11() is static and already reports missing files itself.
            NDanciCollect.collect11();
        }
    }
    

      输出出现频率最高的前N个单词,并且去掉文中的一些常见单词:这里只是追加了一个方法,在把单词存进map之前先用judgeNouse()函数判断,不是常见单词才存进map里。同样用map来统计单词及其出现的次数,最后将map转换成List,用sort函数按照出现次数这个属性进行排序,这样便可按照用户输入的N输出频率最高的前N个单词。

  • 相关阅读:
    正则表达式周二挑战赛 第七周
    [译]视区百分比,canvas.toBlob()以及WebRTC
    [译]因扩展Object.prototype而引发Object.defineProperty不可用的一个问题
    [译]JavaScript需要类吗?
    [译]JavaScript中几种愚蠢的写法
    [译]JavaScript中对象的属性
    JavaScript:数组的length属性
    [译]JavaScript中的变量声明:你可以打破的三条规则
    [译]ES6:JavaScript中将会有的几个新东西
    [译]ECMAScript 6中的集合类型,第三部分:WeakMap
  • 原文地址:https://www.cnblogs.com/zjl-0217/p/10993838.html
Copyright © 2011-2022 走看看