zoukankan      html  css  js  c++  java
  • 词频统计设计的改进

     1 package zuoye1;
     2 
     3 import java.io.BufferedReader;
     4 import java.io.FileNotFoundException;
     5 import java.io.FileReader;
     6 import java.io.IOException;
     7 import java.util.ArrayList;
     8 import java.util.Collections;
     9 import java.util.Comparator;
    10 import java.util.HashMap;
    11 import java.util.List;
    12 import java.util.Map;
    13 import java.util.StringTokenizer;
    14 import java.util.Map.Entry;
    15 
    16 public class FileWord {
    17 
    18     /**
    19      * 读入文件,实现词频统计
    20      */
    21     public static void main(String[] args) {
    22         HashMap<String,Integer> map=new HashMap<String,Integer>();//用于统计各个单词的个数,排序
    23         //过滤字符串中的所有标点符号
    24         String regex=" ?.!:,""'';
    ";
    25         BufferedReader br;
    26         try {
    27             //FileReader类创建了一个可以读取文件内容的Reader类、调用构造方法FileReader()
    28             br = new BufferedReader(new FileReader("c:\english.txt"));//文件完整路径
    29             String sentence;
    30             int wordCount = 0;
    31             try {
    32                 while((sentence = br.readLine()) !=null){     //用readLine读取文件,判断读取文件是否为空
    33                     sentence = sentence.replaceAll(regex, "");
    34                     StringTokenizer token=new StringTokenizer(sentence);
    35                     while(token.hasMoreTokens()){     //循环遍历
    36                         wordCount++;    
    37                         String word = token.nextToken();
    38                         if(map.containsKey(word)){     //HashMap不允许重复的key,所以利用这个特性,去统计单词的个数
    39                         int count=map.get(word);
    40                         map.put(word, count+1);     //如果HashMap已有这个单词,则设置它的数量加1
    41                     }
    42                     else{
    43                         map.put(word, 1);          //如果没有这个单词,则新填入,数量为1
    44                 }
    45             }
    46         }
    47                 System.out.println("总共单词数:"+wordCount);
    48                 sort(map); 
    49             } catch (IOException e) {
    50                 e.printStackTrace();
    51             }
    52         }catch(FileNotFoundException e) {
    53             e.printStackTrace();
    54         }
    55     }
    56         //排序
    57         public static void sort(HashMap<String,Integer> map){
    58             //声明集合folder,存放单词和单词个数
    59             List<Map.Entry<String, Integer>> folder = new ArrayList<Map.Entry<String, Integer>>(map.entrySet()); 
    60             Collections.sort(folder, new Comparator<Map.Entry<String, Integer>>() {   
    61                 public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {   
    62                     return (obj2.getValue() - obj1.getValue());   
    63                 }   
    64         }); 
    65         //输出
    66         for (int i = 0; i < folder.size(); i++) {   
    67             Entry<String, Integer> en = folder.get(i);
    68             System.out.println(en.getKey()+":"+en.getValue());
    69             }
    70         }
    71     }

    实现结果

    总共单词数:181
    as:7
    the:7
    not:6
    it:6
    to:5
    are:4
    a:4
    your:4
    in:4
    they:3
    live:3
    and:3
    of:2
    do:2
    may:2
    by:2
    be:2
    clothes:2
    that:2
    often:2
    have:2
    from:2
    above:2
    is:2
    you:2
    door:1
    its:1
    suppose.It:1
    palace.The:1
    contentedly:1
    snow:1
    friends,Turn:1
    yourself:1
    means.which:1
    or:1
    windows:1
    life,poor:1
    bad:1
    quiet:1
    like:1
    without:1
    thoughts.:1
    simply:1
    abode;the:1
    change.Sell:1
    will:1
    some:1
    fault-finder:1
    herb,like:1
    before:1
    most:1
    I:1
    old,return:1
    trouble:1
    life:1
    change;we:1
    supported:1
    is.You:1
    spring.:1
    me:1
    mind:1
    town;but:1
    there,and:1
    paradise.Love:1
    hardnames.It:1
    is,meet:1
    should:1
    seem:1
    independent:1
    new:1
    alms-house:1
    poor-house.The:1
    pleasant,thrilling,glorious:1
    ;do:1
    garden:1
    happens:1
    keep:1
    but:1
    However:1
    reflected:1
    being:1
    brightly:1
    enough:1
    Cultivate:1
    any.May:1
    looks:1
    more:1
    sage.Do:1
    town's:1
    when:1
    faults:1
    richest.The:1
    disreputable.:1
    think:1
    get:1
    so:1
    much:1
    lives:1
    perhaps:1
    early:1
    things,whether:1
    call:1
    dishonest:1
    sun:1
    shun:1
    melts:1
    setting:1
    them.Things:1
    poverty:1
    poorest:1
    mean:1
    receive:1
    find:1
    hourss,even:1
    thoughts,as:1
    rich:1
    poor:1
    man's:1
    cheering:1
    great:1
    see:1
    supporting:1
    themselves:1
    misgiving.Most:1

     ssh://git@git.coding.net:linliaimeli/FileWord.git

     https://git.coding.net/linliaimeli/FileWord.git

  • 相关阅读:
    02-线性结构1 两个有序链表序列的合并
    ScSPM
    中国大学MOOC-陈越、何钦铭-数据结构-笔记
    01-复杂度1 最大子列和问题(剑指offer和PAT)
    Matlab中配置VLFeat
    循环队列实现
    对于利用pca 和 cca 进行fmri激活区识别的理解
    对于利用ica进行fmri激活区识别的理解
    利用spm提供的MoAEpilot听觉数据学习预处理以及单被试glm分析与统计推断
    fsl的feat软件分包使用笔记
  • 原文地址:https://www.cnblogs.com/linliaimeili/p/5841864.html
Copyright © 2011-2022 走看看