zoukankan      html  css  js  c++  java
  • 单词统计功能续

    短语统计

    我们在处理文本的时候只需将提取出来的文本数据进行特殊的分割处理,比如只需将英文的“,”,“.”,“?”,“!"以及回车符设为分隔符。并将一些无用单词作为间断比如

    "a",  "it", "the", "and", "this"等。

    package analyse_word;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Scanner;
    import java.util.Set;
    public class recognize_sentence {
     
     public static boolean useless(String str) throws FileNotFoundException {
      File file = new File("D:\useless.txt");// 读取文件
      String words[] = new String [100000];
      int out_words[] = new int [100000];
      if (!file.exists()) {// 如果文件打不开或不存在则提示错误
       System.out.println("文件不存在");
       return false;
      }
      Scanner x = new Scanner(file);
      HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
      while (x.hasNextLine()) {
       String line = x.nextLine();
       String[] lineWords = line.split("[
    ]");
       Set<String> wordSet = hashMap.keySet();
       for (int i = 0; i < lineWords.length; i++) {
        if (wordSet.contains(lineWords[i])) {
         Integer number = hashMap.get(lineWords[i]);
         number++;
         hashMap.put(lineWords[i], number);
        } else {
         hashMap.put(lineWords[i], 1);
        }
       }
      }
      Iterator<String> iterator = hashMap.keySet().iterator();
      int max = 0,i=0;
      while (iterator.hasNext()) {
       String word = iterator.next();
       if(!"".equals(word)&&word!=null&&!"a".equals(word)&&!"the".equals(word)&&!"  ".equals(word)) {
        if(str.indexOf(" "+word+" ")==0) {
         return true;
        }
        words[i]=word;
        out_words[i]=hashMap.get(word);
        i++;
       }
      }
      return true;
     }
     public static void recognize() throws FileNotFoundException {
      File file = new File("D:\Englis_letters.txt");// 读取文件
      if (!file.exists()) {// 如果文件打不开或不存在则提示错误
       System.out.println("文件不存在");
       return;
      }
      Scanner x = new Scanner(file);
      HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
      while (x.hasNextLine()) {
       String line = x.nextLine();
       String[] lineWords = line.split("[\t+;.,“”‘’?!
    +]");
       Set<String> wordSet = hashMap.keySet();
       for (int i = 0; i < lineWords.length; i++) {
        if (wordSet.contains(lineWords[i])) {
         Integer number = hashMap.get(lineWords[i]);
         number++;
         hashMap.put(lineWords[i], number);
        } else {
         hashMap.put(lineWords[i], 1);
        }
       }
      }
      Iterator<String> iterator = hashMap.keySet().iterator();
      while (iterator.hasNext()) {
       String word = iterator.next();
       if(useless(word)) {
        System.out.println(word);
       }
      }
     }
     public static void main(String[] args) throws FileNotFoundException {
      recognize();
     }
    }
  • 相关阅读:
    PAT 甲级 1126 Eulerian Path (25 分)
    PAT 甲级 1126 Eulerian Path (25 分)
    PAT 甲级 1125 Chain the Ropes (25 分)
    PAT 甲级 1125 Chain the Ropes (25 分)
    PAT 甲级 1124 Raffle for Weibo Followers (20 分)
    PAT 甲级 1124 Raffle for Weibo Followers (20 分)
    PAT 甲级 1131 Subway Map (30 分)
    PAT 甲级 1131 Subway Map (30 分)
    AcWing 906. 区间分组 区间贪心
    AcWing 907. 区间覆盖 区间贪心
  • 原文地址:https://www.cnblogs.com/adret/p/11070468.html
Copyright © 2011-2022 走看看