zoukankan      html  css  js  c++  java
  • 单词统计续(补)

    短语统计

    我们在处理文本的时候,只需将提取出来的文本数据进行特殊的分割处理,比如将英文的“,”、“.”、“?”、“!”以及回车符设为分隔符,并将一些无用单词作为间断,比如:

    "a",  "it", "the", "and", "this"等。

    package analyse_word;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Scanner;
    import java.util.Set;
    /**
     * Prints the distinct words of an English text file, leaving out "useless"
     * words (for example "a", "it", "the", "and", "this") that are listed in a
     * separate word-list file.
     *
     * <p>Both files are read with the platform default charset. Matching against
     * the useless-word list is exact and case-sensitive.
     */
    public class recognize_sentence {

        /** Default location of the useless-word list: useless.txt on drive D. */
        private static final String USELESS_WORDS_PATH = "D:\\useless.txt";

        /** Default location of the text to analyse: Englis_letters.txt on drive D. */
        private static final String TEXT_PATH = "D:\\Englis_letters.txt";

        /**
         * Regex character class used to cut a line into words: tab, '+', ';', '.',
         * ',', curly double and single quotes, '?', '!' and space. Each delimiter
         * character splits on its own, so adjacent delimiters yield empty tokens
         * that the caller has to skip.
         */
        private static final String WORD_DELIMITERS = "[\t+;.,“”‘’?! +]";

        /** Message printed when an input file does not exist. */
        private static final String FILE_MISSING = "文件不存在";

        /**
         * Tells whether {@code str} contains a useless word, using the default
         * word-list file. The list is read again on every call.
         *
         * @param str a single word or a whitespace-separated phrase; may be {@code null}
         * @return {@code true} if at least one word of {@code str} is listed;
         *         {@code false} otherwise, including when the list file does not
         *         exist (a message is printed in that case)
         * @throws FileNotFoundException if the list file cannot be opened for reading
         */
        public static boolean useless(String str) throws FileNotFoundException {
            return useless(str, new File(USELESS_WORDS_PATH));
        }

        /**
         * Tells whether {@code str} contains a word listed in {@code uselessFile}.
         *
         * @param str         a single word or a whitespace-separated phrase; may be {@code null}
         * @param uselessFile file holding the useless words, separated by whitespace
         * @return {@code true} if at least one word of {@code str} is listed;
         *         {@code false} otherwise, including when {@code uselessFile} does
         *         not exist (a message is printed in that case)
         * @throws FileNotFoundException if {@code uselessFile} cannot be opened for reading
         */
        public static boolean useless(String str, File uselessFile) throws FileNotFoundException {
            if (!uselessFile.exists()) {
                System.out.println(FILE_MISSING);
                return false;
            }
            if (str == null) {
                return false;
            }
            Set<String> uselessWords = readWords(uselessFile);
            // Compare whole words only, so that "theory" is not rejected because of "the".
            for (String token : str.trim().split("\\s+")) {
                if (uselessWords.contains(token)) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Analyses the default text file against the default useless-word list.
         *
         * @throws FileNotFoundException if either file cannot be opened for reading
         */
        public static void recognize() throws FileNotFoundException {
            recognize(new File(TEXT_PATH), new File(USELESS_WORDS_PATH));
        }

        /**
         * Prints every distinct word of {@code textFile} that is not listed in
         * {@code uselessFile}, one word per line. The order of the printed words is
         * that of a hash set and therefore unspecified.
         *
         * <p>If either file does not exist, a message is printed once and nothing
         * else is done.
         *
         * @param textFile    text to split into words
         * @param uselessFile file holding the useless words, separated by whitespace
         * @throws FileNotFoundException if either file cannot be opened for reading
         */
        public static void recognize(File textFile, File uselessFile) throws FileNotFoundException {
            if (!textFile.exists() || !uselessFile.exists()) {
                System.out.println(FILE_MISSING);
                return;
            }
            // Load the list once instead of re-reading the file for every word.
            Set<String> uselessWords = readWords(uselessFile);
            Set<String> distinctWords = new HashSet<String>();
            try (Scanner text = new Scanner(textFile)) {
                while (text.hasNextLine()) {
                    for (String token : text.nextLine().split(WORD_DELIMITERS)) {
                        // Adjacent delimiters (", " for instance) produce empty tokens.
                        if (!token.isEmpty() && !uselessWords.contains(token)) {
                            distinctWords.add(token);
                        }
                    }
                }
            }
            for (String word : distinctWords) {
                System.out.println(word);
            }
        }

        /** Reads all whitespace-separated words of {@code wordFile} into a set. */
        private static Set<String> readWords(File wordFile) throws FileNotFoundException {
            Set<String> words = new HashSet<String>();
            try (Scanner in = new Scanner(wordFile)) {
                while (in.hasNext()) {
                    words.add(in.next());
                }
            }
            return words;
        }

        /**
         * Runs the analysis on the default files.
         *
         * @param args ignored
         * @throws FileNotFoundException if an input file cannot be opened for reading
         */
        public static void main(String[] args) throws FileNotFoundException {
            recognize();
        }
    }
  • 相关阅读:
    反流技术之IE插件技术研究第一部分
    c# post和接收的实现
    C# post提交表单的例程
    用C#搭建IE BHO勾子, 取表单密码
    复杂的 DataBinding 接受 IList 或 IListSource 作为数据源" 错误原来是自己的笔误
    C#判断ContextMenuStrip右键菜单的来源(从哪个控件弹出来的)
    练习5.1
    示例:实用函数(Utilities)
    闭包
    一个错误
  • 原文地址:https://www.cnblogs.com/goubb/p/11031048.html
Copyright © 2011-2022 走看看