zoukankan      html  css  js  c++  java
  • 单词 统计续(补)

    短语统计

    我们在处理文本的时候只需将提取出来的文本数据进行特殊的分割处理,比如只需将英文的“,”,“.”,“?”,“!"以及回车符设为分隔符。并将一些无用单词作为间断比如

    "a",  "it", "the", "and", "this"等。

    package analyse_word;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Scanner;
    import java.util.Set;
    public class recognize_sentence {
     
     public static boolean useless(String str) throws FileNotFoundException {
      File file = new File("D:\useless.txt");// 读取文件
      String words[] = new String [100000];
      int out_words[] = new int [100000];
      if (!file.exists()) {// 如果文件打不开或不存在则提示错误
       System.out.println("文件不存在");
       return false;
      }
      Scanner x = new Scanner(file);
      HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
      while (x.hasNextLine()) {
       String line = x.nextLine();
       String[] lineWords = line.split("[ ]");
       Set<String> wordSet = hashMap.keySet();
       for (int i = 0; i < lineWords.length; i++) {
        if (wordSet.contains(lineWords[i])) {
         Integer number = hashMap.get(lineWords[i]);
         number++;
         hashMap.put(lineWords[i], number);
        } else {
         hashMap.put(lineWords[i], 1);
        }
       }
      }
      Iterator<String> iterator = hashMap.keySet().iterator();
      int max = 0,i=0;
      while (iterator.hasNext()) {
       String word = iterator.next();
       if(!"".equals(word)&&word!=null&&!"a".equals(word)&&!"the".equals(word)&&!"  ".equals(word)) {
        if(str.indexOf(" "+word+" ")==0) {
         return true;
        }
        words[i]=word;
        out_words[i]=hashMap.get(word);
        i++;
       }
      }
      return true;
     }
     public static void recognize() throws FileNotFoundException {
      File file = new File("D:\Englis_letters.txt");// 读取文件
      if (!file.exists()) {// 如果文件打不开或不存在则提示错误
       System.out.println("文件不存在");
       return;
      }
      Scanner x = new Scanner(file);
      HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
      while (x.hasNextLine()) {
       String line = x.nextLine();
       String[] lineWords = line.split("[\t+;.,“”‘’?! +]");
       Set<String> wordSet = hashMap.keySet();
       for (int i = 0; i < lineWords.length; i++) {
        if (wordSet.contains(lineWords[i])) {
         Integer number = hashMap.get(lineWords[i]);
         number++;
         hashMap.put(lineWords[i], number);
        } else {
         hashMap.put(lineWords[i], 1);
        }
       }
      }
      Iterator<String> iterator = hashMap.keySet().iterator();
      while (iterator.hasNext()) {
       String word = iterator.next();
       if(useless(word)) {
        System.out.println(word);
       }
      }
     }
     public static void main(String[] args) throws FileNotFoundException {
      recognize();
     }
    }
  • 相关阅读:
    c# GDI+简单绘图(二)
    ProE 工程图教程系列4 ProE不能导出dwg等格式的解决办法
    McAfee卸载工具及卡巴KIS2009注册码
    Vista下修改网卡MAC地址
    开机后出现Spooler Subsystem App 的解决办法
    微软office 2007在线编辑平台Office Live Workspace(docx转doc格式)
    美丽的草原风电场————内蒙古锡林浩特阿巴嘎旗风电场
    ProE官方网站系列视频教程
    [转]Stimator:评估您的网站/博客的价值
    关注苏迪曼杯,关注Lining为羽毛球队打造的衣服
  • 原文地址:https://www.cnblogs.com/goubb/p/11031048.html
Copyright © 2011-2022 走看看