1. C语言:一个字符一个字符的读取
(有空再贴出来)
2. Java语言:按行读取,并用正则分割成多个单词,再用MapReduce并行计算单词数 (我使用的是IntelliJ IDEA,有些地方跟Eclipse有点区别)
/**
 * Opens the file at the given path for buffered, line-by-line reading.
 *
 * <p>Lines are obtained from the returned reader with {@code readLine()}.
 * This method does not close the reader; the caller owns it and must close it.
 * The underlying {@code FileReader} decodes bytes with the JVM's default charset.
 *
 * @param path path of the file to open
 * @return a buffered reader positioned at the start of the file
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public BufferedReader openFile(final String path) throws FileNotFoundException {
    final FileReader source = new FileReader(path);
    return new BufferedReader(source);
}
/** * 采用Hash计算单词数 * @param line * @return */ public void hash(final HashMap<String, Integer> hashMap, final String line) { // 不能分割b2c,it's这类单词 String[] words = line.split("[^a-z]+"); for (String word : words) { // 去除空格、空行 if (word.length() > 0) { if (hashMap.containsKey(word) == false) { hashMap.put(word, 1); } } } }
/**
 * Reports the number of words collected so far.
 *
 * <p>The result is simply the number of keys in the map, i.e. the number of
 * distinct words that have been inserted into it.
 *
 * @param hashMap map whose keys are the collected words
 * @return the number of entries in {@code hashMap}
 */
public Integer computeWordCount(final HashMap<String, Integer> hashMap) {
    final int distinctWords = hashMap.size();
    return Integer.valueOf(distinctWords);
}
测试用例:
/**
 * Test driver: reads the sample article line by line, feeds each line to
 * {@code hash}, and prints the value returned by {@code computeWordCount}.
 *
 * <p>NOTE(review): in this snippet {@code openFile}, {@code hash} and
 * {@code computeWordCount} are declared without {@code static}; calling them
 * from this static method requires them to be static or to be invoked on an
 * instance of the enclosing class. Confirm against the full class.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the sample file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    final String path =
            Paths.get(PROJECT_ROOT_DIR, "src/main/resources/articles/test.txt").toString();
    final HashMap<String, Integer> hashMap = new HashMap<>();

    // try-with-resources closes the reader even when readLine() throws;
    // previously the reader was never closed.
    try (BufferedReader reader = openFile(path)) {
        String line;
        while ((line = reader.readLine()) != null) {
            hash(hashMap, line);
        }
    }

    final int wordCount = computeWordCount(hashMap);
    System.out.println(wordCount);
}