第四题(共四题100分):低频词过滤(40分)
题目描述:请编写程序,从包含大量单词的文本中删除出现次数最少的单词。如果有多个单词都出现最少的次数,则将这些单词都删除。
输入数据:程序读入已被命名为corpus.txt的一个大数据量的文本文件,该文件包含英文单词和中文单词,词与词之间以一个或多个whitespace分隔。(为便于调试,您可下载测试corpus.txt文件,实际运行时我们会使用不同内容的输入文件。)
输出数据:在标准输出上打印删除了corpus.txt中出现次数最少的单词之后的文本(词与词保持原来的顺序,仍以空格分隔)。
评分标准:程序输出结果必须正确,内存使用越少越好,程序的执行时间越快越好。
我的程序:
#pragma warning(disable: 4786)
#include <map>
#include <string>
#include <vector>
#include <cstdio>
#include <fstream>
#include <algorithm>

using namespace std;

struct Pair


{

Pair()
{}
Pair(const string& w, int c):word(w), count(c)

{
}
string word;
int count;
};

map< string, int > wordmap;
vector< Pair > wordcount;
vector< int > wordindex;

int main()


{
// load the file, count word, build index
string word;
fstream file("corpus.txt");
while(file >> word)

{
int id;
map< string, int >::iterator itw = wordmap.find(word);
if (itw == wordmap.end()) // new word

{
id = wordcount.size();
wordmap[word] = id;
wordcount.push_back(Pair(word, 1)); // 奇怪,我当时写的是id吗?- -晕了,谢谢mammger 指出来
}
else // found it

{
wordcount[itw->second].count++;
id = itw->second;
}
wordindex.push_back(id);
}
file.close();
if (wordcount.empty())
return 0;
// find the min count
vector< Pair >::iterator itc = wordcount.begin();
int mincount = itc->count;
for (++itc; itc != wordcount.end(); ++itc)

{
if (mincount > itc->count)
mincount = itc->count;
}
// show all filtered words
if (wordindex.empty())
return 0;

// skip leading filtered words
vector< int >::iterator w = wordindex.begin();
while(w != wordindex.end() && wordcount[*w].count == mincount)
++w;
if (w == wordindex.end())
return 0;


/**//* debug use
for (vector< Pair >::iterator iw = wordcount.begin(); iw != wordcount.end(); ++iw)
printf("%s %d\n", iw->word.c_str(), iw->count);
printf("mincount = %d\n", mincount);
/*/

// show first word, with no space after
printf("%s", wordcount[*w].word.c_str());
for (++w; w != wordindex.end(); ++w)

{
if (wordcount[*w].count != mincount)

{
printf(" %s", wordcount[*w].word.c_str());
}
}
printf("\n");
//*/
return 0;
}