#include <sys/time.h>

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif /* __STDC_FORMAT_MACROS */
#include <inttypes.h> // with the macro above defined, int64_t can be printed with "%" PRId64

#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>       // std::map
#include <set>       // std::set
#include <sstream>   // string streams
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;
/*
 * ERR_EXIT(m): report m through perror() and terminate the process with
 * EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is exactly one statement, which keeps it safe inside an
 * unbraced if/else. Every line of the definition except the last must end
 * in a line continuation; without them the macro expands to nothing and the
 * do/while is left at file scope.
 */
#define ERR_EXIT(m) \
    do { \
        perror(m); \
        exit(EXIT_FAILURE); \
    } while (0)
/**
 * Remove every punctuation character from `word`, in place.
 *
 * A character counts as punctuation when std::ispunct says so. The
 * characters that are kept stay in their original order.
 *
 * @param word  string to clean; shortened in place.
 */
void del_punct(std::string &word)
{
    // Single-pass compaction: copy each kept character down to the next free
    // slot, then cut off the tail. This avoids one erase() per punctuation
    // character and needs no index back-tracking.
    std::string::size_type kept = 0;
    for (std::string::size_type ix = 0; ix != word.size(); ++ix)
    {
        // ispunct() is only defined for values representable as unsigned
        // char (or EOF); a plain char may be negative for non-ASCII bytes.
        if (!std::ispunct(static_cast<unsigned char>(word[ix])))
        {
            word[kept] = word[ix];
            ++kept;
        }
    }
    word.resize(kept);
}
/**
 * Load the stop list: read `filename`, split it into whitespace-separated
 * tokens, strip punctuation from each token with del_punct() and insert the
 * result into `str`.
 *
 * Insertion is unconditional, so a token made only of punctuation ends up
 * in the set as the empty string.
 *
 * @param filename  path of the stop-list file.
 * @param str       set that receives the cleaned tokens.
 * @throws runtime_error if the file cannot be opened.
 */
void read_del_punct(const char* filename, std::set<std::string> &str)
{
    std::ifstream input(filename);
    if (!input)
        throw std::runtime_error("open file failed!");

    // Walk the file line by line and tokenise each line through a string
    // stream.
    for (std::string row; std::getline(input, row); )
    {
        std::istringstream tokens(row);
        for (std::string token; tokens >> token; )
        {
            del_punct(token);
            str.insert(token);
        }
    }
    // `input` is closed by its destructor on return.
}
/**
 * Tell whether `word` is a member of the stop list `str`.
 *
 * @param str   set of stop words.
 * @param word  word to look up (exact match).
 * @return true when `word` is present in `str`, false otherwise.
 */
bool in_stoplist(const std::set<std::string> &str, const std::string &word)
{
    // A set holds each key at most once, so count() is either 0 or 1.
    return str.count(word) != 0;
}
/**
 * Read the text file `filename` token by token (whitespace-separated),
 * strip punctuation from each token with del_punct(), and append every
 * remaining word that is not in the stop list to `words`.
 *
 * Tokens that are empty after punctuation removal (e.g. "--" or "...") are
 * skipped, so they are never stored as an empty-string word.
 *
 * @param filename  path of the text file to read.
 * @param str       stop list; words found in it are dropped.
 * @param words     receives the accepted words, in file order.
 * @throws runtime_error if the file cannot be opened.
 */
void read_del_punct(const char* filename, const std::set<std::string> &str, std::vector<std::string> &words)
{
    std::ifstream infile(filename);
    if (!infile)
        throw std::runtime_error("open file failed!");

    std::string word;
    while (infile >> word)
    {
        del_punct(word);
        if (word.empty())
            continue; // the token consisted only of punctuation
        if (!in_stoplist(str, word))
        {
            words.push_back(word);
        }
    }
}
/**
 * Count word occurrences: for each element of `words`, increment its
 * counter in `M`.
 *
 * Counters already present in `M` are incremented further; a word seen for
 * the first time starts from a value-initialised (zero) counter.
 *
 * @param words  words to count.
 * @param M      word -> occurrence count, updated in place.
 */
void map_creat(std::vector<std::string> &words, std::map<std::string,int> &M)
{
    const std::vector<std::string>::size_type total = words.size();
    for (std::vector<std::string>::size_type pos = 0; pos < total; ++pos)
    {
        M[words[pos]] += 1;
    }
}
/**
 * Print every entry of `N` to standard output, one per line, formatted as
 * "<value> : <key>". Entries come out in the map's own order, which is
 * descending by key because of the greater<int> comparator.
 *
 * @param N  map to print; not modified.
 */
void map_print(std::map<int,std::string, std::greater<int> > &N)
{
    // The iterator type must name the same comparator as the container;
    // map<int,string>::iterator is a different map's iterator and is only
    // interchangeable by accident of the library implementation.
    typedef std::map<int, std::string, std::greater<int> >::const_iterator EntryIter;

    for (EntryIter it = N.begin(); it != N.end(); ++it)
    {
        std::cout << it->second << " : " << it->first << '\n';
    }
    // Flush once at the end instead of once per line.
    std::cout.flush();
}
/**
 * Build a count -> word(s) index from the word -> count map `M`.
 *
 * `N` has one entry per distinct count. Several words can share the same
 * count; plain assignment would keep only the last of them, so words with
 * an equal count are joined into one value, separated by ", ", in the
 * iteration order of `M` (ascending by word).
 *
 * An entry that already exists in `N` for a given count is extended in the
 * same way rather than replaced.
 *
 * @param M  word -> occurrence count; not modified.
 * @param N  count -> comma-separated words, ordered by descending count.
 */
void map_trans(std::map<std::string,int> &M, std::map<int, std::string, std::greater<int> > &N)
{
    typedef std::map<std::string, int>::const_iterator WordIter;
    typedef std::map<int, std::string, std::greater<int> > CountIndex;

    const WordIter last = M.end();
    for (WordIter it = M.begin(); it != last; ++it)
    {
        // insert() leaves an existing entry untouched and reports, through
        // the bool, whether a new entry was actually created.
        std::pair<CountIndex::iterator, bool> slot =
            N.insert(CountIndex::value_type(it->second, it->first));
        if (!slot.second)
        {
            slot.first->second += ", ";
            slot.first->second += it->first;
        }
    }
}
/**
 * Return the current time reported by gettimeofday(), expressed as a single
 * count of microseconds (seconds * 1,000,000 + microseconds).
 *
 * @return the timestamp in microseconds.
 * @throws runtime_error if gettimeofday() reports failure (returns -1).
 */
int64_t gettime ()
{
    struct timeval now = timeval(); // value-initialised: all fields zero
    if (gettimeofday(&now, NULL) == -1)
        throw std::runtime_error("gettimeofday");

    // Widen the seconds to 64 bits before scaling: multiplying in the width
    // of time_t overflows where time_t is 32 bits.
    const int64_t usec_per_sec = 1000 * 1000;
    return static_cast<int64_t>(now.tv_sec) * usec_per_sec + now.tv_usec;
}
/**
 * Word-frequency counter.
 *
 * Usage: <program> filename stoplist
 *
 * Reads the stop list (argv[2]) and the text file (argv[1]), counts how
 * often each remaining word occurs, re-indexes the counts by frequency and
 * prints the result. The elapsed time of each phase is printed in
 * microseconds.
 *
 * @return 0 on success, EXIT_FAILURE on a usage error or when a helper
 *         throws (for example because a file cannot be opened).
 */
int main(int argc, const char *argv[])
{
    if (argc < 3)
    {
        // perror()/ERR_EXIT would be wrong here: no system call has failed,
        // so errno holds nothing useful to report.
        fprintf(stderr, "Usage : %s filename stoplist\n", argv[0]);
        return EXIT_FAILURE;
    }

    try
    {
        int64_t start, finish;
        std::vector<std::string> words;
        std::set<std::string> str;

        start = gettime();
        read_del_punct(argv[2], str);        // load the stop list
        read_del_punct(argv[1], str, words); // load the text, filtered by the stop list
        finish = gettime();
        std::cout << "读取文件花费 " << finish - start << " us" << std::endl;

        // Counting is keyed by word. Ordering by frequency is done in a
        // separate pass afterwards, once the counts no longer change.
        std::map<std::string, int> M;
        start = gettime();
        map_creat(words, M);
        finish = gettime();
        std::cout << "读取单词入map花费 " << finish - start << " us" << std::endl;

        std::map<int, std::string, std::greater<int> > N;
        start = gettime();
        map_trans(M, N);
        finish = gettime();
        std::cout << "转置map花费 " << finish - start << " us" << std::endl;

        map_print(N);
    }
    catch (const std::exception &e)
    {
        // The helpers signal failure by throwing runtime_error; report it
        // and exit with a failure status instead of aborting.
        fprintf(stderr, "%s\n", e.what());
        return EXIT_FAILURE;
    }
    return 0;
}