原本是因学习《Accelerated C++》写的练习代码,想到今后实现HuffmanTree的时候可以用到,所以先深入研究一下统计词频的方法
本文讨论使用标准C++语言来统计英文单词出现次数。
最初的形式用vector来构建(比较笨拙):
1#include <iostream>
2#include <string>
3#include <vector>
4#include <iomanip>
5
6using std::string; using std::vector;
7using std::cout; using std::cin;
8using std::endl; using std::streamsize;
9using std::setw;
10
11int main()
12{
13 string word;
14 string sep(" ,.?!:\"\t\n");
15 vector<string> word_set;
16 vector<int> word_num;
17 bool flag;
18 typedef vector<string>::size_type vect_sz;
19 vect_sz size = 0;
20 string::size_type pos = 0;
21 cout << "Input the word set" << endl;
22 while (cin >> word) {
23 pos = word.find_first_of(sep);
24 if (pos != 0 && pos != word.size()) {
25 string tmp(word.substr(0, pos));
26 word = tmp;
27 } else if (pos == 0){
28 string::size_type pos2 = word.find_first_not_of(sep, 1);
29 pos = word.find_first_of(sep, pos2+1);
30 string tmp(word.substr(pos2, pos-1));
31 word = tmp;
32 }
33 size = word_set.size();
34 flag = true;
35 for (int i=0; i!=size; ++i) {
36 if (word_set[i] == word) {
37 ++word_num[i];
38 flag = false;
39 break;
40 }
41 }
42 if (flag) {
43 word_set.push_back(word);
44 word_num.push_back(1);
45 }
46 }
47 size = word_set.size();
48 if (size != word_num.size()) {
49 cout << "error occur!" << endl;
50 return -1;
51 }
52 streamsize prev = cout.width();
53 for (int i=0; i<size; ++i) {
54 cout << "The word \"" << setw(10) << word_set[i]
55 << setw(prev) << "\" appears " << word_num[i]
56 << " times." << endl;
57 }
58 return 0;
59}
60 用vector来存储有个非常不方便的在于单词和其出现的次数不是一个整体,而是分在两个容器中,靠序号上的一一对应来建立联系的。
在这里用vector作为存储结构并不合适。
map作为键-值对,更适合作为“单词-出现次数”的存储结构
改进后的代码如下(比vector版少了好几行代码,代码不是越多越好哈亲:D)
1#include <iostream>
2#include <string>
3#include <map>
4#include <utility>
5#include <iomanip>
6
7using std::string; using std::map;
8using std::cout; using std::cin;
9using std::endl; using std::streamsize;
10using std::setw; using std::pair;
11
12int main()
13{
14 string word;
15 string sep(",.?!:\"\' \t\n");
16 map<const string, unsigned int> word_count;
17 string::size_type pos = 0;
18 cout << "Input the word set" << endl;
19 while (cin >> word) {
20 pos = word.find_first_of(sep);
21 if (pos != 0 && pos != word.size()) {
22 string tmp(word.substr(0, pos));
23 word = tmp;
24 } else if (pos == 0){
25 string::size_type pos2 = word.find_first_not_of(sep, 1);
26 pos = word.find_first_of(sep, pos2+1);
27 string tmp(word.substr(pos2, pos-1));
28 word = tmp;
29 }
30 // ++word_count[word];
31 pair<map<const string, unsigned int>::iterator, bool>ret =
32 word_count.insert(make_pair(word, 1));
33 if (!ret.second) { // word already exists
34 ++ret.first->second; // increment counter
35 }
36 }
37 streamsize prev = cout.width();
38 typedef map<const string, unsigned int>::const_iterator map_const_iterator;
39 map_const_iterator map_iter = word_count.begin();
40 for (; map_iter!=word_count.end(); ++map_iter) {
41 cout << "The word \"" << setw(10) << map_iter->first
42 << setw(prev) << "\" appears " << map_iter->second
43 << " times." << endl;
44 }
45 return 0;
46} 为了增加普适性,还准备扩充对文本文档的读取功能,未完待续