原本是因学习《Accelerated C++》写的练习代码,想到今后实现HuffmanTree的时候可以用到,所以先深入研究一下统计词频的方法
本文讨论使用标准C++语言来统计英文单词出现次数。
最初的形式用vector来构建(比较笨拙):
1
#include <iostream>
2
#include <string>
3
#include <vector>
4
#include <iomanip>
5
6
using std::string; using std::vector;
7
using std::cout; using std::cin;
8
using std::endl; using std::streamsize;
9
using std::setw;
10
11
int main()
12

{
13
string word;
14
string sep(" ,.?!:\"\t\n");
15
vector<string> word_set;
16
vector<int> word_num;
17
bool flag;
18
typedef vector<string>::size_type vect_sz;
19
vect_sz size = 0;
20
string::size_type pos = 0;
21
cout << "Input the word set" << endl;
22
while (cin >> word)
{
23
pos = word.find_first_of(sep);
24
if (pos != 0 && pos != word.size())
{
25
string tmp(word.substr(0, pos));
26
word = tmp;
27
} else if (pos == 0)
{
28
string::size_type pos2 = word.find_first_not_of(sep, 1);
29
pos = word.find_first_of(sep, pos2+1);
30
string tmp(word.substr(pos2, pos-1));
31
word = tmp;
32
}
33
size = word_set.size();
34
flag = true;
35
for (int i=0; i!=size; ++i)
{
36
if (word_set[i] == word)
{
37
++word_num[i];
38
flag = false;
39
break;
40
}
41
}
42
if (flag)
{
43
word_set.push_back(word);
44
word_num.push_back(1);
45
}
46
}
47
size = word_set.size();
48
if (size != word_num.size())
{
49
cout << "error occur!" << endl;
50
return -1;
51
}
52
streamsize prev = cout.width();
53
for (int i=0; i<size; ++i)
{
54
cout << "The word \"" << setw(10) << word_set[i]
55
<< setw(prev) << "\" appears " << word_num[i]
56
<< " times." << endl;
57
}
58
return 0;
59
}
60
用vector来存储有个非常不方便的在于单词和其出现的次数不是一个整体,而是分在两个容器中,靠序号上的一一对应来建立联系的。
在这里用vector作为存储结构并不合适。
map作为键-值对,更适合作为“单词-出现次数”的存储结构
改进后的代码如下(比vector版少了好几行代码,代码不是越多越好哈亲:D)
1
#include <iostream>
2
#include <string>
3
#include <map>
4
#include <utility>
5
#include <iomanip>
6
7
using std::string; using std::map;
8
using std::cout; using std::cin;
9
using std::endl; using std::streamsize;
10
using std::setw; using std::pair;
11
12
int main()
13

{
14
string word;
15
string sep(",.?!:\"\' \t\n");
16
map<const string, unsigned int> word_count;
17
string::size_type pos = 0;
18
cout << "Input the word set" << endl;
19
while (cin >> word)
{
20
pos = word.find_first_of(sep);
21
if (pos != 0 && pos != word.size())
{
22
string tmp(word.substr(0, pos));
23
word = tmp;
24
} else if (pos == 0)
{
25
string::size_type pos2 = word.find_first_not_of(sep, 1);
26
pos = word.find_first_of(sep, pos2+1);
27
string tmp(word.substr(pos2, pos-1));
28
word = tmp;
29
}
30
// ++word_count[word];
31
pair<map<const string, unsigned int>::iterator, bool>ret =
32
word_count.insert(make_pair(word, 1));
33
if (!ret.second)
{ // word already exists
34
++ret.first->second; // increment counter
35
}
36
}
37
streamsize prev = cout.width();
38
typedef map<const string, unsigned int>::const_iterator map_const_iterator;
39
map_const_iterator map_iter = word_count.begin();
40
for (; map_iter!=word_count.end(); ++map_iter)
{
41
cout << "The word \"" << setw(10) << map_iter->first
42
<< setw(prev) << "\" appears " << map_iter->second
43
<< " times." << endl;
44
}
45
return 0;
46
} 为了增加普适性,还准备扩充对文本文档的读取功能,未完待续