把另一种解法也贴出来吧(由于题目的背景是在大量数据中查找少数关键字,效率不如前面的解法)。
基本思路:将 关键字 从1开始编号, 对分词后源词组,进行映射,如果是关键字,就映射为该关键字的编号,否则映射为数字0
用两个指针,一前一后,很容易就可以实现在O(n)时间内找出结果。
1
2 #include<iostream>
3 #include<vector>
4 #include<string>
5 #include<algorithm>
6 #include<set>
7
8 #define USE_CPLUSPLUS0x 1
9
10 #if USE_CPLUSPLUS0x
11 #include<unordered_map>
12 #else
13 #include<map>
14 #endif
15
16 #include<cassert>
17
18 using std::string;
19 using std::vector;
20 using std::cout;
21
22
23 #if USE_CPLUSPLUS0x
24 struct Cmp{
25 bool operator()(const string* a, const string* b) const { return *a == *b; }
26 } ;
27
28 struct Hash{
29 size_t operator()(const string* a) const { return std::hash<string>()(*a);}
30 } ;
31
32 typedef std::unordered_map<const string*, size_t, Hash, Cmp> Dict;
33 #else
34 struct Cmp{
35 bool operator()(const string* sa, const string * sb) const { return *sa < *sb; }
36 } ;
37
38 typedef std::map<const string*, size_t, Cmp> Dict;
39 #endif
40
41 template<typename T>
42 void output(T beg, T end, const char str[] = "", const char sep[] = "")
43 {
44 cout << str;
45 while (beg != end) cout << *beg++ << sep;
46 cout << "\n";
47 }
48
49 //方法一,先扫一遍找出原字符串包含的关键词数,再扫两遍,找出所求结果
50 void solve(const string str[], const size_t str_sz,
51 const string keyword[], const size_t keyword_sz)
52 {
53 if (str_sz == 0 || keyword_sz == 0) return;
54
55 vector<size_t> old_pos; //分词后的字符串,在原字符串中的位置
56 old_pos.reserve(str_sz + 1);
57 old_pos.push_back(0);
58 for (size_t i = 0, sum = 0; i < str_sz; ++i) {
59 sum += str[i].size();
60 old_pos.push_back(sum);
61 }
62
63 Dict key_idx; //关键字映射为数字,以减少字符串比较
64 for (size_t i = 0; i < keyword_sz; ++i)
65 key_idx.insert(Dict::value_type(&keyword[i], i + 1));
66
67 typedef size_t User_size_type;
68 vector<User_size_type> str_idx(str_sz); //将原字符串映射为数字
69 vector<User_size_type> keyword_count(keyword_sz + 1);
70
71 size_t matched = 0, pos = 0;
72 for (size_t i = 0; i < str_sz; ++i) {
73 Dict::iterator it = key_idx.find(&str[i]);
74 if (it == key_idx.end()) { str_idx[i] = 0; continue; }
75 size_t idx = (str_idx[i] = it->second); //将原字符串映射为数字
76 if (++keyword_count[idx] == 1) { ++matched; pos = idx;}
77 }
78
79 size_t beg = 0, end = 0, len = -1; //记录结果
80 if (matched > 0) {
81 std::fill(keyword_count.begin(), keyword_count.end(), 0);
82 for (size_t high = 0; high < pos; ++high) ++keyword_count[str_idx[high]] ;
83 for (size_t high = pos, low = 0; high < str_sz; ++high) {
84 size_t idx = str_idx[high]; //将原字符串映射为数字
85 if (idx == 0) continue;
86 ++keyword_count[idx];
87 for (; ;++low) {
88 size_t idx = str_idx[low];
89 if (idx == 0) continue;
90 assert(idx <= keyword_sz);
91 if (keyword_count[idx] == 1) break;
92 --keyword_count[idx];
93 }
94 size_t plen = old_pos[high] - old_pos[low];
95 if (plen < len) { len = plen; beg = low; end = high + 1; }
96 }
97 }
98
99 output(&str[0], &str[str_sz], "words: ");
100 output(&keyword[0], &keyword[keyword_sz], "keywords: ", " ");
101 if (matched > 0 && matched < keyword_sz)
102 cout << "(" << matched << "/" << keyword_sz << " Matched) ";
103 output(&str[beg], &str[end], "result: ");
104 cout << "\n";
105 }
106
107
108 //方法二,先扫两遍尝试找出结果,若原字符串没有包含所有关键词,则再扫两遍
109 void solve2(const string str[], const size_t str_sz,
110 const string keyword[], const size_t keyword_sz)
111 {
112 if (str_sz == 0 || keyword_sz == 0) return;
113
114 vector<size_t> old_pos; //分词后的字符串,在原字符串中的位置
115 old_pos.reserve(str_sz + 1);
116 old_pos.push_back(0);
117 for (size_t i = 0, sum = 0; i < str_sz; ++i) {
118 sum += str[i].size();
119 old_pos.push_back(sum);
120 }
121
122 Dict key_idx; //关键字映射为数字,以减少字符串比较
123 for (size_t i = 0; i < keyword_sz; ++i)
124 key_idx.insert(Dict::value_type(&keyword[i], i + 1));
125
126 typedef size_t User_size_type;
127 vector<User_size_type> str_idx(str_sz); //将原字符串映射为数字
128 vector<User_size_type> keyword_count(keyword_sz + 1);
129
130 size_t unmatched = keyword_sz, pos = 0;
131 size_t beg = 0, end = 0, len = -1; //记录结果
132 for (size_t high = 0, low = 0; high < str_sz; ++high) {
133 Dict::iterator it = key_idx.find(&str[high]);
134 if (it == key_idx.end()) { str_idx[high] = 0; continue; }
135 size_t idx = (str_idx[high] = it->second); //将原字符串映射为数字
136
137 if (++keyword_count[idx] == 1) { --unmatched; pos = high; }
138
139 if (unmatched == 0) {
140 for (; ;++low) {
141 size_t k = str_idx[low];
142 if (k == 0) continue;
143 assert(k <= keyword_sz);
144 if (keyword_count[k] == 1) break;
145 --keyword_count[k];
146 }
147 size_t plen = old_pos[high] - old_pos[low];
148 if (plen < len) { len = plen; beg = low; end = high + 1; }
149 }
150 }
151
152 size_t matched = keyword_sz - unmatched;
153 if (matched != 0 && matched < keyword_sz) { //没找到所有关键字
154 std::fill(keyword_count.begin(), keyword_count.end(), 0);
155 for (size_t high = 0; high < pos; ++high) ++keyword_count[str_idx[high]] ;
156 for (size_t high = pos, low = 0; high < str_sz; ++high) {
157 size_t idx = str_idx[high]; //将原字符串映射为数字
158 if (idx == 0) continue;
159 ++keyword_count[idx];
160 for (; ;++low) {
161 size_t idx = str_idx[low];
162 if (idx == 0) continue;
163 assert(idx <= keyword_sz);
164 if (keyword_count[idx] == 1) break;
165 --keyword_count[idx];
166 }
167 size_t plen = old_pos[high] - old_pos[low];
168 if (plen < len) { len = plen; beg = low; end = high + 1; }
169 }
170 }
171
172 output(&str[0], &str[str_sz], "words: ");
173 output(&keyword[0], &keyword[keyword_sz], "keywords: ", " ");
174 if (matched > 0 && matched < keyword_sz)
175 cout << "(" << matched << "/" << keyword_sz << " Matched) ";
176 output(&str[beg], &str[end], "result: ");
177 cout << "\n";
178 }
179
180
181 template<typename T, size_t sz>
182 inline size_t getsz(T (&)[sz]) { return sz; }
183
184 int main()
185 {
186 string keyword[] = { "微软", "计算机", "亚洲", "中国"};
187 string str[] = {
188 "微软","亚洲","研究院","成立","于","1998","年",",","我们","的","使命",
189 "是","使","未来","的","计算机","能够","看","、","听","、","学",",",
190 "能","用","自然语言","与","人类","进行","交流","。","在","此","基础","上",
191 ",","微软","亚洲","研究院","还","将","促进","计算机","在","亚太","地区",
192 "的","普及",",","改善","亚太","用户","的","计算","体验","。","”"
193 };
194 solve(str, getsz(str), keyword, getsz(keyword));
195 std::cout << "\n";
196 solve2(str, getsz(str), keyword, getsz(keyword));
197 }
198
199