DFA与捕获和预查结合起来的话很麻烦,不能用一张表来迭代,而是要用回溯,然后在回溯的时候记下状态。至此正则表达式的所有算法都完成了,接下来是详细介绍。
每一个回溯的记录都需要一张带有所有捕获轨迹的表(元素为堆栈的堆栈),给每一个记录集一张表显然很浪费,因此我们只用一张表来记录。但是我们对表进行删除项目的时候,又可能会影响到上一个回溯的数据,因此我们要开发出一个新的堆栈,可以push,可以pop,而且pop之后再push不会破坏原有的东西,一直到外边显示指定跳到前一个状态为止:
1 template<typename T, typename K>
2 void Push(List<T, K>& elements, int& available, int& count, const T& element)
3 {
4 if(elements.Count()==count)
5 {
6 elements.Add(element);
7 }
8 else
9 {
10 elements[count]=element;
11 }
12 T& current=elements[count];
13 current.previous=available;
14 available=count++;
15 }
16
17 template<typename T, typename K>
18 T Pop(List<T, K>& elements, int& available, int& count)
19 {
20 T& current=elements[available];
21 available=current.previous;
22 return current;
23 }
于是我们就可以对可以回溯的列表进行保存了。一开始available=-1,count=0。每次push的时候改变available和count,但是对于列表来说,0到count-1的东西都是安全的。pop的时候改变available,但是count保留不变,这个时候elements[count-1]可以通过elements[count-1].available(而不是参数的available)追溯到上一个项。这样就把所有存在的堆栈都压入到同一个列表里面去了,而且不同的堆栈可以通过不同的count访问,一直到比较小的count决定push东西了,大的count对应的堆栈才会受到破坏。于是我们就用一个列表和一个count表组成了一个元素是堆栈的堆栈。
知道怎么操作之后下面的匹配代码就很好理解了:
1 #include "RegexRich.h"
2
3 namespace vl
4 {
5 namespace regex_internal
6 {
7
8 /***********************************************************************
9 回溯辅助数据结构
10 ***********************************************************************/
11
12 class SaverBase
13 {
14 public:
15 bool available;
16 int previous;
17 };
18
19 class StateSaver
20 {
21 public:
22 enum StateStoreType
23 {
24 Positive,
25 Negative,
26 Other
27 };
28
29 const wchar_t* reading; //当前字符串位置
30 State* currentState; //当前状态
31 int minTransition; //最小可用转换
32 int captureCount; //有效capture数量
33 int stateSaverCount; //有效回溯状态数量
34 int extensionSaverAvailable; //有效未封闭扩展功能数量
35 int extensionSaverCount; //所有未封闭扩展功能数量
36 StateStoreType storeType; //保存状态的原因
37
38 bool operator==(const StateSaver& saver)
39 {
40 return
41 reading==saver.reading &&
42 currentState==saver.currentState &&
43 minTransition==saver.minTransition &&
44 captureCount==saver.captureCount;
45 }
46 };
47
48 class ExtensionSaver : public SaverBase
49 {
50 public:
51 int captureListIndex;
52 Transition* transition;
53 const wchar_t* reading;
54
55 bool operator==(const ExtensionSaver& saver)
56 {
57 return
58 captureListIndex==saver.captureListIndex &&
59 transition==saver.transition &&
60 reading==saver.reading;
61 }
62 };
63
64 template<typename T, typename K>
65 void Push(List<T, K>& elements, int& available, int& count, const T& element)
66 {
67 if(elements.Count()==count)
68 {
69 elements.Add(element);
70 }
71 else
72 {
73 elements[count]=element;
74 }
75 T& current=elements[count];
76 current.previous=available;
77 available=count++;
78 }
79
80 template<typename T, typename K>
81 T Pop(List<T, K>& elements, int& available, int& count)
82 {
83 T& current=elements[available];
84 available=current.previous;
85 return current;
86 }
87
88 template<typename T, typename K>
89 void PushNonSaver(List<T, K>& elements, int& count, const T& element)
90 {
91 if(elements.Count()==count)
92 {
93 elements.Add(element);
94 }
95 else
96 {
97 elements[count]=element;
98 }
99 count++;
100 }
101
102 template<typename T, typename K>
103 T PopNonSaver(List<T, K>& elements, int& count)
104 {
105 return elements[--count];
106 }
107 }
108
109 template<>
110 struct POD<regex_internal::StateSaver>
111 {
112 static const bool Result=true;
113 };
114
115 template<>
116 struct POD<regex_internal::ExtensionSaver>
117 {
118 static const bool Result=true;
119 };
120
121 namespace regex_internal
122 {
123 /***********************************************************************
124 CaptureRecord
125 ***********************************************************************/
126
127 bool CaptureRecord::operator==(const CaptureRecord& record)
128 {
129 return capture==record.capture && start==record.start && length==record.length;
130 }
131
132 /***********************************************************************
133 PureInterpretor
134 ***********************************************************************/
135
136 RichInterpretor::RichInterpretor(Automaton::Ref _dfa)
137 :dfa(_dfa)
138 {
139 datas=new UserData[dfa->states.Count()];
140
141 for(int i=0;i<dfa->states.Count();i++)
142 {
143 State* state=dfa->states[i].Obj();
144 int charEdges=0;
145 int nonCharEdges=0;
146 bool mustSave=false;
147 for(int j=0;j<state->transitions.Count();j++)
148 {
149 if(state->transitions[j]->type==Transition::Chars)
150 {
151 charEdges++;
152 }
153 else
154 {
155 if(state->transitions[j]->type==Transition::Negative ||
156 state->transitions[j]->type==Transition::Positive)
157 {
158 mustSave=true;
159 }
160 nonCharEdges++;
161 }
162 }
163 datas[i].NeedKeepState=mustSave || nonCharEdges>1 || (nonCharEdges!=0 && charEdges!=0);
164 state->userData=&datas[i];
165 }
166 }
167
168 RichInterpretor::~RichInterpretor()
169 {
170 delete[] datas;
171 }
172
173 bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, Result& result)
174 {
175 List<StateSaver> stateSavers;
176 List<ExtensionSaver> extensionSavers;
177
178 StateSaver currentState;
179 currentState.captureCount=0;
180 currentState.currentState=dfa->startState;
181 currentState.extensionSaverAvailable=-1;
182 currentState.extensionSaverCount=0;
183 currentState.minTransition=0;
184 currentState.reading=input;
185 currentState.stateSaverCount=0;
186 currentState.storeType=StateSaver::Other;
187
188 while(!currentState.currentState->finalState)
189 {
190 bool found=false;
191 StateSaver oldState=currentState;
192 //开始遍历转换
193 for(int i=currentState.minTransition;i<currentState.currentState->transitions.Count();i++)
194 {
195 Transition* transition=currentState.currentState->transitions[i];
196 switch(transition->type)
197 {
198 case Transition::Chars:
199 {
200 CharRange range=transition->range;
201 found=
202 range.begin<=*currentState.reading &&
203 range.end>=*currentState.reading;
204 if(found)
205 {
206 currentState.reading++;
207 }
208 }
209 break;
210 case Transition::BeginString:
211 {
212 found=currentState.reading==start;
213 }
214 break;
215 case Transition::EndString:
216 {
217 found=*currentState.reading==L'\0';
218 }
219 break;
220 case Transition::Nop:
221 {
222 found=true;
223 }
224 break;
225 case Transition::Capture:
226 {
227 ExtensionSaver saver;
228 saver.captureListIndex=currentState.captureCount;
229 saver.reading=currentState.reading;
230 saver.transition=transition;
231 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
232
233 CaptureRecord capture;
234 capture.capture=transition->capture;
235 capture.start=currentState.reading-input;
236 capture.length=-1;
237 PushNonSaver(result.captures, currentState.captureCount, capture);
238
239 found=true;
240 }
241 break;
242 case Transition::Match:
243 {
244 int index=0;
245 for(int j=0;j<currentState.captureCount;j++)
246 {
247 CaptureRecord& capture=result.captures[j];
248 if(capture.capture==transition->capture)
249 {
250 if(capture.length!=-1 && (transition->index==-1 || transition->index==index))
251 {
252 if(wcsncmp(input+capture.start, currentState.reading, capture.length)==0)
253 {
254 currentState.reading+=capture.length;
255 found=true;
256 break;
257 }
258 }
259 if(transition->index!=-1 && index==transition->index)
260 {
261 break;
262 }
263 else
264 {
265 index++;
266 }
267 }
268 }
269 }
270 break;
271 case Transition::Positive:
272 {
273 ExtensionSaver saver;
274 saver.captureListIndex=-1;
275 saver.reading=currentState.reading;
276 saver.transition=transition;
277 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
278 //Positive的oldState一定会被push
279 oldState.storeType=StateSaver::Positive;
280 found=true;
281 }
282 break;
283 case Transition::Negative:
284 {
285 ExtensionSaver saver;
286 saver.captureListIndex=-1;
287 saver.reading=currentState.reading;
288 saver.transition=transition;
289 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
290 //Negative的oldState一定会被push
291 oldState.storeType=StateSaver::Negative;
292 found=true;
293 }
294 break;
295 case Transition::NegativeFail:
296 {
297 //只有在回溯的时候NegativeFail才会被考虑
298 }
299 break;
300 case Transition::End:
301 {
302 ExtensionSaver saver=Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
303 switch(saver.transition->type)
304 {
305 case Transition::Capture:
306 {
307 CaptureRecord& capture=result.captures[saver.captureListIndex];
308 capture.length=(currentState.reading-input)-capture.start;
309 found=true;
310 }
311 break;
312 case Transition::Positive:
313 for(int j=currentState.stateSaverCount-1;j>=0;j--)
314 {
315 StateSaver& saver=stateSavers[j];
316 if(saver.storeType==StateSaver::Positive)
317 {
318 oldState.reading=saver.reading;
319 oldState.stateSaverCount=j;
320 currentState.reading=saver.reading;
321 currentState.stateSaverCount=j;
322 break;
323 }
324 }
325 found=true;
326 break;
327 case Transition::Negative:
328 for(int j=currentState.stateSaverCount-1;j>=0;j--)
329 {
330 StateSaver& saver=stateSavers[j];
331 if(saver.storeType==StateSaver::Negative)
332 {
333 oldState=saver;
334 oldState.storeType=StateSaver::Other;
335 currentState=saver;
336 currentState.storeType=StateSaver::Other;
337 i=currentState.minTransition-1;
338 break;
339 }
340 }
341 break;
342 }
343 }
344 break;
345 }
346 //寻找成功,在必要的时候保存当前的回溯状态
347 if(found)
348 {
349 UserData* data=(UserData*)currentState.currentState->userData;
350 if(data->NeedKeepState)
351 {
352 oldState.minTransition=i+1;
353 PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
354 }
355 currentState.currentState=transition->target;
356 currentState.minTransition=0;
357 break;
358 }
359 }
360 if(!found)
361 {
362 //存在回溯记录则回溯
363 if(currentState.stateSaverCount)
364 {
365 //恢复Negative失败状态的时候要移动到NegativeFail后面
366 currentState=PopNonSaver(stateSavers, currentState.stateSaverCount);
367 //minTransition总是被+1后保存,因此直接-1总是有效值
368 if(currentState.currentState->transitions[currentState.minTransition-1]->type==Transition::Negative)
369 {
370 //寻找NegativeFail
371 for(int i=0;i<currentState.currentState->transitions.Count();i++)
372 {
373 Transition* transition=currentState.currentState->transitions[i];
374 if(transition->type==Transition::NegativeFail)
375 {
376 //将当前状态移动到NegativeFail后面
377 currentState.currentState=transition->target;
378 currentState.minTransition=0;
379 currentState.storeType=StateSaver::Other;
380 break;
381 }
382 }
383 }
384 }
385 else
386 {
387 break;
388 }
389 }
390 }
391
392 //判断是否成功并且处理返回结果
393 if(currentState.currentState->finalState)
394 {
395 result.start=0;
396 result.length=currentState.reading-input;
397 for(int i=result.captures.Count()-1;i>=currentState.captureCount;i++)
398 {
399 result.captures.RemoveAt(i);
400 }
401 return true;
402 }
403 else
404 {
405 result.captures.Clear();
406 return false;
407 }
408 }
409
410 bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, Result& result)
411 {
412 const wchar_t* read=input;
413 while(*read)
414 {
415 if(MatchHead(read, start, result))
416 {
417 result.start=read-input;
418 for(int i=0;i<result.captures.Count();i++)
419 {
420 result.captures[i].start+=result.start;
421 }
422 return true;
423 }
424 read++;
425 }
426 return false;
427 }
428
429 const IReadonlyList<WString>& RichInterpretor::CaptureNames()
430 {
431 return dfa->captureNames.Wrap();
432 }
433 }
434 }
于是带有各种扩展功能的匹配就完成了。之前还用同样的前端开发了一个不带扩展功能的纯匹配的正则表达式引擎。因为过于简单就不列出来了。这些算法的详细介绍可以看
这里。
下面的工作就是写更多的单元测试用例,最后将这两个不同的匹配算法封装在一起,智能地在正确的情况下选择更快的算法。这样正则表达式引擎的效率将会大大提高。
posted on 2009-11-14 19:13
陈梓瀚(vczh) 阅读(2462)
评论(1) 编辑 收藏 引用 所属分类:
VL++3.0开发纪事