不知道什么是epsilon-NFA的话先看
这里。
从正则表达式生成epsilon-NFA其实跟将一棵树变换成另一棵树是类似的。epsilon转换提供了一种工具让我们可以把一个图表达成漂亮的形式,看起来就有典型的递归结构。因此这个工作依然可以用RegexExpressionAlgorithm这个visitor模式的产物来解决:
1 class EpsilonNfaInfo
2 {
3 public:
4 Automaton::Ref automaton;
5 };
6
7 class EpsilonNfa
8 {
9 public:
10 State* start;
11 State* end;
12
13 EpsilonNfa()
14 {
15 start=0;
16 end=0;
17 }
18 };
19
20 class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
21 {
22 public:
23 EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
24 {
25 if(a.start)
26 {
27 target->NewEpsilon(a.end, b.start);
28 a.end=b.end;
29 return a;
30 }
31 else
32 {
33 return b;
34 }
35 }
36
37 EpsilonNfa Apply(CharSetExpression* expression, Automaton* target)
38 {
39 EpsilonNfa nfa;
40 nfa.start=target->NewState();
41 nfa.end=target->NewState();
42 for(int i=0;i<expression->ranges.Count();i++)
43 {
44 target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
45 }
46 return nfa;
47 }
48
49 EpsilonNfa Apply(LoopExpression* expression, Automaton* target)
50 {
51 EpsilonNfa head;
52 for(int i=0;i<expression->min;i++)
53 {
54 EpsilonNfa body=Invoke(expression->expression, target);
55 head=Connect(head, body, target);
56 }
57 if(expression->max==-1)
58 {
59 EpsilonNfa body=Invoke(expression->expression, target);
60 if(!head.start)
61 {
62 head.start=head.end=target->NewState();
63 }
64 target->NewEpsilon(head.end, body.start);
65 target->NewEpsilon(body.end, head.end);
66 }
67 else if(expression->max>expression->min)
68 {
69 for(int i=expression->min;i<expression->max;i++)
70 {
71 EpsilonNfa body=Invoke(expression->expression, target);
72 target->NewEpsilon(body.start, body.end);
73 head=Connect(head, body, target);
74 }
75 }
76 return head;
77 }
78
79 EpsilonNfa Apply(SequenceExpression* expression, Automaton* target)
80 {
81 EpsilonNfa a=Invoke(expression->left, target);
82 EpsilonNfa b=Invoke(expression->right, target);
83 return Connect(a, b, target);
84 }
85
86 EpsilonNfa Apply(AlternateExpression* expression, Automaton* target)
87 {
88 EpsilonNfa result;
89 result.start=target->NewState();
90 result.end=target->NewState();
91 EpsilonNfa a=Invoke(expression->left, target);
92 EpsilonNfa b=Invoke(expression->right, target);
93 target->NewEpsilon(result.start, a.start);
94 target->NewEpsilon(a.end, result.end);
95 target->NewEpsilon(result.start, b.start);
96 target->NewEpsilon(b.end, result.end);
97 return result;
98 }
99
100 EpsilonNfa Apply(BeginExpression* expression, Automaton* target)
101 {
102 EpsilonNfa result;
103 result.start=target->NewState();
104 result.end=target->NewState();
105 target->NewBeginString(result.start, result.end);
106 return result;
107 }
108
109 EpsilonNfa Apply(EndExpression* expression, Automaton* target)
110 {
111 EpsilonNfa result;
112 result.start=target->NewState();
113 result.end=target->NewState();
114 target->NewEndString(result.start, result.end);
115 return result;
116 }
117
118 EpsilonNfa Apply(CaptureExpression* expression, Automaton* target)
119 {
120 EpsilonNfa result;
121 result.start=target->NewState();
122 result.end=target->NewState();
123
124 int capture=-1;
125 if(expression->name!=L"")
126 {
127 capture=target->captureNames.IndexOf(expression->name);
128 if(capture==-1)
129 {
130 capture=target->captureNames.Count();
131 target->captureNames.Add(expression->name);
132 }
133 }
134
135 EpsilonNfa body=Invoke(expression->expression, target);
136 target->NewCapture(result.start, body.start, capture);
137 target->NewEnd(body.end, result.end);
138 return result;
139 }
140
141 EpsilonNfa Apply(MatchExpression* expression, Automaton* target)
142 {
143 int capture=-1;
144 if(expression->name!=L"")
145 {
146 capture=target->captureNames.IndexOf(expression->name);
147 if(capture==-1)
148 {
149 capture=target->captureNames.Count();
150 target->captureNames.Add(expression->name);
151 }
152 }
153 EpsilonNfa result;
154 result.start=target->NewState();
155 result.end=target->NewState();
156 target->NewMatch(result.start, result.end, capture, expression->index);
157 return result;
158 }
159
160 EpsilonNfa Apply(PositiveExpression* expression, Automaton* target)
161 {
162 EpsilonNfa result;
163 result.start=target->NewState();
164 result.end=target->NewState();
165 EpsilonNfa body=Invoke(expression->expression, target);
166 target->NewPositive(result.start, body.start);
167 target->NewEnd(body.end, result.end);
168 return result;
169 }
170
171 EpsilonNfa Apply(NegativeExpression* expression, Automaton* target)
172 {
173 EpsilonNfa result;
174 result.start=target->NewState();
175 result.end=target->NewState();
176 EpsilonNfa body=Invoke(expression->expression, target);
177 target->NewNegative(result.start, body.start);
178 target->NewEnd(body.end, result.end);
179 return result;
180 }
181
182 EpsilonNfa Apply(UsingExpression* expression, Automaton* target)
183 {
184 CHECK_ERROR(false, L"RegexExpression::GenerateEpsilonNfa()#UsingExpression不能产生状态机。");
185 }
186 };
当然,为了让上面的代码成功,我们还需要一个描述状态机的结构:
1 /***********************************************************************
2 Vczh Library++ 3.0
3 Developer: 陈梓瀚(vczh)
4 Regex::RegexAutomaton
5
6 Classes:
7 ***********************************************************************/
8
9 #ifndef VCZH_REGEX_REGEXAUTOMATON
10 #define VCZH_REGEX_REGEXAUTOMATON
11
12 #include "RegexData.h"
13
14 namespace vl
15 {
16 namespace regex_internal
17 {
18 class State;
19 class Transition;
20
21 class Transition
22 {
23 public:
24 enum Type
25 {
26 Chars, //range为字符范围
27 Epsilon,
28 BeginString,
29 EndString,
30 Capture, //capture为捕获频道
31 Match, //capture为捕获频道,index为匹配的位置,-1代表匹配频道下面的所有项目
32 Positive, //
33 Negative, //
34 End //Capture, Position, Negative
35 };
36
37 State* source;
38 State* target;
39 CharRange range;
40 Type type;
41 int capture;
42 int index;
43 };
44
45 class State
46 {
47 public:
48 List<Transition*> transitions;
49 List<Transition*> inputs;
50 bool finalState;
51 };
52
53 class Automaton
54 {
55 public:
56 typedef Ptr<Automaton> Ref;
57
58 List<Ptr<State>> states;
59 List<Ptr<Transition>> transitions;
60 List<WString> captureNames;
61 State* startState;
62
63 Automaton();
64
65 State* NewState();
66 Transition* NewTransition(State* start, State* end);
67 Transition* NewChars(State* start, State* end, CharRange range);
68 Transition* NewEpsilon(State* start, State* end);
69 Transition* NewBeginString(State* start, State* end);
70 Transition* NewEndString(State* start, State* end);
71 Transition* NewCapture(State* start, State* end, int capture);
72 Transition* NewMatch(State* start, State* end, int capture, int index=-1);
73 Transition* NewPositive(State* start, State* end);
74 Transition* NewNegative(State* start, State* end);
75 Transition* NewEnd(State* start, State* end);
76 };
77 }
78 }
79
80 #endif
这里Automaton::New×××都是一些辅助建立图结构的函数,就不详细描述了。于是让我们看一个例子,譬如说分析一个字符串是不是C语言的注释:
/\*([^*]|\*+[^*/])*\*+/:
1 [START]State<0>
2 To State<1> : <Char : 47[/] - 47[/]>
3 State<1>
4 To State<2> : <Epsilon>
5 State<2>
6 To State<3> : <Char : 42[*] - 42[*]>
7 State<3>
8 To State<14> : <Epsilon>
9 State<4>
10 To State<6> : <Epsilon>
11 To State<8> : <Epsilon>
12 State<5>
13 To State<14> : <Epsilon>
14 State<6>
15 To State<7> : <Char : 1[] - 41[)]>
16 To State<7> : <Char : 43[+] - 46[.]>
17 To State<7> : <Char : 47[/] - 47[/]>
18 To State<7> : <Char : 48[0] - 65535?[]>
19 State<7>
20 To State<5> : <Epsilon>
21 State<8>
22 To State<9> : <Char : 42[*] - 42[*]>
23 State<9>
24 To State<10> : <Epsilon>
25 To State<12> : <Epsilon>
26 State<10>
27 To State<11> : <Char : 42[*] - 42[*]>
28 State<11>
29 To State<9> : <Epsilon>
30 State<12>
31 To State<13> : <Char : 1[] - 41[)]>
32 To State<13> : <Char : 43[+] - 46[.]>
33 To State<13> : <Char : 48[0] - 65535[]>
34 State<13>
35 To State<5> : <Epsilon>
36 State<14>
37 To State<4> : <Epsilon>
38 To State<15> : <Epsilon>
39 State<15>
40 To State<16> : <Char : 42[*] - 42[*]>
41 State<16>
42 To State<17> : <Epsilon>
43 To State<19> : <Epsilon>
44 State<17>
45 To State<18> : <Char : 42[*] - 42[*]>
46 State<18>
47 To State<16> : <Epsilon>
48 State<19>
49 To State<20> : <Char : 47[/] - 47[/]>
50 [FINISH]State<20>
51
posted on 2009-10-24 00:23
陈梓瀚(vczh) 阅读(2263)
评论(5) 编辑 收藏 引用 所属分类:
VL++3.0开发纪事