随笔-341  评论-2670  文章-0  trackbacks-0
    不知道什么是epsilon-NFA的话先看这里

    从正则表达式生成epsilon-NFA其实跟将一棵树变换成另一棵树是类似的。epsilon转换提供了一种工具让我们可以把一个图表达成漂亮的形式,看起来就有典型的递归结构。因此这个工作依然可以用RegexExpressionAlgorithm这个visitor模式的产物来解决:
  1         class EpsilonNfaInfo
  2         {
  3         public:
  4             Automaton::Ref        automaton;
  5         };
  6 
  7         class EpsilonNfa
  8         {
  9         public:
 10             State*                start;
 11             State*                end;
 12 
 13             EpsilonNfa()
 14             {
 15                 start=0;
 16                 end=0;
 17             }
 18         };
 19 
 20         class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
 21         {
 22         public:
 23             EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
 24             {
 25                 if(a.start)
 26                 {
 27                     target->NewEpsilon(a.end, b.start);
 28                     a.end=b.end;
 29                     return a;
 30                 }
 31                 else
 32                 {
 33                     return b;
 34                 }
 35             }
 36 
 37             EpsilonNfa Apply(CharSetExpression* expression, Automaton* target)
 38             {
 39                 EpsilonNfa nfa;
 40                 nfa.start=target->NewState();
 41                 nfa.end=target->NewState();
 42                 for(int i=0;i<expression->ranges.Count();i++)
 43                 {
 44                     target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
 45                 }
 46                 return nfa;
 47             }
 48 
 49             EpsilonNfa Apply(LoopExpression* expression, Automaton* target)
 50             {
 51                 EpsilonNfa head;
 52                 for(int i=0;i<expression->min;i++)
 53                 {
 54                     EpsilonNfa body=Invoke(expression->expression, target);
 55                     head=Connect(head, body, target);
 56                 }
 57                 if(expression->max==-1)
 58                 {
 59                     EpsilonNfa body=Invoke(expression->expression, target);
 60                     if(!head.start)
 61                     {
 62                         head.start=head.end=target->NewState();
 63                     }
 64                     target->NewEpsilon(head.end, body.start);
 65                     target->NewEpsilon(body.end, head.end);
 66                 }
 67                 else if(expression->max>expression->min)
 68                 {
 69                     for(int i=expression->min;i<expression->max;i++)
 70                     {
 71                         EpsilonNfa body=Invoke(expression->expression, target);
 72                         target->NewEpsilon(body.start, body.end);
 73                         head=Connect(head, body, target);
 74                     }
 75                 }
 76                 return head;
 77             }
 78 
 79             EpsilonNfa Apply(SequenceExpression* expression, Automaton* target)
 80             {
 81                 EpsilonNfa a=Invoke(expression->left, target);
 82                 EpsilonNfa b=Invoke(expression->right, target);
 83                 return Connect(a, b, target);
 84             }
 85 
 86             EpsilonNfa Apply(AlternateExpression* expression, Automaton* target)
 87             {
 88                 EpsilonNfa result;
 89                 result.start=target->NewState();
 90                 result.end=target->NewState();
 91                 EpsilonNfa a=Invoke(expression->left, target);
 92                 EpsilonNfa b=Invoke(expression->right, target);
 93                 target->NewEpsilon(result.start, a.start);
 94                 target->NewEpsilon(a.end, result.end);
 95                 target->NewEpsilon(result.start, b.start);
 96                 target->NewEpsilon(b.end, result.end);
 97                 return result;
 98             }
 99 
100             EpsilonNfa Apply(BeginExpression* expression, Automaton* target)
101             {
102                 EpsilonNfa result;
103                 result.start=target->NewState();
104                 result.end=target->NewState();
105                 target->NewBeginString(result.start, result.end);
106                 return result;
107             }
108 
109             EpsilonNfa Apply(EndExpression* expression, Automaton* target)
110             {
111                 EpsilonNfa result;
112                 result.start=target->NewState();
113                 result.end=target->NewState();
114                 target->NewEndString(result.start, result.end);
115                 return result;
116             }
117 
118             EpsilonNfa Apply(CaptureExpression* expression, Automaton* target)
119             {
120                 EpsilonNfa result;
121                 result.start=target->NewState();
122                 result.end=target->NewState();
123 
124                 int capture=-1;
125                 if(expression->name!=L"")
126                 {
127                     capture=target->captureNames.IndexOf(expression->name);
128                     if(capture==-1)
129                     {
130                         capture=target->captureNames.Count();
131                         target->captureNames.Add(expression->name);
132                     }
133                 }
134 
135                 EpsilonNfa body=Invoke(expression->expression, target);
136                 target->NewCapture(result.start, body.start, capture);
137                 target->NewEnd(body.end, result.end);
138                 return result;
139             }
140 
141             EpsilonNfa Apply(MatchExpression* expression, Automaton* target)
142             {
143                 int capture=-1;
144                 if(expression->name!=L"")
145                 {
146                     capture=target->captureNames.IndexOf(expression->name);
147                     if(capture==-1)
148                     {
149                         capture=target->captureNames.Count();
150                         target->captureNames.Add(expression->name);
151                     }
152                 }
153                 EpsilonNfa result;
154                 result.start=target->NewState();
155                 result.end=target->NewState();
156                 target->NewMatch(result.start, result.end, capture, expression->index);
157                 return result;
158             }
159 
160             EpsilonNfa Apply(PositiveExpression* expression, Automaton* target)
161             {
162                 EpsilonNfa result;
163                 result.start=target->NewState();
164                 result.end=target->NewState();
165                 EpsilonNfa body=Invoke(expression->expression, target);
166                 target->NewPositive(result.start, body.start);
167                 target->NewEnd(body.end, result.end);
168                 return result;
169             }
170 
171             EpsilonNfa Apply(NegativeExpression* expression, Automaton* target)
172             {
173                 EpsilonNfa result;
174                 result.start=target->NewState();
175                 result.end=target->NewState();
176                 EpsilonNfa body=Invoke(expression->expression, target);
177                 target->NewNegative(result.start, body.start);
178                 target->NewEnd(body.end, result.end);
179                 return result;
180             }
181 
182             EpsilonNfa Apply(UsingExpression* expression, Automaton* target)
183             {
184                 CHECK_ERROR(false, L"RegexExpression::GenerateEpsilonNfa()#UsingExpression不能产生状态机。");
185             }
186         };

    当然,为了让上面的代码成功,我们还需要一个描述状态机的结构:
 1 /***********************************************************************
 2 Vczh Library++ 3.0
 3 Developer: 陈梓瀚(vczh)
 4 Regex::RegexAutomaton
 5 
 6 Classes:
 7 ***********************************************************************/
 8 
 9 #ifndef VCZH_REGEX_REGEXAUTOMATON
10 #define VCZH_REGEX_REGEXAUTOMATON
11 
12 #include "RegexData.h"
13 
14 namespace vl
15 {
16     namespace regex_internal
17     {
18         class State;
19         class Transition;
20 
21         class Transition
22         {
23         public:
24             enum Type
25             {
26                 Chars,                //range为字符范围
27                 Epsilon,
28                 BeginString,
29                 EndString,
30                 Capture,            //capture为捕获频道
31                 Match,                //capture为捕获频道,index为匹配的位置,-1代表匹配频道下面的所有项目
32                 Positive,            //
33                 Negative,            //
34                 End                    //Capture, Position, Negative
35             };
36 
37             State*                    source;
38             State*                    target;
39             CharRange                range;
40             Type                    type;
41             int                        capture;
42             int                        index;
43         };
44 
45         class State
46         {
47         public:
48             List<Transition*>        transitions;
49             List<Transition*>        inputs;
50             bool                    finalState;
51         };
52 
53         class Automaton
54         {
55         public:
56             typedef Ptr<Automaton>        Ref;
57 
58             List<Ptr<State>>        states;
59             List<Ptr<Transition>>    transitions;
60             List<WString>            captureNames;
61             State*                    startState;
62 
63             Automaton();
64 
65             State*                    NewState();
66             Transition*                NewTransition(State* start, State* end);
67             Transition*                NewChars(State* start, State* end, CharRange range);
68             Transition*                NewEpsilon(State* start, State* end);
69             Transition*                NewBeginString(State* start, State* end);
70             Transition*                NewEndString(State* start, State* end);
71             Transition*                NewCapture(State* start, State* end, int capture);
72             Transition*                NewMatch(State* start, State* end, int capture, int index=-1);
73             Transition*                NewPositive(State* start, State* end);
74             Transition*                NewNegative(State* start, State* end);
75             Transition*                NewEnd(State* start, State* end);
76         };
77     }
78 }
79 
80 #endif

    这里Automaton::New×××都是一些辅助建立图结构的函数,就不详细描述了。于是让我们看一个例子,譬如说分析一个字符串是不是C语言的注释:/\*([^*]|\*+[^*/])*\*+/
 1 [START]State<0>
 2     To State<1> : <Char : 47[/- 47[/]>
 3 State<1>
 4     To State<2> : <Epsilon>
 5 State<2>
 6     To State<3> : <Char : 42[*- 42[*]>
 7 State<3>
 8     To State<14> : <Epsilon>
 9 State<4>
10     To State<6> : <Epsilon>
11     To State<8> : <Epsilon>
12 State<5>
13     To State<14> : <Epsilon>
14 State<6>
15     To State<7> : <Char : 1[] - 41[)]>
16     To State<7> : <Char : 43[+- 46[.]>
17     To State<7> : <Char : 47[/- 47[/]>
18          To State<7> : <Char : 48[0- 65535?[]>
19 State<7>
20     To State<5> : <Epsilon>
21 State<8>
22     To State<9> : <Char : 42[*- 42[*]>
23 State<9>
24     To State<10> : <Epsilon>
25     To State<12> : <Epsilon>
26 State<10>
27     To State<11> : <Char : 42[*- 42[*]>
28 State<11>
29     To State<9> : <Epsilon>
30 State<12>
31     To State<13> : <Char : 1[] - 41[)]>
32     To State<13> : <Char : 43[+- 46[.]>
33     To State<13> : <Char : 48[0- 65535[]>
34 State<13>
35     To State<5> : <Epsilon>
36 State<14>
37     To State<4> : <Epsilon>
38     To State<15> : <Epsilon>
39 State<15>
40     To State<16> : <Char : 42[*- 42[*]>
41 State<16>
42     To State<17> : <Epsilon>
43     To State<19> : <Epsilon>
44 State<17>
45     To State<18> : <Char : 42[*- 42[*]>
46 State<18>
47     To State<16> : <Epsilon>
48 State<19>
49     To State<20> : <Char : 47[/- 47[/]>
50 [FINISH]State<20>
51 
posted on 2009-10-24 00:23 陈梓瀚(vczh) 阅读(2263) 评论(5)  编辑 收藏 引用 所属分类: VL++3.0开发纪事

评论:
# re: Vczh Library++3.0之正则表达式引擎(生成epsilon-NFA) 2009-10-24 06:49 | 彭小虎(Tigerkin)
V兄又开始高产了  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(生成epsilon-NFA) 2009-10-24 17:56 | 李佳
真牛 自己写正则表达式的解析引擎 ... 我用的还不大熟...  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(生成epsilon-NFA)[未登录] 2009-10-25 17:09 | jans2002
感谢如此精彩的正则引擎DIY教程。  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(生成epsilon-NFA) 2009-10-25 20:47 | 凡客诚品
感谢如此精彩的正则引擎DIY教程  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(生成epsilon-NFA) 2009-11-28 22:03 | radar
感谢如此精彩的正则引擎DIY教程  回复  更多评论
  

只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   博问   Chat2DB   管理