随笔-341  评论-2670  文章-0  trackbacks-0
    这几天正在着手开发一个基于Vczh Library++ 3.0基础之上的新的正则表达式引擎。这个东西跟以前的功能没什么大变化,只是最基础的那一层被全部改了,而且还不兼容,所以要重写。不过都是自己的东西,无所谓了,不兼容也没什么问题。

    下面介绍正则表达式引擎的语法分析过程。

    首先我们要定义正则表达式的语法。详细的规则我就不说了,贴一张单元测试的代码就清楚了:
  1 #include "..\..\Library\UnitTest\UnitTest.h"
  2 #include "..\..\Library\Regex\RegexExpression.h"
  3 #include "..\..\Library\Regex\RegexWriter.h"
  4 
  5 using namespace vl;
  6 using namespace vl::regex;
  7 using namespace vl::regex_internal;
  8 
  9 /***********************************************************************
 10 语法分析
 11 ***********************************************************************/
 12 
 13 void RegexAssert(const wchar_t* input, RegexNode node)
 14 {
 15     Expression::Ref exp=ParseExpression(input);
 16     TEST_ASSERT(exp->IsEqual(node.expression.Obj()));
 17 }
 18 
 19 TEST_CASE(TestRegexCharSetParsing)
 20 {
 21     RegexAssert(L"a", rC(L'a'));
 22     RegexAssert(L"vczh", rC(L'v')+rC(L'c')+rC(L'z')+rC(L'h'));
 23 
 24     RegexAssert(L"/d", r_d());
 25     RegexAssert(L"/l", r_l());
 26     RegexAssert(L"/w", r_w());
 27     RegexAssert(L"/D"!r_d());
 28     RegexAssert(L"/L"!r_l());
 29     RegexAssert(L"/W"!r_w());
 30     RegexAssert(L"/r", rC(L'\r'));
 31     RegexAssert(L"/n", rC(L'\n'));
 32     RegexAssert(L"/t", rC(L'\t'));
 33 
 34     RegexAssert(L"\\d", r_d());
 35     RegexAssert(L"\\l", r_l());
 36     RegexAssert(L"\\w", r_w());
 37     RegexAssert(L"\\D"!r_d());
 38     RegexAssert(L"\\L"!r_l());
 39     RegexAssert(L"\\W"!r_w());
 40     RegexAssert(L"\\r", rC(L'\r'));
 41     RegexAssert(L"\\n", rC(L'\n'));
 42     RegexAssert(L"\\t", rC(L'\t'));
 43 
 44     RegexAssert(L"^", rBegin());
 45     RegexAssert(L"$", rEnd());
 46     RegexAssert(L"[abc]", rC(L'a')%rC(L'b')%rC(L'c'));
 47     RegexAssert(L"[0-9]", r_d());
 48     RegexAssert(L"[a-zA-Z_]", r_l());
 49     RegexAssert(L"[a-zA-Z0-9_]", r_w());
 50     RegexAssert(L"[^0-9]"!r_d());
 51     RegexAssert(L"[^a-zA-Z_]"!r_l());
 52     RegexAssert(L"[^a-zA-Z0-9_]"!r_w());
 53 }
 54 
 55 TEST_CASE(TestRegexLoopParsing)
 56 {
 57     RegexAssert(L"/d+", r_d().Some());
 58     RegexAssert(L"/d*", r_d().Any());
 59     RegexAssert(L"/d?", r_d().Opt());
 60     RegexAssert(L"/d{3}", r_d().Loop(33));
 61     RegexAssert(L"/d{3,5}", r_d().Loop(35));
 62     RegexAssert(L"/d{4,}", r_d().AtLeast(4));
 63     
 64     RegexAssert(L"\\d+", r_d().Some());
 65     RegexAssert(L"\\d*", r_d().Any());
 66     RegexAssert(L"\\d?", r_d().Opt());
 67     RegexAssert(L"\\d{3}", r_d().Loop(33));
 68     RegexAssert(L"\\d{3,5}", r_d().Loop(35));
 69     RegexAssert(L"\\d{4,}", r_d().AtLeast(4));
 70 }
 71 
 72 TEST_CASE(TestRegexFunctionParsing)
 73 {
 74     RegexAssert(L"(<name>/d+)", rCapture(L"name", r_d().Some()));
 75     RegexAssert(L"(<&name>)", rUsing(L"name"));
 76     RegexAssert(L"(<$name>)", rMatch(L"name"));
 77     RegexAssert(L"(<$name;3>)", rMatch(L"name"3));
 78     RegexAssert(L"(<$3>)", rMatch(3));
 79     RegexAssert(L"(=/d+)"+r_d().Some());
 80     RegexAssert(L"(!/d+)"-r_d().Some());
 81     RegexAssert(L"(=\\d+)"+r_d().Some());
 82     RegexAssert(L"(!\\d+)"-r_d().Some());
 83 }
 84 
 85 TEST_CASE(TestRegexComplexParsing)
 86 {
 87     RegexAssert(L"a+(bc)*", rC(L'a').Some()+(rC(L'b')+rC(L'c')).Any());
 88     RegexAssert(L"(1+2)*(3+4)", (rC(L'1').Some()+rC(L'2')).Any()+(rC(L'3').Some()+rC(L'4')));
 89     RegexAssert(L"[a-zA-Z_][a-zA-Z0-9_]*", r_l()+r_w().Any());
 90     RegexAssert(L"((<part>/d+).){3}(<part>/d+)", (rCapture(L"part", r_d().Some())+rC(L'.')).Loop(33)+rCapture(L"part", r_d().Some()));
 91     RegexAssert(L"ab|ac", (rC(L'a')+rC(L'b'))|(rC(L'a')+rC(L'c')));
 92     RegexAssert(L"a(b|c)", rC(L'a')+(rC(L'b')|rC(L'c')));
 93     RegexAssert(L"/.*[/r/n/t]", rAnyChar().Any()+(rC(L'\r')%rC(L'\n')%rC(L'\t')));
 94     
 95     RegexAssert(L"((<part>\\d+).){3}(<part>\\d+)", (rCapture(L"part", r_d().Some())+rC(L'.')).Loop(33)+rCapture(L"part", r_d().Some()));
 96     RegexAssert(L"\\.*[\\r\\n\\t]", rAnyChar().Any()+(rC(L'\r')%rC(L'\n')%rC(L'\t')));
 97 }
 98 
 99 TEST_CASE(TestRegexCompleteParsingA)
100 {
101     WString code=L"(<#part>/d+)(<#capture>(<section>(<&part>)))((<&capture>).){3}(<&capture>)";
102     RegexExpression::Ref regex=ParseRegexExpression(code);
103 
104     Expression::Ref part=r_d().Some().expression;
105     Expression::Ref capture=rCapture(L"section", rUsing(L"part")).expression;
106     Expression::Ref main=((rUsing(L"capture")+rC(L'.')).Loop(33)+rUsing(L"capture")).expression;
107 
108     TEST_ASSERT(regex->definitions.Count()==2);
109     TEST_ASSERT(regex->definitions.Keys()[0]==L"capture");
110     TEST_ASSERT(regex->definitions.Keys()[1]==L"part");
111 
112     TEST_ASSERT(regex->definitions[L"part"]->IsEqual(part.Obj()));
113     TEST_ASSERT(regex->definitions[L"capture"]->IsEqual(capture.Obj()));
114     TEST_ASSERT(regex->expression->IsEqual(main.Obj()));
115 }
116 
117 TEST_CASE(TestRegexCompleteParsingB)
118 {
119     WString code=L"((<part>\\d+).){3}(<part>\\d+)";
120     RegexExpression::Ref regex=ParseRegexExpression(code);
121 
122     Expression::Ref main=((rCapture(L"part", r_d().Some())+rC(L'.')).Loop(33)+rCapture(L"part", r_d().Some())).expression;
123 
124     TEST_ASSERT(regex->definitions.Count()==0);
125     TEST_ASSERT(regex->expression->IsEqual(main.Obj()));
126 }

    那些rCapture(L"part", r_d().Some()之类的东西是一个手写正则表达式引擎的工具,专门为了测试而开发的。这里我使用ParseExpression分析表达式变成语法树,然后我手写一棵语法树,两棵树进行对比,如果不相同那么语法分析器就有bug了。下面来看语法树的定义:

  1 /***********************************************************************
  2 Vczh Library++ 3.0
  3 Developer: 陈梓瀚(vczh)
  4 Regex::RegexExpression
  5 
  6 Classes:
  7     Expression                        :表达式基类
  8     CharSetExpression                :字符集表达式
  9     LoopExpression                    :循环表达式
 10     SequenceExpression                :顺序表达式
 11     AlternateExpression                :选择表达式
 12     BeginExpression                    :【非纯】字符串起始表达式
 13     EndExpression                    :【非纯】字符串末尾表达式
 14     CaptureExpression                :【非纯】捕获表达式
 15     MatchExpression                    :【非纯】匹配表达式
 16     PositiveExpression                :【非纯】正向预查表达式
 17     NegativeExpression                :【非纯】反向预查表达式
 18     UsingExpression                    :引用表达式
 19 
 20     RegexExpression                    :正则表达式
 21 
 22 Functions:
 23     ParseRegexExpression            :将字符串分析为RegexExpression对象,如果语法有问题则抛异常
 24 ***********************************************************************/
 25 
 26 #ifndef VCZH_REGEX_REGEXEXPRESSION
 27 #define VCZH_REGEX_REGEXEXPRESSION
 28 
 29 #include "RegexData.h"
 30 
 31 namespace vl
 32 {
 33     namespace regex_internal
 34     {
 35 
 36 /***********************************************************************
 37 正则表达式表达式树
 38 ***********************************************************************/
 39 
 40         class Expression : public Object, private NotCopyable
 41         {
 42         public:
 43             typedef Ptr<Expression>                                Ref;
 44             typedef Dictionary<WString, Expression::Ref>        Map;
 45 
 46             virtual bool                IsEqual(Expression* target)=0;
 47         };
 48 
 49         class CharSetExpression : public Expression
 50         {
 51         public:
 52             CharRange::List                ranges;
 53             bool                        reverse;
 54 
 55             bool                        AddRangeWithConflict(CharRange range);
 56             bool                        IsEqual(Expression* target);
 57         };
 58 
 59         class LoopExpression : public Expression
 60         {
 61         public:
 62             Expression::Ref                expression;        //被循环表达式
 63             int                            min;            //下限
 64             int                            max;            //上限,-1代表无限
 65 
 66             bool                        IsEqual(Expression* target);
 67         };
 68 
 69         class SequenceExpression : public Expression
 70         {
 71         public:
 72             Expression::Ref                left;            //左表达式
 73             Expression::Ref                right;            //右表达式
 74 
 75             bool                        IsEqual(Expression* target);
 76         };
 77 
 78         class AlternateExpression : public Expression
 79         {
 80         public:
 81             Expression::Ref                left;            //左表达式
 82             Expression::Ref                right;            //右表达式
 83 
 84             bool                        IsEqual(Expression* target);
 85         };
 86 
 87         class BeginExpression: public Expression
 88         {
 89         public:
 90 
 91             bool                        IsEqual(Expression* target);
 92         };
 93 
 94         class EndExpression : public Expression
 95         {
 96         public:
 97 
 98             bool                        IsEqual(Expression* target);
 99         };
100 
101         class CaptureExpression : public Expression
102         {
103         public:
104             WString                        name;            //捕获名,空代表缺省捕获
105             Expression::Ref                expression;        //被捕获表达式
106 
107             bool                        IsEqual(Expression* target);
108         };
109 
110         class MatchExpression : public Expression
111         {
112         public:
113             WString                        name;            //捕获名,空代表缺省捕获
114             int                            index;            //捕获序号,-1代表非空捕获的所有项
115 
116             bool                        IsEqual(Expression* target);
117         };
118 
119         class PositiveExpression : public Expression
120         {
121         public:
122             Expression::Ref                expression;        //正向匹配表达式
123 
124             bool                        IsEqual(Expression* target);
125         };
126 
127         class NegativeExpression : public Expression
128         {
129         public:
130             Expression::Ref                expression;        //反向匹配表达式
131 
132             bool                        IsEqual(Expression* target);
133         };
134 
135         class UsingExpression : public Expression
136         {
137         public:
138             WString                        name;            //引用名
139 
140             bool                        IsEqual(Expression* target);
141         };
142 
143         class RegexExpression : public Object, private NotCopyable
144         {
145         public:
146             typedef Ptr<RegexExpression>                        Ref;
147 
148             Expression::Map                definitions;    //命名子表达式
149             Expression::Ref                expression;        //主表达式
150         };
151 
152 /***********************************************************************
153 辅助函数
154 ***********************************************************************/
155 
156         extern Ptr<LoopExpression>        ParseLoop(const wchar_t*& input);
157         extern Ptr<Expression>            ParseCharSet(const wchar_t*& input);
158         extern Ptr<Expression>            ParseFunction(const wchar_t*& input);
159         extern Ptr<Expression>            ParseUnit(const wchar_t*& input);
160         extern Ptr<Expression>            ParseJoin(const wchar_t*& input);
161         extern Ptr<Expression>            ParseAlt(const wchar_t*& input);
162         extern Ptr<Expression>            ParseExpression(const wchar_t*& input);
163         extern RegexExpression::Ref        ParseRegexExpression(const WString& code);
164     }
165 }
166 
167 #endif

    树的定义还是很直观的,语法里面有什么表达式,树就有什么类。将来的生成DFA的过程就会跟IsEqual一样建立在虚函数之上,一路递归下去。当然匹配过程是不能递归的。由于IsEqual很简单,就是比较一个节点跟自己是否相等,就不展示了,接下来展示分析的过程。首先我们来看一些辅助函数:

 1         bool IsChar(const wchar_t*& input, wchar_t c)
 2         {
 3             if(*input==c)
 4             {
 5                 input++;
 6                 return true;
 7             }
 8             else
 9             {
10                 return false;
11             }
12         }
13 
14         bool IsChars(const wchar_t*& input, wchar_t* chars, wchar_t& c)
15         {
16             const wchar_t* position=wcschr(chars, *input);
17             if(position)
18             {
19                 c=*input++;
20                 return true;
21             }
22             else
23             {
24                 return false;
25             }
26         }
27 
28         bool IsStr(const wchar_t*& input, wchar_t* str)
29         {
30             size_t len=wcslen(str);
31             if(wcsncmp(input, str, len)==0)
32             {
33                 input+=len;
34                 return true;
35             }
36             else
37             {
38                 return false;
39             }
40         }
41 
42         bool IsChars(const wchar_t*& input, wchar_t* chars)
43         {
44             wchar_t c;
45             return IsChars(input, chars, c);
46         }
47 
48         bool IsPositiveInteger(const wchar_t*& input, int& number)
49         {
50             bool readed=false;
51             number=0;
52             while(L'0'<=*input && *input<=L'9')
53             {
54                 number=number*10+(*input++)-L'0';
55                 readed=true;
56             }
57             return readed;
58         }
59 
60         bool IsName(const wchar_t*& input, WString& name)
61         {
62             const wchar_t* read=input;
63             if((L'A'<=*read && *read<=L'Z'|| (L'a'<=*read && *read<=L'z'|| *read==L'_')
64             {
65                 read++;
66                 while((L'A'<=*read && *read<=L'Z'|| (L'a'<=*read && *read<=L'z'|| (L'0'<=*read && *read<=L'9'|| *read==L'_')
67                 {
68                     read++;
69                 }
70             }
71             if(input==read)
72             {
73                 return false;
74             }
75             else
76             {
77                 name=WString(input, read-input);
78                 input=read;
79                 return true;
80             }
81         }

    这些函数有一个特点,都是输入一个指针然后输出结果。如果匹配正确那么移动指针返回true,失败返回false指针不移动。于是我们就可以来看看如何处理一个Loop的分析过程。Loop主要指的是循环的后缀,有+、*、?、{a}、{a,}和{a,b}。ParseLoop函数返回LoopExpression对象,但是里面只有循环信息是填满的。循环的对象暂时不处理。

 1         Ptr<LoopExpression> ParseLoop(const wchar_t*& input)
 2         {
 3             int min=0;
 4             int max=0;
 5             if(!*input)
 6             {
 7                 return 0;
 8             }
 9             else if(IsChar(input, L'+'))
10             {
11                 min=1;
12                 max=-1;
13             }
14             else if(IsChar(input, L'*'))
15             {
16                 min=0;
17                 max=-1;
18             }
19             else if(IsChar(input, L'?'))
20             {
21                 min=0;
22                 max=1;
23             }
24             else if(IsChar(input, L'{'))
25             {
26                 if(IsPositiveInteger(input, min))
27                 {
28                     if(IsChar(input, L','))
29                     {
30                         if(!IsPositiveInteger(input, max))
31                         {
32                             max=-1;
33                         }
34                     }
35                     else
36                     {
37                         max=min;
38                     }
39                     if(!IsChar(input, L'}'))
40                     {
41                         goto THROW_EXCEPTION;
42                     }
43                 }
44                 else
45                 {
46                     goto THROW_EXCEPTION;
47                 }
48             }
49             else
50             {
51                 return 0;
52             }
53             LoopExpression* expression=new LoopExpression;
54             expression->min=min;
55             expression->max=max;
56             return expression;
57         THROW_EXCEPTION:
58                 throw ArgumentException(L"正则表达式语法错误,循环格式不合法。", L"vl::regex_internal::ParseLoop", L"input");
59         }

    接下来就是分析字符集了。字符集有集合、^、$和字符等等。ParseCharSet函数返回一个完整的Expression对象:

  1         Ptr<Expression> ParseCharSet(const wchar_t*& input)
  2         {
  3             if(!*input)
  4             {
  5                 return 0;
  6             }
  7             else if(IsChar(input, L'^'))
  8             {
  9                 return new BeginExpression;
 10             }
 11             else if(IsChar(input, L'$'))
 12             {
 13                 return new EndExpression;
 14             }
 15             else if(IsChar(input, L'\\'|| IsChar(input, L'/'))
 16             {
 17                 Ptr<CharSetExpression> expression=new CharSetExpression;
 18                 expression->reverse=false;
 19                 switch(*input)
 20                 {
 21                 case L'.':
 22                     expression->ranges.Add(CharRange(165535));
 23                     break;
 24                 case L'r':
 25                     expression->ranges.Add(CharRange(L'\r', L'\r'));
 26                     break;
 27                 case L'n':
 28                     expression->ranges.Add(CharRange(L'\n', L'\n'));
 29                     break;
 30                 case L't':
 31                     expression->ranges.Add(CharRange(L'\t', L'\t'));
 32                     break;
 33                 case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':
 34                 case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
 35                 case L'^':case L'$':case L'!':case L'=':
 36                     expression->ranges.Add(CharRange(*input, *input));
 37                     break;
 38                 case L'S':
 39                     expression->reverse=true;
 40                 case L's':
 41                     expression->ranges.Add(CharRange(L' ', L' '));
 42                     expression->ranges.Add(CharRange(L'\r', L'\r'));
 43                     expression->ranges.Add(CharRange(L'\n', L'\n'));
 44                     expression->ranges.Add(CharRange(L'\t', L'\t'));
 45                     break;
 46                 case L'D':
 47                     expression->reverse=true;
 48                 case L'd':
 49                     expression->ranges.Add(CharRange(L'0', L'9'));
 50                     break;
 51                 case L'L':
 52                     expression->reverse=true;
 53                 case L'l':
 54                     expression->ranges.Add(CharRange(L'_', L'_'));
 55                     expression->ranges.Add(CharRange(L'A', L'Z'));
 56                     expression->ranges.Add(CharRange(L'a', L'z'));
 57                     break;
 58                 case L'W':
 59                     expression->reverse=true;
 60                 case L'w':
 61                     expression->ranges.Add(CharRange(L'_', L'_'));
 62                     expression->ranges.Add(CharRange(L'0', L'9'));
 63                     expression->ranges.Add(CharRange(L'A', L'Z'));
 64                     expression->ranges.Add(CharRange(L'a', L'z'));
 65                     break;
 66                 default:
 67                     throw ArgumentException(L"正则表达式语法错误:非法转义符。", L"vl::regex_internal::ParseCharSet", L"input");
 68                 }
 69                 input++;
 70                 return expression;
 71             }
 72             else if(IsChar(input, L'['))
 73             {
 74                 Ptr<CharSetExpression> expression=new CharSetExpression;
 75                 if(IsChar(input, L'^'))
 76                 {
 77                     expression->reverse=true;
 78                 }
 79                 else
 80                 {
 81                     expression->reverse=false;
 82                 }
 83                 bool midState=false;
 84                 wchar_t a=L'\0';
 85                 wchar_t b=L'\0';
 86                 while(true)
 87                 {
 88                     if(IsChar(input, L'\\'|| IsChar(input, L'/'))
 89                     {
 90                         wchar_t c=L'\0';
 91                         switch(*input)
 92                         {
 93                         case L'r':
 94                             c=L'\r';
 95                             break;
 96                         case L'n':
 97                             c=L'\n';
 98                             break;
 99                         case L't':
100                             c=L'\t';
101                             break;
102                         case L'-':case L'[':case L']':case L'\\':case L'/':case L'^':case L'$':
103                             c=*input;
104                             break;
105                         default:
106                             throw ArgumentException(L"正则表达式语法错误:在[]内部能使用的转义符只有\"rnt-[]\\/\"", L"vl::regex_internal::ParseCharSet", L"input");
107                         }
108                         input++;
109                         midState?b=c:a=c;
110                         midState=!midState;
111                     }
112                     else if(IsChars(input, L"-]"))
113                     {
114                         goto THROW_EXCEPTION;
115                     }
116                     else if(*input)
117                     {
118                         midState?b=*input++:a=*input++;
119                         midState=!midState;
120                     }
121                     else
122                     {
123                         goto THROW_EXCEPTION;
124                     }
125                     if(IsChar(input, L']'))
126                     {
127                         if(midState)
128                         {
129                             b=a;
130                         }
131                         if(!expression->AddRangeWithConflict(CharRange(a, b)))
132                         {
133                             goto THROW_EXCEPTION;
134                         }
135                         break;
136                     }
137                     else if(IsChar(input, L'-'))
138                     {
139                         if(!midState)
140                         {
141                             goto THROW_EXCEPTION;
142                         }
143                     }
144                     else
145                     {
146                         if(midState)
147                         {
148                             b=a;
149                         }
150                         if(expression->AddRangeWithConflict(CharRange(a, b)))
151                         {
152                             midState=false;
153                         }
154                         else
155                         {
156                             goto THROW_EXCEPTION;
157                         }
158                     }
159                 }
160                 return expression;
161         THROW_EXCEPTION:
162                 throw ArgumentException(L"正则表达式语法错误:字符集格式不合法。");
163             }
164             else if(IsChars(input, L"()+*?{}|"))
165             {
166                 input--;
167                 return 0;
168             }
169             else
170             {
171                 CharSetExpression* expression=new CharSetExpression;
172                 expression->reverse=false;
173                 expression->ranges.Add(CharRange(*input, *input));
174                 input++;
175                 return expression;
176             }
177         }

    然后是分析高级功能的函数,这些功能主要有捕获、匹配、正向预查、反向预查和表达式引用等:

  1         Ptr<Expression> ParseFunction(const wchar_t*& input)
  2         {
  3             if(IsStr(input, L"(="))
  4             {
  5                 Ptr<Expression> sub=ParseExpression(input);
  6                 if(!IsChar(input, L')'))
  7                 {
  8                     goto NEED_RIGHT_BRACKET;
  9                 }
 10                 PositiveExpression* expression=new PositiveExpression;
 11                 expression->expression=sub;
 12                 return expression;
 13             }
 14             else if(IsStr(input, L"(!"))
 15             {
 16                 Ptr<Expression> sub=ParseExpression(input);
 17                 if(!IsChar(input, L')'))
 18                 {
 19                     goto NEED_RIGHT_BRACKET;
 20                 }
 21                 NegativeExpression* expression=new NegativeExpression;
 22                 expression->expression=sub;
 23                 return expression;
 24             }
 25             else if(IsStr(input, L"(<&"))
 26             {
 27                 WString name;
 28                 if(!IsName(input, name))
 29                 {
 30                     goto NEED_NAME;
 31                 }
 32                 if(!IsChar(input, L'>'))
 33                 {
 34                     goto NEED_GREATER;
 35                 }
 36                 if(!IsChar(input, L')'))
 37                 {
 38                     goto NEED_RIGHT_BRACKET;
 39                 }
 40                 UsingExpression* expression=new UsingExpression;
 41                 expression->name=name;
 42                 return expression;
 43             }
 44             else if(IsStr(input, L"(<$"))
 45             {
 46                 WString name;
 47                 int index=-1;
 48                 if(IsName(input, name))
 49                 {
 50                     if(IsChar(input, L';'))
 51                     {
 52                         if(!IsPositiveInteger(input, index))
 53                         {
 54                             goto NEED_NUMBER;
 55                         }
 56                     }
 57                 }
 58                 else if(!IsPositiveInteger(input, index))
 59                 {
 60                     goto NEED_NUMBER;
 61                 }
 62                 if(!IsChar(input, L'>'))
 63                 {
 64                     goto NEED_GREATER;
 65                 }
 66                 if(!IsChar(input, L')'))
 67                 {
 68                     goto NEED_RIGHT_BRACKET;
 69                 }
 70                 MatchExpression* expression=new MatchExpression;
 71                 expression->name=name;
 72                 expression->index=index;
 73                 return expression;
 74             }
 75             else if(IsStr(input, L"(<"))
 76             {
 77                 WString name;
 78                 if(!IsName(input, name))
 79                 {
 80                     goto NEED_NAME;
 81                 }
 82                 if(!IsChar(input, L'>'))
 83                 {
 84                     goto NEED_GREATER;
 85                 }
 86                 Ptr<Expression> sub=ParseExpression(input);
 87                 if(!IsChar(input, L')'))
 88                 {
 89                     goto NEED_RIGHT_BRACKET;
 90                 }
 91                 CaptureExpression* expression=new CaptureExpression;
 92                 expression->name=name;
 93                 expression->expression=sub;
 94                 return expression;
 95             }
 96             else if(IsChar(input, L'('))
 97             {
 98                 Ptr<Expression> sub=ParseExpression(input);
 99                 if(!IsChar(input, L')'))
100                 {
101                     goto NEED_RIGHT_BRACKET;
102                 }
103                 return sub;
104             }
105             else
106             {
107                 return 0;
108             }
109         NEED_RIGHT_BRACKET:
110             throw ArgumentException(L"正则表达式语法错误:缺少右小括号\")\"", L"vl::regex_internal::ParseFunction", L"input");
111         NEED_GREATER:
112             throw ArgumentException(L"正则表达式语法错误:缺少右尖括号\">\"", L"vl::regex_internal::ParseFunction", L"input");
113         NEED_NAME:
114             throw ArgumentException(L"正则表达式语法错误:缺少标识符。", L"vl::regex_internal::ParseFunction", L"input");
115         NEED_NUMBER:
116             throw ArgumentException(L"正则表达式语法错误:缺少数字。", L"vl::regex_internal::ParseFunction", L"input");
117         }

    这些琐碎的事情做完之后,就可以着手进行分析的大框架的开发了。表达式由基础表达式、顺序表达式和选择表达式构成。这个东西跟四则运算的分析一模一样,于是代码轻松搞定,基本上都是些套路:

 1         Ptr<Expression> ParseUnit(const wchar_t*& input)
 2         {
 3             Ptr<Expression> unit=ParseCharSet(input);
 4             if(!unit)
 5             {
 6                 unit=ParseFunction(input);
 7             }
 8             if(!unit)
 9             {
10                 return 0;
11             }
12             Ptr<LoopExpression> loop;
13             while(loop=ParseLoop(input))
14             {
15                 loop->expression=unit;
16                 unit=loop;
17             }
18             return unit;
19         }
20 
21         Ptr<Expression> ParseJoin(const wchar_t*& input)
22         {
23             Ptr<Expression> expression=ParseUnit(input);
24             while(true)
25             {
26                 Ptr<Expression> right=ParseUnit(input);
27                 if(right)
28                 {
29                     SequenceExpression* sequence=new SequenceExpression;
30                     sequence->left=expression;
31                     sequence->right=right;
32                     expression=sequence;
33                 }
34                 else
35                 {
36                     break;
37                 }
38             }
39             return expression;
40         }
41 
42         Ptr<Expression> ParseAlt(const wchar_t*& input)
43         {
44             Ptr<Expression> expression=ParseJoin(input);
45             while(true)
46             {
47                 if(IsChar(input, L'|'))
48                 {
49                     Ptr<Expression> right=ParseJoin(input);
50                     if(right)
51                     {
52                         AlternateExpression* alternate=new AlternateExpression;
53                         alternate->left=expression;
54                         alternate->right=right;
55                         expression=alternate;
56                     }
57                     else
58                     {
59                         throw ArgumentException(L"正则表达式语法错误:缺少表达式。", L"vl::regex_internal::ParseAlt", L"input");
60                     }
61                 }
62                 else
63                 {
64                     break;
65                 }
66             }
67             return expression;
68         }
69 
70         Ptr<Expression> ParseExpression(const wchar_t*& input)
71         {
72             return ParseAlt(input);
73         }

    最后就是主函数了。首先分析一下有多少个表达式命名,然后后面接着一条主表达式。这样做可以把相同的东西的重复消除,当然这个语法也是我独有的:

 1         RegexExpression::Ref ParseRegexExpression(const WString& code)
 2         {
 3             RegexExpression::Ref regex=new RegexExpression;
 4             const wchar_t* start=code.Buffer();
 5             const wchar_t* input=start;
 6             try
 7             {
 8                 while(IsStr(input, L"(<#"))
 9                 {
10                     WString name;
11                     if(!IsName(input, name))
12                     {
13                         throw ArgumentException(L"正则表达式语法错误:缺少标识符。", L"vl::regex_internal::ParseRegexExpression", L"code");
14                     }
15                     if(!IsChar(input, L'>'))
16                     {
17                         throw ArgumentException(L"正则表达式语法错误:缺少右尖括号\">\"", L"vl::regex_internal::ParseFunction", L"input");
18                     }
19                     Ptr<Expression> sub=ParseExpression(input);
20                     if(!IsChar(input, L')'))
21                     {
22                         throw ArgumentException(L"正则表达式语法错误:缺少右小括号\")\"", L"vl::regex_internal::ParseFunction", L"input");
23                     }
24                     if(regex->definitions.Keys().Contains(name))
25                     {
26                         throw ArgumentException(L"正则表达式语法错误:子表达式名称("+name+L")重复定义。", L"vl::regex_internal::ParseFunction", L"input");
27                     }
28                     else
29                     {
30                         regex->definitions.Add(name, sub);
31                     }
32                 }
33                 regex->expression=ParseExpression(input);
34                 if(!regex->expression)
35                 {
36                     throw ArgumentException(L"正则表达式语法错误:缺少表达式。", L"vl::regex_internal::ParseUnit", L"input");
37                 }
38                 if(*input)
39                 {
40                     throw ArgumentException(L"正则表达式语法错误:遇到多余的符号。", L"vl::regex_internal::ParseUnit", L"input");
41                 }
42                 return regex;
43             }
44             catch(const ArgumentException& e)
45             {
46                 throw ParsingException(e.GetMessage(), code, input-start);
47             }
48         }

    至此正则表达式的分析就结束了。接下来可以开始做DFA的生成了。
posted on 2009-10-15 05:53 陈梓瀚(vczh) 阅读(3108) 评论(4)  编辑 收藏 引用 所属分类: VL++3.0开发纪事

评论:
# re: Vczh Library++3.0之正则表达式引擎(语法分析)[未登录] 2009-10-15 06:30 | yuhuan
正则表达式是处理字符串的最强大语法,有了正则表达式许多复杂的字串处理就变得简单了. 好文.  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(语法分析) 2009-10-16 22:57 | SonicLing
@yuhuan
正则表达式是最弱的吧。  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(语法分析) 2009-10-17 01:13 | jans2002
正则表达式处理字符串的确很强大,但是对于模糊的自然语言而言,比如中文,却无能为力。
想问问楼主不知道对此有没有兴趣。  回复  更多评论
  
# re: Vczh Library++3.0之正则表达式引擎(语法分析) 2009-10-17 17:18 | 陈梓瀚(vczh)
@jans2002
实际上这根模糊不模糊没关系,正则表达式不管语义的。如果你的目标仅仅是把中文变成语法树,显然LR(k)还是可以做到的。如果你是想做理解,那么这跟什么(编译原理所指的)语法分析完全无关。  回复  更多评论
  

只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   知识库   博问   管理