[lexical_analyzer.h]
#ifndef LEXICAL_ANALYZER_H
#define LEXICAL_ANALYZER_H
// Result codes reported by lexical_analyzer::get_token().
enum lexical_status
{
    status_unknown      = -1,   // not produced by the code in this file
    status_success      = 0,    // a token was stored in the output argument
    status_eof          = 1,    // the input stream is exhausted
    status_invalid_char = 2     // an unrecognised character was encountered
};
// Classification attached to every token produced by the lexer.
enum token_category
{
    token_error      = 0,   // unrecognised input
    token_keyword    = 1,   // identifier found in the keyword table
    token_identifier = 2,   // name made of letters, digits, '_' and '$'
    token_number     = 3,   // numeric literal
    token_char       = 4,   // text enclosed in single quotes
    token_string     = 5,   // text enclosed in double quotes
    token_operator   = 6,   // operator such as + -> << ::
    token_punctuator = 7    // one of ; { } , #
};
// A single lexeme produced by lexical_analyzer::get_token().
struct token
{
// Classification assigned by the lexer (see token_category).
token_category category;
// Text of the token. For string and character tokens this is the body
// returned by get_string(): the surrounding quotes are not included and
// escape sequences have already been translated.
std::string value;
};
// Hand-written lexer that splits a character stream into C++-like tokens
// while keeping track of the current line number.
class lexical_analyzer
{
public:
// Binds the analyzer to ifs. Only the address of the stream is stored, so
// the stream presumably has to outlive the analyzer (no ownership is taken).
explicit lexical_analyzer(std::istream& ifs);
// Reads the next token into t and returns a lexical_status code
// (status_success, status_eof or status_invalid_char).
lexical_status get_token(token& t);
// Returns the line counter; it starts at 1 and is advanced by get_char()
// each time a '\n' is consumed.
int current_line() const;
private:
// Skips a run of whitespace; returns true if at least one character was skipped.
bool filter_space();
// Skips one "//" or "/* */" comment; returns true if a comment was skipped.
bool filter_comment();
// Reads consecutive hexadecimal digits from the stream.
std::string get_hex_string();
// Reads consecutive decimal digits from the stream.
std::string get_digital_string();
// Reads the body of a quoted literal up to (and consuming) delimiter,
// translating backslash escape sequences.
std::string get_string(char delimiter);
// Returns true if str is listed in the built-in keyword table.
bool is_keyword(const std::string& str);
// Consumes and returns one character (or EOF), updating the line counter.
int get_char();
// Returns the next character (or EOF) without consuming it.
int peek_char();
// Pushes ch back onto the stream, undoing the line count for '\n'.
void putback(char ch);
// Consumes one character and discards it.
void skip_char();
private:
// Input stream being tokenised (not owned).
std::istream* m_pstream;
// Current line number, 1-based.
int m_line;
};
// Attaches the analyzer to ifs and starts line counting at 1.
// Only the address of the stream is kept; the stream is not copied.
inline lexical_analyzer::lexical_analyzer(std::istream& ifs)
    : m_pstream(&ifs)
    , m_line(1)
{
}
inline int lexical_analyzer::current_line() const
{
return m_line;
}
// Looks at the next character of the stream without consuming it.
// Returns whatever std::istream::peek() returns, including its EOF value.
inline int lexical_analyzer::peek_char()
{
    std::istream& input = *m_pstream;
    return input.peek();
}
// Consumes one character and throws it away. Routed through get_char() so
// that the line counter is still updated when a newline is skipped.
inline void lexical_analyzer::skip_char()
{
    (void)get_char();
}
#endif//LEXICAL_ANALYZER_H
[lexical_analyzer.cpp]
#include <cctype>
#include <fstream>
#include <istream>
#include <string>
#include "lexical_analyzer.h"
// Consumes one character from the stream and returns it as an int (the
// stream's EOF value is passed through unchanged). Every consumed '\n'
// advances the line counter.
int lexical_analyzer::get_char()
{
    const int c = m_pstream->get();
    if (c != '\n')
        return c;
    ++m_line;
    return c;
}
// Pushes ch back onto the stream so that the next read returns it again.
// Putting back a '\n' also undoes the line increment done by get_char().
//
// Nothing is done when the stream is already in a failed state. That is the
// situation right after get_char() returned EOF: there is no character to
// return, and since C++11 std::istream::putback() clears eofbit before it
// even tries, which would make later eof() checks miss the end of input.
void lexical_analyzer::putback(char ch)
{
    if (m_pstream->fail())
        return;
    if (ch == '\n')
        --m_line;
    m_pstream->putback(ch);
}
// Skips a run of whitespace characters.
// Returns true if at least one character was skipped, false otherwise.
//
// The peeked value is passed to std::isspace() as an int. Narrowing it to
// char first would produce negative values for bytes >= 0x80 on platforms
// with signed char, which is undefined behaviour for the <cctype> functions.
bool lexical_analyzer::filter_space()
{
    if (!std::isspace(peek_char()))
        return false;
    do
    {
        skip_char();
    }
    while (std::isspace(peek_char()));
    return true;
}
// Skips one comment if the stream is positioned at the start of one.
//   "// ..."    is skipped up to, but not including, the terminating newline;
//   "/* ... */" is skipped including the closing "*/".
// A comment cut short by end of input is treated as ending there.
// Returns true if a comment was skipped. When the '/' does not start a
// comment it is pushed back and false is returned, leaving the stream
// unchanged.
bool lexical_analyzer::filter_comment()
{
    const int eof = std::char_traits<char>::eof();

    if (peek_char() != '/')
        return false;

    // Consume only the '/', then look at the following character without
    // consuming it, so that a single putback restores the stream exactly.
    skip_char();
    const int next = peek_char();

    if (next == '/')
    {
        for (int c = peek_char(); c != '\n' && c != eof; c = peek_char())
            skip_char();
        return true;
    }

    if (next == '*')
    {
        skip_char();    // the '*' that opens the comment
        // Remember the previous character so that sequences such as "**/"
        // are recognised as a terminator.
        int prev = 0;
        for (;;)
        {
            const int c = get_char();
            if (c == eof)
                break;  // unterminated comment
            if (prev == '*' && c == '/')
                break;
            prev = c;
        }
        return true;
    }

    putback('/');
    return false;
}
// Tells whether str is one of the words in the built-in keyword table.
// The comparison is exact and case-sensitive.
bool lexical_analyzer::is_keyword(const std::string& str)
{
    static const char* const table[] =
    {
        "asm", "auto", "bad_cast", "bad_typeid",
        "bool", "break", "case", "catch",
        "char", "class", "const", "const_cast",
        "continue", "default", "delete", "do",
        "double", "dynamic_cast", "else", "enum",
        "except", "explicit", "extern", "false",
        "finally", "float", "for", "friend",
        "goto", "if", "inline", "int",
        "long", "mutable", "namespace", "new",
        "operator", "private", "protected", "public",
        "register", "reinterpret_cast", "return", "short",
        "signed", "sizeof", "static", "static_cast",
        "struct", "switch", "template", "this",
        "throw", "true", "try", "typedef",
        "typeid", "typename", "union", "unsigned",
        "using", "virtual", "void", "volatile",
        "while",
    };
    const char* const* const table_end = table + sizeof(table) / sizeof(table[0]);

    // Linear scan over the table.
    for (const char* const* entry = table; entry != table_end; ++entry)
    {
        if (str == *entry)
            return true;
    }
    return false;
}
std::string lexical_analyzer::get_string(char delimiter)
{
std::string result;
for(;;)
{
char ch = get_char();
if(ch==delimiter)
break;
else if(ch=='\\')
{
ch = get_char();
switch(ch)
{
case '\"':
ch = '\"';
break;
case '\'':
ch = '\'';
break;
case 'r':
ch = '\r';
break;
case 'n':
ch = '\n';
break;
case 'v':
ch = '\v';
break;
case 't':
ch = '\t';
break;
case 'a':
ch = '\a';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
case '\r': // line splice
case '\n':
continue;
break;
default:
break;
}
if(ch=='x' || ch=='X')
{
std::string s = get_hex_string();
int x = 0;
for(int i=0; i<s.length(); i++)
{
x *= 16;
if(s[i]>='A' && s[i]<='F')
x += s[i]-'A' + 10;
else if(s[i]>='a' && s[i]<='f')
x += s[i]-'a' + 10;
else
x += s[i]-'0';
}
ch = (char)x;
}
}
result += ch;
}
return result;
}
std::string lexical_analyzer::get_digital_string()
{
std::string result;
char ch;
while(isdigit(ch=get_char()))
{
result += ch;
}
putback(ch);
return result;
}
std::string lexical_analyzer::get_hex_string()
{
std::string result;
char ch;
while(isxdigit(ch=get_char()))
{
result += ch;
}
putback(ch);
return result;
}
// Reads the next token from the stream into t.
//
// Leading whitespace and comments are skipped first. Returns
//   status_eof           when no more input is available (t is left untouched),
//   status_invalid_char  when an unrecognised character was read; t then has
//                        category token_error and holds that character,
//   status_success       otherwise.
//
// Token classes recognised:
//   identifiers / keywords   [A-Za-z_$][A-Za-z0-9_$]*
//   numbers                  decimal digit runs, or 0x / 0X followed by hex digits
//   string / char literals   body returned by get_string(), quotes removed
//   operators                including multi-character forms such as
//                            == && || :: &= |= ++ -- += -= -> ->* <<= >>= .* ...
//   punctuators              ; { } , #
// A backslash directly followed by a line break is a line splice: it is
// dropped and lexing continues with the next token.
//
// Look-ahead is done with peek_char() so that characters are only consumed
// once they are known to belong to the token, and end of input is detected
// by peeking rather than through the stream's eof() flag, which putback()
// may reset.
lexical_status lexical_analyzer::get_token(token& t)
{
    const int eof = std::char_traits<char>::eof();

    while (filter_space() || filter_comment())
    {
    }
    if (peek_char() == eof)
        return status_eof;

    t.value.clear();
    // Characters are kept as int while classifying so that the <cctype>
    // functions never see a negative char.
    const int first = get_char();
    const char ch = static_cast<char>(first);

    if (first == '_' || first == '$' || std::isalpha(first))
    {
        t.category = token_identifier;
        t.value += ch;
        for (int c = peek_char();
             c == '_' || c == '$' || std::isalnum(c);
             c = peek_char())
        {
            t.value += static_cast<char>(get_char());
        }
        if (is_keyword(t.value))
            t.category = token_keyword;
    }
    else if (std::isdigit(first))
    {
        t.category = token_number;
        t.value += ch;
        const int next = peek_char();
        if (ch == '0' && (next == 'x' || next == 'X'))
        {
            t.value += static_cast<char>(get_char());
            t.value += get_hex_string();
        }
        else
        {
            t.value += get_digital_string();
        }
    }
    else if (ch == '\"')
    {
        t.category = token_string;
        t.value = get_string('\"');
    }
    else if (ch == '\'')
    {
        t.category = token_char;
        t.value = get_string('\'');
    }
    else
    {
        t.category = token_operator;
        if (ch == '=' || ch == ':')
        {
            t.value = ch;
            if (peek_char() == first)       // == ::
            {
                t.value += ch;
                skip_char();
            }
        }
        else if (ch == '&' || ch == '|')
        {
            t.value = ch;
            const int next = peek_char();
            if (next == first || next == '=')   // && || &= |=
            {
                t.value += static_cast<char>(next);
                skip_char();
            }
        }
        else if (ch == '+' || ch == '-')
        {
            t.value = ch;
            const int next = peek_char();
            if (next == first || next == '=')   // ++ -- += -=
            {
                t.value += static_cast<char>(next);
                skip_char();
            }
            else if (ch == '-' && next == '>')
            {
                skip_char();
                t.value += '>';                 // ->
                if (peek_char() == '*')
                {
                    skip_char();
                    t.value += '*';             // ->*
                }
            }
        }
        else if (ch == '*' || ch == '/' || ch == '%' || ch == '^' || ch == '!')
        {
            t.value = ch;
            if (peek_char() == '=')             // *= /= %= ^= !=
            {
                t.value += '=';
                skip_char();
            }
        }
        else if (ch == '<' || ch == '>')
        {
            t.value = ch;
            const int next = peek_char();
            if (next == first)                  // << >>
            {
                skip_char();
                t.value += ch;
                if (peek_char() == '=')         // <<= >>=
                {
                    skip_char();
                    t.value += '=';
                }
            }
            else if (next == '=')               // <= >=
            {
                skip_char();
                t.value += '=';
            }
        }
        else if (ch == '.')
        {
            t.value = '.';                      // .
            const int next = peek_char();
            if (next == '*')
            {
                skip_char();
                t.value += '*';                 // .*
            }
            else if (next == '.')
            {
                skip_char();
                if (peek_char() == '.')
                {
                    skip_char();
                    t.value += "..";            // ...
                }
                else
                {
                    // Only two dots: give the second one back so it is
                    // lexed as a token of its own.
                    putback('.');
                }
            }
        }
        else if (ch == '~' || ch == '?' ||
                 ch == '[' || ch == ']' ||
                 ch == '(' || ch == ')')
        {
            t.value = ch;
        }
        else if (ch == ';' || ch == '{' || ch == '}' || ch == ',' || ch == '#')
        {
            t.category = token_punctuator;
            t.value = ch;
        }
        else if (ch == '\\')
        {
            const int next = peek_char();
            if (next == '\r' || next == '\n')
            {
                // Line splice: drop the line break (CR, LF or CRLF) and
                // deliver the token that follows instead of an empty one.
                skip_char();
                if (next == '\r' && peek_char() == '\n')
                    skip_char();
                return get_token(t);
            }
            t.category = token_error;
            t.value = ch;
            return status_invalid_char;
        }
        else
        {
            t.category = token_error;
            t.value = ch;
            return status_invalid_char;
        }
    }
    return status_success;
}
[main.c], test program
#include <fstream>
#include <string>
#include <iostream>
#include "lexical_analyzer.h"
// Test driver: tokenises standard input and prints one token per line to
// standard output. Invalid characters are reported on standard error
// together with the line they were found on.
//
// To process files instead, construct a std::ifstream / std::ofstream here
// and use them in place of std::cin / std::cout. They are deliberately not
// opened unconditionally: merely constructing the output stream would
// create or truncate the target file even when it is never written to.
int main()
{
    lexical_analyzer lex(std::cin);
    std::ostream& os = std::cout;

    token t;
    lexical_status status;
    while ((status = lex.get_token(t)) != status_eof)
    {
        if (status == status_success)
            os << t.value << '\n';
        else if (status == status_invalid_char)
            std::cerr << "Line:" << lex.current_line() << " invalid_char: " << t.value << '\n';
    }
    return 0;
}