先看看下面这个HTML字符串。
<HTML>=====<SPARATOR>s-value</SPARATOR>++++++<BODY A="AA">{123}<BR>[456]</BODY><IMG>*****</HTML>
很不标准吧?但浏览器却是能够正常显示的。Lingoes的结果里面充斥了大量的这样的字符串,但就如我常说的 -- “代码在手,天下我有”一样,修改了TinyHtmlParser,咱也能解析了~
这下不怕了,嘿嘿……下面是修改后的TinyHtmlParser:修改了CParserData结构,使其更加容易理解。“容易理解”对我来说很重要,因为我喜欢凭第一感觉去实现——感觉该如何实现,代码就该怎么写。代码只是对现实的一种描述而已,越容易、越简单,就越好。
#ifndef __TINYHTMLPARSER_H__
#define __TINYHTMLPARSER_H__
#include <cstddef>
#include <deque>
#include <iostream>
#include <memory>
#include <queue>
#include <stack>
#include <string>
#include <utility>
#include <vector>
namespace TinyHtmlParser
{
// Error categories carried by CExceptionObject.
enum ExceptionNumber
{
    EN_UNKNOWN = -1,          // unclassified error
    EN_ATTRIB_VALUEMISS = 0,  // an attribute's '=' is the last character, no value follows
    EN_ATTRIB_VALUEBROKEN,    // a quoted attribute value has no closing quote
    EN_DOCUMENT_FORMATERROR   // the document structure itself is malformed
};
// Exception type thrown by the parser: an error number plus a wide-character message.
class CExceptionObject
{
public:
    CExceptionObject(ExceptionNumber type, const std::wstring& info);
    CExceptionObject(const CExceptionObject& right);
    virtual ~CExceptionObject() {}

    // Error number, exposed as a plain int.
    int Number() const { return _type; }

    // Message text given at construction.
    const std::wstring& Info() const { return _info; }

protected:
    ExceptionNumber _type;
    std::wstring _info;

private:
    // Assignment is private, so code outside the class cannot assign;
    // the body copies nothing and simply returns the object itself.
    CExceptionObject& operator = (const CExceptionObject&) { return *this; }
};
// Kind of element stored in the document tree.
enum ElementType
{
    ET_UNKNOWN = -1, // not classified yet
    ET_TAG = 0,      // just a tag
    ET_NODE,         // a tag that has no value
    ET_ELEMENT       // a tag that has a value
};
// One attribute of an element; attributes of the same element form a
// singly linked list through `next`.
class CAttributeObject
{
public:
    // Stores the name/value pair; the new node is not linked to anything yet.
    CAttributeObject(const std::wstring& a, const std::wstring& v)
        : attr(a)
        , value(v)
        , next(NULL)
    {
    }

    virtual ~CAttributeObject() {}

    // Writes the attribute name and value to `os`.
    void Show(std::wostream& os) const;

    std::wstring attr;       // attribute name
    std::wstring value;      // attribute value
    CAttributeObject* next;  // following attribute, or NULL at the end of the list
};
// One node of the parsed document tree, linked to its parent, a child and a sibling.
// NOTE(review): the destructor releases the attribute list, but neither a copy
// constructor nor an assignment operator is declared; copying an element that
// owns attributes would leave two objects deleting the same list - avoid copies.
class CElementObject
{
public:
    CElementObject();
    virtual ~CElementObject();

    // Splits the raw tag text into tag name and attributes and post-processes
    // the value of ET_ELEMENT nodes. Returns 0 on success, -1 on failure.
    virtual int Analyse();

    // Returns the attribute named `attr`, or NULL when there is none.
    const CAttributeObject* FindAttribute(const std::wstring& attr) const;

    // Writes the element and its attributes to `os`.
    void Show(std::wostream& os) const;

protected:
    int AnalyseAttribute(const std::wstring& attr);
    int MakeAttribute(const std::wstring& attr);
    int MakeAttribute(const std::wstring& attr, const std::wstring& value);
    void FreeAnalyseAttribute();
    int AnalyseValue();

public:
    ElementType type;
    size_t level;              // nesting depth assigned by the parser
    CElementObject* parent;
    CElementObject* child;
    CElementObject* sibling;
    CAttributeObject* attrib;  // head of the attribute list, NULL when empty

    std::wstring tag;
    std::wstring value;
};
// Scanner record for one tag: where the tag sits in the HTML text and which
// text ranges were collected as its value.
class CParserData
{
public:
    enum DataType
    {
        DT_UNKNOWN = -1,
        DT_TAG = 0,    // start tag
        DT_VALUE,
        DT_END,        // end tag
        DT_DONE,       // tag that needs no end tag
        DT_TAG_VALUE,  // start tag that has collected at least one value range
        DT_BROKEN
    };
    typedef std::pair<size_t, size_t> TRange;  // start + end
    typedef std::vector<TRange> TValueVector;

    CParserData()
        : type(DT_UNKNOWN)
    {
    }

    virtual ~CParserData() {}

    // Writes the type and the raw index ranges to `os`.
    void Show(std::wostream& os) const;

    // Same as above, but also prints the text of `html` each range covers.
    void Show(std::wostream& os, const std::wstring& html) const;

    DataType type;
    TRange tag;          // range of the tag text
    TValueVector value;  // ranges of the value text, in document order
};
// Parses an HTML string into a tree of CElementObject nodes and offers
// simple tag/attribute lookups on that tree.
// NOTE(review): the destructor frees the tree behind _root and copying is not
// disabled, so a copied document would free the same tree twice - avoid copies.
class CDocumentObject
{
protected:
    static const wchar_t TAG_LT = L'<';
    static const wchar_t TAG_GT = L'>';
    static const wchar_t TAG_SLASH = L'/';
    static const wchar_t TAG_BSLASH = L'\\';
    static const wchar_t TAG_AND = L'&';
    typedef std::stack<CParserData> TDataStack;
    typedef std::pair<size_t, CParserData> TNodeData;//level + tag;
    typedef std::deque<TNodeData> TNodeQueue;
public:
    typedef std::stack<const CElementObject*> TElementStack;
public:
    CDocumentObject();
    virtual ~CDocumentObject();

    // Parses `str` and builds the element tree. Returns 0 on success and -1 on
    // failure; format errors are reported by throwing CExceptionObject.
    int Load(const std::wstring& str, bool strict = true);

    // Root of the element tree, or NULL when nothing has been loaded.
    const CElementObject* Root() const { return _root; }

    // Document-wide search: the first call looks for `tag` starting at the
    // root, FindNextElement() continues after the previous hit.
    const CElementObject* FindFirstElement(const std::wstring& tag);
    const CElementObject* FindNextElement();

    // Search limited to `element` and its descendants; the caller owns `tmpstack`,
    // which carries the search position between calls.
    const CElementObject* FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);
    const CElementObject* FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);

    // Returns the attribute `attr` of `element`, or NULL.
    const CAttributeObject* FindAttribute(const CElementObject* element, const std::wstring& attr);

    bool IsMistake() const { return _bIsMistake; }
    void Show(std::wostream& os) const;
protected:
    int PreProcess(const std::wstring& str, std::wstring& html, bool strict);
    int PreParser(const std::wstring& html, TNodeQueue& que, bool strict);
    int Parser(const std::wstring& html, TNodeQueue& que, bool strict);
private:
    int PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data);
    int PushValueData(CParserData::DataType type, size_t start, size_t end, TDataStack& datastack) const;
    int PushTagData(const std::wstring& html, CParserData& data, TDataStack& datastack, TNodeQueue& nodeque) const;
    int PreParserBroken(const std::wstring& html, TDataStack& datastack, TNodeQueue& nodeque) const;
    int CheckSpecialTag(const std::wstring& html, const CParserData& data) const;
    int CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const;
    CElementObject* MakeElement(const std::wstring& html, const TNodeData& node, CElementObject* parent, CElementObject* sibling) const;
    // Declared without the class qualifier: a qualified name is not allowed on
    // a member declaration inside its own class, and GCC/Clang reject it.
    void ShowElement(std::wostream& os, const CElementObject* e) const;
    void FreeElement(CElementObject* root);
    const CElementObject* FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack);
private:
    CElementObject* _root;       // top of the element tree, owned by the document
    std::wstring _findtag;       // tag of the current document-wide search
    TElementStack _findstack;    // position of the current document-wide search
    bool _bIsMistake;
};
}
#endif
#include <sstream>
#include "TinyHtmlParser.h"
namespace TinyHtmlParser
{
// Builds an exception from an error number and its message text.
CExceptionObject::CExceptionObject(ExceptionNumber type, const std::wstring& info)
    : _type(type), _info(info)
{
}
// Copy constructor: duplicates both the error number and the message.
CExceptionObject::CExceptionObject(const CExceptionObject& right)
    : _type(right._type), _info(right._info)
{
}
// Streams `info` (any chain of `<<` operands) into a wide string and throws it
// as a CExceptionObject of the given type.
// The body is wrapped in do { } while(0) so that the macro plus its trailing
// ';' forms exactly one statement; as a bare { } block it broke
// `if(x) THROW_EXCEPTION(...); else ...`.
#define THROW_EXCEPTION(type, info) \
    do \
    { \
        std::wostringstream oss; \
        oss << info; \
        throw CExceptionObject(type, oss.str()); \
    } while(0)
/**////////////////
void CAttributeObject::Show(std::wostream& os) const
{
os << " attr : " << this->attr << " -- value = " << this->value << std::endl;
}
// Starts as an unclassified element at level 0 with no links and no attributes.
CElementObject::CElementObject()
    : type(ET_UNKNOWN), level(0)
    , parent(NULL), child(NULL), sibling(NULL)
    , attrib(NULL)
{
}
// Releases the attribute list; parent/child/sibling elements are not touched here.
CElementObject::~CElementObject()
{
    this->FreeAnalyseAttribute();
}
// Splits the raw tag text at its first space: the part before it becomes the
// tag name, the rest is parsed as attributes. Elements of type ET_ELEMENT also
// get their value post-processed. Returns 0 on success, -1 on failure.
int CElementObject::Analyse()
{
    const std::wstring raw = tag;
    const std::wstring::size_type blank = raw.find(L" ");
    if(blank != std::wstring::npos)
    {
        // The name is stored before the attributes are parsed, so it stays
        // truncated even when attribute parsing fails.
        tag = raw.substr(0, blank);
        if(AnalyseAttribute(raw.substr(blank + 1)) != 0)
            return -1;
    }

    if(type != ET_ELEMENT)
        return 0;

    return (AnalyseValue() != 0) ? -1 : 0;
}
// Parses an attribute string such as `A="AA" B=bb` and appends one attribute
// per `name=value` pair to this element.
//   attr - the tag text that follows the tag name.
// Returns 0 on success, -1 when an attribute could not be stored.
// Throws CExceptionObject(EN_ATTRIB_VALUEMISS) when '=' is the last character
// and CExceptionObject(EN_ATTRIB_VALUEBROKEN) when a quoted value is not closed.
// Quoted values are stored including their quotes; unquoted values end at the
// next space.
int CElementObject::AnalyseAttribute(const std::wstring& attr)
{
    if(attr.empty())
        return 0;

    const wchar_t* const blanks = L" \t\r\n";
    std::wstring::size_type start = 0;
    std::wstring::size_type pos = attr.find(L'=');
    while(pos != std::wstring::npos)
    {
        if(pos == attr.size() - 1)
        {
            THROW_EXCEPTION(EN_ATTRIB_VALUEMISS, L"Attribue analyse failed - attribute string : " << attr);
        }

        // The name is attr[start, pos) with surrounding whitespace removed, so
        // extra blanks between attributes do not leak into the name.
        // attr[pos] is '=', hence the search always stops at or before pos.
        std::wstring name;
        const std::wstring::size_type first = attr.find_first_not_of(blanks, start);
        if(first < pos)
        {
            const std::wstring::size_type last = attr.find_last_not_of(blanks, pos - 1);
            name = attr.substr(first, last - first + 1);
        }

        std::wstring value;
        start = pos + 1;
        if(attr[start] == L'\"')
        {
            pos = attr.find(L'\"', start + 1);
            if(pos == std::wstring::npos)
            {
                THROW_EXCEPTION(EN_ATTRIB_VALUEBROKEN, L"Attribue analyse failed - attribute string : " << attr);
            }
            value = attr.substr(start, pos - start + 1);
            // Continue right behind the closing quote; whatever whitespace
            // follows is skipped when the next name is trimmed, so the next
            // attribute is found even without a separating space.
            start = pos + 1;
        }
        else
        {
            pos = attr.find(L' ', start);
            if(pos == std::wstring::npos)
                pos = attr.size();
            value = attr.substr(start, pos - start);
            start = pos + 1;
        }

        if(MakeAttribute(name, value) != 0)
            return -1;
        if(start >= attr.size())
            break;
        pos = attr.find(L'=', start);
    }
    return 0;
}
// Splits `name=value` at the first '=' and stores the pair as an attribute.
// Returns -1 when `attr` contains no '=', otherwise the result of the
// two-argument overload.
int CElementObject::MakeAttribute(const std::wstring &attr)
{
    const std::wstring::size_type pos = attr.find(L'=');
    if(pos == std::wstring::npos)
        return -1;
    // The value starts behind the '='; substr(pos) would keep the '=' in it.
    return MakeAttribute(attr.substr(0, pos), attr.substr(pos + 1));
}
int CElementObject::MakeAttribute(const std::wstring &attr, const std::wstring& value)
{
std::auto_ptr<CAttributeObject> obj(new CAttributeObject(attr, value));//attr.substr(0, pos), attr.substr(pos)));
if(attrib != NULL)
{
CAttributeObject* tmp = attrib;
while(tmp->next != NULL)
tmp = tmp->next;
tmp->next = obj.release();
}
else
{
attrib = obj.release();
}
return 0;
}
void CElementObject::FreeAnalyseAttribute()
{
CAttributeObject* tmp = attrib;
while(attrib != NULL)
{
tmp = attrib->next;
delete attrib;
attrib = tmp;
}
}
// Replaces every "&nbsp;" entity in the element value with a plain space.
// Always returns 0.
int CElementObject::AnalyseValue()
{
    // The needle has to be the six-character entity that the replace length
    // refers to. Searching for a single space while replacing six characters
    // would delete the five characters that follow every space.
    const wchar_t entity[] = L"&nbsp;";
    const std::wstring::size_type entityLength = sizeof(entity) / sizeof(entity[0]) - 1;

    std::wstring::size_type pos = this->value.find(entity);
    while(pos != std::wstring::npos)
    {
        this->value.replace(pos, entityLength, L" ");
        pos = this->value.find(entity, pos + 1);
    }
    return 0;
}
// Returns the first attribute whose name equals `attr` exactly, or NULL when
// the element has no such attribute.
const CAttributeObject* CElementObject::FindAttribute(const std::wstring& attr) const
{
    for(const CAttributeObject* cur = attrib; cur != NULL; cur = cur->next)
    {
        if(cur->attr == attr)
            return cur;
    }
    return NULL;
}
void CElementObject::Show(std::wostream& os) const
{
os << "[" << this->level << "]" << "Tag : " << this->tag;
if(this->type == ET_ELEMENT)
os << " -- value = " << /**//*std::wstring*/(this->value);
os << std::endl;
const CAttributeObject* attr = this->attrib;
while(attr != NULL)
{
attr->Show(os);
attr = attr->next;
}
os << std::endl;
}
//
void CParserData::Show(std::wostream &os) const
{
os << "\nType = " << this->type;
os << "\nTag Start = " << this->tag.first << " - End = " << this->tag.second;
for(TValueVector::const_iterator it = this->value.begin(); it != this->value.end(); ++ it)
{
os << "\nValue Start = " << it->first << " - End = " << it->second;
}
os << std::endl;
}
void CParserData::Show(std::wostream& os, const std::wstring& html) const
{
os << "\nType = " << this->type;
os << "\nTag = " << "[" << this->tag.first << "," << this->tag.second << "]" << html.substr(this->tag.first, this->tag.second - this->tag.first + 1);
for(TValueVector::const_iterator it = this->value.begin(); it != this->value.end(); ++ it)
{
os << "\nValue = " << "[" << it->first << "," << it->second << "]" << html.substr(it->first, it->second - it->first + 1);
}
os << std::endl;
}
//
// Creates an empty document: no element tree, mistake flag cleared.
CDocumentObject::CDocumentObject()
    : _root(NULL), _bIsMistake(false)
{
}
// Frees the element tree, if one was built.
CDocumentObject::~CDocumentObject()
{
    if(_root == NULL)
        return;
    FreeElement(_root);
}
int CDocumentObject::Load(const std::wstring &str, bool strict)
{
std::wstring html;
if(PreProcess(str, html, strict) != 0)
return -1;
TNodeQueue que;
if(PreParser(html, que, strict) != 0)
return -1;
if(Parser(html, que, strict) != 0)
return -1;
return 0;
}
// Copies `str` to `html`, checking that '<' and '>' alternate and dropping
// line breaks that appear outside of tags.
//   str    - raw input.
//   html   - receives the cleaned text (appended to its current content).
//   strict - not used by this stage.
// Returns 0; throws CExceptionObject(EN_DOCUMENT_FORMATERROR) on a second '<'
// inside a tag or on a '>' without a preceding '<'.
int CDocumentObject::PreProcess(const std::wstring& str, std::wstring& html, bool strict)
{
    html.reserve(html.size() + str.size());

    bool tag = false;
    for(std::wstring::const_iterator it = str.begin(); it != str.end(); ++ it)
    {
        const wchar_t ch = *it;
        if(ch == TAG_LT)
        {
            if(tag)
            {
                THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Double '<'.");
            }
            tag = true;
        }
        else if(ch == TAG_GT)
        {
            if(!tag)
            {
                THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Miss '<' before '>'.");
            }
            tag = false;
        }
        else if(!tag && (ch == L'\r' || ch == L'\n'))
        {
            // Compare the full wide character. Narrowing it to unsigned char
            // first would also drop every character whose low byte happens to
            // be 0x0D or 0x0A (for example U+4E0A).
            continue;
        }
        html += ch;
    }
    return 0;
}
// Scans the cleaned HTML and fills `que` with one entry per tag.
//   html   - text produced by PreProcess(); must start with '<'.
//   que    - receives (level, tag data) pairs; the tag closed last ends up in front.
//   strict - when true, tags still open at the end of the text are an error.
// Returns 0 on success, -1 when a helper fails; throws
// CExceptionObject(EN_DOCUMENT_FORMATERROR) for an empty text, a text that
// does not start with '<', and (strict only) unmatched tags.
int CDocumentObject::PreParser(const std::wstring& html, CDocumentObject::TNodeQueue& que, bool strict)
{
    std::wstring::size_type pos = 0;
    if(html.empty())
    {
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"HTML is empty.");
    }
    if(html[pos] != TAG_LT)
    {
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"First character of HTML is NOT '<' - pos : " << pos);
    }

    TDataStack datastack;
    CParserData data;
    size_t start = 0; // first character not yet assigned to a tag or a value
    while(pos < html.size())
    {
        if(html[pos] != TAG_LT)
        {
            ++ pos;
            continue;
        }

        if(pos > start)
        {
            // Text between the previous tag and this '<' belongs to the open tag.
            if(PushValueData(CParserData::DT_VALUE, start, pos - 1, datastack) != 0)
                return -1;
            // The text is consumed now. Without this, a broken tag right behind
            // it would make the tail handling below record the same text again.
            start = pos;
        }
        if(PreParserLT(html, pos, data) != 0)
            break; // incomplete tag: stop scanning, the tail is handled below
        if(PushTagData(html, data, datastack, que) != 0)
            return -1;
        ++ pos;
        start = pos;
    }

    if(!datastack.empty())
    {
        if(strict)
        {
            THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Tags do NOT match each other.");
        }
        if(pos > start)
        {
            // Ranges are inclusive, so the unconsumed tail ends at pos - 1.
            if(PushValueData(CParserData::DT_BROKEN, start, pos - 1, datastack) != 0)
                return -1;
        }
        if(PreParserBroken(html, datastack, que) != 0)
            return -1;
    }
    return 0;
}
int CDocumentObject::Parser(const std::wstring& html, CDocumentObject::TNodeQueue& que, bool strict)
{
CElementObject *pe = NULL, *pp = NULL, *ps = NULL;
size_t level = 0;
while(que.size()> 0)
{
const TNodeData &node = que.front();
if(level < node.first)
{
pp = pe;
ps = NULL;
}
else if(level == node.first)
{
ps = pe;
}
else//>
{
ps = pe;
pp = pe->parent;
int t = level - node.first;
while(t > 0)
{
ps = ps->parent;
pp = pp->parent;
-- t;
}
}
level = node.first;
pe = MakeElement(html, node, pp, ps);
if(pe == NULL)
return -1;
que.pop_front();
}
if(pp != NULL)
{
while(pp->parent != NULL)
pp = pp->parent;
_root = pp;
}
else
_root = pe;
return 0;
}
// Reads one tag starting at the '<' at `pos`.
//   html - text being scanned.
//   pos  - in: index of '<'; out on success: index of the matching '>'.
//   data - receives the tag range and its type: DT_END when '/' follows the
//          '<', DT_DONE when '/' directly precedes the '>', DT_TAG otherwise.
// Returns 0 on success and -1 when the '<' is the last character, another '<'
// appears before the '>', or the text ends without a '>'.
int CDocumentObject::PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data)
{
    const std::wstring::size_type total = html.size();
    if(pos == total - 1)
        return -1;

    data.tag.first = pos;
    ++ pos;
    if(html[pos] == TAG_SLASH)
    {
        data.type = CParserData::DT_END;
        ++ pos;
    }
    else
    {
        data.type = CParserData::DT_TAG;
    }

    for(; pos < total; ++ pos)
    {
        const wchar_t ch = html[pos];
        if(ch == TAG_GT)
        {
            if(html[pos - 1] == TAG_SLASH)
                data.type = CParserData::DT_DONE;
            data.tag.second = pos;
            return 0;
        }
        if(ch == TAG_LT)
            return -1;
    }
    return -1;
}
// Adds the inclusive text range [start, end] to the value of the tag on top
// of `datastack` and marks that tag as DT_TAG_VALUE.
//   type      - not used by this function.
//   start/end - inclusive range of the value text.
//   datastack - stack of open tags; must not be empty.
// Returns 0; throws CExceptionObject(EN_DOCUMENT_FORMATERROR) when no tag is
// open or the open tag is neither DT_TAG nor DT_TAG_VALUE.
int CDocumentObject::PushValueData(CParserData::DataType type, size_t start, size_t end, TDataStack& datastack) const
{
    if(datastack.empty())
    {
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Miss '<' before got value - pos : " << start);
        return -1;
    }

    CParserData& prev = datastack.top();
    if(prev.type != CParserData::DT_TAG && prev.type != CParserData::DT_TAG_VALUE)
    {
        // The message used to read "Type does match", the opposite of what
        // this branch reports.
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Type does NOT match : " << prev.type << " - pos : " << start);
        return -1;
    }

    prev.value.push_back(std::make_pair(start, end));
    prev.type = CParserData::DT_TAG_VALUE;
    return 0;
}
// Files a freshly scanned tag: start tags are pushed on `datastack`, end tags
// close the tag on top of the stack and move it to `nodeque`, and tags that
// need no end tag go to `nodeque` directly. Tags listed by CheckSpecialTag()
// are first reclassified as DT_DONE.
// Returns 0 on success, -1 when the end tag does not match; throws
// CExceptionObject(EN_DOCUMENT_FORMATERROR) for an end tag without an open
// tag and for any other tag type.
int CDocumentObject::PushTagData(const std::wstring& html, CParserData& data, CDocumentObject::TDataStack& datastack, CDocumentObject::TNodeQueue& nodeque) const
{
    if(CheckSpecialTag(html, data) == 0)
        data.type = CParserData::DT_DONE;

    switch(data.type)
    {
    case CParserData::DT_TAG:
        datastack.push(data);
        return 0;

    case CParserData::DT_DONE:
        // Level = number of tags currently open.
        nodeque.push_front(std::make_pair(datastack.size(), data));
        return 0;

    case CParserData::DT_END:
        break;

    default:
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Wrong tag type : " << data.type << L" - pos : " << data.tag.first);
        return -1;
    }

    // End tag: it has to close the tag on top of the stack.
    if(datastack.empty())
    {
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Miss start-tag before end-tag - pos : " << data.tag.first);
        return -1;
    }
    if(CheckTag(html, datastack.top(), data) != 0)
        return -1;

    nodeque.push_front(std::make_pair(datastack.size() - 1, datastack.top()));
    datastack.pop();
    return 0;
}
int CDocumentObject::PreParserBroken(const std::wstring& html, TDataStack& datastack, TNodeQueue& nodeque) const
{
while(datastack.size() > 0)
{
CParserData& data = datastack.top();
if(data.type == CParserData::DT_TAG || data.type == CParserData::DT_TAG_VALUE)
{
nodeque.push_front(std::make_pair(datastack.size() - 1, data));
datastack.pop();
}
else if(data.type == CParserData::DT_BROKEN)
{
nodeque.push_front(std::make_pair(datastack.size() - 1, data));
datastack.pop();
}
else
{
return -1;
}
}
return 0;
}
// Tells whether the tag is one of IMG, PARAM, BR, HR or P (exact, upper-case
// match on the text up to the first space). Returns 0 if so, -1 otherwise.
int CDocumentObject::CheckSpecialTag(const std::wstring& html, const CParserData& data) const
{
    static const wchar_t* const special[] = { L"IMG", L"PARAM", L"BR", L"HR", L"P" };

    std::wstring name = html.substr(data.tag.first + 1, data.tag.second - data.tag.first - 1);
    const std::wstring::size_type blank = name.find(L" ");
    if(blank != std::wstring::npos)
        name.erase(blank);

    for(size_t i = 0; i < sizeof(special) / sizeof(special[0]); ++ i)
    {
        if(name == special[i])
            return 0;
    }
    return -1;
}
// Verifies that end tag `end` closes start tag `tag`: the start tag's name
// (text up to the first space) must equal the text between "</" and ">".
// Returns 0 on a match; throws CExceptionObject(EN_DOCUMENT_FORMATERROR) otherwise.
int CDocumentObject::CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const
{
    std::wstring opening = html.substr(tag.tag.first + 1, tag.tag.second - tag.tag.first - 1);
    const std::wstring::size_type blank = opening.find(L" ");
    if(blank != std::wstring::npos)
        opening.erase(blank);

    const std::wstring closing = html.substr(end.tag.first + 2, end.tag.second - end.tag.first - 2);
    if(opening == closing)
        return 0;

    THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"do NOT match tag : " << opening << L" and " << closing);
    return -1;
}
// Creates the element for one queue node and links it into the tree.
//   html    - text the node ranges refer to.
//   node    - (level, tag data) pair to convert.
//   parent  - parent element or NULL; its `child` pointer is set to the new element.
//   sibling - stored as the new element's sibling, may be NULL.
// Returns the new element (owned by the caller), or NULL when Analyse() fails.
// Throws CExceptionObject(EN_DOCUMENT_FORMATERROR) for a node type other than
// DT_TAG, DT_DONE or DT_TAG_VALUE; exceptions from Analyse() pass through.
// The parent is only touched after Analyse() succeeded.
CElementObject* CDocumentObject::MakeElement(const std::wstring& html, const CDocumentObject::TNodeData &node, CElementObject *parent, CElementObject *sibling) const
{
    const CParserData& data = node.second;

    ElementType kind = ET_UNKNOWN;
    switch(data.type)
    {
    case CParserData::DT_TAG:
        kind = ET_NODE;
        break;
    case CParserData::DT_DONE:
        kind = ET_TAG;
        break;
    case CParserData::DT_TAG_VALUE:
        kind = ET_ELEMENT;
        break;
    default:
        THROW_EXCEPTION(EN_DOCUMENT_FORMATERROR, L"Wrong Tag Type : " << data.type);
        return NULL;
    }

    std::auto_ptr<CElementObject> ele(new CElementObject);
    ele->level = node.first;
    ele->type = kind;
    ele->tag = html.substr(data.tag.first + 1, data.tag.second - data.tag.first - 1);

    // A '/' directly before '>' is what made the scanner classify the tag as
    // self-closed; it is not part of the tag text. Leaving it in would turn
    // <BR/> into an element named "BR/".
    if(kind == ET_TAG && !ele->tag.empty() && ele->tag[ele->tag.size() - 1] == TAG_SLASH)
        ele->tag.erase(ele->tag.size() - 1);

    if(kind == ET_ELEMENT)
    {
        // The value is the concatenation of all collected (inclusive) ranges.
        for(CParserData::TValueVector::const_iterator it = data.value.begin(); it != data.value.end(); ++ it)
        {
            ele->value += html.substr(it->first, it->second - it->first + 1);
        }
    }

    if(ele->Analyse() != 0)
    {
        return NULL;
    }

    if(parent != NULL)
        parent->child = ele.get();
    ele->parent = parent;
    ele->sibling = sibling;
    return ele.release();
}
// Prints the whole element tree to `os`; prints nothing for an empty document.
void CDocumentObject::Show(std::wostream &os) const
{
    if(_root == NULL)
        return;
    ShowElement(os, _root);
}
void CDocumentObject::ShowElement(std::wostream& os, const CElementObject* e) const
{
const CElementObject* pe = e, *ps = e->sibling;
pe->Show(os);
pe = pe->child;
if(pe != NULL)
{
ShowElement(os, pe);
}
if(ps != NULL)
{
ShowElement(os, ps);
}
}
// Deletes `root`, everything below it and all of its following siblings.
// A NULL `root` is accepted and does nothing.
void CDocumentObject::FreeElement(CElementObject* root)
{
    // Siblings are handled by the loop and only children by recursion, so the
    // recursion depth follows the nesting depth, not the length of a sibling chain.
    while(root != NULL)
    {
        // Read the links before the element is destroyed.
        CElementObject* const child = root->child;
        CElementObject* const next = root->sibling;
        delete root;
        if(child != NULL)
            FreeElement(child);
        root = next;
    }
}
// Starts a document-wide search for `tag` at the root and returns the first
// match, or NULL when the document is empty or nothing matches. The tag and
// the search position are remembered for FindNextElement().
const CElementObject* CDocumentObject::FindFirstElement(const std::wstring &tag)
{
    if(_root == NULL)
        return NULL;

    _findtag = tag;
    for(; !_findstack.empty(); _findstack.pop())
    {
    }
    return FindElement(NULL, _root, _findtag, _findstack);
}
// Continues the search started by FindFirstElement(tag) behind the previous
// hit. Returns NULL when no search position is stored or nothing else matches.
const CElementObject* CDocumentObject::FindNextElement()
{
    if(_findstack.empty())
        return NULL;

    const CElementObject* const last = _findstack.top();
    return FindElement(NULL, last->child, _findtag, _findstack);
}
// Starts a search for `tag` that is limited to `element` and its descendants.
// `tmpstack` is emptied and then used to keep the search position.
// Returns the first match, or NULL (also when `element` is NULL).
const CElementObject* CDocumentObject::FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
{
    if(element == NULL)
        return NULL;

    for(; !tmpstack.empty(); tmpstack.pop())
    {
    }
    return FindElement(element, element, tag, tmpstack);
}
// Continues a subtree search behind the element on top of `tmpstack`.
// Returns the next match, or NULL when `tmpstack` is empty or nothing else matches.
const CElementObject* CDocumentObject::FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
{
    if(tmpstack.empty())
        return NULL;

    const CElementObject* const last = tmpstack.top();
    return FindElement(element, last->child, tag, tmpstack);
}
// Depth-first search for an element named `tag`.
//   root  - element that bounds the search; the walk never backs up past it
//           (NULL = no bound).
//   pe    - element to start from; may be NULL to continue from the stack.
//   stack - path from the search start to the current element; on a hit the
//           found element is on top.
// Returns the found element or NULL.
const CElementObject* CDocumentObject::FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack)
{
    for(;;)
    {
        // Go down along the child links, recording the path.
        for(; pe != NULL; pe = pe->child)
        {
            stack.push(pe);
            if(pe->tag == tag)
                return pe;
        }

        // Dead end: back up until an element with a sibling is found, but
        // never pop the bounding element.
        while(pe == NULL && !stack.empty() && stack.top() != root)
        {
            pe = stack.top()->sibling;
            stack.pop();
        }

        if(pe == NULL)
            return NULL;
    }
}
// Returns the attribute of `element` whose name equals `attr`, or NULL when
// `element` is NULL or has no such attribute.
const CAttributeObject* CDocumentObject::FindAttribute(const CElementObject *element, const std::wstring &attr)
{
    // The element already offers the same list walk.
    return (element != NULL) ? element->FindAttribute(attr) : NULL;
}
}
明天合成到LingosHook中试试,今天就到这里了,累了,而且还要留点时间去看--《The Pacific》。