forked from ange007/HTMLp
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathHTMLp.HTMLParser.pas
295 lines (247 loc) · 8.12 KB
/
HTMLp.HTMLParser.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
unit HTMLp.HtmlParser;
interface
uses
HTMLp.DomCore, HTMLp.HtmlReader, HTMLp.HtmlTags;
type
THTMLParser = class
private
FHtmlDocument: TDocument;
FHtmlReader: THTMLReader;
FCurrentNode: TNode;
FCurrentTag: THTMLTag;
function FindDefParent: TElement;
function FindParent: TElement;
function FindParentElement(tagList: THTMLTagSet): TElement;
function FindTableParent: TElement;
function FindThisElement: TElement;
function GetMainElement(const tagName: WideString): TElement;
procedure ProcessAttributeEnd(Sender: TObject);
procedure ProcessAttributeStart(Sender: TObject);
procedure ProcessCDataSection(Sender: TObject);
procedure ProcessComment(Sender: TObject);
procedure ProcessDocType(Sender: TObject);
procedure ProcessElementEnd(Sender: TObject);
procedure ProcessElementStart(Sender: TObject);
procedure ProcessEndElement(Sender: TObject);
procedure ProcessEntityReference(Sender: TObject);
procedure ProcessScript(Sender: TObject);
procedure ProcessTextNode(Sender: TObject);
public
constructor Create;
destructor Destroy; override;
function ParseString(const htmlStr: WideString): TDocument;
property HTMLDocument: TDocument read FHtmlDocument;
end;
implementation
const
htmlTagName = 'html';
headTagName = 'head';
bodyTagName = 'body';
constructor THTMLParser.Create;
begin
inherited Create;
FHtmlReader := THTMLReader.Create;
with FHtmlReader do
begin
OnAttributeEnd := ProcessAttributeEnd;
OnAttributeStart := ProcessAttributeStart;
OnCDataSection := ProcessCDataSection;
OnComment := ProcessComment;
OnDocType := ProcessDocType;
OnElementEnd := ProcessElementEnd;
OnElementStart := ProcessElementStart;
OnEndElement := ProcessEndElement;
OnEntityReference := ProcessEntityReference;
OnScript := ProcessScript;
//OnNotation := ProcessNotation;
//OnProcessingInstruction := ProcessProcessingInstruction;
OnTextNode := ProcessTextNode;
end
end;
destructor THTMLParser.Destroy;
begin
FHtmlReader.Free;
inherited Destroy;
end;
function THTMLParser.FindDefParent: TElement;
begin
if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHtmlDocument.AppendChild(FHtmlDocument.CreateElement(htmlTagName)) as TElement
else if FCurrentTag.Number in HeadTags then Result := GetMainElement(headTagName)
else Result := GetMainElement(bodyTagName);
end;
function THTMLParser.FindParent: TElement;
begin
if (FCurrentTag.Number = P_TAG) or (FCurrentTag.Number in BlockTags) then Result := FindParentElement(BlockParentTags)
else if FCurrentTag.Number in [LI_TAG] then Result := FindParentElement(ListItemParentTags)
else if FCurrentTag.Number in [DD_TAG, DT_TAG] then Result := FindParentElement(DefItemParentTags)
else if FCurrentTag.Number in [TD_TAG, TH_TAG] then Result := FindParentElement(CellParentTags)
else if FCurrentTag.Number in [TR_TAG] then Result := FindParentElement(RowParentTags)
else if FCurrentTag.Number in [COL_TAG] then Result := FindParentElement(ColParentTags)
else if FCurrentTag.Number in [COLGROUP_TAG, THEAD_TAG, TFOOT_TAG, TBODY_TAG] then Result := FindParentElement(TableSectionParentTags)
else if FCurrentTag.Number in [TABLE_TAG] then Result := FindTableParent
else if FCurrentTag.Number in [OPTION_TAG] then Result := FindParentElement(OptionParentTags)
else if FCurrentTag.Number in [HEAD_TAG, BODY_TAG] then Result := FHtmlDocument.DocumentElement as TElement
else Result := nil;
if Result = nil then Result := FindDefParent;
end;
function THTMLParser.FindParentElement(tagList: THTMLTagSet): TElement;
var
Node: TNode;
HtmlTag: THTMLTag;
begin
Node := FCurrentNode;
while Node.NodeType = ELEMENT_NODE do
begin
HtmlTag := HtmlTagList.GetTagByName(Node.Name);
if HtmlTag.Number in tagList then
begin
Result := Node as TElement;
Exit;
end;
Node := Node.ParentNode;
end;
Result := nil;
end;
function THTMLParser.FindTableParent: TElement;
var
Node: TNode;
HtmlTag: THTMLTag;
begin
Node := FCurrentNode;
while Node.NodeType = ELEMENT_NODE do
begin
HtmlTag := HtmlTagList.GetTagByName(Node.Name);
if (HtmlTag.Number = TD_TAG) or (HtmlTag.Number in BlockTags) then
begin
Result := (Node as TElement);
Exit;
end;
Node := Node.ParentNode;
end;
Result := GetMainElement(bodyTagName);
end;
function THTMLParser.FindThisElement: TElement;
var
Node: TNode;
begin
Node := FCurrentNode;
while Node.NodeType = ELEMENT_NODE do
begin
Result := (Node as TElement);
if Result.TagName = FHtmlReader.Name then Exit;
Node := Node.ParentNode;
end;
Result := nil;
end;
function THTMLParser.GetMainElement(const tagName: WideString): TElement;
var
child: TNode;
I: Integer;
begin
if (FHtmlDocument.DocumentElement = nil) then FHtmlDocument.AppendChild(FHtmlDocument.CreateElement(htmlTagName));
for I := 0 to FHtmlDocument.DocumentElement.ChildNodes.Count - 1 do
begin
child := FHtmlDocument.DocumentElement.ChildNodes.Items[I];
if (child.NodeType = ELEMENT_NODE) and (child.Name = tagName) then
begin
Result := (child as TElement);
Exit
end
end;
Result := FHtmlDocument.CreateElement(tagName);
FHtmlDocument.DocumentElement.AppendChild(Result);
end;
procedure THTMLParser.ProcessAttributeEnd(Sender: TObject);
begin
FCurrentNode := (FCurrentNode as TAttr).OwnerElement;
end;
procedure THTMLParser.ProcessAttributeStart(Sender: TObject);
var
Attr: TAttr;
begin
Attr := FHtmlDocument.CreateAttribute((Sender as THTMLReader).Name);
(FCurrentNode as TElement).SetAttributeNode(Attr);
FCurrentNode := Attr;
end;
procedure THTMLParser.ProcessCDataSection(Sender: TObject);
var
CDataSection: TCDataSection;
begin
CDataSection := FHtmlDocument.CreateCDATASection(FHtmlReader.NodeValue);
FCurrentNode.AppendChild(CDataSection)
end;
procedure THTMLParser.ProcessComment(Sender: TObject);
var
Comment: TComment;
begin
Comment := FHtmlDocument.CreateComment(FHtmlReader.NodeValue);
FCurrentNode.AppendChild(Comment);
end;
procedure THTMLParser.ProcessDocType(Sender: TObject);
begin
with FHtmlReader do FHtmlDocument.Doctype := DomImplementation.CreateDocumentType(Name, PublicID, SystemID);
end;
procedure THTMLParser.ProcessElementEnd(Sender: TObject);
begin
if FHtmlReader.isEmptyElement
or (FCurrentTag.Number in EmptyTags) then FCurrentNode := FCurrentNode.ParentNode;
FCurrentTag := nil;
end;
procedure THTMLParser.ProcessElementStart(Sender: TObject);
var
Element: TElement;
Parent: TNode;
begin
FCurrentTag := HtmlTagList.GetTagByName(FHtmlReader.Name);
if FCurrentTag.Number in (NeedFindParentTags + BlockTags) then
begin
Parent := FindParent;
if not Assigned(Parent) then raise DomException.Create(HIERARCHY_REQUEST_ERR);
FCurrentNode := Parent;
end;
Element := FHtmlDocument.CreateElement(FHtmlReader.Name);
FCurrentNode.AppendChild(Element);
FCurrentNode := Element;
end;
procedure THTMLParser.ProcessEndElement(Sender: TObject);
var
Element: TElement;
begin
Element := FindThisElement;
if Assigned(Element) then FCurrentNode := Element.ParentNode
// else if IsBlockTagName(FHtmlReader.nodeName) then raise DomException.Create(HIERARCHY_REQUEST_ERR);
end;
procedure THTMLParser.ProcessEntityReference(Sender: TObject);
var
EntityReference: TEntityReference;
begin
EntityReference := FHtmlDocument.CreateEntityReference(FHtmlReader.Name);
FCurrentNode.AppendChild(EntityReference);
end;
procedure THtmlParser.ProcessScript(Sender: TObject);
var
Script: TScript;
begin
Script := FHtmlDocument.CreateScript(FHtmlReader.NodeValue);
FCurrentNode.AppendChild(Script);
end;
procedure THtmlParser.ProcessTextNode(Sender: TObject);
var
TextNode: TTextNode;
begin
TextNode := FHtmlDocument.CreateTextNode(FHtmlReader.NodeValue);
FCurrentNode.AppendChild(TextNode);
end;
function THTMLParser.ParseString(const htmlStr: WideString): TDocument;
begin
FHtmlReader.HTMLStr := htmlStr;
FHtmlDocument := DomImplementation.CreateEmptyDocument(nil);
FCurrentNode := FHtmlDocument;
try
while FHtmlReader.Read do;
except
// TODO: Add event ?
end;
Result := FHtmlDocument;
end;
end.