Skip to content

Commit

Permalink
Added HtmlTokenizer.IgnoreTruncatedTags option
Browse files Browse the repository at this point in the history
  • Loading branch information
jstedfast committed Mar 25, 2020
1 parent 9428ba6 commit ec07b51
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 29 deletions.
72 changes: 45 additions & 27 deletions HtmlKit/HtmlTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ public HtmlNamespace HtmlNamespace {
get; private set;
}

/// <summary>
/// Get or set whether or not the tokenizer should ignore truncated tags.
/// </summary>
/// <remarks>
/// <para>Gets or sets whether or not the tokenizer should ignore truncated tags.</para>
/// <para>If <c>false</c> and the stream abruptly ends in the middle of an HTML tag, the text of
/// the truncated tag will be treated as an <see cref="HtmlDataToken"/> instead.</para>
/// <para>If <c>true</c>, the text of the truncated tag is discarded and no token is emitted
/// for it.</para>
/// </remarks>
/// <value><c>true</c> if truncated tags should be ignored; otherwise, <c>false</c>.</value>
public bool IgnoreTruncatedTags {
get; set;
}

/// <summary>
/// Gets the current line number.
/// </summary>
Expand Down Expand Up @@ -319,11 +332,16 @@ HtmlToken EmitDocType ()
return token;
}

HtmlToken EmitDataToken (bool encodeEntities)
HtmlToken EmitDataToken (bool encodeEntities, bool truncated)
{
if (data.Length == 0)
return null;

if (truncated && IgnoreTruncatedTags) {
data.Length = 0;
return null;
}

var token = CreateDataToken (data.ToString ());
token.EncodeEntities = encodeEntities;
data.Length = 0;
Expand Down Expand Up @@ -413,7 +431,7 @@ HtmlToken ReadCharacterReference (HtmlTokenizerState next)
TokenizerState = HtmlTokenizerState.EndOfFile;
data.Append ('&');

return EmitDataToken (true);
return EmitDataToken (true, false);
}

c = (char) nc;
Expand All @@ -439,7 +457,7 @@ HtmlToken ReadCharacterReference (HtmlTokenizerState next)
data.Append (entity.GetPushedInput ());
entity.Reset ();

return EmitDataToken (true);
return EmitDataToken (true, false);
}

c = (char) nc;
Expand Down Expand Up @@ -481,7 +499,7 @@ HtmlToken ReadGenericRawTextEndTagOpen (bool decoded, HtmlTokenizerState rawText

if (nc == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (decoded);
return EmitDataToken (decoded, true);
}

c = (char) nc;
Expand Down Expand Up @@ -510,7 +528,7 @@ HtmlToken ReadGenericRawTextEndTagName (bool decoded, HtmlTokenizerState rawText
TokenizerState = HtmlTokenizerState.EndOfFile;
name.Length = 0;

return EmitDataToken (decoded);
return EmitDataToken (decoded, true);
}

c = (char) nc;
Expand Down Expand Up @@ -590,7 +608,7 @@ HtmlToken ReadData ()
}
} while (TokenizerState == HtmlTokenizerState.Data);

return EmitDataToken (DecodeCharacterReferences);
return EmitDataToken (DecodeCharacterReferences, false);
}

// 8.2.4.2 Character reference in data state
Expand Down Expand Up @@ -623,14 +641,14 @@ HtmlToken ReadRcData ()
goto default;
case '<':
TokenizerState = HtmlTokenizerState.RcDataLessThan;
return EmitDataToken (DecodeCharacterReferences);
return EmitDataToken (DecodeCharacterReferences, false);
default:
data.Append (c == '\0' ? '\uFFFD' : c);
break;
}
} while (TokenizerState == HtmlTokenizerState.RcData);

return EmitDataToken (DecodeCharacterReferences);
return EmitDataToken (DecodeCharacterReferences, false);
}

// 8.2.4.4 Character reference in RCDATA state
Expand All @@ -656,14 +674,14 @@ HtmlToken ReadRawText ()
switch (c) {
case '<':
TokenizerState = HtmlTokenizerState.RawTextLessThan;
return EmitDataToken (false);
return EmitDataToken (false, false);
default:
data.Append (c == '\0' ? '\uFFFD' : c);
break;
}
} while (TokenizerState == HtmlTokenizerState.RawText);

return EmitDataToken (false);
return EmitDataToken (false, false);
}

// 8.2.4.6 Script data state
Expand Down Expand Up @@ -707,7 +725,7 @@ HtmlToken ReadPlainText ()

TokenizerState = HtmlTokenizerState.EndOfFile;

return EmitDataToken (false);
return EmitDataToken (false, false);
}

// 8.2.4.8 Tag open state
Expand All @@ -717,8 +735,8 @@ HtmlToken ReadTagOpen ()
char c;

if (nc == -1) {
var token = IgnoreTruncatedTags ? null : CreateDataToken ("<");
TokenizerState = HtmlTokenizerState.EndOfFile;
var token = CreateDataToken ("<");
return token;
}

Expand Down Expand Up @@ -762,7 +780,7 @@ HtmlToken ReadEndTagOpen ()

if (nc == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -802,7 +820,7 @@ HtmlToken ReadTagName ()
TokenizerState = HtmlTokenizerState.EndOfFile;
name.Length = 0;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1421,7 +1439,7 @@ HtmlToken ReadBeforeAttributeName ()
TokenizerState = HtmlTokenizerState.EndOfFile;
tag = null;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1460,7 +1478,7 @@ HtmlToken ReadAttributeName ()
name.Length = 0;
tag = null;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1504,7 +1522,7 @@ HtmlToken ReadAfterAttributeName ()
TokenizerState = HtmlTokenizerState.EndOfFile;
tag = null;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1545,7 +1563,7 @@ HtmlToken ReadBeforeAttributeValue ()
TokenizerState = HtmlTokenizerState.EndOfFile;
tag = null;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1590,7 +1608,7 @@ HtmlToken ReadAttributeValueQuoted ()
TokenizerState = HtmlTokenizerState.EndOfFile;
name.Length = 0;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1631,7 +1649,7 @@ HtmlToken ReadAttributeValueUnquoted ()
TokenizerState = HtmlTokenizerState.EndOfFile;
name.Length = 0;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1677,7 +1695,7 @@ HtmlToken ReadCharacterReferenceInAttributeValue ()
TokenizerState = HtmlTokenizerState.EndOfFile;
name.Length = 0;

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1708,7 +1726,7 @@ HtmlToken ReadCharacterReferenceInAttributeValue ()
data.Append (entity.GetPushedInput ());
entity.Reset ();

return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1747,7 +1765,7 @@ HtmlToken ReadAfterAttributeValueQuoted ()

if (nc == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1787,7 +1805,7 @@ HtmlToken ReadSelfClosingStartTag ()

if (nc == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1839,7 +1857,7 @@ HtmlToken ReadMarkupDeclarationOpen ()
while (count < 2) {
if ((nc = Peek ()) == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

if ((c = (char) nc) != '-')
Expand Down Expand Up @@ -1870,7 +1888,7 @@ HtmlToken ReadMarkupDeclarationOpen ()
while (count < 7) {
if ((nc = Read ()) == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down Expand Up @@ -1902,7 +1920,7 @@ HtmlToken ReadMarkupDeclarationOpen ()
while (count < 7) {
if ((nc = Read ()) == -1) {
TokenizerState = HtmlTokenizerState.EndOfFile;
return EmitDataToken (false);
return EmitDataToken (false, true);
}

c = (char) nc;
Expand Down
Loading

0 comments on commit ec07b51

Please sign in to comment.