Skip to content
This repository was archived by the owner on Apr 12, 2024. It is now read-only.

fix(ngSanitize): follow HTML parser rules for start tags / allow < in text content #8212

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions src/ngSanitize/sanitize.js
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,11 @@ function sanitizeText(chars) {

// Regular Expressions for parsing tags and attributes
var START_TAG_REGEXP =
/^<\s*([\w:-]+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,
END_TAG_REGEXP = /^<\s*\/\s*([\w:-]+)[^>]*>/,
/^<((?:[a-zA-Z])[\w:-]*)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*(>?)/,
END_TAG_REGEXP = /^<\/\s*([\w:-]+)[^>]*>/,
ATTR_REGEXP = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:[^"])*)")|(?:'((?:[^'])*)')|([^>\s]+)))?/g,
BEGIN_TAG_REGEXP = /^</,
BEGING_END_TAGE_REGEXP = /^<\s*\//,
BEGING_END_TAGE_REGEXP = /^<\//,
COMMENT_REGEXP = /<!--(.*?)-->/g,
DOCTYPE_REGEXP = /<!DOCTYPE([^>]*?)>/i,
CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g,
Expand Down Expand Up @@ -232,10 +232,11 @@ function makeMap(str) {
* @param {object} handler
*/
function htmlParser( html, handler ) {
var index, chars, match, stack = [], last = html;
var index, chars, match, stack = [], last = html, text;
stack.last = function() { return stack[ stack.length - 1 ]; };

while ( html ) {
text = '';
chars = true;

// Make sure we're not in a script or style element
Expand Down Expand Up @@ -274,16 +275,23 @@ function htmlParser( html, handler ) {
match = html.match( START_TAG_REGEXP );

if ( match ) {
html = html.substring( match[0].length );
match[0].replace( START_TAG_REGEXP, parseStartTag );
// We only have a valid start-tag if there is a '>'.
if ( match[4] ) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/cc @IgorMinar PTAL --- This particular block is only here to make sure that we throw if we find an apparent start-tag without a trailing >

This might not be the right thing to do --- if we don't have a trailing >, we could potentially just treat it as a text node. I'm not sure what the best thing to do in this case is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to treat as a text node. IMO the sanitizer should be secure but tolerant

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's fine.

html = html.substring( match[0].length );
match[0].replace( START_TAG_REGEXP, parseStartTag );
}
chars = false;
} else {
// no ending tag found --- this piece should be encoded as an entity.
text += '<';
html = html.substring(1);
}
}

if ( chars ) {
index = html.indexOf("<");

var text = index < 0 ? html : html.substring( 0, index );
text += index < 0 ? html : html.substring( 0, index );
html = index < 0 ? "" : html.substring( index );

if (handler.chars) handler.chars( decodeEntities(text) );
Expand Down
49 changes: 41 additions & 8 deletions test/ngSanitize/sanitizeSpec.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ describe('HTML', function() {

var handler, start, text, comment;
beforeEach(function() {
text = "";
handler = {
start: function(tag, attrs, unary){
start = {
Expand All @@ -35,7 +36,7 @@ describe('HTML', function() {
});
},
chars: function(text_){
text = text_;
text += text_;
},
end:function(tag) {
expect(tag).toEqual(start.tag);
Expand Down Expand Up @@ -81,8 +82,31 @@ describe('HTML', function() {
expect(text).toEqual('text');
});

it('should not treat "<" followed by a non-/ or non-letter as a tag', function() {
expectHTML('<- text1 text2 <1 text1 text2 <{', handler).
toBe('&lt;- text1 text2 &lt;1 text1 text2 &lt;{');
});

it('should throw badparse if text content contains "<" followed by "/" without matching ">"', function() {
expect(function() {
htmlParser('foo </ bar', handler);
}).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: </ bar');
});

it('should throw badparse if text content contains "<" followed by an ASCII letter without matching ">"', function() {
expect(function() {
htmlParser('foo <a bar', handler);
}).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: <a bar');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I this really a bad text string? I would let it go as a text block. For instance:

In my math project I found that a<b when b=10

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as HTML parsing is concerned, /</[a-zA-Z/ is the start of a tag, so we shouldn't "fix" this, I think

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although arguably we are not trying to "parse" html here, only sanitize text that may be inadvertently parsed by a browser later

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this is right. we shouldn't try to fix broken html.

});

it('should accept tag delimiters such as "<" inside real tags', function() {
// Assert that the < is part of the text node content, and not part of a tag name.
htmlParser('<p> 10 < 100 </p>', handler);
expect(text).toEqual(' 10 < 100 ');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this < be encoded just to be safe?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is encoded in the real world, however in the test, the chars handler just appends the value to a string

});

it('should parse newlines in tags', function() {
htmlParser('<\ntag\n attr="value"\n>text<\n/\ntag\n>', handler);
htmlParser('<tag\n attr="value"\n>text</\ntag\n>', handler);
expect(start).toEqual({tag:'tag', attrs:{attr:'value'}, unary:false});
expect(text).toEqual('text');
});
Expand Down Expand Up @@ -123,8 +147,9 @@ describe('HTML', function() {
expectHTML('a<!DocTyPe html>c.').toEqual('ac.');
});

it('should remove nested script', function() {
expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').toEqual('ac.');
it('should escape non-start tags', function() {
expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').
toBe('a&lt; SCRIPT &gt;A&lt; SCRIPT &gt;evil&lt; / scrIpt &gt;B&lt; / scrIpt &gt;c.');
});

it('should remove attrs', function() {
Expand Down Expand Up @@ -165,14 +190,16 @@ describe('HTML', function() {
expectHTML(everything).toEqual(everything);
});

it('should handle improper html', function() {
it('should mangle improper html', function() {
// This text is encoded more than a real HTML parser would, but it should render the same.
expectHTML('< div rel="</div>" alt=abc dir=\'"\' >text< /div>').
toEqual('<div rel="&lt;/div&gt;" alt="abc" dir="&#34;">text</div>');
toBe('&lt; div rel=&#34;&#34; alt=abc dir=\'&#34;\' &gt;text&lt; /div&gt;');
});

it('should handle improper html2', function() {
it('should mangle improper html2', function() {
// A proper HTML parser would clobber this more in most cases, but it looks reasonable.
expectHTML('< div rel="</div>" / >').
toEqual('<div rel="&lt;/div&gt;"/>');
toBe('&lt; div rel=&#34;&#34; / &gt;');
});

it('should ignore back slash as escape', function() {
Expand All @@ -195,6 +222,12 @@ describe('HTML', function() {
expectHTML('\na\n').toEqual('&#10;a&#10;');
});

it('should accept tag delimiters such as "<" inside real tags (with nesting)', function() {
//this is an integrated version of the 'should accept tag delimiters such as "<" inside real tags' test
expectHTML('<p> 10 < <span>100</span> </p>')
.toEqual('<p> 10 &lt; <span>100</span> </p>');
});

describe('htmlSanitizerWriter', function() {
/* global htmlSanitizeWriter: false */
if (angular.isUndefined(window.htmlSanitizeWriter)) return;
Expand Down