Skip to content

Commit

Permalink
Update to Nokogumbo 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
stevecheckoway committed Oct 10, 2018
1 parent 9deaf67 commit e3dab48
Show file tree
Hide file tree
Showing 9 changed files with 35 additions and 77 deletions.
46 changes: 2 additions & 44 deletions lib/sanitize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -121,19 +121,7 @@ def fragment(html)
return '' unless html

html = preprocess(html)
doc = Nokogiri::HTML5.parse("<html><body>#{html}")

# Hack to allow fragments containing <body>. Borrowed from
# Nokogiri::HTML::DocumentFragment.
if html =~ /\A<body(?:\s|>)/i
path = '/html/body'
else
path = '/html/body/node()'
end

frag = doc.fragment
frag << doc.xpath(path)

frag = Nokogiri::HTML5.fragment(html)
node!(frag)
to_html(frag)
end
Expand Down Expand Up @@ -184,37 +172,7 @@ def preprocess(html)
end

def to_html(node)
replace_meta = false

# Hacky workaround for a libxml2 bug that adds an undesired Content-Type
# meta tag to all serialized HTML documents.
#
# https://github.com/sparklemotion/nokogiri/issues/1008
if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

# Only replace the content-type meta tag if <meta> isn't whitelisted or
# the original document didn't actually include a content-type meta tag.
replace_meta = !@config[:elements].include?('meta') ||
node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
meta['http-equiv'].casecmp('content-type').zero?
end
end

so = Nokogiri::XML::Node::SaveOptions

# Serialize to HTML without any formatting to prevent Nokogiri from adding
# newlines after certain tags.
html = node.to_html(
:encoding => 'utf-8',
:indent => 0,
:save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
)

html.gsub!(regex_meta, '\1') if replace_meta
html
node.to_html(preserve_newline: true)
end

def transform_node!(node, node_whitelist)
Expand Down
2 changes: 1 addition & 1 deletion sanitize.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Gem::Specification.new do |s|
# Runtime dependencies.
s.add_dependency('crass', '~> 1.0.2')
s.add_dependency('nokogiri', '>= 1.4.4')
s.add_dependency('nokogumbo', '~> 1.4')
s.add_dependency('nokogumbo', '~> 2.0')

# Development dependencies.
s.add_development_dependency('minitest', '~> 5.10.2')
Expand Down
2 changes: 1 addition & 1 deletion test/test_clean_css.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@s.fragment(%[
<div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
].strip).must_equal %[
<div style="color: #fff; /* &lt;-- evil! */"></div>
<div style="color: #fff; /* <-- evil! */"></div>
].strip
end

Expand Down
16 changes: 8 additions & 8 deletions test/test_clean_doctype.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
end

it 'should remove doctype declarations' do
@s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
@s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
@s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
end

Expand All @@ -34,27 +34,27 @@

it 'should allow doctype declarations in documents' do
@s.document('<!DOCTYPE html><html>foo</html>')
.must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html><html>foo</html>"

@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
.must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html>foo</html>"

@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
.must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"
end

it 'should not allow obviously invalid doctype declarations in documents' do
@s.document('<!DOCTYPE blah blah blah><html>foo</html>')
.must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html><html>foo</html>"

@s.document('<!DOCTYPE blah><html>foo</html>')
.must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html><html>foo</html>"

@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
.must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
.must_equal "<!DOCTYPE html><html>foo</html>"

@s.document('<!whatever><html>foo</html>')
.must_equal "<html>foo</html>\n"
.must_equal "<html>foo</html>"
end

it 'should not allow doctype definitions in fragments' do
Expand Down
6 changes: 3 additions & 3 deletions test/test_clean_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
:default => 'Lorem dolor sit amet alert("hello world");',
:restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
:basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
:relaxed => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
:relaxed => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
},

:unclosed => {
Expand Down Expand Up @@ -234,7 +234,7 @@

it 'should not choke on valueless attributes' do
@s.fragment('foo <a href>foo</a> bar')
.must_equal 'foo <a href rel="nofollow">foo</a> bar'
.must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
end

it 'should downcase attribute names' do
Expand Down Expand Up @@ -262,7 +262,7 @@

it 'should encode special chars in attribute values' do
@s.fragment('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>')
.must_equal '<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>'
.must_equal '<a href="http://example.com" title="<b>éxamples</b> &amp; things">foo</a>'
end

strings.each do |name, data|
Expand Down
2 changes: 1 addition & 1 deletion test/test_malicious_html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
describe '<body>' do
it 'should not be possible to inject JS via a malformed event attribute' do
@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
must_equal "<html><head></head><body></body></html>\n"
must_equal "<html><head></head><body></body></html>"
end
end

Expand Down
10 changes: 5 additions & 5 deletions test/test_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,29 @@
it 'should work around the libxml2 content-type meta tag bug' do
Sanitize.document('<html><head></head><body>Howdy!</body></html>',
:elements => %w[html head body]
).must_equal "<html><head></head><body>Howdy!</body></html>\n"
).must_equal "<html><head></head><body>Howdy!</body></html>"

Sanitize.document('<html><head></head><body>Howdy!</body></html>',
:elements => %w[html head meta body]
).must_equal "<html><head></head><body>Howdy!</body></html>\n"
).must_equal "<html><head></head><body>Howdy!</body></html>"

Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
:elements => %w[html head meta body],
:attributes => {'meta' => ['charset']}
).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>"

Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
:elements => %w[html head meta body],
:attributes => {'meta' => %w[charset content http-equiv]}
).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>"

# Edge case: an existing content-type meta tag with a non-UTF-8 content type
# will be converted to UTF-8, since that's the only output encoding we
# support.
Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
:elements => %w[html head meta body],
:attributes => {'meta' => %w[charset content http-equiv]}
).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>"
end

describe 'when siblings are added after a node during traversal' do
Expand Down
4 changes: 2 additions & 2 deletions test/test_sanitize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

it 'should sanitize an HTML document' do
@s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
.must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
.must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>"
end

it 'should not modify the input string' do
Expand All @@ -35,7 +35,7 @@
end

it 'should not choke on frozen documents' do
@s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
@s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
end
end

Expand Down
24 changes: 12 additions & 12 deletions test/test_unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,61 +23,61 @@
end

it 'should strip deprecated grave and acute clones' do
@s.document("a\u0340b\u0341c").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u0340b\u0341c").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u0340b\u0341c").must_equal 'abc'
end

it 'should strip deprecated Khmer characters' do
@s.document("a\u17a3b\u17d3c").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u17a3b\u17d3c").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u17a3b\u17d3c").must_equal 'abc'
end

it 'should strip line and paragraph separator punctuation' do
@s.document("a\u2028b\u2029c").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u2028b\u2029c").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u2028b\u2029c").must_equal 'abc'
end

it 'should strip bidi embedding control characters' do
@s.document("a\u202ab\u202bc\u202cd\u202de\u202e")
.must_equal "<html><head></head><body>abcde</body></html>\n"
.must_equal "<html><head></head><body>abcde</body></html>"

@s.fragment("a\u202ab\u202bc\u202cd\u202de\u202e")
.must_equal 'abcde'
end

it 'should strip deprecated symmetric swapping characters' do
@s.document("a\u206ab\u206bc").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u206ab\u206bc").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u206ab\u206bc").must_equal 'abc'
end

it 'should strip deprecated Arabic form shaping characters' do
@s.document("a\u206cb\u206dc").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u206cb\u206dc").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u206cb\u206dc").must_equal 'abc'
end

it 'should strip deprecated National digit shape characters' do
@s.document("a\u206eb\u206fc").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\u206eb\u206fc").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\u206eb\u206fc").must_equal 'abc'
end

it 'should strip interlinear annotation characters' do
@s.document("a\ufff9b\ufffac\ufffb").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\ufff9b\ufffac\ufffb").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\ufff9b\ufffac\ufffb").must_equal 'abc'
end

it 'should strip BOM/zero-width non-breaking space characters' do
@s.document("a\ufeffbc").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\ufeffbc").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\ufeffbc").must_equal 'abc'
end

it 'should strip object replacement characters' do
@s.document("a\ufffcbc").must_equal "<html><head></head><body>abc</body></html>\n"
@s.document("a\ufffcbc").must_equal "<html><head></head><body>abc</body></html>"
@s.fragment("a\ufffcbc").must_equal 'abc'
end

it 'should strip musical notation scoping characters' do
@s.document("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
.must_equal "<html><head></head><body>abcdefgh</body></html>\n"
.must_equal "<html><head></head><body>abcdefgh</body></html>"

@s.fragment("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
.must_equal 'abcdefgh'
Expand All @@ -88,7 +88,7 @@
(0xE0000..0xE007F).each {|n| str << [n].pack('U') }
str << 'b'

@s.document(str).must_equal "<html><head></head><body>ab</body></html>\n"
@s.document(str).must_equal "<html><head></head><body>ab</body></html>"
@s.fragment(str).must_equal 'ab'
end
end
Expand Down

0 comments on commit e3dab48

Please sign in to comment.