Update to Nokogumbo 2.0

rgrove · Oct 10, 2018 · e3dab48 · e3dab48
1 parent 9deaf67
commit e3dab48
Show file tree

Hide file tree

Showing 9 changed files with 35 additions and 77 deletions.
diff --git a/lib/sanitize.rb b/lib/sanitize.rb
@@ -121,19 +121,7 @@ def fragment(html)
     return '' unless html
 
     html = preprocess(html)
-    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
-
-    # Hack to allow fragments containing <body>. Borrowed from
-    # Nokogiri::HTML::DocumentFragment.
-    if html =~ /\A<body(?:\s|>)/i
-      path = '/html/body'
-    else
-      path = '/html/body/node()'
-    end
-
-    frag = doc.fragment
-    frag << doc.xpath(path)
-
+    frag  = Nokogiri::HTML5.fragment(html)
     node!(frag)
     to_html(frag)
   end
@@ -184,37 +172,7 @@ def preprocess(html)
   end
 
   def to_html(node)
-    replace_meta = false
-
-    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
-    # meta tag to all serialized HTML documents.
-    #
-    # https://github.com/sparklemotion/nokogiri/issues/1008
-    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
-        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
-
-      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
-
-      # Only replace the content-type meta tag if <meta> isn't whitelisted or
-      # the original document didn't actually include a content-type meta tag.
-      replace_meta = !@config[:elements].include?('meta') ||
-        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
-          meta['http-equiv'].casecmp('content-type').zero?
-        end
-    end
-
-    so = Nokogiri::XML::Node::SaveOptions
-
-    # Serialize to HTML without any formatting to prevent Nokogiri from adding
-    # newlines after certain tags.
-    html = node.to_html(
-      :encoding  => 'utf-8',
-      :indent    => 0,
-      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
-    )
-
-    html.gsub!(regex_meta, '\1') if replace_meta
-    html
+    node.to_html(preserve_newline: true)
   end
 
   def transform_node!(node, node_whitelist)

diff --git a/sanitize.gemspec b/sanitize.gemspec
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
   # Runtime dependencies.
   s.add_dependency('crass', '~> 1.0.2')
   s.add_dependency('nokogiri', '>= 1.4.4')
-  s.add_dependency('nokogumbo', '~> 1.4')
+  s.add_dependency('nokogumbo', '~> 2.0')
 
   # Development dependencies.
   s.add_development_dependency('minitest', '~> 5.10.2')

diff --git a/test/test_clean_css.rb b/test/test_clean_css.rb
@@ -13,7 +13,7 @@
     @s.fragment(%[
       <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
     ].strip).must_equal %[
-      <div style="color: #fff;  /* &lt;-- evil! */"></div>
+      <div style="color: #fff;  /* <-- evil! */"></div>
     ].strip
   end
 

diff --git a/test/test_clean_doctype.rb b/test/test_clean_doctype.rb
@@ -11,7 +11,7 @@
     end
 
     it 'should remove doctype declarations' do
-      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
+      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
       @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
     end
 
@@ -34,27 +34,27 @@
 
     it 'should allow doctype declarations in documents' do
       @s.document('<!DOCTYPE html><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
 
       @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html>foo</html>"
 
       @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"
     end
 
     it 'should not allow obviously invalid doctype declarations in documents' do
       @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
 
       @s.document('<!DOCTYPE blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
 
       @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
 
       @s.document('<!whatever><html>foo</html>')
-        .must_equal "<html>foo</html>\n"
+        .must_equal "<html>foo</html>"
     end
 
     it 'should not allow doctype definitions in fragments' do

diff --git a/test/test_clean_element.rb b/test/test_clean_element.rb
@@ -21,7 +21,7 @@
       :default    => 'Lorem dolor sit amet alert("hello world");',
       :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
       :basic      => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
-      :relaxed    => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
+      :relaxed    => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
     },
 
     :unclosed => {
@@ -234,7 +234,7 @@
 
     it 'should not choke on valueless attributes' do
       @s.fragment('foo <a href>foo</a> bar')
-        .must_equal 'foo <a href rel="nofollow">foo</a> bar'
+        .must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
     end
 
     it 'should downcase attribute names' do
@@ -262,7 +262,7 @@
 
     it 'should encode special chars in attribute values' do
       @s.fragment('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>')
-        .must_equal '<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>'
+        .must_equal '<a href="http://example.com" title="<b>éxamples</b> &amp; things">foo</a>'
     end
 
     strings.each do |name, data|

diff --git a/test/test_malicious_html.rb b/test/test_malicious_html.rb
@@ -43,7 +43,7 @@
   describe '<body>' do
     it 'should not be possible to inject JS via a malformed event attribute' do
       @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
-        must_equal "<html><head></head><body></body></html>\n"
+        must_equal "<html><head></head><body></body></html>"
     end
   end
 

diff --git a/test/test_parser.rb b/test/test_parser.rb
@@ -32,29 +32,29 @@
   it 'should work around the libxml2 content-type meta tag bug' do
     Sanitize.document('<html><head></head><body>Howdy!</body></html>',
       :elements => %w[html head body]
-    ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
+    ).must_equal "<html><head></head><body>Howdy!</body></html>"
 
     Sanitize.document('<html><head></head><body>Howdy!</body></html>',
       :elements => %w[html head meta body]
-    ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
+    ).must_equal "<html><head></head><body>Howdy!</body></html>"
 
     Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
       :elements   => %w[html head meta body],
       :attributes => {'meta' => ['charset']}
-    ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
+    ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>"
 
     Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
       :elements   => %w[html head meta body],
       :attributes => {'meta' => %w[charset content http-equiv]}
-    ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
+    ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>"
 
     # Edge case: an existing content-type meta tag with a non-UTF-8 content type
     # will be converted to UTF-8, since that's the only output encoding we
     # support.
     Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
       :elements   => %w[html head meta body],
       :attributes => {'meta' => %w[charset content http-equiv]}
-    ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
+    ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>"
   end
 
   describe 'when siblings are added after a node during traversal' do

diff --git a/test/test_sanitize.rb b/test/test_sanitize.rb
@@ -25,7 +25,7 @@
 
       it 'should sanitize an HTML document' do
         @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
-          .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
+          .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>"
       end
 
       it 'should not modify the input string' do
@@ -35,7 +35,7 @@
       end
 
       it 'should not choke on frozen documents' do
-        @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
+        @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
       end
     end
 

diff --git a/test/test_unicode.rb b/test/test_unicode.rb
@@ -23,61 +23,61 @@
     end
 
     it 'should strip deprecated grave and acute clones' do
-      @s.document("a\u0340b\u0341c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u0340b\u0341c").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u0340b\u0341c").must_equal 'abc'
     end
 
     it 'should strip deprecated Khmer characters' do
-      @s.document("a\u17a3b\u17d3c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u17a3b\u17d3c").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u17a3b\u17d3c").must_equal 'abc'
     end
 
     it 'should strip line and paragraph separator punctuation' do
-      @s.document("a\u2028b\u2029c").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u2028b\u2029c").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u2028b\u2029c").must_equal 'abc'
     end
 
     it 'should strip bidi embedding control characters' do
       @s.document("a\u202ab\u202bc\u202cd\u202de\u202e")
-        .must_equal "<html><head></head><body>abcde</body></html>\n"
+        .must_equal "<html><head></head><body>abcde</body></html>"
 
       @s.fragment("a\u202ab\u202bc\u202cd\u202de\u202e")
         .must_equal 'abcde'
     end
 
     it 'should strip deprecated symmetric swapping characters' do
-      @s.document("a\u206ab\u206bc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u206ab\u206bc").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u206ab\u206bc").must_equal 'abc'
     end
 
     it 'should strip deprecated Arabic form shaping characters' do
-      @s.document("a\u206cb\u206dc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u206cb\u206dc").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u206cb\u206dc").must_equal 'abc'
     end
 
     it 'should strip deprecated National digit shape characters' do
-      @s.document("a\u206eb\u206fc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\u206eb\u206fc").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\u206eb\u206fc").must_equal 'abc'
     end
 
     it 'should strip interlinear annotation characters' do
-      @s.document("a\ufff9b\ufffac\ufffb").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\ufff9b\ufffac\ufffb").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\ufff9b\ufffac\ufffb").must_equal 'abc'
     end
 
     it 'should strip BOM/zero-width non-breaking space characters' do
-      @s.document("a\ufeffbc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\ufeffbc").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\ufeffbc").must_equal 'abc'
     end
 
     it 'should strip object replacement characters' do
-      @s.document("a\ufffcbc").must_equal "<html><head></head><body>abc</body></html>\n"
+      @s.document("a\ufffcbc").must_equal "<html><head></head><body>abc</body></html>"
       @s.fragment("a\ufffcbc").must_equal 'abc'
     end
 
     it 'should strip musical notation scoping characters' do
       @s.document("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
-        .must_equal "<html><head></head><body>abcdefgh</body></html>\n"
+        .must_equal "<html><head></head><body>abcdefgh</body></html>"
 
       @s.fragment("a\u{1d173}b\u{1d174}c\u{1d175}d\u{1d176}e\u{1d177}f\u{1d178}g\u{1d179}h\u{1d17a}")
         .must_equal 'abcdefgh'
@@ -88,7 +88,7 @@
       (0xE0000..0xE007F).each {|n| str << [n].pack('U') }
       str << 'b'
 
-      @s.document(str).must_equal "<html><head></head><body>ab</body></html>\n"
+      @s.document(str).must_equal "<html><head></head><body>ab</body></html>"
       @s.fragment(str).must_equal 'ab'
     end
   end