Skip to content

Commit

Permalink
fix: serializing an individual attribute in an HTML5 document
Browse files Browse the repository at this point in the history
Previously an exception would be raised, either "Unexpected node" or
"Unsupported document node (2)" depending on the version of Nokogiri.

Fixes #3125
  • Loading branch information
flavorjones committed Feb 6, 2024
1 parent 04ecb5b commit 5ba752c
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 90 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
### Added

* [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. (@flavorjones)
* [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones


### Fixed
Expand Down
29 changes: 18 additions & 11 deletions ext/nokogiri/xml_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1875,17 +1875,7 @@ output_node(
// Add attributes.
for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) {
output_char(out, ' ');
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
output_node(out, (xmlNodePtr)attr, preserve_newline);
}
output_char(out, '>');

Expand All @@ -1903,6 +1893,23 @@ output_node(
}
break;

case XML_ATTRIBUTE_NODE:
{
xmlAttrPtr attr = (xmlAttrPtr)node;
output_attr_name(out, attr);
if (attr->children) {
output_string(out, "=\"");
xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1);
output_escaped_string(out, value, true);
xmlFree(value);
output_char(out, '"');
} else {
// Output name=""
output_string(out, "=\"\"");
}
}
break;

case XML_TEXT_NODE:
if (node->parent
&& is_one_of(node->parent, UNESCAPED_TEXT_ELEMENTS,
Expand Down
175 changes: 96 additions & 79 deletions test/html4/test_attributes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,90 +2,107 @@

require "helper"

module Nokogiri
module HTML
class TestAttr < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?
class TestHTML4Attributes < Nokogiri::TestCase
#
# libxml2 >= 2.9.2 fails to escape comments within some attributes. It
# wants to ensure these comments can be treated as "server-side includes",
# but as a result fails to ensure that serialization is well-formed,
# resulting in an opportunity for XSS injection of code into a final
# re-parsed document (presumably in a browser).
#
# the offending commit is:
#
# https://github.com/GNOME/libxml2/commit/960f0e2
#
# we'll test this by parsing the HTML, serializing it, then
# re-parsing it to ensure there isn't any ambiguity in the output
# that might allow code injection into a browser consuming
# "sanitized" output.
#
# complaints have been made upstream about this behavior, notably at
#
# https://bugzilla.gnome.org/show_bug.cgi?id=769760
#
# and multiple CVEs have been declared and fixed in downstream
# libraries as a result, a list is being kept up to date here:
#
# https://github.com/flavorjones/loofah/issues/144
#
[
#
# these tags and attributes are determined by the code at:
#
# https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
#
{ tag: "a", attr: "href" },
{ tag: "div", attr: "href" },
{ tag: "a", attr: "action" },
{ tag: "div", attr: "action" },
{ tag: "a", attr: "src" },
{ tag: "div", attr: "src" },
{ tag: "a", attr: "name" },
#
# note that div+name is _not_ affected by the libxml2 issue.
# but we test it anyway to ensure our logic isn't modifying
# attributes that don't need modifying.
#
{ tag: "div", attr: "name", unescaped: true },
].each do |config|
define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system?

html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}
html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}

reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes
reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes

assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
assert_equal [config[:attr]], attributes.collect(&:name)
if Nokogiri::VersionInfo.instance.libxml2?
if config[:unescaped]
#
# this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
# assert that this attribute's serialization is unaffected.
#
assert_equal %{examp<!--" unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
elsif Nokogiri.uses_libxml?(">= 2.11.0")
#
# as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped.
#
assert_equal %{examp<!--"%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
else
#
# let's match the behavior in libxml < 2.9.2.
# test that this attribute's serialization is well-formed and sanitized.
#
assert_equal %{examp<!--%22%20unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
else
#
# yay for consistency in javaland. move along, nothing to see here.
#
assert_equal %{examp<!--%22 unsafeattr=unsafevalue()>-->le.com}, attributes.first.value
end
end
end

def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML4::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

if Nokogiri.uses_libxml?
id_expected = ' id="foo"'
class_expected = ' class="bar baz"'
else
id_expected = 'id="foo"'
class_expected = 'class="bar baz"'
end

assert_equal(id_expected, id_attr.to_html)
assert_equal(class_expected, class_attr.to_html)
end
end
18 changes: 18 additions & 0 deletions test/html5/test_attributes.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

require "helper"

class TestHtml5Attributes < Nokogiri::TestCase
def test_serialize_attribute
html = <<~HTML
<div id='foo' class="bar baz"></div>
HTML
div = Nokogiri::HTML5::DocumentFragment.parse(html).at_css("div")
attrs = div.attribute_nodes
id_attr = attrs.find { |a| a.name == "id" }
class_attr = attrs.find { |a| a.name == "class" }

assert_equal('id="foo"', id_attr.to_html)
assert_equal('class="bar baz"', class_attr.to_html)
end
end if Nokogiri.uses_gumbo?

0 comments on commit 5ba752c

Please sign in to comment.