Skip to content

Commit

Permalink
fix: multiple SAX parser issues
Browse files Browse the repository at this point in the history
On CRuby, this fixes the fact that the parser was registering errors
when encountering general (non-predefined) entities. Now these
entities are resolved properly and converted into `#characters`
callbacks. Fixes #1926.

On JRuby, the SAX parser now respects the `#replace_entities`
attribute, which was previously ignored AND defaulted incorrectly to
`true`. The default now matches CRuby -- `false` -- and the parser
behavior matches CRuby with respect to entities. Fixes #614.

This commit also includes some granular tests of how the sax parser
handles different entities under different circumstances, which should
be clarifying for user reports like #1284 and #1500 that expect
predefined entities and character references to be treated like parsed
entities (which they aren't).
  • Loading branch information
flavorjones committed Jul 1, 2024
1 parent d68aafb commit 3b03c8d
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 56 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
* [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway
* [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones
* [CRuby] libgumbo correctly prints nonstandard element names in error messages. [#3219] @stevecheckoway
* [CRuby] SAX parsing no longer registers errors when encountering external entity references. [#1926] @flavorjones
* [JRuby] Fixed some bugs in how `Node#attributes` handles attributes with namespaces. [#2677, #2679] @flavorjones
* [JRuby] Fix `Schema#validate` to only return the most recent Document's errors. Previously, if multiple documents were validated, this method returned the accumulated errors of all previous documents. [#1282] @flavorjones
* [JRuby] Fix `Schema#validate` to not clobber the `@errors` instance variable. [#1282] @flavorjones
* [JRuby] Empty documents fail schema validation as they should. [#783] @flavorjones
* [JRuby] SAX parsing now respects the `#replace_entities` attribute, which defaults to `false`. Previously this flag defaulted to `true` and was completely ignored. [#614] @flavorjones


### Changed
Expand Down
5 changes: 4 additions & 1 deletion ext/java/nokogiri/XmlSaxParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public class XmlSaxParserContext extends ParserContext

protected NokogiriHandler handler;
protected NokogiriErrorHandler errorHandler;
private boolean replaceEntities = true;
private boolean replaceEntities = false;
private boolean recovery = false;

public
Expand Down Expand Up @@ -222,6 +222,9 @@ public class XmlSaxParserContext extends ParserContext

/* TODO: how should we pass in parse options? */
ParserContext.Options options = defaultParseOptions(context);
if (replaceEntities) {
options.noEnt = true;
}

errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning);
handler = new NokogiriHandler(runtime, handlerRuby, errorHandler);
Expand Down
34 changes: 17 additions & 17 deletions ext/java/nokogiri/internals/ParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -182,23 +182,23 @@ public static class Options
protected static final long NOCDATA = 16384;
protected static final long NOXINCNODE = 32768;

public final boolean strict;
public final boolean recover;
public final boolean noEnt;
public final boolean dtdLoad;
public final boolean dtdAttr;
public final boolean dtdValid;
public final boolean noError;
public final boolean noWarning;
public final boolean pedantic;
public final boolean noBlanks;
public final boolean sax1;
public final boolean xInclude;
public final boolean noNet;
public final boolean noDict;
public final boolean nsClean;
public final boolean noCdata;
public final boolean noXIncNode;
public boolean strict;
public boolean recover;
public boolean noEnt;
public boolean dtdLoad;
public boolean dtdAttr;
public boolean dtdValid;
public boolean noError;
public boolean noWarning;
public boolean pedantic;
public boolean noBlanks;
public boolean sax1;
public boolean xInclude;
public boolean noNet;
public boolean noDict;
public boolean nsClean;
public boolean noCdata;
public boolean noXIncNode;

protected static boolean
test(long options, long mask)
Expand Down
2 changes: 2 additions & 0 deletions ext/nokogiri/html4_sax_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ noko_html4_sax_parser_start_document(void *ctx)
VALUE self = (VALUE)ctxt->_private;
VALUE doc = rb_iv_get(self, "@document");

xmlSAX2StartDocument(ctx);

rb_funcall(doc, id_start_document, 0);
}

Expand Down
14 changes: 14 additions & 0 deletions ext/nokogiri/xml_sax_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ noko_xml_sax_parser_start_document_callback(void *ctx)
VALUE self = (VALUE)ctxt->_private;
VALUE doc = rb_iv_get(self, "@document");

xmlSAX2StartDocument(ctx);

if (ctxt->standalone != -1) { /* -1 means there was no declaration */
VALUE encoding = Qnil ;
Expand Down Expand Up @@ -324,6 +325,19 @@ noko_xml_sax_parser__initialize_native(VALUE self)
handler->error = noko_xml_sax_parser_error_callback;
handler->cdataBlock = noko_xml_sax_parser_cdata_block_callback;
handler->processingInstruction = noko_xml_sax_parser_processing_instruction_callback;

/* use some of libxml2's default callbacks to managed DTDs and entities */
handler->getEntity = xmlSAX2GetEntity;
handler->internalSubset = xmlSAX2InternalSubset;
handler->externalSubset = xmlSAX2ExternalSubset;
handler->isStandalone = xmlSAX2IsStandalone;
handler->hasInternalSubset = xmlSAX2HasInternalSubset;
handler->hasExternalSubset = xmlSAX2HasExternalSubset;
handler->resolveEntity = xmlSAX2ResolveEntity;
handler->getParameterEntity = xmlSAX2GetParameterEntity;
handler->entityDecl = xmlSAX2EntityDecl;
handler->unparsedEntityDecl = xmlSAX2UnparsedEntityDecl;

handler->initialized = XML_SAX2_MAGIC;

return self;
Expand Down
16 changes: 12 additions & 4 deletions test/helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -402,10 +402,17 @@ def memwatch(method, n: nil, retry_once: true, &block)
module SAX
class TestCase < Nokogiri::TestCase
class Doc < XML::SAX::Document
attr_reader :start_elements, :start_document_called
attr_reader :end_elements, :end_document_called
attr_reader :data, :comments, :cdata_blocks, :start_elements_namespace
attr_reader :errors, :warnings, :end_elements_namespace
attr_reader :start_elements
attr_reader :start_document_called
attr_reader :end_elements
attr_reader :end_document_called
attr_reader :data
attr_reader :comments
attr_reader :cdata_blocks
attr_reader :start_elements_namespace
attr_reader :errors
attr_reader :warnings
attr_reader :end_elements_namespace
attr_reader :xmldecls
attr_reader :processing_instructions

Expand Down Expand Up @@ -480,6 +487,7 @@ def cdata_block(string)
def processing_instruction(name, content)
@processing_instructions ||= []
@processing_instructions << [name, content]
super
end
end

Expand Down
162 changes: 128 additions & 34 deletions test/xml/sax/test_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class TestCase
[
"root",
[],
nil, nil,
nil, (nil | ""), # TODO: jruby is returning an empty string, cruby is returning nil
[["foo", "http://foo.example.com/"]], # namespace declarations
], [
"a",
Expand Down Expand Up @@ -475,39 +475,6 @@ class TestCase
end
end

it "does not resolve entities by default" do
xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE doc [
<!ENTITY local SYSTEM "file:///#{File.expand_path(__FILE__)}">
<!ENTITY custom "resolved>
]>
<doc><foo>&local;</foo><foo>&custom;</foo></doc>
XML

doc = Doc.new
parser = Nokogiri::XML::SAX::Parser.new(doc)
parser.parse(xml)

assert_nil(doc.data)
end

it "does not resolve network external entities by default" do
xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE doc [
<!ENTITY remote SYSTEM "http://0.0.0.0:8080/evil.dtd">
]>
<doc><foo>&remote;</foo></doc>
XML

doc = Doc.new
parser = Nokogiri::XML::SAX::Parser.new(doc)
parser.parse(xml)

assert_nil(doc.data)
end

it "handles parser warnings" do
skip_unless_libxml2("this is testing error message formatting in the C extension")

Expand All @@ -527,6 +494,133 @@ class TestCase
assert_match(/URI .* is not absolute/, parser.document.warnings.first)
end
end

describe "entities" do
it "does not replace entities by default" do
parser_context = nil
parser.parse("<root></root>") do |ctx|
parser_context = ctx
end

refute(parser_context.replace_entities)
end

describe "character references" do
let(:xml) { <<~XML }
<?xml version="1.0" encoding="UTF-8"?>
<root><foo>&#146;</foo><foo>&#146;</foo></root>
XML

[true, false].each do |replace_entities|
it "always replace when replace_entities=#{replace_entities}" do
parser.parse(xml) { |pc| pc.replace_entities = replace_entities }

assert_equal(["\u0092", "\u0092"], parser.document.data)
end
end
end

describe "predefined entities" do
let(:xml) { <<~XML }
<?xml version="1.0" encoding="UTF-8"?>
<root><foo>&amp;</foo><foo>&amp;</foo></root>
XML

[true, false].each do |replace_entities|
it "always replace when replace_entities=#{replace_entities}" do
parser.parse(xml) { |pc| pc.replace_entities = replace_entities }

assert_equal(["&", "&"], parser.document.data)
end
end
end

describe "internal entities" do
let(:xml) { <<~XML }
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE root [ <!ENTITY bar "quux"> ]>
<root><foo>&bar;</foo><foo>&bar;</foo></root>
XML

[true, false].each do |replace_entities|
it "always replaces when replace_entities=#{replace_entities}" do
parser.parse(xml) { |pc| pc.replace_entities = replace_entities }

assert_equal(["quux", "quux"], parser.document.data)
end
end
end

describe "undeclared entities" do
let(:xml) { <<~XML }
<?xml version="1.0" encoding="UTF-8"?>
<root><foo>&bar;</foo><foo>&bar;</foo></root>
XML

[true, false].each do |replace_entities|
it "does not replace undeclared entities when replace_entities is #{replace_entities}" do
parser.parse(xml) do |pc|
pc.replace_entities = replace_entities
pc.recovery = true # because an undeclared entity is an error
end

assert_nil(parser.document.data)
end
end
end

describe "local external entities" do
it "does not resolve local external entities when replace_entities is false" do
Tempfile.create do |io|
io.write("local-contents")
io.close
xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE doc [
<!ENTITY local SYSTEM "file:///#{io.path}">
]>
<doc><foo>&local;</foo><foo>&local;</foo></doc>
XML
parser.parse(xml) { |pc| pc.replace_entities = false }
end

assert_nil(parser.document.data)
assert_empty(parser.document.errors)
end

it "resolves local external entities when replace_entities is true" do
Tempfile.create do |io|
io.write("local-contents")
io.close
xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE doc [
<!ENTITY local SYSTEM "#{io.path}">
]>
<doc><foo>&local;</foo><foo>&local;</foo></doc>
XML
parser.parse(xml) { |pc| pc.replace_entities = true }
end

assert_equal(["local-contents", "local-contents"], parser.document.data)
assert_equal(0, parser.document.errors.length)
end
end

it "does not resolve network external entities when replace_entities is false" do
xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE doc [
<!ENTITY remote SYSTEM "http://0.0.0.0:8080/evil.dtd">
]>
<doc><foo>&remote;</foo><foo>&remote;</foo></doc>
XML
parser.parse(xml) { |pc| pc.replace_entities = false }

assert_nil(parser.document.data)
assert_empty(parser.document.errors)
end
end
end
end
end
Expand Down

0 comments on commit 3b03c8d

Please sign in to comment.