Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement XML::Reader.new with keyword args and forward Reader() to it #3326

Merged
merged 3 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions ext/nokogiri/xml_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -608,11 +608,13 @@ outer_xml(VALUE self)
* call-seq:
* from_memory(string, url = nil, encoding = nil, options = 0)
*
* Create a new reader that parses +string+
* Create a new Reader to parse a String.
*/
static VALUE
from_memory(int argc, VALUE *argv, VALUE klass)
{
/* TODO: deprecate this method, since Reader.new can handle both memory and IO. It can then
* become private. */
VALUE rb_buffer, rb_url, encoding, rb_options;
xmlTextReaderPtr reader;
const char *c_url = NULL;
Expand Down Expand Up @@ -653,11 +655,13 @@ from_memory(int argc, VALUE *argv, VALUE klass)
* call-seq:
* from_io(io, url = nil, encoding = nil, options = 0)
*
* Create a new reader that parses +io+
* Create a new Reader to parse an IO stream.
*/
static VALUE
from_io(int argc, VALUE *argv, VALUE klass)
{
/* TODO: deprecate this method, since Reader.new can handle both memory and IO. It can then
* become private. */
VALUE rb_io, rb_url, encoding, rb_options;
xmlTextReaderPtr reader;
const char *c_url = NULL;
Expand Down Expand Up @@ -739,11 +743,6 @@ rb_xml_reader_encoding(VALUE rb_reader)
void
noko_init_xml_reader(void)
{
/*
* The Reader parser allows you to effectively pull parse an XML document.
* Once instantiated, call Nokogiri::XML::Reader#each to iterate over each
* node. Note that you may only iterate over the document once!
*/
cNokogiriXmlReader = rb_define_class_under(mNokogiriXml, "Reader", rb_cObject);

rb_undef_alloc_func(cNokogiriXmlReader);
Expand Down
16 changes: 4 additions & 12 deletions lib/nokogiri/xml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,11 @@ module XML
XML_C14N_EXCLUSIVE_1_0 = 1
# C14N 1.1 spec canonicalization
XML_C14N_1_1 = 2
class << self
###
# Parse an XML document using the Nokogiri::XML::Reader API. See
# Nokogiri::XML::Reader for more information
def Reader(string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT)
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
yield options if block_given?

if string_or_io.respond_to?(:read)
return Reader.from_io(string_or_io, url, encoding, options.to_i)
end

Reader.from_memory(string_or_io, url, encoding, options.to_i)
class << self
# Convenience method for Nokogiri::XML::Reader.new
def Reader(...)
  # Pure delegation: every positional argument, keyword argument, and block is
  # handed unchanged to the Reader class constructor.
  Nokogiri::XML::Reader.new(...)
end

###
Expand Down
59 changes: 46 additions & 13 deletions lib/nokogiri/xml/reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,33 @@
module Nokogiri
module XML
###
# Nokogiri::XML::Reader parses an XML document similar to the way a cursor would move. The
# Reader is given an XML document, and yields nodes to an each block.
# The Reader parser allows you to effectively pull parse an \XML document. Once instantiated,
# call Nokogiri::XML::Reader#each to iterate over each node.
#
# The Reader parser might be good for when you need the speed and low memory usage of the SAX
# parser, but do not want to write a Document handler.
# Nokogiri::XML::Reader parses an \XML document similar to the way a cursor would move. The
# Reader is given an \XML document, and yields nodes to an each block.
#
# The Reader parser might be good for when you need the speed and low memory usage of a \SAX
# parser, but do not want to write a SAX::Document handler.
#
# Here is an example of usage:
#
# reader = Nokogiri::XML::Reader(<<-eoxml)
# reader = Nokogiri::XML::Reader.new <<~XML
# <x xmlns:tenderlove='http://tenderlovemaking.com/'>
# <tenderlove:foo awesome='true'>snuggles!</tenderlove:foo>
# </x>
# eoxml
# XML
#
# reader.each do |node|
#
# # node is an instance of Nokogiri::XML::Reader
# puts node.name
#
# end
#
# ⚠ Nokogiri::XML::Reader#each can only be called once! Once the cursor moves through the entire
# document, you must parse the document again. It may be better to capture all information you
# need during a single iteration.
#
# ⚠ libxml2 does not support error recovery in the Reader parser. The `RECOVER` ParseOption is
# ⚠ libxml2 does not support error recovery in the Reader parser. The +RECOVER+ ParseOption is
# ignored. If a syntax error is encountered during parsing, an exception will be raised.
class Reader
include Enumerable
Expand Down Expand Up @@ -66,23 +67,55 @@ class Reader
TYPE_END_ELEMENT = 15
# Entity end node type
TYPE_END_ENTITY = 16
# XML Declaration node type
# \XML Declaration node type
TYPE_XML_DECLARATION = 17

# A list of errors encountered while parsing
attr_accessor :errors

# The XML source
# The \XML source
attr_reader :source

alias_method :self_closing?, :empty_element?

def initialize(source, url = nil, encoding = nil) # :nodoc:
# :call-seq:
# Reader.new(input) { |options| ... } → Reader
# Reader.new(input, url:, encoding:, options:) { |options| ... } → Reader
#
# Create a new Reader to parse an \XML document.
#
# [Required Parameters]
# - +input+ (String | IO): The \XML document to parse.
#
# [Optional Parameters]
# - +url:+ (String) The base URL of the document.
# - +encoding:+ (String) The name of the encoding of the document.
# - +options:+ (Integer | ParseOptions) Options to control the parser behavior.
# Defaults to +ParseOptions::STRICT+.
#
# [Yields]
# If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify before
# the document is parsed. See Nokogiri::XML::ParseOptions for more information.
def self.new(
  string_or_io,
  url_ = nil, encoding_ = nil, options_ = ParseOptions::STRICT,
  url: url_, encoding: encoding_, options: options_
)
  # The legacy positional arguments only act as defaults for the keyword
  # arguments, so an explicitly passed keyword always wins.

  # A bare Integer bitmask is wrapped so the block always receives a
  # ParseOptions object it can modify.
  case options
  when Integer
    options = Nokogiri::XML::ParseOptions.new(options)
  end
  yield(options) if block_given?

  # Anything that responds to #read is handled as an IO stream; every other
  # input is parsed from memory.
  if string_or_io.respond_to?(:read)
    Reader.from_io(string_or_io, url, encoding, options.to_i)
  else
    Reader.from_memory(string_or_io, url, encoding, options.to_i)
  end
end

private def initialize(source, url = nil, encoding = nil) # :nodoc:
  # Records the parse input and requested encoding, and starts with an empty
  # error list. The url argument is accepted but not stored here.
  @source, @encoding = source, encoding
  @errors = []
end
private :initialize

# Get the attributes and namespaces of the current node as a Hash.
#
Expand Down
12 changes: 6 additions & 6 deletions test/xml/test_reader_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_override_internal_encoding_when_specified
# and 2.12, so the testing here is superficial -- asserting on the reported encoding, but
# not asserting on the bytes in the document or the serialized nodes.
#
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
reader = Nokogiri::XML::Reader(<<~XML, encoding: "UTF-8")
<?xml version="1.0" encoding="ISO-8859-1"?>
<foo>asdf</foo>
XML
Expand Down Expand Up @@ -114,7 +114,7 @@ def test_attribute_encoding_issue_2891_correct_encoding_specified
end

# https://github.com/sparklemotion/nokogiri/issues/2891
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
reader = Nokogiri::XML::Reader(<<~XML, encoding: "UTF-8")
<?xml version="1.0"?>
<anotación tipo="inspiración">INSPIRACIÓN</anotación>
XML
Expand All @@ -135,7 +135,7 @@ def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
<?xml version="1.0"?>
<test>\u{82B1}\u{82F1}</test>
XML
reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")
reader = Nokogiri::XML::Reader(xml, encoding: "Shift_JIS")

assert_equal("Shift_JIS", reader.encoding)

Expand Down Expand Up @@ -216,7 +216,7 @@ def test_prefix
<edi:foo>hello</edi:foo>
</x>
eoxml
reader = Nokogiri::XML::Reader(xml, nil, "UTF-8")
reader = Nokogiri::XML::Reader(xml, encoding: "UTF-8")
reader.each do |node|
next unless (prefix = node.prefix)

Expand All @@ -230,7 +230,7 @@ def test_ns_uri
<edi:foo>hello</edi:foo>
</x>
eoxml
reader = Nokogiri::XML::Reader(xml, nil, "UTF-8")
reader = Nokogiri::XML::Reader(xml, encoding: "UTF-8")
reader.each do |node|
next unless (uri = node.namespace_uri)

Expand All @@ -244,7 +244,7 @@ def test_local_name
<edi:foo>hello</edi:foo>
</x>
eoxml
reader = Nokogiri::XML::Reader(xml, nil, "UTF-8")
reader = Nokogiri::XML::Reader(xml, encoding: "UTF-8")
reader.each do |node|
next unless (lname = node.local_name)

Expand Down
Loading