diff --git a/CHANGELOG.md b/CHANGELOG.md index f6da0f64f4..13f9f4f3f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,7 @@ This release ends support for: * Remove unnecessary array allocations from Node serialization methods [[#1911](https://github.com/sparklemotion/nokogiri/issues/1911)] (Thanks, [@ashmaroli](https://github.com/ashmaroli)!) * Avoid creation of unnecessary zero-length String objects. [[#1970](https://github.com/sparklemotion/nokogiri/issues/1970)] (Thanks, [@ashmaroli](https://github.com/ashmaroli)!) * Always compile libxml2 and libxslt with '-O2' [[#2022](https://github.com/sparklemotion/nokogiri/issues/2022), [#2100](https://github.com/sparklemotion/nokogiri/issues/2100)] (Thanks, [@ilyazub](https://github.com/ilyazub)!) +* {HTML,XML}::Document#parse now accept `Pathname` objects. Previously this worked only if the referenced file was less than 4096 bytes long; longer files resulted in undefined behavior because the `read` method would be repeatedly invoked. [[#1821](https://github.com/sparklemotion/nokogiri/issues/1821), [#2110](https://github.com/sparklemotion/nokogiri/issues/2110)] (Thanks, [@doriantaylor](https://github.com/doriantaylor) and [@phokz](https://github.com/phokz)!) * [JRuby] Lots of code cleanup and performance improvements. [[#1934](https://github.com/sparklemotion/nokogiri/issues/1934)] (Thanks, [@kares](https://github.com/kares)!) * [JRuby] Clean up deprecated calls into JRuby. [[#2027](https://github.com/sparklemotion/nokogiri/issues/2027)] (Thanks, [@headius](https://github.com/headius)!) diff --git a/lib/nokogiri/html/document.rb b/lib/nokogiri/html/document.rb index 6448bb23ea..95142aa43b 100644 --- a/lib/nokogiri/html/document.rb +++ b/lib/nokogiri/html/document.rb @@ -1,4 +1,7 @@ # frozen_string_literal: true + +require 'pathname' + module Nokogiri module HTML class Document < Nokogiri::XML::Document @@ -161,11 +164,12 @@ class << self # Nokogiri::XML::ParseOptions::RECOVER. See the constants in # Nokogiri::XML::ParseOptions. def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML - options = Nokogiri::XML::ParseOptions.new(options) if Integer === options - # Give the options to the user + yield options if block_given? + url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil + if string_or_io.respond_to?(:encoding) unless string_or_io.encoding.name == "ASCII-8BIT" encoding ||= string_or_io.encoding.name @@ -173,7 +177,12 @@ def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions:: end if string_or_io.respond_to?(:read) - url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil + if string_or_io.is_a?(Pathname) + # resolve the Pathname to the file and open it as an IO object, see #2110 + string_or_io = string_or_io.expand_path.open + url ||= string_or_io.path + end + unless encoding # Libxml2's parser has poor support for encoding # detection. First, it does not recognize the HTML5 diff --git a/lib/nokogiri/xml/document.rb b/lib/nokogiri/xml/document.rb index 1cc69570c3..5741085727 100644 --- a/lib/nokogiri/xml/document.rb +++ b/lib/nokogiri/xml/document.rb @@ -1,4 +1,7 @@ # frozen_string_literal: true + +require 'pathname' + module Nokogiri module XML ## @@ -44,9 +47,11 @@ class Document < Nokogiri::XML::Node # def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML options = Nokogiri::XML::ParseOptions.new(options) if Integer === options - # Give the options to the user + yield options if block_given? + url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil + if empty_doc?(string_or_io) if options.strict? raise Nokogiri::XML::SyntaxError.new("Empty document") @@ -56,12 +61,17 @@ def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions:: end doc = if string_or_io.respond_to?(:read) - url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil - read_io(string_or_io, url, encoding, options.to_i) - else - # read_memory pukes on empty docs - read_memory(string_or_io, url, encoding, options.to_i) - end + if string_or_io.is_a?(Pathname) + # resolve the Pathname to the file and open it as an IO object, see #2110 + string_or_io = string_or_io.expand_path.open + url ||= string_or_io.path + end + + read_io(string_or_io, url, encoding, options.to_i) + else + # read_memory pukes on empty docs + read_memory(string_or_io, url, encoding, options.to_i) + end # do xinclude processing doc.do_xinclude(options) if options.xinclude? diff --git a/test/html/test_document.rb b/test/html/test_document.rb index a127b1e4b3..26cd6e3980 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -91,8 +91,9 @@ def test_document_parse_method end def test_document_parse_method_with_url - doc = Nokogiri::HTML "", "http:/foobar.foobar/", "UTF-8" + doc = Nokogiri::HTML "", "http://foobar.example.com/", "UTF-8" refute_empty doc.to_s, "Document should not be empty" + assert_equal "http://foobar.example.com/", doc.url end ### @@ -627,6 +628,29 @@ def test_parse_can_take_io html = Nokogiri::HTML(f) } assert html.html? + assert_equal HTML_FILE, html.url + end + + def test_parse_works_with_an_object_that_responds_to_path + html = String.new("hello") + def html.path + "/i/should/be/the/document/url" + end + + doc = Nokogiri::HTML.parse(html) + + assert_equal "/i/should/be/the/document/url", doc.url + end + + # issue #1821, #2110 + def test_parse_can_take_pathnames + assert(File.size(HTML_FILE) > 4096) # file must be big enough to trip the read callback more than once + + doc = Nokogiri::HTML.parse(Pathname.new(HTML_FILE)) + + # an arbitrary assertion on the structure of the document + assert_equal 166, doc.css("a").length + assert_equal HTML_FILE, doc.url end def test_html? diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb index 46d91f0d3d..3bf417fb5e 100644 --- a/test/xml/test_document.rb +++ b/test/xml/test_document.rb @@ -674,7 +674,7 @@ def test_XML_function def test_url assert @xml.url - assert_equal XML_FILE, URI.unescape(@xml.url).sub("file:///", "") + assert_equal XML_FILE, @xml.url end def test_document_parent @@ -695,6 +695,7 @@ def test_parse_can_take_io xml = Nokogiri::XML(f) } assert xml.xml? + assert_equal XML_FILE, xml.url set = xml.search("//employee") assert set.length > 0 end @@ -719,6 +720,29 @@ def read(*args) assert_equal "foo", doc.at_css("div").content end + def test_parse_works_with_an_object_that_responds_to_path + xml = String.new("hello") + def xml.path + "/i/should/be/the/document/url" + end + + doc = Nokogiri::XML.parse(xml) + + assert_equal "/i/should/be/the/document/url", doc.url + end + + # issue #1821, #2110 + def test_parse_can_take_pathnames + assert(File.size(XML_ATOM_FILE) > 4096) # file must be big enough to trip the read callback more than once + + doc = Nokogiri::XML.parse(Pathname.new(XML_ATOM_FILE)) + + # an arbitrary assertion on the structure of the document + assert_equal 20, doc.xpath("/xmlns:feed/xmlns:entry/xmlns:author", + "xmlns" => "http://www.w3.org/2005/Atom").length + assert_equal XML_ATOM_FILE, doc.url + end + def test_search_on_empty_documents doc = Nokogiri::XML::Document.new ns = doc.search("//foo")