Skip to content

Commit

Permalink
Fix up tests and implementation of Document.parse Pathname handling
Browse files Browse the repository at this point in the history
- no need to resolve Pathname in Nokogiri.parse
- added test coverage for XML::Document and HTML::Document#parse
- moved resolution of Pathname to after `url` is set in HTML::Document#parse
- open files with implict "r" because "b" assumes ASCII-8BIT encoding

Part of sparklemotion#2110, sparklemotion#1821
  • Loading branch information
flavorjones committed Nov 12, 2020
1 parent 86f02c0 commit 5add435
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 35 deletions.
4 changes: 0 additions & 4 deletions lib/nokogiri.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Modify the PATH on windows so that the external DLLs will get loaded.

require 'rbconfig'
require 'pathname'

if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
require 'nokogiri/jruby/dependencies'
Expand Down Expand Up @@ -55,9 +54,6 @@ class << self
###
# Parse an HTML or XML document. +string+ contains the document.
def parse string, url = nil, encoding = nil, options = nil
# whackamole; see respective (XML|HTML)::Document
string = string.expand_path.open('rb') if string.is_a? Pathname

if string.respond_to?(:read) ||
/^\s*<(?:!DOCTYPE\s+)?html[\s>]/i === string[0, 512]
# Expect an HTML indicator to appear within the first 512
Expand Down
11 changes: 6 additions & 5 deletions lib/nokogiri/html/document.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# frozen_string_literal: true

# for the constant; pathname is standard lib
require 'pathname'

module Nokogiri
Expand Down Expand Up @@ -170,10 +169,6 @@ def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::
# Give the options to the user
yield options if block_given?

# coerce to binary filehandle if this is a Pathname object
string_or_io = string_or_io.expand_path.open('rb') if
string_or_io.is_a? Pathname

if string_or_io.respond_to?(:encoding)
unless string_or_io.encoding.name == "ASCII-8BIT"
encoding ||= string_or_io.encoding.name
Expand All @@ -182,6 +177,12 @@ def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::

if string_or_io.respond_to?(:read)
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

if string_or_io.is_a?(Pathname)
# resolve the Pathname to the file and open it as an IO object, see #2110
string_or_io = string_or_io.expand_path.open
end

unless encoding
# Libxml2's parser has poor support for encoding
# detection. First, it does not recognize the HTML5
Expand Down
13 changes: 5 additions & 8 deletions lib/nokogiri/xml/document.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# frozen_string_literal: true

# for the constant; pathname is standard lib
require 'pathname'

module Nokogiri
Expand Down Expand Up @@ -60,15 +59,13 @@ def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::
end

doc = if string_or_io.respond_to?(:read)
# check url first cause this might get blown away
url ||= string_or_io.respond_to?(:path) ?
string_or_io.path : nil
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

# coerce to binary filehandle if this is a Pathname object
string_or_io = string_or_io.expand_path.open('rb') if
string_or_io.is_a? Pathname
if string_or_io.is_a?(Pathname)
# resolve the Pathname to the file and open it as an IO object, see #2110
string_or_io = string_or_io.expand_path.open
end

# aaaand go
read_io(string_or_io, url, encoding, options.to_i)
else
# read_memory pukes on empty docs
Expand Down
10 changes: 10 additions & 0 deletions test/html/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,16 @@ def test_parse_can_take_io
assert html.html?
end

# issue #1821, #2110
def test_parse_can_take_pathnames
assert(File.size(HTML_FILE) > 4096) # file must be big enough to trip the read callback more than once

doc = Nokogiri::HTML.parse(Pathname.new(HTML_FILE))

# an arbitrary assertion on the structure of the document
assert_equal 166, doc.css("a").length
end

def test_html?
assert !@html.xml?
assert @html.html?
Expand Down
18 changes: 0 additions & 18 deletions test/test_nokogiri.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,6 @@ def test_atom_is_xml?
assert !doc.html?
end

def test_atom_from_pathname
# atom file is big enough to trip the input callback more than once

path = Pathname(XML_ATOM_FILE) # pathname should be already required

# XXX this behaviour should probably change
assert Nokogiri.parse(path).html?

# we explicitly say xml because of Nokogiri.parse behaviour
doc = Nokogiri.XML(path)

assert doc.xml?
assert !doc.html?

# wqe already know the second half of this works
assert_equal doc.to_xml, Nokogiri.parse(File.read(XML_ATOM_FILE)).to_xml
end

def test_html?
doc = Nokogiri.parse(File.read(HTML_FILE))
assert !doc.xml?
Expand Down
11 changes: 11 additions & 0 deletions test/xml/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,17 @@ def read(*args)
assert_equal "foo", doc.at_css("div").content
end

# issue #1821, #2110
def test_parse_can_take_pathnames
assert(File.size(XML_ATOM_FILE) > 4096) # file must be big enough to trip the read callback more than once

doc = Nokogiri::XML.parse(Pathname.new(XML_ATOM_FILE))

# an arbitrary assertion on the structure of the document
assert_equal 20, doc.xpath("/xmlns:feed/xmlns:entry/xmlns:author",
"xmlns" => "http://www.w3.org/2005/Atom").length
end

def test_search_on_empty_documents
doc = Nokogiri::XML::Document.new
ns = doc.search("//foo")
Expand Down

0 comments on commit 5add435

Please sign in to comment.