From d52698f2382db9e3e169e7d0c32aaae0a2fa27a3 Mon Sep 17 00:00:00 2001 From: Jake Hughey Date: Mon, 13 Jul 2020 20:52:58 -0500 Subject: [PATCH 1/3] added flatten argument to xml_find_all.xml_nodeset. --- DESCRIPTION | 4 +-- NEWS.md | 2 ++ R/xml_find.R | 48 +++++++++++++++++++++++++--------- man/xml_find_all.Rd | 28 +++++++++++++++----- tests/testthat/test-xml_find.R | 11 ++++++++ 5 files changed, 72 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b44131b1..dcd3bb43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xml2 Title: Parse XML -Version: 1.3.2.9000 +Version: 1.3.2.9001 Authors@R: c(person(given = "Hadley", family = "Wickham", @@ -40,7 +40,7 @@ VignetteBuilder: knitr Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 SystemRequirements: libxml2: libxml2-dev (deb), libxml2-devel (rpm) Collate: diff --git a/NEWS.md b/NEWS.md index 13cf97f4..bd953151 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # xml2 (development version) +* `xml_find_all.xml_nodeset()` gains a `flatten` argument to control whether to return a single nodeset or a list of nodesets (#311) + * `write_xml()` and `write_html()` now return NULL invisibly, as they did prior to version 1.3.0 (#307) # xml2 1.3.2 diff --git a/R/xml_find.R b/R/xml_find.R index efbf17cb..b1b48f94 100644 --- a/R/xml_find.R +++ b/R/xml_find.R @@ -12,9 +12,11 @@ #' @param xpath A string containing a xpath (1.0) expression. #' @inheritParams xml_name -#' @return `xml_find_all` always returns a nodeset: if there are no matches -#' the nodeset will be empty. The result will always be unique; repeated -#' nodes are automatically de-duplicated. +#' @param ... Further arguments passed to or from other methods. +#' @return `xml_find_all` returns a nodeset if applied to a node, and a nodeset +#' or a list of nodesets if applied to a nodeset. If there are no matches, +#' the nodeset(s) will be empty. Within each nodeset, the result will always +#' be unique; repeated nodes are automatically de-duplicated. #' #' `xml_find_first` returns a node if applied to a node, and a nodeset #' if applied to a nodeset. The output is *always* the same size as @@ -46,11 +48,16 @@ #' ") #' para <- xml_find_all(x, ".//p") #' -#' # If you apply xml_find_all to a nodeset, it finds all matches, -#' # de-duplicates them, and returns as a single list. This means you +#' # By default, if you apply xml_find_all to a nodeset, it finds all matches, +#' # de-duplicates them, and returns as a single nodeset. This means you #' # never know how many results you'll get #' xml_find_all(para, ".//b") #' +#' # If you set flatten to FALSE, though, xml_find_all will return a list of +#' # nodesets, where each nodeset contains the matches for the corresponding +#' # node in the original nodeset. +#' xml_find_all(para, ".//b", flatten = FALSE) +#' #' # xml_find_first only returns the first match per input node. If there are 0 #' # matches it will return a missing node #' xml_find_first(para, ".//b") @@ -67,33 +74,48 @@ #' ') #' xml_find_all(x, ".//f:doc") #' xml_find_all(x, ".//f:doc", xml_ns(x)) -xml_find_all <- function(x, xpath, ns = xml_ns(x)) { +xml_find_all <- function(x, xpath, ns = xml_ns(x), ...) { UseMethod("xml_find_all") } #' @export -xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x)) { +xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x), ...) { xml_nodeset() } #' @export -xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x)) { +xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x), ...) { nodes <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf) xml_nodeset(nodes) } +#' @param flatten A logical indicating whether to return a single, flattened +#' nodeset or a list of nodesets. #' @export -xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) { +#' @rdname xml_find_all +xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x), flatten = TRUE, ...) { if (length(x) == 0) return(xml_nodeset()) - nodes <- unlist(recursive = FALSE, - lapply(x, function(x) - .Call(xpath_search, x$node, x$doc, xpath, ns, Inf))) + res <- lapply(x, function(x) .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)) - xml_nodeset(nodes) + if (isTRUE(flatten)) { + return(xml_nodeset(unlist(recursive = FALSE, res))) + } + + res[] <- lapply(res, xml_nodeset) + res } +#' #' @export +#' xml_find_all_list <- function(x, xpath, ns = xml_ns(x)) { +#' if (length(x) == 0) +#' return(xml_nodeset()) +#' +#' lapply(x, function(x) +#' xml_nodeset(.Call(xpath_search, x$node, x$doc, xpath, ns, Inf))) +#' } + #' @export #' @rdname xml_find_all xml_find_first <- function(x, xpath, ns = xml_ns(x)) { diff --git a/man/xml_find_all.Rd b/man/xml_find_all.Rd index 8ede3306..8fc62997 100644 --- a/man/xml_find_all.Rd +++ b/man/xml_find_all.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/xml_find.R \name{xml_find_all} \alias{xml_find_all} +\alias{xml_find_all.xml_nodeset} \alias{xml_find_first} \alias{xml_find_num} \alias{xml_find_chr} @@ -9,7 +10,9 @@ \alias{xml_find_one} \title{Find nodes that match an xpath expression.} \usage{ -xml_find_all(x, xpath, ns = xml_ns(x)) +xml_find_all(x, xpath, ns = xml_ns(x), ...) + +\method{xml_find_all}{xml_nodeset}(x, xpath, ns = xml_ns(x), flatten = TRUE, ...) xml_find_first(x, xpath, ns = xml_ns(x)) @@ -31,11 +34,17 @@ in namespace \code{foo}, it will be called \code{foo:bar}. (And similarly for attributes). Default namespaces must be given an explicit name. The ns is ignored when using \code{\link[=xml_name<-]{xml_name<-()}} and \code{\link[=xml_set_name]{xml_set_name()}}.} + +\item{...}{Further arguments passed to or from other methods.} + +\item{flatten}{A logical indicating whether to return a single, flattened +nodeset or a list of nodesets.} } \value{ -\code{xml_find_all} always returns a nodeset: if there are no matches -the nodeset will be empty. The result will always be unique; repeated -nodes are automatically de-duplicated. +\code{xml_find_all} returns a nodeset if applied to a node, and a nodeset +or a list of nodesets if applied to a nodeset. If there are no matches, +the nodeset(s) will be empty. Within each nodeset, the result will always +be unique; repeated nodes are automatically de-duplicated. \code{xml_find_first} returns a node if applied to a node, and a nodeset if applied to a nodeset. The output is \emph{always} the same size as @@ -52,6 +61,8 @@ you're trying to extract nodes from arbitrary locations in a document. Use \code{xml_find_all} to find all matches - if there's no match you'll get an empty result. Use \code{xml_find_first} to find a specific match - if there's no match you'll get an \code{xml_missing} node. + + } \section{Deprecated functions}{ @@ -79,11 +90,16 @@ x <- read_xml(" ") para <- xml_find_all(x, ".//p") -# If you apply xml_find_all to a nodeset, it finds all matches, -# de-duplicates them, and returns as a single list. This means you +# By default, if you apply xml_find_all to a nodeset, it finds all matches, +# de-duplicates them, and returns as a single nodeset. This means you # never know how many results you'll get xml_find_all(para, ".//b") +# If you set flatten to FALSE, though, xml_find_all will return a list of +# nodesets, where each nodeset contains the matches for the corresponding +# node in the original nodeset. +xml_find_all(para, ".//b", flatten = FALSE) + # xml_find_first only returns the first match per input node. If there are 0 # matches it will return a missing node xml_find_first(para, ".//b") diff --git a/tests/testthat/test-xml_find.R b/tests/testthat/test-xml_find.R index c525072f..93181f0d 100644 --- a/tests/testthat/test-xml_find.R +++ b/tests/testthat/test-xml_find.R @@ -49,6 +49,17 @@ test_that("no matches returns empty nodeset", { expect_equal(length(xml_find_all(x, "//baz")), 0) }) +test_that("xml_find_all returns nodeset or list of nodesets based on flatten", { + x <- read_xml("

Some text.

+

Some other text.

+

No bold here!

") + y <- xml_find_all(x, './/p') + z <- xml_find_all(y, './/b', flatten = FALSE) + expect_s3_class(xml_find_all(y, './/b'), 'xml_nodeset') + expect_type(z, 'list') + expect_s3_class(z[[1L]], 'xml_nodeset') +}) + # Find num --------------------------------------------------------------------- test_that("xml_find_num errors with non numeric results", { x <- read_xml("") From 8b419328f3a69485798c2b7c308c3a89e6277849 Mon Sep 17 00:00:00 2001 From: Jake Hughey Date: Mon, 13 Jul 2020 20:57:25 -0500 Subject: [PATCH 2/3] removed commented-out code. --- R/xml_find.R | 9 --------- man/xml_find_all.Rd | 2 -- 2 files changed, 11 deletions(-) diff --git a/R/xml_find.R b/R/xml_find.R index b1b48f94..14113220 100644 --- a/R/xml_find.R +++ b/R/xml_find.R @@ -107,15 +107,6 @@ xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x), flatten = TRUE, . res } -#' #' @export -#' xml_find_all_list <- function(x, xpath, ns = xml_ns(x)) { -#' if (length(x) == 0) -#' return(xml_nodeset()) -#' -#' lapply(x, function(x) -#' xml_nodeset(.Call(xpath_search, x$node, x$doc, xpath, ns, Inf))) -#' } - #' @export #' @rdname xml_find_all xml_find_first <- function(x, xpath, ns = xml_ns(x)) { diff --git a/man/xml_find_all.Rd b/man/xml_find_all.Rd index 8fc62997..0fb232c5 100644 --- a/man/xml_find_all.Rd +++ b/man/xml_find_all.Rd @@ -61,8 +61,6 @@ you're trying to extract nodes from arbitrary locations in a document. Use \code{xml_find_all} to find all matches - if there's no match you'll get an empty result. Use \code{xml_find_first} to find a specific match - if there's no match you'll get an \code{xml_missing} node. - - } \section{Deprecated functions}{ From a45dc3cbe2b09219bb88312a391a82147b11542f Mon Sep 17 00:00:00 2001 From: Jake Hughey Date: Tue, 14 Jul 2020 07:49:13 -0500 Subject: [PATCH 3/3] Update NEWS.md Co-authored-by: Jim Hester --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index bd953151..37ac7995 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # xml2 (development version) -* `xml_find_all.xml_nodeset()` gains a `flatten` argument to control whether to return a single nodeset or a list of nodesets (#311) +* `xml_find_all.xml_nodeset()` gains a `flatten` argument to control whether to return a single nodeset or a list of nodesets (#311, @jakejh) * `write_xml()` and `write_html()` now return NULL invisibly, as they did prior to version 1.3.0 (#307)