From 8f19086d543f9af83294096dcc3afdc04b4935e3 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Aug 2023 07:44:41 +0000 Subject: [PATCH 01/24] Implement `xml_name.xml_nodeset()` in C --- R/utils.R | 11 +++++++++++ R/xml_name.R | 2 +- src/init.c | 2 ++ src/xml2_node.cpp | 30 ++++++++++++++++++++++++++++++ tests/testthat/test-xml_name.R | 12 +++++++++++- 5 files changed, 55 insertions(+), 2 deletions(-) diff --git a/R/utils.R b/R/utils.R index e5d26aeb..71e5f890 100644 --- a/R/utils.R +++ b/R/utils.R @@ -80,3 +80,14 @@ xml2_example <- function(path = NULL) { system.file("extdata", path, package = "xml2", mustWork = TRUE) } } + +sample_nodeset <- function() { + x <- read_xml(" +

Some text.

+

Some other.

+

No bold text

+ ") + + children <- xml_children(x) + xml_find_first(children, ".//b|.//i") +} diff --git a/R/xml_name.R b/R/xml_name.R index a849be90..e33ffac6 100644 --- a/R/xml_name.R +++ b/R/xml_name.R @@ -28,7 +28,7 @@ xml_name.xml_missing <- function(x, ns = character()) { #' @export xml_name.xml_nodeset <- function(x, ns = character()) { - vapply(x, xml_name, ns = ns, FUN.VALUE = character(1)) + .Call(nodeset_name, x, ns) } #' @export diff --git a/src/init.c b/src/init.c index 9fcecfc0..df723e66 100644 --- a/src/init.c +++ b/src/init.c @@ -38,6 +38,7 @@ extern SEXP node_copy(SEXP); extern SEXP node_has_children(SEXP, SEXP); extern SEXP node_length(SEXP, SEXP); extern SEXP node_name(SEXP, SEXP); +extern SEXP nodeset_name(SEXP, SEXP); extern SEXP node_new(SEXP); extern SEXP node_new_dtd(SEXP, SEXP, SEXP, SEXP); extern SEXP node_new_ns(SEXP, SEXP); @@ -103,6 +104,7 @@ static const R_CallMethodDef CallEntries[] = { {"node_has_children", (DL_FUNC) &node_has_children, 2}, {"node_length", (DL_FUNC) &node_length, 2}, {"node_name", (DL_FUNC) &node_name, 2}, + {"nodeset_name", (DL_FUNC) &nodeset_name, 2}, {"node_new", (DL_FUNC) &node_new, 1}, {"node_new_dtd", (DL_FUNC) &node_new_dtd, 4}, {"node_new_ns", (DL_FUNC) &node_new_ns, 2}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 2f9d50a8..ef6b8c93 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -37,6 +37,36 @@ extern "C" SEXP node_name(SEXP node_sxp, SEXP nsMap) { END_CPP } +// [[export]] +extern "C" SEXP nodeset_name(SEXP node_sxp, SEXP nsMap) { + BEGIN_CPP + + int n = Rf_xlength(node_sxp); + + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); + + if (Rf_inherits(node_sxp_i, "xml_node")) { + SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); + XPtrNode node_i(node_field_i); + std::string name_i = nodeName(node_i.checked_get(), nsMap); + SET_STRING_ELT(out, i, Rf_mkCharLenCE(name_i.c_str(), name_i.size(), CE_UTF8)); + } else if (Rf_inherits(node_sxp_i, "xml_missing")) { + SET_STRING_ELT(out, i, NA_STRING); + } else { + // xml_nodeset can't appear + Rf_error("Unexpected node type"); + } + } + + UNPROTECT(1); + return out; + + END_CPP +} + // [[export]] extern "C" SEXP node_set_name(SEXP node_sxp, SEXP value) { BEGIN_CPP diff --git a/tests/testthat/test-xml_name.R b/tests/testthat/test-xml_name.R index 94b3d284..d298fc43 100644 --- a/tests/testthat/test-xml_name.R +++ b/tests/testthat/test-xml_name.R @@ -1,3 +1,13 @@ +test_that("xml_name() returns the name", { + x <- sample_nodeset() + + expect_equal(xml_name(x[[1]]), "b") + expect_equal(xml_name(x[[2]]), "i") + expect_equal(xml_name(x[[3]]), NA_character_) + + expect_equal(xml_name(x), c("b", "i", NA_character_)) +}) + test_that("qualified names returned when ns given", { x <- read_xml(test_path("ns-multiple-default.xml")) ns <- xml_ns(x) @@ -12,7 +22,7 @@ test_that("error if missing ns spec", { ns <- xml_ns(x)[1] bars <- xml_children(xml_children(x)) - expect_error(xml_name(bars, ns), "Couldn't find prefix") + expect_snapshot_error(xml_name(bars, ns)) }) test_that("xml_name<- modifies the name", { From 073a0a5b3797b447d81f33ec2d08c93fcc93aec9 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Aug 2023 08:59:17 +0000 Subject: [PATCH 02/24] Implement `xml_text.xml_nodeset()` in C --- R/xml_text.R | 15 ++++++++++++--- src/init.c | 2 ++ src/xml2_node.cpp | 30 ++++++++++++++++++++++++++++++ tests/testthat/test-xml_text.R | 22 ++++++++-------------- 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/R/xml_text.R b/R/xml_text.R index 0aad2974..3d941204 100644 --- a/R/xml_text.R +++ b/R/xml_text.R @@ -34,15 +34,24 @@ xml_text.xml_missing <- function(x, trim = FALSE) { xml_text.xml_node <- function(x, trim = FALSE) { res <- .Call(node_text, x$node) if (isTRUE(trim)) { - res <- sub("^[[:space:]\u00a0]+", "", res) - res <- sub("[[:space:]\u00a0]+$", "", res) + res <- trim_text(res) } res } #' @export xml_text.xml_nodeset <- function(x, trim = FALSE) { - vapply(x, xml_text, trim = trim, FUN.VALUE = character(1)) + res <- .Call(nodeset_text, x) + + if (isTRUE(trim)) { + res <- trim_text(res) + } + res +} + +trim_text <- function(x) { + x <- sub("^[[:space:]\u00a0]+", "", x) + sub("[[:space:]\u00a0]+$", "", x) } #' @rdname xml_text diff --git a/src/init.c b/src/init.c index df723e66..2d646c92 100644 --- a/src/init.c +++ b/src/init.c @@ -58,6 +58,7 @@ extern SEXP node_set_namespace_prefix(SEXP, SEXP, SEXP); extern SEXP node_set_namespace_uri(SEXP, SEXP, SEXP); extern SEXP node_siblings(SEXP, SEXP); extern SEXP node_text(SEXP); +extern SEXP nodeset_text(SEXP); extern SEXP node_type(SEXP); extern SEXP node_write_character(SEXP, SEXP, SEXP); extern SEXP node_write_connection(SEXP, SEXP, SEXP, SEXP); @@ -124,6 +125,7 @@ static const R_CallMethodDef CallEntries[] = { {"node_set_namespace_uri", (DL_FUNC) &node_set_namespace_uri, 3}, {"node_siblings", (DL_FUNC) &node_siblings, 2}, {"node_text", (DL_FUNC) &node_text, 1}, + {"nodeset_text", (DL_FUNC) &nodeset_text, 1}, {"node_type", (DL_FUNC) &node_type, 1}, {"node_write_character", (DL_FUNC) &node_write_character, 3}, {"node_write_connection", (DL_FUNC) &node_write_connection, 4}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index ef6b8c93..5b349ae2 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -87,6 +87,36 @@ extern "C" SEXP node_text(SEXP node_sxp) { END_CPP } +// [[export]] +extern "C" SEXP nodeset_text(SEXP node_sxp) { + BEGIN_CPP + + int n = Rf_xlength(node_sxp); + + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); + + if (Rf_inherits(node_sxp_i, "xml_node")) { + SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); + XPtrNode node_i(node_field_i); + SEXP text_i = Xml2String(xmlNodeGetContent(node_i.checked_get())).asRString(); + SET_STRING_ELT(out, i, text_i); + } else if (Rf_inherits(node_sxp_i, "xml_missing")) { + SET_STRING_ELT(out, i, NA_STRING); + } else { + // xml_nodeset can't appear + Rf_error("Unexpected node type"); + } + } + + UNPROTECT(1); + return out; + + END_CPP +} + bool hasPrefix(std::string lhs, std::string rhs) { if (lhs.length() > rhs.length()) { return false; diff --git a/tests/testthat/test-xml_text.R b/tests/testthat/test-xml_text.R index 56e8986b..894794f8 100644 --- a/tests/testthat/test-xml_text.R +++ b/tests/testthat/test-xml_text.R @@ -6,22 +6,10 @@ test_that("xml_text returns only text without markup", { expect_identical(xml_text(xml_children(x)), "bold!") }) -test_that("xml_text returns only text without markup", { - x <- read_xml("

This is some text. This is bold!

") - - expect_identical(xml_text(x), "This is some text. This is bold!") - - expect_identical(xml_text(xml_children(x)), "bold!") -}) - test_that("xml_text works properly with xml_nodeset objects", { - x <- read_xml("This is some text. This is some nested text.") - - expect_identical(xml_text(x), "This is some text. This is some nested text.") - expect_identical( - xml_text(xml_find_all(x, "//x")), - c("This is some text. This is some nested text.", "This is some nested text.") + xml_text(sample_nodeset()), + c("text", "other", NA) ) }) @@ -47,6 +35,12 @@ test_that("xml_text trims whitespace if requested, including non-breaking spaces xml_text(x, trim = TRUE), "Some text \u20ac" ) + + x2 <- read_html("

Some text €  

and more € text  ") + expect_identical( + xml_text(xml_find_all(x2, ".//p"), trim = TRUE), + c("Some text \u20ac", "and more \u20ac text") + ) }) test_that("xml_integer() returns an integer vector", { From 3685c8e0c208c6a9cc952f17455f7ef4a5cbe2b5 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Aug 2023 08:59:44 +0000 Subject: [PATCH 03/24] Use `xml_text.xml_nodeset()` in `xml_double/integer()` --- R/xml_text.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/xml_text.R b/R/xml_text.R index 3d941204..9637835d 100644 --- a/R/xml_text.R +++ b/R/xml_text.R @@ -118,7 +118,7 @@ xml_double.xml_node <- function(x) { #' @export xml_double.xml_nodeset <- function(x) { - vapply(x, xml_double, numeric(1)) + as.numeric(xml_text(x)) } #' @export @@ -144,5 +144,5 @@ xml_integer.xml_node <- function(x) { #' @export xml_integer.xml_nodeset <- function(x) { - vapply(x, xml_integer, integer(1)) + as.integer(xml_text(x)) } From 5784f6708e421ae2e53f043cf95e730e2f7108fe Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Aug 2023 09:33:45 +0000 Subject: [PATCH 04/24] Implement `xml_type.xml_nodeset()` in C --- R/xml_type.R | 2 +- src/init.c | 2 ++ src/xml2_node.cpp | 30 ++++++++++++++++++++++++++++++ tests/testthat/_snaps/xml_name.md | 4 ++++ tests/testthat/test-xml_missing.R | 1 - tests/testthat/test-xml_nodeset.R | 1 - tests/testthat/test-xml_type.R | 11 +++++++++++ 7 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/_snaps/xml_name.md create mode 100644 tests/testthat/test-xml_type.R diff --git a/R/xml_type.R b/R/xml_type.R index f957d759..b37d2ea3 100644 --- a/R/xml_type.R +++ b/R/xml_type.R @@ -22,7 +22,7 @@ xml_type.xml_node <- function(x) { #' @export xml_type.xml_nodeset <- function(x) { - types <- vapply(x, function(x) .Call(node_type, x$node), integer(1)) + types <- .Call(nodeset_type, x) xmlElementType[types] } diff --git a/src/init.c b/src/init.c index 2d646c92..5c2364ae 100644 --- a/src/init.c +++ b/src/init.c @@ -60,6 +60,7 @@ extern SEXP node_siblings(SEXP, SEXP); extern SEXP node_text(SEXP); extern SEXP nodeset_text(SEXP); extern SEXP node_type(SEXP); +extern SEXP nodeset_type(SEXP); extern SEXP node_write_character(SEXP, SEXP, SEXP); extern SEXP node_write_connection(SEXP, SEXP, SEXP, SEXP); extern SEXP node_write_file(SEXP, SEXP, SEXP, SEXP); @@ -127,6 +128,7 @@ static const R_CallMethodDef CallEntries[] = { {"node_text", (DL_FUNC) &node_text, 1}, {"nodeset_text", (DL_FUNC) &nodeset_text, 1}, {"node_type", (DL_FUNC) &node_type, 1}, + {"nodeset_type", (DL_FUNC) &nodeset_type, 1}, {"node_write_character", (DL_FUNC) &node_write_character, 3}, {"node_write_connection", (DL_FUNC) &node_write_connection, 4}, {"node_write_file", (DL_FUNC) &node_write_file, 4}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 5b349ae2..d10c1a16 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -644,6 +644,36 @@ extern "C" SEXP node_type(SEXP node_sxp) { END_CPP } +// [[export]] +extern "C" SEXP nodeset_type(SEXP node_sxp) { + BEGIN_CPP + + int n = Rf_xlength(node_sxp); + + SEXP out = PROTECT(Rf_allocVector(INTSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); + + if (Rf_inherits(node_sxp_i, "xml_node")) { + SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); + XPtrNode node_i(node_field_i); + INTEGER(out)[i] = node_i->type; + } else if (Rf_inherits(node_sxp_i, "xml_missing")) { + INTEGER(out)[i] = NA_INTEGER; + } else { + // xml_nodeset can't appear + Rf_error("Unexpected node type"); + } + } + + UNPROTECT(1); + return out; + + END_CPP +} + + // [[export]] extern "C" SEXP node_copy(SEXP node_sxp) { BEGIN_CPP diff --git a/tests/testthat/_snaps/xml_name.md b/tests/testthat/_snaps/xml_name.md new file mode 100644 index 00000000..38ee479d --- /dev/null +++ b/tests/testthat/_snaps/xml_name.md @@ -0,0 +1,4 @@ +# error if missing ns spec + + Couldn't find prefix for url http://bar.com + diff --git a/tests/testthat/test-xml_missing.R b/tests/testthat/test-xml_missing.R index 66124d72..fdc24f04 100644 --- a/tests/testthat/test-xml_missing.R +++ b/tests/testthat/test-xml_missing.R @@ -33,7 +33,6 @@ test_that("xml_missing methods return properly for all S3 methods", { expect_equal(xml_parent(mss), xml_missing()) expect_equal(xml_path(mss), NA_character_) expect_equal(xml_text(mss), NA_character_) - expect_equal(xml_type(mss), NA_character_) expect_equal(xml_url(mss), NA_character_) }) diff --git a/tests/testthat/test-xml_nodeset.R b/tests/testthat/test-xml_nodeset.R index 81382fbc..3a92c0b6 100644 --- a/tests/testthat/test-xml_nodeset.R +++ b/tests/testthat/test-xml_nodeset.R @@ -67,7 +67,6 @@ test_that("methods work on empty nodesets", { expect_output(xml_structure(empty), NA) expect_identical(xml_text(empty), character()) - expect_identical(xml_type(empty), character()) expect_identical(xml_url(empty), character()) }) diff --git a/tests/testthat/test-xml_type.R b/tests/testthat/test-xml_type.R new file mode 100644 index 00000000..1972ef4e --- /dev/null +++ b/tests/testthat/test-xml_type.R @@ -0,0 +1,11 @@ +test_that("multiplication works", { + x <- sample_nodeset() + + expect_equal(xml_type(x[[1]]), "element") + expect_equal(xml_type(x[[3]]), NA_character_) + + expect_equal(xml_type(x), c("element", "element", NA)) + + empty <- xml_children(x) + expect_identical(xml_type(empty), character()) +}) From 34a5f12abde331498048e1990e5410606e9fa865 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 08:42:55 +0000 Subject: [PATCH 05/24] Remove S3 dispatch in `xml_name()` --- NAMESPACE | 3 --- R/xml_name.R | 17 +------------ src/init.c | 2 -- src/xml2_node.cpp | 65 ++++++++++++++++++++++++++++------------------- src/xml2_utils.h | 18 +++++++++++++ 5 files changed, 58 insertions(+), 47 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ac692aa7..e2a3a443 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -101,9 +101,6 @@ S3method(xml_integer,xml_nodeset) S3method(xml_length,xml_missing) S3method(xml_length,xml_node) S3method(xml_length,xml_nodeset) -S3method(xml_name,xml_missing) -S3method(xml_name,xml_node) -S3method(xml_name,xml_nodeset) S3method(xml_ns,xml_document) S3method(xml_ns,xml_missing) S3method(xml_ns,xml_node) diff --git a/R/xml_name.R b/R/xml_name.R index e33ffac6..812a6909 100644 --- a/R/xml_name.R +++ b/R/xml_name.R @@ -18,22 +18,7 @@ #' z <- xml_children(y) #' xml_name(xml_children(y)) xml_name <- function(x, ns = character()) { - UseMethod("xml_name") -} - -#' @export -xml_name.xml_missing <- function(x, ns = character()) { - NA_character_ -} - -#' @export -xml_name.xml_nodeset <- function(x, ns = character()) { - .Call(nodeset_name, x, ns) -} - -#' @export -xml_name.xml_node <- function(x, ns = character()) { - .Call(node_name, x$node, ns) + .Call(node_name, x, ns) } #' Modify the (tag) name of an element diff --git a/src/init.c b/src/init.c index 5c2364ae..5ddb1cfc 100644 --- a/src/init.c +++ b/src/init.c @@ -38,7 +38,6 @@ extern SEXP node_copy(SEXP); extern SEXP node_has_children(SEXP, SEXP); extern SEXP node_length(SEXP, SEXP); extern SEXP node_name(SEXP, SEXP); -extern SEXP nodeset_name(SEXP, SEXP); extern SEXP node_new(SEXP); extern SEXP node_new_dtd(SEXP, SEXP, SEXP, SEXP); extern SEXP node_new_ns(SEXP, SEXP); @@ -106,7 +105,6 @@ static const R_CallMethodDef CallEntries[] = { {"node_has_children", (DL_FUNC) &node_has_children, 2}, {"node_length", (DL_FUNC) &node_length, 2}, {"node_name", (DL_FUNC) &node_name, 2}, - {"nodeset_name", (DL_FUNC) &nodeset_name, 2}, {"node_new", (DL_FUNC) &node_new, 1}, {"node_new_dtd", (DL_FUNC) &node_new_dtd, 4}, {"node_new_ns", (DL_FUNC) &node_new_ns, 2}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index d10c1a16..ce0494f5 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -27,42 +27,55 @@ std::string nodeName(T* node, SEXP nsMap) { return prefix + ":" + name; } -// [[export]] -extern "C" SEXP node_name(SEXP node_sxp, SEXP nsMap) { - BEGIN_CPP - XPtrNode node(node_sxp); +SEXP node_name_impl(SEXP x, SEXP nsMap) { + NodeType type = getNodeType(x); - std::string name = nodeName(node.checked_get(), nsMap); - return Rf_ScalarString(Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8)); - END_CPP + SEXP out; + + switch(type) { + case NodeType::missing: + out = NA_STRING; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + + std::string name = nodeName(node.checked_get(), nsMap); + out = Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8); + break; + } + default: Rf_error("Unexpected node type"); + } + + return(out); } // [[export]] -extern "C" SEXP nodeset_name(SEXP node_sxp, SEXP nsMap) { +extern "C" SEXP node_name(SEXP x, SEXP nsMap) { BEGIN_CPP + NodeType type = getNodeType(x); - int n = Rf_xlength(node_sxp); - - SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(Rf_ScalarString(node_name_impl(x, nsMap))); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); - for (int i = 0; i < n; ++i) { - SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); - if (Rf_inherits(node_sxp_i, "xml_node")) { - SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); - XPtrNode node_i(node_field_i); - std::string name_i = nodeName(node_i.checked_get(), nsMap); - SET_STRING_ELT(out, i, Rf_mkCharLenCE(name_i.c_str(), name_i.size(), CE_UTF8)); - } else if (Rf_inherits(node_sxp_i, "xml_missing")) { - SET_STRING_ELT(out, i, NA_STRING); - } else { - // xml_nodeset can't appear - Rf_error("Unexpected node type"); + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP name_i = node_name_impl(x_i, nsMap); + SET_STRING_ELT(out, i, name_i); } - } - UNPROTECT(1); - return out; + UNPROTECT(1); + return(out); + }; + } END_CPP } diff --git a/src/xml2_utils.h b/src/xml2_utils.h index 231b7e38..d77b3cf7 100644 --- a/src/xml2_utils.h +++ b/src/xml2_utils.h @@ -9,6 +9,24 @@ #include #include +enum NodeType { + missing = 1, + node = 2, + nodeset = 3, +}; + +inline const NodeType getNodeType(SEXP x) { + if (Rf_inherits(x, "xml_node")) { + return(NodeType::node); + } else if (Rf_inherits(x, "xml_nodeset")) { + return(NodeType::nodeset); + } else if (Rf_inherits(x, "xml_missing")) { + return(NodeType::missing); + } else { + Rf_error("Unexpected node type"); + } +} + inline const xmlChar* asXmlChar(std::string const& x) { return (const xmlChar*) x.c_str(); } From 653e56e8974e5dbfc4c2c2d019f603356ee477ac Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 08:49:21 +0000 Subject: [PATCH 06/24] Remove S3 dispatch in `xml_text()` --- NAMESPACE | 3 --- R/xml_text.R | 22 +---------------- src/init.c | 2 -- src/xml2_node.cpp | 63 ++++++++++++++++++++++++++++------------------- 4 files changed, 39 insertions(+), 51 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e2a3a443..946c28ca 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -129,9 +129,6 @@ S3method(xml_set_attrs,xml_nodeset) S3method(xml_set_name,xml_missing) S3method(xml_set_name,xml_node) S3method(xml_set_name,xml_nodeset) -S3method(xml_text,xml_missing) -S3method(xml_text,xml_node) -S3method(xml_text,xml_nodeset) S3method(xml_type,xml_missing) S3method(xml_type,xml_node) S3method(xml_type,xml_nodeset) diff --git a/R/xml_text.R b/R/xml_text.R index 9637835d..b3d2c70c 100644 --- a/R/xml_text.R +++ b/R/xml_text.R @@ -22,27 +22,7 @@ #' xml_integer(xml_find_all(x, "//@x")) #' @export xml_text <- function(x, trim = FALSE) { - UseMethod("xml_text") -} - -#' @export -xml_text.xml_missing <- function(x, trim = FALSE) { - NA_character_ -} - -#' @export -xml_text.xml_node <- function(x, trim = FALSE) { - res <- .Call(node_text, x$node) - if (isTRUE(trim)) { - res <- trim_text(res) - } - res -} - -#' @export -xml_text.xml_nodeset <- function(x, trim = FALSE) { - res <- .Call(nodeset_text, x) - + res <- .Call(node_text, x) if (isTRUE(trim)) { res <- trim_text(res) } diff --git a/src/init.c b/src/init.c index 5ddb1cfc..52b85122 100644 --- a/src/init.c +++ b/src/init.c @@ -57,7 +57,6 @@ extern SEXP node_set_namespace_prefix(SEXP, SEXP, SEXP); extern SEXP node_set_namespace_uri(SEXP, SEXP, SEXP); extern SEXP node_siblings(SEXP, SEXP); extern SEXP node_text(SEXP); -extern SEXP nodeset_text(SEXP); extern SEXP node_type(SEXP); extern SEXP nodeset_type(SEXP); extern SEXP node_write_character(SEXP, SEXP, SEXP); @@ -124,7 +123,6 @@ static const R_CallMethodDef CallEntries[] = { {"node_set_namespace_uri", (DL_FUNC) &node_set_namespace_uri, 3}, {"node_siblings", (DL_FUNC) &node_siblings, 2}, {"node_text", (DL_FUNC) &node_text, 1}, - {"nodeset_text", (DL_FUNC) &nodeset_text, 1}, {"node_type", (DL_FUNC) &node_type, 1}, {"nodeset_type", (DL_FUNC) &nodeset_type, 1}, {"node_write_character", (DL_FUNC) &node_write_character, 3}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index ce0494f5..7e45b514 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -91,41 +91,54 @@ extern "C" SEXP node_set_name(SEXP node_sxp, SEXP value) { END_CPP } -// [[export]] -extern "C" SEXP node_text(SEXP node_sxp) { - BEGIN_CPP - XPtrNode node(node_sxp); +SEXP node_text_impl(SEXP x) { + NodeType type = getNodeType(x); - return Rf_ScalarString(Xml2String(xmlNodeGetContent(node.checked_get())).asRString()); - END_CPP + SEXP out; + + switch(type) { + case NodeType::missing: + out = NA_STRING; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + + out = Xml2String(xmlNodeGetContent(node.checked_get())).asRString(); + break; + } + default: Rf_error("Unexpected node type"); + } + + return(out); } // [[export]] -extern "C" SEXP nodeset_text(SEXP node_sxp) { +extern "C" SEXP node_text(SEXP x) { BEGIN_CPP + NodeType type = getNodeType(x); - int n = Rf_xlength(node_sxp); - - SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(Rf_ScalarString(node_text_impl(x))); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); - for (int i = 0; i < n; ++i) { - SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); - if (Rf_inherits(node_sxp_i, "xml_node")) { - SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); - XPtrNode node_i(node_field_i); - SEXP text_i = Xml2String(xmlNodeGetContent(node_i.checked_get())).asRString(); - SET_STRING_ELT(out, i, text_i); - } else if (Rf_inherits(node_sxp_i, "xml_missing")) { - SET_STRING_ELT(out, i, NA_STRING); - } else { - // xml_nodeset can't appear - Rf_error("Unexpected node type"); + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP name_i = node_text_impl(x_i); + SET_STRING_ELT(out, i, name_i); } - } - UNPROTECT(1); - return out; + UNPROTECT(1); + return(out); + }; + } END_CPP } From 2aa3ca18ba3792299567cc7690b23ad18a75f826 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 08:50:39 +0000 Subject: [PATCH 07/24] Simplify `xml_double()` and `xml_integer()` --- NAMESPACE | 6 ------ R/xml_text.R | 36 ------------------------------------ man/xml_text.Rd | 3 --- 3 files changed, 45 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 946c28ca..1c7bba94 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -77,9 +77,6 @@ S3method(xml_attr,xml_nodeset) S3method(xml_attrs,xml_missing) S3method(xml_attrs,xml_node) S3method(xml_attrs,xml_nodeset) -S3method(xml_double,xml_missing) -S3method(xml_double,xml_node) -S3method(xml_double,xml_nodeset) S3method(xml_find_all,xml_missing) S3method(xml_find_all,xml_node) S3method(xml_find_all,xml_nodeset) @@ -95,9 +92,6 @@ S3method(xml_find_lgl,xml_nodeset) S3method(xml_find_num,xml_missing) S3method(xml_find_num,xml_node) S3method(xml_find_num,xml_nodeset) -S3method(xml_integer,xml_missing) -S3method(xml_integer,xml_node) -S3method(xml_integer,xml_nodeset) S3method(xml_length,xml_missing) S3method(xml_length,xml_node) S3method(xml_length,xml_nodeset) diff --git a/R/xml_text.R b/R/xml_text.R index b3d2c70c..a748f0f4 100644 --- a/R/xml_text.R +++ b/R/xml_text.R @@ -83,46 +83,10 @@ trim_text <- function(x) { #' @rdname xml_text #' @export xml_double <- function(x) { - UseMethod("xml_double") -} - -#' @export -xml_double.xml_missing <- function(x) { - NA_real_ -} - -#' @export -xml_double.xml_node <- function(x) { - as.numeric(xml_text(x)) -} - -#' @export -xml_double.xml_nodeset <- function(x) { as.numeric(xml_text(x)) } #' @export xml_integer <- function(x) { - UseMethod("xml_integer") -} - -#' @export -xml_integer.xml_missing <- function(x) { - NA_integer_ -} - -#' @rdname xml_text -#' @export -xml_integer <- function(x) { - UseMethod("xml_integer") -} - -#' @export -xml_integer.xml_node <- function(x) { - as.integer(xml_text(x)) -} - -#' @export -xml_integer.xml_nodeset <- function(x) { as.integer(xml_text(x)) } diff --git a/man/xml_text.Rd b/man/xml_text.Rd index 825b4c95..0026fbfe 100644 --- a/man/xml_text.Rd +++ b/man/xml_text.Rd @@ -5,7 +5,6 @@ \alias{xml_text<-} \alias{xml_set_text} \alias{xml_double} -\alias{xml_integer} \title{Extract or modify the text} \usage{ xml_text(x, trim = FALSE) @@ -15,8 +14,6 @@ xml_text(x) <- value xml_set_text(x, value) xml_double(x) - -xml_integer(x) } \arguments{ \item{x}{A document, node, or node set.} From 84674e15ec1efbe195ba72feacde064da8138eb1 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 10:46:44 +0000 Subject: [PATCH 08/24] Remove S3 dispatch in `xml_type()` --- NAMESPACE | 3 --- R/xml_type.R | 17 +------------ src/init.c | 2 -- src/xml2_node.cpp | 65 +++++++++++++++++++++++++++++------------------ 4 files changed, 41 insertions(+), 46 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1c7bba94..00246842 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -123,9 +123,6 @@ S3method(xml_set_attrs,xml_nodeset) S3method(xml_set_name,xml_missing) S3method(xml_set_name,xml_node) S3method(xml_set_name,xml_nodeset) -S3method(xml_type,xml_missing) -S3method(xml_type,xml_node) -S3method(xml_type,xml_nodeset) S3method(xml_url,xml_missing) S3method(xml_url,xml_node) S3method(xml_url,xml_nodeset) diff --git a/R/xml_type.R b/R/xml_type.R index b37d2ea3..d5aac52a 100644 --- a/R/xml_type.R +++ b/R/xml_type.R @@ -7,22 +7,7 @@ #' xml_type(x) #' xml_type(xml_contents(x)) xml_type <- function(x) { - UseMethod("xml_type") -} - -#' @export -xml_type.xml_missing <- function(x) { - NA_character_ -} - -#' @export -xml_type.xml_node <- function(x) { - xmlElementType[.Call(node_type, x$node)] -} - -#' @export -xml_type.xml_nodeset <- function(x) { - types <- .Call(nodeset_type, x) + types <- .Call(node_type, x) xmlElementType[types] } diff --git a/src/init.c b/src/init.c index 52b85122..9fcecfc0 100644 --- a/src/init.c +++ b/src/init.c @@ -58,7 +58,6 @@ extern SEXP node_set_namespace_uri(SEXP, SEXP, SEXP); extern SEXP node_siblings(SEXP, SEXP); extern SEXP node_text(SEXP); extern SEXP node_type(SEXP); -extern SEXP nodeset_type(SEXP); extern SEXP node_write_character(SEXP, SEXP, SEXP); extern SEXP node_write_connection(SEXP, SEXP, SEXP, SEXP); extern SEXP node_write_file(SEXP, SEXP, SEXP, SEXP); @@ -124,7 +123,6 @@ static const R_CallMethodDef CallEntries[] = { {"node_siblings", (DL_FUNC) &node_siblings, 2}, {"node_text", (DL_FUNC) &node_text, 1}, {"node_type", (DL_FUNC) &node_type, 1}, - {"nodeset_type", (DL_FUNC) &nodeset_type, 1}, {"node_write_character", (DL_FUNC) &node_write_character, 3}, {"node_write_connection", (DL_FUNC) &node_write_connection, 4}, {"node_write_file", (DL_FUNC) &node_write_file, 4}, diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 7e45b514..33b4744d 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -661,45 +661,60 @@ extern "C" SEXP nodes_duplicated(SEXP nodes) { END_CPP } -// [[export]] -extern "C" SEXP node_type(SEXP node_sxp) { - BEGIN_CPP - XPtrNode node(node_sxp); +int node_type_impl(SEXP x) { + NodeType type = getNodeType(x); - return Rf_ScalarInteger(node->type); - END_CPP + int out; + + switch(type) { + case NodeType::missing: + out = NA_INTEGER; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + + out = node->type; + break; + } + default: Rf_error("Unexpected node type"); + } + + return(out); } // [[export]] -extern "C" SEXP nodeset_type(SEXP node_sxp) { +extern "C" SEXP node_type(SEXP x) { BEGIN_CPP + NodeType type = getNodeType(x); - int n = Rf_xlength(node_sxp); + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(Rf_ScalarInteger(node_type_impl(x))); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); - SEXP out = PROTECT(Rf_allocVector(INTSXP, n)); + SEXP out = PROTECT(Rf_allocVector(INTSXP, n)); + int* p_out = INTEGER(out); - for (int i = 0; i < n; ++i) { - SEXP node_sxp_i = VECTOR_ELT(node_sxp, i); - - if (Rf_inherits(node_sxp_i, "xml_node")) { - SEXP node_field_i = VECTOR_ELT(node_sxp_i, 0); - XPtrNode node_i(node_field_i); - INTEGER(out)[i] = node_i->type; - } else if (Rf_inherits(node_sxp_i, "xml_missing")) { - INTEGER(out)[i] = NA_INTEGER; - } else { - // xml_nodeset can't appear - Rf_error("Unexpected node type"); + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + int type_i = node_type_impl(x_i); + *p_out = type_i; + ++p_out; } - } - UNPROTECT(1); - return out; + UNPROTECT(1); + return(out); + }; + } END_CPP } - // [[export]] extern "C" SEXP node_copy(SEXP node_sxp) { BEGIN_CPP From 69da196b0c2196e1b9e7a64c07720a0d8f195120 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 10:56:41 +0000 Subject: [PATCH 09/24] Inline `sample_nodeset()` in tests --- R/utils.R | 11 ----------- tests/testthat/test-xml_name.R | 9 ++++++++- tests/testthat/test-xml_text.R | 11 ++++++++++- tests/testthat/test-xml_type.R | 9 ++++++++- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/R/utils.R b/R/utils.R index 71e5f890..e5d26aeb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -80,14 +80,3 @@ xml2_example <- function(path = NULL) { system.file("extdata", path, package = "xml2", mustWork = TRUE) } } - -sample_nodeset <- function() { - x <- read_xml(" -

Some text.

-

Some other.

-

No bold text

- ") - - children <- xml_children(x) - xml_find_first(children, ".//b|.//i") -} diff --git a/tests/testthat/test-xml_name.R b/tests/testthat/test-xml_name.R index d298fc43..869db269 100644 --- a/tests/testthat/test-xml_name.R +++ b/tests/testthat/test-xml_name.R @@ -1,5 +1,12 @@ test_that("xml_name() returns the name", { - x <- sample_nodeset() + x <- read_xml(" +

Some text.

+

Some other.

+

No bold text

+ ") + + children <- xml_children(x) + x <- xml_find_first(children, ".//b|.//i") expect_equal(xml_name(x[[1]]), "b") expect_equal(xml_name(x[[2]]), "i") diff --git a/tests/testthat/test-xml_text.R b/tests/testthat/test-xml_text.R index 894794f8..6b7af3b6 100644 --- a/tests/testthat/test-xml_text.R +++ b/tests/testthat/test-xml_text.R @@ -7,8 +7,17 @@ test_that("xml_text returns only text without markup", { }) test_that("xml_text works properly with xml_nodeset objects", { + x <- read_xml(" +

Some text.

+

Some other.

+

No bold text

+ ") + + children <- xml_children(x) + x <- xml_find_first(children, ".//b|.//i") + expect_identical( - xml_text(sample_nodeset()), + xml_text(x), c("text", "other", NA) ) }) diff --git a/tests/testthat/test-xml_type.R b/tests/testthat/test-xml_type.R index 1972ef4e..a2a63376 100644 --- a/tests/testthat/test-xml_type.R +++ b/tests/testthat/test-xml_type.R @@ -1,5 +1,12 @@ test_that("multiplication works", { - x <- sample_nodeset() + x <- read_xml(" +

Some text.

+

Some other.

+

No bold text

+ ") + + children <- xml_children(x) + x <- xml_find_first(children, ".//b|.//i") expect_equal(xml_type(x[[1]]), "element") expect_equal(xml_type(x[[3]]), NA_character_) From 03445ecccb7e1993f917010a32c96ed2e2a94dbe Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 12:02:59 +0000 Subject: [PATCH 10/24] Move `NodeType` enum to `xml2_types.h` --- inst/include/xml2_types.h | 18 ++++++++++++++++++ src/xml2_utils.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/inst/include/xml2_types.h b/inst/include/xml2_types.h index c83f18d1..6b3c5972 100644 --- a/inst/include/xml2_types.h +++ b/inst/include/xml2_types.h @@ -5,6 +5,24 @@ #define R_NO_REMAP #include +enum NodeType { + missing = 1, + node = 2, + nodeset = 3, +}; + +inline const NodeType getNodeType(SEXP x) { + if (Rf_inherits(x, "xml_node")) { + return(NodeType::node); + } else if (Rf_inherits(x, "xml_nodeset")) { + return(NodeType::nodeset); + } else if (Rf_inherits(x, "xml_missing")) { + return(NodeType::missing); + } else { + Rf_error("Unexpected node type"); + } +} + template class XPtr { protected: SEXP data_; diff --git a/src/xml2_utils.h b/src/xml2_utils.h index d77b3cf7..231b7e38 100644 --- a/src/xml2_utils.h +++ b/src/xml2_utils.h @@ -9,24 +9,6 @@ #include #include -enum NodeType { - missing = 1, - node = 2, - nodeset = 3, -}; - -inline const NodeType getNodeType(SEXP x) { - if (Rf_inherits(x, "xml_node")) { - return(NodeType::node); - } else if (Rf_inherits(x, "xml_nodeset")) { - return(NodeType::nodeset); - } else if (Rf_inherits(x, "xml_missing")) { - return(NodeType::missing); - } else { - Rf_error("Unexpected node type"); - } -} - inline const xmlChar* asXmlChar(std::string const& x) { return (const xmlChar*) x.c_str(); } From 2293448cbfaf35dc77576be3fffa072299b260db Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 13:15:06 +0000 Subject: [PATCH 11/24] Document `xml_integer()` --- R/xml_text.R | 1 + man/xml_text.Rd | 3 +++ 2 files changed, 4 insertions(+) diff --git a/R/xml_text.R b/R/xml_text.R index a748f0f4..be2b173e 100644 --- a/R/xml_text.R +++ b/R/xml_text.R @@ -86,6 +86,7 @@ xml_double <- function(x) { as.numeric(xml_text(x)) } +#' @rdname xml_text #' @export xml_integer <- function(x) { as.integer(xml_text(x)) diff --git a/man/xml_text.Rd b/man/xml_text.Rd index 0026fbfe..825b4c95 100644 --- a/man/xml_text.Rd +++ b/man/xml_text.Rd @@ -5,6 +5,7 @@ \alias{xml_text<-} \alias{xml_set_text} \alias{xml_double} +\alias{xml_integer} \title{Extract or modify the text} \usage{ xml_text(x, trim = FALSE) @@ -14,6 +15,8 @@ xml_text(x) <- value xml_set_text(x, value) xml_double(x) + +xml_integer(x) } \arguments{ \item{x}{A document, node, or node set.} From 4a92b3b858f2f4542497435a06e3d292e1103f93 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 13:26:43 +0000 Subject: [PATCH 12/24] Implement `xml_length()` in C --- NAMESPACE | 3 --- R/xml_children.R | 21 +--------------- src/xml2_node.cpp | 63 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 00246842..d96c5dbe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -92,9 +92,6 @@ S3method(xml_find_lgl,xml_nodeset) S3method(xml_find_num,xml_missing) S3method(xml_find_num,xml_node) S3method(xml_find_num,xml_nodeset) -S3method(xml_length,xml_missing) -S3method(xml_length,xml_node) -S3method(xml_length,xml_nodeset) S3method(xml_ns,xml_document) S3method(xml_ns,xml_missing) S3method(xml_ns,xml_node) diff --git a/R/xml_children.R b/R/xml_children.R index 56f424a7..dc34ecfc 100644 --- a/R/xml_children.R +++ b/R/xml_children.R @@ -100,26 +100,7 @@ xml_parent.xml_nodeset <- function(x) { #' @export #' @rdname xml_children xml_length <- function(x, only_elements = TRUE) { - UseMethod("xml_length") -} - -#' @export -xml_length.xml_missing <- function(x, only_elements = TRUE) { - 0L -} - -#' @export -xml_length.xml_node <- function(x, only_elements = TRUE) { - .Call(node_length, x$node, only_elements) -} - -#' @export -xml_length.xml_nodeset <- function(x, only_elements = TRUE) { - if (length(x) == 0) { - return(0L) - } - - vapply(x, xml_length, only_elements = only_elements, FUN.VALUE = integer(1)) + .Call(node_length, x, only_elements) } #' @export diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 33b4744d..a0161eea 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -529,22 +529,69 @@ extern "C" SEXP node_children(SEXP node_sxp, SEXP only_node_sxp) { END_CPP } +int node_length_impl(SEXP x, bool only_node) { + NodeType type = getNodeType(x); + + int out; + + switch(type) { + case NodeType::missing: + out = 0; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + + out = 0; + for(xmlNode* cur = node->xmlChildrenNode; cur != NULL; cur = cur->next) { + if (only_node && cur->type != XML_ELEMENT_NODE) { + continue; + } + ++out; + } + break; + } + default: Rf_error("Unexpected node type"); + } + + return out; +} + // [[export]] -extern "C" SEXP node_length(SEXP node_sxp, SEXP only_node_sxp) { +extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { BEGIN_CPP + NodeType type = getNodeType(x); - XPtrNode node(node_sxp); bool only_node = LOGICAL(only_node_sxp)[0]; - int i = 0; - for(xmlNode* cur = node->xmlChildrenNode; cur != NULL; cur = cur->next) { - if (only_node && cur->type != XML_ELEMENT_NODE) { - continue; + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(Rf_ScalarInteger(node_length_impl(x, only_node))); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); + + if (n == 0) { + return(Rf_ScalarInteger(0)); } - ++i; + + SEXP out = PROTECT(Rf_allocVector(INTSXP, n)); + int* p_out = INTEGER(out); + + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + int length_i = node_length_impl(x_i, only_node); + *p_out = length_i; + p_out++; + } + + UNPROTECT(1); + return(out); + }; } - return Rf_ScalarInteger(i); END_CPP } From 1406af60bef62217b101a8cfe5d6a94555dacc7e Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 13:40:05 +0000 Subject: [PATCH 13/24] Implement `xml_attr()` in C --- NAMESPACE | 3 - R/xml_attr.R | 26 +------- src/xml2_node.cpp | 98 ++++++++++++++++++++++--------- tests/testthat/test-xml_missing.R | 2 +- 4 files changed, 72 insertions(+), 57 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d96c5dbe..f4fa013a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,9 +71,6 @@ S3method(xml_add_parent,xml_nodeset) S3method(xml_add_sibling,xml_missing) S3method(xml_add_sibling,xml_node) S3method(xml_add_sibling,xml_nodeset) -S3method(xml_attr,xml_missing) -S3method(xml_attr,xml_node) -S3method(xml_attr,xml_nodeset) S3method(xml_attrs,xml_missing) S3method(xml_attrs,xml_node) S3method(xml_attrs,xml_nodeset) diff --git a/R/xml_attr.R b/R/xml_attr.R index 1f2a2fca..495c4fd6 100644 --- a/R/xml_attr.R +++ b/R/xml_attr.R @@ -60,31 +60,7 @@ #' xml_attrs(doc) <- c("b:id" = "one", "f:id" = "two", "id" = "three") #' xml_set_attrs(doc, c("b:id" = "one", "f:id" = "two", "id" = "three")) xml_attr <- function(x, attr, ns = character(), default = NA_character_) { - UseMethod("xml_attr") -} - -#' @export -xml_attr.xml_missing <- function(x, attr, ns = character(), default = NA_character_) { - default -} - -#' @export -xml_attr.xml_node <- function(x, attr, ns = character(), - default = NA_character_) { - .Call(node_attr, x$node, attr, as.character(default), ns) -} - -#' @export -xml_attr.xml_nodeset <- function(x, attr, ns = character(), - default = NA_character_) { - vapply( - x, - xml_attr, - attr = attr, - default = default, - ns = ns, - FUN.VALUE = character(1) - ) + .Call(node_attr, x, attr, as.character(default), ns) } #' @export diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index a0161eea..e1bd23ca 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -168,15 +168,66 @@ const xmlChar* xmlNsDefinition(xmlNodePtr node, const xmlChar* lookup) { return NULL; } + +SEXP node_attr_impl(SEXP x, + std::string name, + SEXP missingVal, + SEXP nsMap_sxp) { + NodeType type = getNodeType(x); + + switch(type) { + case NodeType::missing: + return NA_STRING; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + if (name == "xmlns") { + return Xml2String(xmlNsDefinition(node, NULL)).asRString(missingVal); + } + + if (hasPrefix("xmlns:", name)) { + std::string prefix = name.substr(6); + return Xml2String(xmlNsDefinition(node, asXmlChar(prefix))).asRString(missingVal); + } + + xmlChar* string; + if (Rf_xlength(nsMap_sxp) == 0) { + string = xmlGetProp(node.checked_get(), asXmlChar(name)); + } else { + size_t colon = name.find(':'); + if (colon == std::string::npos) { + // Has namespace spec, but attribute not qualified, so look for attribute + // without namespace + string = xmlGetNoNsProp(node.checked_get(), asXmlChar(name)); + } else { + // Split name into prefix & attr, then look up full url + std::string + prefix = name.substr(0, colon), + attr = name.substr(colon + 1, name.size() - 1); + + std::string url = NsMap(nsMap_sxp).findUrl(prefix); + + string = xmlGetNsProp(node.checked_get(), asXmlChar(attr), asXmlChar(url)); + } + } + + return Xml2String(string).asRString(missingVal); + break; + } + default: Rf_error("Unexpected node type"); + } +} + // [[export]] extern "C" SEXP node_attr( - SEXP node_sxp, + SEXP x, SEXP name_sxp, SEXP missing_sxp, SEXP nsMap_sxp) { BEGIN_CPP + NodeType type = getNodeType(x); - XPtrNode node(node_sxp); std::string name(CHAR(STRING_ELT(name_sxp, 0))); if (Rf_xlength(missing_sxp) != 1) { @@ -185,37 +236,28 @@ extern "C" SEXP node_attr( SEXP missingVal = STRING_ELT(missing_sxp, 0); - if (name == "xmlns") { - return Rf_ScalarString(Xml2String(xmlNsDefinition(node, NULL)).asRString(missingVal)); - } - - if (hasPrefix("xmlns:", name)) { - std::string prefix = name.substr(6); - return Rf_ScalarString(Xml2String(xmlNsDefinition(node, asXmlChar(prefix))).asRString(missingVal)); - } - - xmlChar* string; - if (Rf_xlength(nsMap_sxp) == 0) { - string = xmlGetProp(node.checked_get(), asXmlChar(name)); - } else { - size_t colon = name.find(':'); - if (colon == std::string::npos) { - // Has namespace spec, but attribute not qualified, so look for attribute - // without namespace - string = xmlGetNoNsProp(node.checked_get(), asXmlChar(name)); - } else { - // Split name into prefix & attr, then look up full url - std::string - prefix = name.substr(0, colon), - attr = name.substr(colon + 1, name.size() - 1); + switch(type) + { + case NodeType::missing: + case NodeType::node : + return Rf_ScalarString(node_attr_impl(x, name, missingVal, nsMap_sxp)); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); - std::string url = NsMap(nsMap_sxp).findUrl(prefix); + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); - string = xmlGetNsProp(node.checked_get(), asXmlChar(attr), asXmlChar(url)); + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP attr_i = node_attr_impl(x_i, name, missingVal, nsMap_sxp); + SET_STRING_ELT(out, i, attr_i); } + + UNPROTECT(1); + return(out); + }; } - return Rf_ScalarString(Xml2String(string).asRString(missingVal)); END_CPP } diff --git a/tests/testthat/test-xml_missing.R b/tests/testthat/test-xml_missing.R index fdc24f04..78a3a144 100644 --- a/tests/testthat/test-xml_missing.R +++ b/tests/testthat/test-xml_missing.R @@ -21,7 +21,7 @@ test_that("xml_missing methods return properly for all S3 methods", { expect_equal(tree_structure(mss), NA_character_) expect_error(write_xml(mss), "Missing data cannot be written") expect_error(write_html(mss), "Missing data cannot be written") - expect_equal(xml_attr(mss), NA_character_) + expect_equal(xml_attr(mss, "dummy_attr"), NA_character_) expect_equal(xml_attrs(mss), NA_character_) expect_equal(xml_find_all(mss), xml_nodeset()) expect_equal(xml_find_chr(mss), character()) From 3e2049ef987f4bc559106711f61fa1cbd5227a0f Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 14:01:13 +0000 Subject: [PATCH 14/24] Implement `xml_attrs()` in C --- NAMESPACE | 3 -- R/xml_attr.R | 17 +------ src/xml2_node.cpp | 126 ++++++++++++++++++++++++++++++---------------- 3 files changed, 83 insertions(+), 63 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f4fa013a..100d4c3a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,9 +71,6 @@ S3method(xml_add_parent,xml_nodeset) S3method(xml_add_sibling,xml_missing) S3method(xml_add_sibling,xml_node) S3method(xml_add_sibling,xml_nodeset) -S3method(xml_attrs,xml_missing) -S3method(xml_attrs,xml_node) -S3method(xml_attrs,xml_nodeset) S3method(xml_find_all,xml_missing) S3method(xml_find_all,xml_node) S3method(xml_find_all,xml_nodeset) diff --git a/R/xml_attr.R b/R/xml_attr.R index 495c4fd6..804a3788 100644 --- a/R/xml_attr.R +++ b/R/xml_attr.R @@ -72,22 +72,7 @@ xml_has_attr <- function(x, attr, ns = character()) { #' @export #' @rdname xml_attr xml_attrs <- function(x, ns = character()) { - UseMethod("xml_attrs") -} - -#' @export -xml_attrs.xml_missing <- function(x, ns = character()) { - NA_character_ -} - -#' @export -xml_attrs.xml_node <- function(x, ns = character()) { - .Call(node_attrs, x$node, nsMap = ns) -} - -#' @export -xml_attrs.xml_nodeset <- function(x, ns = character()) { - lapply(x, xml_attrs, ns = ns) + .Call(node_attrs, x, nsMap = ns) } #' @param value character vector of new value. diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index e1bd23ca..7718e17f 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -261,64 +261,102 @@ extern "C" SEXP node_attr( END_CPP } -// [[export]] -extern "C" SEXP node_attrs(SEXP node_sxp, SEXP nsMap_sxp) { - BEGIN_CPP - XPtrNode node_(node_sxp); - - int n = 0; - xmlNodePtr node = node_.checked_get(); - - if (node->type == XML_ELEMENT_NODE) { - // attributes - for(xmlAttr* cur = node->properties; cur != NULL; cur = cur->next) - n++; - - // namespace definitions - for(xmlNsPtr cur = node->nsDef; cur != NULL; cur = cur->next) - n++; - - SEXP names = PROTECT(Rf_allocVector(STRSXP, n)); - SEXP values = PROTECT(Rf_allocVector(STRSXP, n)); - - int i = 0; - for(xmlAttr* cur = node->properties; cur != NULL; cur = cur->next, ++i) { - std::string name = nodeName(cur, nsMap_sxp); - SET_STRING_ELT(names, i, Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8)); +SEXP node_attrs_impl(SEXP x, SEXP nsMap_sxp) { + NodeType type = getNodeType(x); - xmlNs* ns = cur->ns; - if (ns == NULL) { - if (Rf_xlength(nsMap_sxp) > 0) { - SET_STRING_ELT(values, i, Xml2String(xmlGetNoNsProp(node, cur->name)).asRString()); + switch(type) { + case NodeType::missing: + return Rf_ScalarString(NA_STRING); + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node_(node_sxp); + + int n = 0; + xmlNodePtr node = node_.checked_get(); + + if (node->type == XML_ELEMENT_NODE) { + // attributes + for(xmlAttr* cur = node->properties; cur != NULL; cur = cur->next) + n++; + + // namespace definitions + for(xmlNsPtr cur = node->nsDef; cur != NULL; cur = cur->next) + n++; + + SEXP names = PROTECT(Rf_allocVector(STRSXP, n)); + SEXP values = PROTECT(Rf_allocVector(STRSXP, n)); + + int i = 0; + for(xmlAttr* cur = node->properties; cur != NULL; cur = cur->next, ++i) { + std::string name = nodeName(cur, nsMap_sxp); + SET_STRING_ELT(names, i, Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8)); + + xmlNs* ns = cur->ns; + if (ns == NULL) { + if (Rf_xlength(nsMap_sxp) > 0) { + SET_STRING_ELT(values, i, Xml2String(xmlGetNoNsProp(node, cur->name)).asRString()); + } else { + SET_STRING_ELT(values, i, Xml2String(xmlGetProp(node, cur->name)).asRString()); + } } else { - SET_STRING_ELT(values, i, Xml2String(xmlGetProp(node, cur->name)).asRString()); + SET_STRING_ELT(values, i, Xml2String(xmlGetNsProp(node, cur->name, ns->href)).asRString()); } - } else { - SET_STRING_ELT(values, i, Xml2String(xmlGetNsProp(node, cur->name, ns->href)).asRString()); } - } - for(xmlNsPtr cur = node->nsDef; cur != NULL; cur = cur->next, ++i) { - if (cur->prefix == NULL) { - SET_STRING_ELT(names, i, Rf_mkChar("xmlns")); - } else { - std::string name = std::string("xmlns:") + Xml2String(cur->prefix).asStdString(); - SET_STRING_ELT(names,i, Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8)); + for(xmlNsPtr cur = node->nsDef; cur != NULL; cur = cur->next, ++i) { + if (cur->prefix == NULL) { + SET_STRING_ELT(names, i, Rf_mkChar("xmlns")); + } else { + std::string name = std::string("xmlns:") + Xml2String(cur->prefix).asStdString(); + SET_STRING_ELT(names,i, Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8)); + } + SET_STRING_ELT(values, i, Xml2String(cur->href).asRString()); } - SET_STRING_ELT(values, i, Xml2String(cur->href).asRString()); + + Rf_setAttrib(values, R_NamesSymbol, names); + + UNPROTECT(2); + return values; } - Rf_setAttrib(values, R_NamesSymbol, names); + return Rf_allocVector(STRSXP, 0); + break; + } + default: Rf_error("Unexpected node type"); + } +} + +// [[export]] +extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { + BEGIN_CPP + NodeType type = getNodeType(x); - UNPROTECT(2); - return values; + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(node_attrs_impl(x, nsMap_sxp)); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); + + SEXP out = PROTECT(Rf_allocVector(VECSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP name_i = node_attrs_impl(x_i, nsMap_sxp); + SET_VECTOR_ELT(out, i, name_i); + } + + UNPROTECT(1); + return(out); + }; } - return Rf_allocVector(STRSXP, 0); END_CPP } - // Fix the tree by removing the namespace pointers to the given tree void xmlRemoveNamespace(xmlNodePtr tree, xmlNsPtr ns) { From 08ba881759cae326f009644b8dda7b50ad0d88ea Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 14:03:43 +0000 Subject: [PATCH 15/24] Implement `xml_path()` in C --- NAMESPACE | 3 --- R/xml_path.R | 17 +--------------- src/xml2_node.cpp | 49 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 100d4c3a..bf0525b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -93,9 +93,6 @@ S3method(xml_ns,xml_nodeset) S3method(xml_parent,xml_missing) S3method(xml_parent,xml_node) S3method(xml_parent,xml_nodeset) -S3method(xml_path,xml_missing) -S3method(xml_path,xml_node) -S3method(xml_path,xml_nodeset) S3method(xml_remove,xml_missing) S3method(xml_remove,xml_node) S3method(xml_remove,xml_nodeset) diff --git a/R/xml_path.R b/R/xml_path.R index 79f34d9e..7e233a58 100644 --- a/R/xml_path.R +++ b/R/xml_path.R @@ -10,20 +10,5 @@ #' x <- read_xml("") #' xml_path(xml_find_all(x, ".//baz")) xml_path <- function(x) { - UseMethod("xml_path") -} - -#' @export -xml_path.xml_missing <- function(x) { - NA_character_ -} - -#' @export -xml_path.xml_node <- function(x) { - .Call(node_path, x$node) -} - -#' @export -xml_path.xml_nodeset <- function(x) { - vapply(x, xml_path, FUN.VALUE = character(1)) + .Call(node_path, x) } diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 7718e17f..3aa8c50a 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -749,12 +749,55 @@ extern "C" SEXP node_parent(SEXP node_sxp) { END_CPP } +SEXP node_path_impl(SEXP x) { + NodeType type = getNodeType(x); + + SEXP out; + + switch(type) { + case NodeType::missing: + out = NA_STRING; + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + + out = Xml2String(xmlGetNodePath(node.checked_get())).asRString(); + break; + } + default: Rf_error("Unexpected node type"); + } + + return(out); +} + // [[export]] -extern "C" SEXP node_path(SEXP node_sxp) { +extern "C" SEXP node_path(SEXP x) { BEGIN_CPP - XPtrNode node(node_sxp); + NodeType type = getNodeType(x); + + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(Rf_ScalarString(node_path_impl(x))); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); + + SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP name_i = node_path_impl(x_i); + SET_STRING_ELT(out, i, name_i); + } + + UNPROTECT(1); + return(out); + }; + } - return Rf_ScalarString(Xml2String(xmlGetNodePath(node.checked_get())).asRString()); END_CPP } From efc24cfce51c544392695d0ddcb3c24641e3a050 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 14:06:42 +0000 Subject: [PATCH 16/24] Fix return style --- src/xml2_node.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 3aa8c50a..28dc9565 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -47,7 +47,7 @@ SEXP node_name_impl(SEXP x, SEXP nsMap) { default: Rf_error("Unexpected node type"); } - return(out); + return out; } // [[export]] @@ -59,7 +59,7 @@ extern "C" SEXP node_name(SEXP x, SEXP nsMap) { { case NodeType::missing: case NodeType::node : - return(Rf_ScalarString(node_name_impl(x, nsMap))); + return Rf_ScalarString(node_name_impl(x, nsMap)); break; case NodeType::nodeset: { int n = Rf_xlength(x); @@ -73,7 +73,7 @@ extern "C" SEXP node_name(SEXP x, SEXP nsMap) { } UNPROTECT(1); - return(out); + return out; }; } @@ -110,7 +110,7 @@ SEXP node_text_impl(SEXP x) { default: Rf_error("Unexpected node type"); } - return(out); + return out; } // [[export]] @@ -122,7 +122,7 @@ extern "C" SEXP node_text(SEXP x) { { case NodeType::missing: case NodeType::node : - return(Rf_ScalarString(node_text_impl(x))); + return Rf_ScalarString(node_text_impl(x)); break; case NodeType::nodeset: { int n = Rf_xlength(x); @@ -136,7 +136,7 @@ extern "C" SEXP node_text(SEXP x) { } UNPROTECT(1); - return(out); + return out; }; } @@ -254,7 +254,7 @@ extern "C" SEXP node_attr( } UNPROTECT(1); - return(out); + return out; }; } @@ -336,7 +336,7 @@ extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { { case NodeType::missing: case NodeType::node : - return(node_attrs_impl(x, nsMap_sxp)); + return node_attrs_impl(x, nsMap_sxp); break; case NodeType::nodeset: { int n = Rf_xlength(x); @@ -350,7 +350,7 @@ extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { } UNPROTECT(1); - return(out); + return out; }; } @@ -648,13 +648,13 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { { case NodeType::missing: case NodeType::node : - return(Rf_ScalarInteger(node_length_impl(x, only_node))); + return Rf_ScalarInteger(node_length_impl(x, only_node)); break; case NodeType::nodeset: { int n = Rf_xlength(x); if (n == 0) { - return(Rf_ScalarInteger(0)); + return Rf_ScalarInteger(0); } SEXP out = PROTECT(Rf_allocVector(INTSXP, n)); @@ -668,7 +668,7 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { } UNPROTECT(1); - return(out); + return out; }; } @@ -768,7 +768,7 @@ SEXP node_path_impl(SEXP x) { default: Rf_error("Unexpected node type"); } - return(out); + return out; } // [[export]] @@ -780,7 +780,7 @@ extern "C" SEXP node_path(SEXP x) { { case NodeType::missing: case NodeType::node : - return(Rf_ScalarString(node_path_impl(x))); + return Rf_ScalarString(node_path_impl(x)); break; case NodeType::nodeset: { int n = Rf_xlength(x); @@ -794,7 +794,7 @@ extern "C" SEXP node_path(SEXP x) { } UNPROTECT(1); - return(out); + return out; }; } @@ -850,7 +850,7 @@ int node_type_impl(SEXP x) { default: Rf_error("Unexpected node type"); } - return(out); + return out; } // [[export]] @@ -862,7 +862,7 @@ extern "C" SEXP node_type(SEXP x) { { case NodeType::missing: case NodeType::node : - return(Rf_ScalarInteger(node_type_impl(x))); + return Rf_ScalarInteger(node_type_impl(x)); break; case NodeType::nodeset: { int n = Rf_xlength(x); @@ -878,7 +878,7 @@ extern "C" SEXP node_type(SEXP x) { } UNPROTECT(1); - return(out); + return out; }; } From 6a5624f50f0c33d712909e24dfbf7b8c41a89315 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 14:12:05 +0000 Subject: [PATCH 17/24] Fix compilation warning --- src/xml2_node.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 28dc9565..35f105ec 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -75,6 +75,7 @@ extern "C" SEXP node_name(SEXP x, SEXP nsMap) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -138,6 +139,7 @@ extern "C" SEXP node_text(SEXP x) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -256,6 +258,7 @@ extern "C" SEXP node_attr( UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -352,6 +355,7 @@ extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -670,6 +674,7 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -796,6 +801,7 @@ extern "C" SEXP node_path(SEXP x) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP @@ -880,6 +886,7 @@ extern "C" SEXP node_type(SEXP x) { UNPROTECT(1); return out; }; + default: Rf_error("Unexpected node type"); } END_CPP From 7c1f008b594b947ecd86bdf85c4a8e951d6518bf Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:05:00 +0000 Subject: [PATCH 18/24] Move `getNodeType()` to `xml2_utils.h` --- inst/include/xml2_types.h | 22 ++-------------------- src/xml2_utils.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/inst/include/xml2_types.h b/inst/include/xml2_types.h index 6b3c5972..88356025 100644 --- a/inst/include/xml2_types.h +++ b/inst/include/xml2_types.h @@ -5,24 +5,6 @@ #define R_NO_REMAP #include -enum NodeType { - missing = 1, - node = 2, - nodeset = 3, -}; - -inline const NodeType getNodeType(SEXP x) { - if (Rf_inherits(x, "xml_node")) { - return(NodeType::node); - } else if (Rf_inherits(x, "xml_nodeset")) { - return(NodeType::nodeset); - } else if (Rf_inherits(x, "xml_missing")) { - return(NodeType::missing); - } else { - Rf_error("Unexpected node type"); - } -} - template class XPtr { protected: SEXP data_; @@ -39,12 +21,12 @@ template class XPtr { data_ = R_MakeExternalPtr((void *) p, R_NilValue, R_NilValue); R_PreserveObject(data_); } - + XPtr(const XPtr &old) { data_ = old.data_; R_PreserveObject(data_); } - + XPtr& operator=(const XPtr &other) { R_PreserveObject(other.data_); if (data_ != NULL) { diff --git a/src/xml2_utils.h b/src/xml2_utils.h index 231b7e38..d77b3cf7 100644 --- a/src/xml2_utils.h +++ b/src/xml2_utils.h @@ -9,6 +9,24 @@ #include #include +enum NodeType { + missing = 1, + node = 2, + nodeset = 3, +}; + +inline const NodeType getNodeType(SEXP x) { + if (Rf_inherits(x, "xml_node")) { + return(NodeType::node); + } else if (Rf_inherits(x, "xml_nodeset")) { + return(NodeType::nodeset); + } else if (Rf_inherits(x, "xml_missing")) { + return(NodeType::missing); + } else { + Rf_error("Unexpected node type"); + } +} + inline const xmlChar* asXmlChar(std::string const& x) { return (const xmlChar*) x.c_str(); } From 10fcf60e822b17eb04892d4f891f7205897e2e60 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:06:45 +0000 Subject: [PATCH 19/24] Don't align `:` in switches --- src/xml2_node.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 35f105ec..d6938f47 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -58,7 +58,7 @@ extern "C" SEXP node_name(SEXP x, SEXP nsMap) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarString(node_name_impl(x, nsMap)); break; case NodeType::nodeset: { @@ -122,7 +122,7 @@ extern "C" SEXP node_text(SEXP x) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarString(node_text_impl(x)); break; case NodeType::nodeset: { @@ -241,7 +241,7 @@ extern "C" SEXP node_attr( switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarString(node_attr_impl(x, name, missingVal, nsMap_sxp)); break; case NodeType::nodeset: { @@ -338,7 +338,7 @@ extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return node_attrs_impl(x, nsMap_sxp); break; case NodeType::nodeset: { @@ -651,7 +651,7 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarInteger(node_length_impl(x, only_node)); break; case NodeType::nodeset: { @@ -784,7 +784,7 @@ extern "C" SEXP node_path(SEXP x) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarString(node_path_impl(x)); break; case NodeType::nodeset: { @@ -867,7 +867,7 @@ extern "C" SEXP node_type(SEXP x) { switch(type) { case NodeType::missing: - case NodeType::node : + case NodeType::node: return Rf_ScalarInteger(node_type_impl(x)); break; case NodeType::nodeset: { From 43689510497ce40091b8644b9e485a9d76e54a72 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:08:26 +0000 Subject: [PATCH 20/24] Pass string by reference --- src/xml2_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index d6938f47..ca9b8c31 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -172,7 +172,7 @@ const xmlChar* xmlNsDefinition(xmlNodePtr node, const xmlChar* lookup) { SEXP node_attr_impl(SEXP x, - std::string name, + const std::string& name, SEXP missingVal, SEXP nsMap_sxp) { NodeType type = getNodeType(x); From a1ba91581816f5581471e495ea4dce524892419b Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:10:33 +0000 Subject: [PATCH 21/24] Use array indexing for pointer --- src/xml2_node.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index ca9b8c31..f9cb4132 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -667,8 +667,7 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { for (int i = 0; i < n; ++i) { SEXP x_i = VECTOR_ELT(x, i); int length_i = node_length_impl(x_i, only_node); - *p_out = length_i; - p_out++; + p_out[i] = length_i; } UNPROTECT(1); @@ -879,8 +878,7 @@ extern "C" SEXP node_type(SEXP x) { for (int i = 0; i < n; ++i) { SEXP x_i = VECTOR_ELT(x, i); int type_i = node_type_impl(x_i); - *p_out = type_i; - ++p_out; + p_out[i] = type_i; } UNPROTECT(1); From 71b21602c9792212c8b8f17d98680b83e138ba4e Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:14:13 +0000 Subject: [PATCH 22/24] Fix test description --- tests/testthat/test-xml_type.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-xml_type.R b/tests/testthat/test-xml_type.R index a2a63376..eda8230d 100644 --- a/tests/testthat/test-xml_type.R +++ b/tests/testthat/test-xml_type.R @@ -1,4 +1,4 @@ -test_that("multiplication works", { +test_that("xml_type() works", { x <- read_xml("

Some text.

Some other.

From 5d3071f00d0d0710cc3fd67e566537990e9ed1f9 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:17:13 +0000 Subject: [PATCH 23/24] Add `stop_unexpected_node_type()` --- src/xml2_node.cpp | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index f9cb4132..128b184a 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -12,6 +12,10 @@ #include "xml2_types.h" #include "xml2_utils.h" +void stop_unexpected_node_type() { + Rf_error("Unexpected node type"); +} + template // for xmlAttr and xmlNode std::string nodeName(T* node, SEXP nsMap) { std::string name = Xml2String(node->name).asStdString(); @@ -44,7 +48,7 @@ SEXP node_name_impl(SEXP x, SEXP nsMap) { out = Rf_mkCharLenCE(name.c_str(), name.size(), CE_UTF8); break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } return out; @@ -75,7 +79,7 @@ extern "C" SEXP node_name(SEXP x, SEXP nsMap) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -108,7 +112,7 @@ SEXP node_text_impl(SEXP x) { out = Xml2String(xmlNodeGetContent(node.checked_get())).asRString(); break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } return out; @@ -139,7 +143,7 @@ extern "C" SEXP node_text(SEXP x) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -217,7 +221,7 @@ SEXP node_attr_impl(SEXP x, return Xml2String(string).asRString(missingVal); break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } } @@ -258,7 +262,7 @@ extern "C" SEXP node_attr( UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -326,7 +330,7 @@ SEXP node_attrs_impl(SEXP x, SEXP nsMap_sxp) { return Rf_allocVector(STRSXP, 0); break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } } @@ -355,7 +359,7 @@ extern "C" SEXP node_attrs(SEXP x, SEXP nsMap_sxp) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -635,7 +639,7 @@ int node_length_impl(SEXP x, bool only_node) { } break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } return out; @@ -673,7 +677,7 @@ extern "C" SEXP node_length(SEXP x, SEXP only_node_sxp) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -769,7 +773,7 @@ SEXP node_path_impl(SEXP x) { out = Xml2String(xmlGetNodePath(node.checked_get())).asRString(); break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } return out; @@ -800,7 +804,7 @@ extern "C" SEXP node_path(SEXP x) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP @@ -852,7 +856,7 @@ int node_type_impl(SEXP x) { out = node->type; break; } - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } return out; @@ -884,7 +888,7 @@ extern "C" SEXP node_type(SEXP x) { UNPROTECT(1); return out; }; - default: Rf_error("Unexpected node type"); + default: stop_unexpected_node_type(); } END_CPP From fe5f269d11cbe43e520c7ab09138153026580a9d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Mon, 4 Sep 2023 07:41:00 +0000 Subject: [PATCH 24/24] Fix compiler warning --- src/xml2_node.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 128b184a..35b35cc4 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -12,6 +12,7 @@ #include "xml2_types.h" #include "xml2_utils.h" +__attribute__ ((noreturn)) void stop_unexpected_node_type() { Rf_error("Unexpected node type"); }