Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functionality to get information on formatting properties of runs. #576

Merged
merged 4 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
- Fix. `docx_summary` preserves non-breaking hyphens. Non-breaking hyphens are
replaced with a hyphen-minus (Unicode character 002D). Closes #573.

## Features

- `docx_summary()` gains parameter 'detailed' which allows to get a detailed
summary including formatting properties of runs in a paragraph. Formatting
properties are stored in a list column `run`, where each element
is a dataframe with rows corresponding to a single
run and columns containing the information on formatting properties.

# officer 0.6.6

Expand Down
76 changes: 70 additions & 6 deletions R/fortify_docx.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ docxtable_as_tibble <- function(node, styles, preserve = FALSE) {
}

#' @importFrom xml2 xml_has_attr
par_as_tibble <- function(node, styles) {
par_as_tibble <- function(node, styles, detailed = FALSE) {
style_node <- xml_child(node, "w:pPr/w:pStyle")
if (inherits(style_node, "xml_missing")) {
style_name <- NA
Expand All @@ -129,14 +129,71 @@ par_as_tibble <- function(node, styles) {
stringsAsFactors = FALSE
)

if (detailed) {
nodes_run <- xml_find_all(node, "w:r")
run_data <- lapply(nodes_run, run_as_tibble)

run_data <- mapply(function(x, id) {
x$id <- id
x
}, run_data, seq_along(run_data), SIMPLIFY = FALSE)
run_data <- rbind_match_columns(run_data)

par_data$run <- I(list(run_data))
}

par_data$content_type <- rep("paragraph", nrow(par_data))
par_data
}
#' @importFrom xml2 xml_has_attr
val_child <- function(node, child_path, attr = "val", default = NULL) {
child_node <- xml_child(node, child_path)
if (inherits(child_node, "xml_missing")) return(NA_character_)
if (!xml_has_attr(child_node, attr)) default
else xml_attr(child_node, attr)
}

val_child_lgl <- function(node, child_path, attr = "val", default = NULL) {
val <- val_child(node = node, child_path = child_path, attr = attr, default = default)
if (is.na(val)) return(NA)
else (val %in% c("1", "on", "true"))
}

val_child_int <- function(node, child_path, attr = "val", default = NULL) {
as.integer(
val_child(node = node, child_path = child_path, attr = attr, default = default)
)
}

run_as_tibble <- function(node, styles) {
style_node <- xml_child(node, "w:rPr/w:rStyle")
if (inherits(style_node, "xml_missing")) {
style_name <- NA
} else {
style_id <- xml_attr(style_node, "val")
style_name <- styles$style_name[styles$style_id %in% style_id]
}
run_data <- data.frame(
text = xml_text(node),
bold = val_child_lgl(node, "w:rPr/w:b", default = "true"),
italic = val_child_lgl(node, "w:rPr/w:i", default = "true"),
underline = val_child(node, "w:rPr/w:u"),
sz = val_child_int(node, "w:rPr/w:sz"),
szCs = val_child_int(node, "w:rPr/w:szCs"),
color = val_child(node, "w:rPr/w:color"),
shading = val_child(node, "w:rPr/w:shd"),
shading_color = val_child(node, "w:rPr/w:shd", attr = "color"),
shading_fill = val_child(node, "w:rPr/w:shd", attr = "fill"),
stringsAsFactors = FALSE
)

run_data
}

node_content <- function(node, x, preserve = FALSE) {
node_content <- function(node, x, preserve = FALSE, detailed = FALSE) {
node_name <- xml_name(node)
switch(node_name,
p = par_as_tibble(node, styles_info(x)),
p = par_as_tibble(node, styles_info(x), detailed = detailed),
tbl = docxtable_as_tibble(node, styles_info(x), preserve = preserve),
NULL
)
Expand All @@ -158,6 +215,12 @@ node_content <- function(node, x, preserve = FALSE) {
#' the `{docxtractr}` package by Bob Rudis.
#' @param remove_fields if TRUE, prevent field codes from appearing in the
#' returned data.frame.
#' @param detailed Should information on runs be included in summary dataframe?
#' Defaults to `FALSE`. If `TRUE` a list column `run` is added to the summary
#' containing a summary of formatting properties of runs as a dataframe with
#' rows corresponding to a single run and columns containing the information
#' on formatting properties.
#'
#' @examples
#' example_docx <- system.file(
#' package = "officer",
Expand All @@ -169,7 +232,7 @@ node_content <- function(node, x, preserve = FALSE) {
#'
#' docx_summary(doc, preserve = TRUE)[28, ]
#' @export
docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE, detailed = FALSE) {
if (remove_fields) {
instrText_nodes <- xml_find_all(x$doc_obj$get(), "//w:instrText")
xml_remove(instrText_nodes)
Expand All @@ -178,18 +241,19 @@ docx_summary <- function(x, preserve = FALSE, remove_fields = FALSE) {
all_nodes <- xml_find_all(x$doc_obj$get(), "/w:document/w:body/*[self::w:p or self::w:tbl]")


data <- lapply(all_nodes, node_content, x = x, preserve = preserve)
data <- lapply(all_nodes, node_content, x = x, preserve = preserve, detailed = detailed)

data <- mapply(function(x, id) {
x$doc_index <- id
x
}, data, seq_along(data), SIMPLIFY = FALSE)

data <- rbind_match_columns(data)

colnames <- c(
"doc_index", "content_type", "style_name", "text",
"level", "num_id", "row_id", "is_header", "cell_id",
"col_span", "row_span"
"col_span", "row_span", "run"
)
colnames <- intersect(colnames, names(data))
data[, colnames]
Expand Down
8 changes: 7 additions & 1 deletion man/docx_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 69 additions & 0 deletions tests/testthat/test-doc-summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,75 @@ test_that("preserves non breaking hyphens", {
)
})

test_that("detailed summary", {
doc <- read_docx()

fpar_ <- fpar(
ftext("Formatted ", prop = fp_text(bold = TRUE, color = "red")),
ftext("paragraph ", prop = fp_text(
shading.color = "blue"
)),
ftext("with multiple runs.",
prop = fp_text(italic = TRUE, font.size = 20, font.family = "Arial")
)
)

doc <- body_add_fpar(doc, fpar_, style = "Normal")

fpar_ <- fpar(
"Unformatted ",
"paragraph ",
"with multiple runs."
)

doc <- body_add_fpar(doc, fpar_, style = "Normal")

doc <- body_add_par(doc, "Single Run", style = "Normal")

doc <- body_add_fpar(doc,
fpar(
"Single formatetd run ",
fp_t = fp_text(bold = TRUE, color = "red")
)
)

xml_elt <- paste0(
officer:::wp_ns_yes,
"<w:pPr><w:pStyle w:val=\"Normal\"/></w:pPr>",
"<w:r><w:rPr></w:rPr><w:t>NA</w:t></w:r>",
"<w:r><w:rPr><w:b/><w:i/></w:rPr><w:t>toggle</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"0\"/><w:i w:val=\"0\"/></w:rPr><w:t>0</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"1\"/><w:i w:val=\"1\"/></w:rPr><w:t>1</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"false\"/><w:i w:val=\"false\"/></w:rPr><w:t>false</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"true\"/><w:i w:val=\"true\"/></w:rPr><w:t>true</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"off\"/><w:i w:val=\"off\"/></w:rPr><w:t>off</w:t></w:r>",
"<w:r><w:rPr><w:b w:val=\"on\"/><w:i w:val=\"on\"/></w:rPr><w:t>on</w:t></w:r>",
"</w:p>"
)

doc <- officer:::body_add_xml(
x = doc, str = xml_elt
)

doc_sum <- docx_summary(doc, detailed = TRUE)

expect_true("run" %in% names(doc_sum))
expect_type(doc_sum$run, "list")
expect_equal(lengths(doc_sum$run), rep(11, 5))
expect_equal(sapply(doc_sum$run, nrow), c(3, 3, 1, 1, 8))

expect_true(all(sapply(doc_sum$run$bold, is.logical)))
expect_true(all(sapply(doc_sum$run$italic, is.logical)))
expect_true(all(sapply(doc_sum$run$sz, is.integer)))
expect_true(all(sapply(doc_sum$run$szCs, is.integer)))
expect_true(all(sapply(doc_sum$run$underline, is_character)))
expect_true(all(sapply(doc_sum$run$color, is_character)))
expect_true(all(sapply(doc_sum$run$shading, is_character)))
expect_true(all(sapply(doc_sum$run$shading_color, is_character)))
expect_true(all(sapply(doc_sum$run$shading_fill, is_character)))
})




test_that("pptx summary", {
Expand Down
Loading