Skip to content
This repository has been archived by the owner on Jan 30, 2025. It is now read-only.

Commit

Permalink
Add sparsity()
Browse files Browse the repository at this point in the history
  • Loading branch information
nfrerebeau committed Feb 13, 2024
1 parent 86dab39 commit ab7dac4
Show file tree
Hide file tree
Showing 17 changed files with 158 additions and 31 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Collate:
'remove.R'
'replace.R'
'seek.R'
'sparcity.R'
'statistics.R'
'transform.R'
'utilities.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ exportMethods(replace_empty)
exportMethods(replace_zero)
exportMethods(seek_columns)
exportMethods(seek_rows)
exportMethods(sparsity)
importFrom(methods,.valueClassTest)
importFrom(methods,setGeneric)
importFrom(methods,setMethod)
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# arkhe 1.5.0.9000
## New classes and methods
* Add `describe()` to describe a table.
* Add `describe()` to quickly describe a table.
* Add `sparsity()` to computes data sparsity (proportion of zeros).

# arkhe 1.5.0
## New classes and methods
Expand Down
19 changes: 19 additions & 0 deletions R/AllGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,25 @@ setGeneric(
def = function(x, ...) standardGeneric("describe")
)

#' Sparcity
#'
#' Computes data sparsity (proportion of zeros).
#' @param x An \R object (should be a [`matrix`] or a [`data.frame`]).
#' @param count A [`logical`] scalar: should a count be returned instead of a
#' proportion?
#' @param ... Currently not used.
#' @return
#' A length-one [`numeric`] vector.
#' @example inst/examples/ex-describe.R
#' @author N. Frerebeau
#' @docType methods
#' @family data summaries
#' @aliases sparsity-method
setGeneric(
name = "sparsity",
def = function(x, ...) standardGeneric("sparsity")
)

# Data transformation ==========================================================

# Mathematics ==================================================================
Expand Down
22 changes: 20 additions & 2 deletions R/describe.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,30 @@ setMethod(
cols_var <- ngettext(p_var, "variable", "variables")
msg_col_var <- sprintf("%d %s with no variance", p_var, cols_var)

cat(msg_tbl, msg_row_NA, msg_col_NA, msg_col_var, sep = "\n* ")
## Sparsity
spa <- sparsity(x, count = FALSE)
msg_spa <- sprintf("%s of values are zero", label_percent(spa, digits = 1))

## Variable types
num <- detect(x, f = is.numeric, margin = 2)
bin <- detect(x, f = is.logical, margin = 2)
n_num <- sum(num)
n_bin <- sum(bin)
n_cha <- sum(!num & !bin)

msg_num <- sprintf("%d numeric %s", n_num, ngettext(n_num, "variable", "variables"))
msg_bin <- sprintf("%d binary %s", n_bin, ngettext(n_bin, "variable", "variables"))
msg_cha <- sprintf("%d categorial %s", n_cha, ngettext(n_cha, "variable", "variables"))

cat(msg_tbl, msg_num, msg_cha, msg_bin, sep = "\n* ")
cat("\nData checking:", msg_spa, msg_col_var, sep = "\n* ")
cat("\nMissing values:", msg_row_NA, msg_col_NA, sep = "\n* ")

# tot <- list(
# m = m, p = p,
# n_numeric = n_num, n_categorial = n_cha, n_binary = n_bin,
# row_missing = m_NA, col_missing = p_NA,
# col_constant = p_var
# zero_values = spa, zero_variance = p_var
# )
invisible(x)
}
Expand Down
32 changes: 17 additions & 15 deletions R/predicates.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
# PREDICATES

# Not exported =================================================================
is_empty_string <- function(x, na.rm = FALSE, assert_type = FALSE) {
if (isTRUE(assert_type)) assert_type(x, "character")
z <- x == ""
z
}
is_zero_numeric <- function(x, na.rm = FALSE, assert_type = FALSE) {
if (isTRUE(assert_type)) assert_type(x, "numeric")
z <- x == 0
z
is_zero_numeric <- function(x, tolerance = sqrt(.Machine$double.eps)) {
if (is.numeric(x)) return(abs(x) <= tolerance)
rep(FALSE, length(x))
}

# Helpers ======================================================================
Expand All @@ -27,6 +21,8 @@ is_zero_numeric <- function(x, na.rm = FALSE, assert_type = FALSE) {
#' `FALSE` otherwise.
#' @param names A [`character`] vector specifying the names to test `x`
#' with. If `NULL`, returns `TRUE` if `x` has names, and `FALSE` otherwise.
#' @param tolerance A [`numeric`] scalar giving the tolerance to check within
#' (for `numeric` vector).
#' @param na.rm A [`logical`] scalar: should missing values (including `NaN`)
#' be omitted?
#' @return A [`logical`] scalar.
Expand Down Expand Up @@ -70,9 +66,15 @@ has_infinite <- function(x) {

#' @export
#' @rdname predicate-utils
is_unique <- function(x, na.rm = FALSE) {
is_unique <- function(x, tolerance = sqrt(.Machine$double.eps), na.rm = FALSE) {
if (na.rm) x <- stats::na.omit(x)
length(unique(x)) == 1
if (is.numeric(x)) {
cte <- is_constant(x, tolerance = tolerance)
if (is.na(cte)) cte <- FALSE
} else {
cte <- length(unique(x)) == 1
}
cte
}
#' @export
#' @rdname predicate-utils
Expand Down Expand Up @@ -219,9 +221,9 @@ NULL

#' @export
#' @rdname predicate-numeric
is_zero <- function(x, ...) {
is_zero <- function(x, tolerance = sqrt(.Machine$double.eps), ...) {
assert_type(x, "numeric")
x == 0
abs(x) <= tolerance
}
#' @export
#' @rdname predicate-numeric
Expand Down Expand Up @@ -249,7 +251,7 @@ is_negative <- function(x, strict = FALSE, ...) {
}
#' @export
#' @rdname predicate-numeric
is_whole <- function(x, tolerance = .Machine$double.eps^0.5, ...) {
is_whole <- function(x, tolerance = sqrt(.Machine$double.eps), ...) {
assert_type(x, "numeric")
abs(x - round(x, digits = 0)) <= tolerance
}
Expand All @@ -273,7 +275,7 @@ NULL

#' @export
#' @rdname predicate-trend
is_constant <- function(x, tolerance = .Machine$double.eps^0.5, na.rm = FALSE) {
is_constant <- function(x, tolerance = sqrt(.Machine$double.eps), na.rm = FALSE) {
assert_type(x, "numeric")
abs(max(x, na.rm = na.rm) - min(x, na.rm = na.rm)) <= tolerance
}
Expand Down
4 changes: 2 additions & 2 deletions R/remove.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ setMethod(
signature = c(x = "ANY"),
definition = function(x, margin = 1, all = FALSE,
verbose = getOption("arkhe.verbose")) {
discard(x, f = is_empty_string, margin = margin,
discard(x, f = nzchar, margin = margin, negate = TRUE,
all = all, na.rm = TRUE, verbose = verbose)
}
)
Expand All @@ -64,7 +64,7 @@ setMethod(
f = "remove_constant",
signature = c(x = "ANY"),
definition = function(x, na.rm = FALSE, verbose = getOption("arkhe.verbose")) {
discard(x, f = function(x) { is_unique(x, na.rm) },
discard(x, f = function(x) { is_unique(x, na.rm = na.rm) },
margin = 2, all = FALSE, verbose = verbose)
}
)
4 changes: 2 additions & 2 deletions R/replace.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ setMethod(
f = "replace_empty",
signature = c(x = "matrix"),
definition = function(x, value) {
x[is_empty_string(x)] <- value
x[!nzchar(x)] <- value
x
}
)
Expand All @@ -125,7 +125,7 @@ setMethod(
noblank <- lapply(
X = x[, char, drop = FALSE],
FUN = function(x, value) {
x[is_empty_string(x)] <- value
x[!nzchar(x)] <- value
x
},
value = value
Expand Down
17 changes: 17 additions & 0 deletions R/sparcity.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# DATA SUMMARY: SPARCITY
#' @include AllGenerics.R
NULL

#' @export
#' @rdname sparsity
#' @aliases sparsity,ANY-method
setMethod(
f = "sparsity",
signature = c(x = "ANY"),
definition = function(x, count = FALSE) {
zeros <- sum(count(x, f = is_zero_numeric, margin = 2, na.rm = TRUE))
if (count) return(zeros)
total <- prod(dim(x))
zeros / total
}
)
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
},
"SystemRequirements": null
},
"fileSize": "308.433KB",
"fileSize": "311.96KB",
"citation": [
{
"@type": "SoftwareSourceCode",
Expand Down
6 changes: 5 additions & 1 deletion inst/examples/ex-describe.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
## Create a data matrix
X <- matrix(sample(1:10, 15, TRUE), nrow = 3, ncol = 5)
X <- matrix(sample(0:9, 15, TRUE), nrow = 3, ncol = 5)

## Add NA
k <- sample(1:15, 3, FALSE)
X[k] <- NA

## Sparsity
sparsity(X)

## Quick description
describe(X)
4 changes: 4 additions & 0 deletions inst/tinytest/test_describe.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Read fake data
artefacts <- read.csv(system.file("tinytest/fake.csv", package = "arkhe"))

# Sparsity =====================================================================
expect_equal(sparsity(artefacts, count = TRUE), 20)
expect_equal(sparsity(artefacts, count = FALSE), 20/450)

# Describe =====================================================================
expect_stdout(describe(artefacts))
10 changes: 9 additions & 1 deletion man/describe.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/predicate-numeric.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/predicate-trend.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/predicate-utils.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions man/sparsity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit ab7dac4

Please sign in to comment.