383 improve default binning (#387)

* Update getDemographyAggregatedData with bins and stairstep new input * Fixes #383 binning performed after simulationSet selection The binning method is now centralized for all the demography parameters * Update documentation from merge * Update vignette with the simplified workflow * aggregation outputs NA values when there is no data in the bin * Remove simulationSetName from aggregation Because the aggregation is now performed by simulation set, keeping the variable is not necessary and actually causing issues for the stairstep implementation
Open-Systems-Pharmacology · Nov 23, 2020 · ffefd32 · ffefd32
1 parent 8102ca3
commit ffefd32
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 61 deletions.
diff --git a/R/utilities-calculate-pk-parameters.R b/R/utilities-calculate-pk-parameters.R
@@ -260,14 +260,7 @@ plotPopulationPKParameters <- function(structureSets,
           "median" = pkParameterMetaData$Value
         )
 
-        aggregatedData <- getDemographyAggregatedData(
-          data = pkParameterData,
-          xParameterName = demographyParameter,
-          yParameterName = "Value",
-          xParameterBreaks = settings$xParametersBreaks[[demographyParameter]]
-        )
-
-        populationNames <- levels(factor(aggregatedData$Population))
+        populationNames <- levels(factor(pkParameterData$simulationSetName))
 
         # For pediatric workflow, range plots compare reference population to the other populations
         if (workflowType %in% c(PopulationWorkflowTypes$pediatric)) {
@@ -285,7 +278,14 @@ plotPopulationPKParameters <- function(structureSets,
           )
 
           for (populationName in populationNames[!populationNames %in% referencePopulationName]) {
-            comparisonData <- aggregatedData[aggregatedData$Population %in% populationName, ]
+            comparisonData <- pkParameterData[pkParameterData$simulationSetName %in% populationName, ]
+            comparisonData <- getDemographyAggregatedData(
+              data = comparisonData,
+              xParameterName = demographyParameter,
+              yParameterName = "Value",
+              bins = settings$bins,
+              stairstep = settings$stairstep
+            )
             comparisonData$Population <- paste("Simulated", AggregationConfiguration$names$middle, "and", AggregationConfiguration$names$range)
 
             comparisonVpcPlot <- vpcParameterPlot(
@@ -304,7 +304,15 @@ plotPopulationPKParameters <- function(structureSets,
 
         # Regular range plots not associated to workflow type
         for (populationName in populationNames) {
-          vpcData <- aggregatedData[aggregatedData$Population %in% populationName, ]
+          vpcData <- pkParameterData[pkParameterData$simulationSetName %in% populationName, ]
+          vpcData <- getDemographyAggregatedData(
+            data = vpcData,
+            xParameterName = demographyParameter,
+            yParameterName = "Value",
+            bins = settings$bins,
+            stairstep = settings$stairstep
+          )
+
           vpcData$Population <- paste("Simulated", AggregationConfiguration$names$middle, "and", AggregationConfiguration$names$range)
           vpcPlot <- vpcParameterPlot(
             data = vpcData,

diff --git a/R/utilities-demography.R b/R/utilities-demography.R
@@ -62,7 +62,7 @@ plotDemographyParameters <- function(structureSets,
           metaData = demographyMetaData,
           dataMapping = histogramMapping,
           plotConfiguration = settings$plotConfigurations[["histogram"]],
-          bins = settings$bins %||% 11
+          bins = settings$bins %||% AggregationConfiguration$bins
         )
         demographyPlots[[parameterLabel]] <- demographyHistogram
         demographyCaptions[[parameterLabel]] <- getPkParametersCaptions("Histogram", captionSimulationNames, demographyMetaData[[parameterName]])
@@ -84,7 +84,7 @@ plotDemographyParameters <- function(structureSets,
             metaData = demographyMetaData,
             dataMapping = histogramMapping,
             plotConfiguration = settings$plotConfigurations[["histogram"]],
-            bins = settings$bins %||% 11
+            bins = settings$bins %||% AggregationConfiguration$bins
           )
 
           demographyPlots[[paste0(parameterLabel, "-", populationName)]] <- demographyHistogram
@@ -112,12 +112,6 @@ plotDemographyParameters <- function(structureSets,
         "x" = demographyMetaData[[demographyParameter]],
         "median" = demographyMetaData[[parameterName]]
       )
-      aggregatedData <- getDemographyAggregatedData(
-        data = demographyData,
-        xParameterName = demographyParameter,
-        yParameterName = parameterName,
-        xParameterBreaks = settings$xParametersBreaks[[demographyParameter]]
-      )
 
       # For pediatric workflow, range plots compare reference population to the other populations
       if (workflowType %in% c(PopulationWorkflowTypes$pediatric)) {
@@ -138,7 +132,14 @@ plotDemographyParameters <- function(structureSets,
         )
 
         for (populationName in populationNames[!populationNames %in% referencePopulationName]) {
-          comparisonData <- aggregatedData[aggregatedData$Population %in% populationName, ]
+          comparisonData <- demographyData[demographyData$simulationSetName %in% populationName, ]
+          comparisonData <- getDemographyAggregatedData(
+            data = comparisonData,
+            xParameterName = demographyParameter,
+            yParameterName = parameterName,
+            bins = settings$bins,
+            stairstep = settings$stairstep
+          )
           comparisonData$Population <- paste("Simulated", AggregationConfiguration$names$middle, "and", AggregationConfiguration$names$range)
 
           comparisonVpcPlot <- vpcParameterPlot(
@@ -156,7 +157,14 @@ plotDemographyParameters <- function(structureSets,
       }
 
       for (populationName in populationNames) {
-        vpcData <- aggregatedData[aggregatedData$Population %in% populationName, ]
+        vpcData <- demographyData[demographyData$simulationSetName %in% populationName, ]
+        vpcData <- getDemographyAggregatedData(
+          data = vpcData,
+          xParameterName = demographyParameter,
+          yParameterName = parameterName,
+          bins = settings$bins,
+          stairstep = settings$stairstep
+        )
         vpcData$Population <- paste("Simulated", AggregationConfiguration$names$middle, "and", AggregationConfiguration$names$range)
 
         vpcPlot <- vpcParameterPlot(
@@ -232,42 +240,43 @@ getDefaultDemographyXParameters <- function(workflowType) {
 getDemographyAggregatedData <- function(data,
                                         xParameterName,
                                         yParameterName,
-                                        xParameterBreaks = NULL) {
-  xParameterBreaks <- xParameterBreaks %||% 10
+                                        bins = NULL,
+                                        stairstep = TRUE) {
+  stairstep <- stairstep %||% TRUE
+  xParameterBreaks <- bins %||% AggregationConfiguration$bins
+  # binningOnQuantiles use data distribution to improve the binning
+  if (isOfLength(bins, 1) & AggregationConfiguration$binUsingQuantiles) {
+    xParameterBreaks <- unique(unname(quantile(x = data[, xParameterName], probs = seq(0, 1, length.out = xParameterBreaks))))
+  }
   xParameterBins <- cut(data[, xParameterName], breaks = xParameterBreaks)
 
+  # simulationSetName was removed from "by" input because
+  # it is a factor class that messes up the aggregation now that
+  # simulationSetName filtering is performed before aggregation
   xData <- stats::aggregate(
     x = data[, xParameterName],
-    by = list(
-      Bins = xParameterBins,
-      Population = data[, "simulationSetName"]
-    ),
-    FUN = AggregationConfiguration$functions$middle
+    by = list(Bins = xParameterBins),
+    FUN = AggregationConfiguration$functions$middle,
+    drop = FALSE
   )
 
   medianData <- stats::aggregate(
     x = data[, yParameterName],
-    by = list(
-      Bins = xParameterBins,
-      Population = data[, "simulationSetName"]
-    ),
-    FUN = AggregationConfiguration$functions$middle
+    by = list(Bins = xParameterBins),
+    FUN = AggregationConfiguration$functions$middle,
+    drop = FALSE
   )
   lowPercData <- stats::aggregate(
     x = data[, yParameterName],
-    by = list(
-      Bins = xParameterBins,
-      Population = data[, "simulationSetName"]
-    ),
-    FUN = AggregationConfiguration$functions$ymin
+    by = list(Bins = xParameterBins),
+    FUN = AggregationConfiguration$functions$ymin,
+    drop = FALSE
   )
   highPercData <- stats::aggregate(
     x = data[, yParameterName],
-    by = list(
-      Bins = xParameterBins,
-      Population = data[, "simulationSetName"]
-    ),
-    FUN = AggregationConfiguration$functions$ymax
+    by = list(Bins = xParameterBins),
+    FUN = AggregationConfiguration$functions$ymax,
+    drop = FALSE
   )
 
   aggregatedData <- cbind.data.frame(xData,
@@ -276,6 +285,22 @@ getDemographyAggregatedData <- function(data,
     ymax = highPercData$x
   )
 
+  if (stairstep) {
+    # Method in documentation of cut to get the bin edges
+    labs <- levels(xParameterBins)
+    xminValues <- as.numeric(sub("\\((.+),.*", "\\1", labs))
+    xmaxValues <- as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs))
+
+    xData <- rbind.data.frame(xData, xData)
+    xData$x <- sort(c(xminValues, xmaxValues))
+
+    aggregatedData <- cbind.data.frame(xData,
+      median = rep(medianData$x, each = 2),
+      ymin = rep(lowPercData$x, each = 2),
+      ymax = rep(highPercData$x, each = 2)
+    )
+  }
+
   return(aggregatedData)
 }
 

diff --git a/R/utilities-plots.R b/R/utilities-plots.R
@@ -45,7 +45,11 @@ setPlotFormat <- function(format, width = NULL, height = NULL, units = NULL) {
 }
 
 #' @title AggregationConfiguration
-#' @description Aggregation default properties  (which functions and their captions)
+#' @description Aggregation default properties  (which functions and their captions).
+#' @field functions list of `middle`, `ymin` and `ymax` functions for aggregation
+#' @field names list of legend captions for `middle` and `range` from aggregation
+#' @field bins default number of bins in plots
+#' @field binUsingQuantiles logical to choose a binning based on the quantiles rather thanon a constant interval width
 #' @export
 AggregationConfiguration <- list(
   functions = list(
@@ -60,7 +64,9 @@ AggregationConfiguration <- list(
   names = list(
     middle = "median",
     range = "[5-95th] percentiles"
-  )
+  ),
+  bins = 11,
+  binUsingQuantiles = TRUE
 )
 
 workflowWatermarkMessage <- "preliminary analysis"

diff --git a/man/AggregationConfiguration.Rd b/man/AggregationConfiguration.Rd
diff --git a/man/getSAFileIndex.Rd b/man/getSAFileIndex.Rd
diff --git a/vignettes/plot-settings.Rmd b/vignettes/plot-settings.Rmd
@@ -291,18 +291,23 @@ The names in the __`plotConfigurations`__ list available for the __"plotPKParame
 - __"comparisonVpcPlot"__: for VPC like plots of PK parameters vs Demography parameters comparing to reference population performed by pediatric type
 
 Since pediatric workflows perform VPC like plot, aggregation of the data is performed along the demography parameters.
-The binning of the demography parameters can be set by the __`settings`__ optional field __`xParametersBreaks`__.
-This field is a list vectors corresponding of the edges of the bins and whose names are the demography parameter path.
-Below shows an example of how to set such an option:
+The binning of the demography parameters can be set by the optional field __`bins`__ from the task field __`settings`__.
+This field can be either a unique value corresponding to the number of bins or a vector defining the bin edges for all the demography parameter paths.
+
+Besides, the final plot can either link the aggregated values or plot them as stairstep. 
+The default behaviour is to perform a stairstep plot, but this can be tuned with the optional field __`stairstep`__ (`TRUE`/`FALSE`) from the task field __`settings`__.
+
+Below shows examples of how to set such options:
 
 ```{r PK parameter settings}
-# Create an empty list
-myParameterSettings <- list()
+# Associate the bin edges
+workflowA$plotPKParameters$settings$bins <- c(0, 1, 2, 3, 5, 10)
 
-# Associate the bin edges for Age demography parameter field
-myParameterSettings[[StandardPath$Age]]$xParametersBreaks <- c(0, 1, 2, 3, 5, 10)
+# Associate the number of bins
+workflowA$plotPKParameters$settings$bins <- 15
 
-workflowA$plotPKParameters$settings <- myParameterSettings
+# Set VPC as stair step 
+workflowA$plotPKParameters$settings$stairstep <- TRUE
 ```
 
 ## Sensitivity plots
@@ -357,7 +362,10 @@ The names in the __`plotConfigurations`__ list available for the __"plotDemograp
 - __"comparisonVpcPlot"__: for VPC like plots of Demography parameters vs Demography parameters comparing to reference population performed by pediatric type
 
 Since pediatric workflows perform VPC like plot, aggregation of the data is performed along the demography parameters.
-The binning of the demography parameters can be set by the __`settings`__ optional field __`xParametersBreaks`__.
-This field is a list vectors corresponding of the edges of the bins and whose names are the demography parameter path.
+The binning of the demography parameters can be set by the optional field __`bins`__ from the task field __`settings`__.
+This field can be either a unique value corresponding to the number of bins or a vector defining the bin edges for all the demography parameter paths.
+
+Besides, the final plot can either link the aggregated values or plot them as stairstep. 
+The default behaviour is to perform a stairstep plot, but this can be tuned with the optional field __`stairstep`__ (`TRUE`/`FALSE`) from the task field __`settings`__.
 
 Similarly, parallel comparison and ratio types of population workflows perform histograms, whose binning can set using the __`settings`__ field __`bins`__.