Merge pull request #244 from hendersontrent/trent-dev3
Small error fix
hendersontrent authored Jul 18, 2023
2 parents 8ab0d9e + d4a8a0c commit 3e96af4
Showing 53 changed files with 1,052 additions and 876 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: theft
Type: Package
Title: Tools for Handling Extraction of Features from Time Series
Version: 0.5.0
Version: 0.5.1
Date: 2023-07-18
Authors@R: c(
person("Trent", "Henderson", email = "then6675@uni.sydney.edu.au", role = c("cre", "aut")),
2 changes: 1 addition & 1 deletion R/filter_duplicates.R
@@ -3,7 +3,7 @@
#' @import dplyr
#'
#' @param data \code{feature_calculations} object containing the raw feature matrix produced by \code{calculate_features}
#' @param preference \code{character} denoting which feature set to keep (meaning the others will be filtered out) between \code{"feasts"}, \code{"tsfeatures"}, and \code{"Kats"} since there is considerable overlap between these three sets. Defaults to \code{"feasts"}. Only applies if \code{by_set = TRUE} (since a set of "All features" is constructed automatically as a comparator)
#' @param preference \code{character} denoting which feature set to keep (meaning the others will be filtered out) between \code{"feasts"}, \code{"tsfeatures"}, and \code{"Kats"}, since there is considerable overlap between these three sets. Defaults to \code{"feasts"}. Duplicates are NOT removed from the non-preferred sets when computing their set-level results, to ensure a fair comparison; they are only filtered out when constructing the "All features" set (if \code{by_set = TRUE}) and when computing individual feature results (to reduce redundant calculations)
#' @return \code{feature_calculations} object containing filtered feature data
#' @author Trent Henderson
#' @export
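As a usage sketch (not part of this diff), assuming `features` is a `feature_calculations` object returned by `calculate_features()`, the filter might be applied as:

# Keep the "feasts" versions of features duplicated across feasts, tsfeatures, and Kats
filtered <- filter_duplicates(data = features, preference = "feasts")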
62 changes: 62 additions & 0 deletions R/select_stat_cols.R
@@ -0,0 +1,62 @@
#' Helper function to select only the relevant columns for statistical testing
#'
#' @import dplyr
#'
#' @param data \code{data.frame} of classification accuracy results
#' @param by_set \code{Boolean} specifying whether you want to compare feature sets (if \code{TRUE}) or individual features (if \code{FALSE}).
#' @param hypothesis \code{character} denoting whether p-values should be calculated for each feature set or feature (depending on the \code{by_set} argument) individually relative to its null distribution (requires \code{use_null = TRUE} in \code{tsfeature_classifier}) through \code{"null"}, or whether pairwise comparisons between each set or feature should be conducted on main model fits only through \code{"pairwise"}.
#' @param metric \code{character} denoting the classification performance metric to use in statistical testing. Can be one of \code{"accuracy"}, \code{"precision"}, \code{"recall"}, \code{"f1"}. Defaults to \code{"accuracy"}
#' @returns object of class \code{data.frame}
#' @author Trent Henderson
#'

select_stat_cols <- function(data, by_set, metric, hypothesis){

  if(hypothesis == "null"){

    # Null hypothesis comparisons retain the model_type column (main versus null model fits)

    if(by_set){
      if(metric == "accuracy"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$method, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    } else{
      if(metric == "accuracy"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$names, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp <- data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    }
  } else{

    # Pairwise comparisons are conducted on main model fits only, so model_type is not needed

    if(by_set){
      if(metric == "accuracy"){
        tmp <- data %>% dplyr::select(c(.data$method, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp <- data %>% dplyr::select(c(.data$method, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp <- data %>% dplyr::select(c(.data$method, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp <- data %>% dplyr::select(c(.data$method, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    } else{
      if(metric == "accuracy"){
        tmp <- data %>% dplyr::select(c(.data$names, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp <- data %>% dplyr::select(c(.data$names, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp <- data %>% dplyr::select(c(.data$names, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp <- data %>% dplyr::select(c(.data$names, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    }
  }

  return(tmp)
}
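A minimal sketch of how the helper might be called (the `acc_results` data frame of classification metrics is an assumed input; any of "accuracy", "precision", "recall", or "f1" can be requested):

# Pairwise comparison of individual features on mean F1 score:
# keeps the "names" column and renames mean_f1_score to "mymetric"
stat_cols <- select_stat_cols(data = acc_results, by_set = FALSE, metric = "f1", hypothesis = "pairwise")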
64 changes: 10 additions & 54 deletions R/stat_test.R
@@ -47,51 +47,7 @@ stat_test <- function(data, iter_data, row_id, by_set = FALSE, hypothesis, metri

# Select only relevant columns and rename for easier use later

  if(hypothesis == "null"){
    if(by_set){
      if(metric == "accuracy"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$method, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$method, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_recall)
      }
    } else{
      if(metric == "accuracy"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$names, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp_data <- tmp_data %>% dplyr::select(c(.data$model_type, .data$names, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    }
  } else{
    if(by_set){
      if(metric == "accuracy"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$method, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$method, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$method, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp_data <- tmp_data %>% dplyr::select(c(.data$method, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    } else{
      if(metric == "accuracy"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$names, .data$accuracy)) %>% dplyr::rename(mymetric = .data$accuracy)
      } else if(metric == "precision"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$names, .data$mean_precision)) %>% dplyr::rename(mymetric = .data$mean_precision)
      } else if(metric == "recall"){
        tmp_data <- tmp_data %>% dplyr::select(c(.data$names, .data$mean_recall)) %>% dplyr::rename(mymetric = .data$mean_recall)
      } else{
        tmp_data <- tmp_data %>% dplyr::select(c(.data$names, .data$mean_f1_score)) %>% dplyr::rename(mymetric = .data$mean_f1_score)
      }
    }
  }
  tmp_data <- select_stat_cols(data = tmp_data, by_set = by_set, metric = metric, hypothesis = hypothesis)

# Check for 0 variance

@@ -141,18 +97,18 @@ stat_test <- function(data, iter_data, row_id, by_set = FALSE, hypothesis, metri
if(0 %in% sd_check$stddev){
if(hypothesis == "null"){
if(by_set){
outs <- data.frame(hypothesis = paste0(iter_filt, " > ", iter_filt, " (null)"),
outs <- data.frame(hypothesis = paste0(iter_filt, " != ", iter_filt, " (null)"),
method = iter_filt, metric = metric, t_statistic = NA, p.value = NA)
} else{
outs <- data.frame(hypothesis = paste0(iter_filt$names, " > ", iter_filt$names, " (null)"),
method = iter_filt$names, metric = metric, t_statistic = NA, p.value = NA)
outs <- data.frame(hypothesis = paste0(iter_filt, " != ", iter_filt, " (null)"),
method = iter_filt, metric = metric, t_statistic = NA, p.value = NA)
}
} else{
if(by_set){
outs <- data.frame(hypothesis = paste0(iter_filt$method_a, " > ", iter_filt$method_b),
outs <- data.frame(hypothesis = paste0(iter_filt$method_a, " != ", iter_filt$method_b),
method_a = iter_filt$method_a, method_b = iter_filt$method_b, metric = metric, t_statistic = NA, p.value = NA)
} else{
outs <- data.frame(hypothesis = paste0(iter_filt$names_a, " > ", iter_filt$names_b),
outs <- data.frame(hypothesis = paste0(iter_filt$names_a, " != ", iter_filt$names_b),
names_a = iter_filt$names_a, names_b = iter_filt$names_b, metric = metric, t_statistic = NA, p.value = NA)
}
}
@@ -161,12 +117,12 @@ stat_test <- function(data, iter_data, row_id, by_set = FALSE, hypothesis, metri
if(by_set){
t_test <- resampled_ttest(x = x, y = y, n = n_resamples, n1 = train_test_sizes[1], n2 = train_test_sizes[1])

outs <- data.frame(hypothesis = paste0(iter_filt, " > ", iter_filt, " (null)"),
outs <- data.frame(hypothesis = paste0(iter_filt, " != ", iter_filt, " (null)"),
method = iter_filt, metric = metric, t_statistic = t_test$statistic, p.value = t_test$p.value)
} else{
t_test <- resampled_ttest(x = x, y = y, n = n_resamples, n1 = train_test_sizes[1], n2 = train_test_sizes[1])

outs <- data.frame(hypothesis = paste0(iter_filt, " > ", iter_filt, " (null)"),
outs <- data.frame(hypothesis = paste0(iter_filt, " != ", iter_filt, " (null)"),
names = iter_filt, method = gsub("_.*", "\\1", iter_filt),
original_names = gsub("^[^_]*_", "", iter_filt), metric = metric,
t_statistic = t_test$statistic, p.value = t_test$p.value)
@@ -175,12 +131,12 @@ stat_test <- function(data, iter_data, row_id, by_set = FALSE, hypothesis, metri
if(by_set){
t_test <- resampled_ttest(x = x, y = y, n = n_resamples, n1 = train_test_sizes[1], n2 = train_test_sizes[1])

outs <- data.frame(hypothesis = paste0(iter_filt$method_a, " > ", iter_filt$method_b),
outs <- data.frame(hypothesis = paste0(iter_filt$method_a, " != ", iter_filt$method_b),
method_a = iter_filt$method_a, method_b = iter_filt$method_b, metric = metric, t_statistic = t_test$statistic, p.value = t_test$p.value)
} else{
t_test <- resampled_ttest(x = x, y = y, n = n_resamples, n1 = train_test_sizes[1], n2 = train_test_sizes[1])

outs <- data.frame(hypothesis = paste0(iter_filt$names_a, " > ", iter_filt$names_b),
outs <- data.frame(hypothesis = paste0(iter_filt$names_a, " != ", iter_filt$names_b),
names_a = iter_filt$names_a, names_b = iter_filt$names_b, metric = metric, t_statistic = t_test$statistic, p.value = t_test$p.value)
}
}
37 changes: 25 additions & 12 deletions R/tsfeature_classifier.R
@@ -12,7 +12,7 @@
#' @param by_set \code{Boolean} specifying whether to compute classifiers for each feature set. Defaults to \code{TRUE}. If \code{FALSE}, the function will instead find the best individually-performing features
#' @param use_null \code{Boolean} whether to fit null models where class labels are shuffled in order to generate a null distribution that can be compared to performance on correct class labels. Defaults to \code{FALSE}
#' @param seed \code{integer} to fix R's random number generator to ensure reproducibility. Defaults to \code{123}
#' @param preference \code{character} denoting which feature set to keep (meaning the others will be filtered out) between \code{"feasts"}, \code{"tsfeatures"}, and \code{"Kats"} since there is considerable overlap between these three sets. Defaults to \code{"feasts"}. Only applies if \code{by_set = TRUE} (since a set of "All features" is constructed automatically as a comparator)
#' @param preference \code{character} denoting which feature set to keep (meaning the others will be filtered out) between \code{"feasts"}, \code{"tsfeatures"}, and \code{"Kats"}, since there is considerable overlap between these three sets. Defaults to \code{"feasts"}. Duplicates are NOT removed from the non-preferred sets when computing their set-level results, to ensure a fair comparison; they are only filtered out when constructing the "All features" set (if \code{by_set = TRUE}) and when computing individual feature results (to reduce redundant calculations)
#' @return \code{list} containing a named \code{vector} of train-test set sizes, and a \code{data.frame} of classification performance results
#' @author Trent Henderson
#' @export
@@ -55,18 +55,16 @@ tsfeature_classifier <- function(data, classifier = NULL, train_size = 0.75, n_r

# Set up data

tmp <- data[[1]] %>%
dplyr::mutate(group = as.factor(as.character(.data$group)),
names = paste0(.data$method, "_", .data$names)) %>%
dplyr::select(c(.data$id, .data$group, .data$names, .data$values)) %>%
tidyr::pivot_wider(id_cols = c("id", "group"), names_from = "names", values_from = "values") %>%
dplyr::select_if(~sum(!is.na(.)) > 0) %>% # Delete features that are all NaNs
dplyr::select(mywhere(~dplyr::n_distinct(.) > 1)) # Delete features with constant values

# Add "All features" if by_set = TRUE

if(by_set){

tmp <- data[[1]] %>%
dplyr::mutate(group = as.factor(as.character(.data$group)),
names = paste0(.data$method, "_", .data$names)) %>%
dplyr::select(c(.data$id, .data$group, .data$names, .data$values)) %>%
tidyr::pivot_wider(id_cols = c("id", "group"), names_from = "names", values_from = "values") %>%
dplyr::select_if(~sum(!is.na(.)) > 0) %>% # Delete features that are all NaNs
dplyr::select(mywhere(~dplyr::n_distinct(.) > 1)) # Delete features with constant values

# Remove duplicate features

tmp2 <- filter_duplicates(data = data, preference = preference)
@@ -84,10 +82,25 @@ tsfeature_classifier <- function(data, classifier = NULL, train_size = 0.75, n_r

tmp <- tmp %>%
dplyr::left_join(tmp2, by = c("id" = "id", "group" = "group"))

} else{

# Remove duplicate features

tmp <- filter_duplicates(data = data, preference = preference)

tmp <- tmp[[1]] %>%
dplyr::mutate(group = as.factor(as.character(.data$group)),
names = paste0(.data$method, "_", .data$names)) %>%
dplyr::select(c(.data$id, .data$group, .data$names, .data$values)) %>%
tidyr::pivot_wider(id_cols = c("id", "group"), names_from = "names", values_from = "values") %>%
dplyr::select_if(~sum(!is.na(.)) > 0) %>% # Delete features that are all NaNs
dplyr::select(mywhere(~dplyr::n_distinct(.) > 1)) # Delete features with constant values
}

# Assign samples to train or test

set.seed(seed)
dt <- sort(sample(nrow(tmp), nrow(tmp) * train_size))
train <- tmp[dt, ] %>% dplyr::mutate(set_split = "Train")
test <- tmp[-dt, ] %>% dplyr::mutate(set_split = "Test")
@@ -101,7 +114,7 @@ tsfeature_classifier <- function(data, classifier = NULL, train_size = 0.75, n_r

if(is.null(classifier)){
classifier <- function(formula, data){
mod <- e1071::svm(formula, data = data, scale = FALSE, probability = TRUE)
mod <- e1071::svm(formula, data = data, kernel = "linear", scale = FALSE, probability = TRUE)
}
} else{
if(length(names(formals(classifier))) != 2){
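For reference, a hedged sketch (not part of this diff) of how a custom classifier could be supplied instead of the new linear-kernel default; the `features` object and the radial-kernel choice are illustrative assumptions, and the only requirement enforced by the code above is a function of exactly two arguments, `formula` and `data`:

# Radial-kernel SVM passed in place of the default classifier
my_classifier <- function(formula, data){
  e1071::svm(formula, data = data, kernel = "radial", scale = FALSE, probability = TRUE)
}

outputs <- tsfeature_classifier(features, classifier = my_classifier, by_set = TRUE,
                                use_null = TRUE, seed = 123, preference = "feasts")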
4 changes: 2 additions & 2 deletions README.Rmd
@@ -48,11 +48,11 @@ For a comprehensive comparison of these six feature sets, please refer to the re

## Statistical and graphical tools

`theft` also contains an extensive suite of tools for automatic processing of extracted feature vectors (including data quality assessments and normalisation methods), low dimensional projections (linear and nonlinear), data matrix visualisations, single feature and multiple feature time-series classification procedures, and various other statistical and graphical tools.
`theft` also contains an extensive suite of tools for automatic processing of extracted feature vectors (including data quality assessments and normalisation methods), low dimensional projections (linear and nonlinear), data matrix visualisations, automated time-series classification procedures, statistical hypothesis testing, and various other statistical and graphical tools.

## Web application

An [interactive web application](https://dynamicsandneuralsystems.shinyapps.io/timeseriesfeaturevis/) has been built on top of `theft` which enables users to access most of the functionality included in the package from within a web browser without any code. The application automates the entire workflow included in `theft`, converts all static graphics included in the package into interactive visualisations, and enables downloads of feature calculations. Note that since `theft` is an active development project, not all functionality has been copied across to the webtool yet.
An [interactive web application](https://dynamicsandneuralsystems.shinyapps.io/timeseriesfeaturevis/) has been built on top of `theft` which enables users to access most of the functionality included in the package from within a web browser without any code. The application automates the entire workflow included in `theft`, converts all static graphics included in the package into interactive visualisations, and enables downloads of feature calculations. Note that since `theft` is an active development project and the web application is not, functionality in the application may be out of date relative to what is available in `theft`.

## Citation
