CSAFE-ISU · stephaniereinders · Jul 17, 2024 · Jul 17, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -7,8 +7,9 @@
 ^revdep$
 ^\.git$
 ^\.github$
-^LICENSE.txt
+^LICENSE.txt$
 ^cran-comments\.md$
 ^CRAN-SUBMISSION$
 ^README\.Rmd$
 ^tools$
+^scratch.R$
diff --git a/R/cluster_analysis.R b/R/cluster_analysis.R
@@ -52,7 +52,7 @@
 #' @md
 analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_cores, writer_indices, doc_indices) {
   # bind global variables to fix check() note
-  writer <- d <- NULL
+  writer <- d <- docname <- NULL
 
   # process questioned documents
   message("Processing questioned documents...")
@@ -116,7 +116,7 @@ analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_c
   doParallel::registerDoParallel(my_cluster)
 
   # list questioned writers
-  qwriters <- unique(questioned_data$graph_measurements$writer)
+  qdocs <- unique(questioned_data$graph_measurements$docname)
 
   # list known writers
   kwriters <- unique(model$graph_measurements$writer)
@@ -125,7 +125,7 @@ analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_c
   message("Obtaining likelihood evaluations...")
   likelihood_evals <- foreach::foreach(d = 1:nrow(questioned_data$cluster_fill_counts)) %dopar% { # d is document
     # filter docs for current writer
-    qdoc2 <- questioned_data$graph_measurements %>% dplyr::filter(writer == qwriters[d]) # identical to m_qdoc
+    qdoc2 <- questioned_data$graph_measurements %>% dplyr::filter(docname == qdocs[d]) # identical to m_qdoc
 
     # get cluster assignments
     qcluster2 <- as.numeric(qdoc2$cluster) # identical to m_cluster

diff --git a/R/cluster_format.R b/R/cluster_format.R
@@ -131,9 +131,9 @@ format_model_data <- function(model_clusters, writer_indices, doc_indices, a = 2
 
   # if clusters aren't numbered sequentially, relabel them
   if (length(unique(graph_measurements$cluster)) < max(graph_measurements$cluster)) {
-    graph_measurements <- graph_measurements %>% dplyr::rename("old_cluster" = cluster)
-    cluster_lookup <- data.frame("old_cluster" = sort(unique(graph_measurements$old_cluster)), "cluster" = 1:length(unique(graph_measurements$old_cluster)))
-    graph_measurements <- graph_measurements %>% dplyr::left_join(cluster_lookup, by = "old_cluster")
+    graph_measurements <- graph_measurements %>% dplyr::rename("original_cluster" = cluster)
+    cluster_lookup <- data.frame("original_cluster" = sort(unique(graph_measurements$original_cluster)), "cluster" = 1:length(unique(graph_measurements$original_cluster)))
+    graph_measurements <- graph_measurements %>% dplyr::left_join(cluster_lookup, by = "original_cluster")
   }
 
   # get cluster fill counts ----
@@ -186,22 +186,27 @@ format_model_data <- function(model_clusters, writer_indices, doc_indices, a = 2
 #' @noRd
 format_questioned_data <- function(model, questioned_clusters, writer_indices, doc_indices) {
   # bind global variable to fix check() note
-  old_cluster <- cluster <- NULL
+  original_cluster <- cluster <- NULL
 
   graph_measurements <- questioned_clusters
 
   # if model clusters were relabeled, relabel the questioned clusters
-  if (any(names(model$graph_measurements) == "old_cluster")) {
+  if (any(names(model$graph_measurements) == "original_cluster")) {
     # make lookup table from model cluster data
     cluster_lookup <- model$graph_measurements %>%
-      dplyr::select(old_cluster, cluster) %>%
+      dplyr::select(original_cluster, cluster) %>%
       dplyr::distinct()
     # store clusters as old clusters
     graph_measurements <- graph_measurements %>%
-      dplyr::rename("old_cluster" = cluster)
-    # get new cluster labels
+      dplyr::rename("original_cluster" = cluster)
+    # get new cluster labels. NOTE: adds NA if questioned doc(s) use
+    # clusters that model doc(s) did not
     graph_measurements <- graph_measurements %>%
-      dplyr::left_join(cluster_lookup, by = "old_cluster")
+      dplyr::left_join(cluster_lookup, by = "original_cluster")
+    # return error if NAs exist in cluster
+    if (any(is.na(graph_measurements$cluster))) {
+      stop("graph_measurements$cluster has at least one NA")
+    }
   }
 
   # get cluster fill counts ----