Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed bug in analyze_questioned_documents() for multiple qds #167

Merged
merged 1 commit into from
Jul 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -7,8 +7,9 @@
^revdep$
^\.git$
^\.github$
^LICENSE.txt
^LICENSE.txt$
^cran-comments\.md$
^CRAN-SUBMISSION$
^README\.Rmd$
^tools$
^scratch.R$
6 changes: 3 additions & 3 deletions R/cluster_analysis.R
Original file line number Diff line number Diff line change
@@ -52,7 +52,7 @@
#' @md
analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_cores, writer_indices, doc_indices) {
# bind global variables to fix check() note
writer <- d <- NULL
writer <- d <- docname <- NULL

# process questioned documents
message("Processing questioned documents...")
@@ -116,7 +116,7 @@ analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_c
doParallel::registerDoParallel(my_cluster)

# list questioned writers
qwriters <- unique(questioned_data$graph_measurements$writer)
qdocs <- unique(questioned_data$graph_measurements$docname)

# list known writers
kwriters <- unique(model$graph_measurements$writer)
@@ -125,7 +125,7 @@ analyze_questioned_documents <- function(main_dir, questioned_docs, model, num_c
message("Obtaining likelihood evaluations...")
likelihood_evals <- foreach::foreach(d = 1:nrow(questioned_data$cluster_fill_counts)) %dopar% { # d is document
# filter docs for current writer
qdoc2 <- questioned_data$graph_measurements %>% dplyr::filter(writer == qwriters[d]) # identical to m_qdoc
qdoc2 <- questioned_data$graph_measurements %>% dplyr::filter(docname == qdocs[d]) # identical to m_qdoc

# get cluster assignments
qcluster2 <- as.numeric(qdoc2$cluster) # identical to m_cluster
23 changes: 14 additions & 9 deletions R/cluster_format.R
Original file line number Diff line number Diff line change
@@ -131,9 +131,9 @@ format_model_data <- function(model_clusters, writer_indices, doc_indices, a = 2

# if clusters aren't numbered sequentially, relabel them
if (length(unique(graph_measurements$cluster)) < max(graph_measurements$cluster)) {
graph_measurements <- graph_measurements %>% dplyr::rename("old_cluster" = cluster)
cluster_lookup <- data.frame("old_cluster" = sort(unique(graph_measurements$old_cluster)), "cluster" = 1:length(unique(graph_measurements$old_cluster)))
graph_measurements <- graph_measurements %>% dplyr::left_join(cluster_lookup, by = "old_cluster")
graph_measurements <- graph_measurements %>% dplyr::rename("original_cluster" = cluster)
cluster_lookup <- data.frame("original_cluster" = sort(unique(graph_measurements$original_cluster)), "cluster" = 1:length(unique(graph_measurements$original_cluster)))
graph_measurements <- graph_measurements %>% dplyr::left_join(cluster_lookup, by = "original_cluster")
}

# get cluster fill counts ----
@@ -186,22 +186,27 @@ format_model_data <- function(model_clusters, writer_indices, doc_indices, a = 2
#' @noRd
format_questioned_data <- function(model, questioned_clusters, writer_indices, doc_indices) {
# bind global variable to fix check() note
old_cluster <- cluster <- NULL
original_cluster <- cluster <- NULL

graph_measurements <- questioned_clusters

# if model clusters were relabeled, relabel the questioned clusters
if (any(names(model$graph_measurements) == "old_cluster")) {
if (any(names(model$graph_measurements) == "original_cluster")) {
# make lookup table from model cluster data
cluster_lookup <- model$graph_measurements %>%
dplyr::select(old_cluster, cluster) %>%
dplyr::select(original_cluster, cluster) %>%
dplyr::distinct()
# store clusters as old clusters
graph_measurements <- graph_measurements %>%
dplyr::rename("old_cluster" = cluster)
# get new cluster labels
dplyr::rename("original_cluster" = cluster)
# get new cluster labels. NOTE: adds NA if questioned doc(s) use
# clusters that model doc(s) did not
graph_measurements <- graph_measurements %>%
dplyr::left_join(cluster_lookup, by = "old_cluster")
dplyr::left_join(cluster_lookup, by = "original_cluster")
# return error if NAs exist in cluster
if (any(is.na(graph_measurements$cluster))) {
stop("graph_measurements$cluster has at least one NA")
}
}

# get cluster fill counts ----