From f861a616445f64651d5a9379293687cadfa41f0a Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Mon, 5 Feb 2024 22:37:22 +0100 Subject: [PATCH] donot match --- bin/donor_match.R | 429 +++++++++++++++++++++++++++++++++------------- 1 file changed, 309 insertions(+), 120 deletions(-) diff --git a/bin/donor_match.R b/bin/donor_match.R index 574b46f..4ab5ef9 100755 --- a/bin/donor_match.R +++ b/bin/donor_match.R @@ -6,7 +6,8 @@ library(tidyverse) library(vcfR) library(argparse) -parser <- ArgumentParser("Parameters for donor matching based on Pearson coorelation. ") +parser <- + ArgumentParser("Parameters for donor matching based on Pearson coorelation. ") parser$add_argument("--result_csv", help = "The path to the csv file or the directory containing the demultiplexing assignment. ") parser$add_argument("--barcode", help = "The path to the tsv file containing the white list of barcodes. ", default = NULL) parser$add_argument("--ndonor", help = "The number of donors in the cell mixture. ", type = "integer") @@ -14,10 +15,20 @@ parser$add_argument("--method1", help = "The Name of the first method in the ass parser$add_argument("--method2", help = "The Name of the second method in the assignment file. We will use its donor names as reference.", default = NULL) parser$add_argument("--findVariants", help = "How to find representative variants for each donor. ", default = "default") parser$add_argument("--cell_genotype", help = "The path to the VCF file containing the genotype of the cells.") -parser$add_argument("--variant_count", help = "The Minimal count of a variant for filtering", type = "integer", default = 0) -parser$add_argument("--variant_pct", help = "The Minimal percentage of a variant for filtering", type = "double", default = 0.5) -parser$add_argument("--vireo_parent_dir", help = "The parent folder for searching the output of the best vireo trial, e.g. - donor genotype and discriminatory variants. ") +parser$add_argument("--variant_count", + help = "The Minimal count of a variant for filtering", + type = "integer", + default = 0 +) +parser$add_argument("--variant_pct", + help = "The Minimal percentage of a variant for filtering", + type = "double", + default = 0.5 +) +parser$add_argument("--vireo_parent_dir", + help = "The parent folder for searching the output of the best vireo trial, e.g. + donor genotype and discriminatory variants. " +) parser$add_argument("--outputdir", help = "Output directory. ") args <- parser$parse_args() @@ -28,18 +39,23 @@ convert2binary <- function(result_csv, method_name, min_cell) { unique(method_assign[[method_name]]), c(NA, "negative", "doublet") ) - method_assign <- method_assign[method_assign[[method_name]] %in% donor_id, ] + method_assign <- + method_assign[method_assign[[method_name]] %in% donor_id, ] if (nrow(method_assign) < min_cell) { return(NULL) } if (length(unique(method_assign[[method_name]])) == 1) { - method_assign_binary <- as.data.frame(matrix(0, nrow = nrow(result_csv), ncol = 1), - row.names = result_csv$Barcode - ) - colnames(method_assign_binary) <- c(unique(method_assign[[method_name]])) - method_assign_binary[rownames(method_assign_binary) %in% method_assign$Barcode, ] <- 1 + method_assign_binary <- + as.data.frame(matrix(0, nrow = nrow(result_csv), ncol = 1), + row.names = result_csv$Barcode + ) + colnames(method_assign_binary) <- + c(unique(method_assign[[method_name]])) + method_assign_binary[rownames(method_assign_binary) %in% method_assign$Barcode, ] <- + 1 } else { - method_assign_binary <- data.frame(model.matrix(~ method_assign[[method_name]] - 1, data = method_assign)) + method_assign_binary <- + data.frame(model.matrix(~ method_assign[[method_name]] - 1, data = method_assign)) names(method_assign_binary) <- sort(donor_id) rownames(method_assign_binary) <- method_assign$Barcode } @@ -49,42 +65,82 @@ convert2binary <- function(result_csv, method_name, min_cell) { result_csv <- NULL min_cell <- 0 if (file.exists(args$result_csv) && !dir.exists(args$result_csv)) { - result_csv <- fread(args$result_csv, stringsAsFactors = FALSE, na.strings = c(NA_character_, "")) + result_csv <- + fread( + args$result_csv, + stringsAsFactors = FALSE, + na.strings = c(NA_character_, "") + ) } if (dir.exists(args$result_csv)) { result_csv <- list.files(args$result_csv, pattern = "assignment_all", full.names = TRUE ) - result_csv <- fread(result_csv[1], stringsAsFactors = FALSE, na.strings = c(NA_character_, "")) + result_csv <- + fread( + result_csv[1], + stringsAsFactors = FALSE, + na.strings = c(NA_character_, "") + ) } if (!is.null(args$barcode)) { barcode_whitelist <- fread(args$barcode, header = FALSE, stringsAsFactors = FALSE )$V1 - result_csv <- result_csv[result_csv$Barcode %in% barcode_whitelist, ] + result_csv <- + result_csv[result_csv$Barcode %in% barcode_whitelist, ] # min_cell <- floor(nrow(result_csv) * 0.01) } -colname_with_singlet <- colnames(result_csv %>% select_if(~ any(. != "negative" & . != "doublet"))) -colname_with_singlet <- colname_with_singlet[colname_with_singlet != "Barcode"] +colname_with_singlet <- + colnames(result_csv %>% select_if(~ any(. != "negative" & + . != "doublet"))) +colname_with_singlet <- + colname_with_singlet[colname_with_singlet != "Barcode"] if (length(colname_with_singlet) < 2) { stop("Please choose more methods to run donor matching!") } +hashing_methods <- + c( + "demuxem", + "htodemux", + "multiseq", + "hashsolo", + "hashedDrops", + "bff", + "gmm_demux" + ) +genetic_methods <- + c("demuxlet", "freemuxlet", "vireo", "scsplit", "souporcell") + + if (!is.null(args$method1) && !is.null(args$method2)) { - method1_all <- colname_with_singlet[startsWith(colnames(result_csv), args$method1)] - method2_all <- colname_with_singlet[startsWith(colnames(result_csv), args$method2)] + method1_all <- + colname_with_singlet[startsWith(colnames(result_csv), args$method1)] + method2_all <- + colname_with_singlet[startsWith(colnames(result_csv), args$method2)] } else { - all_methods_pair <- combn(colname_with_singlet, 2, simplify = FALSE) - method1_all <- sapply(all_methods_pair, "[", 1) - method2_all <- sapply(all_methods_pair, "[", 2) + genetics_all <- + Filter(function(x) { + any(sapply(genetic_methods, function(y) { + grepl(y, x) + })) + }, colname_with_singlet) + hashing_all <- + Filter(function(x) { + any(sapply(hashing_methods, function(y) { + grepl(y, x) + })) + }, colname_with_singlet) + all_methods_pair <- + expand.grid(genetics = genetics_all, hashing = hashing_all) + method1_all <- as.character(all_methods_pair$genetics) + method2_all <- as.character(all_methods_pair$hashing) } -hashing_methods <- c("demuxem", "htodemux", "multiseq", "hashsolo", "hashedDrops", "bff", "gmm_demux") -genetic_methods <- c("demuxlet", "freemuxlet", "vireo", "scsplit", "souporcell") - best_result <- 0 best_method1 <- "None" best_method2 <- "None" @@ -115,7 +171,8 @@ for (i in 1:length(method1_all)) { method2 <- hash_method } - outputdir <- file.path(args$outputdir, paste0(method1, "_vs_", method2)) + outputdir <- + file.path(args$outputdir, paste0(method1, "_vs_", method2)) ifelse(!dir.exists(outputdir), dir.create(outputdir), FALSE) method1_res <- convert2binary(result_csv, method1, min_cell) @@ -124,28 +181,45 @@ for (i in 1:length(method1_all)) { next } - intersect_barcode <- intersect(rownames(method1_res), rownames(method2_res)) + intersect_barcode <- + intersect(rownames(method1_res), rownames(method2_res)) if (length(intersect_barcode) == 0) { next } print(paste0("Comapring ", method1, " and ", method2)) - method1_res <- method1_res[rownames(method1_res) %in% intersect_barcode, , drop = FALSE] - method2_res <- method2_res[rownames(method2_res) %in% intersect_barcode, , drop = FALSE] - - correlation_res <- apply(method1_res, 2, function(x) { - apply(method2_res, 2, function(y) { - return(cor.test(x, y)[["estimate"]][["cor"]]) - }) - }) + method1_res <- + method1_res[rownames(method1_res) %in% intersect_barcode, , drop = FALSE] + method2_res <- + method2_res[rownames(method2_res) %in% intersect_barcode, , drop = FALSE] + + correlation_res <- try( + { + apply(method1_res, 2, function(x) { + apply(method2_res, 2, function(y) { + return(cor.test(x, y)[["estimate"]][["cor"]]) + }) + }) + }, + silent = TRUE + ) + if (inherits(correlation_res, "try-error")) { + cat("Failed to calculate phi coefficient") + next + } + if (is.vector(correlation_res)) { correlation_res <- t(as.data.frame(correlation_res)) rownames(correlation_res) <- colnames(method2_res) } - write.csv(correlation_res, file.path(outputdir, "correlation_res.csv")) + write.csv( + correlation_res, + file.path(outputdir, "correlation_res.csv") + ) match_score <- 0 matched_donor <- 0 - geno_match <- as.data.frame(matrix(nrow = ncol(correlation_res), ncol = 3)) + geno_match <- + as.data.frame(matrix(nrow = ncol(correlation_res), ncol = 3)) colnames(geno_match) <- c("Method1", "Method2", "Correlation") geno_match$Method1 <- colnames(correlation_res) @@ -158,15 +232,21 @@ for (i in 1:length(method1_all)) { rownames(correlation_res)[which.max(correlation_res[, id])], max(correlation_res[, id], na.rm = TRUE) ) - match_score <- match_score + max(correlation_res[, id], na.rm = TRUE) + match_score <- + match_score + max(correlation_res[, id], na.rm = TRUE) matched_donor <- matched_donor + 1 } else { - geno_match[which(geno_match$Cluster1_ID == id)] <- c("unassigned", NA) + geno_match[which(geno_match$Cluster1_ID == id)] <- + c("unassigned", NA) } } - write.table(geno_match[, 1:2], + write.table( + geno_match[, 1:2], file.path(outputdir, "donor_match.csv"), - row.names = FALSE, col.names = FALSE, sep = " ", quote = FALSE + row.names = FALSE, + col.names = FALSE, + sep = " ", + quote = FALSE ) if (!all(is.na(correlation_res))) { @@ -174,13 +254,22 @@ for (i in 1:length(method1_all)) { annoCol <- newCols(nrow(geno_match)) names(annoCol) <- colnames(correlation_res) annoCol <- list(category = annoCol) - correlation_res <- correlation_res[!is.na(row.names(correlation_res)), , drop = FALSE] - correlation_res <- correlation_res[order(as.numeric(row.names(correlation_res))), , drop = FALSE] - pheatmap(correlation_res, + correlation_res <- + correlation_res[!is.na(row.names(correlation_res)), , drop = FALSE] + correlation_res <- + correlation_res[order(as.numeric(row.names(correlation_res))), , drop = FALSE] + pheatmap( + correlation_res, treeheight_row = FALSE, - treeheight_col = FALSE, display_numbers = TRUE, angle_col = "45", - number_color = "white", fontsize = 12, cluster_rows = FALSE, - cluster_cols = FALSE, width = 7, height = 5, + treeheight_col = FALSE, + display_numbers = TRUE, + angle_col = "45", + number_color = "white", + fontsize = 12, + cluster_rows = FALSE, + cluster_cols = FALSE, + width = 7, + height = 5, filename = file.path(outputdir, "concordance_heatmap.png") ) } @@ -191,16 +280,35 @@ for (i in 1:length(method1_all)) { match_score <- match_score / args$ndonor if (match_score > best_result && !remain_na) { - write.table(geno_match[, 1:2], + write.table( + geno_match[, 1:2], file.path(args$outputdir, "donor_match.csv"), - row.names = FALSE, col.names = FALSE, - sep = " ", quote = FALSE + row.names = FALSE, + col.names = FALSE, + sep = " ", + quote = FALSE ) } - best_method1 <- ifelse(match_score > best_result & !remain_na, method1, best_method1) - best_method2 <- ifelse(match_score > best_result & !remain_na, method2, best_method2) - best_result <- ifelse(match_score > best_result & !remain_na, match_score, best_result) - new_record <- c(method1, method2, match_score, matched_donor, remain_na) + best_method1 <- + ifelse(match_score > best_result & + !remain_na, + method1, + best_method1 + ) + best_method2 <- + ifelse(match_score > best_result & + !remain_na, + method2, + best_method2 + ) + best_result <- + ifelse(match_score > best_result & + !remain_na, + match_score, + best_result + ) + new_record <- + c(method1, method2, match_score, matched_donor, remain_na) result_record[num_trial, ] <- new_record num_trial <- num_trial + 1 @@ -215,30 +323,40 @@ for (i in 1:length(method1_all)) { ) } - write.csv(result_merge_new, + write.csv( + result_merge_new, file.path(outputdir, "all_assignment_after_match.csv"), row.names = FALSE ) if (best_result == match_score) { - write.csv(result_merge_new, + write.csv( + result_merge_new, file.path(args$outputdir, "all_assignment_after_match.csv"), row.names = FALSE ) } - result_merge_new <- result_merge_new[result_merge_new$Barcode %in% intersect_barcode, ] + result_merge_new <- + result_merge_new[result_merge_new$Barcode %in% intersect_barcode, ] - write.csv(result_merge_new, + write.csv( + result_merge_new, file.path(outputdir, "intersect_assignment_after_match.csv"), row.names = FALSE ) } - # else{ - # print(paste0("Skip comparision between ", method1, " and ", method2)) - # } } if (best_method1 != "None" && best_method2 != "None") { - print(paste0("Best method pair: ", best_method1, " and ", best_method2, " with score ", best_result)) + print( + paste0( + "Best method pair: ", + best_method1, + " and ", + best_method2, + " with score ", + best_result + ) + ) print("------------------------------------------------------------------") } @@ -251,48 +369,77 @@ if (nrow(result_record) > 1) { if (args$findVariants == "True" || args$findVariants == "default") { if (startsWith(best_method1, "vireo")) { - write.table(best_method1, + write.table( + best_method1, file.path(args$outputdir, "best_method_vireo.txt"), - sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE ) } else { stop("Vireo is not the best method for donor matching!") } - outputdir <- file.path(args$outputdir, paste0(best_method1, "_vs_", best_method2)) + outputdir <- + file.path(args$outputdir, paste0(best_method1, "_vs_", best_method2)) outputdir_variant <- file.path(outputdir, "variant_filtering") - ifelse(!dir.exists(outputdir_variant), dir.create(outputdir_variant), FALSE) - result_merge_new <- fread(file.path(outputdir, "intersect_assignment_after_match.csv"), header = T) - result_merge_new$match <- result_merge_new[[best_method1]] == result_merge_new[[best_method2]] + ifelse(!dir.exists(outputdir_variant), + dir.create(outputdir_variant), + FALSE + ) + result_merge_new <- + fread(file.path(outputdir, "intersect_assignment_after_match.csv"), + header = T + ) + result_merge_new$match <- + result_merge_new[[best_method1]] == result_merge_new[[best_method2]] matched <- result_merge_new[result_merge_new$match, ] - unmatched <- result_csv[!result_csv$Barcode %in% matched$Barcode, ]$Barcode + unmatched <- + result_csv[!result_csv$Barcode %in% matched$Barcode, ]$Barcode cell_genotype_vcf <- read.vcfR(args$cell_genotype) - cell_genotype_vcf_gt <- extract.gt(cell_genotype_vcf, element = "GT", as.numeric = TRUE) + cell_genotype_vcf_gt <- + extract.gt(cell_genotype_vcf, + element = "GT", + as.numeric = TRUE + ) donors <- sort(unique(matched[[best_method1]])) - representative_variant_list <- vector(mode = "list", length = length(donors)) - representative_variant_list <- setNames(representative_variant_list, donors) + representative_variant_list <- + vector(mode = "list", length = length(donors)) + representative_variant_list <- + setNames(representative_variant_list, donors) for (donorid in donors) { - matched_barcode <- matched[matched[[best_method1]] == donorid]$Barcode + matched_barcode <- + matched[matched[[best_method1]] == donorid]$Barcode matched_gt_list <- cell_genotype_vcf_gt[, matched_barcode] - matched_gt_list <- matched_gt_list[rowSums(is.na(matched_gt_list)) != ncol(matched_gt_list), ] - matched_gt <- as.data.frame(matrix(nrow = nrow(matched_gt_list))) + matched_gt_list <- + matched_gt_list[rowSums(is.na(matched_gt_list)) != ncol(matched_gt_list), ] + matched_gt <- + as.data.frame(matrix(nrow = nrow(matched_gt_list))) matched_gt$ref <- rowSums(matched_gt_list == 0, na.rm = TRUE) matched_gt$alt <- rowSums(matched_gt_list != 0, na.rm = TRUE) matched_gt$V1 <- rownames(matched_gt_list) matched_gt$count <- matched_gt$ref + matched_gt$alt - matched_gt$pct <- matched_gt$alt / (matched_gt$ref + matched_gt$alt) + matched_gt$pct <- + matched_gt$alt / (matched_gt$ref + matched_gt$alt) matched_gt$dominant <- ifelse(matched_gt$pct > 0.5, 1, 0) matched_gt <- matched_gt[(matched_gt$pct >= args$variant_pct | matched_gt$pct <= (1 - args$variant_pct)), ] - matched_gt <- matched_gt[matched_gt$count >= args$variant_count, ] + matched_gt <- + matched_gt[matched_gt$count >= args$variant_count, ] unmatched_gt_list <- cell_genotype_vcf_gt[, unmatched] - unmatched_gt_list <- unmatched_gt_list[rownames(unmatched_gt_list) %in% matched_gt$V1, ] - unmatched_gt_list <- unmatched_gt_list[rowSums(is.na(unmatched_gt_list)) != ncol(unmatched_gt_list), ] - unmatched_gt_list <- cbind(rownames(unmatched_gt_list), unmatched_gt_list) - unmatched_gt_list <- melt(data.table(unmatched_gt_list), id.vars = "V1") - unmatched_gt_list <- unmatched_gt_list[!is.na(unmatched_gt_list$value), ] + unmatched_gt_list <- + unmatched_gt_list[rownames(unmatched_gt_list) %in% matched_gt$V1, ] + unmatched_gt_list <- + unmatched_gt_list[rowSums(is.na(unmatched_gt_list)) != ncol(unmatched_gt_list), ] + unmatched_gt_list <- + cbind(rownames(unmatched_gt_list), unmatched_gt_list) + unmatched_gt_list <- + melt(data.table(unmatched_gt_list), id.vars = "V1") + unmatched_gt_list <- + unmatched_gt_list[!is.na(unmatched_gt_list$value), ] colnames(unmatched_gt_list) <- c("variant", "cell", "allele") write.csv(matched_gt, @@ -304,10 +451,13 @@ if (args$findVariants == "True" || args$findVariants == "default") { row.names = FALSE ) - informative_variants_cells <- merge(matched_gt, unmatched_gt_list, - by.x = c("V1", "dominant"), - by.y = c("variant", "allele") - ) + informative_variants_cells <- + merge( + matched_gt, + unmatched_gt_list, + by.x = c("V1", "dominant"), + by.y = c("variant", "allele") + ) colnames(informative_variants_cells)[1] <- "variant" num_informative_variants <- informative_variants_cells %>% group_by(cell) %>% @@ -315,53 +465,82 @@ if (args$findVariants == "True" || args$findVariants == "default") { if (nrow(unmatched_gt_list[!unmatched_gt_list$cell %in% num_informative_variants$cell, ]) > 0) { print(unmatched_gt_list[!unmatched_gt_list$cell %in% num_informative_variants$cell, ]) } - representative_variant_list[[donorid]] <- list(unique(informative_variants_cells$variant)) - write.table(unique(informative_variants_cells$variant), - file.path(outputdir_variant, paste0(donorid, "_informative_variants.csv")), - row.names = FALSE, col.names = FALSE + representative_variant_list[[donorid]] <- + list(unique(informative_variants_cells$variant)) + write.table( + unique(informative_variants_cells$variant), + file.path( + outputdir_variant, + paste0(donorid, "_informative_variants.csv") + ), + row.names = FALSE, + col.names = FALSE ) } - representative_variant <- rbindlist(representative_variant_list, idcol = "donor") + representative_variant <- + rbindlist(representative_variant_list, idcol = "donor") colnames(representative_variant)[2] <- "variant" - representative_variant_df <- dcast(data = representative_variant, variant ~ donor, length) + representative_variant_df <- + dcast(data = representative_variant, variant ~ donor, length) write.csv( representative_variant_df, file.path(args$outputdir, "all_representative_variant_df.csv") ) - upset <- ComplexUpset::upset(representative_variant_df, donors, - width_ratio = 0.45, height_ratio = 0.9, - stripes = "white", max_degree = 1, + upset <- ComplexUpset::upset( + representative_variant_df, + donors, + width_ratio = 0.45, + height_ratio = 0.9, + stripes = "white", + max_degree = 1, name = "Number of donor-specific variants", - set_sizes = (upset_set_size() + - geom_text(aes(label = ..count.., size = 3), - hjust = -0.1, stat = "count", - color = "white", size = 2.3 - ) + - theme( - axis.text.x = element_text(angle = 90), - text = element_text(size = 10) - )), + set_sizes = ( + upset_set_size() + + geom_text( + aes(label = ..count.., size = 3), + hjust = -0.1, + stat = "count", + color = "white", + size = 2.3 + ) + + theme( + axis.text.x = element_text(angle = 90), + text = element_text(size = 10) + ) + ), base_annotations = list("Intersection size" = intersection_size()) ) ggsave(file.path(args$outputdir, "donor_specific_variants_upset.png")) - representative_variant_single <- representative_variant_df[rowSums(representative_variant_df[, -1]) == 1, ] - representative_variant_single <- separate(representative_variant_single, - col = "variant", - into = c("chr", "pos"), sep = "_" - ) - write.table(representative_variant_single[, c("chr", "pos")], - quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, + representative_variant_single <- + representative_variant_df[rowSums(representative_variant_df[, -1]) == 1, ] + representative_variant_single <- + separate( + representative_variant_single, + col = "variant", + into = c("chr", "pos"), + sep = "_" + ) + write.table( + representative_variant_single[, c("chr", "pos")], + quote = FALSE, + col.names = FALSE, + sep = "\t", + row.names = FALSE, file.path(args$outputdir, "donor_specific_variants.csv") ) } if (args$findVariants == "True" || args$findVariants == "vireo") { if (startsWith(best_method1, "vireo")) { - write.table(best_method1, + write.table( + best_method1, file.path(args$outputdir, "best_method_vireo.txt"), - sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE ) } else { stop("Vireo is not the best method1 for donor matching, variants can not be filtered!") @@ -369,17 +548,27 @@ if (args$findVariants == "True" || args$findVariants == "vireo") { vireo_result_dir <- file.path(args$vireo_parent_dir, best_method1) - representative_variant <- list.files(vireo_result_dir, "filtered_variants.tsv", - full.names = TRUE, recursive = TRUE - )[1] + representative_variant <- + list.files( + vireo_result_dir, + "filtered_variants.tsv", + full.names = TRUE, + recursive = TRUE + )[1] representative_variant <- fread(representative_variant) - representative_variant <- separate(representative_variant, + representative_variant <- separate( + representative_variant, col = "variants", into = c("chr", "pos"), - sep = "_", extra = "drop" + sep = "_", + extra = "drop" ) - write.table(representative_variant[, c("chr", "pos")], - quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, + write.table( + representative_variant[, c("chr", "pos")], + quote = FALSE, + col.names = FALSE, + sep = "\t", + row.names = FALSE, file.path(args$outputdir, "representative_variants_vireo.csv") ) }