diff --git a/workflow/rules/assigned_counts.smk b/workflow/rules/assigned_counts.smk index 61277fa..6be84a7 100644 --- a/workflow/rules/assigned_counts.smk +++ b/workflow/rules/assigned_counts.smk @@ -62,7 +62,7 @@ rule assigned_counts_assignBarcodes: conda: "../envs/python3.yaml" input: - counts=lambda wc: getFinalCounts(wc.project, wc.config, wc.type, "counts"), + counts=lambda wc: getFinalCounts(wc.project, wc.config, wc.condition, wc.type, "counts"), association="results/experiments/{project}/assignment/{assignment}.tsv.gz", script=getScript("count/merge_BC_and_assignment.py"), output: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c4efcce..d62638b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -276,15 +276,16 @@ def getOutputConditionReplicateType_helper(files, project, skip={}): return [] conditions = getConditions(project) for condition in conditions: - replicates = getReplicatesOfCondition(project, condition) for file in files: - output += expand( - file, - project=project, - condition=condition, - replicate=replicates, - type=["RNA", "DNA"], - ) + for type in ["DNA", "RNA"]: + replicates = getReplicatesOfConditionType(project, condition, type) + output += expand( + file, + project=project, + condition=condition, + replicate=replicates, + type=type, + ) return output @@ -687,25 +688,46 @@ def counts_getSamplingConfig(project, conf, dna_or_rna, command): return "" +def getReplicatesOfConditionType(project, condition, rna_or_dna): + exp = getExperiments(project) + + replicates = getReplicatesOfCondition(project, condition) + + if f"{rna_or_dna}_BC_F" in exp.columns: + + exp_filter = exp[exp.Condition == condition] + + if len(replicates) > 1 and exp_filter[f"{rna_or_dna}_BC_F"].nunique() == 1: + return [replicates[0]] + + return replicates -def getFinalCounts(project, conf, rna_or_dna, raw_or_assigned): + +def getFinalCounts(project, conf, condition, rna_or_dna, raw_or_assigned): output = "" + + replicates = getReplicatesOfConditionType(project, condition, rna_or_dna) + if len(replicates) > 1: + replicate = "{replicate}" + else: + replicate = replicates[0] + if raw_or_assigned == "counts": if useSampling(project, conf, rna_or_dna): output = ( - "results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.sampling.{config}.tsv.gz" - % (raw_or_assigned, rna_or_dna) + "results/experiments/{project}/%s/{condition}_%s_%s_final_counts.sampling.{config}.tsv.gz" + % (raw_or_assigned, replicate, rna_or_dna) ) else: output = ( - "results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.tsv.gz" - % (raw_or_assigned, rna_or_dna) + "results/experiments/{project}/%s/{condition}_%s_%s_final_counts.tsv.gz" + % (raw_or_assigned, replicate, rna_or_dna) ) else: output = ( - "results/experiments/{project}/%s/{condition}_{replicate}_%s_final_counts.config.{config}.tsv.gz" - % (raw_or_assigned, rna_or_dna) + "results/experiments/{project}/%s/{condition}_%s_%s_final_counts.config.{config}.tsv.gz" + % (raw_or_assigned, replicate, rna_or_dna) ) return output @@ -733,20 +755,6 @@ def assignedCounts_getAssignmentSamplingConfig(project, assignment, command): # statistic.smk specific functions -# get all counts of experiment (rule statistic_counts) -def getCountStats(project, countType): - exp = getExperiments(project) - output = [] - for index, row in exp.iterrows(): - output += expand( - "results/experiments/{{project}}/statistic/counts/{condition}_{replicate}_{type}_{{countType}}_counts.tsv.gz", - condition=row["Condition"], - replicate=row["Replicate"], - type=["DNA", "RNA"], - ) - return output - - # get all barcodes of experiment (rule statistic_BC_in_RNA_DNA) def getBCinRNADNAStats(wc): exp = getExperiments(wc.project) diff --git a/workflow/rules/counts.smk b/workflow/rules/counts.smk index bc82472..f85b186 100644 --- a/workflow/rules/counts.smk +++ b/workflow/rules/counts.smk @@ -110,8 +110,8 @@ rule counts_dna_rna_merge_counts: conda: "../envs/default.yaml" input: - dna=lambda wc: getFinalCounts(wc.project, wc.config, "DNA", wc.raw_or_assigned), - rna=lambda wc: getFinalCounts(wc.project, wc.config, "RNA", wc.raw_or_assigned), + dna=lambda wc: getFinalCounts(wc.project, wc.config, wc.condition, "DNA", wc.raw_or_assigned), + rna=lambda wc: getFinalCounts(wc.project, wc.config, wc.condition, "RNA", wc.raw_or_assigned), output: "results/experiments/{project}/{raw_or_assigned}/{condition}_{replicate}.merged.config.{config}.tsv.gz", params: diff --git a/workflow/rules/statistic/bc_overlap.smk b/workflow/rules/statistic/bc_overlap.smk index 7ac65bd..d76e59a 100644 --- a/workflow/rules/statistic/bc_overlap.smk +++ b/workflow/rules/statistic/bc_overlap.smk @@ -8,7 +8,7 @@ rule statistic_bc_overlap_run: "../../envs/r.yaml" input: files=lambda wc: expand( - getFinalCounts(wc.project, wc.config, wc.type, wc.raw_or_assigned), + getFinalCounts(wc.project, wc.config, wc.condition, wc.type, wc.raw_or_assigned), project=wc.project, condition=wc.condition, config=wc.config, @@ -22,7 +22,7 @@ rule statistic_bc_overlap_run: params: input=lambda wc: ",".join( expand( - getFinalCounts(wc.project, wc.config, wc.type, wc.raw_or_assigned), + getFinalCounts(wc.project, wc.config, wc.condition, wc.type, wc.raw_or_assigned), project=wc.project, condition=wc.condition, config=wc.config, @@ -31,7 +31,7 @@ rule statistic_bc_overlap_run: ), cond="{condition}_{type}", replicates=lambda wc: ",".join( - getReplicatesOfCondition(wc.project, wc.condition) + getReplicatesOfConditionType(wc.project, wc.condition, wc.type) ), log: temp( diff --git a/workflow/rules/statistic/counts.smk b/workflow/rules/statistic/counts.smk index d03b654..60c1fd9 100644 --- a/workflow/rules/statistic/counts.smk +++ b/workflow/rules/statistic/counts.smk @@ -1,3 +1,6 @@ +include: "counts_common.smk" + + ################################# ## count 10 most frequent UMIs ## ################################# @@ -148,15 +151,11 @@ rule statistic_counts_BC_in_RNA_DNA: conda: "../../envs/default.yaml" input: - dna=lambda wc: ( - "results/experiments/{project}/counts/{condition}_{replicate}_DNA_{countType}_counts.tsv.gz" - if wc.countType != "raw" - else getRawCounts(wc.project, "DNA") + dna=lambda wc: statistic_counts_BC_in_RNA_DNA_helper( + project, wc.condition, "DNA", wc.countType ), - rna=lambda wc: ( - "results/experiments/{project}/counts/{condition}_{replicate}_RNA_{countType}_counts.tsv.gz" - if wc.countType != "raw" - else getRawCounts(wc.project, "RNA") + rna=lambda wc: statistic_counts_BC_in_RNA_DNA_helper( + project, wc.condition, "RNA", wc.countType ), output: temp( diff --git a/workflow/rules/statistic/counts_common.smk b/workflow/rules/statistic/counts_common.smk new file mode 100644 index 0000000..fba9534 --- /dev/null +++ b/workflow/rules/statistic/counts_common.smk @@ -0,0 +1,37 @@ +def statistic_counts_BC_in_RNA_DNA_helper(project, condition, dna_or_rna, countType): + + replicates = getReplicatesOfConditionType(project, condition, dna_or_rna) + + if countType == "raw": + output = getRawCounts(project, dna_or_rna) + else: + output = ( + "results/experiments/{project}/counts/{condition}_{replicate}_%s_{countType}_counts.tsv.gz" + % dna_or_rna + ) + + if len(replicates) == 1: + output = output.replace("{replicate}", replicates[0]) + + return output + + +# get all counts of experiment (rule statistic_counts) +def getCountStats(project, countType): + exp = getExperiments(project) + output = [] + for index, row in exp.iterrows(): + condition=row["Condition"] + for dna_or_rna in ["DNA", "RNA"]: + replicates = getReplicatesOfConditionType(project, condition, dna_or_rna) + if len(replicates) == 1: + replicate = replicates[0] + else: + replicate = row["Replicate"] + output += expand( + "results/experiments/{{project}}/statistic/counts/{condition}_{replicate}_{type}_{{countType}}_counts.tsv.gz", + condition=condition, + replicate=replicate, + type=dna_or_rna, + ) + return output diff --git a/workflow/scripts/count/BCCounts_betweenReplicates.R b/workflow/scripts/count/BCCounts_betweenReplicates.R index a2f59a9..1089d60 100644 --- a/workflow/scripts/count/BCCounts_betweenReplicates.R +++ b/workflow/scripts/count/BCCounts_betweenReplicates.R @@ -1,118 +1,166 @@ - - +# This script calculates the overlap statistics between replicates of barcode counts. +# It uses the optparse library to handle command-line arguments and the tidyverse library for data manipulation. +# +# Command-line arguments: +# -c, --condition: Condition name (character) +# -f, --files: Comma-separated input files of assigned counts (character) +# -r, --replicates: Comma-separated names of the replicates (same order as files) (character) +# -o, --outfile: Output file of the correlation table (character) +# +# The script performs the following steps: +# 1. Parse command-line arguments and check for required arguments. +# 2. Read the input files and replicates. +# 3. If the number of files does not match the number of replicates, stop with an error. +# 4. Create a data frame with the files, replicates, and condition. +# 5. Define functions to: +# - Calculate overlap statistics between two data sets (get_overlap_stats). +# - Read data from a file (read_data). +# - Write the output to a file (write_output). +# 6. If there is more than one replicate, perform pairwise comparisons: +# - Generate all pairwise combinations of replicates. +# - For each pair, read the data, calculate overlap statistics, and store the results. +# - Calculate the mean Lincoln-Peterson estimator and write the results to the output file. +# 7. If there is only one replicate, create a single row with NA values for overlap statistics and write it to the output file. library(optparse) library(tidyverse) -#library(viridis) +# library(viridis) option_list <- list( - make_option(c("-c", "--condition"), type="character", - help="Condition name"), - make_option(c("-f", "--files"), type="character", - help="Comma separated input files of assigned counts"), - make_option(c("-r", "--replicates"), type="character", - help="Comma separated name of the replicates (same order than files)"), - make_option(c("-o", "--outfile"), type="character", - help="Output file of the correlation table.") + make_option(c("-c", "--condition"), + type = "character", + help = "Condition name", + metavar = "character" + ), + make_option(c("-f", "--files"), + type = "character", + help = "Comma separated input files of assigned counts", + metavar = "character" + ), + make_option(c("-r", "--replicates"), + type = "character", + help = "Comma separated name of the replicates (same order than files)", + metavar = "character" + ), + make_option(c("-o", "--outfile"), + type = "character", + help = "Output file of the correlation table.", + metavar = "character" + ) ) -parser <- OptionParser(option_list=option_list) -arguments <- parse_args(parser, positional_arguments=TRUE) +parser <- OptionParser(option_list = option_list) +arguments <- parse_args(parser, positional_arguments = TRUE) opt <- arguments$options -if (!"condition" %in% names(opt)) { - stop("--condition parameter must be provided. See script usage (--help)") -} -if (!"files" %in% names(opt)) { - stop("--files parameter must be provided. See script usage (--help)") -} -if (!"replicates" %in% names(opt)) { - stop("--replicates parameter must be provided. See script usage (--help)") -} -if (!"outfile" %in% names(opt)) { - stop("--outfile parameter must be provided. See script usage (--help)") +required_args <- c("condition", "files", "replicates", "outfile") +missing_args <- setdiff(required_args, names(opt)) + +if (length(missing_args) > 0) { + stop(paste("Missing required arguments:", paste(missing_args, collapse = ", "), ". See script usage (--help)")) } # condition -cond=opt$condition +cond <- opt$condition # outfile -outfile=opt$outfile +outfile <- opt$outfile # replicates and count files -files <- strsplit(opt$files,",")[[1]] -replicates=strsplit(opt$replicates,",")[[1]] +files <- strsplit(opt$files, ",")[[1]] +replicates <- strsplit(opt$replicates, ",")[[1]] if (length(files) != length(replicates)) { - stop("Number of input files must be euqal to number of replicates") + stop("Number of input files must be euqal to number of replicates") } -num_replicates=length(replicates) +num_replicates <- length(replicates) -data <- data.frame(File=files,Replicate=replicates) +data <- data.frame(File = files, Replicate = replicates) data$Condition <- cond -getOverlapStats <- function(data1,data2,condition,r1,r2){ - sData1 <- data1 %>% summarize(size=n(), count=sum(Counts)) - print(sData1) - sData2 <- data2 %>% summarize(size=n(), count=sum(Counts)) - print(sData2) - data <- data1 %>% inner_join(data2,by=c('Barcode')) - sData <- data %>% summarize(size=n(), count1=sum(Counts.x),count2=sum(Counts.y)) - print(sData) - print(sData$count1) - print(sData$count1[1]) +get_overlap_stats <- function(data1, data2, condition, r1, r2) { + s_data1 <- data1 %>% summarize(size = n(), count = sum(Counts)) + print(s_data1) + s_data2 <- data2 %>% summarize(size = n(), count = sum(Counts)) + print(s_data2) + data <- data1 %>% inner_join(data2, by = c("Barcode")) + s_data <- data %>% summarize(size = n(), count1 = sum(Counts.x), count2 = sum(Counts.y)) + print(s_data) + print(s_data$count1) + print(s_data$count1[1]) outs <- data.frame( - Condition = condition, - ReplicateA = r1, - ReplicateB = r2, - BCs_ReplicateA = sData1$size, - BCs_ReplicateB = sData2$size, - BCs_Overlap = sData$size, - Counts_ReplicateA = sData1$count, - Counts_ReplicateB = sData2$count, - Counts_OverlapA = sData$count1, - Counts_OverlapB = sData$count2, - Overlap_BC_ReplicateA = (sData$size / sData1$size), - Overlap_BC_ReplicateB = (sData$size / sData2$size), - Overlap_Counts_ReplicateA = (sData$count1 / sData1$count), - Overlap_Counts_ReplicateB = (sData$count2 / sData2$count), - Lincoln_Peterson_estimator = round(sData1$size * (sData2$size/sData$size)), - stringsAsFactors=FALSE) + Condition = condition, + ReplicateA = r1, + ReplicateB = r2, + BCs_ReplicateA = s_data1$size, + BCs_ReplicateB = s_data2$size, + BCs_Overlap = s_data$size, + Counts_ReplicateA = s_data1$count, + Counts_ReplicateB = s_data2$count, + Counts_OverlapA = s_data$count1, + Counts_OverlapB = s_data$count2, + Overlap_BC_ReplicateA = (s_data$size / s_data1$size), + Overlap_BC_ReplicateB = (s_data$size / s_data2$size), + Overlap_Counts_ReplicateA = (s_data$count1 / s_data1$count), + Overlap_Counts_ReplicateB = (s_data$count2 / s_data2$count), + Lincoln_Peterson_estimator = round(s_data1$size * (s_data2$size / s_data$size)), + stringsAsFactors = FALSE + ) return(outs) - } -readData <- function(file) { - data <- read.table(file,sep="\t",as.is=T,header=F,stringsAsFactors = F) - colnames(data) <- c("Barcode","Counts") +read_data <- function(file) { + data <- read.table(file, sep = "\t", as.is = T, header = F, stringsAsFactors = F) + colnames(data) <- c("Barcode", "Counts") return(data) } -writeOutput <- function(correlations, name){ - write.table(correlations,file=name,quote=FALSE,sep='\t',row.names=FALSE,col.names=TRUE ) +write_output <- function(correlations, name) { + write.table(correlations, file = name, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) } # pairwise comparison only if more than one replicate -if(data %>% nrow >1){ - +if (data %>% nrow() > 1) { # make pairwise combinations - selected <- combn(data$Replicate,2) - print('sel') + selected <- combn(data$Replicate, 2) + print("sel") print(selected) - overlapStats = data.frame() - print('reps') - for(i in seq(1,dim(selected)[2])){ - print(selected[,i]) - r1=selected[1,i] - r2=selected[2,i] - data1 <- readData(as.character((data %>% filter(Replicate == r1))$File)) + overlap_stats <- data.frame() + print("reps") + for (i in seq(1, dim(selected)[2])) { + print(selected[, i]) + r1 <- selected[1, i] + r2 <- selected[2, i] + data1 <- read_data(as.character((data %>% filter(Replicate == r1))$File)) print("Read data 1") - data2 <- readData(as.character((data %>% filter(Replicate == r2))$File)) + data2 <- read_data(as.character((data %>% filter(Replicate == r2))$File)) print("Read data 2") - - overlapStats_new <- getOverlapStats(data1, data2, cond,r1,r2) - overlapStats <- overlapStats %>% bind_rows(overlapStats_new) + + overlap_stats_new <- get_overlap_stats(data1, data2, cond, r1, r2) + overlap_stats <- overlap_stats %>% bind_rows(overlap_stats_new) } - overlapStats$Mean_Lincoln_Peterson_estimator <- mean(overlapStats$Lincoln_Peterson_estimator) - writeOutput(overlapStats, outfile) + overlap_stats$Mean_Lincoln_Peterson_estimator <- mean(overlap_stats$Lincoln_Peterson_estimator) + write_output(overlap_stats, outfile) +} else { + # If only one replicate, create a single row with NA values for overlap statistics + single_replicate_stats <- data.frame( + Condition = cond, + ReplicateA = replicates[1], + ReplicateB = NA, + BCs_ReplicateA = NA, + BCs_ReplicateB = NA, + BCs_Overlap = NA, + Counts_ReplicateA = NA, + Counts_ReplicateB = NA, + Counts_OverlapA = NA, + Counts_OverlapB = NA, + Overlap_BC_ReplicateA = NA, + Overlap_BC_ReplicateB = NA, + Overlap_Counts_ReplicateA = NA, + Overlap_Counts_ReplicateB = NA, + Lincoln_Peterson_estimator = NA, + Mean_Lincoln_Peterson_estimator = NA, + stringsAsFactors = FALSE + ) + write_output(single_replicate_stats, outfile) }