diff --git a/CHANGELOG.md b/CHANGELOG.md index 9585c977..7b85a3b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Template update v2.9 ([#52](https://github.com/nf-core/crisprseq/pull/52)) - Use `Channel.fromSamplesheet()` from `nf-validation` to validate input sample sheets and create an input channel ([#58](https://github.com/nf-core/crisprseq/pull/58)) +- BAGEL2 as a module which detects gene essentiality ([#60](https://github.com/nf-core/crisprseq/pull/60)) - Add custom plots to MultiQC report (cutadapt module, read processing, edition, edition QC) ([#64](https://github.com/nf-core/crisprseq/pull/64)) ### Fixed +- Change to `process_high` for the mageck mle module ([#60](https://github.com/nf-core/crisprseq/pull/60) - Summary processes don't modify the input file anymore, allowing resuming these processes ([#66](https://github.com/nf-core/crisprseq/pull/66)) - Do not stash unexistent files, use empty lists instead. Fixes AWS tests ([#67](https://github.com/nf-core/crisprseq/pull/67)) - Rename process `merging_summary` to `preprocessing_summary` to improve clarity ([#69](https://github.com/nf-core/crisprseq/pull/69)) diff --git a/CITATIONS.md b/CITATIONS.md index e279aae8..2f5384ed 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -63,6 +63,10 @@ > Li, W. et al. Quality control, modeling, and visualization of CRISPR screens with MAGeCK-VISPR. Genome Biology 16, 281, doi:10.1186/s13059-015-0843-6 (2015). +- [BAGEL2](https://pubmed.ncbi.nlm.nih.gov/33407829/) + + > Kim E, Hart T. Improved analysis of CRISPR fitness screens and reduced off-target effects with the BAGEL2 gene essentiality classifier. Genome Med. 2021 Jan 6;13(1):2. doi: 10.1186/s13073-020-00809-3. PMID: 33407829; PMCID: PMC7789424. + - [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. diff --git a/assets/schema_input.json b/assets/schema_input.json index 21c8c38a..8b780504 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -30,8 +30,6 @@ "condition": { "type": "string", "pattern": "^\\S+$", - "enum": ["control", "treatment"], - "errorMessage": "Condition name must be one of 'control' or 'treatment'", "meta": ["condition"] }, "reference": { diff --git a/bin/BAGEL.py b/bin/BAGEL.py new file mode 100755 index 00000000..205fbb44 --- /dev/null +++ b/bin/BAGEL.py @@ -0,0 +1,1160 @@ +#!/usr/bin/env python + +# --------------------------------- +# BAGEL: Bayesian Analysis of Gene EssentaLity +# (c) Traver Hart , Eiru Kim 2017. + +# Acknowledgements: John McGonigle +# modified 10/2019 +# Free to modify and redistribute with attribution +# --------------------------------- + +# ------------------------------------ +# constants + +""" +MIT License + +Copyright (c) 2020 Hart Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + + +import sys +import time + +import click +import numpy as np +import pandas as pd +import scipy.stats as stats +from sklearn.linear_model import LinearRegression + +VERSION = 2.0 + +BUILD = 115 + + +""" +Update history + +Build 115 +1. Add single pass without resampling + +Build 114 +1. Add option for normalizing rep counts + +Build 113 +1. Fixed bugs + +Build 112 +1. Add sgRNA filtering options +2. Implemented 'Click' library. Thanks to John McGonigle + + +Build 111 +1. Add an option to equalize # of sgRNA per gene + +Build 110 +1. Enable multi control for fold-change calculation +2. Now user can input column names +3. Fix Threshold function + +""" + + +class OptionRequiredIf(click.Option): + def full_process_value(self, ctx, value): + value = super(OptionRequiredIf, self).full_process_value(ctx, value) + + if value is None and ctx.params["filter_multi_target"] is True: + msg = ( + "Error! Multi-target-filtering selected and not align-info provided.\n" + "Please indicate align-info file." + ) + raise click.MissingParameter(ctx=ctx, param=self, message=msg) + return value + + +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + + +def round_to_hundredth(x): + return np.around(x * 100) / 100.0 + + +def func_linear(x, a, b): + return (a * x) + b + + +class Training: + def __init__(self, X, n=None, cvnum=10): + if n == None: + self._n = len(X) + self._cvnum = cvnum + self._bid = int(self._n / cvnum) + self._bucket = np.arange(len(X)) + self._X = X + self._step = 0 + + def cross_validation(self): + if self._bid < 1: # bid check + print("The number of genes is too small! n<" + str(self._cvnum)) + sys.exit(1) + drawing = list() + mask = np.array([True] * self._n) + for j in range(self._bid): + # drawing.append(delete(self._bucket, np.random.randrange(len(self._bucket)))) + select = np.random.randint(len(self._bucket)) + drawing.append(self._bucket[select]) + mask[self._bucket[select]] = False + self._bucket = np.delete(self._bucket, select) + if self._step < self._n % self._cvnum: # for distribute remain.. + select = np.random.randint(len(self._bucket)) + drawing.append(self._bucket[select]) + mask[self._bucket[select]] = False + self._bucket = np.delete(self._bucket, select) + self._step += 1 + X_resample = self._X[mask] + return X_resample, self._X[~mask] + + def get_cv_step(self): + return self._step + + def bootstrap_resample(self): + mask = np.array([False] * self._n) + resample_i = np.floor(np.random.rand(self._n) * len(self._X)).astype(int) + + mask[resample_i] = True + X_resample = self._X[mask] + return X_resample, self._X[~mask] + + def get_data(self, method=0): + if method == 0: + train, test = self.bootstrap_resample() + elif method == 1: + train, test = self.cross_validation() + else: + print("Method passed as a value that was neither 0 nor 1.") + sys.exit(1) + return train, test + + +def fibo_weighted_sum(listofscore): + value = p1 = p2 = 0.0 + c = 1.0 # current value + for v in listofscore: + value += v / c + p2 = p1 # go one step + p1 = c + c = p1 + p2 + return value + + +# Note the \b in the doc string below is prevent click from wrapping the lines on the terminal. +@click.group(context_settings=CONTEXT_SETTINGS) +def bagel(): + """ + -------------------------------------------------------------------- + BAGEL.py + -------------------------------------------------------------------- + A tool from the Bayesian Analysis of Gene EssentiaLity (BAGEL) suite. + + \b + Calculate fold changes from read count data: + + \b + BAGEL.py fc -i [read count file] -o [output label] -c [control column] + + Calculate Bayes Factors from foldchange data: + + \b + BAGEL.py bf -i [fold change] -o [output file] -e [essentials genes] -n [nonessentials genes] -c [columns] + + + Calculate precision-recall from Bayes Factors: + + \b + BAGEL.py pr -i [Bayes Factor file] -o [output file] -e [essentials genes] -n [nonessentials genes] + + + + To print the current build and version use: + + \b + BAGEL.py version + """ + + +@click.command(name="version") +def report_bagel_version(): + """ + Report the current build and version no. + """ + print( + "Bayesian Analysis of Gene EssentiaLity (BAGEL) suite:\n" + "Version: {VERSION}\n" + "Build: {BUILD}".format(VERSION=VERSION, BUILD=BUILD) + ) + + +@click.command(name="fc") +@click.option("-i", "--read-count-file", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-label", required=True) +@click.option("-c", "--control-columns", required=True) +@click.option("-m", "--min-reads", type=int, default=0) +@click.option("-Np", "--pseudo-count", type=int, default=5) +def calculate_fold_change(read_count_file, output_label, control_columns, min_reads, pseudo_count): + """ + \b + Calculate fold changes from read count data outputting a fold change column: + + \b + BAGEL.py fc -i [read count file] -o [output label] -c [control column] + + \b + Required options: + -i --read-count-file Tab-delimited file of reagents and fold changes. See documentation for format. + -o --output-label Label for all output files + -c --control-columns A comma-delimited list of columns of control (T0 or plasmid) columns. + Input can be either number or name. + \b + Other options: + --min-reads=N Discard gRNA with T0 counts < N (default 0) + -Np, --pseudo-count=N Add a pseudocount of N to every readcount (default 5) + -h, --help Show this help text + + + \b + Example: + BAGEL.py fc -i readcount_file.txt -o experiment_name -c 1 + + This command calculates fold change, and writes [output label].foldchange and [output label].normalized_reads + + """ + + # ---------------------------------------------------------------- # + # Import raw read data, normalize, filter for T0 min read counts # + # Output: [output label].foldchange # + # ---------------------------------------------------------------- # + reads = pd.read_csv(read_count_file, sep="\t", index_col=0) + + reads[reads.columns.values[0]].fillna("NO_GENE_NAME", inplace=True) + reads.fillna(0, inplace=True) + control_columns = control_columns.split(",") + # + # check if controls are given as numeric or name + # + + try: + try: + ctrl_columns = list(map(int, control_columns)) + ctrl_labels = reads.columns.values[ctrl_columns] + except ValueError: + ctrl_labels = control_columns + + ctrl_sum = reads[ctrl_labels].sum(axis=1) + reads.drop(ctrl_labels, axis=1, inplace=True) + ctrl_label_new = ";".join(ctrl_labels) + reads[ctrl_label_new] = ctrl_sum + except: + print(reads[ctrl_labels].sum(axis=1)) + print("Invalid input controls") + sys.exit(1) + + numClones, numColumns = reads.shape + print("Controls: " + ", ".join(ctrl_labels)) + + # + # Add pseudo count + # + + reads.iloc[:, list(range(1, numColumns))] += pseudo_count + + # + # normalize each sample to a fixed total readcount + # + sumReads = reads.iloc[:, list(range(1, numColumns))].sum(0) + normed = pd.DataFrame(index=reads.index.values) + normed["GENE"] = reads.iloc[:, 0] # first column is gene name + normed = ( + reads.iloc[:, list(range(1, numColumns))] / np.tile(sumReads, [numClones, 1]) * 10000000 + ) # normalize to 10M reads + + # + # filter for minimum readcount + # + f = np.where(reads[ctrl_label_new] >= min_reads)[0] + normed = normed.iloc[f, :] + + # + # calculate fold change + # + foldchange = pd.DataFrame(index=normed.index.values) + foldchange.index.name = "REAGENT_ID" + foldchange["GENE"] = reads.iloc[f, 0] # dataframe 'normed' has no GENE column + for i in range(numColumns - 1): + foldchange[normed.columns.values[i]] = np.log2( + (normed.loc[:, normed.columns.values[i]]) / normed[ctrl_label_new] + ) + # + # we have calculated a foldchange for the control column. Drop it. + # + foldchange.drop(ctrl_label_new, axis=1, inplace=True) + + # + # write normed readcount file + # write foldchange file + # + + foldchange_filename = output_label + ".foldchange" + foldchange.to_csv(foldchange_filename, sep="\t", float_format="%4.3f") + + normedreads_filename = output_label + ".normed_readcount" + normed.to_csv(normedreads_filename, sep="\t", float_format="%3.2f") + + +@click.command(name="bf") +@click.option("-i", "--fold-change", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-file", required=True) +@click.option("-e", "--essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-n", "--non-essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-c", "--columns-to-test", required=True) +@click.option("-w", "--network-file", metavar="[network File]", default=None, type=click.Path(exists=True)) +@click.option("-m", "--filter-multi-target", is_flag=True) +@click.option("-m0", "--loci-without-mismatch", type=int, default=10) +@click.option("-m1", "--loci-with-mismatch", type=int, default=10) +@click.option( + "--align-info", metavar="--align-info [File]", default=None, type=click.Path(exists=True), cls=OptionRequiredIf +) +@click.option("-b", "--use-bootstrapping", is_flag=True) +@click.option("-NS", "--no-resampling", is_flag=True) +@click.option("-s", "--use-small-sample", is_flag=True) +@click.option("-N", "--no-of-cross-validations", type=int, default=10) +@click.option("-NB", "--bootstrap-iterations", type=int, default=1000) +@click.option("-r", "--sgrna-bayes-factors", is_flag=True) +@click.option("-f", "--equalise-sgrna-no", type=int) +@click.option("-s", "--seed", default=int(time.time() * 100000 % 100000), type=int) +@click.option("-t", "--run-test-mode", is_flag=True) +@click.option("-p", "--equalise-rep-no", type=int) +def calculate_bayes_factors( + fold_change, + output_file, + essential_genes, + non_essential_genes, + columns_to_test, + network_file, + align_info, + use_bootstrapping, + no_resampling, + use_small_sample, + filter_multi_target, + loci_without_mismatch, + loci_with_mismatch, + bootstrap_iterations, + no_of_cross_validations, + sgrna_bayes_factors, + equalise_sgrna_no, + seed, + run_test_mode, + equalise_rep_no, +): + """ + \b + Calculate Bayes Factors from an input fold change file: + + \b + BAGEL.py bf -i [fold change] -o [output file] -e [essentials genes] -n [nonessentials genes] -c [columns] + + + \b + Calculates a log2 Bayes Factor for each gene. Positive BFs indicate confidence that the gene is essential. + Output written to the [output file] contains: gene name, mean Bayes Factor across all iterations, std deviation of + BFs, and number of iterations in which the gene was part of the test set (and a BF was calculated[output file]. + + + \b + Required options: + -i --fold-change [fold change file] Tab-delimited file of reagents and fold changes + (see documentation for format). + -o, --output-file [output file] Output filename + -e, --essential-genes [reference essentials] File with list of training set of essential genes + -n, --non-essential-genes [reference nonessentials] File with list of training set of nonessential genes + -c [columns to test] comma-delimited list of columns in input file to + include in analyisis + + \b + Network options: + -w [network file] Enable Network boosting. Tab-delmited file of edges. [GeneA (\\t) GeneB]\n' + + \b + Multi-target guides filtering options: + -m, --filter-multi-target Enable filtering multi-targeting guide RNAs + --align-info [file] Input precalculated align-info file + -m0, --loci-without-mismatch Filtering guide RNAs without mismatch targeting over than [N] loci, default = 10 + -m1, --loci-with-mismatch Filtering guide RNAs with 1-bp mismatch targeting over than [N] loci, default = 10 + + \b + Other options: + -b, --bootstrapping Use bootstrapping instead of cross-validation (Slow) + -NS, --no-resampling Run BAGEL without resampling + -s, --small-sample Low-fat BAGEL, Only resampled training set (Bootstrapping, iteration = 100) + -r --sgrna-bayes-factors Calculate sgRNA-wise Bayes Factor + -f --equalise-sgrna-no Equalize the number of sgRNAs per gene to particular value [Number] + -p --equalise-rep-no Equalize the number of repicates to particular value [Number] + -N --no-of-cross-validations Number of sections for cross validation (default 10) + -NB --bootstraps-iterations Number of bootstrap iterations (default 1000) + -s, --seed=N Define random seed + -h, --help Show this help text + + \b + Example: + + \b + BAGEL.py bf -i fc_file.txt -o results.bf -e ess_training_set.txt -n noness_training_set.txt -c 1,2,3 + + """ + np.random.seed(seed) # set random seed + if network_file: + network_boost = True + else: + network_boost = False + + if sgrna_bayes_factors: + rna_level = True + else: + rna_level = False + + if network_file and sgrna_bayes_factors: + network_boost = False + + if equalise_sgrna_no: + flat_sgrna = True + else: + flat_sgrna = False + + if equalise_rep_no: + flat_rep = True + else: + flat_rep = False + + if use_small_sample: + train_method = 0 + bootstrap_iterations = 100 + + elif use_bootstrapping: + train_method = 0 + + else: + train_method = 1 + + genes = {} + fc = {} + gene2rna = {} + rna2gene = {} + + multi_targeting_sgrnas = dict() + multi_targeting_sgrnas_info = dict() + + if filter_multi_target: + try: + aligninfo = pd.read_csv(align_info, header=None, index_col=0, sep="\t").fillna("") + for seqid in aligninfo.index: + perfectmatch = 0 + mismatch_1bp = 0 + perfectmatch_gene = 0 + mismatch_1bp_gene = 0 + if aligninfo[1][seqid] != "": + perfectmatch = len(aligninfo[1][seqid].split(",")) + if aligninfo[2][seqid] != "": + perfectmatch_gene = len(aligninfo[2][seqid].split(",")) + if aligninfo[3][seqid] != "": + mismatch_1bp = len(aligninfo[3][seqid].split(",")) + if aligninfo[4][seqid] != "": + mismatch_1bp_gene = len(aligninfo[4][seqid].split(",")) + if perfectmatch > loci_without_mismatch or mismatch_1bp > loci_with_mismatch: + multi_targeting_sgrnas[seqid] = True + elif perfectmatch > 1 or mismatch_1bp > 0: + multi_targeting_sgrnas_info[seqid] = ( + perfectmatch, + mismatch_1bp, + perfectmatch_gene, + mismatch_1bp_gene, + ) + + except: + print("Please check align-info file") + sys.exit(1) + + print("Total %d multi-targeting gRNAs are discarded" % len(multi_targeting_sgrnas)) + + # + # LOAD FOLDCHANGES + # + rnatagset = set() + with open(fold_change) as fin: + fieldname = fin.readline().rstrip().split("\t") + # + # DEFINE CONTROLS + # + columns = columns_to_test.split(",") + try: + try: + column_list = list(map(int, columns)) + column_labels = [fieldname[x + 1] for x in column_list] + except ValueError: + column_labels = columns + column_list = [ + x for x in range(len(fieldname) - 1) if fieldname[x + 1] in column_labels + ] # +1 because of First column start 2 + print("Using column: " + ", ".join(column_labels)) + # print "Using column: " + ", ".join(map(str,column_list)) + + except: + print("Invalid columns") + sys.exit(1) + + for line in fin: + fields = line.rstrip().split("\t") + rnatag = fields[0] + if filter_multi_target is True: # multitargeting sgrna filtering + if rnatag in multi_targeting_sgrnas: + continue # skip multitargeting sgrna. + if rnatag in rnatagset: + print("Error! sgRNA tag duplicates") + sys.exit(1) + rnatagset.add(rnatag) + gsym = fields[1] + + genes[gsym] = 1 + if gsym not in gene2rna: + gene2rna[gsym] = [] + gene2rna[gsym].append(rnatag) + rna2gene[rnatag] = gsym + fc[rnatag] = {} + for i in column_list: + fc[rnatag][i] = float(fields[i + 1]) # per user docs, GENE is column 0, first data column is col 1. + + genes_array = np.array(list(genes.keys())) + gene_idx = np.arange(len(genes)) + print("Number of unique genes: " + str(len(genes))) + + # + # DEFINE REFERENCE SETS + # + coreEss = [] + + with open(essential_genes) as fin: + skip_header = fin.readline() + for line in fin: + coreEss.append(line.rstrip().split("\t")[0]) + coreEss = np.array(coreEss) + print("Number of reference essentials: " + str(len(coreEss))) + + nonEss = [] + with open(non_essential_genes) as fin: + skip_header = fin.readline() + for line in fin: + nonEss.append(line.rstrip().split("\t")[0]) + + nonEss = np.array(nonEss) + print("Number of reference nonessentials: " + str(len(nonEss))) + + # + # LOAD NETWORK + # + + if network_boost is True: + network = {} + edgecount = 0 + with open(network_file) as fin: + for line in fin: + linearray = line.rstrip().split("\t") # GeneA \t GeneB format + if linearray[0] in genes_array and linearray[1] in genes_array: + for i in [0, 1]: + if linearray[i] not in network: + network[linearray[i]] = {} + network[linearray[i]][linearray[-1 * (i - 1)]] = 1 # save edge information + edgecount += 1 + + print("Number of network edges: " + str(edgecount)) + + # + # INITIALIZE BFS + # + + # Define foldchange dynamic threshold. logarithm decay. + # Parameters are defined by regression (achilles data) 2**-7 was used in previous version. + + FC_THRESH = 2 ** (-1.1535 * np.log(len(np.intersect1d(genes_array, nonEss)) + 13.324) + 0.7728) + bf = {} + boostedbf = {} + for g in genes_array: + for rnatag in gene2rna[g]: + bf[rnatag] = [] + + boostedbf[g] = [] # boosted bf at gene level + + # + # TRAINING + # + if use_small_sample: + # declare training class + # training_data = Training(setdiff1d(gene_idx,np.where(in1d(genes_array,coreEss))),cvnum=NUMCV) + # declare training class (only for Gold-standard gene set) + training_data = Training( + np.where(np.in1d(genes_array, np.union1d(coreEss, nonEss)))[0], cvnum=no_of_cross_validations + ) + # all non-goldstandards + all_non_gs = np.where(np.logical_not(np.in1d(genes_array, np.union1d(coreEss, nonEss))))[0] + else: + training_data = Training(gene_idx, cvnum=no_of_cross_validations) # declare training class + + if train_method == 0: + LOOPCOUNT = bootstrap_iterations + elif train_method == 1: + LOOPCOUNT = no_of_cross_validations # 10-folds + + if run_test_mode == True: + fp = open(output_file + ".traininfo", "w") + fp.write("#1: Loopcount\n#2: Training set\n#3: Testset\n") + # No resampling option + if no_resampling == True: + print("# Caution: Resampling is disabled") + LOOPCOUNT = 1 + + print("Iter TrainEss TrainNon TestSet") + sys.stdout.flush() + for loop in range(LOOPCOUNT): + currentbf = {} + printstr = "" + printstr += str(loop) + + # + # bootstrap resample (10-folds cross-validation) from gene list to get the training set + # test set for this iteration is everything not selected in bootstrap resampled (10-folds cross-validation) + # training set + # define essential and nonessential training sets: arrays of indexes + # + if no_resampling == True: + # no resampling + gene_train_idx = gene_idx + gene_test_idx = gene_idx + else: + # CV or bootstrapping + gene_train_idx, gene_test_idx = training_data.get_data(train_method) + if use_small_sample: + # test set is union of rest of training set (gold-standard) and the other genes (all of non-gold-standard) + gene_test_idx = np.union1d(gene_test_idx, all_non_gs) + + if run_test_mode: + fp.write( + "%d\n%s\n%s\n" % (loop, ",".join(genes_array[gene_train_idx]), ",".join(genes_array[gene_test_idx])) + ) + + train_ess = np.where(np.in1d(genes_array[gene_train_idx], coreEss))[0] + train_non = np.where(np.in1d(genes_array[gene_train_idx], nonEss))[0] + printstr += " " + str(len(train_ess)) + printstr += " " + str(len(train_non)) + printstr += " " + str(len(gene_test_idx)) + print(printstr) + sys.stdout.flush() + # + # define ess_train: vector of observed fold changes of essential genes in training set + # + ess_train_fc_list_of_lists = [ + fc[rnatag] for g in genes_array[gene_train_idx[train_ess]] for rnatag in gene2rna[g] + ] + ess_train_fc_flat_list = [obs for sublist in ess_train_fc_list_of_lists for obs in list(sublist.values())] + # + # define non_train vector of observed fold changes of nonessential genes in training set + # + non_train_fc_list_of_lists = [ + fc[rnatag] for g in genes_array[gene_train_idx[train_non]] for rnatag in gene2rna[g] + ] + non_train_fc_flat_list = [obs for sublist in non_train_fc_list_of_lists for obs in list(sublist.values())] + # + # calculate empirical fold change distributions for both + # + kess = stats.gaussian_kde(ess_train_fc_flat_list) + knon = stats.gaussian_kde(non_train_fc_flat_list) + # + # define empirical upper and lower bounds within which to calculate BF = f(fold change) + # + x = np.arange(-10, 2, 0.01) + nonfitx = knon.evaluate(x) + # define lower bound empirical fold change threshold: minimum FC np.where knon is above threshold + f = np.where(nonfitx > FC_THRESH) + xmin = round_to_hundredth(min(x[f])) + # define upper bound empirical fold change threshold: minimum value of log2(ess/non) + subx = np.arange(xmin, max(x[f]), 0.01) + logratio_sample = np.log2(kess.evaluate(subx) / knon.evaluate(subx)) + f = np.where(logratio_sample == logratio_sample.min()) + xmax = round_to_hundredth(subx[f]) + # + # round foldchanges to nearest 0.01 + # precalculate logratios and build lookup table (for speed) + # + logratio_lookup = {} + for i in np.arange(xmin, xmax + 0.01, 0.01): + logratio_lookup[np.around(i * 100)] = np.log2(kess.evaluate(i) / knon.evaluate(i)) + # + # calculate BFs from lookup table for withheld test set + # + + # liner interpolation + testx = list() + testy = list() + + for g in genes_array[gene_train_idx]: + for rnatag in gene2rna[g]: + for foldchange in list(fc[rnatag].values()): + if foldchange >= xmin and foldchange <= xmax: + testx.append(np.around(foldchange * 100) / 100) + testy.append(logratio_lookup[np.around(foldchange * 100)][0]) + try: + slope, intercept, r_value, p_value, std_err = stats.linregress(np.array(testx), np.array(testy)) + except: + print("Regression failed. Check quality of the screen") + sys.exit(1) + # + # BF calculation + # + + for g in genes_array[gene_test_idx]: + for rnatag in gene2rna[g]: + bayes_factor = [] + for rep in column_list: + bayes_factor.append(slope * fc[rnatag][rep] + intercept) + bf[rnatag].append(bayes_factor) + + if run_test_mode == True: + fp.close() + + num_obs = dict() + if rna_level is False: + bf_mean = dict() + bf_std = dict() + bf_norm = dict() # sgRNA number complement + if rna_level or filter_multi_target: + bf_mean_rna_rep = dict() + bf_std_rna_rep = dict() + # bf_norm_rna_rep = dict() + + for g in gene2rna: + num_obs[g] = len(bf[gene2rna[g][0]]) + if rna_level or filter_multi_target: + for rnatag in gene2rna[g]: + bf_mean_rna_rep[rnatag] = dict() + bf_std_rna_rep[rnatag] = dict() + t = list(zip(*bf[rnatag])) + for rep in range(len(column_list)): + bf_mean_rna_rep[rnatag][column_list[rep]] = np.mean(t[rep]) + bf_std_rna_rep[rnatag][column_list[rep]] = np.std(t[rep]) + + if rna_level == False: + sumofbf_list = list() + for i in range(num_obs[g]): + sumofbf = 0.0 + for rnatag in gene2rna[g]: + sumofbf += sum(bf[rnatag][i]) + sumofbf_list.append(sumofbf) # append each iter + bf_mean[g] = np.mean(sumofbf_list) + bf_std[g] = np.std(sumofbf_list) + + # + # BUILD MULTIPLE REGRESSION MODEL FOR MULTI TARGETING GUIDE RNAs + # + if filter_multi_target: + count = 0 + trainset = dict() + bf_multi_corrected_gene = dict() + bf_multi_corrected_rna = dict() + for gene in gene2rna: + # multi_targeting_sgrnas_info[seqid] = (perfectmatch, mismatch_1bp, perfectmatch_gene, mismatch_1bp_gene) + multitarget = list() + onlytarget = list() + for seqid in gene2rna[gene]: + if seqid not in aligninfo.index: + continue + if seqid in multi_targeting_sgrnas_info: + multitarget.append(seqid) + else: + onlytarget.append(seqid) + + if len(onlytarget) > 0: # comparsion between sgRNAs targeting one locus and multiple loci + if len(multitarget) > 0: + bf_only = np.mean([sum(list(bf_mean_rna_rep[seqid].values())) for seqid in onlytarget]) + for seqid in onlytarget: + trainset[seqid] = [1, 0, 0] + + for seqid in multitarget: + if ( + multi_targeting_sgrnas_info[seqid][2] > 1 or multi_targeting_sgrnas_info[seqid][3] > 0 + ): # train model using multi-targeting only targeting one protein coding gene + continue + + count += 1 + increment = sum(list(bf_mean_rna_rep[seqid].values())) - bf_only + + trainset[seqid] = [ + multi_targeting_sgrnas_info[seqid][0], + multi_targeting_sgrnas_info[seqid][1], + increment, + ] + + if count < 10: + print("Not enough train set for calculating multi-targeting effect.\n") + print("It may cause due to unmatched gRNA names between the foldchange file and the align info file.\n") + print("Filtering is not finished\n") + filter_multi_target = False + + else: + trainset = pd.DataFrame().from_dict(trainset).T + X = trainset[[0, 1]] + y = trainset[2] + + regressor = LinearRegression() + regressor.fit(X, y) + coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=["Coefficient"]) + for i in [0, 1]: + if coeff_df["Coefficient"][i] < 0: + print("Regression coefficient is below than zero. Substituted to zero\n") + coeff_df["Coefficient"][i] = 0.0 + print( + "Multiple effects from perfect matched loci = %.3f and 1bp mis-matched loci = %.3f" + % (coeff_df["Coefficient"][0], coeff_df["Coefficient"][1]) + ) + + if rna_level == False: + for g in gene2rna: + penalty = 0.0 + for seqid in gene2rna[g]: + if seqid in multi_targeting_sgrnas_info: + penalty += ( + float(multi_targeting_sgrnas_info[seqid][0] - 1) * coeff_df["Coefficient"][0] + + float(multi_targeting_sgrnas_info[seqid][1]) * coeff_df["Coefficient"][1] + ) + bf_multi_corrected_gene[g] = bf_mean[g] - penalty + else: + for g in gene2rna: + for seqid in gene2rna[g]: + if seqid in multi_targeting_sgrnas_info: + penalty = ( + float(multi_targeting_sgrnas_info[seqid][0] - 1) * coeff_df["Coefficient"][0] + + float(multi_targeting_sgrnas_info[seqid][1]) * coeff_df["Coefficient"][1] + ) + else: + penalty = 0.0 + bf_multi_corrected_rna[seqid] = sum(list(bf_mean_rna_rep[seqid].values())) - penalty + + # + # NORMALIZE sgRNA COUNT + # + if rna_level is False and flat_sgrna == True: + if filter_multi_target == True: + targetbf = bf_multi_corrected_gene + else: + targetbf = bf_mean + + for g in gene2rna: + multiple_factor = equalise_sgrna_no / float(len(gene2rna[g])) + bf_norm[g] = targetbf[g] * multiple_factor + + """ + if bf_std[rnatag] == 0.0: + bf_norm[rnatag] = float('inf') + else: + bf_norm[g] = ( bf[rnatag] - bf_mean[rnatag] ) / bf_std[rnatag] + """ + training_data = Training(gene_idx) # set training class reset + + # + # calculate network scores + # + + if network_boost == True and rna_level == False: # Network boost is only working for gene level + if run_test_mode == True: # TEST MODE + fp = open(output_file + ".netscore", "w") + print("\nNetwork score calculation start\n") + + networkscores = {} + for g in genes_array[gene_idx]: + if g in network: + templist = list() + for neighbor in network[g]: + if neighbor in bf_mean: + templist.append(bf_mean[neighbor]) + + templist.sort(reverse=True) + + networkscores[g] = fibo_weighted_sum(templist) + # + # start training + # + + for loop in range(LOOPCOUNT): + currentnbf = {} + printstr = "" + printstr += str(loop) + + # + # draw train, test sets + # + gene_train_idx, gene_test_idx = training_data.get_data(train_method) + # + # define essential and nonessential training sets: arrays of indexes + # + train_ess = np.where(np.in1d(genes_array[gene_train_idx], coreEss))[0] + train_non = np.where(np.in1d(genes_array[gene_train_idx], nonEss))[0] + printstr += " " + str(len(train_ess)) + printstr += " " + str(len(train_non)) + printstr += " " + str(len(gene_test_idx)) + + sys.stdout.flush() + # + # calculate Network BF for test set + # + ess_ns_list = [networkscores[x] for x in genes_array[gene_train_idx[train_ess]] if x in networkscores] + non_ns_list = [networkscores[x] for x in genes_array[gene_train_idx[train_non]] if x in networkscores] + + kess = stats.gaussian_kde(ess_ns_list) + knon = stats.gaussian_kde(non_ns_list) + # + # set x boundary for liner regression + # + testx = list() + testy = list() + xmin = float(np.inf) + xmax = float(-np.inf) + + for networkscore in np.arange(max(ess_ns_list), min(ess_ns_list), -0.01): + density_ess = kess.evaluate(networkscore)[0] + density_non = knon.evaluate(networkscore)[0] + if density_ess == 0.0 or density_non == 0.0: + continue + + if np.log2(density_ess / density_non) > -5 and networkscore < np.array(ess_ns_list).mean(): # reverse + xmin = min(xmin, networkscore) + + for networkscore in np.arange(min(non_ns_list), max(non_ns_list), 0.01): + density_ess = kess.evaluate(networkscore)[0] + density_non = knon.evaluate(networkscore)[0] + if density_ess == 0.0 or density_non == 0.0: + continue + if np.log2(density_ess / density_non) < 5 and networkscore > np.array(non_ns_list).mean(): # reverse + xmax = max(xmax, networkscore) + # + # liner regression + # + testx = list() + testy = list() + for g in genes_array[gene_train_idx]: + if g in networkscores: + if networkscores[g] >= xmin and networkscores[g] <= xmax: + testx.append(np.around(networkscores[g] * 100) / 100) + testy.append(np.log2(kess.evaluate(networkscores[g])[0] / knon.evaluate(networkscores[g])[0])) + + slope, intercept, r_value, p_value, std_err = stats.linregress(np.array(testx), np.array(testy)) + + for g in genes_array[gene_test_idx]: + if g in networkscores: + if run_test_mode == True: + fp.write("%s\t%f\t%f\n" % (g, networkscores[g], slope * networkscores[g] + intercept)) + nbf = slope * networkscores[g] + intercept + else: + nbf = 0.0 + + boostedbf[g].append(bf_mean[g] + nbf) + if flat_sgrna == True: + boostedbf[g].append(bf_norm[g] + nbf) + + if run_test_mode == True: + fp.close() + + # + # print out results + # + + # Equalizing factor (Replicates) + if flat_rep == True: + eqf = equalise_rep_no / float(len(column_labels)) + else: + eqf = 1 + + # print out + with open(output_file, "w") as fout: + if rna_level == True: + fout.write("RNA\tGENE") + for i in range(len(column_list)): + fout.write("\t{0:s}".format(column_labels[i])) + if train_method == 0: + fout.write("\t{0:s}".format(column_labels[i] + "_STD")) + fout.write("\tBF") + if train_method == 0: + fout.write("\tNumObs") + fout.write("\n") + + for rnatag in sorted(bf.keys()): + # RNA tag + fout.write("{0:s}\t".format(rnatag)) + # Gene + gene = rna2gene[rnatag] + fout.write("{0:s}\t".format(gene)) + + # BF of replicates + for rep in column_list: + fout.write("{0:4.3f}\t".format(bf_mean_rna_rep[rnatag][rep])) + if train_method == 0: + fout.write("{0:4.3f}\t".format(bf_std_rna_rep[rnatag][rep])) + + # Sum BF of replicates + if filter_multi_target == True: + fout.write( + "{0:4.3f}".format(float(bf_multi_corrected_rna[rnatag]) * eqf) + ) # eqf = equalizing factor for the number of replicates + else: + fout.write("{0:4.3f}".format(float(sum(list(bf_mean_rna_rep[rnatag].values()))) * eqf)) + + # Num obs + if train_method == 0: + fout.write("\t{0:d}".format(num_obs[gene])) + fout.write("\n") + else: + fout.write("GENE") + if network_boost == True: + fout.write("\tBoostedBF") + if train_method == 0: + fout.write("\tSTD_BoostedBF") + fout.write("\tBF") + if train_method == 0: + fout.write("\tSTD\tNumObs") + if flat_sgrna == True: + fout.write("\tNormBF") + fout.write("\n") + + for g in sorted(genes.keys()): + # Gene + fout.write("{0:s}".format(g)) + if network_boost == True: + boostedbf_mean = np.mean(boostedbf[g]) + boostedbf_std = np.std(boostedbf[g]) + fout.write("\t{0:4.3f}".format(float(boostedbf_mean) * eqf)) + if train_method == 0: + fout.write("\t{0:4.3f}".format(float(boostedbf_std) * eqf)) + + # BF + if filter_multi_target == True: + fout.write( + "\t{0:4.3f}".format(float(bf_multi_corrected_gene[g]) * eqf) + ) # eqf = equalizing factor for the number of replicates + else: + fout.write("\t{0:4.3f}".format(float(bf_mean[g]) * eqf)) + # STD, Count + if train_method == 0: + fout.write("\t{0:4.3f}\t{1:d}".format(float(bf_std[g]), num_obs[g])) + # Normalized BF + if flat_sgrna == True: + fout.write("\t{0:4.3f}".format(float(bf_norm[g]))) + + fout.write("\n") + + +@click.command(name="pr") +@click.option("-i", "--bayes-factors", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-file", required=True) +@click.option("-e", "--essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-n", "--non-essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-k", "--use-column", default=None) +def calculate_precision_recall(bayes_factors, output_file, essential_genes, non_essential_genes, use_column): + """ + Calculate precision-recall from an input Bayes Factors file: + + \b + BAGEL.py pr -i [Bayes Factor file] -o [output file] -e [essentials genes] -n [nonessentials genes] + + \b + Required options: + -i, --bayes-factors [Bayes factors] BAGEL output file. + -o, --output-file [output file] Output filename + -e, --essential-genes [reference essentials] File with list of training set of essential genes + -n, --non-essential-genes [reference nonessentials] File with list of training set of nonessential genes + + \b + Other options: + -k [column name] Use other column (default \BF\) + + \b + Example: + BAGEL.py pr -i input.bf -o output.PR -e ref_essentials.txt -n ref_nonessentials.txt + + """ + # + # test for availability of all files + # + essentials = pd.read_csv(essential_genes, index_col=0, sep="\t") + nonessentials = pd.read_csv(non_essential_genes, index_col=0, sep="\t") + bf = pd.read_csv(bayes_factors, index_col=0, sep="\t") + + if use_column is not None: + bf_column = use_column + if bf_column not in bf.dtypes.index: + print("Error! the column name is not in the file") + sys.exit(1) + else: + bf_column = "BF" + + bf.sort_values(by=bf_column, ascending=False, inplace=True) + + cumulative_tp = 0.0 + cumulative_fp = 0.0 + precision = 1.0 + recall = 0.0 + # note float formats + + ess = essentials.index.values + non = nonessentials.index.values + totNumEssentials = len([x for x in bf.index.values if x in ess]) + + with open(output_file, "w") as fout: + fout.write("Gene\t") + fout.write(bf_column) + fout.write("\tRecall\tPrecision\tFDR\n") + + for g in bf.index.values: + if g in ess: + cumulative_tp += 1 + elif g in non: + cumulative_fp += 1 + recall = cumulative_tp / totNumEssentials + if (cumulative_tp > 0) | (cumulative_fp > 0): + precision = cumulative_tp / (cumulative_tp + cumulative_fp) + fout.write( + "{0:s}\t{1:4.3f}\t{2:4.3f}\t{3:4.3f}\t{4:4.3f}\n".format( + g, bf.loc[g, bf_column], recall, precision, 1.0 - precision + ) + ) + + +if __name__ == "__main__": + bagel.add_command(calculate_fold_change) + bagel.add_command(calculate_bayes_factors) + bagel.add_command(calculate_precision_recall) + bagel.add_command(report_bagel_version) + bagel() diff --git a/conf/modules.config b/conf/modules.config index 867f0d58..535823a8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -44,6 +44,38 @@ process { ] } + withName: BAGEL2_BF { + publishDir = [ + path: { "${params.outdir}/bagel2/bayes_factor/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: BAGEL2_PR { + publishDir = [ + path: { "${params.outdir}/bagel2/precision_recall/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: BAGEL2_FC { + publishDir = [ + path: { "${params.outdir}/bagel2/fold_change/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: BAGEL2_GRAPH { + publishDir = [ + path: { "${params.outdir}/bagel2/graphs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: FASTQC { ext.args = '--quiet' publishDir = [ @@ -64,6 +96,7 @@ process { withName: MAGECK_COUNT { publishDir = [ path: { "${params.outdir}/mageck/count/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.prefix = 'count_table' @@ -73,6 +106,7 @@ process { withName: MAGECK_MLE { publishDir = [ path: { "${params.outdir}/mageck/mle/${meta.id}/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = @@ -84,6 +118,7 @@ process { ext.args2 = "-t" publishDir = [ path: { "${params.outdir}/mageck/rra/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } diff --git a/conf/test_screening.config b/conf/test_screening.config index 83c42d85..0ac8ed89 100644 --- a/conf/test_screening.config +++ b/conf/test_screening.config @@ -25,4 +25,5 @@ params { crisprcleanr = "Brunello_Library" mle_design_matrix = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/design_matrix.txt" library = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/brunello_target_sequence.txt" + rra_contrasts = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/rra_contrasts.txt" } diff --git a/docs/output/screening.md b/docs/output/screening.md index bb45fdcc..869a7d33 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -23,6 +23,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Gene essentiality](#gene-essentiality-computation) - [MAGeCK rra](#mageck-rra) - modified robust ranking aggregation (RRA) algorithm - [MAGeCK mle](#mageck-mle) - maximum-likelihood estimation (MLE) for robust identification of CRISPR-screen hits + - [BAGEL2](#BAGEL2) - Bayes Factor to identify essential genes - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -92,11 +93,32 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `*_count_sgrna_summary.txt`: sgRNA ranking results, tab separated file containing means, p-values - `*.report.Rmd`: markdown report recapping essential genes - `*_count_table.log`: log of the run + - `*_scatterview.png`: scatter view of the targeted genes and their logFC + - `*_rank.png`: rank view of the targeted genes [MAGeCK](https://sourceforge.net/p/mageck/wiki/Home/) is a computational tool to identify important genes from CRISPR-Cas9 screens. +### BAGEL2 + +
+Output files + +- `bagel2/fold_change` + - `*.foldchange`: foldchange between the reference and treatment contrast provided +- `bagel2/bayes_factor` + - `*.bf`: bayes factor per gene +- `bagel2/precision_recall` + - `*.pr`: precision recall per gene +- `bagel2/graphs` + - `barplot*.png`: barplot of the bayes factor distribution + - `PR*.png`: precision recall plot (Recall vs FDR) + +
+ +[bagel2](https://github.com/hart-lab/bagel) is a computational tool to identify important essential genes for CRISPR-Cas9 screening experiments. + ## MultiQC
diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 1901fd0b..bdbdd4c9 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -41,11 +41,26 @@ SRR8983580,SRR8983580.small.fastq.gz,,treatment An [example samplesheet](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/samplesheet_test.csv) has been provided with the pipeline. -The pipeline currently supports 2 algorithms to detect gene essentiality, MAGeCK rra and MAGeCK mle. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. +### library + +If you are running the pipeline with fastq files and wish to obtain a count table, the library parameter is needed. The library table has three mandatory columns : id, target transcript (or gRNA sequence) and gene symbol. +An [example](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/brunello_target_sequence.txt) has been provided with the pipeline. Many libraries can be found on [addgene](https://www.addgene.org/). + +After the alignment step, the pipeline currently supports 3 algorithms to detect gene essentiality, MAGeCK rra, MAGeCK mle and BAGEL2. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. BAGEL2 identifies gene essentiality through Bayesian Analysis. ### MAGeCK rra -MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` should be used with a `csv` separated file stating the two conditions to be compared. +MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` contains two columns : treatment and reference. These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them. Please find an example here below : + +| treatment | reference | +| ----------------------- | ------------------- | +| treatment1 | control1 | +| treatment1,treatment2 | control1,control2 | +| ----------------------- | ------------------- | +| treatment1 | control1 | + +A full example can be found here : +#TODO: add link to rawcontent of nf-core test ### MAGeCK mle @@ -57,7 +72,10 @@ If there are several designs to be run, you can input a folder containing all th CRISPRcleanR is used for gene count normalization and the removal of biases for genomic segments for which copy numbers are amplified. Currently, the pipeline only supports annotation libraries already present in the R package and which can be found [here](https://github.com/francescojm/CRISPRcleanR/blob/master/Reference_Manual.pdf). To use CRISPRcleanR normalization, use `--crisprcleanr library`, `library` being the exact name as the library in the CRISPRcleanR documentation (e.g: "AVANA_Library"). -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +### BAGEL2 + +BAGEL2 (Bayesian Analysis of Gene Essentiality with Location) is a computational tool developed by the Hart Lab at Harvard University. It is designed for analyzing large-scale genetic screens, particularly CRISPR-Cas9 screens, to identify genes that are essential for the survival or growth of cells under different conditions. BAGEL2 integrates information about the location of guide RNAs within a gene and leverages this information to improve the accuracy of gene essentiality predictions. +BAGEL2 uses the same contrasts from `--rra_contrasts`. Note that the pipeline will create the following files in your working directory: diff --git a/modules/local/bagel2/bf.nf b/modules/local/bagel2/bf.nf new file mode 100644 index 00000000..9f5fe090 --- /dev/null +++ b/modules/local/bagel2/bf.nf @@ -0,0 +1,33 @@ +process BAGEL2_BF { + tag "$meta.treatment" + label 'process_single' + + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + + input: + tuple val(meta), path(foldchange) + path(reference_essentials) + path(reference_nonessentials) + + output: + tuple val(meta), path("*.bf"), emit: bf + //path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.treatment}" + + """ + BAGEL.py bf -i $foldchange -o '${meta.treatment}_vs_${meta.reference}.bf' $args -e $reference_essentials -n $reference_nonessentials -c ${meta.treatment} + + """ + +} diff --git a/modules/local/bagel2/fc.nf b/modules/local/bagel2/fc.nf new file mode 100644 index 00000000..bec12be1 --- /dev/null +++ b/modules/local/bagel2/fc.nf @@ -0,0 +1,30 @@ +process BAGEL2_FC { + tag "${meta.treatment}_${meta.reference}" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + input: + tuple val(meta), path(count_table) + + output: + tuple val(meta), path("*.foldchange"), emit: foldchange + tuple val(meta), path("*.normed_readcount"), emit: normed_counts + //path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + BAGEL.py fc -i $count_table -o ${meta.treatment}_vs_${meta.reference} -c $meta.reference $args + + """ + +} diff --git a/modules/local/bagel2/graph.nf b/modules/local/bagel2/graph.nf new file mode 100644 index 00000000..c14e7810 --- /dev/null +++ b/modules/local/bagel2/graph.nf @@ -0,0 +1,66 @@ +process BAGEL2_GRAPH { + tag "${meta.treatment}_${meta.reference}" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6 matplotlib=3.7.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-54e0353146eca1531516863e8235bf7385d76663:c9ff1a9eec871c54cbea815eae778da702623978-0': + 'biocontainers/mulled-v2-54e0353146eca1531516863e8235bf7385d76663:c9ff1a9eec871c54cbea815eae778da702623978-0' }" + + + input: + tuple val(meta), path(pr) + + output: + path("*.png") , emit: pictures + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env python3 + + #### author: Laurence Kuhlburger + #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. + #### + #### Orient a reference sequence according to reads orientation. + + + import pandas as pd + import matplotlib.pyplot as plt + + pr_data = pd.read_table('${pr}', index_col=0) + + plt.plot(pr_data.Recall, pr_data.Precision, linewidth=2) + plt.xlim(0, 1.01) + plt.ylim(0, 1.02) + plt.xlabel('Recall') + plt.ylabel('Precision (1-FDR)') + plt.title('Precision-Recall Plot') + + # Save the plot to a PNG file + file_name = 'PR_plot_{}_vs_{}.png'.format('${meta.treatment}', '${meta.reference}') + + plt.savefig(file_name) + + # Show the plot (optional) + plt.show() + + pr_data.hist('BF', bins=50, range=(-100,100)) + plt.xlabel('Bayes Factor') + plt.ylabel('Number of Genes') + + file_name = 'barplot_{}_vs_{}.png'.format('${meta.treatment}', '${meta.reference}') + + plt.savefig(file_name) + plt.show() + + """ + + +} diff --git a/modules/local/bagel2/pr.nf b/modules/local/bagel2/pr.nf new file mode 100644 index 00000000..b5cd1793 --- /dev/null +++ b/modules/local/bagel2/pr.nf @@ -0,0 +1,30 @@ +process BAGEL2_PR { + tag "$meta.treatment" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + + input: + tuple val(meta), path(bf), path(reference_essentials), path(reference_nonessentials) + + output: + tuple val(meta), path("*.pr") , emit: pr + + //path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + BAGEL.py pr -i $bf -o '${meta.treatment}_vs_${meta.reference}.pr' -e $reference_essentials -n $reference_nonessentials + """ + +} diff --git a/modules/local/mageck/graphrra.nf b/modules/local/mageck/graphrra.nf new file mode 100644 index 00000000..2c531d5d --- /dev/null +++ b/modules/local/mageck/graphrra.nf @@ -0,0 +1,54 @@ +process MAGECK_GRAPHRRA { + tag "$meta.treatment" + label 'process_single' + + conda "bioconda::bioconductor-mageckflute=2.2.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mageckflute:2.2.0--r42hdfd78af_0': + 'biocontainers/bioconductor-mageckflute:2.2.0--r42hdfd78af_0' }" + + input: + tuple val(meta), path(gene_summary) + + output: + tuple val(meta), path("*.png"), emit: graphs + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + #### author: Laurence Kuhlburger + #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. + #### + #### Orient a reference sequence according to reads orientation. + + library(MAGeCKFlute) + library(ggplot2) + options(ggrepel.max.overlaps = Inf) + + gdata = ReadRRA("$gene_summary") + gdata <- transform(gdata, LogFDR = -log10(FDR)) + png(filename = paste0("$meta.treatment","_vs_","$meta.reference","_scatterview.png"), width = 6, height = 4, units = "in", res = 300) + p1 = ScatterView(gdata, x = "Score", y = "LogFDR", label = "id", + model = "volcano", top = 5) + print(p1) + dev.off() + + gdata <- transform(gdata, Rank = rank(Score)) + png(filename = paste0("$meta.treatment","_vs_","$meta.reference","_rank.png"), width = 6, height = 4, units = "in", res = 300) + p1 = ScatterView(gdata, x = "Rank", y = "Score", label = "id", + top = 5, auto_cut_y = TRUE, ylab = "Log2FC", + groups = c("top", "bottom")) + print(p1) + dev.off() + + """ + + +} diff --git a/modules/nf-core/mageck/count/mageck-count.diff b/modules/nf-core/mageck/count/mageck-count.diff index 50ee78b1..53659a20 100644 --- a/modules/nf-core/mageck/count/mageck-count.diff +++ b/modules/nf-core/mageck/count/mageck-count.diff @@ -1,26 +1,54 @@ Changes in module 'nf-core/mageck/count' --- modules/nf-core/mageck/count/main.nf +++ modules/nf-core/mageck/count/main.nf -@@ -12,8 +12,10 @@ +@@ -1,6 +1,6 @@ + process MAGECK_COUNT { + tag "$meta.id" +- label 'process_medium' ++ label 'process_high' + + conda "bioconda::mageck=0.5.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +@@ -12,8 +12,12 @@ path(library) output: - tuple val(meta), path("*count*.txt"), emit: count + tuple val(meta), path("*count.txt"), emit: count tuple val(meta), path("*.count_normalized.txt"), emit: norm ++ tuple val(meta), path("*.countsummary.txt"), emit: summary ++ tuple val(meta), path("*.count_normalized.txt"), emit: normalized + tuple val(meta), path("*.log"), emit: logs + path "versions.yml" , emit: versions when: -@@ -22,7 +24,7 @@ +@@ -22,9 +26,15 @@ script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_file = ("$inputfile".endsWith(".fastq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" -+ def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" ++ // def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" def sample_label = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--sample-label ${meta.id}" : '' - +- ++ ++ if (meta.single_end && ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz"))) { ++ input = "--fastq ${inputfile}" ++ } else { ++ input = "--fastq ${inputfile[0]} --fastq-2 ${inputfile[1]}" ++ } ++ """ + mageck \\ + count \\ +@@ -32,7 +42,7 @@ + -l $library \\ + -n $prefix \\ + $sample_label \\ +- $input_file \\ ++ $input + + + cat <<-END_VERSIONS > versions.yml ************************************************************ diff --git a/modules/nf-core/mageck/count/main.nf b/modules/nf-core/mageck/count/main.nf index 30fbd8fe..ced51e85 100644 --- a/modules/nf-core/mageck/count/main.nf +++ b/modules/nf-core/mageck/count/main.nf @@ -1,6 +1,6 @@ process MAGECK_COUNT { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "bioconda::mageck=0.5.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -14,6 +14,8 @@ process MAGECK_COUNT { output: tuple val(meta), path("*count.txt"), emit: count tuple val(meta), path("*.count_normalized.txt"), emit: norm + tuple val(meta), path("*.countsummary.txt"), emit: summary + tuple val(meta), path("*.count_normalized.txt"), emit: normalized tuple val(meta), path("*.log"), emit: logs path "versions.yml" , emit: versions @@ -24,9 +26,15 @@ process MAGECK_COUNT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" + // def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" def sample_label = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--sample-label ${meta.id}" : '' - + + if (meta.single_end && ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz"))) { + input = "--fastq ${inputfile}" + } else { + input = "--fastq ${inputfile[0]} --fastq-2 ${inputfile[1]}" + } + """ mageck \\ count \\ @@ -34,7 +42,7 @@ process MAGECK_COUNT { -l $library \\ -n $prefix \\ $sample_label \\ - $input_file \\ + $input cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/mageck/mle/mageck-mle.diff b/modules/nf-core/mageck/mle/mageck-mle.diff index b635b553..cba65759 100644 --- a/modules/nf-core/mageck/mle/mageck-mle.diff +++ b/modules/nf-core/mageck/mle/mageck-mle.diff @@ -1,6 +1,14 @@ Changes in module 'nf-core/mageck/mle' --- modules/nf-core/mageck/mle/main.nf +++ modules/nf-core/mageck/mle/main.nf +@@ -1,6 +1,6 @@ + process MAGECK_MLE { + tag "$meta.id" +- label 'process_medium' ++ label 'process_high' + + conda "bioconda::mageck=0.5.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -8,8 +8,7 @@ 'biocontainers/mageck:0.5.9--py37h6bb024c_0' }" diff --git a/modules/nf-core/mageck/mle/main.nf b/modules/nf-core/mageck/mle/main.nf index 626c410d..9c744afe 100644 --- a/modules/nf-core/mageck/mle/main.nf +++ b/modules/nf-core/mageck/mle/main.nf @@ -1,6 +1,6 @@ process MAGECK_MLE { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "bioconda::mageck=0.5.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/nf-core/mageck/test/mageck-test.diff b/modules/nf-core/mageck/test/mageck-test.diff index ee8558c3..9da13d3b 100644 --- a/modules/nf-core/mageck/test/mageck-test.diff +++ b/modules/nf-core/mageck/test/mageck-test.diff @@ -8,17 +8,16 @@ Changes in module 'nf-core/mageck/test' label 'process_medium' conda "bioconda::mageck=0.5.9" -@@ -14,6 +14,9 @@ +@@ -14,6 +14,8 @@ tuple val(meta), path("*.gene_summary.txt") , emit: gene_summary tuple val(meta), path("*.sgrna_summary.txt") , emit: sgrna_summary tuple val(meta), path("*.R") , emit: r_script -+ tuple val(meta), path("*.Rmd") , emit: r_report + tuple val(meta), path("*.Rnw") , emit: r_summary + tuple val(meta), path("*.log") , emit: logs path "versions.yml" , emit: versions when: -@@ -21,14 +24,18 @@ +@@ -21,14 +23,18 @@ script: def args = task.ext.args ?: '' @@ -38,10 +37,5 @@ Changes in module 'nf-core/mageck/test' cat <<-END_VERSIONS > versions.yml -@@ -36,4 +43,4 @@ - mageck: \$(mageck -v) - END_VERSIONS - """ --} -+} + ************************************************************ diff --git a/modules/nf-core/mageck/test/main.nf b/modules/nf-core/mageck/test/main.nf index 420edcc1..78d5f58c 100644 --- a/modules/nf-core/mageck/test/main.nf +++ b/modules/nf-core/mageck/test/main.nf @@ -14,7 +14,6 @@ process MAGECK_TEST { tuple val(meta), path("*.gene_summary.txt") , emit: gene_summary tuple val(meta), path("*.sgrna_summary.txt") , emit: sgrna_summary tuple val(meta), path("*.R") , emit: r_script - tuple val(meta), path("*.Rmd") , emit: r_report tuple val(meta), path("*.Rnw") , emit: r_summary tuple val(meta), path("*.log") , emit: logs path "versions.yml" , emit: versions diff --git a/nextflow.config b/nextflow.config index 4b01987c..eeeca587 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,8 @@ params { count_table = null min_reads = 30 min_targeted_genes = 3 + bagel_reference_essentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt' + bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt' // Pipeline steps overrepresented = false diff --git a/nextflow_schema.json b/nextflow_schema.json index c75c5a2d..211f5f1a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -186,12 +186,22 @@ "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", - "default": 30 + "default": 30.0 }, "min_targeted_genes": { "type": "number", "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected for CRISPRcleanR", - "default": 3 + "default": 3.0 + }, + "bagel_reference_essentials": { + "type": "string", + "description": "Core essential gene set for BAGEL2", + "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt" + }, + "bagel_reference_nonessentials": { + "type": "string", + "description": "Non essential gene set for BAGEL2", + "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt" } } }, @@ -450,21 +460,12 @@ { "$ref": "#/definitions/umi_parameters" }, - { - "$ref": "#/definitions/targeted_pipeline_steps" - }, - { - "$ref": "#/definitions/umi_parameters" - }, { "$ref": "#/definitions/targeted_parameters" }, { "$ref": "#/definitions/vsearch_parameters" }, - { - "$ref": "#/definitions/vsearch_parameters" - }, { "$ref": "#/definitions/screening_parameters" }, diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index fdff3857..ff2a61b5 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -59,8 +59,14 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' include { MAGECK_COUNT } from '../modules/nf-core/mageck/count/main' include { MAGECK_MLE } from '../modules/nf-core/mageck/mle/main' include { MAGECK_TEST } from '../modules/nf-core/mageck/test/main' +include { MAGECK_GRAPHRRA } from '../modules/local/mageck/graphrra' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CRISPRCLEANR_NORMALIZE } from '../modules/nf-core/crisprcleanr/normalize/main' +include { BAGEL2_FC } from '../modules/local/bagel2/fc' +include { BAGEL2_BF } from '../modules/local/bagel2/bf' +include { BAGEL2_PR } from '../modules/local/bagel2/pr' +include { BAGEL2_GRAPH } from '../modules/local/bagel2/graph' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -81,11 +87,7 @@ workflow CRISPRSEQ_SCREENING { Channel.fromSamplesheet("input") .map{ meta, fastq_1, fastq_2, x, y, z -> // x (reference), y (protospacer), and z (template) are part of the targeted workflows and we don't need them - if (!fastq_2) { - return [ meta, [ fastq_1 ] ] - } else { - return [ meta, [ fastq_1, fastq_2 ] ] - } + return [ meta + [ single_end:fastq_2?false:true ], fastq_2?[ fastq_1, fastq_2 ]:[ fastq_1 ] ] } .set { ch_input } @@ -95,19 +97,26 @@ workflow CRISPRSEQ_SCREENING { FASTQC ( ch_input ) + + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_input - .map { meta, fastq -> - [meta.condition, fastq] + .map { meta, fastq -> + [meta.condition, fastq, meta.single_end] } .reduce { a, b -> - ["${a[0]},${b[0]}", a[1] + b[1]] + if(a[2] != b[2] ) { + error "Your samplesheet contains a mix of single-end and paired-end data. This is not supported." + } + return ["${a[0]},${b[0]}", a[1] + b[1], b[2]] } - .map { condition, fastqs -> - [[id: condition], fastqs] + .map { condition, fastqs, single_end -> + [[id: condition, single_end: single_end], fastqs] } .set { joined } + joined.dump(tag:"metadata") // // MODULE: Run mageck count @@ -145,13 +154,48 @@ workflow CRISPRSEQ_SCREENING { if(params.rra_contrasts) { Channel.fromPath(params.rra_contrasts) - .splitCsv(header:true, sep:',' ) + .splitCsv(header:true, sep:';' ) .set { ch_contrasts } counts = ch_contrasts.combine(ch_counts) MAGECK_TEST ( counts ) + + MAGECK_GRAPHRRA ( + MAGECK_TEST.out.gene_summary + ) + } + + if(params.rra_contrasts) { + Channel.fromPath(params.rra_contrasts) + .splitCsv(header:true, sep:';' ) + .set { ch_bagel } + counts = ch_bagel.combine(ch_counts) + + //Define non essential and essential genes channels for bagel2 + ch_bagel_reference_essentials= Channel.value(params.bagel_reference_essentials) + ch_bagel_reference_nonessentials= Channel.value(params.bagel_reference_nonessentials) + + BAGEL2_FC ( + counts + ) + + BAGEL2_BF ( + BAGEL2_FC.out.foldchange, + ch_bagel_reference_essentials, + ch_bagel_reference_nonessentials + ) + + ch_bagel_pr = BAGEL2_BF.out.bf.combine(ch_bagel_reference_essentials) + .combine(ch_bagel_reference_nonessentials) + + BAGEL2_PR ( + ch_bagel_pr + ) + BAGEL2_GRAPH ( + BAGEL2_PR.out.pr + ) } if(params.mle_design_matrix) {