Skip to content

Commit

Permalink
fix: fixes multiple issues with the recent cbioportal export changes (#425)
Browse files Browse the repository at this point in the history

Co-authored-by: Manuela Benary <manuela.benary@bihealth.de>
  • Loading branch information
ericblanc20 and Manuela Benary authored Jul 27, 2023
1 parent a40a42d commit 740dda5
Show file tree
Hide file tree
Showing 17 changed files with 44,343 additions and 116 deletions.
6 changes: 3 additions & 3 deletions snappy_pipeline/workflows/cbioportal_export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@
study_description: REQUIRED # REQUIRED
study_name: REQUIRED # REQUIRED
study_name_short: REQUIRED # REQUIRED
patient_info: [] # Unimplemented
sample_info: [] # Each entry must have a path & a step associated, see example below
# - step: tumor_mutational_burden
patient_info: {} # Unimplemented
sample_info: {} # Each additional sample column must have a name and a (possibly empty) config attached.
# tumor_mutational_burden:
# path: ../tumor_mutational_burden
"""

Expand Down
20 changes: 10 additions & 10 deletions snappy_pipeline/workflows/somatic_variant_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@
"vcf_md5": ".vcf.gz.md5",
"vcf_tbi": ".vcf.gz.tbi",
"vcf_tbi_md5": ".vcf.gz.tbi.md5",
"full": ".full.vcf.gz",
"full_md5": ".full.vcf.gz.md5",
"full_vcf": ".full.vcf.gz",
"full_vcf_md5": ".full.vcf.gz.md5",
"full_vcf_tbi": ".full.vcf.gz.tbi",
"full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
"txt": ".txt",
Expand All @@ -122,8 +122,8 @@
"vcf_md5": ".vcf.gz.md5",
"vcf_tbi": ".vcf.gz.tbi",
"vcf_tbi_md5": ".vcf.gz.tbi.md5",
"full": ".full.vcf.gz",
"full_md5": ".full.vcf.gz.md5",
"full_vcf": ".full.vcf.gz",
"full_vcf_md5": ".full.vcf.gz.md5",
"full_vcf_tbi": ".full.vcf.gz.tbi",
"full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
"tar": ".tar.gz",
Expand All @@ -134,8 +134,8 @@
"vcf_md5": ".vcf.gz.md5",
"vcf_tbi": ".vcf.gz.tbi",
"vcf_tbi_md5": ".vcf.gz.tbi.md5",
"full": ".full.vcf.gz",
"full_md5": ".full.vcf.gz.md5",
"full_vcf": ".full.vcf.gz",
"full_vcf_md5": ".full.vcf.gz.md5",
"full_vcf_tbi": ".full.vcf.gz.tbi",
"full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
},
Expand Down Expand Up @@ -675,8 +675,8 @@ def get_output_files(self, action):
}
if action == "filter":
exts = {
"full": ".full.vcf.gz",
"full_md5": ".full.vcf.gz.md5",
"full_vcf": ".full.vcf.gz",
"full_vcf_md5": ".full.vcf.gz.md5",
"full_vcf_tbi": ".full.vcf.gz.tbi",
"full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
"vcf": ".vcf.gz",
Expand Down Expand Up @@ -818,8 +818,8 @@ class Strelka2StepPart(SomaticVariantCallingStepPart):
"vcf_md5": ".vcf.gz.md5",
"vcf_tbi": ".vcf.gz.tbi",
"vcf_tbi_md5": ".vcf.gz.tbi.md5",
"full": ".full.vcf.gz",
"full_md5": ".full.vcf.gz.md5",
"full_vcf": ".full.vcf.gz",
"full_vcf_md5": ".full.vcf.gz.md5",
"full_vcf_tbi": ".full.vcf.gz.tbi",
"full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
"stats": ".tsv",
Expand Down
8 changes: 3 additions & 5 deletions snappy_wrappers/wrappers/cbioportal/clinical_data/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,13 @@ def write_clinical_samples_tsv(donors):
"""

sample_info_getters = []
for extra_info in snakemake.config["step_config"]["cbioportal_export"]["sample_info"]:
step = extra_info["step"]
config = snakemake.config["step_config"]["cbioportal_export"]
for step, extra_info in config["sample_info"].items():
if step == "tumor_mutational_burden":
sample_info_getters.append(
SampleInfoTMB(
extra_info,
somatic_variant_tool=snakemake.config["step_config"]["cbioportal_export"][
"somatic_variant_calling_tool"
],
somatic_variant_tool=config["somatic_variant_calling_tool"],
)
)
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: merge_tables
name: generate_cna

channels:
- conda-forge
Expand Down
2 changes: 0 additions & 2 deletions snappy_wrappers/wrappers/cbioportal/generate_cna/script.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require(magrittr)

#' Compute discrete copy number & log2 on genes
#'
#' @param fn names of the cns file
Expand Down
20 changes: 9 additions & 11 deletions snappy_wrappers/wrappers/cbioportal/helper_functions.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require(magrittr)

#' Extracts the feature names & one column of values from tab-delimited sample files
#' and returns a data.frame with the values for each sample in the columns, and
#' the feature names as row names.
Expand Down Expand Up @@ -34,7 +32,7 @@ read_sample_files <- function(fns, featureCol, valueCol, header=TRUE) {
if (is.null(tbl)) {
tbl <- tmp
} else {
tbl <- tbl %>% dplyr::full_join(tmp, by=featureCol)
tbl <- tbl |> dplyr::full_join(tmp, by=featureCol)
}
}
tbl <- tbl[!grepl("_PAR_Y$", tbl[[featureCol]]),,drop=FALSE]
Expand Down Expand Up @@ -75,7 +73,7 @@ map_feature_id <- function(mat, mappings, from, to, method=c("sum", "max", "maxa
if (!all(c(from, to) %in% colnames(mappings)))
stop("Feature id type ", from, " or ", to, " not in mappings table")

mappings <- mappings[,c(from, to)] %>% dplyr::distinct()
mappings <- mappings[,c(from, to)] |> dplyr::distinct()
mappings <- mappings[mappings[[from]] %in% rownames(mat),,drop=FALSE]

n <- sum(rownames(mat) %in% mappings[[from]])
Expand Down Expand Up @@ -189,7 +187,7 @@ get_id_mappings <- function(org_obj, verbose=FALSE) {
if (verbose) cat("Gene id mappings taken from file ", org_obj)
id_mappings <- read.table(org_obj, sep="\t", header=1, stringsAsFactors=FALSE, check.names=FALSE, quote="", comment="")
if (all(c("hgnc_symbol", "ensembl_canonical_gene", "entrez_gene_id") %in% colnames(id_mappings)))
id_mappings <- id_mappings %>%
id_mappings <- id_mappings |>
dplyr::select(ENSEMBL=ensembl_canonical_gene, SYMBOL=hgnc_symbol, ENTREZ_ID=entrez_gene_id)
stopifnot(all(c("ENSEMBL", "SYMBOL", "ENTREZ_ID") %in% colnames(id_mappings)))
id_mappings <- id_mappings[,c("ENSEMBL", "SYMBOL", "ENTREZ_ID")]
Expand All @@ -205,12 +203,12 @@ get_id_mappings <- function(org_obj, verbose=FALSE) {
)
}
}
id_mappings <- id_mappings %>%
dplyr::select(ENSEMBL, SYMBOL, ENTREZ_ID) %>%
dplyr::mutate(ENTREZ_ID=as.character(ENTREZ_ID)) %>%
dplyr::filter(!is.na(ENSEMBL) & ENSEMBL!="") %>%
dplyr::filter(!is.na(SYMBOL) & SYMBOL!="") %>%
dplyr::filter(!is.na(ENTREZ_ID) & ENTREZ_ID!="" & grepl("^[0-9]+$", ENTREZ_ID)) %>%
id_mappings <- id_mappings |>
dplyr::select(ENSEMBL, SYMBOL, ENTREZ_ID) |>
dplyr::mutate(ENTREZ_ID=as.character(ENTREZ_ID)) |>
dplyr::filter(!is.na(ENSEMBL) & ENSEMBL!="") |>
dplyr::filter(!is.na(SYMBOL) & SYMBOL!="") |>
dplyr::filter(!is.na(ENTREZ_ID) & ENTREZ_ID!="" & grepl("^[0-9]+$", ENTREZ_ID)) |>
dplyr::distinct()

id_mappings
Expand Down
12 changes: 7 additions & 5 deletions snappy_wrappers/wrappers/cbioportal/merge_tables/script.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require(magrittr)

#' Read & merge sample files into cBioPortal gene-based data table.
#'
#' Used for expression tables, log2 CNA, pseudo-Gistic table.
Expand Down Expand Up @@ -39,7 +37,7 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
}

if (type == "gistic") {
stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
stopifnot(all(c("pipeline_id") %in% names(args)))
# Copy numbers (in "cn" column) are transformed into (pseudo-) gistic codes:
# 0: Deep deletion, 1: heterozygous deletion, 2: copy number neutral, 3: gain, 4: amplification
# In https://doi.org/10.1038/s41586-022-04738-6, the amplification is defined as
Expand All @@ -59,7 +57,7 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
}

if (type == "log2") {
stopifnot(all(c("pipeline_id") %in% names(args)))
stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
tmp <- read_sample_files(fns, args$pipeline_id, "log2")
method <- "max"
}
Expand Down Expand Up @@ -107,6 +105,10 @@ merge_segments <- function(fns) {
for (sample_id in names(fns)) {
cat("Loading", fns[sample_id], "for sample", sample_id, "\n")
tmp <- read.table(fns[sample_id], sep="\t", header=1, stringsAsFactors=FALSE, check.names=FALSE)
if (!all(col_names %in% colnames(tmp))) {
i <- match(names(col_names), colnames(tmp))
if (any(!is.na(i))) colnames(tmp)[i[!is.na(i)]] <- col_names[!is.na(i)]
}
stopifnot(all(names(col_names) %in% colnames(tmp)))
tmp <- tmp[,names(col_names)]
colnames(tmp) <- col_names
Expand Down Expand Up @@ -147,7 +149,7 @@ compute_rpkm <- function(counts, tx_obj=TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.
if (verbose) cat("Create DESeq2 object ... ")
genes <- GenomicFeatures::exonsBy(tx_obj, "gene")
genes <- genes[names(genes) %in% rownames(counts)]
counts <- counts[names(genes),]
counts <- counts[names(genes),,drop=FALSE]
donors <- data.frame(Donor=colnames(counts), stringsAsFactors=FALSE)
dds <- DESeq2::DESeqDataSetFromMatrix(counts, colData=donors, design=as.formula("~ 1"), rowRanges=genes)
if (verbose) cat("Done\n")
Expand Down
8 changes: 4 additions & 4 deletions snappy_wrappers/wrappers/mutect2/filter/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,21 +84,21 @@
--ob-priors $tmpdir/read-orientation-model.tar.gz \
--stats {snakemake.input.stats} \
--variant $tmpdir/in.vcf \
--output {snakemake.output.full}
--output {snakemake.output.full_vcf}
# Index & move to final dest
tabix -f {snakemake.output.full}
tabix -f {snakemake.output.full_vcf}
# Keep only PASS variants in main output
bcftools view -i 'FILTER="PASS"' -O z -o {snakemake.output.vcf} {snakemake.output.full}
bcftools view -i 'FILTER="PASS"' -O z -o {snakemake.output.vcf} {snakemake.output.full_vcf}
tabix -f {snakemake.output.vcf}
pushd $(dirname {snakemake.output.vcf})
fn=$(basename {snakemake.output.vcf})
md5sum $fn > $fn.md5
fn=$(basename {snakemake.output.vcf_tbi})
md5sum $fn > $fn.md5
fn=$(basename {snakemake.output.full})
fn=$(basename {snakemake.output.full_vcf})
md5sum $fn > $fn.md5
fn=$(basename {snakemake.output.full_vcf_tbi})
md5sum $fn > $fn.md5
Expand Down
11 changes: 6 additions & 5 deletions snappy_wrappers/wrappers/vcf2maf/vcf_to_table/environment.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
channels:
- bioconda
- conda-forge
- conda-forge
- bioconda

dependencies:
- python >=3.8
- vcfpy
- ruamel.yaml
- python >=3.8
- vcfpy
- ruamel.yaml
5 changes: 3 additions & 2 deletions snappy_wrappers/wrappers/vcf2maf/vcf_to_table/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from common_functions import calc_end_pos, minimize_mutation, strip_sequence_version, variant_type
import exceptions
from protein_mutation_parser import parse_protein_mutation
from variant_classification import variant_classification
from variant_classification import variant_classification_jannovar, variant_classification_vep

# from action import Action

Expand All @@ -27,7 +27,8 @@ def __init__(self, config: typing.Dict[str, typing.Any]):
self.compiled["strip_sequence_version"] = strip_sequence_version
self.compiled["variant_type"] = variant_type
self.compiled["parse_protein_mutation"] = parse_protein_mutation
self.compiled["variant_classification"] = variant_classification
self.compiled["variant_classification_vep"] = variant_classification_vep
self.compiled["variant_classification_jannovar"] = variant_classification_jannovar

for col_name, col_def in config["output"].items():
if "function" in col_def and not col_def["function"] in self.compiled:
Expand Down
Loading

0 comments on commit 740dda5

Please sign in to comment.