fix: fixes multiple issues with the recent cbioportal export changes (#425)

Co-authored-by: Manuela Benary <manuela.benary@bihealth.de>
bihealth · Jul 27, 2023 · 740dda5 · 740dda5
1 parent a40a42d
commit 740dda5
Show file tree

Hide file tree

Showing 17 changed files with 44,343 additions and 116 deletions.
diff --git a/snappy_pipeline/workflows/cbioportal_export/__init__.py b/snappy_pipeline/workflows/cbioportal_export/__init__.py
@@ -104,9 +104,9 @@
       study_description: REQUIRED               # REQUIRED
       study_name: REQUIRED                      # REQUIRED
       study_name_short: REQUIRED                # REQUIRED
-    patient_info: []              # Unimplemented
-    sample_info: []               # Each entry must have a path & a step associated, see example below
-    # - step: tumor_mutational_burden
+    patient_info: {}              # Unimplemented
+    sample_info: {}               # Each additional sample column must have a name and a (possibly empty) config attached.
+    # tumor_mutational_burden:
     #   path: ../tumor_mutational_burden
 """
 

diff --git a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py
@@ -108,8 +108,8 @@
         "vcf_md5": ".vcf.gz.md5",
         "vcf_tbi": ".vcf.gz.tbi",
         "vcf_tbi_md5": ".vcf.gz.tbi.md5",
-        "full": ".full.vcf.gz",
-        "full_md5": ".full.vcf.gz.md5",
+        "full_vcf": ".full.vcf.gz",
+        "full_vcf_md5": ".full.vcf.gz.md5",
         "full_vcf_tbi": ".full.vcf.gz.tbi",
         "full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
         "txt": ".txt",
@@ -122,8 +122,8 @@
         "vcf_md5": ".vcf.gz.md5",
         "vcf_tbi": ".vcf.gz.tbi",
         "vcf_tbi_md5": ".vcf.gz.tbi.md5",
-        "full": ".full.vcf.gz",
-        "full_md5": ".full.vcf.gz.md5",
+        "full_vcf": ".full.vcf.gz",
+        "full_vcf_md5": ".full.vcf.gz.md5",
         "full_vcf_tbi": ".full.vcf.gz.tbi",
         "full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
         "tar": ".tar.gz",
@@ -134,8 +134,8 @@
         "vcf_md5": ".vcf.gz.md5",
         "vcf_tbi": ".vcf.gz.tbi",
         "vcf_tbi_md5": ".vcf.gz.tbi.md5",
-        "full": ".full.vcf.gz",
-        "full_md5": ".full.vcf.gz.md5",
+        "full_vcf": ".full.vcf.gz",
+        "full_vcf_md5": ".full.vcf.gz.md5",
         "full_vcf_tbi": ".full.vcf.gz.tbi",
         "full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
     },
@@ -675,8 +675,8 @@ def get_output_files(self, action):
             }
         if action == "filter":
             exts = {
-                "full": ".full.vcf.gz",
-                "full_md5": ".full.vcf.gz.md5",
+                "full_vcf": ".full.vcf.gz",
+                "full_vcf_md5": ".full.vcf.gz.md5",
                 "full_vcf_tbi": ".full.vcf.gz.tbi",
                 "full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
                 "vcf": ".vcf.gz",
@@ -818,8 +818,8 @@ class Strelka2StepPart(SomaticVariantCallingStepPart):
         "vcf_md5": ".vcf.gz.md5",
         "vcf_tbi": ".vcf.gz.tbi",
         "vcf_tbi_md5": ".vcf.gz.tbi.md5",
-        "full": ".full.vcf.gz",
-        "full_md5": ".full.vcf.gz.md5",
+        "full_vcf": ".full.vcf.gz",
+        "full_vcf_md5": ".full.vcf.gz.md5",
         "full_vcf_tbi": ".full.vcf.gz.tbi",
         "full_vcf_tbi_md5": ".full.vcf.gz.tbi.md5",
         "stats": ".tsv",

diff --git a/snappy_wrappers/wrappers/cbioportal/clinical_data/wrapper.py b/snappy_wrappers/wrappers/cbioportal/clinical_data/wrapper.py
@@ -80,15 +80,13 @@ def write_clinical_samples_tsv(donors):
     """
 
     sample_info_getters = []
-    for extra_info in snakemake.config["step_config"]["cbioportal_export"]["sample_info"]:
-        step = extra_info["step"]
+    config = snakemake.config["step_config"]["cbioportal_export"]
+    for step, extra_info in config["sample_info"].items():
         if step == "tumor_mutational_burden":
             sample_info_getters.append(
                 SampleInfoTMB(
                     extra_info,
-                    somatic_variant_tool=snakemake.config["step_config"]["cbioportal_export"][
-                        "somatic_variant_calling_tool"
-                    ],
+                    somatic_variant_tool=config["somatic_variant_calling_tool"],
                 )
             )
         else:

diff --git a/snappy_wrappers/wrappers/cbioportal/generate_cna/environment.yaml b/snappy_wrappers/wrappers/cbioportal/generate_cna/environment.yaml
@@ -1,4 +1,4 @@
-name: merge_tables
+name: generate_cna
 
 channels:
 - conda-forge

diff --git a/snappy_wrappers/wrappers/cbioportal/generate_cna/script.R b/snappy_wrappers/wrappers/cbioportal/generate_cna/script.R
@@ -1,5 +1,3 @@
-require(magrittr)
-
 #' Compute discrete copy number & log2 on genes
 #'
 #' @param fn names of the cns file

diff --git a/snappy_wrappers/wrappers/cbioportal/helper_functions.R b/snappy_wrappers/wrappers/cbioportal/helper_functions.R
@@ -1,5 +1,3 @@
-require(magrittr)
-
 #' Extracts the feature names & one column of values from tab-delimited sample files
 #' and return a data.frame with the values for each sample in the columns, and 
 #' the feature names as row name.
@@ -34,7 +32,7 @@ read_sample_files <- function(fns, featureCol, valueCol, header=TRUE) {
         if (is.null(tbl)) {
             tbl <- tmp
         } else {
-            tbl <- tbl %>% dplyr::full_join(tmp, by=featureCol)
+            tbl <- tbl |> dplyr::full_join(tmp, by=featureCol)
         }
     }
     tbl <- tbl[!grepl("_PAR_Y$", tbl[[featureCol]]),,drop=FALSE]
@@ -75,7 +73,7 @@ map_feature_id <- function(mat, mappings, from, to, method=c("sum", "max", "maxa
     if (!all(c(from, to) %in% colnames(mappings)))
         stop("Feature id type ", from, " or ", to, " not in mappings table")
 
-    mappings <- mappings[,c(from, to)] %>% dplyr::distinct()
+    mappings <- mappings[,c(from, to)] |> dplyr::distinct()
     mappings <- mappings[mappings[[from]] %in% rownames(mat),,drop=FALSE]
 
     n <- sum(rownames(mat) %in% mappings[[from]])
@@ -189,7 +187,7 @@ get_id_mappings <- function(org_obj, verbose=FALSE) {
             if (verbose) cat("Gene id mappings taken from file ", org_obj)
             id_mappings <- read.table(org_obj, sep="\t", header=1, stringsAsFactors=FALSE, check.names=FALSE, quote="", comment="")
             if (all(c("hgnc_symbol", "ensembl_canonical_gene", "entrez_gene_id") %in% colnames(id_mappings)))
-                id_mappings <- id_mappings %>%
+                id_mappings <- id_mappings |>
                     dplyr::select(ENSEMBL=ensembl_canonical_gene, SYMBOL=hgnc_symbol, ENTREZ_ID=entrez_gene_id)
             stopifnot(all(c("ENSEMBL", "SYMBOL", "ENTREZ_ID") %in% colnames(id_mappings)))
             id_mappings <- id_mappings[,c("ENSEMBL", "SYMBOL", "ENTREZ_ID")]
@@ -205,12 +203,12 @@ get_id_mappings <- function(org_obj, verbose=FALSE) {
             )
         }
     }
-    id_mappings <- id_mappings %>%
-        dplyr::select(ENSEMBL, SYMBOL, ENTREZ_ID) %>%
-        dplyr::mutate(ENTREZ_ID=as.character(ENTREZ_ID)) %>%
-        dplyr::filter(!is.na(ENSEMBL) & ENSEMBL!="") %>%
-        dplyr::filter(!is.na(SYMBOL) & SYMBOL!="") %>%
-        dplyr::filter(!is.na(ENTREZ_ID) & ENTREZ_ID!="" & grepl("^[0-9]+$", ENTREZ_ID)) %>%
+    id_mappings <- id_mappings |>
+        dplyr::select(ENSEMBL, SYMBOL, ENTREZ_ID) |>
+        dplyr::mutate(ENTREZ_ID=as.character(ENTREZ_ID)) |>
+        dplyr::filter(!is.na(ENSEMBL) & ENSEMBL!="") |>
+        dplyr::filter(!is.na(SYMBOL) & SYMBOL!="") |>
+        dplyr::filter(!is.na(ENTREZ_ID) & ENTREZ_ID!="" & grepl("^[0-9]+$", ENTREZ_ID)) |>
         dplyr::distinct()
 
     id_mappings

diff --git a/snappy_wrappers/wrappers/cbioportal/merge_tables/script.R b/snappy_wrappers/wrappers/cbioportal/merge_tables/script.R
@@ -1,5 +1,3 @@
-require(magrittr)
-
 #' Read & merge sample files into cBioPortal gene-based data table.
 #'
 #' Used for expression tables, log2 CNA, pseudo-Gistic table.
@@ -39,7 +37,7 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
     }
 
     if (type == "gistic") {
-        stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
+        stopifnot(all(c("pipeline_id") %in% names(args)))
         # Copy numbers (in "cn" column) are transformed into (pseudo-) gistic codes:
         # 0: Deep deletion, 1: heterozygous deletion, 2: copy number neutral, 3: gain, 4: amplification
         # In https://doi.org/10.1038/s41586-022-04738-6, the amplification is defined as 
@@ -59,7 +57,7 @@ merge_tables <- function(fns, mappings, type=c("log2", "gistic", "segment", "exp
     }
 
     if (type == "log2") {
-        stopifnot(all(c("pipeline_id") %in% names(args)))
+        stopifnot(all(c("pipeline_id", "amplification") %in% names(args)))
         tmp <- read_sample_files(fns, args$pipeline_id, "log2")
         method <- "max"
     }
@@ -107,6 +105,10 @@ merge_segments <- function(fns) {
     for (sample_id in names(fns)) {
         cat("Loading", fns[sample_id], "for sample", sample_id, "\n")
         tmp <- read.table(fns[sample_id], sep="\t", header=1, stringsAsFactors=FALSE, check.names=FALSE)
+        if (!all(col_names %in% colnames(tmp))) {
+            i <- match(names(col_names), colnames(tmp))
+            if (any(!is.na(i))) colnames(tmp)[i[!is.na(i)]] <- col_names[!is.na(i)]
+        }
         stopifnot(all(names(col_names) %in% colnames(tmp)))
         tmp <- tmp[,names(col_names)]
         colnames(tmp) <- col_names    
@@ -147,7 +149,7 @@ compute_rpkm <- function(counts, tx_obj=TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.
     if (verbose) cat("Create DESeq2 object ... ")
     genes <- GenomicFeatures::exonsBy(tx_obj, "gene")
     genes <- genes[names(genes) %in% rownames(counts)]
-    counts <- counts[names(genes),]
+    counts <- counts[names(genes),,drop=FALSE]
     donors <- data.frame(Donor=colnames(counts), stringsAsFactors=FALSE)
     dds <- DESeq2::DESeqDataSetFromMatrix(counts, colData=donors, design=as.formula("~ 1"), rowRanges=genes)
     if (verbose) cat("Done\n")

diff --git a/snappy_wrappers/wrappers/mutect2/filter/wrapper.py b/snappy_wrappers/wrappers/mutect2/filter/wrapper.py
@@ -84,21 +84,21 @@
     --ob-priors $tmpdir/read-orientation-model.tar.gz \
     --stats {snakemake.input.stats} \
     --variant $tmpdir/in.vcf \
-    --output {snakemake.output.full}
+    --output {snakemake.output.full_vcf}
 
 # Index & move to final dest
-tabix -f {snakemake.output.full}
+tabix -f {snakemake.output.full_vcf}
 
 # Keep only PASS variants in main output
-bcftools view -i 'FILTER="PASS"' -O z -o {snakemake.output.vcf} {snakemake.output.full}
+bcftools view -i 'FILTER="PASS"' -O z -o {snakemake.output.vcf} {snakemake.output.full_vcf}
 tabix -f {snakemake.output.vcf}
 
 pushd $(dirname {snakemake.output.vcf})
 fn=$(basename {snakemake.output.vcf})
 md5sum $fn > $fn.md5
 fn=$(basename {snakemake.output.vcf_tbi})
 md5sum $fn > $fn.md5
-fn=$(basename {snakemake.output.full})
+fn=$(basename {snakemake.output.full_vcf})
 md5sum $fn > $fn.md5
 fn=$(basename {snakemake.output.full_vcf_tbi})
 md5sum $fn > $fn.md5

diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/environment.yaml b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/environment.yaml
@@ -1,7 +1,8 @@
 channels:
-- bioconda
-- conda-forge
+  - conda-forge
+  - bioconda
+
 dependencies:
-- python >=3.8
-- vcfpy
-- ruamel.yaml
+  - python >=3.8
+  - vcfpy
+  - ruamel.yaml
diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/functions.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/functions.py
@@ -8,7 +8,7 @@
 from common_functions import calc_end_pos, minimize_mutation, strip_sequence_version, variant_type
 import exceptions
 from protein_mutation_parser import parse_protein_mutation
-from variant_classification import variant_classification
+from variant_classification import variant_classification_jannovar, variant_classification_vep
 
 # from action import Action
 
@@ -27,7 +27,8 @@ def __init__(self, config: typing.Dict[str, typing.Any]):
         self.compiled["strip_sequence_version"] = strip_sequence_version
         self.compiled["variant_type"] = variant_type
         self.compiled["parse_protein_mutation"] = parse_protein_mutation
-        self.compiled["variant_classification"] = variant_classification
+        self.compiled["variant_classification_vep"] = variant_classification_vep
+        self.compiled["variant_classification_jannovar"] = variant_classification_jannovar
 
         for col_name, col_def in config["output"].items():
             if "function" in col_def and not col_def["function"] in self.compiled: