diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml index 0e73a72b2..769333328 100644 --- a/snappy_pipeline/apps/tpls/project_config.yaml +++ b/snappy_pipeline/apps/tpls/project_config.yaml @@ -2,20 +2,48 @@ # # created: %(created_at)s -# Step Configuration ============================================================================== +# Static data Configuration ======================================================================= # -# Configuration for paths with static data. This has been preconfigured for the paths on the BIH -# cluster. +# Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names). +# This has been preconfigured for the paths on the BIH cluster. # -static_data_config: +static_data_config: # hs37d5 cosmic: - path: /fast/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz dbnsfp: - path: /fast/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz dbsnp: - path: /fast/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz reference: - path: /fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + path: /data/cephfs-1/work/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + features: + path: /data/cephfs-1/work/projects/cubit/current/static_data/annotation/GENCODE/19/GRCh37/gencode.v19.annotation.gtf + +# Step Configuration ============================================================================== +# +# Configuration for paths with static data (GRCh38.d1.vd1 genome release, includes decoys & viral sequences). +# This has been preconfigured for the paths on the BIH cluster (CUBI group members only). +# +# Notes: +# - GRCh38.d1.vd1 is the genome release used by the GDC consortium +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#alignment-workflow) +# - GENCODE release 36 is the feature annotation used by the GDC consortium for its RNA pipeline +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow) +# - The GENCODE release 36 corresponds to ENSEMBL release 102 +# (https://www.gencodegenes.org/human/releases.html) +# - Some files have not yet been moved to Tier 1. +# +# static_data_config: # GRCh38.d1.vd1 +# cosmic: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz +# dbnsfp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/3.5/GRCh38/dbNSFP.txt.gz +# dbsnp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh38/common_all_20160407.vcf.gz +# reference: +# path: /fast/work/groups/cubi/projects/biotools/static_data/reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa +# features: +# path: /fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotation/GENCODE/36/gencode.v36.primary_assembly.annotation.gtf # Step Configuration ============================================================================== # diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py index 468f0bf96..273511d83 100644 --- a/snappy_pipeline/models/annotation.py +++ b/snappy_pipeline/models/annotation.py @@ -9,9 +9,74 @@ class VepTxFlag(enum.StrEnum): merged = "merged" +class VepPickOrder(enum.StrEnum): + biotype = "biotype" + """Biotype of transcript (protein_coding preferred)""" + mane = "mane" + """MANE transcript status""" + appris = "appris" + """APPRIS isoform annotation""" + tsl = "tsl" + """Transcript support level""" + ccds = "ccds" + """CCDS status of transcript""" + canonical = "canonical" + """Canonical status of transcript""" + rank = "rank" + """Consequence rank""" + length = "length" + """Translated, transcript or feature length (longer preferred)""" + mane_select = "mane_select" + """MANE Select status (available from version 103)""" + mane_plus_clinical = "mane_plus_clinical" + """MANE Plus Clinical transcript status (available from version 103)""" + ensembl = "ensembl" + """Undocumented (not available in 102)""" + refseq = "refseq" + """Undocumented (not available in 102)""" + + +class VepOutputOptions(enum.StrEnum): + everything = "everything" + """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains""" + sift = "sift b" + polyphen = "polyphen b" + ccds = "ccds" + uniprot = "uniprot" + hgvs = "hgvs" + symbol = "symbol" + numbers = "numbers" + domains = "domains" + regulatory = "regulatory" + canonical = "canonical" + protein = "protein" + biotype = "biotype" + tsl = "tsl" + appris = "appris" + gene_phenotype = "gene_phenotype" + af = "af" + af_1kg = "af_1kg" + af_esp = "af_esp" + max_af = "max_af" + pubmed = "pubmed" + mane = "mane" + variant_class = "variant_class" + var_synonyms = "var_synonyms" + """Removed since version 106""" + af_gnomad = "af_gnomad" + """ + Superseded by 'af_gnomade' & 'af_gnomadg' for version 107. + 'af_gnomad' has the same function as 'af_gnomade' + """ + af_gnomade = "af_gnomade" + af_gnomadg = "af_gnomadg" + mirna = "mirna" + """Available from version 109""" + + class Vep(SnappyModel): cache_dir: str = "" - """Defaults to $HOME/.vep Not a good idea on the cluster""" + """Defaults to $HOME/.vep Not a good idea on the cluster, because the database is very large.""" species: str = "homo_sapiens" @@ -23,16 +88,23 @@ class Vep(SnappyModel): tx_flag: VepTxFlag = VepTxFlag.gencode_basic """The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged".""" - pick_order: list[str] = [ - "biotype", - "mane", - "appris", - "tsl", - "ccds", - "canonical", - "rank", - "length", + pick_order: list[VepPickOrder] = [ + VepPickOrder.biotype, + VepPickOrder.mane, + VepPickOrder.appris, + VepPickOrder.tsl, + VepPickOrder.ccds, + VepPickOrder.canonical, + VepPickOrder.rank, + VepPickOrder.length, ] + """ + Ranking of transcripts returned by VEP. + Important when only one transcript is selected for annotation. + The default order is different from the default order proposed by VEP. + Here, variants in protein coding transcripts will be annotated before MANE transcripts. + """ + num_threads: int = 8 buffer_size: int = 1000 - output_options: list[str] = ["everything"] + output_options: list[VepOutputOptions] = [VepOutputOptions.everything] diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 30968a324..30b024baf 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -34,15 +34,18 @@ class Tools(SnappyModel): class TargetCoverageReportEntry(SnappyModel): """ - Mapping from enrichment kit to target region BED file, for either computing per--target + Mapping from enrichment kit to target region BED file, for either computing per-target region coverage or selecting targeted exons. The following will match both the stock IDT library kit and the ones - with spike-ins seen fromr Yale genomics. The path above would be + with spike-ins seen from Yale genomics. The path above would be mapped to the name "default". - name: IDT_xGen_V1_0 pattern: "xGen Exome Research Panel V1\\.0*" path: "path/to/targets.bed" + + Bed file for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent """ name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] @@ -72,7 +75,14 @@ class BwaMode(Enum): class BwaMapper(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/data/cephfs-1/work/projects/cubit/current/static_data/precomputed/BWA/0.7.17/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA/0.7.17/GRCh38.d1.vd1.fa", + "/data/cephfs-1/work/groups/cubi/projects/biotools/bwa-mem2/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA_MEM2/2.2.1/GRCh38.d1.vd1.fa", + ] + ) """Required if listed in ngs_mapping.tools.dna; otherwise, can be removed.""" num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -145,8 +155,13 @@ class Somatic(SnappyModel): class Bqsr(SnappyModel): - common_variants: str - """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" + common_variants: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ) + """Common germline variants (see https://console.cloud.google.com/storage/browser/gatk-best-practices)""" class AgentLibPrepType(Enum): @@ -158,7 +173,11 @@ class AgentLibPrepType(Enum): class AgentPrepare(SnappyModel): - path: str + path: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/trimmer-3.0.5.jar" + ] + ) lib_prep_type: AgentLibPrepType = None """One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)""" @@ -174,8 +193,17 @@ class AgentMarkDuplicatesConsensusMode(Enum): class AgentMarkDuplicates(SnappyModel): - path: str + path: str = Field( + examples=["/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/creak-1.0.5.jar"] + ) path_baits: str + """ + Different exome panels cannot be accomodated here, because the selection method used for coverage is not used. + The absolute path of the exome panel must be input. + + Bed file for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent + """ consensus_mode: AgentMarkDuplicatesConsensusMode = None """One of "SINGLE", "HYBRID", "DUPLEX" """ @@ -194,7 +222,12 @@ class Agent(SnappyModel): class Star(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/precomputed/STAR/2.7.10a/100", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/precomputed/STAR/STAR_2.7.10a_100", + ] + ) """Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.""" num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -245,7 +278,12 @@ class Strand(enum.IntEnum): class Strandedness(SnappyModel): - path_exon_bed: str + path_exon_bed: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/gencode.v19.chr_scaff.annotation.cds.collapse_annotation.bed", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/gencode.v36.primary_assembly.annotation.cds.collapse_annotation.bed", + ] + ) """Location of usually highly expressed genes. Known protein coding genes is a good choice""" strand: Strand = Strand.UNKNOWN diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py index 1fbacf9d7..7b40d23b2 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py @@ -16,6 +16,13 @@ class Tool(enum.StrEnum): purecn = "purecn" +class SequenzaQualityEncoding: + sanger = "sanger" + """Base quality format (Phred score). Adds an offset of 33 to the qlimit value""" + illumina = "illumina" + """Base quality format (Phred score). Adds an offset of 64 to the qlimit value""" + + class SequenzaExtraArgs(SnappyModel): hom: float = 0.9 """Threshold to select homozygous positions""" @@ -26,7 +33,7 @@ class SequenzaExtraArgs(SnappyModel): qlimit: float = 20 """Minimum nucleotide quality score for inclusion in the counts""" - qformat: str = "sanger" + qformat: str = SequenzaQualityEncoding.sanger """Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value""" @@ -152,6 +159,23 @@ class GenomeName(enum.StrEnum): canFam3 = "canFam3" +class PureCnModel(enum.StrEnum): + betabin = "betabin" + """beta-binomial ditribution for allelic fractions. Accounts for over-dispersion. Needs more than 10 normals""" + beta = "beta" + """beta distribution for allelic fractions""" + + +class PureCnSegmentation(enum.StrEnum): + PSCBS = "PSCBS" + """Good and safe starting point. Version supporting interval weights downloaded from github, lime1/PSCBS""" + CBS = "CBS" + """Circular binary segmentation. Simple, fast & well-tested, Does not fully support SNP information""" + GATK4 = "GATK4" + """Untested, better choice when number of SNPs per interval is large""" + Hclust = "Hclust" + + class PureCn(SnappyModel): genome_name: Annotated[ GenomeName | Literal["unknown"], @@ -173,8 +197,8 @@ class PureCn(SnappyModel): seed: int = 1234567 extra_commands: dict[str, Any] = { - "model": "betabin", - "fun-segmentation": "PSCBS", + "model": PureCnModel.betabin, + "fun-segmentation": PureCnSegmentation.PSCBS, "post-optimize": "", } """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" @@ -213,7 +237,9 @@ class PureCn(SnappyModel): somatic_variant_caller: str = "mutect2" """ IMPORTANT NOTE: - Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true + Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true". + Using these options, the results will include germline sites required by PureCN. + However, the somatic calls are DIFFERENT from the calls obtained in the somatic_variant_calling step. """ path_somatic_variants: Annotated[str, Field(examples=["../somatic_variant_calling_for_purecn"])] diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py index 5e017785a..7bce21763 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/model.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py @@ -98,13 +98,38 @@ class Mutect2(Parallel): # Sadly a type of # `FilePath | None = None` # still applies `FilePath` validation on `None`, which errors - panel_of_normals: str | None = "" + panel_of_normals: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-exome-panel.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-WGS-panel-b37.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/1000g_pon.hg38.vcf.gz", + ] + ), + ] = None """Set path to panel of normals vcf if required""" - germline_resource: str | None = "" + germline_resource: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/af-only-gnomad.raw.sites.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/af-only-gnomad.hg38.vcf.gz", + ] + ), + ] = None """Germline variants resource (same as panel of normals)""" - common_variants: str | None = "" + common_variants: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ), + ] = None """Common germline variants for contamination estimation""" extra_arguments: Annotated[ @@ -220,7 +245,25 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin): ignore_chroms: Annotated[ list[str], - Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]), + Field( + examples=[ + ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"], + [ + "*_decoy", + "EBV", + "HPV*", + "HBV", + "HCV-*", + "HIV-*", + "HTLV-1", + "CMV", + "KSHV", + "MCV", + "SV40", + "chrUn_GL000220v1", + ], + ] + ), ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"] """Patterns of contig names to ignore""" diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py index 08e808c15..5a4b0db09 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/model.py +++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py @@ -51,7 +51,15 @@ class Bcftools(SnappyModel): include: str = "" """Expression to be used in bcftools view --include""" - exclude: str = "" + exclude: Annotated[ + str, + Field( + examples=[ + "FORMAT/AD[1:1] < 5 | FORMAT/DP[1] < 50 | AD[1:1]/(AD[1:0]+AD[1:1])<0.05", + "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10", + ] + ), + ] = "" """Expression to be used in bcftools view --exclude""" @model_validator(mode="after") diff --git a/snappy_wrappers/wrappers/mutect2/run/wrapper.py b/snappy_wrappers/wrappers/mutect2/run/wrapper.py index 83e8b60f6..a740981ad 100644 --- a/snappy_wrappers/wrappers/mutect2/run/wrapper.py +++ b/snappy_wrappers/wrappers/mutect2/run/wrapper.py @@ -8,13 +8,28 @@ reference = snakemake.config["static_data_config"]["reference"]["path"] config = snakemake.config["step_config"]["somatic_variant_calling"]["mutect2"] -extra_arguments = " ".join(config["extra_arguments"]) - if "normal_bam" in snakemake.input.keys(): normal = f'--normal "{snakemake.params.normal_lib_name}" --input {snakemake.input.normal_bam}' else: normal = "" +if snakemake.params.intervals: + intervals = " ".join([f"--intervals {interval}" for interval in snakemake.params.intervals]) +else: + intervals = "" + +# TODO: move config parameters to snakemake params (check with parallel wrapper) +if config["germline_resource"]: + germline_resource = f"--germline-resource {config['germline_resource']}" +else: + germline_resource = "" +if config["panel_of_normals"]: + panel_of_normals = f"--panel-of-normals {config['panel_of_normals']}" +else: + panel_of_normals = "" + +extra_arguments = " ".join(config["extra_arguments"]) + shell.executable("/bin/bash") shell( @@ -47,16 +62,6 @@ out_base=$tmpdir/$(basename {snakemake.output.raw} .vcf.gz) -# Add intervals if required -intervals="" -if [[ -n "{snakemake.params.args[intervals]}" ]] -then - for itv in "{snakemake.params.args[intervals]}" - do - intervals="$intervals --intervals $itv" - done -fi - gatk Mutect2 \ --tmp-dir $tmpdir \ {normal} \ @@ -64,13 +69,8 @@ --reference {reference} \ --output $out_base.vcf \ --f1r2-tar-gz ${{out_base}}.f1r2.tar.gz \ - $(if [[ -n "{config[germline_resource]}" ]]; then \ - echo --germline-resource {config[germline_resource]} - fi) \ - $(if [[ -n "{config[panel_of_normals]}" ]]; then \ - echo --panel-of-normals {config[panel_of_normals]} - fi) \ - $intervals \ + {germline_resource} {panel_of_normals} \ + {intervals} \ {extra_arguments} rm -f $out_base.vcf.idx