From a7712da2401e21d24f36c707b5b51216f9f099f6 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 5 Jul 2024 15:43:40 +0200 Subject: [PATCH 1/5] docs: Check & update of configuration defaults (first part) --- snappy_pipeline/apps/tpls/project_config.yaml | 40 +++++++++++--- .../workflows/ngs_mapping/model.py | 52 ++++++++++++++++--- .../somatic_variant_calling/model.py | 38 ++++++++++++-- 3 files changed, 113 insertions(+), 17 deletions(-) diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml index 0e73a72b2..1bbe91a06 100644 --- a/snappy_pipeline/apps/tpls/project_config.yaml +++ b/snappy_pipeline/apps/tpls/project_config.yaml @@ -4,18 +4,46 @@ # Step Configuration ============================================================================== # -# Configuration for paths with static data. This has been preconfigured for the paths on the BIH -# cluster. +# Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names). +# This has been preconfigured for the paths on the BIH cluster. 
# static_data_config: cosmic: - path: /fast/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz dbnsfp: - path: /fast/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz dbsnp: - path: /fast/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz reference: - path: /fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + path: /data/cephfs-1/work/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + features: + path: /data/cephfs-1/work/projects/cubit/current/static_data/annotation/GENCODE/19/GRCh37/gencode.v19.annotation.gtf + +# Step Configuration ============================================================================== +# +# Configuration for paths with static data (GRCh38.d1.vd1 genome release, includes decoys & viral sequences). +# This has been preconfigured for the paths on the BIH cluster (CUBI group members only). +# +# Notes: +# - GRCh38.d1.vd1 is the genome release used by the GDC consortium +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#alignment-workflow) +# - GENCODE release 36 is the feature annotation used by the GDC consortium for its RNA pipeline +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow) +# - The GENCODE release 36 corresponds to ENSEMBL release 102 +# (https://www.gencodegenes.org/human/releases.html) +# - Some files have not yet been moved to Tier 1. 
+# +# static_data_config: +# cosmic: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz +# dbnsfp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/3.5/GRCh38/dbNSFP.txt.gz +# dbsnp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh38/common_all_20160407.vcf.gz +# reference: +# path: /fast/work/groups/cubi/projects/biotools/static_data/reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa +# features: +# path: /fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotation/GENCODE/36/gencode.v36.primary_assembly.annotation.gtf # Step Configuration ============================================================================== # diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 30968a324..5ff25fd50 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -43,6 +43,9 @@ class TargetCoverageReportEntry(SnappyModel): - name: IDT_xGen_V1_0 pattern: "xGen Exome Research Panel V1\\.0*" path: "path/to/targets.bed" + + Bed file for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent """ name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] @@ -72,7 +75,14 @@ class BwaMode(Enum): class BwaMapper(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/data/cephfs-1/work/projects/cubit/current/static_data/precomputed/BWA/0.7.17/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA/0.7.17/GRCh38.d1.vd1.fa", + "/data/cephfs-1/work/groups/cubi/projects/biotools/bwa-mem2/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA_MEM2/2.2.1/GRCh38.d1.vd1.fa", + ] + ) """Required if listed in ngs_mapping.tools.dna; otherwise, can be removed.""" 
num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -145,8 +155,13 @@ class Somatic(SnappyModel): class Bqsr(SnappyModel): - common_variants: str - """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" + common_variants: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ) + """Common germline variants (see https://console.cloud.google.com/storage/browser/gatk-best-practices)""" class AgentLibPrepType(Enum): @@ -158,7 +173,11 @@ class AgentLibPrepType(Enum): class AgentPrepare(SnappyModel): - path: str + path: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/trimmer-3.0.5.jar" + ] + ) lib_prep_type: AgentLibPrepType = None """One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)""" @@ -174,8 +193,17 @@ class AgentMarkDuplicatesConsensusMode(Enum): class AgentMarkDuplicates(SnappyModel): - path: str + path: str = Field( + examples=["/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/creak-1.0.5.jar"] + ) path_baits: str + """ + Different exome panels cannot be accommodated here, because the selection method used for coverage is not used. + The absolute path of the exome panel must be input. 
+ + Bed file for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent + """ consensus_mode: AgentMarkDuplicatesConsensusMode = None """One of "SINGLE", "HYBRID", "DUPLEX" """ @@ -194,7 +222,12 @@ class Agent(SnappyModel): class Star(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/precomputed/STAR/2.7.10a/100", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/precomputed/STAR/STAR_2.7.10a_100", + ] + ) """Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.""" num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -245,7 +278,12 @@ class Strand(enum.IntEnum): class Strandedness(SnappyModel): - path_exon_bed: str + path_exon_bed: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/gencode.v19.chr_scaff.annotation.cds.collapse_annotation.bed", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/gencode.v36.primary_assembly.annotation.cds.collapse_annotation.bed", + ] + ) """Location of usually highly expressed genes. 
Known protein coding genes is a good choice""" strand: Strand = Strand.UNKNOWN diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py index 5e017785a..82b973e9a 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/model.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py @@ -98,13 +98,38 @@ class Mutect2(Parallel): # Sadly a type of # `FilePath | None = None` # still applies `FilePath` validation on `None`, which errors - panel_of_normals: str | None = "" + panel_of_normals: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-exome-panel.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-WGS-panel-b37.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/1000g_pon.hg38.vcf.gz", + ] + ), + ] = None """Set path to panel of normals vcf if required""" - germline_resource: str | None = "" + germline_resource: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/af-only-gnomad.raw.sites.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/af-only-gnomad.hg38.vcf.gz", + ] + ), + ] = None """Germline variants resource (same as panel of normals)""" - common_variants: str | None = "" + common_variants: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ), + ] = None """Common germline variants for contamination estimation""" extra_arguments: Annotated[ @@ -220,7 +245,12 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin): ignore_chroms: Annotated[ list[str], - 
Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]), + Field( + examples=[ + ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"], + ["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"] + ] + ), ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"] """Patterns of contig names to ignore""" From f117400be2f13c2e4dd9f4a0d206818113472a90 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 23 Jul 2024 11:47:00 +0200 Subject: [PATCH 2/5] fix: make linting and continuous integration tests happy --- .../somatic_variant_calling/model.py | 15 +++++++- .../wrappers/mutect2/run/wrapper.py | 38 +++++++++---------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py index 82b973e9a..7bce21763 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/model.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py @@ -248,7 +248,20 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin): Field( examples=[ ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"], - ["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"] + [ + "*_decoy", + "EBV", + "HPV*", + "HBV", + "HCV-*", + "HIV-*", + "HTLV-1", + "CMV", + "KSHV", + "MCV", + "SV40", + "chrUn_GL000220v1", + ], ] ), ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"] diff --git a/snappy_wrappers/wrappers/mutect2/run/wrapper.py b/snappy_wrappers/wrappers/mutect2/run/wrapper.py index 83e8b60f6..a740981ad 100644 --- a/snappy_wrappers/wrappers/mutect2/run/wrapper.py +++ b/snappy_wrappers/wrappers/mutect2/run/wrapper.py @@ -8,13 +8,28 @@ reference = snakemake.config["static_data_config"]["reference"]["path"] config = 
snakemake.config["step_config"]["somatic_variant_calling"]["mutect2"] -extra_arguments = " ".join(config["extra_arguments"]) - if "normal_bam" in snakemake.input.keys(): normal = f'--normal "{snakemake.params.normal_lib_name}" --input {snakemake.input.normal_bam}' else: normal = "" +if snakemake.params.intervals: + intervals = " ".join([f"--intervals {interval}" for interval in snakemake.params.intervals]) +else: + intervals = "" + +# TODO: move config parameters to snakemake params (check with parallel wrapper) +if config["germline_resource"]: + germline_resource = f"--germline-resource {config['germline_resource']}" +else: + germline_resource = "" +if config["panel_of_normals"]: + panel_of_normals = f"--panel-of-normals {config['panel_of_normals']}" +else: + panel_of_normals = "" + +extra_arguments = " ".join(config["extra_arguments"]) + shell.executable("/bin/bash") shell( @@ -47,16 +62,6 @@ out_base=$tmpdir/$(basename {snakemake.output.raw} .vcf.gz) -# Add intervals if required -intervals="" -if [[ -n "{snakemake.params.args[intervals]}" ]] -then - for itv in "{snakemake.params.args[intervals]}" - do - intervals="$intervals --intervals $itv" - done -fi - gatk Mutect2 \ --tmp-dir $tmpdir \ {normal} \ @@ -64,13 +69,8 @@ --reference {reference} \ --output $out_base.vcf \ --f1r2-tar-gz ${{out_base}}.f1r2.tar.gz \ - $(if [[ -n "{config[germline_resource]}" ]]; then \ - echo --germline-resource {config[germline_resource]} - fi) \ - $(if [[ -n "{config[panel_of_normals]}" ]]; then \ - echo --panel-of-normals {config[panel_of_normals]} - fi) \ - $intervals \ + {germline_resource} {panel_of_normals} \ + {intervals} \ {extra_arguments} rm -f $out_base.vcf.idx From 7f3fd690c2d79a8cbf3f26dec831fbc9bc7fc7f2 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 23 Jul 2024 11:49:00 +0200 Subject: [PATCH 3/5] docs: typos & explicit genome release --- snappy_pipeline/apps/tpls/project_config.yaml | 6 +++--- snappy_pipeline/workflows/ngs_mapping/model.py | 4 ++-- 2 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml index 1bbe91a06..769333328 100644 --- a/snappy_pipeline/apps/tpls/project_config.yaml +++ b/snappy_pipeline/apps/tpls/project_config.yaml @@ -2,12 +2,12 @@ # # created: %(created_at)s -# Step Configuration ============================================================================== +# Static data Configuration ======================================================================= # # Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names). # This has been preconfigured for the paths on the BIH cluster. # -static_data_config: +static_data_config: # hs37d5 cosmic: path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz dbnsfp: @@ -33,7 +33,7 @@ static_data_config: # (https://www.gencodegenes.org/human/releases.html) # - Some files have not yet been moved to Tier 1. # -# static_data_config: +# static_data_config: # GRCh38.d1.vd1 # cosmic: # path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz # dbnsfp: diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 5ff25fd50..30b024baf 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -34,11 +34,11 @@ class Tools(SnappyModel): class TargetCoverageReportEntry(SnappyModel): """ - Mapping from enrichment kit to target region BED file, for either computing per--target + Mapping from enrichment kit to target region BED file, for either computing per-target region coverage or selecting targeted exons. The following will match both the stock IDT library kit and the ones - with spike-ins seen fromr Yale genomics. The path above would be + with spike-ins seen from Yale genomics. 
The path above would be mapped to the name "default". - name: IDT_xGen_V1_0 pattern: "xGen Exome Research Panel V1\\.0*" From f524bdbc092336efc3dc59f8eff2f85c59d7c3ac Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 24 Jul 2024 17:58:22 +0200 Subject: [PATCH 4/5] docs: Added description of somatic variants config --- snappy_pipeline/models/annotation.py | 94 ++++++++++++++++--- .../somatic_targeted_seq_cnv_calling/model.py | 34 ++++++- .../somatic_variant_filtration/model.py | 10 +- 3 files changed, 122 insertions(+), 16 deletions(-) diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py index 468f0bf96..07b6fe2cd 100644 --- a/snappy_pipeline/models/annotation.py +++ b/snappy_pipeline/models/annotation.py @@ -9,9 +9,74 @@ class VepTxFlag(enum.StrEnum): merged = "merged" +class VepPickOrder(enum.StrEnum): + biotype = "biotype" + """Biotype of transcript (protein_coding preferred)""" + mane = "mane" + """MANE transcript status""" + appris = "appris" + """APPRIS isoform annotation""" + tsl = "tsl" + """Transcript support level""" + ccds = "ccds" + """CCDS status of transcript""" + canonical = "canonical" + """Canonical status of transcript""" + rank = "rank" + """Consequence rank""" + length = "length" + """Translated, transcript or feature length (longer preferred)""" + mane_select = "mane_select" + """MANE Select status (available from version 103)""" + mane_plus_clinical = "mane_plus_clinical" + """MANE Plus Clinical transcript status (available from version 103)""" + ensembl = "ensembl" + """Undocumented (not available in 102)""" + refseq = "refseq" + """Undocumented (not available in 102)""" + + +class VepOutputOptions(enum.StrEnum): + everything = "everything" + """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains""" + sift = "sift b" + polyphen = "polyphen b" + ccds = "ccds" + uniprot = "uniprot" + hgvs = "hgvs" + symbol = "symbol" + numbers = "numbers" + domains = "domains" + regulatory = "regulatory" + 
canonical = "canonical" + protein = "protein" + biotype = "biotype" + tsl = "tsl" + appris = "appris" + gene_phenotype = "gene_phenotype" + af = "af" + af_1kg = "af_1kg" + af_esp = "af_esp" + max_af = "max_af" + pubmed = "pubmed" + mane = "mane" + variant_class = "variant_class" + var_synonyms = "var_synonyms" + """Removed since version 106""" + af_gnomad = "af_gnomad" + """ + Superseded by 'af_gnomade' & 'af_gnomadg' for version 107. + 'af_gnomad' has the same function as 'af_gnomade' + """ + af_gnomade = "af_gnomade" + af_gnomadg = "af_gnomadg" + mirna = "mirna" + """Available from version 109""" + + class Vep(SnappyModel): cache_dir: str = "" - """Defaults to $HOME/.vep Not a good idea on the cluster""" + """Defaults to $HOME/.vep Not a good idea on the cluster, because the database is very large.""" species: str = "homo_sapiens" @@ -23,16 +88,23 @@ class Vep(SnappyModel): tx_flag: VepTxFlag = VepTxFlag.gencode_basic """The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged".""" - pick_order: list[str] = [ - "biotype", - "mane", - "appris", - "tsl", - "ccds", - "canonical", - "rank", - "length", + pick_order: list[VepPickOrder] = [ + VepPickOrder.biotype, + VepPickOrder.mane, + VepPickOrder.appris, + VepPickOrder.tsl, + VepPickOrder.ccds, + VepPickOrder.canonical, + VepPickOrder.rank, + VepPickOrder.length, ] + """ + Ranking of transcripts returned by VEP. + Important when only one transcript is selected for annotation. + The default order is different from the default order proposed by VEP. + Here, variants in protein coding transcripts will be annotated before MANE transcripts. 
+ """ + num_threads: int = 8 buffer_size: int = 1000 - output_options: list[str] = ["everything"] + output_options: list[VepOutputOptions] = [VepOutputOptions.everything] diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py index 1fbacf9d7..7b40d23b2 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py @@ -16,6 +16,13 @@ class Tool(enum.StrEnum): purecn = "purecn" +class SequenzaQualityEncoding: + sanger = "sanger" + """Base quality format (Phred score). Adds an offset of 33 to the qlimit value""" + illumina = "illumina" + """Base quality format (Phred score). Adds an offset of 64 to the qlimit value""" + + class SequenzaExtraArgs(SnappyModel): hom: float = 0.9 """Threshold to select homozygous positions""" @@ -26,7 +33,7 @@ class SequenzaExtraArgs(SnappyModel): qlimit: float = 20 """Minimum nucleotide quality score for inclusion in the counts""" - qformat: str = "sanger" + qformat: str = SequenzaQualityEncoding.sanger """Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value""" @@ -152,6 +159,23 @@ class GenomeName(enum.StrEnum): canFam3 = "canFam3" +class PureCnModel(enum.StrEnum): + betabin = "betabin" + """beta-binomial distribution for allelic fractions. Accounts for over-dispersion. Needs more than 10 normals""" + beta = "beta" + """beta distribution for allelic fractions""" + + +class PureCnSegmentation(enum.StrEnum): + PSCBS = "PSCBS" + """Good and safe starting point. Version supporting interval weights downloaded from github, lime1/PSCBS""" + CBS = "CBS" + """Circular binary segmentation. 
Simple, fast & well-tested, Does not fully support SNP information""" + GATK4 = "GATK4" + """Untested, better choice when number of SNPs per interval is large""" + Hclust = "Hclust" + + class PureCn(SnappyModel): genome_name: Annotated[ GenomeName | Literal["unknown"], @@ -173,8 +197,8 @@ class PureCn(SnappyModel): seed: int = 1234567 extra_commands: dict[str, Any] = { - "model": "betabin", - "fun-segmentation": "PSCBS", + "model": PureCnModel.betabin, + "fun-segmentation": PureCnSegmentation.PSCBS, "post-optimize": "", } """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" @@ -213,7 +237,9 @@ class PureCn(SnappyModel): somatic_variant_caller: str = "mutect2" """ IMPORTANT NOTE: - Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true + Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true". + Using these options, the results will include germline sites required by PureCN. + However, the somatic calls are DIFFERENT from the calls obtained in the somatic_variant_calling step. 
""" path_somatic_variants: Annotated[str, Field(examples=["../somatic_variant_calling_for_purecn"])] diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py index 08e808c15..b50b79f9e 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/model.py +++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py @@ -51,7 +51,15 @@ class Bcftools(SnappyModel): include: str = "" """Expression to be used in bcftools view --include""" - exclude: str = "" + exclude: Annotated[ + str, + Field( + examples=[ + "FORMAT/AD[1:1] < 5 | FORMAT/DP[1] < 50 | AD[1:1]/(AD[1:0]+AD[1:1])<0.05", + "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10" + ] + ), + ] = "" """Expression to be used in bcftools view --exclude""" @model_validator(mode="after") From 4fe223bbebe1a0229d45f40c6cc83c5e2704492e Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 24 Jul 2024 18:10:01 +0200 Subject: [PATCH 5/5] style: make linting happy --- snappy_pipeline/models/annotation.py | 72 +++++++++---------- .../somatic_variant_filtration/model.py | 2 +- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py index 07b6fe2cd..273511d83 100644 --- a/snappy_pipeline/models/annotation.py +++ b/snappy_pipeline/models/annotation.py @@ -37,41 +37,41 @@ class VepPickOrder(enum.StrEnum): class VepOutputOptions(enum.StrEnum): - everything = "everything" - """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains""" - sift = "sift b" - polyphen = "polyphen b" - ccds = "ccds" - uniprot = "uniprot" - hgvs = "hgvs" - symbol = "symbol" - numbers = "numbers" - domains = "domains" - regulatory = "regulatory" - canonical = "canonical" - protein = "protein" - biotype = "biotype" - tsl = "tsl" - appris = "appris" - gene_phenotype = "gene_phenotype" - af = "af" - af_1kg = "af_1kg" - af_esp = "af_esp" - max_af = 
"max_af" - pubmed = "pubmed" - mane = "mane" - variant_class = "variant_class" - var_synonyms = "var_synonyms" - """Removed since version 106""" - af_gnomad = "af_gnomad" - """ - Superseded by 'af_gnomade' & 'af_gnomadg' for version 107. - 'af_gnomad' has the same function as 'af_gnomade' - """ - af_gnomade = "af_gnomade" - af_gnomadg = "af_gnomadg" - mirna = "mirna" - """Available from version 109""" + everything = "everything" + """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains""" + sift = "sift b" + polyphen = "polyphen b" + ccds = "ccds" + uniprot = "uniprot" + hgvs = "hgvs" + symbol = "symbol" + numbers = "numbers" + domains = "domains" + regulatory = "regulatory" + canonical = "canonical" + protein = "protein" + biotype = "biotype" + tsl = "tsl" + appris = "appris" + gene_phenotype = "gene_phenotype" + af = "af" + af_1kg = "af_1kg" + af_esp = "af_esp" + max_af = "max_af" + pubmed = "pubmed" + mane = "mane" + variant_class = "variant_class" + var_synonyms = "var_synonyms" + """Removed since version 106""" + af_gnomad = "af_gnomad" + """ + Superseded by 'af_gnomade' & 'af_gnomadg' for version 107. + 'af_gnomad' has the same function as 'af_gnomade' + """ + af_gnomade = "af_gnomade" + af_gnomadg = "af_gnomadg" + mirna = "mirna" + """Available from version 109""" class Vep(SnappyModel): @@ -104,7 +104,7 @@ class Vep(SnappyModel): The default order is different from the default order proposed by VEP. Here, variants in protein coding transcripts will be annotated before MANE transcripts. 
""" - + num_threads: int = 8 buffer_size: int = 1000 output_options: list[VepOutputOptions] = [VepOutputOptions.everything] diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py index b50b79f9e..5a4b0db09 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/model.py +++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py @@ -56,7 +56,7 @@ class Bcftools(SnappyModel): Field( examples=[ "FORMAT/AD[1:1] < 5 | FORMAT/DP[1] < 50 | AD[1:1]/(AD[1:0]+AD[1:1])<0.05", - "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10" + "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10", ] ), ] = ""