From a7712da2401e21d24f36c707b5b51216f9f099f6 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 5 Jul 2024 15:43:40 +0200
Subject: [PATCH 1/5] docs: Check & update of configuration defaults (first
 part)

---
 snappy_pipeline/apps/tpls/project_config.yaml | 40 +++++++++++---
 .../workflows/ngs_mapping/model.py            | 52 ++++++++++++++++---
 .../somatic_variant_calling/model.py          | 38 ++++++++++++--
 3 files changed, 113 insertions(+), 17 deletions(-)

diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml
index 0e73a72b2..1bbe91a06 100644
--- a/snappy_pipeline/apps/tpls/project_config.yaml
+++ b/snappy_pipeline/apps/tpls/project_config.yaml
@@ -4,18 +4,46 @@
 
 # Step Configuration ==============================================================================
 #
-# Configuration for paths with static data.  This has been preconfigured for the paths on the BIH
-# cluster.
+# Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names).
+# This has been preconfigured for the paths on the BIH cluster.
 #
 static_data_config:
   cosmic:
-    path: /fast/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz
+    path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz
   dbnsfp:
-    path: /fast/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz
+    path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz
   dbsnp:
-    path: /fast/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz
+    path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz
   reference:
-    path: /fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa
+    path: /data/cephfs-1/work/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa
+  features:
+    path: /data/cephfs-1/work/projects/cubit/current/static_data/annotation/GENCODE/19/GRCh37/gencode.v19.annotation.gtf
+
+# Step Configuration ==============================================================================
+#
+# Configuration for paths with static data (GRCh38.d1.vd1 genome release, includes decoys & viral sequences).  
+# This has been preconfigured for the paths on the BIH cluster (CUBI group members only).
+#
+# Notes: 
+# - GRCh38.d1.vd1 is the genome release used by the GDC consortium
+#   (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#alignment-workflow)
+# - GENCODE release 36 is the feature annotation used by the GDC consortium for its RNA pipeline
+#   (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow)
+# - The GENCODE release 36 corresponds to ENSEMBL release 102
+#   (https://www.gencodegenes.org/human/releases.html)
+# - Some files have not yet been moved to Tier 1.   
+#
+# static_data_config:
+#   cosmic:
+#     path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz
+#   dbnsfp:
+#     path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/3.5/GRCh38/dbNSFP.txt.gz
+#   dbsnp:
+#     path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh38/common_all_20160407.vcf.gz
+#   reference:
+#     path: /fast/work/groups/cubi/projects/biotools/static_data/reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa
+#   features:
+#     path: /fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotation/GENCODE/36/gencode.v36.primary_assembly.annotation.gtf
 
 # Step Configuration ==============================================================================
 #
diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py
index 30968a324..5ff25fd50 100644
--- a/snappy_pipeline/workflows/ngs_mapping/model.py
+++ b/snappy_pipeline/workflows/ngs_mapping/model.py
@@ -43,6 +43,9 @@ class TargetCoverageReportEntry(SnappyModel):
       - name: IDT_xGen_V1_0
         pattern: "xGen Exome Research Panel V1\\.0*"
         path: "path/to/targets.bed"
+
+    Bed file for many Agilent exome panels can be found in
+    /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent
     """
 
     name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]
@@ -72,7 +75,14 @@ class BwaMode(Enum):
 
 
 class BwaMapper(SnappyModel):
-    path_index: str
+    path_index: str = Field(
+        examples=[
+            "/data/cephfs-1/work/projects/cubit/current/static_data/precomputed/BWA/0.7.17/GRCh37/hs37d5/hs37d5.fa",
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA/0.7.17/GRCh38.d1.vd1.fa",
+            "/data/cephfs-1/work/groups/cubi/projects/biotools/bwa-mem2/GRCh37/hs37d5/hs37d5.fa",
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA_MEM2/2.2.1/GRCh38.d1.vd1.fa",
+        ]
+    )
     """Required if listed in ngs_mapping.tools.dna; otherwise, can be removed."""
     num_threads_align: int = 16
     num_threads_trimming: int = 8
@@ -145,8 +155,13 @@ class Somatic(SnappyModel):
 
 
 class Bqsr(SnappyModel):
-    common_variants: str
-    """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)"""
+    common_variants: str = Field(
+        examples=[
+            "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz",
+            "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz",
+        ]
+    )
+    """Common germline variants (see https://console.cloud.google.com/storage/browser/gatk-best-practices)"""
 
 
 class AgentLibPrepType(Enum):
@@ -158,7 +173,11 @@ class AgentLibPrepType(Enum):
 
 
 class AgentPrepare(SnappyModel):
-    path: str
+    path: str = Field(
+        examples=[
+            "/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/trimmer-3.0.5.jar"
+        ]
+    )
 
     lib_prep_type: AgentLibPrepType = None
     """One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)"""
@@ -174,8 +193,17 @@ class AgentMarkDuplicatesConsensusMode(Enum):
 
 
 class AgentMarkDuplicates(SnappyModel):
-    path: str
+    path: str = Field(
+        examples=["/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/creak-1.0.5.jar"]
+    )
     path_baits: str
+    """
+    Different exome panels cannot be accomodated here, because the selection method used for coverage is not used.
+    The absolute path of the exome panel must be input.
+
+    Bed file for many Agilent exome panels can be found in
+    /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent
+    """
     consensus_mode: AgentMarkDuplicatesConsensusMode = None
     """One of "SINGLE", "HYBRID", "DUPLEX" """
 
@@ -194,7 +222,12 @@ class Agent(SnappyModel):
 
 
 class Star(SnappyModel):
-    path_index: str
+    path_index: str = Field(
+        examples=[
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/precomputed/STAR/2.7.10a/100",
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/precomputed/STAR/STAR_2.7.10a_100",
+        ]
+    )
     """Required if listed in ngs_mapping.tools.rna; otherwise, can be removed."""
     num_threads_align: int = 16
     num_threads_trimming: int = 8
@@ -245,7 +278,12 @@ class Strand(enum.IntEnum):
 
 
 class Strandedness(SnappyModel):
-    path_exon_bed: str
+    path_exon_bed: str = Field(
+        examples=[
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/gencode.v19.chr_scaff.annotation.cds.collapse_annotation.bed",
+            "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/gencode.v36.primary_assembly.annotation.cds.collapse_annotation.bed",
+        ]
+    )
     """Location of usually highly expressed genes. Known protein coding genes is a good choice"""
 
     strand: Strand = Strand.UNKNOWN
diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py
index 5e017785a..82b973e9a 100644
--- a/snappy_pipeline/workflows/somatic_variant_calling/model.py
+++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py
@@ -98,13 +98,38 @@ class Mutect2(Parallel):
     # Sadly a type of
     # `FilePath | None = None`
     # still applies `FilePath` validation on `None`, which errors
-    panel_of_normals: str | None = ""
+    panel_of_normals: Annotated[
+        str | None,
+        Field(
+            examples=[
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-exome-panel.vcf.gz",
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-WGS-panel-b37.vcf.gz",
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/1000g_pon.hg38.vcf.gz",
+            ]
+        ),
+    ] = None
     """Set path to panel of normals vcf if required"""
 
-    germline_resource: str | None = ""
+    germline_resource: Annotated[
+        str | None,
+        Field(
+            examples=[
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/af-only-gnomad.raw.sites.vcf.gz",
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/af-only-gnomad.hg38.vcf.gz",
+            ]
+        ),
+    ] = None
     """Germline variants resource (same as panel of normals)"""
 
-    common_variants: str | None = ""
+    common_variants: Annotated[
+        str | None,
+        Field(
+            examples=[
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz",
+                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz",
+            ]
+        ),
+    ] = None
     """Common germline variants for contamination estimation"""
 
     extra_arguments: Annotated[
@@ -220,7 +245,12 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin):
 
     ignore_chroms: Annotated[
         list[str],
-        Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]),
+        Field(
+            examples=[
+                ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"],
+                ["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"]
+            ]
+        ),
     ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]
     """Patterns of contig names to ignore"""
 

From f117400be2f13c2e4dd9f4a0d206818113472a90 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Tue, 23 Jul 2024 11:47:00 +0200
Subject: [PATCH 2/5] fix: make linting and continuous integration tests happy

---
 .../somatic_variant_calling/model.py          | 15 +++++++-
 .../wrappers/mutect2/run/wrapper.py           | 38 +++++++++----------
 2 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py
index 82b973e9a..7bce21763 100644
--- a/snappy_pipeline/workflows/somatic_variant_calling/model.py
+++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py
@@ -248,7 +248,20 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin):
         Field(
             examples=[
                 ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"],
-                ["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"]
+                [
+                    "*_decoy",
+                    "EBV",
+                    "HPV*",
+                    "HBV",
+                    "HCV-*",
+                    "HIV-*",
+                    "HTLV-1",
+                    "CMV",
+                    "KSHV",
+                    "MCV",
+                    "SV40",
+                    "chrUn_GL000220v1",
+                ],
             ]
         ),
     ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]
diff --git a/snappy_wrappers/wrappers/mutect2/run/wrapper.py b/snappy_wrappers/wrappers/mutect2/run/wrapper.py
index 83e8b60f6..a740981ad 100644
--- a/snappy_wrappers/wrappers/mutect2/run/wrapper.py
+++ b/snappy_wrappers/wrappers/mutect2/run/wrapper.py
@@ -8,13 +8,28 @@
 reference = snakemake.config["static_data_config"]["reference"]["path"]
 config = snakemake.config["step_config"]["somatic_variant_calling"]["mutect2"]
 
-extra_arguments = " ".join(config["extra_arguments"])
-
 if "normal_bam" in snakemake.input.keys():
     normal = f'--normal "{snakemake.params.normal_lib_name}" --input {snakemake.input.normal_bam}'
 else:
     normal = ""
 
+if snakemake.params.intervals:
+    intervals = " ".join([f"--intervals {interval}" for interval in snakemake.params.intervals])
+else:
+    intervals = ""
+
+# TODO: move config parameters to snakemake params (check with parallel wrapper)
+if config["germline_resource"]:
+    germline_resource = f"--germline-resource {config['germline_resource']}"
+else:
+    germline_resource = ""
+if config["panel_of_normals"]:
+    panel_of_normals = f"--panel-of-normals {config['panel_of_normals']}"
+else:
+    panel_of_normals = ""
+
+extra_arguments = " ".join(config["extra_arguments"])
+
 shell.executable("/bin/bash")
 
 shell(
@@ -47,16 +62,6 @@
 
 out_base=$tmpdir/$(basename {snakemake.output.raw} .vcf.gz)
 
-# Add intervals if required
-intervals=""
-if [[ -n "{snakemake.params.args[intervals]}" ]]
-then
-    for itv in "{snakemake.params.args[intervals]}"
-    do
-        intervals="$intervals --intervals $itv"
-    done
-fi
-
 gatk Mutect2 \
     --tmp-dir $tmpdir \
     {normal} \
@@ -64,13 +69,8 @@
     --reference {reference} \
     --output $out_base.vcf \
     --f1r2-tar-gz ${{out_base}}.f1r2.tar.gz \
-    $(if [[ -n "{config[germline_resource]}" ]]; then \
-        echo --germline-resource {config[germline_resource]}
-    fi) \
-    $(if [[ -n "{config[panel_of_normals]}" ]]; then \
-        echo --panel-of-normals {config[panel_of_normals]}
-    fi) \
-    $intervals \
+    {germline_resource} {panel_of_normals} \
+    {intervals} \
     {extra_arguments}
 
 rm -f $out_base.vcf.idx

From 7f3fd690c2d79a8cbf3f26dec831fbc9bc7fc7f2 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Tue, 23 Jul 2024 11:49:00 +0200
Subject: [PATCH 3/5] docs: typos & explicit genome release

---
 snappy_pipeline/apps/tpls/project_config.yaml  | 6 +++---
 snappy_pipeline/workflows/ngs_mapping/model.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml
index 1bbe91a06..769333328 100644
--- a/snappy_pipeline/apps/tpls/project_config.yaml
+++ b/snappy_pipeline/apps/tpls/project_config.yaml
@@ -2,12 +2,12 @@
 #
 # created: %(created_at)s
 
-# Step Configuration ==============================================================================
+# Static data Configuration =======================================================================
 #
 # Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names).
 # This has been preconfigured for the paths on the BIH cluster.
 #
-static_data_config:
+static_data_config:    # hs37d5
   cosmic:
     path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz
   dbnsfp:
@@ -33,7 +33,7 @@ static_data_config:
 #   (https://www.gencodegenes.org/human/releases.html)
 # - Some files have not yet been moved to Tier 1.   
 #
-# static_data_config:
+# static_data_config:    # GRCh38.d1.vd1
 #   cosmic:
 #     path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz
 #   dbnsfp:
diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py
index 5ff25fd50..30b024baf 100644
--- a/snappy_pipeline/workflows/ngs_mapping/model.py
+++ b/snappy_pipeline/workflows/ngs_mapping/model.py
@@ -34,11 +34,11 @@ class Tools(SnappyModel):
 
 class TargetCoverageReportEntry(SnappyModel):
     """
-    Mapping from enrichment kit to target region BED file, for either computing per--target
+    Mapping from enrichment kit to target region BED file, for either computing per-target
     region coverage or selecting targeted exons.
 
     The following will match both the stock IDT library kit and the ones
-    with spike-ins seen fromr Yale genomics.  The path above would be
+    with spike-ins seen from Yale genomics.  The path above would be
     mapped to the name "default".
       - name: IDT_xGen_V1_0
         pattern: "xGen Exome Research Panel V1\\.0*"

From f524bdbc092336efc3dc59f8eff2f85c59d7c3ac Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Wed, 24 Jul 2024 17:58:22 +0200
Subject: [PATCH 4/5] docs: Added description of somatic variants config

---
 snappy_pipeline/models/annotation.py          | 94 ++++++++++++++++---
 .../somatic_targeted_seq_cnv_calling/model.py | 34 ++++++-
 .../somatic_variant_filtration/model.py       | 10 +-
 3 files changed, 122 insertions(+), 16 deletions(-)

diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py
index 468f0bf96..07b6fe2cd 100644
--- a/snappy_pipeline/models/annotation.py
+++ b/snappy_pipeline/models/annotation.py
@@ -9,9 +9,74 @@ class VepTxFlag(enum.StrEnum):
     merged = "merged"
 
 
+class VepPickOrder(enum.StrEnum):
+    biotype = "biotype"
+    """Biotype of transcript (protein_coding preferred)"""
+    mane = "mane"
+    """MANE transcript status"""
+    appris = "appris"
+    """APPRIS isoform annotation"""
+    tsl = "tsl"
+    """Transcript support level"""
+    ccds = "ccds"
+    """CCDS status of transcript"""
+    canonical = "canonical"
+    """Canonical status of transcript"""
+    rank = "rank"
+    """Consequence rank"""
+    length = "length"
+    """Translated, transcript or feature length (longer preferred)"""
+    mane_select = "mane_select"
+    """MANE Select status (available from version 103)"""
+    mane_plus_clinical = "mane_plus_clinical"
+    """MANE Plus Clinical transcript status (available from version 103)"""
+    ensembl = "ensembl"
+    """Undocumented (not available in 102)"""
+    refseq = "refseq"
+    """Undocumented (not available in 102)"""
+
+
+class VepOutputOptions(enum.StrEnum):
+     everything = "everything"
+     """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains"""
+     sift = "sift b"
+     polyphen = "polyphen b"
+     ccds = "ccds"
+     uniprot = "uniprot"
+     hgvs = "hgvs"
+     symbol = "symbol"
+     numbers = "numbers"
+     domains = "domains"
+     regulatory = "regulatory"
+     canonical = "canonical"
+     protein = "protein"
+     biotype = "biotype"
+     tsl = "tsl"
+     appris = "appris"
+     gene_phenotype = "gene_phenotype"
+     af = "af"
+     af_1kg = "af_1kg"
+     af_esp = "af_esp"
+     max_af = "max_af"
+     pubmed = "pubmed"
+     mane = "mane"
+     variant_class = "variant_class"
+     var_synonyms = "var_synonyms"
+     """Removed since version 106"""
+     af_gnomad = "af_gnomad"
+     """
+     Superseded by 'af_gnomade' & 'af_gnomadg' for version 107.
+     'af_gnomad' has the same function as 'af_gnomade'
+     """
+     af_gnomade = "af_gnomade"
+     af_gnomadg = "af_gnomadg"
+     mirna = "mirna"
+     """Available from version 109"""
+
+
 class Vep(SnappyModel):
     cache_dir: str = ""
-    """Defaults to $HOME/.vep Not a good idea on the cluster"""
+    """Defaults to $HOME/.vep Not a good idea on the cluster, because the database is very large."""
 
     species: str = "homo_sapiens"
 
@@ -23,16 +88,23 @@ class Vep(SnappyModel):
     tx_flag: VepTxFlag = VepTxFlag.gencode_basic
     """The flag selecting the transcripts.  One of "gencode_basic", "refseq", and "merged"."""
 
-    pick_order: list[str] = [
-        "biotype",
-        "mane",
-        "appris",
-        "tsl",
-        "ccds",
-        "canonical",
-        "rank",
-        "length",
+    pick_order: list[VepPickOrder] = [
+        VepPickOrder.biotype,
+        VepPickOrder.mane,
+        VepPickOrder.appris,
+        VepPickOrder.tsl,
+        VepPickOrder.ccds,
+        VepPickOrder.canonical,
+        VepPickOrder.rank,
+        VepPickOrder.length,
     ]
+    """
+    Ranking of transcripts returned by VEP.
+    Important when only one transcript is selected for annotation.
+    The default order is different from the default order proposed by VEP.
+    Here, variants in protein coding transcripts will be annotated before MANE transcripts.
+    """
+    
     num_threads: int = 8
     buffer_size: int = 1000
-    output_options: list[str] = ["everything"]
+    output_options: list[VepOutputOptions] = [VepOutputOptions.everything]
diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py
index 1fbacf9d7..7b40d23b2 100644
--- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py
+++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py
@@ -16,6 +16,13 @@ class Tool(enum.StrEnum):
     purecn = "purecn"
 
 
+class SequenzaQualityEncoding:
+    sanger = "sanger"
+    """Base quality format (Phred score). Adds an offset of 33 to the qlimit value"""
+    illumina = "illumina"
+    """Base quality format (Phred score). Adds an offset of 64 to the qlimit value"""
+
+
 class SequenzaExtraArgs(SnappyModel):
     hom: float = 0.9
     """Threshold to select homozygous positions"""
@@ -26,7 +33,7 @@ class SequenzaExtraArgs(SnappyModel):
     qlimit: float = 20
     """Minimum nucleotide quality score for inclusion in the counts"""
 
-    qformat: str = "sanger"
+    qformat: str = SequenzaQualityEncoding.sanger
     """Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value"""
 
 
@@ -152,6 +159,23 @@ class GenomeName(enum.StrEnum):
     canFam3 = "canFam3"
 
 
+class PureCnModel(enum.StrEnum):
+    betabin = "betabin"
+    """beta-binomial ditribution for allelic fractions. Accounts for over-dispersion. Needs more than 10 normals"""
+    beta = "beta"
+    """beta distribution for allelic fractions"""
+
+
+class PureCnSegmentation(enum.StrEnum):
+    PSCBS = "PSCBS"
+    """Good and safe starting point. Version supporting interval weights downloaded from github, lime1/PSCBS"""
+    CBS = "CBS"
+    """Circular binary segmentation. Simple, fast & well-tested, Does not fully support SNP information"""
+    GATK4 = "GATK4"
+    """Untested, better choice when number of SNPs per interval is large"""
+    Hclust = "Hclust"
+
+
 class PureCn(SnappyModel):
     genome_name: Annotated[
         GenomeName | Literal["unknown"],
@@ -173,8 +197,8 @@ class PureCn(SnappyModel):
 
     seed: int = 1234567
     extra_commands: dict[str, Any] = {
-        "model": "betabin",
-        "fun-segmentation": "PSCBS",
+        "model": PureCnModel.betabin,
+        "fun-segmentation": PureCnSegmentation.PSCBS,
         "post-optimize": "",
     }
     """Recommended extra arguments for PureCN, extra_commands: {} to clear them all"""
@@ -213,7 +237,9 @@ class PureCn(SnappyModel):
     somatic_variant_caller: str = "mutect2"
     """
     IMPORTANT NOTE:
-    Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true
+    Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true".
+    Using these options, the results will include germline sites required by PureCN.
+    However, the somatic calls are DIFFERENT from the calls obtained in the somatic_variant_calling step.
     """
 
     path_somatic_variants: Annotated[str, Field(examples=["../somatic_variant_calling_for_purecn"])]
diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py
index 08e808c15..b50b79f9e 100644
--- a/snappy_pipeline/workflows/somatic_variant_filtration/model.py
+++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py
@@ -51,7 +51,15 @@ class Bcftools(SnappyModel):
     include: str = ""
     """Expression to be used in bcftools view --include"""
 
-    exclude: str = ""
+    exclude: Annotated[
+        str,
+        Field(
+            examples=[
+                "FORMAT/AD[1:1] < 5 | FORMAT/DP[1] < 50 | AD[1:1]/(AD[1:0]+AD[1:1])<0.05",
+                "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10"
+            ]
+        ),
+    ] = ""
     """Expression to be used in bcftools view --exclude"""
 
     @model_validator(mode="after")

From 4fe223bbebe1a0229d45f40c6cc83c5e2704492e Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Wed, 24 Jul 2024 18:10:01 +0200
Subject: [PATCH 5/5] style: make linting happy

---
 snappy_pipeline/models/annotation.py          | 72 +++++++++----------
 .../somatic_variant_filtration/model.py       |  2 +-
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py
index 07b6fe2cd..273511d83 100644
--- a/snappy_pipeline/models/annotation.py
+++ b/snappy_pipeline/models/annotation.py
@@ -37,41 +37,41 @@ class VepPickOrder(enum.StrEnum):
 
 
 class VepOutputOptions(enum.StrEnum):
-     everything = "everything"
-     """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains"""
-     sift = "sift b"
-     polyphen = "polyphen b"
-     ccds = "ccds"
-     uniprot = "uniprot"
-     hgvs = "hgvs"
-     symbol = "symbol"
-     numbers = "numbers"
-     domains = "domains"
-     regulatory = "regulatory"
-     canonical = "canonical"
-     protein = "protein"
-     biotype = "biotype"
-     tsl = "tsl"
-     appris = "appris"
-     gene_phenotype = "gene_phenotype"
-     af = "af"
-     af_1kg = "af_1kg"
-     af_esp = "af_esp"
-     max_af = "max_af"
-     pubmed = "pubmed"
-     mane = "mane"
-     variant_class = "variant_class"
-     var_synonyms = "var_synonyms"
-     """Removed since version 106"""
-     af_gnomad = "af_gnomad"
-     """
-     Superseded by 'af_gnomade' & 'af_gnomadg' for version 107.
-     'af_gnomad' has the same function as 'af_gnomade'
-     """
-     af_gnomade = "af_gnomade"
-     af_gnomadg = "af_gnomadg"
-     mirna = "mirna"
-     """Available from version 109"""
+    everything = "everything"
+    """sift, polyphen, ccds, uniprot, hgvs, symbol, numbers, domains"""
+    sift = "sift b"
+    polyphen = "polyphen b"
+    ccds = "ccds"
+    uniprot = "uniprot"
+    hgvs = "hgvs"
+    symbol = "symbol"
+    numbers = "numbers"
+    domains = "domains"
+    regulatory = "regulatory"
+    canonical = "canonical"
+    protein = "protein"
+    biotype = "biotype"
+    tsl = "tsl"
+    appris = "appris"
+    gene_phenotype = "gene_phenotype"
+    af = "af"
+    af_1kg = "af_1kg"
+    af_esp = "af_esp"
+    max_af = "max_af"
+    pubmed = "pubmed"
+    mane = "mane"
+    variant_class = "variant_class"
+    var_synonyms = "var_synonyms"
+    """Removed since version 106"""
+    af_gnomad = "af_gnomad"
+    """
+    Superseded by 'af_gnomade' & 'af_gnomadg' for version 107.
+    'af_gnomad' has the same function as 'af_gnomade'
+    """
+    af_gnomade = "af_gnomade"
+    af_gnomadg = "af_gnomadg"
+    mirna = "mirna"
+    """Available from version 109"""
 
 
 class Vep(SnappyModel):
@@ -104,7 +104,7 @@ class Vep(SnappyModel):
     The default order is different from the default order proposed by VEP.
     Here, variants in protein coding transcripts will be annotated before MANE transcripts.
     """
-    
+
     num_threads: int = 8
     buffer_size: int = 1000
     output_options: list[VepOutputOptions] = [VepOutputOptions.everything]
diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py
index b50b79f9e..5a4b0db09 100644
--- a/snappy_pipeline/workflows/somatic_variant_filtration/model.py
+++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py
@@ -56,7 +56,7 @@ class Bcftools(SnappyModel):
         Field(
             examples=[
                 "FORMAT/AD[1:1] < 5 | FORMAT/DP[1] < 50 | AD[1:1]/(AD[1:0]+AD[1:1])<0.05",
-                "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10"
+                "((REF='C' & ALT='T') | (REF='G' & ALT='A')) & AD[1:1]/(AD[1:0]+AD[1:1])<=0.10",
             ]
         ),
     ] = ""