snakemake-workflows · FelixMoelder · Dec 5, 2024 · Nov 16, 2023 · Nov 17, 2023 · Nov 17, 2023
diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 100
   # Genome build
   build: GRCh38
+  pangenome:
+    # Pangenome alignment is switched off for this test configuration.
+    activate: false
+    vcf: ""
 
 primers:
   trimming:
@@ -181,4 +184,4 @@ report:
   max_read_depth: 250
   stratify:
     activate: false
-    by-column: condition
+    by-column: condition
diff --git a/.test/config-giab/config.yaml b/.test/config-giab/config.yaml
@@ -13,6 +13,9 @@ ref:
   # Genome build
   build: GRCh38
   chromosome: 1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:
@@ -154,4 +157,4 @@ params:
     min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs
 
 gene_coverage:
-  min_avg_coverage: 5
+  min_avg_coverage: 5
diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml
@@ -12,7 +12,10 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
-
+  pangenome:
+    activate: false
+    vcf: ""
+
 primers:
   trimming:
     activate: false

diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
@@ -12,6 +12,9 @@ ref:
   release: 110
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml
@@ -14,6 +14,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config-target-regions/config_multiple_beds.yaml b/.test/config-target-regions/config_multiple_beds.yaml
@@ -16,6 +16,9 @@ ref:
   release: 100
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml
@@ -13,6 +13,9 @@ ref:
   snpeff_release: 86
   # Genome build
   build: R64-1-1
+  pangenome:
+    activate: false
+    vcf: ""
 
 primers:
   trimming:

diff --git a/config/config.yaml b/config/config.yaml
@@ -25,6 +25,15 @@ ref:
   release: 111
   # Genome build
   build: GRCh38
+  pangenome:
+    # If activated, reads are aligned to the given pangenome instead of the linear reference genome.
+    # Important: this is currently only supported for homo_sapiens.
+    activate: true
+    # URL of the pangenome haplotypes (VCF file).
+    # Graph resources v1.1: https://github.com/human-pangenomics/hpp_pangenome_resources/
+    # Graph resources v1.0: https://github.com/human-pangenomics/hpp_pangenome_resources/blob/main/hprc-v1.0-mc.md
+    # Important: ensure that the haplotype VCF is built against the same reference genome as specified above under build.
+    vcf: https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.1-mc-grch38/hprc-v1.1-mc-grch38.raw.vcf.gz
   # Optionally, instead of downloading the whole reference from Ensembl via the
   # parameters above, specify a specific chromosome below and uncomment the line.
   # This is usually only relevant for testing.
@@ -157,6 +166,7 @@ calling:
           # Add varlociraptor events to aggregated over.
           # The probability for the union of these events is used for controlling
           # the FDR.
+          - present
           - somatic_tumor_high
           - somatic_tumor_medium
         filter: # myfilter

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1 +1,2 @@
 sample_name	alias	group	platform	purity	panel	umi_read	umi_read_structure	datatype	calling
+SRR702070	tumor	SRR702070_group	ILLUMINA	1.0				dna	variants
diff --git a/config/scenario.yaml b/config/scenario.yaml
@@ -31,4 +31,4 @@ events:
   somatic_tumor_high: "tumor:[0.3,1.0] & normal:0.0"
   somatic_normal: "normal:]0.0,0.5["
   germline_hom: "normal:1.0"
-  germline_het: "normal:0.5"
+  germline_het: "normal:0.5"
diff --git a/config/units.tsv b/config/units.tsv
@@ -1 +1,2 @@
 sample_name	unit_name	fq1	fq2	sra	adapters
+SRR702070	lane1	/projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz	/projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
@@ -3,6 +3,6 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - varlociraptor >=8.4.11,<8.5
+  - varlociraptor >=8.4.12,<8.5
   - vega-lite-cli =5.16
   - bcftools =1.19
diff --git a/workflow/envs/vg.yaml b/workflow/envs/vg.yaml
@@ -0,0 +1,10 @@
+# Conda environment that provides vg.
+name: vg
+channels:
+  # bioconda packages depend on conda-forge; nodefaults (as in
+  # workflow/envs/varlociraptor.yaml) excludes the defaults channel.
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - vg =1.60
diff --git a/workflow/rules/benchmarking.smk b/workflow/rules/benchmarking.smk
@@ -1,4 +1,5 @@
-ruleorder: chm_eval_sample > map_reads
+# TODO: Check whether this ruleorder is still needed.
+ruleorder: chm_eval_sample > map_reads_bwa
 
 
 rule gather_benchmark_calls:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -30,6 +30,8 @@ genome_prefix = f"resources/{genome_name}"
 genome = f"{genome_prefix}.fasta"
 genome_fai = f"{genome}.fai"
 genome_dict = f"{genome_prefix}.dict"
+pangenome_name = f"pangenome.{species}.{build}"
+pangenome_prefix = f"resources/{pangenome_name}"
 
 # cram variables
 use_cram = config.get("use_cram", False)
@@ -260,20 +262,13 @@ def get_control_fdr_input(wildcards):
         return "results/final-calls/{group}.{calling_type}.annotated.bcf"
 
 
-def get_recalibrate_quality_input(wildcards, bai=False):
-    ext = "bai" if bai else "bam"
-    datatype = get_sample_datatype(wildcards.sample)
-    if datatype == "rna":
-        return "results/split/{{sample}}.{ext}".format(ext=ext)
-    # Post-processing of DNA samples
-    if is_activated("calc_consensus_reads"):
-        return "results/consensus/{{sample}}.{ext}".format(ext=ext)
-    elif is_activated("primers/trimming"):
-        return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
-    elif is_activated("remove_duplicates"):
-        return "results/dedup/{{sample}}.{ext}".format(ext=ext)
+def get_aligner(wildcards):
+    if get_sample_datatype(wildcards.sample) == "rna":
+        return "star"
+    elif is_activated("ref/pangenome"):
+        return "vg"
     else:
-        return "results/mapped/bwa/{{sample}}.{ext}".format(ext=ext)
+        return "bwa"
 
 
 def get_cutadapt_input(wildcards):
@@ -428,31 +423,46 @@ def get_sample_datatype(sample):
 
 
 def get_markduplicates_input(wildcards):
-    aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
+    aligner = get_aligner(wildcards)
     if sample_has_umis(wildcards.sample):
+        # Special case for vg: UMI annotation (if active) is done before the final BAM output is written.
+        # TODO(review): vg could presumably go directly to the else-branch here; confirm the intended condition.
         return "results/mapped/{aligner}/{{sample}}.annotated.bam".format(
             aligner=aligner
         )
     else:
         return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
 
 
-def get_consensus_input(wildcards):
+def get_recalibrate_quality_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
+    datatype = get_sample_datatype(wildcards.sample)
+    if datatype == "rna":
+        return "results/split/{{sample}}.{ext}".format(ext=ext)
+    # Post-processing of DNA samples
+    if is_activated("calc_consensus_reads"):
+        return "results/consensus/{{sample}}.{ext}".format(ext=ext)
+    else:
+        return get_consensus_input(wildcards, bai)
+
+
+def get_consensus_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
     if is_activated("primers/trimming"):
-        return "results/trimmed/{sample}.trimmed.bam"
-    elif is_activated("remove_duplicates"):
-        return "results/dedup/{sample}.bam"
+        return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext)
     else:
-        aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
-        return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
+        return get_trimming_input(wildcards, bai)
 
 
-def get_trimming_input(wildcards):
+def get_trimming_input(wildcards, bai=False):
+    ext = "bai" if bai else "bam"
     if is_activated("remove_duplicates"):
-        return "results/dedup/{sample}.bam"
+        return "results/dedup/{{sample}}.{ext}".format(ext=ext)
     else:
-        aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa"
-        return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner)
+        aligner = get_aligner(wildcards)
+        return "results/mapped/{aligner}/{{sample}}.{ext}".format(
+            aligner=aligner, ext=ext
+        )
 
 
 def get_primer_bed(wc):
@@ -623,6 +633,13 @@ def get_read_group(wildcards):
     )
 
 
+def get_vg_read_group(wildcards):
+    platform = extract_unique_sample_column_value(wildcards.sample, "platform")
+    return r"--RGLB lib1 --RGPL {platform} --RGPU {sample} --RGSM {sample} --RGID {sample}".format(
+        sample=wildcards.sample, platform=platform
+    )
+
+
 def get_map_reads_sorting_params(wildcards, ordering=False):
     match (sample_has_umis(wildcards.sample), ordering):
         case (True, True):
@@ -635,6 +652,13 @@ def get_map_reads_sorting_params(wildcards, ordering=False):
             return "samtools"
 
 
+def get_add_readgroup_input(wildcards):
+    if sample_has_umis(wildcards.sample):
+        return "results/mapped/vg/{sample}.annotated.bam"
+    else:
+        return "results/mapped/vg/{sample}.mate_fixed.bam"
+
+
 def get_mutational_burden_targets():
     mutational_burden_targets = []
     if is_activated("mutational_burden"):
@@ -685,15 +709,19 @@ def get_selected_annotations():
     return selection
 
 
-def get_annotated_bcf(wildcards):
+def get_annotated_bcf(wildcards, index=False):
+    ext = ".csi" if index else ""
     selection = (
         get_selected_annotations() if wildcards.calling_type == "variants" else ""
     )
-    return "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf".format(
-        group=wildcards.group,
-        calling_type=wildcards.calling_type,
-        selection=selection,
-        scatteritem=wildcards.scatteritem,
+    return (
+        "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf{ext}".format(
+            group=wildcards.group,
+            calling_type=wildcards.calling_type,
+            selection=selection,
+            scatteritem=wildcards.scatteritem,
+            ext=ext,
+        )
     )
 
 

diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
@@ -19,6 +19,7 @@ rule filter_candidates_by_annotation:
 rule filter_by_annotation:
     input:
         bcf=get_annotated_bcf,
+        csi=partial(get_annotated_bcf, index=True),
         aux=get_annotation_filter_aux_files,
     output:
         "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf",

diff --git a/workflow/rules/maf.smk b/workflow/rules/maf.smk
@@ -33,12 +33,12 @@ rule group_vcf_to_maf:
             dpath="maf/primary_alias", within=config, default="tumor"
         ),
         vcf_control_alias_option=(
-            f'--vcf-normal-id {lookup(dpath= "maf/control_alias", within= config, default= "")}'
+            f'--vcf-normal-id {lookup(dpath = "maf/control_alias", within = config , default = "")}'
             if lookup(dpath="maf/control_alias", within=config, default=False)
             else ""
         ),
         normal_id=lambda wc: (
-            f'--normal-id {wc.group}_{lookup(dpath= "maf/control_alias", within= config, default= "")}'
+            f'--normal-id {wc.group}_{lookup(dpath = "maf/control_alias", within = config , default = "")}'
             if lookup(dpath="maf/control_alias", within=config, default=False)
             else ""
         ),
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		sample_name alias group platform purity panel umi_read umi_read_structure datatype calling
		SRR702070 tumor SRR702070_group ILLUMINA 1.0 dna variants
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		sample_name unit_name fq1 fq2 sra adapters
		SRR702070 lane1 /projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz /projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz