diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml index ee0a3b706..a3219ad1c 100644 --- a/.test/config-chm-eval/config.yaml +++ b/.test/config-chm-eval/config.yaml @@ -12,6 +12,9 @@ ref: release: 100 # Genome build build: GRCh38 + pangenome: + activate: false + vcf: "" primers: trimming: @@ -180,4 +183,4 @@ report: max_read_depth: 250 stratify: activate: false - by-column: condition \ No newline at end of file + by-column: condition diff --git a/.test/config-giab/config.yaml b/.test/config-giab/config.yaml index f2f19cf94..10e9c94d8 100644 --- a/.test/config-giab/config.yaml +++ b/.test/config-giab/config.yaml @@ -13,6 +13,9 @@ ref: # Genome build build: GRCh38 chromosome: 1 + pangenome: + activate: false + vcf: "" primers: trimming: @@ -153,4 +156,4 @@ params: min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs gene_coverage: - min_avg_coverage: 5 \ No newline at end of file + min_avg_coverage: 5 diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml index a8d95d245..897a5a00e 100644 --- a/.test/config-no-candidate-filtering/config.yaml +++ b/.test/config-no-candidate-filtering/config.yaml @@ -12,7 +12,10 @@ ref: release: 100 # Genome build build: R64-1-1 - + pangenome: + activate: false + vcf: "" + primers: trimming: primers_fa1: "" diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml index 266953044..de1d124c2 100644 --- a/.test/config-simple/config.yaml +++ b/.test/config-simple/config.yaml @@ -12,6 +12,9 @@ ref: release: 100 # Genome build build: R64-1-1 + pangenome: + activate: false + vcf: "" primers: trimming: diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml index 960f85244..22751b786 100644 --- a/.test/config-sra/config.yaml +++ b/.test/config-sra/config.yaml @@ -12,6 +12,9 @@ ref: release: 110 # Genome build build: R64-1-1 + pangenome: + activate: false + vcf: "" primers: trimming: diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml index f3ef41d8e..e1172464d 100644 --- a/.test/config-target-regions/config.yaml +++ b/.test/config-target-regions/config.yaml @@ -14,6 +14,9 @@ ref: release: 100 # Genome build build: R64-1-1 + pangenome: + activate: false + vcf: "" primers: trimming: diff --git a/.test/config-target-regions/config_multiple_beds.yaml b/.test/config-target-regions/config_multiple_beds.yaml index 9ae6fbf45..660845052 100644 --- a/.test/config-target-regions/config_multiple_beds.yaml +++ b/.test/config-target-regions/config_multiple_beds.yaml @@ -16,6 +16,9 @@ ref: release: 100 # Genome build build: R64-1-1 + pangenome: + activate: false + vcf: "" primers: trimming: diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml index 7ba497e75..c2921ba8d 100644 --- a/.test/config_primers/config.yaml +++ b/.test/config_primers/config.yaml @@ -13,6 +13,9 @@ ref: snpeff_release: 86 # Genome build build: R64-1-1 + pangenome: + activate: false + vcf: "" primers: trimming: diff --git a/config/config.yaml b/config/config.yaml index 6f537bf00..1c2235a8e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -25,6 +25,21 @@ ref: release: 111 # Genome build build: GRCh38 + pangenome: + # if active, reads will be aligned to given pangenome instead of to the linear reference genome + # Important: this is only supported for homo_sapiens so far + activate: true + # URL to pangenome haplotypes (vcf-file) + # Graph resources v1.1: https://github.com/human-pangenomics/hpp_pangenome_resources/ + # Graph resources v1.0: https://github.com/human-pangenomics/hpp_pangenome_resources/blob/main/hprc-v1.0-mc.md + # Important: ensure that the haplotype vcf is built against the same reference genome as specified above under build + vcf: https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.1-mc-grch38/hprc-v1.1-mc-grch38.raw.vcf.gz + # optional: In case contigs of the hoplotype vcf need to be renamed + # a list of python expressions can be defined. + # Contigs can be accessed by `contig`. + # Each expression will be successively saved into `renamed_contig` + rename_expressions: + - "'MT' if contig == 'chrM' else contig[3:]" # Optionally, instead of downloading the whole reference from Ensembl via the # parameters above, specify a specific chromosome below and uncomment the line. # This is usually only relevant for testing. @@ -158,6 +173,7 @@ calling: # Add varlociraptor events to aggregated over. # The probability for the union of these events is used for controlling # the FDR. + - present - somatic_tumor_high - somatic_tumor_medium filter: # myfilter diff --git a/config/samples.tsv b/config/samples.tsv index dcd1d552c..3ccc601f6 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1 +1,2 @@ sample_name alias group platform purity panel umi_read umi_read_structure datatype calling +SRR702070 tumor SRR702070_group ILLUMINA 1.0 dna variants diff --git a/config/units.tsv b/config/units.tsv index b69ac9911..33a3337b6 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1 +1,2 @@ sample_name unit_name fq1 fq2 sra adapters +SRR702070 lane1 /projects/koesterlab/orthanq/HapMap_data/SRR702070_1.fastq.gz /projects/koesterlab/orthanq/HapMap_data/SRR702070_2.fastq.gz \ No newline at end of file diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml new file mode 100644 index 000000000..ee5575c08 --- /dev/null +++ b/workflow/envs/pysam.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python =3.12 + - pysam =0.22 \ No newline at end of file diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml index f1add320e..c6f9b4f3b 100644 --- a/workflow/envs/varlociraptor.yaml +++ b/workflow/envs/varlociraptor.yaml @@ -3,6 +3,6 @@ channels: - bioconda - nodefaults dependencies: - - varlociraptor >=8.4.11,<8.5 + - varlociraptor >=8.4.13,<8.5 - vega-lite-cli =5.16 - - bcftools =1.19 + - bcftools =1.21 diff --git a/workflow/rules/benchmarking.smk b/workflow/rules/benchmarking.smk index 38ebe7f41..be24b8e2e 100644 --- a/workflow/rules/benchmarking.smk +++ b/workflow/rules/benchmarking.smk @@ -1,4 +1,5 @@ -ruleorder: chm_eval_sample > map_reads +# TODO Is this ruleorder of any use?! +ruleorder: chm_eval_sample > map_reads_bwa rule gather_benchmark_calls: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 47f7361b3..662278dc7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -30,6 +30,8 @@ genome_prefix = f"resources/{genome_name}" genome = f"{genome_prefix}.fasta" genome_fai = f"{genome}.fai" genome_dict = f"{genome_prefix}.dict" +pangenome_name = f"pangenome.{species}.{build}" +pangenome_prefix = f"resources/{pangenome_name}" # cram variables use_cram = config.get("use_cram", False) @@ -263,6 +265,8 @@ def get_control_fdr_input(wildcards): def get_aligner(wildcards): if get_sample_datatype(wildcards.sample) == "rna": return "star" + elif is_activated("ref/pangenome"): + return "vg" else: return "bwa" @@ -436,7 +440,7 @@ def get_recalibrate_quality_input(wildcards, bai=False): def get_consensus_input(wildcards, bai=False): ext = "bai" if bai else "bam" if sample_has_primers(wildcards): - return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext) + return f"results/trimmed/{{sample}}.trimmed.{ext}" else: return get_trimming_input(wildcards, bai) @@ -452,6 +456,13 @@ def get_trimming_input(wildcards, bai=False): ) +def get_vg_autoindex_vcf(): + if config["ref"]["pangenome"].get("rename_expressions"): + return f"{pangenome_prefix}.renamed.vcf.gz" + else: + return f"{pangenome_prefix}.vcf.gz" + + def get_primer_bed(wc): if isinstance(primer_panels, pd.DataFrame): if not pd.isna(primer_panels.loc[wc.panel, "fa2"]): @@ -618,6 +629,13 @@ def get_read_group(wildcards): ) +def get_vg_read_group(wildcards): + platform = extract_unique_sample_column_value(wildcards.sample, "platform") + return r"--RGLB lib1 --RGPL {platform} --RGPU {sample} --RGSM {sample} --RGID {sample}".format( + sample=wildcards.sample, platform=platform + ) + + def get_map_reads_sorting_params(wildcards, ordering=False): match (sample_has_umis(wildcards.sample), ordering): case (True, True): @@ -630,6 +648,13 @@ def get_map_reads_sorting_params(wildcards, ordering=False): return "samtools" +def get_add_readgroup_input(wildcards): + if sample_has_umis(wildcards.sample): + return "results/mapped/vg/{sample}.annotated.bam" + else: + return "results/mapped/vg/{sample}.mate_fixed.bam" + + def get_mutational_burden_targets(): mutational_burden_targets = [] if is_activated("mutational_burden"): @@ -680,15 +705,19 @@ def get_selected_annotations(): return selection -def get_annotated_bcf(wildcards): +def get_annotated_bcf(wildcards, index=False): + ext = ".csi" if index else "" selection = ( get_selected_annotations() if wildcards.calling_type == "variants" else "" ) - return "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf".format( - group=wildcards.group, - calling_type=wildcards.calling_type, - selection=selection, - scatteritem=wildcards.scatteritem, + return ( + "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf{ext}".format( + group=wildcards.group, + calling_type=wildcards.calling_type, + selection=selection, + scatteritem=wildcards.scatteritem, + ext=ext, + ) ) diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index 0df77964e..55d8d3515 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -19,6 +19,7 @@ rule filter_candidates_by_annotation: rule filter_by_annotation: input: bcf=get_annotated_bcf, + csi=partial(get_annotated_bcf, index=True), aux=get_annotation_filter_aux_files, output: "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf", diff --git a/workflow/rules/maf.smk b/workflow/rules/maf.smk index 27f885143..d6de45e68 100644 --- a/workflow/rules/maf.smk +++ b/workflow/rules/maf.smk @@ -33,12 +33,12 @@ rule group_vcf_to_maf: dpath="maf/primary_alias", within=config, default="tumor" ), vcf_control_alias_option=( - f'--vcf-normal-id {lookup(dpath= "maf/control_alias", within= config, default= "")}' + f'--vcf-normal-id {lookup(dpath = "maf/control_alias", within = config , default = "")}' if lookup(dpath="maf/control_alias", within=config, default=False) else "" ), normal_id=lambda wc: ( - f'--normal-id {wc.group}_{lookup(dpath= "maf/control_alias", within= config, default= "")}' + f'--normal-id {wc.group}_{lookup(dpath = "maf/control_alias", within = config , default = "")}' if lookup(dpath="maf/control_alias", within=config, default=False) else "" ), diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index aa764e5c0..fab5df796 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,4 +1,4 @@ -rule map_reads: +rule map_reads_bwa: input: reads=get_map_reads_input, idx=rules.bwa_index.output, @@ -15,6 +15,59 @@ rule map_reads: "v3.8.0/bio/bwa/mem" +# Create distance and minimizer index before mapping +# Otherwise it will be performed on the first execution leading to race conditions for multiple samples +rule map_reads_vg: + input: + reads=get_map_reads_input, + index=rules.vg_autoindex.output, + output: + temp("results/mapped/vg/{sample}.preprocessed.bam"), + log: + "logs/mapped/vg/{sample}.log", + benchmark: + "benchmarks/vg_giraffe/{sample}.tsv" + params: + extra="", + sorting="fgbio", + sort_order="queryname", + threads: 8 + wrapper: + "v5.3.0/bio/vg/giraffe" + + +# samtools fixmate requires querysorted input +rule fix_mate: + input: + "results/mapped/vg/{sample}.preprocessed.bam", + output: + temp("results/mapped/vg/{sample}.mate_fixed.bam"), + log: + "logs/samtools/fix_mate/{sample}.log", + threads: 8 + params: + extra="", + wrapper: + "v4.7.2/bio/samtools/fixmate" + + +# adding read groups is necessary because base recalibration throws errors +# for not being able to find read group information +rule add_read_group: + input: + "results/mapped/vg/{sample}.mate_fixed.bam", + output: + temp("results/mapped/vg/{sample}.bam"), + log: + "logs/picard/add_rg/{sample}.log", + params: + extra=get_vg_read_group, + resources: + mem_mb=1024, + wrapper: + "v2.3.2/bio/picard/addorreplacereadgroups" + + rule merge_untrimmed_fastqs: input: get_untrimmed_fastqs, @@ -41,6 +94,7 @@ rule sort_untrimmed_fastqs: "fgbio SortFastq -i {input} -o {output} 2> {log}" +# fgbio AnnotateBamsWithUmis requires querynamed sorted fastqs and bams rule annotate_umis: input: bam="results/mapped/{aligner}/{sample}.bam", diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 44b12b1ad..6d7d4725a 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -146,3 +146,57 @@ rule get_vep_plugins: "logs/vep/plugins.log", wrapper: "v3.3.5/bio/vep/plugins" + + +rule get_pangenome_haplotypes: + output: + temp(f"{pangenome_prefix}.vcf.gz"), + params: + url=config["ref"]["pangenome"]["vcf"], + log: + "logs/pangenome/haplotypes.log", + shell: + "curl -o {output} {params.url} 2> {log}" + + +rule rename_haplotype_contigs: + input: + f"{pangenome_prefix}.vcf.gz", + output: + temp("resources/haplotype_contigs_renamed.tsv"), + params: + expressions=config["ref"]["pangenome"].get("rename_expressions", []), + log: + "logs/pangenome/chrom_replacement.log", + conda: + "../envs/pysam.yaml" + script: + "../scripts/rename_contigs.py" + + +rule rename_haplotype_chroms: + input: + vcf="resources/{pangenome}.vcf.gz", + tsv="resources/haplotype_contigs_renamed.tsv", + output: + temp("resources/{pangenome}.renamed.vcf.gz"), + log: + "logs/pangenome/{pangenome}_renamed.log", + conda: + "../envs/bcftools.yaml" + shell: + "bcftools annotate --rename-chrs {input.tsv} {input.vcf} -Oz -o {output} 2> {log}" + + +rule vg_autoindex: + input: + ref=genome, + vcf=get_vg_autoindex_vcf(), + output: + multiext(pangenome_prefix, ".giraffe.gbz", ".dist", ".min"), + log: + "logs/vg/autoindex/pangenome.log", + cache: "omit-software" + threads: max(workflow.cores, 1) + wrapper: + "v5.3.0/bio/vg/autoindex" diff --git a/workflow/scripts/rename_contigs.py b/workflow/scripts/rename_contigs.py new file mode 100644 index 000000000..e5c177576 --- /dev/null +++ b/workflow/scripts/rename_contigs.py @@ -0,0 +1,15 @@ +import pysam + +sys.stderr = open(snakemake.log[0], "w") + +rename_expressions = snakemake.params.expressions + +vcf_file = pysam.VariantFile(snakemake.input[0]) +contigs = list(vcf_file.header.contigs) +vcf_file.close() + +with open(snakemake.output[0], "w") as out: + for contig in contigs: + for expression in snakemake.params.expressions: + updated_contig = eval(expression) + print(f"{contig}\t{updated_contig}", file=out)