diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e1d87bf..06d5d84 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -83,6 +83,7 @@ jobs:
 - mapping-bwa-mem-1
 - mapping-bwa-mem2-1
 - mapping-bwa-mem2-2
+- mappings-table
 - pileup-1
 - trimming-adapterremoval-1
 - trimming-adapterremoval-2
diff --git a/.gitignore b/.gitignore
index d8348dc..4de22ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ test/vep-cache.zip
 test/prep.log
 test/reference
 test/reference.tar.gz
+test/mappings.tsv
 test/samples.tsv
 test/samples-pe.tsv
 test/out-*
diff --git a/config/config.yaml b/config/config.yaml
index 433f46f..be704e1 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -7,11 +7,12 @@
 # Adjust the below dummy file paths and other options as needed.
 data:
-  # Needed. Samples table that lists all samples with their units (re-sequencing runs) and fastq files.
+  # Needed for the standard use case of calling variants from `fastq` input files.
+  # The samples table lists all samples with their units (re-sequencing runs) and fastq files.
   # The path to the table file itself, as well as the paths to the fastq files within the table need
   # to be absolute! No relative paths (e.g., `../data`), and no paths using shortcuts such as `~`.
   # See the wiki (https://github.com/lczech/grenepipe/wiki) for the expected format.
-  # The file name needs to end in ".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fas", ".fas.gz".
+  # The file names need to end in ".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fas", ".fas.gz".
   samples-table: "/path/to/data/samples.tsv"

   # Needed. Path to the reference genome in fasta format, uncompressed, or gzipped.
@@ -48,6 +49,12 @@ data:
   # column are 1, the number of rows in the table corresponds to the number of samples.
   samples-count: 0

+  # Advanced usage: Skip all fastq-related steps, i.e., the trimming, mapping, and the bam file
+  # processing, and instead start the whole pipeline with a given set of bam files, running only
+  # the variant calling and filtering.
+  # See https://github.com/moiexpositoalonsolab/grenepipe/wiki/Advanced-Usage#running-the-variant-calling-from-a-set-of-bam-files
+  mappings-table: ""
+
   # If the file provided by `reference-genome` above do not already exist, we can automatically
   # download it from ensembl (or any other URL), using the following specification.
   # When downloading from ensembl, try filling in the specificiation of your reference.
@@ -593,6 +600,9 @@ params:
   # processing all (optional) steps (filtering, clipping, dedup, base recalibration).
   # However, we can also run on the raw merged samples, where reads just have been mapped, sorted,
   # and merged per sample (all units of a sample into one bam file), but before any other processing.
+  # When running the pipeline from a set of bam files, that is, when processing a mappings-table
+  # instead of mapping the fastq files of the `samples-table` in the pipeline itself,
+  # the bam files from that table are used instead, and these settings here are ignored.
   # Valid values: "processed", "merged"
   stats-bams: "processed"
   flagstat-bams: "processed"
diff --git a/example/README.md b/example/README.md
index 84c0ca0..85177c9 100644
--- a/example/README.md
+++ b/example/README.md
@@ -42,6 +42,11 @@
 Furthermore, for testing, we provide a simple `regions.bed` file that restricts
 to just some of the regions in the chromosomes.
 Note that this is mostly an experimental feature at the moment
 (see the `config.yaml`, under `settings: restrict-regions`, for details).
+Lastly, the `mapping` directory contains mapped `bam` files of the samples, also for testing
+purposes. This can, for instance, be used to run the pipeline for variant calling
+starting from a given set of `bam` files, instead of mapping `fastq` files first.
+See [here](https://github.com/moiexpositoalonsolab/grenepipe/wiki/Advanced-Usage#running-only-parts-of-the-pipeline) for details on this.
+
 Pipeline
 ==============
diff --git a/example/mapping/S1.bam b/example/mapping/S1.bam
new file mode 100644
index 0000000..0ea5dac
Binary files /dev/null and b/example/mapping/S1.bam differ
diff --git a/example/mapping/S2.bam b/example/mapping/S2.bam
new file mode 100644
index 0000000..ef96329
Binary files /dev/null and b/example/mapping/S2.bam differ
diff --git a/example/mapping/S3.bam b/example/mapping/S3.bam
new file mode 100644
index 0000000..2c64ba6
Binary files /dev/null and b/example/mapping/S3.bam differ
diff --git a/test/cases/mappings-table.txt b/test/cases/mappings-table.txt
new file mode 100644
index 0000000..1197c22
--- /dev/null
+++ b/test/cases/mappings-table.txt
@@ -0,0 +1,2 @@
+samples-count: 0	samples-count: 3
+mappings-table: ""	mappings-table: "#BASEPATH#/test/mappings.tsv"
diff --git a/test/mappings-template.tsv b/test/mappings-template.tsv
new file mode 100644
index 0000000..c78d5c6
--- /dev/null
+++ b/test/mappings-template.tsv
@@ -0,0 +1,4 @@
+sample	bam
+S1	#BASEPATH#/example/mapping/S1.bam
+S2	#BASEPATH#/example/mapping/S2.bam
+S3	#BASEPATH#/example/mapping/S3.bam
diff --git a/test/run.sh b/test/run.sh
index e95560d..d37d291 100755
--- a/test/run.sh
+++ b/test/run.sh
@@ -41,9 +41,10 @@ if [[ ! -d ./test/reference ]]; then
     cp ./example/regions.bed ./test/reference/regions.bed
 fi

-# Copy the samples table, so that we can change the paths without changing the original,
+# Copy the samples tables, so that we can change the paths without changing the originals.
 # We need to use a different sed separator here that cannot occur in paths, to avoid conflict.
-cat ./test/samples_template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/samples.tsv
+cat ./test/samples-template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/samples.tsv
+cat ./test/mappings-template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/mappings.tsv

 # For seqprep, we also want a samples table with only paired-end reads.
 # So let's make a copy and remove the line that contains the single-end file.
diff --git a/test/samples_template.tsv b/test/samples-template.tsv
similarity index 100%
rename from test/samples_template.tsv
rename to test/samples-template.tsv
diff --git a/workflow/rules/initialize-bam.smk b/workflow/rules/initialize-bam.smk
new file mode 100644
index 0000000..0d0f7c1
--- /dev/null
+++ b/workflow/rules/initialize-bam.smk
@@ -0,0 +1,155 @@
+# =================================================================================================
+#     Read Mapping Table
+# =================================================================================================
+
+# Similar setup to what we do in `initialize-fastq.smk`, see there for details.
+if "global" in config:
+    raise Exception("Config key 'global' already defined. Someone messed with our setup.")
+else:
+    config["global"] = {}
+
+# Read the mappings table, and enforce string types in the index.
+config["global"]["samples"] = pd.read_csv(
+    config["data"]["mappings-table"], sep="\t", dtype=str
+).set_index(["sample"], drop=False)
+
+# Get a list of all sample names, in the same order as the input sample table.
+# We cannot simply deduplicate via a set here, as that messes up the sample
+# order, which we do not want... (good that we noticed that bug though!)
+# So instead, we iterate, and add sample names incrementally.
+config["global"]["sample-names"] = list()
+for index, row in config["global"]["samples"].iterrows():
+    s = row["sample"]
+    if s not in config["global"]["sample-names"]:
+        config["global"]["sample-names"].append(s)
+
+
+# Helper function for user output to summarize the samples found in the table.
+# In the fastq case, we also print the units here.
+# For bam files, it is simply the number of samples.
+def get_sample_units_print():
+    return str(len(config["global"]["sample-names"]))
+
+
+# Wildcard constraints: only allow sample names from the spreadsheet to be used
+wildcard_constraints:
+    sample="|".join(config["global"]["sample-names"]),
+
+
+# =================================================================================================
+#     Sample and File Validity Checks
+# =================================================================================================
+
+
+# Check that the number of samples fits with the expectation, if provided by the user.
+if config["data"].get("samples-count", 0) > 0:
+    if config["data"]["samples-count"] != len(config["global"]["sample-names"]):
+        raise Exception(
+            "Inconsistent number of samples found in the sample table ("
+            + str(len(config["global"]["sample-names"]))
+            + ") and the samples count consistency check provided in the config file under "
+            + "`data: samples-count` ("
+            + str(config["data"]["samples-count"])
+            + ")."
+        )
+
+# We recommend using absolute paths. Check that for the mappings table.
+if not os.path.isabs(config["data"]["mappings-table"]):
+    logger.warning(
+        "Path to the mappings table as provided in the config file is not an absolute path. "
+        "We recommend using absolute paths for all files.\n"
+    )
+
+# Run integrity checks similar to what we do for the fastq files.
+uniq_sample_names = list()
+problematic_filenames = 0
+relative_filenames = 0
+for index, row in config["global"]["samples"].iterrows():
+    if row["sample"] in uniq_sample_names:
+        raise Exception(
+            "Multiple rows with identical sample name found in samples table: "
+            + str(row["sample"])
+        )
+    uniq_sample_names.append(row["sample"])
+
+    # Do a check that the sample names are valid file names.
+    # They are used for file names, and would cause weird errors if they contain bad chars.
+    if not valid_filename(row["sample"]):
+        raise Exception(
+            "Invalid sample name found in samples table that contains characters "
+            + "which cannot be used as sample names for naming output files: "
+            + str(row["sample"])
+            + "; for maximum robustness, we only allow alpha-numerical, dots, dashes, "
+            "and underscores. "
+            + "Use for example the script `tools/copy-samples.py --mode link [...] --clean` "
+            + "to create a new samples table and symlinks to the existing files to solve this."
+        )
+
+    # Do a check of the bam file names.
+    if not os.path.isfile(row["bam"]):
+        raise Exception(
+            "Input bam file listed in the input files mappings table "
+            + config["data"]["mappings-table"]
+            + " not found: "
+            + str(row["bam"])
+        )
+    if not valid_filepath(row["bam"]):
+        problematic_filenames += 1
+    if not os.path.isabs(row["bam"]):
+        relative_filenames += 1
+
+# Warnings about input names and files.
+if problematic_filenames > 0:
+    logger.warning(
+        str(problematic_filenames)
+        + " of the "
+        + str(len(config["global"]["sample-names"]))
+        + " input samples listed in the input files mappings table "
+        + config["data"]["mappings-table"]
+        + " contain problematic characters. We generally advise to only use alpha-numeric "
+        "characters, dots, dashes, and underscores. "
+        "We will try to continue running with these files, but it might lead to errors.\n"
+    )
+if relative_filenames > 0:
+    logger.warning(
+        str(relative_filenames)
+        + " of the "
+        + str(len(config["global"]["sample-names"]))
+        + " input samples listed in the input files mappings table "
+        + config["data"]["mappings-table"]
+        + " use relative file paths. We generally advise to only use absolute paths. "
+        "We will try to continue running with these files, but it might lead to errors.\n"
+    )
+
+
+# Check if a given string can be converted to a number, https://stackoverflow.com/q/354038/4184258
+def is_number(s):
+    try:
+        float(s)  # for int, long, float
+    except ValueError:
+        return False
+    return True
+
+
+# We check if any sample names are only numbers, and warn about this. Bioinformatics is messy...
+numeric_sample_names = 0
+for sn in config["global"]["sample-names"]:
+    if is_number(sn):
+        numeric_sample_names += 1
+
+if numeric_sample_names > 0:
+    logger.warning(
+        str(numeric_sample_names)
+        + " of the "
+        + str(len(config["global"]["sample-names"]))
+        + " input sample names listed in the input files mappings table "
+        + config["data"]["mappings-table"]
+        + " are just numbers, or can be converted to numbers. We generally advise to avoid this, "
+        "as it might confuse downstream processing such as Excel and R. The same applies for names "
+        'that can be converted to dates ("Dec21"), but we do not check this here. '
+        "We will now continue running with these files, as we can work with this here, "
+        "but recommend to change the sample names.\n"
+    )
+
+del uniq_sample_names, problematic_filenames, relative_filenames
+del numeric_sample_names
diff --git a/workflow/rules/initialize-fastq.smk b/workflow/rules/initialize-fastq.smk
index cb8f4ab..a1d753e 100644
--- a/workflow/rules/initialize-fastq.smk
+++ b/workflow/rules/initialize-fastq.smk
@@ -39,12 +39,6 @@ config["global"]["unit-names"] = list(
 )


-# Wildcard constraints: only allow sample names from the spreadsheet to be used
-wildcard_constraints:
-    sample="|".join(config["global"]["sample-names"]),
-    unit="|".join(config["global"]["unit-names"]),
-
-
 # Helper function to get a list of all units of a given sample name.
 def get_sample_units(sample):
     res = list()
@@ -65,6 +59,12 @@ def get_sample_units_print():
     )


+# Wildcard constraints: only allow sample names from the spreadsheet to be used
+wildcard_constraints:
+    sample="|".join(config["global"]["sample-names"]),
+    unit="|".join(config["global"]["unit-names"]),
+
+
 # =================================================================================================
 #     Sample and File Validity Checks
 # =================================================================================================
@@ -76,7 +76,8 @@ if config["data"].get("samples-count", 0) > 0:
         raise Exception(
             "Inconsistent number of samples found in the sample table ("
             + str(len(config["global"]["sample-names"]))
-            + ") and the samples count consistency check provided in the config file ("
+            + ") and the samples count consistency check provided in the config file under "
+            + "`data: samples-count` ("
             + str(config["data"]["samples-count"])
             + ")."
         )
@@ -124,7 +125,7 @@ for index, row in config["global"]["samples"].iterrows():
         not pd.isnull(row["fq2"]) and not os.path.isfile(row["fq2"])
     ):
         raise Exception(
-            "Input fastq files listed in the input files table "
+            "Input fastq files listed in the input files samples table "
             + config["data"]["samples-table"]
             + " not found: "
             + str(row["fq1"])
@@ -146,7 +147,7 @@ if problematic_filenames > 0:
         str(problematic_filenames)
         + " of the "
         + str(len(config["global"]["sample-names"]))
-        + " input samples listed in the input files table "
+        + " input samples listed in the input files samples table "
         + config["data"]["samples-table"]
         + " contain problematic characters. We generally advise to only use alpha-numeric "
         "characters, dots, dashes, and underscores. "
@@ -159,7 +160,7 @@ if relative_filenames > 0:
         str(relative_filenames)
         + " of the "
         + str(len(config["global"]["sample-names"]))
-        + " input samples listed in the input files table "
+        + " input samples listed in the input files samples table "
         + config["data"]["samples-table"]
         + " use relative file paths. We generally advise to only use absolute paths. "
         "Use for example the script `tools/generate-table.py` "
@@ -167,7 +168,6 @@ if relative_filenames > 0:
         "We will try to continue running with these files, but it might lead to errors.\n"
     )

-del problematic_filenames, relative_filenames

 # Check if a given string can be converted to a number, https://stackoverflow.com/q/354038/4184258
@@ -190,7 +190,7 @@ if numeric_sample_names > 0:
         str(numeric_sample_names)
         + " of the "
         + str(len(config["global"]["sample-names"]))
-        + " input sample names listed in the input files table "
+        + " input sample names listed in the input files samples table "
         + config["data"]["samples-table"]
         + " are just numbers, or can be converted to numbers. We generally advise to avoid this, "
         "as it might confuse downstream processing such as Excel and R. The same applies for names "
@@ -199,4 +199,5 @@ if numeric_sample_names > 0:
         "but recommend to change the sample names.\n"
     )

+del problematic_filenames, relative_filenames
 del numeric_sample_names
diff --git a/workflow/rules/initialize.smk b/workflow/rules/initialize.smk
index a1925cc..599ade0 100644
--- a/workflow/rules/initialize.smk
+++ b/workflow/rules/initialize.smk
@@ -34,10 +34,14 @@ snakemake.utils.validate(config, schema="../schemas/config.schema.yaml")
 report: os.path.join(workflow.basedir, "reports/workflow.rst")


-# Include the functions neeed to initialize the pipeline for analysing a set of fastq samples.
-# This reads the samples table, and provides validation and user output functions for it.
+# Include the functions needed to initialize the pipeline for analysing a set of fastq samples,
+# or, if the mappings table is given, starting from there instead.
+# This reads the samples/mappings table, and provides validation and user output functions for it.
 include: "initialize-reference.smk"
-include: "initialize-fastq.smk"
+if "mappings-table" in config["data"] and config["data"]["mappings-table"]:
+    include: "initialize-bam.smk"
+else:
+    include: "initialize-fastq.smk"


 # =================================================================================================
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
index fd84bc3..cacd4df 100644
--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -2,16 +2,24 @@
 #     Read Group and Helper Functions
 # =================================================================================================

-
 # Generic rule for all bai files.
 # Bit weird as it produces log files with nested paths, but that's okay for now.
+# To avoid having absolute paths as part of the log file output, though,
+# which can happen for the absolute paths provided by the mappings table,
+# we prepend an underscore, just to silence the snakemake warning...
+# Super ugly, but log file paths do not work with lambdas or functions in snakemake,
+# so there is no easy solution, and introducing separate bam index rules
+# for every case where we need them would not be any better.
+# So we live with the ugly underscore and nested paths... It's just a log file, after all.
 rule bam_index:
     input:
         "{prefix}.bam",
     output:
         "{prefix}.bam.bai",
     log:
-        "logs/mapping/samtools-index/{prefix}.log",
+        "logs/mapping/samtools-index/_" + "{prefix}.log",
     group:
         "mapping_extra"
     wrapper:
@@ -279,6 +287,12 @@ if config["settings"]["recalibrate-base-qualities"]:
 #     Final Mapping Result
 # =================================================================================================

+# Helper function for the case that a `mappings-table` is provided, in which case we do not run
+# any mapping ourselves, and skip all of the above. Instead, we then simply want to use the
+# sample bam file from the table here.
+def get_bam_from_mappings_table(sample):
+    return config["global"]["samples"].loc[sample, ["bam"]].dropna()
+
 # At this point, we have several choices of which files we want to hand down to the next
 # pipleine step. We offer a function so that downstream does not have to deal with this.
 # The way this works is as follows: Each optional step that could be applied
@@ -287,25 +301,34 @@ if config["settings"]["recalibrate-base-qualities"]:
 # (see above), what we get here is the file name of the last step that we want to apply.


-def get_mapping_result(bai=False):
-    # case 1: no duplicate removal
-    f = "mapping/merged/{sample}.bam"
+def get_mapping_result(sample, bai=False):
+    # Special case: we are using the mappings table of bam files,
+    # instead of any of the ones produced with the rules here.
+    if "mappings-table" in config["data"] and config["data"]["mappings-table"]:
+        f = get_bam_from_mappings_table(sample)

-    # case 2: filtering via samtools view
-    if config["settings"]["filter-mapped-reads"]:
-        f = "mapping/filtered/{sample}.bam"
+    else:
+        # All other cases: pick the "final" bam file according to the settings;
+        # that is the one to be used downstream from here.
-    # case 3: clipping reads with BamUtil
-    if config["settings"]["clip-read-overlaps"]:
-        f = "mapping/clipped/{sample}.bam"
+        # case 1: no duplicate removal
+        f = "mapping/merged/{sample}.bam".format(sample=sample)
+
+        # case 2: filtering via samtools view
+        if config["settings"]["filter-mapped-reads"]:
+            f = "mapping/filtered/{sample}.bam".format(sample=sample)
+
+        # case 3: clipping reads with BamUtil
+        if config["settings"]["clip-read-overlaps"]:
+            f = "mapping/clipped/{sample}.bam".format(sample=sample)

-    # case 4: remove duplicates
-    if config["settings"]["remove-duplicates"]:
-        f = "mapping/dedup/{sample}.bam"
+        # case 4: remove duplicates
+        if config["settings"]["remove-duplicates"]:
+            f = "mapping/dedup/{sample}.bam".format(sample=sample)

-    # case 5: recalibrate base qualities
-    if config["settings"]["recalibrate-base-qualities"]:
-        f = "mapping/recal/{sample}.bam"
+        # case 5: recalibrate base qualities
+        if config["settings"]["recalibrate-base-qualities"]:
+            f = "mapping/recal/{sample}.bam".format(sample=sample)

     # Additionally, this function is run for getting bai files as well
     if bai:
@@ -320,12 +343,12 @@
 # setting (whether to remove duplicates, and whether to recalibrate the base qualities),
 # by using the get_mapping_result function, that gives the respective files depending on the config.
 def get_sample_bams(sample):
-    return expand(get_mapping_result(), sample=sample)
+    return get_mapping_result(sample)


 # Return the bai file(s) for a given sample
 def get_sample_bais(sample):
-    return expand(get_mapping_result(True), sample=sample)
+    return get_mapping_result(sample, True)


 # Return the bam file(s) for a sample, given a wildcard param from a rule.
@@ -342,16 +365,16 @@ def get_sample_bais_wildcards(wildcards):
 def get_all_bams():
     # Make a list of all bams in the order as the samples list.
     res = list()
-    for smp in config["global"]["sample-names"]:
-        res.append(get_mapping_result().format(sample=smp))
+    for sample in config["global"]["sample-names"]:
+        res.append(get_mapping_result(sample))
     return res


 # Return the bai file(s) for all samples
 def get_all_bais():
     res = list()
-    for smp in config["global"]["sample-names"]:
-        res.append(get_mapping_result(True).format(sample=smp))
+    for sample in config["global"]["sample-names"]:
+        res.append(get_mapping_result(sample, True))
     return res
@@ -374,6 +397,7 @@
 rule all_bams:
     input:
         bams=get_all_bams(),
+        qc="qc/multiqc.html",
     output:
         bams=expand("mapping/final/{sample}.bam", sample=config["global"]["sample-names"]),
         done=touch("mapping/final.done"),
diff --git a/workflow/rules/qc-bam.smk b/workflow/rules/qc-bam.smk
index c0acc0c..52b8c5e 100644
--- a/workflow/rules/qc-bam.smk
+++ b/workflow/rules/qc-bam.smk
@@ -6,13 +6,15 @@ import platform
 # have been run. Here, we offer a simple function to switch between these inputs,
 # in order to avoid duplicating thise code below.
 # It expects the tool and key from the config.yaml
-def bam_qc_input(tool, key):
+def bam_qc_input(tool, key, wildcards):
     if key not in config["params"][tool]:
         raise Exception("config key " + key + " not found for tool " + tool)
-    if config["params"][tool][key] == "processed":
-        return get_mapping_result()
+    if "mappings-table" in config["data"] and config["data"]["mappings-table"]:
+        return get_sample_bams(wildcards.sample)
+    elif config["params"][tool][key] == "processed":
+        return get_sample_bams(wildcards.sample)
     elif config["params"][tool][key] == "merged":
-        return "mapping/merged/{sample}.bam"
+        return "mapping/merged/{sample}.bam".format(sample=wildcards.sample)
     else:
         raise Exception(
             "Unknown setting for " + tool + " " + key + ": " + config["params"][tool][key]
@@ -26,7 +28,7 @@

 rule samtools_stats:
     input:
-        bam_qc_input("samtools", "stats-bams"),
+        lambda wildcards: bam_qc_input("samtools", "stats-bams", wildcards),
     output:
         "qc/samtools-stats/{sample}.txt",
     log:
@@ -54,7 +57,7 @@ localrules:

 rule samtools_flagstat:
     input:
-        bam_qc_input("samtools", "flagstat-bams"),
+        lambda wildcards: bam_qc_input("samtools", "flagstat-bams", wildcards),
     output:
         "qc/samtools-flagstat/{sample}.txt",
     log:
@@ -88,7 +92,7 @@ localrules:

 # Rule for running qualimap for each bam file of a sample, with its units merged.
 rule qualimap_sample:
     input:
-        bam_qc_input("qualimap", "bams"),
+        lambda wildcards: bam_qc_input("qualimap", "bams", wildcards),
     output:
         # The output of this rule (for now) is a directory, so we have no way of telling whether
         # the rule succeeded by that. We hence add the `done` file here as well, which (according
@@ -188,10 +193,10 @@ def picard_collectmultiplemetrics_exts():
     return res


 rule picard_collectmultiplemetrics:
     input:
-        bam=bam_qc_input("picard", "CollectMultipleMetrics-bams"),
+        bam=lambda wildcards: bam_qc_input("picard", "CollectMultipleMetrics-bams", wildcards),
         ref=config["data"]["reference-genome"],
     output:
         expand("qc/picard/{{sample}}{ext}", ext=picard_collectmultiplemetrics_exts()),
     log:
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 74282bf..5a081b5 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -1,5 +1,11 @@
+# We use a switch to decide whether we are running the pipeline normally with a set of fastq files,
+# or whether we are instead starting from a given set of bam files.
+# In the latter case, some of the fastq and bam qc tools are not applicable.
+use_fastq_input = not ("mappings-table" in config["data"] and config["data"]["mappings-table"])
+
 # We outsource the individual QC steps for clarity.
-include: "qc-fastq.smk"
+if use_fastq_input:
+    include: "qc-fastq.smk"
 include: "qc-bam.smk"
 include: "qc-vcf.smk"
@@ -17,17 +23,18 @@
 # rule in the snakemake log be incredibly long. However, the multiqc wrapper that we are using uses
 # the directories of these files anyway, so we can just start with these in the first place.
 # That is, our `done` files are in the same dir as the actual QC files, so multiqc still finds them.
+
 rule multiqc:
     input:
         # Fastq QC tools
-        "qc/fastqc/fastqc.done",
-        "trimming/trimming-reports.done",
+        "qc/fastqc/fastqc.done" if use_fastq_input else [],
+        "trimming/trimming-reports.done" if use_fastq_input else [],
         # Mapping QC tools
         "qc/samtools-stats/samtools-stats.done",
         "qc/samtools-flagstat/samtools-flagstat.done",
         "qc/qualimap/qualimap.done",
         "qc/picard/collectmultiplemetrics.done",
-        get_dedup_done() if config["settings"]["remove-duplicates"] else [],
+        get_dedup_done() if use_fastq_input and config["settings"]["remove-duplicates"] else [],
         # VCF QC tools, if requested
         # We only need the stats.vchk file of bcftools stats, but request the pdf here as well,
         # so that the bctools internal plots get generated by the bcftools_stats_plot rule as well.
@@ -37,8 +44,8 @@
         "annotation/snpeff.csv" if config["settings"]["snpeff"] else [],
         "annotation/vep_summary.html" if config["settings"]["vep"] else [],
         # Damage profiling, if requested
-        "damage/mapdamage/mapdamage.done" if config["settings"]["mapdamage"] else [],
-        "damage/damageprofiler/damageprofiler.done" if config["settings"]["damageprofiler"] else [],
+        "damage/mapdamage/mapdamage.done" if use_fastq_input and config["settings"]["mapdamage"] else [],
+        "damage/damageprofiler/damageprofiler.done" if use_fastq_input and config["settings"]["damageprofiler"] else [],
     output:
         report("qc/multiqc.html", caption="../report/multiqc.rst", category="Quality control"),
         "qc/multiqc.zip",
@@ -74,6 +81,7 @@
 rule all_qc:
     input:
         # Quality control
         "qc/multiqc.html",
+        "mapping/final.done",
         # Reference genome statistics, provided in the prep steps
         config["data"]["reference-genome"] + ".seqkit",
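As a usage illustration (not part of the patch itself): the mappings table is a tab-separated file with a `sample` and a `bam` column, as shown in `test/mappings-template.tsv` above. The following is a minimal standalone sketch of how the new `initialize-bam.smk` consumes such a table; the table path is a placeholder.

```python
# Minimal sketch: read a mappings table the way initialize-bam.smk does.
# The path is a placeholder; the format mirrors test/mappings-template.tsv.
import pandas as pd

mappings = pd.read_csv("test/mappings.tsv", sep="\t", dtype=str).set_index(["sample"], drop=False)

# Collect sample names in table order, skipping duplicates, as the rule file does.
sample_names = []
for _, row in mappings.iterrows():
    if row["sample"] not in sample_names:
        sample_names.append(row["sample"])

# Resolve the bam path per sample, matching get_bam_from_mappings_table().
for name in sample_names:
    bam = mappings.loc[name, ["bam"]].dropna().iloc[0]
    print(name, "->", bam)
```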
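The control flow that the reworked `get_mapping_result` implements can be condensed as in the sketch below. This is illustrative only: a plain dict stands in for Snakemake's `config` object, and the mappings-table branch returns a placeholder string rather than a real table lookup.

```python
# Condensed sketch of the bam-selection logic from mapping.smk above: with a
# mappings table configured, the bam path comes straight from the table;
# otherwise the last enabled processing step defines the "final" bam.
config = {
    "data": {"mappings-table": ""},  # set to a table path to start from bam files
    "settings": {
        "filter-mapped-reads": False,
        "clip-read-overlaps": False,
        "remove-duplicates": True,
        "recalibrate-base-qualities": False,
    },
}

def final_bam(sample):
    if config["data"]["mappings-table"]:
        # In the pipeline, this is get_bam_from_mappings_table(sample).
        return "<bam path from mappings table>"
    f = "mapping/merged/{sample}.bam"
    if config["settings"]["filter-mapped-reads"]:
        f = "mapping/filtered/{sample}.bam"
    if config["settings"]["clip-read-overlaps"]:
        f = "mapping/clipped/{sample}.bam"
    if config["settings"]["remove-duplicates"]:
        f = "mapping/dedup/{sample}.bam"
    if config["settings"]["recalibrate-base-qualities"]:
        f = "mapping/recal/{sample}.bam"
    return f.format(sample=sample)

print(final_bam("S1"))  # -> mapping/dedup/S1.bam with the settings above
```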