Add variant calling from bam files workflow

moiexpositoalonsolab · Jul 15, 2024 · 114df0c · 114df0c
1 parent e898334
commit 114df0c
Show file tree

Hide file tree

Showing 17 changed files with 285 additions and 57 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -83,6 +83,7 @@ jobs:
  - mapping-bwa-mem-1
  - mapping-bwa-mem2-1
  - mapping-bwa-mem2-2
+ - mappings-table
  - pileup-1
  - trimming-adapterremoval-1
  - trimming-adapterremoval-2

diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ test/vep-cache.zip
 test/prep.log
 test/reference
 test/reference.tar.gz
+test/mappings.tsv
 test/samples.tsv
 test/samples-pe.tsv
 test/out-*

diff --git a/config/config.yaml b/config/config.yaml
@@ -7,11 +7,12 @@
 # Adjust the below dummy file paths and other options as needed.
 data:
 
- # Needed. Samples table that lists all samples with their units (re-sequencing runs) and fastq files.
+ # Needed for the standard use case of calling variants from `fastq` input files.
+ # The samples table lists all samples with their units (re-sequencing runs) and fastq files.
  # The path to the table file itself, as well as the paths to the fastq files within the table need
  # to be absolute! No relative paths (e.g., `../data`), and no paths using shortcuts such as `~`.
  # See the wiki (https://github.com/lczech/grenepipe/wiki) for the expected format.
- # The file name needs to end in ".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fas", ".fas.gz".
+ # The file names need to end in ".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fas", ".fas.gz".
  samples-table: "/path/to/data/samples.tsv"
 
  # Needed. Path to the reference genome in fasta format, uncompressed, or gzipped.
@@ -48,6 +49,12 @@ data:
  # column are 1, the number of rows in the table corresponds to the number of samples.
  samples-count: 0
 
+ # Advanced usage: Skip all fastq related steps, i.e., the trimming, mapping, and the bam file
+ # processing, and instead start the whole pipeline with a given set of bam files, running only
+ # the variant calling and filtering.
+ # See https://github.com/moiexpositoalonsolab/grenepipe/wiki/Advanced-Usage#running-the-variant-calling-from-a-set-of-bam-files
+ mappings-table: ""
+
  # If the file provided by `reference-genome` above do not already exist, we can automatically
  # download it from ensembl (or any other URL), using the following specification.
  # When downloading from ensembl, try filling in the specificiation of your reference.
@@ -593,6 +600,9 @@ params:
  # processing all (optional) steps (filtering, clipping, dedup, base recalibration).
  # However, we can also run on the raw merged samples, where reads just have been mapped, sorted,
  # and merged per sample (all units of a sample into one bam file), but before any other processing.
+ # When running the pipeline from a set of bam files, that is, when processing a mappings-table
+ # instead of mapping the the fastq files of the `samples-table` in the pipeline itself,
+ # the bam files from that table are used instead, and these settings here are ignored.
  # Valid values: "processed", "merged"
  stats-bams: "processed"
  flagstat-bams: "processed"

diff --git a/example/README.md b/example/README.md
@@ -42,6 +42,11 @@ Furthermore, for testing, we provide a simple `regions.bed` file that restricts
 to just some of the regions in the chromosomes. Note that this is mostly an experimental feature
 at the moment (see the `config.yaml`, under `settings: restrict-regions`, for details).
 
+Lastly, the `mapping` directory contains mapped `bam` files of the samples, also for testing
+purposes. This can for instance be used in order to run the pipeline for variant calling
+starting from a set of given `bam` files, instead of mapping `fastq` files first.
+See [here](https://github.com/moiexpositoalonsolab/grenepipe/wiki/Advanced-Usage#running-only-parts-of-the-pipeline) for details on this.
+
 
 Pipeline
 ==============

diff --git a/example/mapping/S1.bam b/example/mapping/S1.bam
diff --git a/example/mapping/S2.bam b/example/mapping/S2.bam
diff --git a/example/mapping/S3.bam b/example/mapping/S3.bam
diff --git a/test/cases/mappings-table.txt b/test/cases/mappings-table.txt
@@ -0,0 +1,2 @@
+samples-count: 0 samples-count: 3
+mappings-table: "" mappings-table: "#BASEPATH#/test/mappings.tsv"
diff --git a/test/mappings-template.tsv b/test/mappings-template.tsv
@@ -0,0 +1,4 @@
+sample bam
+S1 #BASEPATH#/example/mapping/S1.bam
+S2 #BASEPATH#/example/mapping/S2.bam
+S3 #BASEPATH#/example/mapping/S3.bam
diff --git a/test/run.sh b/test/run.sh
@@ -41,9 +41,10 @@ if [[ ! -d ./test/reference ]]; then
  cp ./example/regions.bed ./test/reference/regions.bed
 fi
 
-# Copy the samples table, so that we can change the paths without changing the original,
+# Copy the samples tables, so that we can change the paths without changing the original,
 # We need to use a different sed separator here that cannot occur in paths, to avoid conflict.
-cat ./test/samples_template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/samples.tsv
+cat ./test/samples-template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/samples.tsv
+cat ./test/mappings-template.tsv | sed "s?#BASEPATH#?${BASEPATH}?g" > ./test/mappings.tsv
 
 # For seqprep, we also want a samples table with only paired-end reads.
 # So let's make a copy and remove the line that contains the single-end file.

diff --git a/test/samples_template.tsv → test/samples-template.tsv b/test/samples_template.tsv → test/samples-template.tsv
diff --git a/workflow/rules/initialize-bam.smk b/workflow/rules/initialize-bam.smk
@@ -0,0 +1,155 @@
+# =================================================================================================
+# Read Mapping Table
+# =================================================================================================
+
+# Similar setup to what we do in `initialize-fastq.smk`, see there for details.
+if "global" in config:
+ raise Exception("Config key 'global' already defined. Someone messed with our setup.")
+else:
+ config["global"] = {}
+
+# Read mappings table, and enforce to use strings in the index
+config["global"]["samples"] = pd.read_csv(
+ config["data"]["mappings-table"], sep="\t", dtype=str
+).set_index(["sample"], drop=False)
+
+# Get a list of all samples names, in the same order as the input sample table.
+# We cannot use a simple approach here, as this messes up the sample
+# order, which we do not want... (good that we noticed that bug though!)
+# So instead, we iterate, and add sample names incrementally.
+config["global"]["sample-names"] = list()
+for index, row in config["global"]["samples"].iterrows():
+ s = row["sample"]
+ if s not in config["global"]["sample-names"]:
+ config["global"]["sample-names"].append(s)
+
+
+# Helper function for user output to summarize the samples found in the table.
+# In the fastq case, we here print the units as well.
+# For bam files, it's just simply the number of samples.
+def get_sample_units_print():
+ return str(len(config["global"]["sample-names"]))
+
+
+# Wildcard constraints: only allow sample names from the spreadsheet to be used
+wildcard_constraints:
+ sample="|".join(config["global"]["sample-names"]),
+
+
+# =================================================================================================
+# Sample and File Validity Checks
+# =================================================================================================
+
+
+# Check that the number of samples fits with the expectation, if provided by the user.
+if config["data"].get("samples-count", 0) > 0:
+ if config["data"]["samples-count"] != len(config["global"]["sample-names"]):
+ raise Exception(
+ "Inconsistent number of samples found in the sample table ("
+ + str(len(config["global"]["sample-names"]))
+ + ") and the samples count consistency check provided in the config file under "
+ + "`data: samples-count` ("
+ + str(config["data"]["samples-count"])
+ + ")."
+ )
+
+# We recommend to use absolute paths. Check that for the samples table.
+if not os.path.isabs(config["data"]["mappings-table"]):
+ logger.warning(
+ "Path to the samples table as provided in the config file is not an absolute path. "
+ "We recommend using absolute paths for all files.\n"
+ )
+
+# Run integrity checks similar to what we do for the fastq files.
+uniq_sample_names = list()
+problematic_filenames = 0
+relative_filenames = 0
+for index, row in config["global"]["samples"].iterrows():
+ if row["sample"] in uniq_sample_names:
+ raise Exception(
+ "Multiple rows with identical sample name found in samples table: "
+ + str(row["sample"])
+ )
+ uniq_sample_names.append(row["sample"])
+
+ # Do a check that the sample names are valid file names.
+ # They are used for file names, and would cause weird errors if they contain bad chars.
+ if not valid_filename(row["sample"]):
+ raise Exception(
+ "Invalid sample name name found in samples table that contains characters "
+ + "which cannot be used as sample names for naming output files: "
+ + str(row["sample"])
+ + "; for maximum robustness, we only allow alpha-numerical, dots, dashes, "
+ "and underscores. "
+ + "Use for example the script `tools/copy-samples.py --mode link [...] --clean` "
+ + "to create a new samples table and symlinks to the existing fastq files to solve this."
+ )
+
+ # Do a check of the fastq file names.
+ if not os.path.isfile(row["bam"]):
+ raise Exception(
+ "Input bam files listed in the input files mappings table "
+ + config["data"]["mappings-table"]
+ + " not found: "
+ + str(row["bam"])
+ )
+ if not valid_filepath(row["bam"]):
+ problematic_filenames += 1
+ if not os.path.isabs(row["bam"]):
+ relative_filenames += 1
+
+# Warning about input names and files.
+if problematic_filenames > 0:
+ logger.warning(
+ str(problematic_filenames)
+ + " of the "
+ + str(len(config["global"]["sample-names"]))
+ + " input samples listed in the input files mappings table "
+ + config["data"]["mappings-table"]
+ + " contain problematic characters. We generally advise to only use alpha-numeric "
+ "characters, dots, dashes, and underscores. "
+ "We will try to continue running with these files, but it might lead to errors.\n"
+ )
+if relative_filenames > 0:
+ logger.warning(
+ str(relative_filenames)
+ + " of the "
+ + str(len(config["global"]["sample-names"]))
+ + " input samples listed in the input files mappings table "
+ + config["data"]["mappings-table"]
+ + " use relative file paths. We generally advise to only use absolute paths. "
+ "We will try to continue running with these files, but it might lead to errors.\n"
+ )
+
+
+# Check if a given string can be converted to a number, https://stackoverflow.com/q/354038/4184258
+def is_number(s):
+ try:
+ float(s) # for int, long, float
+ except ValueError:
+ return False
+ return True
+
+
+# We check if any sample names are only numbers, and warn about this. Bioinformatics is messy...
+numeric_sample_names = 0
+for sn in config["global"]["sample-names"]:
+ if is_number(sn):
+ numeric_sample_names += 1
+
+if numeric_sample_names > 0:
+ logger.warning(
+ str(numeric_sample_names)
+ + " of the "
+ + str(len(config["global"]["sample-names"]))
+ + " input sample names listed in the input files mappings table "
+ + config["data"]["mappings-table"]
+ + " are just numbers, or can be converted to numbers. We generally advise to avoid this, "
+ "as it might confuse downstream processing such as Excel and R. The same applies for names "
+ 'that can be converted to dates ("Dec21"), but we do not check this here. '
+ "We will now continue running with these files, as we can work with this here, "
+ "but recommend to change the sample names.\n"
+ )
+
+del uniq_sample_names, problematic_filenames, relative_filenames
+del numeric_sample_names
diff --git a/workflow/rules/initialize-fastq.smk b/workflow/rules/initialize-fastq.smk
@@ -39,12 +39,6 @@ config["global"]["unit-names"] = list(
 )
 
 
-# Wildcard constraints: only allow sample names from the spreadsheet to be used
-wildcard_constraints:
- sample="|".join(config["global"]["sample-names"]),
- unit="|".join(config["global"]["unit-names"]),
-
-
 # Helper function to get a list of all units of a given sample name.
 def get_sample_units(sample):
  res = list()
@@ -65,6 +59,12 @@ def get_sample_units_print():
  )
 
 
+# Wildcard constraints: only allow sample names from the spreadsheet to be used
+wildcard_constraints:
+ sample="|".join(config["global"]["sample-names"]),
+ unit="|".join(config["global"]["unit-names"]),
+
+
 # =================================================================================================
 # Sample and File Validity Checks
 # =================================================================================================
@@ -76,7 +76,8 @@ if config["data"].get("samples-count", 0) > 0:
  raise Exception(
  "Inconsistent number of samples found in the sample table ("
  + str(len(config["global"]["sample-names"]))
- + ") and the samples count consistency check provided in the config file ("
+ + ") and the samples count consistency check provided in the config file under "
+ + "`data: samples-count` ("
  + str(config["data"]["samples-count"])
  + ")."
  )
@@ -124,7 +125,7 @@ for index, row in config["global"]["samples"].iterrows():
  not pd.isnull(row["fq2"]) and not os.path.isfile(row["fq2"])
  ):
  raise Exception(
- "Input fastq files listed in the input files table "
+ "Input fastq files listed in the input files samples table "
  + config["data"]["samples-table"]
  + " not found: "
  + str(row["fq1"])
@@ -146,7 +147,7 @@ if problematic_filenames > 0:
  str(problematic_filenames)
  + " of the "
  + str(len(config["global"]["sample-names"]))
- + " input samples listed in the input files table "
+ + " input samples listed in the input files samples table "
  + config["data"]["samples-table"]
  + " contain problematic characters. We generally advise to only use alpha-numeric "
  "characters, dots, dashes, and underscores. "
@@ -159,15 +160,14 @@ if relative_filenames > 0:
  str(relative_filenames)
  + " of the "
  + str(len(config["global"]["sample-names"]))
- + " input samples listed in the input files table "
+ + " input samples listed in the input files samples table "
  + config["data"]["samples-table"]
  + " use relative file paths. We generally advise to only use absolute paths. "
  "Use for example the script `tools/generate-table.py` "
  "to create a samples table with absolute paths. "
  "We will try to continue running with these files, but it might lead to errors.\n"
  )
 
-del problematic_filenames, relative_filenames
 
 
 # Check if a given string can be converted to a number, https://stackoverflow.com/q/354038/4184258
@@ -190,7 +190,7 @@ if numeric_sample_names > 0:
  str(numeric_sample_names)
  + " of the "
  + str(len(config["global"]["sample-names"]))
- + " input sample names listed in the input files table "
+ + " input sample names listed in the input files samples table "
  + config["data"]["samples-table"]
  + " are just numbers, or can be converted to numbers. We generally advise to avoid this, "
  "as it might confuse downstream processing such as Excel and R. The same applies for names "
@@ -199,4 +199,5 @@ if numeric_sample_names > 0:
  "but recommend to change the sample names.\n"
  )
 
+del problematic_filenames, relative_filenames
 del numeric_sample_names
diff --git a/workflow/rules/initialize.smk b/workflow/rules/initialize.smk
@@ -34,10 +34,14 @@ snakemake.utils.validate(config, schema="../schemas/config.schema.yaml")
 report: os.path.join(workflow.basedir, "reports/workflow.rst")
 
 
-# Include the functions neeed to initialize the pipeline for analysing a set of fastq samples.
-# This reads the samples table, and provides validation and user output functions for it.
+# Include the functions neeed to initialize the pipeline for analysing a set of fastq samples,
+# or, if the mappings table is given, starting from there.
+# This reads the samples/mappings table, and provides validation and user output functions for it.
 include: "initialize-reference.smk"
-include: "initialize-fastq.smk"
+if "mappings-table" in config["data"] and config["data"]["mappings-table"]:
+ include: "initialize-bam.smk"
+else:
+ include: "initialize-fastq.smk"
 
 
 # =================================================================================================