Devel #8 (Open)
wants to merge 10 commits into base: main
5 changes: 1 addition & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -27,9 +27,6 @@ jobs:
      - uses: s-weigand/setup-conda@v1
      - name: Install the conda environment
        run: conda env update -n base -f environment.yml
-     - name: Prep data
-       run: |
-         gzip tests/data/*/*.fastq
      - name: Run workflow
        run: |
-         snakemake --use-conda --conda-frontend mamba -j 4 --directory tests/
+         snakemake --profile test
95 changes: 95 additions & 0 deletions README.md
@@ -169,6 +169,101 @@ you've already run the workflow to completion this will force the workflow to
rerun the intermediate steps since intermediate fastq files are removed when the
workflow finishes.

## Extracting a subset of samples
If you have a data directory with many fastq files and have generated
a sample list as described above (by letting the workflow search the contents
of a data directory), but only want to run the workflow on a subset of those
samples, there is a utility script at `workflow/scripts/extract_samples.py`. The
script takes as input:

1. the full sample list
2. a file with sample names as given by the sequencer (first column)
together with user defined sample names (second column) and
3. a search string

and then outputs the lines from the full sample list corresponding to samples
whose user-defined names match the specified search string.

For example:

Say you have a data directory `/data/sequencing1/` with the following data:
```
└── sequencing1
├── sample0001
│ ├── sample0001_R1.fastq.gz
│ └── sample0001_R2.fastq.gz
├── sample0002
│ ├── sample0002_R1.fastq.gz
│ └── sample0002_R2.fastq.gz
...
└── sample0010
├── sample0010_R1.fastq.gz
└── sample0010_R2.fastq.gz
```

Running the workflow with:

```bash
snakemake --config data_dir=data/sequencing1 -j 1 samples/sequencing1.tsv
```

would then generate a sample list `samples/sequencing1.tsv` like this:

| sample | R1 | R1_type | R1_exists | R2 | R2_type | R2_exists |
|------------|-----------------------------------------------------|---------------|-----------|----------------------------------------------------|---------------|-----------|
| sample0001 | data/sequencing1/sample0001/sample0001_R1.fastq.gz | <class 'str'> | yes | data/sequencing1/sample0001/sample0001_R2.fastq.gz | <class 'str'> | yes |
| sample0002 | data/sequencing1/sample0002/sample0002_R1.fastq.gz | <class 'str'> | yes | data/sequencing1/sample0002/sample0002_R2.fastq.gz | <class 'str'> | yes |
| sample0003 | data/sequencing1/sample0003/sample0003_R1.fastq.gz | <class 'str'> | yes | data/sequencing1/sample0003/sample0003_R2.fastq.gz | <class 'str'> | yes |
...

Now, say that you also have a file `sequencing1_sample_info.txt` with the
following contents:

```
SEQUENCING_ID USER ID
sample0001 SOIL_MAR_1
sample0002 SOIL_JUN_1
sample0003 WATER_MAR_1
sample0004 WATER_JUN_1
sample0005 SOIL_MAR_2
sample0006 SOIL_JUN_2
sample0007 WATER_MAR_1
sample0008 WATER_JUN_2
sample0009 seq1_neg_control1
sample0010 seq1_neg_control2
```

Note that this file can have more columns, but the sample names in its first
column must be prefixes of the sample names in the first column of the full
sample list (`samples/sequencing1.tsv` in this example).
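To illustrate the prefix requirement (a standalone sketch, not part of the workflow), the script anchors each matched sequencer name at the start of the sample name, so only samples whose names begin with a matched name are kept:

```python
import re

# Hypothetical sequencer names matched in the info file (first column)
matched = ["sample0001", "sample0003"]
# Anchor each name at the start of the string, as extract_samples.py does
sample_regex = re.compile("|".join(f"^{x}" for x in matched))

samples = ["sample0001", "sample0003", "sample0010"]
hits = [s for s in samples if sample_regex.search(s)]
print(hits)  # ['sample0001', 'sample0003']
```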

Now to extract only samples matching `_MAR_` in the `USER ID` column of `sequencing1_sample_info.txt`
we can run:

```bash
python workflow/scripts/extract_samples.py -s samples/sequencing1.tsv -i sequencing1_sample_info.txt -t "_MAR_"
```

which would output:
```
sample R1 R1_type R1_exists R2 R2_type R2_exists
sample0001 data/sequencing1/sample0001/sample0001_R1.fastq.gz <class 'str'> yes data/sequencing1/sample0001/sample0001_R2.fastq.gz <class 'str'> yes
sample0003 data/sequencing1/sample0003/sample0003_R1.fastq.gz <class 'str'> yes data/sequencing1/sample0003/sample0003_R2.fastq.gz <class 'str'> yes
sample0005 data/sequencing1/sample0005/sample0005_R1.fastq.gz <class 'str'> yes data/sequencing1/sample0005/sample0005_R2.fastq.gz <class 'str'> yes
sample0007 data/sequencing1/sample0007/sample0007_R1.fastq.gz <class 'str'> yes data/sequencing1/sample0007/sample0007_R2.fastq.gz <class 'str'> yes
sample0008 data/sequencing1/sample0008/sample0008_R1.fastq.gz <class 'str'> yes data/sequencing1/sample0008/sample0008_R2.fastq.gz <class 'str'> yes
```

The text string given with `-t` can be a regular expression. For example, to
output all samples except the two negative controls
(`seq1_neg_control1`, `seq1_neg_control2`) we could run:

```bash
python workflow/scripts/extract_samples.py -s samples/sequencing1.tsv -i sequencing1_sample_info.txt -t "_\d$"
```

where `-t "_\d$"` matches all samples whose user-defined names end with an underscore followed by a digit.
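As a quick sanity check of that pattern (a standalone sketch, not part of the workflow), `_\d$` keeps the month-numbered samples and drops the controls:

```python
import re

# A few user-defined names from the example info file
user_ids = ["SOIL_MAR_1", "WATER_JUN_2", "seq1_neg_control1"]
pattern = re.compile(r"_\d$")  # underscore followed by a single trailing digit
kept = [u for u in user_ids if pattern.search(u)]
print(kept)  # ['SOIL_MAR_1', 'WATER_JUN_2']
```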

## Explanation of the cutadapt steps

The workflow processes fastq files in four consecutive `cutadapt` steps:
4 changes: 2 additions & 2 deletions workflow/Snakefile
@@ -242,8 +242,8 @@ rule multiqc:
        logs=expand("logs/cutadapt/{sample}_trim_s{{s}}_R{i}.log",
            sample=samples.keys(), i=[1,2]),
    output:
-        "results/multiqc_{s}.html"
-    log: "logs/multiqc/multiqc_{s}.log"
+        "{dir}/multiqc_{s}.html"
+    log: "logs/{dir}multiqc/multiqc_{s}.log"
    message: "Generating MultiQC report for step {wildcards.s}"
    params:
        outdir=lambda wildcards, output: os.path.dirname(output[0]),
2 changes: 1 addition & 1 deletion workflow/envs/cutadapt.yml
@@ -1,7 +1,7 @@
name: cutadapt
channels:
-  - bioconda
  - conda-forge
+  - bioconda
  - defaults
dependencies:
  - cutadapt
2 changes: 1 addition & 1 deletion workflow/envs/fastqc.yml
@@ -1,7 +1,7 @@
name: fastqc
channels:
-  - bioconda
  - conda-forge
+  - bioconda
  - defaults
dependencies:
  - fastqc=0.11.9
67 changes: 67 additions & 0 deletions workflow/scripts/extract_samples.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python

import re
import sys
from argparse import ArgumentParser

import pandas as pd


def compile_regex(text, ignore_case=False):
    flags = 0
    if ignore_case:
        flags = re.IGNORECASE
    return re.compile(text, flags)


def main(args):
    regex = compile_regex(args.text, args.ignore_case)
    # read sample list from workflow
    sample_df = pd.read_csv(args.sample_list, sep="\t", index_col=0)
    # read NGI delivery list (first line is the header row)
    ngi_df = pd.read_csv(
        args.info_file, sep="\t", index_col=0, usecols=[0, 1], header=0,
        names=["NGI", "USER"]
    )
    # extract samples using text search against the user-defined names
    ngi_samples = list(ngi_df.loc[ngi_df["USER"].str.contains(regex)].index)
    sys.stderr.write(
        f"Matched {len(ngi_samples)} samples in {args.info_file} using {args.text}\n"
    )
    if len(ngi_samples) == 0:
        sys.exit("No samples matched, exiting\n")
    # create a regex based on matched sample strings, anchored at the start
    sample_regex = re.compile("|".join([f"^{x}" for x in ngi_samples]))
    # slice sample list using sample_regex
    extracted_samples = sample_df.loc[
        sample_df.index.str.contains(sample_regex, regex=True)
    ]
    sys.stderr.write(
        f"Matched {extracted_samples.shape[0]} samples in sample list {args.sample_list}\n"
    )
    extracted_samples.to_csv(sys.stdout, sep="\t")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "-s",
        "--sample_list",
        required=True,
        help="Sample list file from preprocessing workflow",
    )
    parser.add_argument(
        "-i",
        "--info_file",
        required=True,
        help="Sample info file with user ids in second column",
    )
    parser.add_argument(
        "-t",
        "--text",
        required=True,
        help="Regex text to use for searching sample names",
    )
    parser.add_argument(
        "--ignore_case", action="store_true", help="Ignore case in regex"
    )
    args = parser.parse_args()
    main(args)
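For illustration (a standalone sketch, not part of the PR), the `--ignore_case` flag simply adds `re.IGNORECASE` when the search pattern is compiled, so a lowercase pattern can match the uppercase user ids:

```python
import re

def compile_regex(text, ignore_case=False):
    # Mirrors the helper in extract_samples.py
    flags = re.IGNORECASE if ignore_case else 0
    return re.compile(text, flags)

print(bool(compile_regex("_mar_", ignore_case=True).search("SOIL_MAR_1")))  # True
print(bool(compile_regex("_mar_").search("SOIL_MAR_1")))  # False
```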