convert sample-config to YAML-derived dictionary with defaults

rnabioco · Aug 28, 2024 · c07b3d6 · c07b3d6
1 parent b131023
commit c07b3d6
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 72 deletions.
diff --git a/Snakefile b/Snakefile
@@ -1,46 +1,42 @@
 shell.executable("/bin/bash")
 
-""" Snakemake pipeline for 10X Genomics 3' single-cell RNA-seq 3' end counting """
+""" Snakemake pipeline for single-cell RNA-seq 3' end counting """
 
 configfile: "config.yaml"
 
 DATA = config["DATA"]
 RESULTS = config["RESULTS"]
-PAIRED_SAMPLES = config["PAIRED_SAMPLES"]
-R1_SAMPLES = config["R1_SAMPLES"]
-R2_SAMPLES = config["R2_SAMPLES"]
 STAR_INDEX = config["STAR_INDEX"]
 POLYA_SITES = config["POLYA_SITES"]
-FASTQS = config["FASTQS"]
 WHITELIST_V2 = config["WHITELIST_V2"]
 WHITELIST_V3 = config["WHITELIST_V3"]
 STAR = config["STAR"]
-READS = ["paired", "R1", "R2"]
+DEFAULTS = config["DEFAULTS"]
+SAMPLES = config["SAMPLES"]
+READS = ["R1", "R2", "paired"]
 
 import json
 with open('chemistry.json') as fp:
    chemistry = json.load(fp)
+
+# with open('platform.json') as fp:
+#    platform = json.load(fp)
 
-# paired positional approach
-PAIRs = []
-if PAIRED_SAMPLES:
-  PAIRs.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
-  PAIRs.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
-  PAIRs.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))  
-# read 1 positional approach
-R1s = []
-if R1_SAMPLES:
-  R1s.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R1_SAMPLES, read = "R1"))
-  R1s.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = R1_SAMPLES, read = "R1"))
-  R1s.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = R1_SAMPLES, read = "R1"))  
-# read 2 trimmed approach
-R2s = []
-if R2_SAMPLES:
-  R2s.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
-  R2s.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
-  R2s.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
-# combine
-all_outputs = PAIRs + R1s + R2s #+ expand("{results}/report/multiqc_report.html", results = RESULTS)
+def _get_config(sample, item):
+  try:
+    return SAMPLES[sample][item]
+  except KeyError:
+    return DEFAULTS[item]
+
+# assemble outputs for rule all
+SAMPLE_OUTS = []
+for x in SAMPLES:
+  SAMPLE_OUTS.extend(expand("{results}/counts/{sample}_{alignments}_counts.tsv.gz", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))
+  SAMPLE_OUTS.extend(expand("{results}/{sample}/{sample}_{alignments}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))
+  SAMPLE_OUTS.extend(expand("{results}/bed/{sample}_{alignments}.bed.gz", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))  
+
+# optionally add multiqc
+all_outputs = SAMPLE_OUTS #+ expand("{results}/report/multiqc_report.html", results = RESULTS)
 print(all_outputs)
 
 rule all:

diff --git a/config.yaml b/config.yaml
@@ -1,26 +1,27 @@
 # config for scraps snakemake pipeline #
+
 DATA: 
 # Data folder containing FASTQs
   "sample_data/raw_data"
 
-# results directory
 RESULTS:
+# results directory
   "results"
 
-# path to STAR executable, if not in PATH
 STAR:
+# path to STAR executable, if not in PATH
   "STAR"
 
-# path to STAR genome index
 STAR_INDEX:
+# path to STAR genome index
   "index/cr2020A_star"
-
+
+WHITELIST_V3:  
 # path to Chromium V3 whitelist
-WHITELIST_V3:
   "whitelist/3M-february-2018.txt"
 
-# path to Chromium V2 whitelist
 WHITELIST_V2:
+# path to Chromium V2 whitelist
   "whitelist/737K-august-2016.txt"
 
 POLYA_SITES:
@@ -31,24 +32,36 @@ POLYA_SITES:
 #POLYA_FORMAT:
 # SAF or GTF, currently auto detecting filename .saf or .saf.gz, case-insensitive
 #   "SAF"
+
+DEFAULTS:
+# default config options, overridden by SAMPLES
+  chemisty: chromiumV3
+  platform: illumina
+  alignments:
+    - R2
+    - paired
+  extra_args: ""
 
-# TSV relating all sequenced libraries to sane capture names and other metadata
-FASTQS:
-  "sample_fastqs.tsv"
-
-PAIRED_SAMPLES:
-# Sample basenames
-  - test
-  - test2
-  - test3
-
-R1_SAMPLES:
-# Sample basenames
+SAMPLES:
+# per-sample labels, FASTQ basenames, and config options
+# required:
+#   basename
+# optional:
+#   chemisty
+#   platform
+#   alignments
+#   cutadapt_args
+#   star_args
+  test:
+    basename: sample
+    chemistry: chromiumV2
+  test2:
+    basename: dropseq
+    chemistry: dropseq
+  test3:
+    basename: microwellseq
+    chemistry: microwellseq
 
-R2_SAMPLES:
-  - test
-  - test2
-  - test3
 
 # for multiqc report
 report_section_order:
@@ -57,4 +70,4 @@ report_section_order:
   star:
     order: 100
   featureCounts:
-    order: -1000
+    order: -1000
diff --git a/rules/count.snake b/rules/count.snake
@@ -1,8 +1,6 @@
 """ Extract correction length target"""
 def _get_chem_length_R1(wildcards):
-    chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
-    args = chemistry[chem_version]["length"]
-    return args
+    return chemistry[_get_config(wildcards.sample, "chemistry")]["length"]
 
 """ rules for scraps counting """
 

diff --git a/rules/cutadapt_star.snake b/rules/cutadapt_star.snake
@@ -5,46 +5,37 @@ import pandas as pd
 
 """ Snakemake rules for scraps cutadapt and STARsolo processing """
 
-SAMPLES_DF = pd.read_csv(FASTQS, sep = "\t")
-
 """ Extract per-sample fastq paths """
 def _get_fq_paths(wildcards):
-    samples_filtered = SAMPLES_DF[(SAMPLES_DF.capture == wildcards.sample)]
-
-    fqs = map(lambda x: os.path.join(DATA, x + "*R1*"), samples_filtered.fastqs)
+    fqs = map(lambda x: os.path.join(DATA, x + "*R1*"), SAMPLES[wildcards.sample]["basename"])
     fqs = map(lambda x: glob.glob(x), fqs)
     fqs = list(chain.from_iterable(fqs))
 
     return fqs
 
 """ Process complex barcodes into simple, if needed"""
 def _get_bc_cut(wildcards):
-    chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
-    args = chemistry[chem_version]["bc_cut"]
+    args = chemistry[_get_config(wildcards.sample, "chemistry")]["bc_cut"]
     return args
 
 """ Extract per-capture chemistry from gex libs (R2)"""
 def _get_chem_version_R2(wildcards):
-    chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
-    args = eval(chemistry[chem_version]["R2"])
+    args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["R2"])
     return args
 
 """ Extract per-capture chemistry from gex libs (R1/paired)"""
 def _get_chem_version_R1(wildcards):
-    chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
-    args = eval(chemistry[chem_version]["R1"])
+    args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["R1"])
     return args
 
 """ Extract per-capture extra arguments for gex paired alignments """
 def _get_chem_version_paired(wildcards):
-    chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
-    args = eval(chemistry[chem_version]["paired"])
+    args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["paired"])
     return args
 
 """ Extract per-capture extra arguments for gex libs """
 def _get_extra_args(wildcards):
-    captures_filtered = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample]
-    return list(captures_filtered.extra_args.unique())
+    return _get_config(wildcards.sample, "extra_args")
 
 """ This rule trims R1-only libraries """
 rule cutadapt_R1:

diff --git a/rules/qc.snake b/rules/qc.snake
@@ -2,7 +2,7 @@
 
 rule qc_check:
    input:
-     expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R1_SAMPLES, read = READS)
+     expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = SAMPLES.keys(), read = READS)
    output:
      "{results}/report/multiqc_report.html"
    params:

diff --git a/sample_fastqs.tsv b/sample_fastqs.tsv