Skip to content

Commit

Permalink
convert sample-config to YAML-derived dictionary with defaults
Browse files Browse the repository at this point in the history
  • Loading branch information
agillen committed Aug 28, 2024
1 parent b131023 commit c07b3d6
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 72 deletions.
48 changes: 22 additions & 26 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,46 +1,42 @@
shell.executable("/bin/bash")

""" Snakemake pipeline for 10X Genomics 3' single-cell RNA-seq 3' end counting """
""" Snakemake pipeline for single-cell RNA-seq 3' end counting """

configfile: "config.yaml"

DATA = config["DATA"]
RESULTS = config["RESULTS"]
PAIRED_SAMPLES = config["PAIRED_SAMPLES"]
R1_SAMPLES = config["R1_SAMPLES"]
R2_SAMPLES = config["R2_SAMPLES"]
STAR_INDEX = config["STAR_INDEX"]
POLYA_SITES = config["POLYA_SITES"]
FASTQS = config["FASTQS"]
WHITELIST_V2 = config["WHITELIST_V2"]
WHITELIST_V3 = config["WHITELIST_V3"]
STAR = config["STAR"]
READS = ["paired", "R1", "R2"]
DEFAULTS = config["DEFAULTS"]
SAMPLES = config["SAMPLES"]
READS = ["R1", "R2", "paired"]

import json
with open('chemistry.json') as fp:
chemistry = json.load(fp)

# with open('platform.json') as fp:
# platform = json.load(fp)

# paired positional approach
PAIRs = []
if PAIRED_SAMPLES:
PAIRs.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
PAIRs.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
PAIRs.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = PAIRED_SAMPLES, read = "paired"))
# read 1 positional approach
R1s = []
if R1_SAMPLES:
R1s.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R1_SAMPLES, read = "R1"))
R1s.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = R1_SAMPLES, read = "R1"))
R1s.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = R1_SAMPLES, read = "R1"))
# read 2 trimmed approach
R2s = []
if R2_SAMPLES:
R2s.extend(expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
R2s.extend(expand("{results}/{sample}/{sample}_{read}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
R2s.extend(expand("{results}/bed/{sample}_{read}.bed.gz", results = RESULTS, sample = R2_SAMPLES, read = "R2"))
# combine
all_outputs = PAIRs + R1s + R2s #+ expand("{results}/report/multiqc_report.html", results = RESULTS)
def _get_config(sample, item):
try:
return SAMPLES[sample][item]
except KeyError:
return DEFAULTS[item]

# assemble outputs for rule all
SAMPLE_OUTS = []
for x in SAMPLES:
SAMPLE_OUTS.extend(expand("{results}/counts/{sample}_{alignments}_counts.tsv.gz", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))
SAMPLE_OUTS.extend(expand("{results}/{sample}/{sample}_{alignments}_Aligned.sortedByCoord.out.bam", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))
SAMPLE_OUTS.extend(expand("{results}/bed/{sample}_{alignments}.bed.gz", results = RESULTS, sample = x, alignments = _get_config(x, "alignments")))

# optionally add multiqc
all_outputs = SAMPLE_OUTS #+ expand("{results}/report/multiqc_report.html", results = RESULTS)
print(all_outputs)

rule all:
Expand Down
59 changes: 36 additions & 23 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
# config for scraps snakemake pipeline #

DATA:
# Data folder containing FASTQs
"sample_data/raw_data"

# results directory
RESULTS:
# results directory
"results"

# path to STAR executable, if not in PATH
STAR:
# path to STAR executable, if not in PATH
"STAR"

# path to STAR genome index
STAR_INDEX:
# path to STAR genome index
"index/cr2020A_star"


WHITELIST_V3:
# path to Chromium V3 whitelist
WHITELIST_V3:
"whitelist/3M-february-2018.txt"

# path to Chromium V2 whitelist
WHITELIST_V2:
# path to Chromium V2 whitelist
"whitelist/737K-august-2016.txt"

POLYA_SITES:
Expand All @@ -31,24 +32,36 @@ POLYA_SITES:
#POLYA_FORMAT:
# SAF or GTF, currently auto detecting filename .saf or .saf.gz, case-insensitive
# "SAF"

DEFAULTS:
# default config options, overridden by SAMPLES
chemisty: chromiumV3
platform: illumina
alignments:
- R2
- paired
extra_args: ""

# TSV relating all sequenced libraries to sane capture names and other metadata
FASTQS:
"sample_fastqs.tsv"

PAIRED_SAMPLES:
# Sample basenames
- test
- test2
- test3

R1_SAMPLES:
# Sample basenames
SAMPLES:
# per-sample labels, FASTQ basenames, and config options
# required:
# basename
# optional:
# chemisty
# platform
# alignments
# cutadapt_args
# star_args
test:
basename: sample
chemistry: chromiumV2
test2:
basename: dropseq
chemistry: dropseq
test3:
basename: microwellseq
chemistry: microwellseq

R2_SAMPLES:
- test
- test2
- test3

# for multiqc report
report_section_order:
Expand All @@ -57,4 +70,4 @@ report_section_order:
star:
order: 100
featureCounts:
order: -1000
order: -1000
4 changes: 1 addition & 3 deletions rules/count.snake
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
""" Extract correction length target"""
def _get_chem_length_R1(wildcards):
chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
args = chemistry[chem_version]["length"]
return args
return chemistry[_get_config(wildcards.sample, "chemistry")]["length"]

""" rules for scraps counting """

Expand Down
21 changes: 6 additions & 15 deletions rules/cutadapt_star.snake
Original file line number Diff line number Diff line change
Expand Up @@ -5,46 +5,37 @@ import pandas as pd

""" Snakemake rules for scraps cutadapt and STARsolo processing """

SAMPLES_DF = pd.read_csv(FASTQS, sep = "\t")

""" Extract per-sample fastq paths """
def _get_fq_paths(wildcards):
samples_filtered = SAMPLES_DF[(SAMPLES_DF.capture == wildcards.sample)]

fqs = map(lambda x: os.path.join(DATA, x + "*R1*"), samples_filtered.fastqs)
fqs = map(lambda x: os.path.join(DATA, x + "*R1*"), SAMPLES[wildcards.sample]["basename"])
fqs = map(lambda x: glob.glob(x), fqs)
fqs = list(chain.from_iterable(fqs))

return fqs

""" Process complex barcodes into simple, if needed"""
def _get_bc_cut(wildcards):
chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
args = chemistry[chem_version]["bc_cut"]
args = chemistry[_get_config(wildcards.sample, "chemistry")]["bc_cut"]
return args

""" Extract per-capture chemistry from gex libs (R2)"""
def _get_chem_version_R2(wildcards):
chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
args = eval(chemistry[chem_version]["R2"])
args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["R2"])
return args

""" Extract per-capture chemistry from gex libs (R1/paired)"""
def _get_chem_version_R1(wildcards):
chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
args = eval(chemistry[chem_version]["R1"])
args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["R1"])
return args

""" Extract per-capture extra arguments for gex paired alignments """
def _get_chem_version_paired(wildcards):
chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0]
args = eval(chemistry[chem_version]["paired"])
args = eval(chemistry[_get_config(wildcards.sample, "chemistry")]["paired"])
return args

""" Extract per-capture extra arguments for gex libs """
def _get_extra_args(wildcards):
captures_filtered = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample]
return list(captures_filtered.extra_args.unique())
return _get_config(wildcards.sample, "extra_args")

""" This rule trims R1-only libraries """
rule cutadapt_R1:
Expand Down
2 changes: 1 addition & 1 deletion rules/qc.snake
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

rule qc_check:
input:
expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = R1_SAMPLES, read = READS)
expand("{results}/counts/{sample}_{read}_counts.tsv.gz", results = RESULTS, sample = SAMPLES.keys(), read = READS)
output:
"{results}/report/multiqc_report.html"
params:
Expand Down
4 changes: 0 additions & 4 deletions sample_fastqs.tsv

This file was deleted.

0 comments on commit c07b3d6

Please sign in to comment.