
separated aggregate targets
ktmeaton committed Sep 22, 2020
1 parent 8c96a5a commit 69a1343
Showing 11 changed files with 233 additions and 64 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -24,6 +24,10 @@ and this project "attempts" to adhere to [Semantic Versioning](http://semver.org
- Github Actions: Convert local to remote narratives
- Break up snippy step in pipeline into snippy - snp stats - snpeff

## [v0.2.0] - 2020-09-22 - Snakemake

- Possibly change the "download" directories to "data"?

## [v0.1.4] - 2020-08-24 - SRA and Local Data

### Added
8 changes: 5 additions & 3 deletions config/snakemake.yaml
@@ -6,18 +6,20 @@ conda_eager_env : "nf-core-eager-2.2.0dev"
# SQLITE Parameters
sqlite_db : "yersinia_pestis_db.sqlite"
sqlite_select_command_asm : "SELECT AssemblyFTPGenbank FROM Master WHERE (BioSampleComment LIKE '%KEEP%Assembly%')"
-sqlite_select_command_sra : "SELECT BioSampleAccession,SRARunAccession,SRALibraryLayout,SRAFileURL FROM Master WHERE (BioSampleComment LIKE '%KEEP: EAGER Ancient%')"
+#sqlite_select_command_sra : "SELECT BioSampleAccession,SRARunAccession,SRALibraryLayout,SRAFileURL FROM Master WHERE (BioSampleComment LIKE '%KEEP: EAGER Ancient%')"
sqlite_select_command_ref : "SELECT AssemblyFTPGenbank FROM Master WHERE (BioSampleComment LIKE '%Reference%')"
max_datasets_assembly : 3
max_datasets_sra : 2

# Testing queries
sqlite_select_command_sra : "SELECT BioSampleAccession,SRARunAccession,SRALibraryLayout,SRAFileURL FROM Master WHERE (SRARunAccession = 'SRR1048902' OR SRARunAccession = 'SRR1048905')"

#reference_locus : "AL590842"
#reference_locus_start : "0"
#reference_locus_end : "4653728"

# Eager param
#eager_tsv : "example/local_data_eager.tsv"
-#eager_rev : "7b51863957"
+eager_rev : "7b51863957"
#eager_clip_readlength : 35
#eager_bwaalnn : 0.01
#eager_bwaalnl : 16
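Any of these values can also be overridden at invocation time rather than by editing the file, using Snakemake's --config flag. A minimal sketch (the key names come from this file; the values are illustrative only):

    # Cap both dataset types at 1 for a quick smoke test
    snakemake --configfile config/snakemake.yaml \
              --config max_datasets_sra=1 max_datasets_assembly=1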
21 changes: 17 additions & 4 deletions workflow/Snakefile
@@ -26,9 +26,12 @@ envs_dir = os.path.join(project_dir, "workflow", "envs")
logs_dir = os.path.join(project_dir, "workflow", "logs")
report_dir = os.path.join(project_dir, "workflow", "report")


# Include the config file
configfile: os.path.join(project_dir, "config", "snakemake.yaml")

db_path = os.path.join(results_dir, "sqlite_db", config["sqlite_db"])

# Sub snakefiles
include: "rules/sqlite_import.smk"
include: "rules/download.smk"
@@ -39,8 +42,6 @@ include: "rules/functions.smk"
# Report file
report: "report/workflow.rst"

-identify_sra_sample()

# -----------------------------------------------------------------------------#
# Main Target #
# -----------------------------------------------------------------------------#
@@ -50,8 +51,20 @@ rule all:
    The default pipeline target.
    """
    input:
-        "results/sqlite_import/download_sra.txt"
-        #expand("results/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"])
+        #---Database Import---#
+        #results_dir + "/sqlite_import/download_reference.txt",
+        #results_dir + "/sqlite_import/download_assembly.txt",
+        #results_dir + "/sqlite_import/eager_sra.tsv",
+        #---Data Download---#
+        #download_fna_output,
+        #download_ref_output,
+        #download_sra_output,
+        #---Alignment---#
+        #eager_sra_output2,
+        #snippy_pairwise_fna_output,
+        #results_dir + "/snippy_multi/snippy-core.txt"
+        # IQTREE
+        iqtree_output

# -----------------------------------------------------------------------------#
# Help and Usage #
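Since the aggregate targets are now plain Python lists defined in rules/functions.smk, a single pipeline stage can be run either by uncommenting its line in the rule all input above, or by asking Snakemake for any concrete file a stage produces. A sketch, assuming invocation from the project root:

    # Build only the EAGER input sheet for the SRA samples
    snakemake --cores 2 --use-conda results/sqlite_import/eager_sra.tsv

    # Run the default target, which currently resolves to iqtree_output
    snakemake --cores 2 --use-conda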
46 changes: 46 additions & 0 deletions workflow/envs/eager.yaml
@@ -0,0 +1,46 @@
name: nf-core-eager-2.2.0dev
channels:
  - defaults
  - bioconda
  - conda-forge
dependencies:
  - bioconda::rename=1.601
  - conda-forge::pymdown-extensions=7.1
  - conda-forge::pygments=2.6.1
  - conda-forge::markdown=3.2.2 # Don't upgrade anymore
  - conda-forge::openjdk=8.0.144
  - bioconda::fastqc=0.11.9
  - bioconda::adapterremoval=2.3.1
  - bioconda::adapterremovalfixprefix=0.0.5
  - bioconda::bwa=0.7.17
  - bioconda::picard=2.22.9
  - bioconda::samtools=1.9
  - bioconda::dedup=0.12.6
  - bioconda::angsd=0.933
  - bioconda::circularmapper=1.93.4
  - bioconda::gatk4=4.1.7.0
  - bioconda::qualimap=2.2.2d
  - bioconda::vcf2genome=0.91
  - bioconda::damageprofiler=0.4.9
  - bioconda::multiqc=1.9
  - bioconda::pmdtools=0.60
  - bioconda::bedtools=2.29.2
  - conda-forge::libiconv=1.15
  - conda-forge::pigz=2.3.4
  - bioconda::sequencetools=1.4.0.6
  - bioconda::preseq=2.0.3
  - bioconda::fastp=0.20.1
  - bioconda::bamutil=1.0.14
  - bioconda::mtnucratio=0.7
  - pysam=0.15.4 # Says Python 3.7 or less
  - python=3.7.3
  - bioconda::kraken2=2.0.9beta
  - conda-forge::pandas=1.0.4 # .4 is the Python 3.8+ compatible release
  - bioconda::freebayes=1.3.2 # Should be fine with Python 3.8, but the webpage says <3.7
  - bioconda::sexdeterrmine=1.1.1
  - bioconda::multivcfanalyzer=0.85.2
  - bioconda::hops=0.34
  - conda-forge::biopython=1.76
  - conda-forge::xopen=0.9.0
  - bioconda::bowtie2=2.4.1
  # Missing: Schmutzi, snpAD
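If this environment is meant to be used outside of Snakemake's --use-conda mechanism, it can presumably be instantiated directly with conda; the environment name below comes from the name: field above:

    conda env create -f workflow/envs/eager.yaml
    conda activate nf-core-eager-2.2.0dev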
18 changes: 18 additions & 0 deletions workflow/rules/alignment.smk
@@ -65,3 +65,21 @@ rule snippy_multi:
        --mask auto \
        --mask-char {params.mask_char} \
        `cat {input.all_dir_file}` 2> {log}"

# -----------------------------------------------------------------------------#
rule eager:
    """
    Pre-process and map SRA fastq samples to a reference genome with nf-core/eager.
    """
    input:
        eager_tsv = "results/sqlite_import/eager_sra.tsv",
        fastq = "results/download_{reads_origin}/{biosample}/{sra_acc}_1.fastq.gz",
    output:
        damageprofiler = "results/eager_{reads_origin}/damageprofiler/{sra_acc}_rmdup_{biosample}/DamagePlot.pdf"
    conda:
        os.path.join(envs_dir,"eager.yaml")
    log:
        os.path.join(logs_dir, "eager_{reads_origin}","{biosample}_{sra_acc}.log")
    shell:
        "echo testing nf-core/eager; "
        "touch {output.damageprofiler}"
26 changes: 10 additions & 16 deletions workflow/rules/download.smk
@@ -9,9 +9,9 @@ rule download_fna:
"""
message: "Downloading and decompressing fasta file {wildcards.sample}."
input:
"results/sqlite_import/{download_dir}.txt"
results_dir + "/sqlite_import/{download_dir}.txt"
output:
"results/{download_dir}/{sample}.fna"
results_dir + "/{download_dir}/{sample}.fna"
run:
for file in input:
with open(file) as temp_file:
@@ -24,22 +24,16 @@ rule download_sra:
"""
Download SRA fastq files.
"""
message: "Downloading and dumping fastq files for BioSample {wildcards.sample}."
message: "Downloading and dumping fastq files for BioSample {wildcards.biosample}."
input:
eager_tsv = "results/sqlite_import/eager_sra.tsv"
eager_tsv = results_dir + "/sqlite_import/eager_sra.tsv"
output:
fastq = "results/download_sra/{sample}.test"
fastq = results_dir + "/download_sra/{biosample}/{sra_acc}_1.fastq.gz"
conda:
os.path.join(envs_dir,"sra.yaml")
shell:
"{scripts_dir}/download_sra.sh {project_dir} results/download_sra {wildcards.sample} {input.eager_tsv}; "
#"touch {output.fastq}"


# -----------------------------------------------------------------------------#
rule aggregate_sra:
"""
Aggregate the needed sra files.
"""
input:
expand("results/download_sra/{sample}.test", sample=identify_sra_sample()),
"{scripts_dir}/download_sra.sh \
{project_dir} \
{results_dir}/download_sra/ \
{wildcards.biosample} \
{wildcards.sra_acc}"
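The script scripts/download_sra.sh is not part of this diff, so its contents are unknown here. A minimal sketch of a script honoring this calling convention, assuming sra-tools is what envs/sra.yaml provides:

    #!/usr/bin/env bash
    # Hypothetical sketch -- the real download_sra.sh may differ.
    # Usage: download_sra.sh <project_dir> <out_dir> <biosample> <sra_acc>
    set -euo pipefail
    project_dir="$1"; out_dir="$2"; biosample="$3"; sra_acc="$4"

    mkdir -p "${out_dir}/${biosample}"
    # fasterq-dump writes <sra_acc>_1.fastq (and _2.fastq for paired layouts)
    fasterq-dump --split-files -O "${out_dir}/${biosample}" "${sra_acc}"
    # Compress to match the rule's expected {sra_acc}_1.fastq.gz output
    gzip -f "${out_dir}/${biosample}/${sra_acc}"_*.fastq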
95 changes: 86 additions & 9 deletions workflow/rules/functions.smk
@@ -1,3 +1,5 @@
import sqlite3

def identify_reference_sample():
    """ Parse the sqlite database to identify the reference genome name."""
    sqlite_db_path = os.path.join(results_dir,"sqlite_db",config["sqlite_db"])
@@ -8,6 +10,17 @@ def identify_reference_sample():
    cur.close()
    return ref_name

def identify_reference_ftp():
    """ Parse the sqlite database to identify the reference genome FTP url."""
    sqlite_db_path = os.path.join(results_dir,"sqlite_db",config["sqlite_db"])
    conn = sqlite3.connect(sqlite_db_path)
    cur = conn.cursor()
    ref_url = cur.execute(config["sqlite_select_command_ref"]).fetchone()[0]
    ref_fna_gz = ref_url.split("/")[9] + "_genomic.fna.gz"
    ref_url = ref_url + "/" + ref_fna_gz
    cur.close()
    return ref_url

def identify_samples():
    """ Return all assembly and SRA sample names."""
    return identify_assembly_sample() + identify_sra_sample()
@@ -29,17 +42,81 @@ def identify_assembly_sample():
    cur.close()
    return asm_name_list

-def identify_sra_sample():
-    """ Parse the sqlite database to identify the SRA accessions."""
+def identify_assembly_ftp():
+    """ Parse the sqlite database to identify the assembly genome FTP url."""
    sqlite_db_path = os.path.join(results_dir,"sqlite_db",config["sqlite_db"])
    conn = sqlite3.connect(sqlite_db_path)
    cur = conn.cursor()
    # Assembled Genome URLs
-    sra_acc_urls = cur.execute(config["sqlite_select_command_sra"]).fetchall()
-    sra_acc_list = []
-    for item in sra_acc_urls:
-        sra_acc_list.append(item[0])
-    # Filter based on max number of sra samples for analysis
-    sra_acc_list = sra_acc_list[0:config["max_datasets_sra"]]
+    asm_fna_urls = cur.execute(config["sqlite_select_command_asm"]).fetchall()
+    asm_ftp_list = []
+    for url_list in asm_fna_urls:
+        for url in url_list[0].split(";"):
+            if url:
+                asm_ftp_list.append(url + "/" + url.split("/")[9] + "_genomic.fna.gz")
+    # Filter based on max number of assemblies for analysis
+    asm_ftp_list = asm_ftp_list[0:config["max_datasets_assembly"]]
    cur.close()
-    return sra_acc_list
+    return asm_ftp_list


def identify_sra_sample():
    """
    Parse the sqlite database to identify the SRA accessions.
    Return a list of accessions and layouts.
    """
    sqlite_db_path = os.path.join(results_dir,"sqlite_db",config["sqlite_db"])
    conn = sqlite3.connect(sqlite_db_path)
    cur = conn.cursor()
    sra_fetch = cur.execute(config["sqlite_select_command_sra"]).fetchall()
    sra_sample_dict = {"biosample": [], "sra_acc": []}
    for record in sra_fetch:
        if record:
            sra_sample_dict["biosample"].append(record[0])
            sra_sample_dict["sra_acc"].append(record[1])
    return(sra_sample_dict)

def sql_select(sqlite_db, query, i=0):
    '''Run select query on the sqlite db.'''
    conn = sqlite3.connect(sqlite_db)
    cur = conn.cursor()
    result = cur.execute(query).fetchall()
    result_col = [line[i] for line in result]
    cur.close()
    return result_col

def parse_eager_tsv(eager_tsv, column):
    '''Extract a column from the eager TSV file, excluding the header.'''
    with open(eager_tsv) as temp_file:
        return [line.strip().split("\t")[column] for line in temp_file][1:]


# Custom targets for aggregation rules

# Data Download
download_sra_output = expand(results_dir + "/download_sra/{biosample}/{sra_acc}_1.fastq.gz",
                             zip,
                             biosample=identify_sra_sample()["biosample"],
                             sra_acc=identify_sra_sample()["sra_acc"])

download_fna_output = expand(results_dir + "/download_assembly/{sample}.fna",
                             sample=identify_assembly_sample())

download_ref_output = expand(results_dir + "/download_reference/{sample}.fna",
                             sample=identify_reference_sample())

# Alignment
eager_sra_output1 = expand(results_dir + "/eager_sra/final_bams/{biosample}.bam",
                           biosample=identify_sra_sample()["biosample"])

eager_sra_output2 = expand(results_dir + "/eager_sra/damageprofiler/{sra_acc}_rmdup_{biosample}/DamagePlot.pdf",
                           zip,
                           sra_acc=identify_sra_sample()["sra_acc"],
                           biosample=identify_sra_sample()["biosample"])

snippy_pairwise_fna_output = expand(results_dir + "/snippy_pairwise/{sample}/{sample}_snippy.aligned.fa",
                                    sample=identify_assembly_sample())

# Phylogeny

iqtree_output = expand(results_dir + "/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"])
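The two SRA target lists rely on expand's zip combinator: zip pairs the biosample and sra_acc lists element-wise instead of taking their Cartesian product, so each run accession stays attached to its own BioSample. A small standalone illustration (the BioSample IDs are placeholders; the accessions are the two from the testing query in config/snakemake.yaml):

    from snakemake.io import expand

    paths = expand(
        "results/download_sra/{biosample}/{sra_acc}_1.fastq.gz",
        zip,  # element-wise pairing: 2 paths, not 2 x 2
        biosample=["SAMN00000001", "SAMN00000002"],  # hypothetical IDs
        sra_acc=["SRR1048902", "SRR1048905"],
    )
    # -> results/download_sra/SAMN00000001/SRR1048902_1.fastq.gz
    #    results/download_sra/SAMN00000002/SRR1048905_1.fastq.gz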
8 changes: 4 additions & 4 deletions workflow/rules/phylogeny.smk
@@ -8,13 +8,13 @@ rule iqtree:
    Construct a maximum likelihood phylogeny.
    """
    input:
-        snp_aln = "results/snippy_multi/snippy-core.full.aln",
+        snp_aln = results_dir + "/snippy_multi/snippy-core.full.aln",
    output:
-        report(expand("results/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"]),
+        report(expand(results_dir + "/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"]),
            caption=os.path.join(report_dir,"iqtree.rst"),
            category="Phylogenetics",
            subcategory="IQTREE"),
-        tree = expand("results/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"]),
+        tree = expand(results_dir + "/iqtree/iqtree.core-filter{missing_data}.treefile", missing_data = config["snippy_missing_data"]),
    params:
        outgroup = config["iqtree_outgroup"],
        seed = random.randint(0, 99999999),
@@ -36,4 +36,4 @@ rule iqtree:
        -seed {params.seed} \
        --runs {params.runs} \
        {params.other} \
-        -pre results/iqtree/iqtree.core-filter{params.missing_data}"
+        -pre {results_dir}/iqtree/iqtree.core-filter{params.missing_data}"
14 changes: 7 additions & 7 deletions workflow/rules/sqlite_import.smk
@@ -12,9 +12,9 @@ rule sqlite_import_reference:
    params:
        sql_command = config["sqlite_select_command_ref"],
    input:
-        db = "results/sqlite_db/" + config["sqlite_db"]
+        db = results_dir + "/sqlite_db/" + config["sqlite_db"]
    output:
-        ref_txt = "results/sqlite_import/download_reference.txt"
+        ref_txt = results_dir + "/sqlite_import/download_reference.txt"
    run:
        # Write the reference FTP url
        with open(output.ref_txt, "w") as temp_file:
@@ -25,10 +25,10 @@ rule sqlite_import_assembly:
    Import Assembly genome url from database.
    """
    input:
-        db = "results/sqlite_db/" + config["sqlite_db"]
+        db = results_dir + "/sqlite_db/" + config["sqlite_db"]
    output:
-        asm_txt = "results/sqlite_import/download_assembly.txt",
-        asm_snippy_dir = "results/snippy_multi/snippy_pairwise_assembly.txt"
+        asm_txt = results_dir + "/sqlite_import/download_assembly.txt",
+        asm_snippy_dir = results_dir + "/snippy_multi/snippy_pairwise_assembly.txt"
    run:
        # Write the assembly FTP url
        with open(output.asm_txt, "w") as temp_file:
@@ -49,9 +49,9 @@ rule sqlite_import_sra:
        organism = config["organism"],
        max_sra = config["max_datasets_sra"]
    input:
-        db = "results/sqlite_db/" + config["sqlite_db"]
+        db = results_dir + "/sqlite_db/" + config["sqlite_db"]
    output:
-        eager_tsv = "results/sqlite_import/eager_sra.tsv",
+        eager_tsv = results_dir + "/sqlite_import/eager_sra.tsv",
    shell:
        "{scripts_dir}/sqlite_EAGER_tsv.py \
        --database {input.db} \
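Because sample selection is driven entirely by the SELECT statements in config/snakemake.yaml, a query can be previewed directly against the database before a run. For example, a trimmed version of the testing query (assuming the database has already been built under results/sqlite_db/):

    sqlite3 results/sqlite_db/yersinia_pestis_db.sqlite \
      "SELECT BioSampleAccession,SRARunAccession FROM Master \
       WHERE (SRARunAccession = 'SRR1048902' OR SRARunAccession = 'SRR1048905')"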