Skip to content

Commit

Permalink
create and update db
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Mar 26, 2020
1 parent 411c886 commit a6dcf38
Showing 1 changed file with 242 additions and 124 deletions.
366 changes: 242 additions & 124 deletions pipeline0.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
Notes and Nomenclatures
- All channel objects must use the 'ch_' prefix in variable naming
- Flags to skip a process must use the 'skip_' prefix followed by the process name
- The process docstrings will only describe output channels, not other file types
- Process docstrings only document channels under 'Output'; all other files are listed under 'Publish'
- Verbosity for variable names is greatly preferred over succinctness.
- Input channel name should reflect the process currently operating on it
- Output channel name should reflect the process that will receive it
*/

// -------------------------------------------------------------------------- //
Expand Down Expand Up @@ -84,140 +86,256 @@ if (params.help){

log.info pipelineHeader()

// -------------------------------------------------------------------------- //
// NCBImeta Entry Point //
// -------------------------------------------------------------------------- //

// For now, use if statements rather than the internal 'when' directive
if (!params.skip_ncbimeta_db_create && params.ncbimeta_create){

  process ncbimeta_db_create{
    /*
    Run NCBImeta queries to generate db from scratch.

    Input:
    ch_ncbimeta_yaml_create (yaml): NCBImeta config file, read from the path in params.ncbimeta_create.

    Output:
    ch_ncbimeta_sqlite_update (sqlite): NCBImeta SQLite database for process ncbimeta_db_update.
    ch_ncbimeta_yaml_update (yaml): NCBImeta config file for process ncbimeta_db_update.

    Publish:
    ${params.ncbimeta_sqlite_db} (sqlite): NCBImeta SQLite database.
    ncbimeta_yaml (yaml): NCBImeta config file.
    *.log (text): Text logs of NCBImeta database creation.
    */

    // Other variables and config
    tag "$ncbimeta_yaml"
    echo true
    // Results are published twice: once as the 'create' record and once as the
    // 'latest' copy that a later update run reads from.
    publishDir "${params.outdir}/ncbimeta_db/create", mode: 'copy'
    publishDir "${params.outdir}/ncbimeta_db/update/latest", mode: 'copy'
    // The config file is read from a path parameter. The message must reference
    // params.ncbimeta_create (underscore): a hyphen here is parsed by Groovy as
    // a subtraction and fails instead of reporting the missing file.
    ch_ncbimeta_yaml_create = Channel.fromPath(params.ncbimeta_create, checkIfExists: true)
                                     .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_create}" }

    // IO and conditional behavior
    input:
    file ncbimeta_yaml from ch_ncbimeta_yaml_create
    output:
    file "${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" into ch_ncbimeta_sqlite_update
    file ncbimeta_yaml into ch_ncbimeta_yaml_update
    file "${params.ncbimeta_output_dir}/log/*.log"

    // Shell script to execute
    // NOTE(review): the script does not create ${params.ncbimeta_output_dir};
    // presumably NCBImeta.py creates it from the config - confirm.
    script:
    """
    NCBImeta.py --config ${ncbimeta_yaml}
    """
  }

}

if(!params.skip_ncbimeta_db_update && params.ncbimeta_update){

  process ncbimeta_db_update{
    /*
    Run NCBImeta queries to update, annotate, and join a previously created database.
    Note this requires supplying an absolute path to a database.

    Input:
    ch_ncbimeta_yaml_update (yaml): NCBImeta config file, read from the path in params.ncbimeta_update.
    ch_ncbimeta_annot_update (text): NCBImeta annotation file, read from the path in params.ncbimeta_annot.
    ch_ncbimeta_sqlite_update (sqlite): NCBImeta SQLite database, read from the path in params.ncbimeta_sqlite_db_latest.

    Output:
    ch_ncbimeta_sqlite_import (sqlite): NCBImeta SQLite database for process sqlite_import.

    Publish:
    ncbimeta_annot (text): NCBImeta annotation file.
    ncbimeta_yaml (yaml): NCBImeta config file.
    *.log (text): Text logs of NCBImeta database update.
    */

    // Other variables and config
    tag "$ncbimeta_sqlite"
    echo true
    // ISSUE: Can these be a symlink to each other (update and update/latest)?
    publishDir "${params.outdir}/ncbimeta_db/update/${workflow.start}_${workflow.runName}", mode: 'copy'
    // 'overwrite' takes a boolean; a quoted string is only truthy by accident.
    publishDir "${params.outdir}/ncbimeta_db/update/latest", mode: 'copy', overwrite: true
    // The config file, annotation file, and database file, are being read from paths, not channels
    ch_ncbimeta_yaml_update = Channel.fromPath(params.ncbimeta_update, checkIfExists: true)
                                     .ifEmpty { exit 1, "NCBImeta config file not found: ${params.ncbimeta_update}" }
    ch_ncbimeta_annot_update = Channel.fromPath(params.ncbimeta_annot, checkIfExists: true)
                                      .ifEmpty { exit 1, "NCBImeta annotation file not found: ${params.ncbimeta_annot}" }
    ch_ncbimeta_sqlite_update = Channel.fromPath("${params.ncbimeta_sqlite_db_latest}", checkIfExists: true)
                                       .ifEmpty { exit 1, "NCBImeta SQLite database not found: ${params.ncbimeta_sqlite_db_latest}" }

    // IO and conditional behavior
    input:
    file ncbimeta_yaml from ch_ncbimeta_yaml_update
    file ncbimeta_annot from ch_ncbimeta_annot_update
    file ncbimeta_sqlite from ch_ncbimeta_sqlite_update
    output:
    // The updated database goes to its own channel for the downstream import
    // step; emitting into ch_ncbimeta_sqlite_update would reuse the channel
    // this process reads from.
    file "${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}" into ch_ncbimeta_sqlite_import
    file ncbimeta_annot
    file ncbimeta_yaml
    file "${params.ncbimeta_output_dir}/log/*.log"

    // Shell script to execute
    script:
    """
    # Make directories to mirror NCBImeta expected structure
    mkdir -p ${params.ncbimeta_output_dir}/database ${params.ncbimeta_output_dir}/log
    # Copy over input files
    cp ${ncbimeta_sqlite} ${params.ncbimeta_output_dir}/database
    cp ${params.outdir}/ncbimeta_db/update/latest/${params.ncbimeta_output_dir}/log/* ${params.ncbimeta_output_dir}/log
    # Execute NCBImeta
    NCBImeta.py --config ${ncbimeta_yaml}
    NCBImetaAnnotateReplace.py --table ${params.ncbimeta_annot_table} --annot ${ncbimeta_annot} --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db}
    NCBImetaJoin.py --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} --anchor ${params.ncbimeta_join_first_anchor} --accessory ${params.ncbimeta_join_first_accessory} --final ${params.ncbimeta_join_first_final} --unique ${params.ncbimeta_join_first_uniq}
    NCBImetaJoin.py --database ${params.ncbimeta_output_dir}/database/${params.ncbimeta_sqlite_db} --anchor ${params.ncbimeta_join_second_anchor} --accessory ${params.ncbimeta_join_second_accessory} --final ${params.ncbimeta_join_second_final} --unique ${params.ncbimeta_join_second_uniq}
    """
  }

}

// -------------------------------------------------------------------------- //
// Reference Genome Processing //
// -------------------------------------------------------------------------- //

// ----------------------------------Download---------------------------------//

process reference_download{
/*
Stage the reference genome from the location in params.reference_genome_ftp and decompress it.

Input:
reference_genome_fna (fasta.gz): The gzipped reference genome fasta, staged from params.reference_genome_ftp.

Output:
The single decompressed fasta (the input name without its final extension) is sent to all three channels:
ch_reference_genome_snippy_pairwise (fasta): The reference genome for snippy_pairwise process.
ch_reference_detect_repeats (fasta): The reference genome for detect_repeats process.
ch_reference_genome_low_complexity (fasta): The reference genome for detect_low_complexity process.

Publish:
reference_genome/${reference_genome_fna.baseName} (fasta): The reference genome.
*/

// Other variables and config
tag "$reference_genome_fna"
echo true
publishDir "${params.outdir}/reference_genome", mode: 'copy'

// IO and conditional behavior
input:
file reference_genome_fna from file(params.reference_genome_ftp)
output:
// baseName drops the final extension, so this matches the file gunzip leaves behind
file "${reference_genome_fna.baseName}" into ch_reference_genome_snippy_pairwise, ch_reference_detect_repeats, ch_reference_genome_low_complexity
when:
// Process is skipped entirely when the skip flag is set
!params.skip_reference_download

// Shell script to execute
script:
"""
gunzip -f ${reference_genome_fna}
"""
}
if (!params.skip_reference_download){

// -----------------------------Detect Repeats--------------------------------//
process reference_download{
/*
Download the reference genome of interest from the FTP site.
Input:
reference_genome_ftp (fasta): The reference genome fasta accessed by url via FTP.
process reference_detect_repeats{
/*
Detect in-exact repeats in reference genome using the program mummer.
Convert the identified regions file to a bed format.
Input:
reference_genome_fna (fasta): The reference genome fasta from the process reference_download.
Output:
ch_bed_ref_detect_repeats (bed): A bed file containing regions of in-exact repeats.
Publish:
${reference_genome_fna.baseName}.inexact.coords (coords): Alignment coordinate file generated by mummer.
${reference_genome_fna.baseName}.inexact.repeats (coords): Filtered file for sequence similarity and self-alignments
${reference_genome_fna.baseName}.inexact.repeats.bed: Bed file created from filtered coordinates and adjusted for 0-base system.
*/
// Other variables and config
tag "$reference_genome_fna"
publishDir "${params.outdir}/snippy_filtering", mode: 'copy'
echo true

// IO and conditional behavior
input:
file reference_genome_fna from ch_reference_detect_repeats
output:
file "${reference_genome_fna.baseName}.inexact.repeats.bed" into ch_bed_ref_detect_repeats
file "${reference_genome_fna.baseName}.inexact.repeats"
file "${reference_genome_fna.baseName}.inexact.coords"
when:
!params.skip_reference_detect_repeats

// Shell script to execute
script:
"""
PREFIX=${reference_genome_fna.baseName}
# Align reference to itself to find inexact repeats
nucmer --maxmatch --nosimplify --prefix=\${PREFIX}.inexact ${reference_genome_fna} ${reference_genome_fna}
# Convert the delta file to a simplified, tab-delimited coordinate file
show-coords -r -c -l -T \${PREFIX}.inexact.delta | tail -n+5 > \${PREFIX}.inexact.coords
# Remove all "repeats" that are simply each reference aligned to itself
# also retain only repeats with more than 90% sequence similarity.
awk -F "\t" '{if (\$1 == \$3 && \$2 == \$4 && \$12 == \$13)
{next;}
else if (\$7 > 90)
{print \$0}}' \${PREFIX}.inexact.coords > \${PREFIX}.inexact.repeats
# Convert to bed file format, changing to 0-base position coordinates
awk -F "\t" '{print \$12 "\t" \$1-1 "\t" \$2-1;
if (\$3 > \$4){tmp=\$4; \$4=\$3; \$3=tmp;}
print \$13 "\t" \$3-1 "\t" \$4-1;}' \${PREFIX}.inexact.repeats | \
sort -k1,1 -k2,2n | \
bedtools merge > \${PREFIX}.inexact.repeats.bed
"""
Output:
ch_reference_genome_snippy_pairwise (fasta.gz): The compressed reference genome for snippy_pairwise process.
ch_reference_detect_repeats (fasta): The reference genome for detect_repeats process.
ch_reference_genome_detect_low_complexity (fasta): The reference genome for detect_low_complexity process.
Publish:
reference_genome/${reference_genome_fna.baseName} (fasta): The reference genome.
*/

// Other variables and config
tag "$reference_genome_fna"
echo true
publishDir "${params.outdir}/reference_genome", mode: 'copy'

// IO and conditional behavior
input:
file reference_genome_fna from file(params.reference_genome_ftp)
output:
file "${reference_genome_fna.baseName}" into ch_reference_genome_snippy_pairwise, ch_reference_genome_detect_repeats, ch_reference_genome_low_complexity

// Shell script to execute
script:
"""
gunzip -f ${reference_genome_fna}
"""
}

}
// -----------------------------Detect Repeats--------------------------------//

// The process is only defined when the skip flag is not set (if-guard rather than a 'when' directive)
if (!params.skip_reference_detect_repeats){

process reference_detect_repeats{
/*
Detect in-exact repeats in reference genome using the program mummer.
Convert the identified regions file to a bed format.

Input:
ch_reference_genome_detect_repeats (fasta): The reference genome fasta from the process reference_download.

Output:
ch_bed_ref_detect_repeats (bed): A bed file containing regions of in-exact repeats.

Publish:
${reference_genome_fna.baseName}.inexact.coords (coords): Alignment coordinate file generated by mummer.
${reference_genome_fna.baseName}.inexact.repeats (coords): Filtered file for sequence similarity and self-alignments
${reference_genome_fna.baseName}.inexact.repeats.bed: Bed file created from filtered coordinates and adjusted for 0-base system.
*/
// Other variables and config
tag "$reference_genome_fna"
publishDir "${params.outdir}/snippy_filtering", mode: 'copy'
echo true

// IO and conditional behavior
input:
file reference_genome_fna from ch_reference_genome_detect_repeats
output:
// Only the bed file is sent to a channel; the other two outputs are published only
file "${reference_genome_fna.baseName}.inexact.repeats.bed" into ch_bed_ref_detect_repeats
file "${reference_genome_fna.baseName}.inexact.repeats"
file "${reference_genome_fna.baseName}.inexact.coords"

// Shell script to execute
// Escaped dollars (\$) are left for the shell/awk; unescaped ${...} are filled in by Nextflow.
// Steps: self-align with nucmer -> tabular coords -> filter -> bed (sorted and merged).
// The awk filter drops rows where fields 1/3, 2/4 and 12/13 are all equal, and keeps rows with field 7 > 90.
// NOTE(review): fields 12/13 are presumably the reference/query sequence tags and field 7 the percent
// identity, given the show-coords flags -r -c -l -T - confirm against the MUMmer documentation.
// NOTE(review): both start and end are decremented by 1 in the bed conversion; bed end coordinates are
// conventionally exclusive, so this may drop the last base of each interval - confirm intent.
script:
"""
PREFIX=${reference_genome_fna.baseName}
# Align reference to itself to find inexact repeats
nucmer --maxmatch --nosimplify --prefix=\${PREFIX}.inexact ${reference_genome_fna} ${reference_genome_fna}
# Convert the delta file to a simplified, tab-delimited coordinate file
show-coords -r -c -l -T \${PREFIX}.inexact.delta | tail -n+5 > \${PREFIX}.inexact.coords
# Remove all "repeats" that are simply each reference aligned to itself
# also retain only repeats with more than 90% sequence similarity.
awk -F "\t" '{if (\$1 == \$3 && \$2 == \$4 && \$12 == \$13)
{next;}
else if (\$7 > 90)
{print \$0}}' \${PREFIX}.inexact.coords > \${PREFIX}.inexact.repeats
# Convert to bed file format, changing to 0-base position coordinates
awk -F "\t" '{print \$12 "\t" \$1-1 "\t" \$2-1;
if (\$3 > \$4){tmp=\$4; \$4=\$3; \$3=tmp;}
print \$13 "\t" \$3-1 "\t" \$4-1;}' \${PREFIX}.inexact.repeats | \
sort -k1,1 -k2,2n | \
bedtools merge > \${PREFIX}.inexact.repeats.bed
"""
}

}
// -------------------------Detect Low Complexity-----------------------------//

process reference_detect_low_complexity{
/*
Detect low complexity regions with dustmasker.
Convert the identified regions file to a bed format.
Input:
reference_genome_fna (fasta): The reference genome fasta from the process reference_download.
Output:
ch_bed_ref_low_complexity (bed): A bed file containing regions of low-complexity regions.
Publish:
${reference_genome_fna.baseName}.dustmasker.intervals (intervals) Interval file containing regions of low-complexity.
${reference_genome_fna.baseName}.dustmasker.bed (bed) Bed file created from intervals and adjusted for 0-base system.
*/
// Other variables and config
tag "$reference_genome_fna"
publishDir "${params.outdir}/snippy_filtering", mode: 'copy'
echo true

// IO and conditional behavior
input:
file reference_genome_fna from ch_reference_genome_low_complexity
output:
file "${reference_genome_fna.baseName}.dustmasker.intervals"
file "${reference_genome_fna.baseName}.dustmasker.bed" into ch_bed_ref_low_complex
when:
!params.skip_reference_detect_low_complexity

// Shell script to execute
script:
"""
dustmasker -in ${reference_genome_fna} -outfmt interval > ${reference_genome_fna.baseName}.dustmasker.intervals
${params.scriptdir}/intervals2bed.sh ${reference_genome_fna.baseName}.dustmasker.intervals ${reference_genome_fna.baseName}.dustmasker.bed
"""
if (!params.skip_reference_detect_low_complexity){

  process reference_detect_low_complexity{
    /*
    Detect low complexity regions with dustmasker.
    Convert the identified regions file to a bed format.

    Input:
    ch_reference_genome_low_complexity (fasta): The reference genome fasta from the process reference_download.

    Output:
    ch_bed_ref_low_complex (bed): A bed file containing regions of low-complexity regions.

    Publish:
    ${reference_genome_fna.baseName}.dustmasker.intervals (intervals): Interval file containing regions of low-complexity.
    ${reference_genome_fna.baseName}.dustmasker.bed (bed): Bed file created from intervals and adjusted for 0-base system.
    */
    // Other variables and config
    tag "$reference_genome_fna"
    publishDir "${params.outdir}/snippy_filtering", mode: 'copy'
    echo true

    // IO and conditional behavior
    // Skipping is handled by the enclosing if statement, so no 'when'
    // directive is needed here (it would only repeat the same condition).
    input:
    file reference_genome_fna from ch_reference_genome_low_complexity
    output:
    file "${reference_genome_fna.baseName}.dustmasker.intervals"
    file "${reference_genome_fna.baseName}.dustmasker.bed" into ch_bed_ref_low_complex

    // Shell script to execute
    // The interval-to-bed conversion is done by a helper script under params.scriptdir.
    script:
    """
    dustmasker -in ${reference_genome_fna} -outfmt interval > ${reference_genome_fna.baseName}.dustmasker.intervals
    ${params.scriptdir}/intervals2bed.sh ${reference_genome_fna.baseName}.dustmasker.intervals ${reference_genome_fna.baseName}.dustmasker.bed
    """
  }

}

0 comments on commit a6dcf38

Please sign in to comment.