Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updated tests running average coverage calculations in cloud executors #102

Merged
merged 6 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions modules/local/report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,13 @@ def generate_coverage_data(sample_data, bp_field, species){
&& species[species_data_pos].fixed_genome_size != null){

def length = species[species_data_pos].fixed_genome_size.toLong()
def cov = base_pairs / q_length
def cov = base_pairs / length
apetkau marked this conversation as resolved.
Show resolved Hide resolved

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking to make sure I understand the difference between q_length and length:

  • is q_length a dynamic value specific to each sample entry in sample_data?
  • is length a fixed value of the species genome size, as in a standard reference for each species.?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is correct! however, I think I have something mixed up. Just reading through the code again to make sure I am reporting the correct thing.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q_length or query length here is the genome length generated by quast in this case or whatever path is pointed to in the QCReportFields of the nextflow.config while the length field is the length specified in the QCReport section for a specific organism.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fantastic! Thanks for looking into this!

entry.value[params.coverage_calc_fields.fixed_cov] = cov.round(2)

}

}
}

return sample_data
apetkau marked this conversation as resolved.
Show resolved Hide resolved
}


Expand Down Expand Up @@ -439,9 +438,9 @@ def convert_type(type, val){
}

def recurse_keys(value, keys_rk){
// TODO add in generic return for if a value is not found
def temp = traverse_values(value, keys_rk.path)
def value_found = true

if(temp == null){
value_found = false;
}
Expand All @@ -455,21 +454,19 @@ def recurse_keys(value, keys_rk){
}

def traverse_values(value, path){

def temp = value
def value_found = true
def ret_val = null

if(path instanceof String){
temp = temp[path]
return temp
}

for(key in path){
def key_val
if(key.isNumber()){
key_val = key.toInteger()
}else{
key_val = key
def key_val = key
if(key_val.isNumber()){
key_val = key_val.toInteger()
apetkau marked this conversation as resolved.
Show resolved Hide resolved
}

if(temp.containsKey(key_val)){
Expand Down Expand Up @@ -674,7 +671,8 @@ def generate_qc_data(data, search_phrases, qc_species_tag){
for(k in data){
if(!k.value.meta.metagenomic){
def species = get_species(k.value[k.key][top_hit_tag], search_phrases, shortest_token)
generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species) // update coverage first so its values can be used in generating qc messages
// update coverage first so its values can be used in generating qc messages
generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species)
data[k.key][quality_analysis] = get_qc_data_species(k.value[k.key], species)
data[k.key][qc_species_tag] = species[species_tag_location]
}else{
Expand Down
13 changes: 9 additions & 4 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -885,9 +885,15 @@ params {
low_msg = "Base quality is poor, resequencing is recommended."
}
average_coverage {
//path = ["${params.quast.report_tag}", "0", "Avg. coverage depth"] // uncomment to use QUASTS coverage
//path = ["${params.coverage_calc_fields.auto_cov}"] // uncomment to use quasts base_pairs/genome length for coverage
path = ["${params.coverage_calc_fields.fixed_cov}"]
/*
* Example path below can be used for qaust coverage
path = [params.quast.report_tag, "0", "Avg. coverage depth"]
* Use the path below to use quasts base_pairs/genome length for coverage
path = [params.coverage_calc_fields.auto_cov]

*/

path = [params.coverage_calc_fields.fixed_cov]
coerce_type = 'Float'
compare_fields = ['min_average_coverage']
comp_type = 'ge'
Expand Down Expand Up @@ -967,7 +973,6 @@ profiles {
}
docker {
docker.enabled = true
docker.userEmulation = true
singularity.enabled = false
podman.enabled = false
shifter.enabled = false
Expand Down
169 changes: 167 additions & 2 deletions tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,171 @@ nextflow_pipeline {

skip_allele_calling = true

QCReport {
fallthrough {
apetkau marked this conversation as resolved.
Show resolved Hide resolved
search = "No organism specific QC data available."
raw_average_quality = 30
min_n50 = null
max_n50 = null
min_nr_contigs = null
max_nr_contigs = null
fixed_genome_size = 1000
min_length = null
max_length = null
max_checkm_contamination = 3.0
min_average_coverage = 30
}
}


skip_bakta = true
skip_staramr = false
skip_mobrecon = false
skip_checkm = false
skip_raw_read_metrics = false
skip_polishing = false

max_memory = "2.GB"
max_cpus = 1
}
}

then {

assert workflow.success
assert path("$launchDir/results").exists()

// parse output json file
def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json

assert json.short.short.RawReadSummary.R1."total_bp".equals(118750)
assert json.short.short.RawReadSummary.R1."total_reads".equals(475)
assert json.short.short.RawReadSummary.R1."read_qual_mean".equals(40.0)
assert json.short.short.RawReadSummary.R1."mean_sequence_length".equals(250.0)

assert json.short.short.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)")
assert json.short.short.FastP.summary.before_filtering.total_reads.equals(950)
assert json.short.short.FastP.filtering_result.passed_filter_reads.equals(950)
assert json.short.short.FastP.filtering_result.low_quality_reads.equals(0)
assert json.short.short.FastP.insert_size.peak.equals(347)

//assert json.short.meta.metagenomic.equals(false) // Currently, this is "null".
assert json.short.meta.assembly.equals(false)
assert json.short.meta.hybrid.equals(false)
assert json.short.meta.single_end.equals(false)
assert json.short.meta.merge.equals(false)
assert json.short.meta.downsampled.equals(false)

assert json.short.short.AssemblyCompleted.equals(true)
assert json.short.short.QUAST."0"."Total length (>= 0 bp)".equals("4949")
assert json.short.short.QUAST."0"."Largest contig".equals("4949")
assert json.short.short.QUAST."0"."GC (%)".equals("52.96")
assert json.short.short.QUAST."0"."Avg. coverage depth".equals("47")

// Below two values should be empty
assert json.short.short.StarAMR."0"."Genotype".equals("None")
assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible")
assert json.short.short.StarAMR."0"."Genome Length".equals("4949")

assert json.short.short.CheckM."0"."# genomes".equals("5656")
assert json.short.short.CheckM."0"."# markers".equals("56")
assert json.short.short.CheckM."0"."# marker sets".equals("24")
assert json.short.short.CheckM."0".Contamination.equals("0.00")

assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz")

assert json.short.short.Abricate."0".RESISTANCE.equals("NoData") // All Abricate results for this are "NoData".

def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz"
assert path(assembly_path).exists()

// parse assembly file
def assembly_header = path(assembly_path).linesGzip[0]
assert assembly_header.startsWith(">NODE_1_length_4949_cov_23.917254") // _pilon_pilon_pilon gets appended

// compare IRIDA Next JSON output
def iridanext_json = path("$launchDir/results/iridanext.output.json").json
def iridanext_global = iridanext_json.files.global
def iridanext_samples = iridanext_json.files.samples
def iridanext_metadata = iridanext_json.metadata.samples

// output files
assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1
assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1

// output metadata
assert iridanext_metadata.short."QC Status" == "PASSED"
assert iridanext_metadata.short."Checkm Status" == "PASSED"
assert iridanext_metadata.short."Checkm Value" == 0.0
assert iridanext_metadata.short."Average Coverage Status" == "PASSED"
assert iridanext_metadata.short."Average Coverage Value" == 237.5
assert iridanext_metadata.short."n50 Status" == "WARNING"
assert iridanext_metadata.short."n50 Value" == 4949
assert iridanext_metadata.short."Raw Average Quality Status" == "PASSED"
assert iridanext_metadata.short."Raw Average Quality Value" == 40.0
assert iridanext_metadata.short."Length Status" == "WARNING"
assert iridanext_metadata.short."Length Value" == 4949
assert iridanext_metadata.short."nr contigs Status" == "WARNING"
assert iridanext_metadata.short."nr contigs Value" == 1
assert iridanext_metadata.short."QC Summary" == "PASSED Species ID: No Species Identified; Passed Tests: 6/6; Organism QC Criteria: No organism specific QC data available."

assert iridanext_metadata.short."Downsampled" == false
assert iridanext_metadata.short."predicted_identification_name" == "No Species Identified"
assert iridanext_metadata.short."predicted_identification_method" == "Mash"
assert iridanext_metadata.short."GC (%)" == "52.96"
assert iridanext_metadata.short."Mean Sequence Length Forward" == 250
assert iridanext_metadata.short."BaseCount" == 237500

assert iridanext_metadata.short."StarAMR Genotype" == "None"
assert iridanext_metadata.short."StarAMR Predicted Phenotype" == "Susceptible"
}

}

test("Should assemble but fail average coverage check") {
tag "succeed_assembly_fail_coverage"

when {
params {
input = "https://raw.githubusercontent.com/phac-nml/mikrokondo/dev/tests/data/samplesheets/samplesheet-small-assembly.csv"
outdir = "results"

platform = "illumina"

mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
mh_min_kmer = 1

dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
kraken2_db = "$baseDir/tests/data/kraken2/test"

min_reads = 100

skip_allele_calling = true

QCReport {
fallthrough {
search = "No organism specific QC data available."
raw_average_quality = 30
min_n50 = null
max_n50 = null
min_nr_contigs = null
max_nr_contigs = null
fixed_genome_size = 1000000
min_length = null
max_length = null
max_checkm_contamination = 3.0
min_average_coverage = 30
}
}


skip_bakta = true
skip_staramr = false
Expand Down Expand Up @@ -208,8 +373,8 @@ nextflow_pipeline {
assert iridanext_metadata.short."QC Status" == "FAILED"
assert iridanext_metadata.short."Checkm Status" == "PASSED"
assert iridanext_metadata.short."Checkm Value" == 0.0
assert !iridanext_metadata.short.containsKey("Average Coverage Status")
assert !iridanext_metadata.short.containsKey("Average Coverage Value")
assert iridanext_metadata.short."Average Coverage Status" == "FAILED"
assert iridanext_metadata.short."Average Coverage Value" == 0.24
assert iridanext_metadata.short."n50 Status" == "WARNING"
assert iridanext_metadata.short."n50 Value" == 4949
assert iridanext_metadata.short."Raw Average Quality Status" == "PASSED"
Expand Down
Loading