From 7d368fe12d7cfd73d4b1218fe9f63aa6df8b80f7 Mon Sep 17 00:00:00 2001
From: DLBPointon
Date: Wed, 12 Feb 2025 13:24:34 +0000
Subject: [PATCH] Removed the need for ASCC to rely on mapping. This
 technically means mapping happens twice, but the mapping BTK performs for
 itself is better suited to BTK; the main ASCC mapping output contains too
 much data.

---
 bin/generate_samplesheet.py           | 20 ++++++++++++--------
 modules/local/generate_samplesheet.nf |  5 +++--
 modules/local/sanger_tol_btk.nf       |  2 ++
 workflows/ascc_genomic.nf             | 18 +++++++++---------
 workflows/ascc_organellar.nf          |  2 +-
 5 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/bin/generate_samplesheet.py b/bin/generate_samplesheet.py
index cb7b5ec6..a71303d3 100755
--- a/bin/generate_samplesheet.py
+++ b/bin/generate_samplesheet.py
@@ -17,11 +17,12 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Generate a csv file for BTK")
     parser.add_argument("sample_name", type=str, help="Name of sample")
     parser.add_argument(
-        "mapped_bam_file",
+        "path_to_reads",
         type=str,
-        help="Path containing the mapped BAM generated with PacBio data and the ASCC input assembly",
+        help="Path to the directory containing the PacBio reads",
     )
-    parser.add_argument("-v", "--version", action="version", version="1.0.0")
+    parser.add_argument("-v", "--version", action="version", version="1.1.0")
+
     return parser.parse_args()
 
 
@@ -31,12 +32,15 @@ def main():
     data_list = []
     data_list.append("sample,datatype,datafile\n")
 
-    if args.mapped_bam_file.endswith(".bam"):
-        data_list.append(f"{args.sample_name},pacbio,{args.mapped_bam_file}\n")
-    else:
-        sys.exit("I was expecting a mapped BAM file")
-    with open(f"{args.sample_name}_samplesheet.csv", "w") as file:
+    for read_file in os.listdir(args.path_to_reads):
+        if read_file.endswith((".fasta.gz", ".fa.gz")):
+            data_list.append(f"{args.sample_name},pacbio,{os.path.join(args.path_to_reads, read_file)}\n")
+
+    if len(data_list) <= 1:
+        sys.exit("I was expecting at least one .fasta.gz or .fa.gz file")
+
+    with open("samplesheet.csv", "w") as file:
         file.write("".join(data_list))
 
 
 if __name__ == "__main__":
diff --git a/modules/local/generate_samplesheet.nf b/modules/local/generate_samplesheet.nf
index 3d13c5d2..97a7dc23 100644
--- a/modules/local/generate_samplesheet.nf
+++ b/modules/local/generate_samplesheet.nf
@@ -8,7 +8,8 @@ process GENERATE_SAMPLESHEET {
         'biocontainers/python:3.9' }"
 
     input:
-    tuple val(meta), path(pacbio_path)
+    tuple val(meta), path(reference)
+    path( "input_pacbio_files/*" )
     path(alarm_file)
 
     output:
@@ -21,7 +22,7 @@ process GENERATE_SAMPLESHEET {
     """
     generate_samplesheet.py \\
         $prefix \\
-        "\$(realpath $pacbio_path)"
+        input_pacbio_files/
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/sanger_tol_btk.nf b/modules/local/sanger_tol_btk.nf
index 0de3a464..41a01a2e 100644
--- a/modules/local/sanger_tol_btk.nf
+++ b/modules/local/sanger_tol_btk.nf
@@ -9,6 +9,7 @@ process SANGER_TOL_BTK {
     path blastn
     path blastx
     path tax_dump
+    path( "input_pacbio_files/*" )
    val busco_lineages_folder
    val busco_lineages
    val taxon
@@ -53,6 +54,7 @@ process SANGER_TOL_BTK {
         --blastn "\$(realpath $blastn)" \\
         --blastx "\$(realpath $blastx)" \\
         --use_work_dir_as_temp true \\
+        --align \\
         $args
 
     mv ${prefix}_btk_out/pipeline_info blobtoolkit_pipeline_info
diff --git a/workflows/ascc_genomic.nf b/workflows/ascc_genomic.nf
index eddda21b..2ebcda5e 100644
--- a/workflows/ascc_genomic.nf
+++ b/workflows/ascc_genomic.nf
@@ -323,8 +323,7 @@ workflow ASCC_GENOMIC {
     ) {
         PACBIO_BARCODE_CHECK (
             reference_tuple_from_GG,
-            params.reads_path,             // TODO: TEAM WANT TO BE ABLE TO SPECIFY PACBIO FILES
-                                           // MAY NEED A PROCESS TO PULL THEM INTO A SINGLE FOLDER BEFORE PROCESING
+            params.reads_path,
             params.reads_type,
             params.pacbio_barcode_file,
             params.pacbio_barcode_names
@@ -675,7 +674,8 @@ workflow ASCC_GENOMIC {
         //  USE IN THE BTK PIPELINE
         //
         GENERATE_SAMPLESHEET (
-            RUN_READ_COVERAGE.out.bam_ch,
+            reference_tuple_from_GG,
+            params.reads_path,
             AUTOFILTER_AND_CHECK_ASSEMBLY.out.alarm_file
         )
         ch_versions = ch_versions.mix(GENERATE_SAMPLESHEET.out.versions)
@@ -715,6 +715,7 @@ workflow ASCC_GENOMIC {
             params.nt_database_path,
             params.diamond_uniprot_database_path,
             params.ncbi_taxonomy_path,
+            params.reads_path,
             params.busco_lineages_folder,
             params.busco_lineages,
             params.taxid,
@@ -836,7 +837,7 @@ workflow ASCC_GENOMIC {
             .map { id, data ->
                 [id: id, data: data]
             }
-            .set {number_1}
+            .set {ascc_merged_data}
 
         def processes = [
             'GC_COV', 'Coverage', 'TIARA',
@@ -845,7 +846,7 @@ workflow ASCC_GENOMIC {
         ]
 
         def processChannels = processes.collectEntries { process ->
-            [(process): number_1
+            [(process): ascc_merged_data
                 .map { sample ->
                     def data = sample.data.find { it.meta.process == process }
                     data ? [sample.id, data.meta, data.file] : [sample.id, [process: process], []]
@@ -853,18 +854,17 @@ workflow ASCC_GENOMIC {
             ]
         }
 
-        def combined_channel_1 = processChannels['GC_COV']
+        def ascc_combined_channels = processChannels['GC_COV']
         processes.tail().each { process ->
-            combined_channel_1 = combined_channel_1
+            ascc_combined_channels = ascc_combined_channels
                 .combine(processChannels[process], by: 0)
         }
-        combined_channel_1.view()
 
         //
         // SUBWORKFLOW: MERGES DATA THAT IS NOT USED IN THE CREATION OF THE BTK_DATASETS FOLDER
         //
         ASCC_MERGE_TABLES (
-            combined_channel_1.map { it[1..-1] } // Remove the first item in tuple (mapping key)
+            ascc_combined_channels.map { it[1..-1] } // Remove the first item in tuple (mapping key)
         )
         ch_versions = ch_versions.mix(ASCC_MERGE_TABLES.out.versions)
     }
diff --git a/workflows/ascc_organellar.nf b/workflows/ascc_organellar.nf
index 11068365..5d88cc2d 100644
--- a/workflows/ascc_organellar.nf
+++ b/workflows/ascc_organellar.nf
@@ -121,7 +121,7 @@ workflow ASCC_ORGANELLAR {
     if ( (include_workflow_steps.contains('pacbio_barcodes') || include_workflow_steps.contains('ALL')) && !exclude_workflow_steps.contains("pacbio_barcodes") ) {
         PACBIO_BARCODE_CHECK (
             ESSENTIAL_JOBS.out.reference_tuple_from_GG,
-            params.reads_path, // TODO: COME ON MAN
+            params.reads_path,
             params.reads_type,
             params.pacbio_barcode_file,
             params.pacbio_barcode_names
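
Note, not part of the patch: the revised bin/generate_samplesheet.py relies on os.listdir, so the script is assumed to import os alongside sys. For reference, below is a minimal self-contained sketch of the behaviour the patch introduces, assuming the reads have been staged into input_pacbio_files/ as GENERATE_SAMPLESHEET and SANGER_TOL_BTK now do; the sample name, file names, and invocation are hypothetical.

#!/usr/bin/env python3
# Minimal sketch of the revised samplesheet generation: list gzipped FASTA
# reads in a directory and write a BTK samplesheet, instead of expecting a
# mapped BAM as before.
import os
import sys


def write_samplesheet(sample_name, path_to_reads):
    rows = ["sample,datatype,datafile\n"]
    for read_file in os.listdir(path_to_reads):
        # Only gzipped FASTA reads are accepted.
        if read_file.endswith((".fasta.gz", ".fa.gz")):
            rows.append(f"{sample_name},pacbio,{os.path.join(path_to_reads, read_file)}\n")

    # A header-only list means no reads were found, so fail loudly.
    if len(rows) <= 1:
        sys.exit("I was expecting at least one .fasta.gz or .fa.gz file")

    with open("samplesheet.csv", "w") as handle:
        handle.write("".join(rows))


if __name__ == "__main__":
    # Hypothetical invocation mirroring the process call:
    #   generate_samplesheet.py <prefix> input_pacbio_files/
    write_samplesheet("sampleA", "input_pacbio_files/")

For a directory holding, say, sampleA_hifi.fa.gz, this writes samplesheet.csv with the single data row sampleA,pacbio,input_pacbio_files/sampleA_hifi.fa.gz. sanger-tol/blobtoolkit then maps those reads itself, which is why SANGER_TOL_BTK now passes --align rather than reusing the BAM from RUN_READ_COVERAGE.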