diff --git a/.github/workflows/ci_nextflow.yml b/.github/workflows/ci_nextflow.yml
new file mode 100644
index 0000000..d06a9e3
--- /dev/null
+++ b/.github/workflows/ci_nextflow.yml
@@ -0,0 +1,57 @@
+name: CI - Nextflow test
+# This workflow is triggered on pushes and PRs to the repository.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        nxf_ver: ['21.04.1', '']
+    steps:
+      - uses: actions/checkout@v2.3.4
+      - name: Install Nextflow
+        run: |
+          export NXF_VER=${{ matrix.nxf_ver }}
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+      - name: Basic workflow tests
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile docker --fastq "test/data/tiny_{1,2}.*" \
+            --reference "test/data/tiny_reference.fasta" --max_cpus 2 --max_memory 4.GB \
+            --abyss false --gatb_minia false --idba false --metahipmer2 false \
+            --minia false --megahit false --metaspades false --spades false --unicycler false \
+            --velvetoptimiser false
+          FILE=report/index.html
+          if [ ! -f "$FILE" ]; then echo "Run failed!"; exit 1; else echo "Run successful!"; fi
+
+  singularity:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        singularity_version: ['3.6.4']
+        nxf_ver: ['21.04.1', '']
+    steps:
+      - uses: actions/checkout@v1
+      - uses: eWaterCycle/setup-singularity@v6
+        with:
+          singularity-version: ${{ matrix.singularity_version }}
+      - name: Install Nextflow
+        run: |
+          export NXF_VER=${{ matrix.nxf_ver }}
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+      - name: Basic workflow tests
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile singularity --fastq "test/data/tiny_{1,2}.*" \
+            --reference "test/data/tiny_reference.fasta" --max_cpus 2 --max_memory 4.GB \
+            --abyss false --gatb_minia false --idba false --metahipmer2 false \
+            --minia false --megahit false --metaspades false --spades false --unicycler false \
+            --velvetoptimiser false
+          FILE=report/index.html
+          if [ ! -f "$FILE" ]; then echo "Run failed!"; exit 1; else echo "Run successful!"; fi
\ No newline at end of file
diff --git a/.github/workflows/ci_templates.yml b/.github/workflows/ci_templates.yml
new file mode 100644
index 0000000..91c8c8c
--- /dev/null
+++ b/.github/workflows/ci_templates.yml
@@ -0,0 +1,30 @@
+name: CI - Python templates test
+
+on:
+  # Triggers the workflow on push or pull request events but only for the main branch
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+  workflow_dispatch:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r docker/LMAS/requirements.txt
+          pip install pytest
+
+      - name: run pytest
+        run: pytest
+
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
deleted file mode 100644
index 81d7b9b..0000000
--- a/.github/workflows/main.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# This is a basic workflow to help you get started with Actions
-
-name: CI - Python templates
-
-# Controls when the action will run.
-on:
-  # Triggers the workflow on push or pull request events but only for the main branch
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
-jobs:
-  # This workflow contains a single job called "build"
-  build:
-    # The type of runner that the job will run on
-    runs-on: ubuntu-latest
-
-    # Steps represent a sequence of tasks that will be executed as part of the job
-    steps:
-      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v2
-        with:
-          python-version: 3.8
-
-      - name: install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r docker/LMAS/requirements.txt
-          pip install pytest
-
-      # Runs a single command using the runners shell
-      - name: run pytest
-        run: pytest
-
diff --git a/.gitignore b/.gitignore
index 39edecd..deb5beb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,6 @@ test/misassembly_test.json
 test/misassembly_test_all.json
 docs/resources/.DS_Store
 .DS_Store
+results/
+report/
+!modules/
\ No newline at end of file
diff --git a/LMAS b/LMAS
new file mode 100755
index 0000000..1a1fd27
--- /dev/null
+++ b/LMAS
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# This is a wrapper around LMAS-nf.
+# By default `LMAS` will attempt to execute the main Nextflow pipeline.
+# If no user input is provided, it prints the help message.
+
+# If no user input, print usage
+if [[ $# == 0 ]]; then
+    nextflow run main.nf --help
+    exit
+fi
+
+# Run the pipeline, quoting "$@" so glob patterns and spaced arguments pass through unchanged
+nextflow run main.nf "$@"
diff --git a/LMAS.nf b/LMAS.nf
deleted file mode 100644
index fcbd543..0000000
--- a/LMAS.nf
+++ /dev/null
@@ -1,1046 +0,0 @@
-#!/usr/bin/env nextflow
-
-import Helper
-import CollectInitialMetadata
-
-// Pipeline version
-if (workflow.commitId){
-    version = "0.2 $workflow.revision"
-} else {
-    version = "0.2 (local version)"
-}
-
-// Help message
-params.help = false
-if (params.help){
-    Help.print_help(params)
-    exit 0
-}
-
-// Check parameters
-// Main input
-if (!params.reference) { exit 1, "ERROR: '--reference' parameter missing" }
-if (!params.fastq) { exit 1, "ERROR: '--fastq' parameter missing" }
-if (params.reference instanceof Boolean) {
-    exit 1, "ERROR: '--reference' must be a path pattern. Provided value: '$params.reference'"
-}
-if (params.fastq instanceof Boolean) {
-    exit 1, "ERROR: '--fastq' must be a path pattern. Provided value:'$params.fastq'"
-}
-
-// Assemblers
-if (!params.abyss && !params.bcalm && !params.gatb_minia && !params.idba && !params.metahipmer2 && !params.minia && !params.megahit && !params.metaspades && !params.spades && !params.skesa && !params.unicycler && !params.velvetoptimiser){
-    exit 1, 'ERROR: All assembly processes set to false. Exiting.'}
-if ( !params.abyssKmerSize.toString().isNumber() ) {
-    exit 1, "ERROR: '--bcalmKmerSize' parameter must be a number. Provided value: '${params.abyssKmerSize}'"
-}
-if ( !params.bcalmKmerSize.toString().isNumber() ) {
-    exit 1, "ERROR: '--bcalmKmerSize' parameter must be a number. Provided value: '${params.bcalmKmerSize}'"
-}
-if ( !params.gatb_besst_iter.toString().isNumber() ) {
-    exit 1, "ERROR: '--gatb_besst_iter' parameter must be a number.
Provided value: '${params.gatb_besst_iter}'" -} -if ( params.metaspadesKmerSize.toString().split(" ").size() <= 1 ) { - if (params.metaspadesKmerSize.toString() != 'auto') { - exit 1, "ERROR: '--metaspadesKmerSize' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.metaspadesKmerSize}" - } -} -if ( params.spadesKmerSize.toString().split(" ").size() <= 1 ){ - if (params.spadesKmerSize.toString() != 'auto'){ - exit 1, "ERROR: '--spadesKmerSize' parameter must be a sequence of space separated numbers or 'auto'. Provided value: ${params.spadesKmerSize}" - } -} -if ( !params.minLength.toString().isNumber() ) { - exit 1, "ERROR: '--minLength' parameter must be a number. Provided value: '${params.minLength}'" -} - - -// QA Options -def plot_mode_expected = ['linear', 'log'] as Set -def plot_parameter_diff = plot_mode_expected - params.plot_scale - -// Metadata collection for start message -def infoMap = [:] -if (params.containsKey('fastq')) { - infoMap.put('fastq', file(params.fastq).size()) -} -if (params.containsKey('reference')) { - if (file(params.reference) instanceof LinkedList) { - infoMap.put('reference', file(params.reference).size()) - } else { - infoMap.put('fasta', 1) - } -} - -Help.start_info(infoMap, "$workflow.start", "$workflow.profile", version) -CollectInitialMetadata.print_metadata(workflow) - -/* -Workflow Start! -*/ - -// MAIN PARAMETERS -// FastQ -// size: -1 -> allows for single and paired-end files to be passed through. Change if necessary -IN_fastq_raw = Channel.fromFilePairs(params.fastq, size: -1).ifEmpty { - exit 1, "No fastq files provided with pattern:'${params.fastq}'" } - -// Reference -IN_reference_raw = Channel.fromPath(params.reference).ifEmpty { - exit 1, "No reference fasta file provided with pattern:'${params.reference}'" } -IN_reference_raw.into { TO_TRIPLE; TO_REPORT } - -// Optional parameters -if (plot_parameter_diff.size() > 1){ - println "[Pipeline warning] Parameter --plot_scale is not valid! 
Running with default 'log'\n" - Channel.from('log').set { IN_PLOT_SCALE } -} else { - Channel.from(params.plot_scale).set { IN_PLOT_SCALE } -} -IN_PLOT_SCALE.into { IN_PLOT_SCALE_1; IN_PLOT_SCALE_2; IN_PLOT_SCALE_3 } - -// SET CHANNELS FOR ASSEMBLERS -IN_fastq_raw.into { - IN_PROCESS_READS; - IN_ABYSS; - IN_BCALM2; - IN_GATB_MINIA_PIPELINE; - IN_IDBA; - IN_METAHIPMER2; - IN_MINIA; - IN_MEGAHIT; - IN_METASPADES; - IN_UNICYCLER; - IN_SPADES; - IN_SKESA; - IN_VELVETOPTIMISER; - IN_TO_MAP } //mapping channel - minimap2 - -// TRIPLE THE REFERENCE REPLICONS -process PROCESS_REFERENCE { - input: - file reference_fasta from TO_TRIPLE - - output: - file 'triple_reference.fasta' into OUT_REFERENCE_TRIPLE - - script: - template 'process_reference.py' -} - -// SET CHANNELS FOR REFERENCE -OUT_REFERENCE_TRIPLE.into { IN_MAPPING_CONTIGS; IN_ASSEMBLY_STATS_MAPPING; IN_GAP_STATS; IN_SNP_STATS } - -process PROCESS_READS { - tag { sample_id } - - input: - tuple sample_id, file(fastq) from IN_PROCESS_READS - - output: - file '*_reads_report.json' into PROCESS_READS - - script: - template 'process_reads.py' -} - -// ASSEMBLERS -// ABYSS -process ABYSS { - tag { sample_id } - publishDir "results/$sample_id/assembly/abyss/" - - when: - params.abyss - - input: - tuple sample_id, file(fastq) from IN_ABYSS - val KmerSize from Channel.value(params.abyssKmerSize) - val BloomSize from Channel.value(params.abyssBloomSize) - - output: - tuple sample_id, val('ABySS'), file('*_ABySS.fasta') into OUT_ABYSS - file '.*version' into ABYSS_VERSION - - script: - """ - abyss-pe version | grep "ABySS" | awk -F ' ' '{print \$3}' > .${sample_id}_ABySS_version - { - abyss-pe name='${sample_id}' j=$task.cpus k=$KmerSize B=$BloomSize in='$fastq' - mv ${sample_id}-contigs.fa ${sample_id}_ABySS.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_ABySS.fasta - } - # remove temp files - rm *.dot* *.hist *.path* || true - """ -} - -// BCALM 2 -process BCALM2 { - tag { sample_id } - publishDir "results/$sample_id/assembly/bcalm2/" - - when: - params.bcalm - - input: - tuple sample_id, file(fastq) from IN_BCALM2 - val KmerSize from Channel.value(params.bcalmKmerSize) - - output: - tuple sample_id, val('BCALM2'), file('*_BCALM2.fasta') into OUT_BCALM2 - file '.*version' into BCALM2_VERSION - - script: - """ - ls -1 $fastq > list_reads - bcalm -version | head -n 1 | awk -F ', ' '{print \$2}' | awk -F ' ' '{print \$2}' | awk -F 'v' '{print \$2}' \ - > .${sample_id}_BCALM2_version - { - bcalm -in list_reads -out ${sample_id} -kmer-size $KmerSize - mv ${sample_id}.unitigs.fa ${sample_id}_BCALM2.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_BCALM2.fasta - } - # remove temp files - rm list_reads *.fa || true - """ -} - -// GATB MINIA Pipeline -GATB_error_correction = params.gatb_error_correction ? 
'true' : 'false' - -process GATBMINIAPIPELINE { - tag { sample_id } - publishDir "results/$sample_id/assembly/GATBMiniaPipeline/" - - when: - params.gatb_minia - - input: - tuple sample_id, file(fastq_pair) from IN_GATB_MINIA_PIPELINE - val kmer_list from Channel.value(params.gatbKmerSize) - val do_error_correction from GATB_error_correction - val besst_iter from Channel.value(params.gatb_besst_iter) - - output: - tuple sample_id, val('GATBMiniaPipeline'), file('*_GATBMiniaPipeline.fasta') into OUT_GATB - file '.*version' into GATB_VERSION - - script: - """ - echo '' > .${sample_id}_GATBMiniaPipeline_version - { - if [ $do_error_correction ]; - then - gatb -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} --kmer-sizes ${kmer_list} \ - -o ${sample_id}_GATBMiniaPipeline --no-scaffolding - else - gatb -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} --kmer-sizes ${kmer_list} \ - -o ${sample_id}_GATBMiniaPipeline --no-scaffolding --no-error-correction - fi - - link=\$(readlink *_final.contigs.fa) && mv \$link ${sample_id}_GATBMiniaPipeline.fasta - - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_GATBMiniaPipeline.fasta - } - # rm temp dirs - rm -r *_GATBMiniaPipeline.lib* *_GATBMiniaPipeline_besst *.unitigs* *contigs.fa *.h5 || true - rm *list_reads* || true - """ -} - -// IDBA -process reformat_IDBA { - tag { sample_id } - - when: - params.idba - - input: - tuple sample_id, file(fastq_pair) from IN_IDBA - - output: - tuple sample_id, file('*.fasta') into REFORMAT_IDBA - - script: - "reformat.sh in=${fastq_pair[0]} in2=${fastq_pair[1]} out=${sample_id}_reads.fasta" -} - -process IDBA { - tag { sample_id } - publishDir "results/$sample_id/assembly/IDBA-UD/" - - when: - params.idba - - input: - tuple sample_id, file(fasta_reads_single) from REFORMAT_IDBA - - output: - tuple sample_id, val('IDBA-UD'), file('*_IDBA-UD.fasta') into OUT_IDBA - file '.*version' into IDBA_VERSION - - script: - """ - echo '' > .${sample_id}_IDBA_version - { - idba_ud -l ${fasta_reads_single} --num_threads $task.cpus -o . 
- mv contig.fa ${sample_id}_IDBA-UD.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_IDBA-UD.fasta - } - rm begin align-* contig-* graph-* kmer local-* || true - """ -} - -// MEGAHIT -process MEGAHIT { - tag { sample_id } - publishDir "results/$sample_id/assembly/MEGAHIT/", pattern: '*_megahit*.fasta' - - when: - params.megahit - - input: - tuple sample_id, file(fastq_pair) from IN_MEGAHIT - val kmers from Channel.value(params.megahitKmerSize) - - output: - tuple sample_id, val('MEGAHIT'), file('*_MEGAHIT.fasta') into OUT_MEGAHIT - file '.*version' into MEGAHIT_VERSION - - script: - """ - /NGStools/megahit/bin/megahit -v | awk -F ' ' '{print \$2}' | awk -F 'v' '{print \$2}' | awk NF \ - > .${sample_id}_MEGAHIT_version - { - /NGStools/megahit/bin/megahit --num-cpu-threads $task.cpus -o megahit --k-list $kmers \ - -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} - - mv megahit/final.contigs.fa ${sample_id}_MEGAHIT.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_MEGAHIT.fasta - } - rm -r megahit || true - """ -} - -// METAHIPMER2 -process reformat_METAHIPMER2 { - tag { sample_id } - - when: - params.metahipmer2 - - input: - tuple sample_id, file(fastq_pair) from IN_METAHIPMER2 - - output: - tuple sample_id, file('*.fastq') into REFORMAT_METAHIPMER2 - - script: - "reformat.sh in=${fastq_pair[0]} in2=${fastq_pair[1]} out=${sample_id}_reads.fastq" -} - -process METAHIPMER2 { - tag { sample_id } - publishDir "results/$sample_id/assembly/MetaHipMer2/" - - when: - params.metahipmer2 - - input: - tuple sample_id, file(fasta_reads_single) from REFORMAT_METAHIPMER2 - val kmer from Channel.value(params.metahipmer2KmerSize) - - output: - tuple sample_id, val('MetaHipMer2'), file('*_MetaHipMer2.fasta') into OUT_METAHIPMER2 - file '.*version' into METAHIPMER2_VERSION - - script: - """ - mhm2.py -h | grep "version" | awk -F ' ' '{print \$3}' > .${sample_id}_MetaHipMer2_version - { - mhm2.py -r $fasta_reads_single -k $kmer -s 0 --max-kmer-store 20 --procs $task.cpus \ - --max-rpcs-in-flight 50 --shared-heap 800mb - - mv mhm2-run*/final_assembly.fasta ${sample_id}_MetaHipMer2.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_MetaHipMer2.fasta - } - rm -r mhm2-run* || true - """ -} - -// METASPADES -process METASPADES { - tag { sample_id } - publishDir "results/$sample_id/assembly/metaSPAdes/" - - when: - params.metaspades - - input: - tuple sample_id, file(fastq_pair) from IN_METASPADES - val kmers from Channel.value(params.metaspadesKmerSize) - - output: - tuple sample_id, val('metaSPAdes'), file('*_metaspades.fasta') into OUT_METASPADES - file '.*version' into METASPADES_VERSION - - script: - """ - metaspades.py --version &> version - cat version | awk -F ' ' '{print \$4}' | awk -F 'v' '{print \$2}' > .${sample_id}_metaSPAdes_version - rm version - { - metaspades.py --only-assembler --threads $task.cpus -k $kmers \ - -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} -o metaspades - - mv metaspades/contigs.fasta ${sample_id}_metaspades.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_metaspades.fasta - } - rm -r metaspades || true - """ -} - -// MINIA -process MINIA { - tag {sample_id} - publishDir "results/$sample_id/assembly/MINIA/" - - when: - params.minia - - input: - tuple sample_id, file(fastq) from IN_MINIA - val kmer from Channel.value(params.miniaKmerSize) - - output: - tuple sample_id, val('MINIA'), file('*_minia.fasta') into OUT_MINIA - file '.*version' into MINIA_VERSION - - script: - """ - minia -v | 
head -n 1 | awk -F ' ' '{print \$3}' | awk -F 'v' '{print \$2}' | awk NF > .${sample_id}_MINIA_version - { - ls -1 $fastq > list_reads - minia -in list_reads -out ${sample_id}_minia.fasta -nb-cores $task.cpu - - mv ${sample_id}_minia.fasta.contigs.fa ${sample_id}_minia.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_minia.fasta - } - rm list_reads *.unitigs.* *.h5 || true - """ -} - -// SKESA -process SKESA { - tag { sample_id } - publishDir "results/$sample_id/assembly/SKESA/" - - when: - params.skesa - - input: - tuple sample_id, file(fastq_pair) from IN_SKESA - - output: - tuple sample_id, val('SKESA'), file('*_skesa.fasta') into OUT_SKESA - file '.*version' into SKESA_VERSION - - script: - """ - skesa -v | tail -n 1 | awk -F ' ' '{print \$2}' | awk NF > .${sample_id}_SKESA_version - { - skesa --cores $task.cpus --memory $task.memory --use_paired_ends --contigs_out ${sample_id}_skesa.fasta \ - --fastq ${fastq_pair[0]} ${fastq_pair[1]} - - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_skesa.fasta - } - """ -} - -// SPADES -process SPADES { - tag { sample_id } - publishDir "results/$sample_id/assembly/SPAdes/", pattern: '*.fasta' - - when: - params.spades - - input: - tuple sample_id, file(fastq_pair) from IN_SPADES - val kmers from Channel.value(params.spadesKmerSize) - - output: - tuple sample_id, val('SPAdes'), file('*_spades.fasta') into OUT_SPADES - file '.*version' into SPADES_VERSION - - script: - """ - spades.py --version &> version - cat version | awk -F ' ' '{print \$4}' | awk -F 'v' '{print \$2}' > .${sample_id}_SPAdes_version - rm version - { - spades.py --only-assembler --threads $task.cpus -k $kmers \ - -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} -o spades - - mv spades/contigs.fasta ${sample_id}_spades.fasta - } || { - echo fail > .status - :> ${sample_id}_spades.fasta - } - rm -r spades || true - """ -} - -// UNICYCLER -process UNICYCLER { - tag { sample_id } - publishDir "results/$sample_id/assembly/unicycler" - - when: - params.unicycler - - input: - tuple sample_id, file(fastq_pair) from IN_UNICYCLER - - output: - tuple sample_id, val('Unicycler'), file('*_unicycler.fasta') into OUT_UNICYCLER - file '.*version' into UNICYCLER_VERSION - - script: - """ - unicycler --version | awk -F ' v' '{print \$2}' | awk NF > .${sample_id}_Unicycler_version - { - unicycler -t $task.cpus -o . 
--no_correct --no_pilon \ - -1 ${fastq_pair[0]} -2 ${fastq_pair[1]} - - mv assembly.fasta ${sample_id}_unicycler.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_unicycler.fasta - } - rm *best_spades_graph* *overlaps_removed* *bridges_applied* *final_clean* || true - """ -} - -// VELVETOPTIMISER -process VELVETOPTIMISER { - tag { sample_id } - publishDir "results/$sample_id/assembly/VelvetOtimiser" - - when: - params.velvetoptimiser - - input: - tuple sample_id, file(fastq_pair) from IN_VELVETOPTIMISER - - output: - tuple sample_id, val('VelvetOptimiser'), file('*.fasta') into OUT_VELVETOPTIMISER - file '.*version' into VELVETOPTIMISER_VERSION - - script: - """ - VelvetOptimiser.pl --version | awk -F ' ' '{print \$2}' | awk NF > .${sample_id}_VelvetOptimiser_version - { - VelvetOptimiser.pl -v -s $params.velvetoptimiser_hashs -e $params.velvetoptimiser_hashe -t $task.cpus \ - -f '-shortPaired -fastq.gz -separate ${fastq_pair[0]} ${fastq_pair[1]}' - - mv auto_data*/contigs.fa ${sample_id}_velvetoptimiser.fasta - echo pass > .status - } || { - echo fail > .status - :> ${sample_id}_velvetoptimiser.fasta - } - rm -r auto_data* || true - """ -} - -// VERSION COLLECTION -ABYSS_VERSION.mix(BCALM2_VERSION, - GATB_VERSION, - IDBA_VERSION, - MINIA_VERSION, - MEGAHIT_VERSION, - METAHIPMER2_VERSION, - METASPADES_VERSION, - SKESA_VERSION, - SPADES_VERSION, - UNICYCLER_VERSION, - VELVETOPTIMISER_VERSION).set{ALL_VERSIONS} - -process PROCESS_VERSION { - - input: - file version from ALL_VERSIONS.collect() - - output: - file 'versions.json' into VERSIONS_JSON - - script: - template 'process_versions.py' -} - -// ASSEMBLY COLLECTION -OUT_ABYSS.mix(OUT_BCALM2, - OUT_GATB, - OUT_IDBA, - OUT_MEGAHIT, - OUT_METAHIPMER2, - OUT_METASPADES, - OUT_MINIA, - OUT_SKESA, - OUT_SPADES, - OUT_UNICYCLER, - OUT_VELVETOPTIMISER).set { ALL_ASSEMBLERS } - -ALL_ASSEMBLERS.into { TO_FILTER; TO_GLOBAL_STATS; TO_READ_MAPPING_ALL } - -// FILTER ASSEMBLY -process FILTER_ASSEMBLY { - - tag { sample_id; assembler } - publishDir "results/$sample_id/assembly/filtered/" - - input: - tuple sample_id, assembler, file(assembly) from TO_FILTER - val minLen from Channel.value(params.minLength) - - output: - tuple sample_id, assembler, file('filtered_*') into OUT_FILTERED - - script: - "reformat.sh in=${assembly} out=filtered_${assembly} minlength=${minLen}" -} - -OUT_FILTERED.into { IN_ASSEMBLY_MAPPING; IN_READ_MAPPING_FILTERED } - -// READ MAPPING -process READ_MAPPING{ - - tag { assembler } - - publishDir "results/$sample_id/mapping/reads" - - input: - tuple sample_id, assembler, assembly, filtered_assembly from TO_READ_MAPPING_ALL.join(IN_READ_MAPPING_FILTERED, by: [0,1]) - - output: - file '*_read_mapping_*.txt' optional true - tuple sample_id, assembler, file('*_read_mapping_report.json') into OUT_READ_MAPPING - - script: - template 'read_mapping.py' -} - -// ASSEMBLY MAPPING -process ASSEMBLY_MAPPING{ - - tag { sample_id; assembler } - - publishDir "results/$sample_id/mapping/assembly" - - input: - tuple sample_id, assembler, file(assembly) from IN_ASSEMBLY_MAPPING - each reference from IN_MAPPING_CONTIGS - - output: - tuple sample_id, assembler, file(assembly), file('*.paf') into OUT_ASSEMBLY_MAPPING - - script: - """ - minimap2 --cs -N 50 --secondary=no -t $task.cpus -r 10000 -g 10000 -x asm20 --eqx ${reference} ${assembly} \ - > ${sample_id}_${assembler}.paf - """ - -} - -OUT_ASSEMBLY_MAPPING.into { IN_ASSEMBLY_MAPPING_FOR_STATS; IN_GAP_ASSESSMENT; IN_SNP_ASSESSMENT; IN_MISASSEMBLY } - -// ASSEMBLY STATS 
GLOBAL -process ASSEMBLY_STATS_GLOBAL { - tag { assembler } - - publishDir "results/$sample_id/stats/assembly" - - input: - tuple sample_id, assembler, file(assembly), file(read_mapping) from TO_GLOBAL_STATS.join(OUT_READ_MAPPING, by: [0,1]) - - output: - file '*report.json' into OUT_ASSEMBLY_STATS_GLOBAL_JSON - file '*.csv' into OUT_ASSEMBLY_STATS_GLOBAL_TSV - - script: - template 'assembly_stats_global.py' -} - -process PROCESS_ASSEMBLY_STATS_GLOBAL { - - publishDir 'results/stats' - - input: - file assembly_stats_global_files from OUT_ASSEMBLY_STATS_GLOBAL_TSV.collect() - file json_report from OUT_ASSEMBLY_STATS_GLOBAL_JSON.collect() - - output: - file 'global_assembly_stats.json' into PROCESS_ASSEMBLY_STATS_GLOBAL_OUT - - script: - template 'process_assembly_stats_global.py' - -} - -process ASSEMBLY_STATS_MAPPING { - - tag { assembler } - - publishDir "results/$sample_id/stats/" - - input: - tuple sample_id, assembler, file(assembly), file(mapping) from IN_ASSEMBLY_MAPPING_FOR_STATS - each reference from IN_ASSEMBLY_STATS_MAPPING - - output: - file '*_report.json' into OUT_ASSEMBLY_STATS_MAPPING_JSON - file '*breadth_of_coverage_contigs.csv' into OUT_COVERAGE_PER_CONTIG - file '*_df.csv' into OUT_DF_ASSEMBLY_STATS_MAPPING - file '*_lx.csv' into OUT_LX_PLOT - file '*_nax.csv' into OUT_NAX_PLOT - file '*_ngx.csv' into OUT_NGX_PLOT - file '*_phred.csv' into OUT_PHRED - - script: - template 'assembly_stats_mapping.py' - -} - -process PROCESS_ASSEMBLY_STATS_MAPPING { - - publishDir 'results/stats/' - - input: - file json_report from OUT_ASSEMBLY_STATS_MAPPING_JSON.collect() - - output: - file 'global_assembly_mapping_stats.json' into PROCESS_ASSEMBLY_STATS_MAPPING_OUT - - script: - template 'process_assembly_stats_mapping.py' - -} - -process PROCESS_COMPLETNESS { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file coverage_files from OUT_COVERAGE_PER_CONTIG.collect() - - output: - file '*.html' - file 'completness_plots.json' into PLOT_PROCESS_COMPLETNESS - - script: - template 'completness_plot.py' -} - -process PLOT_LX { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file lx_files from OUT_LX_PLOT.collect() - val(scale) from IN_PLOT_SCALE_1 - - output: - file '*.html' - file 'lx.json' into PLOT_LX - - script: - template 'lx_plot.py' -} - -process PLOT_NAX { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file nax_files from OUT_NAX_PLOT.collect() - val(scale) from IN_PLOT_SCALE_2 - - output: - file '*.html' - file 'nax.json' into PLOT_NAX - - script: - template 'nax_plot.py' -} - -process PLOT_NGX { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file ngx_files from OUT_NGX_PLOT.collect() - val(scale) from IN_PLOT_SCALE_3 - - output: - file '*.html' - file 'ngx.json' into PLOT_NGX - - script: - template 'ngx_plot.py' -} - -process PROCESS_SHRIMP_PLOT { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file phred_files from OUT_PHRED.collect() - - output: - file '*.html' - file 'phred.json' into PLOT_PHRED - - script: - template 'shrimp_plot.py' -} - -process PLOT_CONTIG_DISTRIBUTION { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file dataframes from OUT_DF_ASSEMBLY_STATS_MAPPING.collect() - - output: - file '*.html' - file '*.json' into PLOT_CONTIG_DISTRIBUTION - - script: - template 'plot_contig_size.py' -} - -process GAP_ASSESSMENT { - - tag { assembler } - publishDir "results/$sample_id/stats/" - - input: - tuple sample_id, assembler, file(assembly), file(mapping) from IN_GAP_ASSESSMENT - 
each reference from IN_GAP_STATS - - output: - file '*_gap_dict.json' into OUT_GAP_DISTANCE - file '*_gaps.csv' into OUT_GAP_PLOT_REF - - script: - template 'gap_assessment.py' -} - -process PLOT_GAP_BOXPLOT { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file gap_distance_json from OUT_GAP_DISTANCE.collect() - - output: - file '*.html' - file '*gap_distance_histogram.json' into OUT_GAP_HISTOGRAM - - script: - template 'plot_gap_sizes.py' - -} - -process PLOT_GAP_REFERENCE { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file gap_coords_dataframes from OUT_GAP_PLOT_REF.collect() - - output: - file '*.html' - file '*.json' into OUT_GAP_REFERENCE - - script: - template 'plot_gap_reference.py' -} - -process SNP_ASSESSMENT { - - tag { assembler } - - input: - tuple sample_id, assembler, file(assembly), file(mapping) from IN_SNP_ASSESSMENT - each reference from IN_SNP_STATS - - output: - file '*.tsv' - file '*_snps.csv' into OUT_SNP_PLOT_REF - - script: - template 'snp_assessment.py' -} - -process PLOT_SNP_REFERENCE { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file snp_coords_dataframes from OUT_SNP_PLOT_REF.collect() - - output: - file '*.html' - file '*.json' into OUT_SNP_REFERENCE - - script: - template 'plot_snp.py' -} - -process MISASSEMBLY { - - tag { assembler } - - input: - tuple sample_id, assembler, file(assembly), file(mapping) from IN_MISASSEMBLY - - output: - file '*_trace.pkl' into OUT_MISASSEMBLY_TRACE - file '*_contig_lenght.pkl' into OUT_MISASSEMBLY_CONTIGS - file '*_misassembly.json' into MISASSEMBLY_REPORT - file '*_misassembled_reference.json' into MISASSEMBLY_DICTIONARY - file '*_misassembly.csv' into PLOT_MISASSEMBLY_REF - - script: - template 'misassembly.py' - -} - -process PROCESS_MISASSEMBLY { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file misassembly_trace from OUT_MISASSEMBLY_TRACE.collect() - file misassembly_contigs from OUT_MISASSEMBLY_CONTIGS.collect() - file report_data from MISASSEMBLY_REPORT.collect() - file report_per_reference from MISASSEMBLY_DICTIONARY.collect() - - output: - file '*.html' - file '*_misassembly.json' into OUT_MISASSEMBLY_PLOT - file 'misassembly_report.json' into OUT_MISASSEMBLY_REPORT - file 'misassembly_report_per_ref.json' into MISASSEMBLY_PER_REF - - script: - template 'process_misassembly.py' - -} - -process PLOT_MISASSEMBLY { - - publishDir 'results/plots/', pattern: '*.html' - - input: - file misassembly_dataframes from PLOT_MISASSEMBLY_REF.collect() - - output: - file '*.html' - file '*.json' into OUT_MISASSEMBLY_REFERENCE - - script: - template 'plot_misassembly.py' - -} - -/** Reports -Compiles the reports from every process -**/ - -OUT_ASSEMBLY_STATS_GLOBAL_JSON.set{master_report} - -process compile_reports { - - publishDir 'report/', mode: 'copy' - - input: - file reads_json from PROCESS_READS.collect() - file global_assembly_stats from PROCESS_ASSEMBLY_STATS_GLOBAL_OUT - file pipeline_stats from Channel.fromPath("${workflow.projectDir}/pipeline_stats.txt") - file js from Channel.fromPath("${workflow.projectDir}/resources/main.js.zip") - file lmas_png from Channel.fromPath("${workflow.projectDir}/resources/lmas.zip") - file reference_file from TO_REPORT - file contig_size_distribution from PLOT_CONTIG_DISTRIBUTION - file mapping_assembly_stats from PROCESS_ASSEMBLY_STATS_MAPPING_OUT - file completness_plots from PLOT_PROCESS_COMPLETNESS - file lx_plots from PLOT_LX - file shrimp_plots from PLOT_PHRED - file gap_reference_json from OUT_GAP_REFERENCE - 
file snp_reference_json from OUT_SNP_REFERENCE - file gap_histogram from OUT_GAP_HISTOGRAM - file plot_misassemblies from OUT_MISASSEMBLY_PLOT - file misassembly_data from OUT_MISASSEMBLY_REPORT - file nax_plots from PLOT_NAX - file ngx_plots from PLOT_NGX - file versions_json from VERSIONS_JSON - file misassembly_per_ref from MISASSEMBLY_PER_REF - file plot_misassembly_per_ref from OUT_MISASSEMBLY_REFERENCE - file about_md from Channel.fromPath(params.md) - file containers_config from Channel.fromPath("${workflow.projectDir}/configs/containers.config") - - output: - file 'pipeline_report*.json' - file 'index.html' - file 'main.js' - file '*.jpg' - file 'performance_metadata.json' - file 'reference_metadata.json' - - script: - template 'compile_reports.py' -} - -workflow.onComplete { - // Display complete message - log.info "Completed at: " + workflow.complete - log.info "Duration : " + workflow.duration - log.info "Success : " + workflow.success - log.info "Exit status : " + workflow.exitStatus -} - -workflow.onError { - // Display error message - log.info "Workflow execution stopped with the following message:" - log.info " " + workflow.errorMessage -} diff --git a/README.md b/README.md index cdbdc41..329b17b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,20 @@ # LMAS +![Nextflow CI](https://github.com/cimendes/LMAS/actions/workflows/ci_nextflow.yml/badge.svg) +![Pytest CI](https://github.com/cimendes/LMAS/actions/workflows/ci_templates.yml/badge.svg) [![Documentation Status](https://readthedocs.org/projects/lmas/badge/?version=latest)](https://lmas.readthedocs.io/en/latest/?badge=latest) -[![DOI Dataset](https://zenodo.org/badge/DOI/10.5281/zenodo.4742651.svg)](https://doi.org/10.5281/zenodo.4742651) +[![Nextflow](https://img.shields.io/badge/nextflow-DLS2-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.01.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![Anaconda-Server Badge](https://anaconda.org/bioconda/LMAS/badges/installer/conda.svg)](https://bioconda.github.io/recipes/lmas/README.html) +[![Anaconda-Server Badge](https://anaconda.org/bioconda/LMAS/badges/downloads.svg)](https://anaconda.org/bioconda/LMAS) + +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +[![run with shifter](https://img.shields.io/badge/run%20with-shifter-lightgrey?labelColor=000000)](https://github.com/NERSC/shifter/) + +[![DOI Dataset](https://zenodo.org/badge/DOI/10.5281/zenodo.4742651.svg)](https://doi.org/10.5281/zenodo.4742651) +[![Demo Report](https://img.shields.io/badge/Demo-lmas--demo.herokuapp.com%2F-blue)](https://lmas-demo.herokuapp.com/) _ __ __ _ ___ @@ -12,74 +24,179 @@ Last Metagenomic Assembler Standing -## Table of Contents - - * [Overview](#overview) - * [Instalation](#instalation) - + [Nextflow](#nextflow) - + [Container Engine](#container-engine) - + [Clone LMAS](#clone-lmas) - * [Running LMAS](#running-lmas) - * [Customizing LMAS](#customizing-lmas) - * [Output and Report](#output-and-report) - * [Proof of concept](#proof-of-concept) - + [ZymoBIOMICS Microbial Community Standard](#zymobiomics-microbial-community-standard) - * [Citation and Contacts](#citation-and-contacts) - - ## Overview The *de novo* assembly of raw sequence data is a key process when analysing data from shotgun 
metagenomic sequencing. It allows recovering draft genomes from a pool of mixed raw reads, yielding longer sequences that offer contextual information and afford a more complete picture of the microbial community. It also represents one of the greatest bottlenecks when obtaining trustworthy, reproducible results. -LMAS is an automated workflow enabling the benchmarking of traditional and metagenomic prokaryotic *de novo* assembly software using defined mock communities. The results are presented in an interactive HTML report where selected global and reference specific performance metrics can be explored. +**LMAS is an automated workflow enabling the benchmarking of traditional and metagenomic prokaryotic *de novo* assembly software using defined mock communities**. The results are presented in an interactive HTML report where selected global and reference specific performance metrics can be explored. -LMAS expects **tripled reference sequences**. Each reference should be provided in a single FASTA file where the linearized -reference replicons are concatenated three times to ensure that contigs can fully align even with start-end overlap and -regardless of their starting position relative to that of the reference. Read data, in **paired-end form*, reflecting the -sequences in the reference genomes, is required to be passed on for assembly and downstream quality assessment. +![LMAS Workflow](https://github.com/cimendes/LMAS/blob/main/docs/resources/LMAS_ECCMID.png) -The mock communities can be provided by the user to better reflect the samples of interest. New assemblers can be added with minimal changes to the pipeline, so that LMAS can be expanded as novel algorithms are developed. ## Instalation -Before installing LMAS, a few dependencies must be installed in your system: +All components of LMAS are executed in docker containers, which means that you’ll need to have a container engine installed. The container engines available are the ones supported by Nextflow: -### Nextflow +* [Docker](https://www.nextflow.io/docs/latest/docker.html), +* [Singularity](https://www.nextflow.io/docs/latest/singularity.html), +* [Shifter](https://github.com/NERSC/shifter) + +If you already have any one of these installed, you are good to go as the provided docker containers are compatible +with all engines available. If not, you’ll need to install one. + +### Conda + +LMAS can be easily installed through [Conda](https://conda.io/en/latest/), an open source package management system and environment management system that runs on Windows, macOS and Linux. After its installation, LMAS is available on [Bioconda](https://anaconda.org/bioconda/LMAS) and can be easily installed with: + +```bash +conda install -c bioconda lmas +``` + +### Manual installation + +To install LMAS manually you'll first have to install nextflow. +#### Nextflow Nextflow (version 20.01.0 or higher) can be used on any POSIX compatible system (Linux, OS X, etc). It requires BASH and Java 8 (or higher) to be installed. More instructions are available [here](https://www.nextflow.io/docs/latest/getstarted.html). -### Container Engine +#### Clone LMAS -All components of LMAS are executed in docker containers, which means that you’ll need to have a container engine -installed. The container engines available are the ones supported by Nextflow: +You can clone this repository with `git clone git@github.com:cimendes/LMAS.git`, and all files will be in your local machine. 
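+
+After cloning, you can check that everything is wired up correctly by assembling the tiny test dataset that ships with the repository. This is a minimal sketch mirroring the command used in the CI workflow above; it assumes Docker is available and that it is run from the repository root:
+
+```bash
+nextflow run main.nf -profile docker \
+    --fastq "test/data/tiny_{1,2}.*" \
+    --reference "test/data/tiny_reference.fasta" \
+    --max_cpus 2 --max_memory 4.GB
+```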
-* [Docker](https://www.nextflow.io/docs/latest/docker.html),
-* [Singularity](https://www.nextflow.io/docs/latest/singularity.html),
-* [Shifter](https://github.com/NERSC/shifter) (undocumented)
-
-If you already have any one of these installed, you are good to go as the provided docker containers are compatible
-with all engines available. If not, you’ll need to install one.
-
-### Clone LMAS
-
-You can clone this repository with `git clone https://github.com/cimendes/LMAS.git`, and all files will be in your local machine.
-
 ## Running LMAS
 
+To run LMAS you can simply call it with:
+```bash
+ LMAS
+```
+If no option or `--help` is provided, LMAS will display its help message. Otherwise, the `--fastq` and `--reference` options are mandatory. By default they are set to `'data/fastq/*_{1,2}.*'` and `'data/reference/*.fasta'` respectively.
+
+Alternatively you can call LMAS directly with Nextflow:
+```bash
+ nextflow run main.nf
+```
+To use LMAS the following options are available:
+
+
+         _   __  __   _   ___
+     /\︵︵/\  | |  | \/ | /_\ / __|
+    (◕('人')◕) | |__| |\/| |/ _ \\__ \
+       |︶|    |____|_|  |_/_/ \_\___/
+
+      Last Metagenomic Assembler Standing
+
+    Input parameters:
+        --fastq                   Path expression to paired-end fastq files.
+                                  (default: data/fastq/*_{1,2}.*)
+        --reference               Path to the genome reference fasta file.
+                                  (default: data/reference/*.fasta)
+        --md                      Path to markdown with input sample description for report (optional).
+                                  (default: data/*.md)
+
+    Mapping and filtering parameters:
+        --minLength               Value for minimum contig length, in base pairs.
+                                  (default: 1000)
+        --mapped_reads_threshold  Value for the minimum percentage of a read aligning to the
+                                  contig to be considered as mapped.
+                                  (default: 0.75)
+
+    Assembly quality assessment parameters:
+        --n_target                Target value for the N, NA and NG metrics, ranging from 0 to 1.
+                                  (default: 0.5)
+        --l_target                Target value for the L metric, ranging from 0 to 1.
+                                  (default: 0.9)
+        --plot_scale              Scale of x-axis for the L, NA and NG metrics plots.
+                                  Allowed values: 'linear' or 'log'.
+                                  (default: log)
+
+    Assembly execution parameters:
+        --abyss                   Boolean controlling the execution of the ABySS assembler.
+                                  (default: false)
+        --abyssKmerSize           K-mer size for the ABySS assembler, as an integer.
+                                  (default 96)
+        --abyssBloomSize          Bloom filter size for the ABySS assembler.
+                                  It must be a string with a value and a unit.
+                                  (default: 2G)
+        --gatb_minia              Boolean controlling the execution of the GATB Minia Pipeline assembler.
+                                  (default: true)
+        --gatbKmerSize            K-mer sizes for the GATB Minia Pipeline assembler.
+                                  It must be a string with the values separated by commas.
+                                  (default 21,61,101,141,181)
+        --gatb_besst_iter         Number of iterations during BESST scaffolding for the
+                                  GATB Minia Pipeline assembler.
+                                  (default 10000)
+        --gatb_error_correction   Boolean controlling whether to skip error correction for the
+                                  GATB Minia Pipeline assembler.
+                                  (default false)
+        --idba                    Boolean controlling the execution of the IDBA-UD assembler.
+                                  (default true)
+        --metahipmer2             Boolean controlling the execution of the MetaHipMer2 assembler.
+                                  (default: false)
+        --metahipmer2KmerSize     K-mer sizes for the MetaHipMer2 assembler.
+                                  It must be a string with the values separated by commas.
+                                  (default 21,33,55,77,99)
+        --minia                   Boolean controlling the execution of the minia assembler.
+                                  (default: false)
+        --miniaKmerSize           K-mer size for the minia assembler, as an integer.
+                                  (default 31)
+        --megahit                 Boolean controlling the execution of the MEGAHIT assembler.
+                                  (default true)
+        --megahitKmerSize         K-mer sizes for the MEGAHIT assembler.
+                                  It must be a string with the values separated by commas.
+                                  (default 21,29,39,59,79,99,119,141)
+        --metaspades              Boolean controlling the execution of the metaSPAdes assembler.
+                                  (default true)
+        --metaspadesKmerSize      K-mer sizes for the metaSPAdes assembler.
+                                  It must be a string with 'auto' or the values separated with a space.
+                                  (default auto)
+        --spades                  Boolean controlling the execution of the SPAdes assembler.
+                                  (default true)
+        --spadesKmerSize          K-mer sizes for the SPAdes assembler.
+                                  It must be a string with 'auto' or the values separated with a space.
+                                  (default auto)
+        --skesa                   Boolean controlling the execution of the SKESA assembler.
+                                  (default true)
+        --unicycler               Boolean controlling the execution of the Unicycler assembler.
+                                  (default true)
+        --velvetoptimiser         Boolean controlling the execution of the VelvetOptimiser assembler.
+                                  (default: false)
+        --velvetoptimiser_hashs   Starting K-mer size for the VelvetOptimiser assembler, as an integer.
+                                  (default 19)
+        --velvetoptimiser_hashe   End K-mer size for the VelvetOptimiser assembler, as an integer.
+                                  (default 31)
+
+    Execution resources parameters:
+        --cpus                    Number of CPUs for the assembly and mapping processes, as an integer.
+                                  This resource is doubled for each retry until max_cpus is reached.
+                                  (default 8)
+        --memory                  Memory for the assembly and mapping processes, in the format of
+                                  'value'.'unit'.
+                                  This resource is doubled for each retry until max_memory is reached.
+                                  (default 32 GB)
+        --time                    Time limit for the assembly and mapping processes, in the format of
+                                  'value'.'unit'.
+                                  This resource is doubled for each retry until max_time is reached.
+                                  (default 1d)
+        --max_cpus                Maximum number of CPUs for the assembly and mapping processes,
+                                  as an integer. It overwrites the --cpus parameter.
+                                  (default 32)
+        --max_memory              Maximum memory for the assembly and mapping processes, in the format of
+                                  'value'.'unit'. It overwrites the --memory parameter.
+                                  (default 100 GB)
+        --max_time                Maximum time for the assembly and mapping processes, in the format of
+                                  'value'.'unit'. It overwrites the --time parameter.
+                                  (default 3d)
+
+The reference sequences, in a single file, can be passed with the `--reference` parameter, and `--fastq` receives the raw data for assembly.
 The raw data is a collection of sequence fragments from the references, and can be either obtained *in silico* or from real sequencing platforms.
-
-## Customizing LMAS
-
 Users can customize the workflow execution either by using command line options or by modifying a simple plain-text
-configuration file (`params.config`), where parameters are set as key-value pairs. The version of tools used can also
-be changed by providing new container tags in the appropriate configuration file (`containers.config`), as well as the
-resources for each process (`resources.config`).
+configuration file (`conf/params.config`), where parameters are set as key-value pairs. The version of tools used can also
+be changed by providing new container tags in the appropriate configuration file (`conf/containers.config`).
+
+Users can select which profile to use with the `-profile` option. Several configurations are available in the profile configuration file (`conf/profiles.config`).
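+For example, a minimal sketch of launching LMAS on a SLURM cluster with Singularity, using the `slurm_singularity` profile defined in `conf/profiles.config` (the input patterns below are the documented defaults and may need adjusting to your data):
+
+```bash
+nextflow run main.nf -profile slurm_singularity \
+    --fastq "data/fastq/*_{1,2}.*" \
+    --reference "data/reference/*.fasta"
+```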
+For a local execution we recommend running LMAS with either `-profile docker` or `-profile singularity`. HPC compatibility is available for SLURM, SGE, LSF, among others.
 
 ## Output and Report
 
@@ -92,16 +209,18 @@ workflow was executed. To open the report simply click on the **index.html** file
 in your default browser.
 
 LMAS comes pre-packaged with the JS source code for the interactive report, available in the `resources/` folder.
-The source code for the report is available in the [lmas_report](https://github.com/cimendes/lmas_report) repository.
+The source code for the report is available in the [LMAS.js](https://github.com/B-UMMI/LMAS.js) repository.
 
-## Evaluation
+## Quick Start
 
 ### ZymoBIOMICS Microbial Community Standard
 
-A script to download and structure the ZymoBIOMICS data to be used as input is provided (`get_data.sh`).
+A bash script to download and structure the ZymoBIOMICS data to be used as input is provided (`get_data.sh`):
 
-Running this scipt downloads the [eight bacterial genomes and four plasmids of the ZymoBIOMICS Microbial Community Standards](https://zenodo.org/record/4588970#.YEeA83X7RhE) were used as the triple reference.
-It contains tripled complete sequences for the following species:
+    sh get_data.sh
+
+Running this script downloads the [eight bacterial genomes and four plasmids of the ZymoBIOMICS Microbial Community Standards](https://zenodo.org/record/4588970#.YEeA83X7RhE) that are used as reference.
+It contains complete sequences for the following species:
 - *Bacillus subtilis*
 - *Enterococcus faecalis*
 - *Escherichia coli*
@@ -115,9 +234,23 @@
 - *Staphylococcus aureus* plasmid 2
 - *Staphylococcus aureus* plasmid 3
 
-It also downloads the raw sequence data of the mock communities, with an even and logarithmic distribution of species ([ERR2984773](https://www.ebi.ac.uk/ena/browser/view/ERR2984773) and [ERR2935805](https://www.ebi.ac.uk/ena/browser/view/ERR2935805)), and a simulated sample of the evenly distributed reads generated from the genomes in the Zymobiomics standard ([mockSample](https://zenodo.org/record/4588970#.YEeA83X7RhE)).
+It also downloads the raw sequence data of the mock communities, with an even ([ERR2984773](https://www.ebi.ac.uk/ena/browser/view/ERR2984773)) and logarithmic distribution of species ([ERR2935805](https://www.ebi.ac.uk/ena/browser/view/ERR2935805)), and the [complete reference sequences](https://zenodo.org/record/5579145/files/ZymoBIOMICS_genomes.fasta).
+
+Simulated samples of the evenly and log distributed reads, with and without error, were generated from the genomes in the ZymoBIOMICS standard with [InSilicoSeq](https://github.com/HadrienG/InSilicoSeq) (version 1.5.2):
+- ENN - Evenly distributed sample with no error model
+- EMS - Evenly distributed sample with Illumina MiSeq error model
+- LNN - Log distributed sample with no error model
+- LHS - Log distributed sample with Illumina HiSeq error model
+
+[DOI Dataset](https://doi.org/10.5281/zenodo.4588970)
+
+After downloading the data you can simply run LMAS, with default parameters, with the following command:
+
+    LMAS -profile docker
+
+or
 
-The resulting LMAS report is available at [https://lmas-demo.herokuapp.com](https://lmas-demo.herokuapp.com)
+
+    nextflow run main.nf -profile docker
 
 ## Citation and Contacts
 
@@ -125,4 +258,4 @@ LMAS is developed at the Molecular [Microbiology and Infection Unit (UMMI)](http
 
 This project is licensed under the [GPLv3 license](https://github.com/cimendes/LMAS/blob/main/LICENSE).
-If you use LMAS please cite this repository. +If you use LMAS please [cite this repository](https://github.com/cimendes/LMAS/blob/main/CITATION.cff). diff --git a/configs/containers.config b/conf/containers.config similarity index 86% rename from configs/containers.config rename to conf/containers.config index 2840801..a39bb0c 100644 --- a/configs/containers.config +++ b/conf/containers.config @@ -1,25 +1,19 @@ process { + withName: REFORMAT { + container = "pcerqueira/bbtools:38.44" + } withName: ABYSS { container = "cimendes/abyss:2.3.1-1" } - withName: BCALM2 { - container = "cimendes/bcalm:2.2.3-1" - } withName: GATBMINIAPIPELINE { container = "cimendes/gatb-minia-pipeline:31.07.2020-1" } - withName: reformat_IDBA { - container = "pcerqueira/bbtools:38.44" - } withName: IDBA { container = "cimendes/idba:1.1.3-1" } withName: MEGAHIT { container = "cimendes/megahit-assembler:1.2.9-1" } - withName: reformat_METAHIPMER2 { - container = "pcerqueira/bbtools:38.44" - } withName: METAHIPMER2 { container = "cimendes/mhm2:v2.0.0-65-gaad446d-generic" } diff --git a/configs/params.config b/conf/params.config similarity index 89% rename from configs/params.config rename to conf/params.config index e497c51..856ac77 100644 --- a/configs/params.config +++ b/conf/params.config @@ -13,14 +13,10 @@ params { --------------------------- */ //abyss - abyss = true + abyss = false abyssKmerSize = 96 abyssBloomSize = '2G' - //BCALM2 - bcalm = true - bcalmKmerSize = 31 - //GATB Minia Pipeline gatb_minia = true gatbKmerSize = '21,61,101,141,181' @@ -31,11 +27,11 @@ params { idba = true //METAHIPMER2 - metahipmer2 = true + metahipmer2 = false metahipmer2KmerSize = '21,33,55,77,99' //Minia - minia = true + minia = false miniaKmerSize = 31 //MEGAHIT @@ -57,7 +53,7 @@ params { unicycler = true //VelvetOptimiser - velvetoptimiser = true + velvetoptimiser = false velvetoptimiser_hashs = 19 velvetoptimiser_hashe = 31 @@ -73,4 +69,4 @@ params { n_target = 0.5 l_target = 0.9 plot_scale = 'log' -} \ No newline at end of file +} diff --git a/configs/profiles.config b/conf/profiles.config similarity index 58% rename from configs/profiles.config rename to conf/profiles.config index deec516..d1e7ab5 100644 --- a/configs/profiles.config +++ b/conf/profiles.config @@ -7,94 +7,18 @@ profiles { singularity.enabled = true } - docker { - docker.enabled = true + singularity { + singularity.enabled = true } - oneida { - - process.executor = "slurm" + docker { docker.enabled = true - - process{ - - // MEMORY USAGE PER PROCESS // - // general memory usage - memory = "4GB" - - } - - } - - // INCD PROFILE - incd { - - process.executor = "slurm" - singularity.enabled = true - - singularity { - cacheDir = "/mnt/singularity_cache" - autoMounts = true - } - - // Error and retry strategies - process.errorStrategy = "retry" - maxRetries = 3 - - process.$chewbbaca.queue = "chewBBACA" - - process { - - // MEMORY USAGE PER PROCESS // - // general memory usage - memory = "4GB" - - } - } - // SLURM PROFILE - slurm { - - // Change executor for SLURM - process.executor = "slurm" - // Change container engine for Shifter + imm_lobo { + process.clusterOptions = "--qos oneida" shifter.enabled = true - - process { - - clusterOptions = "--qos=oneida" - - errorStrategy = "retry" - maxRetries = 5 - - // MEMORY USAGE PER PROCESS // - // general memory usage - memory = "4GB" - - } - - } - - // SLURM PROFILE - slurmOneida { - - // Change executor for SLURM process.executor = "slurm" - // Change container engine for Shifter - shifter.enabled = true - - process 
{ - - clusterOptions = "--qos=oneida" - - // MEMORY USAGE PER PROCESS // - // general memory usage - memory = "4GB" - - // Set QOS for chewbbaca in order to run a single job - $chewbbaca.clusterOptions = "--qos=chewbbaca" - } } // SLURM executor @@ -113,6 +37,11 @@ profiles { process.executor = "slurm" } + slurm_singularity { + singularity.enabled = true + process.executor = "slurm" + } + // SGE executor sge_sing { singularity.enabled = true diff --git a/configs/resources.config b/configs/resources.config deleted file mode 100644 index a22cdc9..0000000 --- a/configs/resources.config +++ /dev/null @@ -1,54 +0,0 @@ -process { - withName: ABYSS { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: BCALM2 { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: GATBMINIAPIPELINE { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: IDBA { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: MEGAHIT { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: METAHIPMER2 { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: METASPADES { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: MINIA { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: SKESA { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: SPADES { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: UNICYCLER { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: VELVETOPTIMISER { - cpus = 8 - memory = {32.Gb*task.attempt} - } - withName: READ_MAPPING { - cpus = 8 - memory = {20.Gb*task.attempt} - } -} \ No newline at end of file diff --git a/configs/test.config b/configs/test.config deleted file mode 100644 index e69de29..0000000 diff --git a/docker/bcalm/Dockerfile b/docker/bcalm/Dockerfile deleted file mode 100644 index 8b467d7..0000000 --- a/docker/bcalm/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -FROM ubuntu:16.04 - -LABEL software="BCALM" \ - software.version="2.2.3" \ - about.home="https://github.com/GATB/bcalm" \ - about.summary="Compacted de Bruijn graph construction in low memory" \ - about.documentation="https://github.com/GATB/bcalm#readme" \ - about.license="https://github.com/GATB/bcalm/blob/master/LICENSE" \ - author="Inês Mendes " - -RUN apt-get update && apt-get -y install build-essential \ - wget \ - git - -RUN apt-get -y install software-properties-common -RUN add-apt-repository ppa:ubuntu-toolchain-r/test -RUN apt-get update -RUN apt-get -y install gcc-4.9 -RUN apt-get -y upgrade libstdc++6 -WORKDIR /NGStools - -# Dependencies -# BCALM -RUN wget https://github.com/GATB/bcalm/releases/download/v2.2.3/bcalm-binaries-v2.2.3-Linux.tar.gz -RUN tar xf bcalm-binaries-v2.2.3-Linux.tar.gz && rm bcalm-binaries-v2.2.3-Linux.tar.gz - -ENV PATH="/NGStools/bcalm-binaries-v2.2.3-Linux/bin:${PATH}" - -WORKDIR /data \ No newline at end of file diff --git a/docs/about/about.rst b/docs/about/about.rst index 8b2c5a6..cca51c6 100644 --- a/docs/about/about.rst +++ b/docs/about/about.rst @@ -8,4 +8,6 @@ at the `Faculty of Health Sciences, Ben-Gurion University of the Negev `_. -The source code of LMAS is available at ``_. \ No newline at end of file +The source code of LMAS is available at ``_. + +If you use LMAS please `cite LMAS repository `_. 
\ No newline at end of file
diff --git a/docs/dev/add_process.rst b/docs/dev/add_process.rst
index c024434..1e78b0d 100644
--- a/docs/dev/add_process.rst
+++ b/docs/dev/add_process.rst
@@ -2,32 +2,37 @@ Add Assembler Process
 =====================
 
 New assemblers can be added with minimal changes to the pipeline,
-so that LMAS can be expanded as novel algorithms are developed.
+so that LMAS can be expanded as novel algorithms are developed. Its implementation in DSL2
+greatly facilitates this process.
 
-The assemblers implemented are available in the **main file** in
-`LMAS.nf `_, to be executed by Nextflow.
+The assemblers implemented are available in the **assembly** module located in the
+`modules folder `_.
+The `assembly.nf `_ is the Nextflow file that contains all the assembly processes.
 
 The current available assemblers are:
-* BCALM2
-* GATB-Minia Pipeline
-* IDBA
-* MINIA
-* MEGAHIT
-* METASPADES
-* SKESA
-* SPADES
-* UNICYCLER
-* VELVETOPTIMISER
 
-To add an assembler, it must be ensured that **short-read paired-end sequence data** can be provided as input.
+* `ABySS `_
+* `GATB-Minia Pipeline `_
+* `IDBA `_
+* `MEGAHIT `_
+* `MetaHipMer2 `_
+* `metaSPAdes `_
+* `MINIA `_
+* `SKESA `_
+* `SPAdes `_
+* `Unicycler `_
+* `VelvetOptimiser `_
+
+Detailed information is available in the `Short-Read (Meta)Genomic Assemblers <../user/assemblers.html>`_ page.
+
+.. warning:: To add an assembler, it must be ensured that **short-read paired-end sequence data** can be provided as input.
 
-More information is available at `assemblers <../user/assemblers.html>`_.
 
 Changing assembler version
 -----------------------------------
 
 The easiest way to change a version of a particular assembler in LMAS is by changing the containers for the assembler process.
-This is done through altering the container property in the `containers.config `_ file.
+This is done through altering the container property in the `containers.config `_ file.
 
 For example, for the ``SPADES`` process, the container "cimendes/spades:3.15.0-1" can be altered to another one that implements a
 different version of the tool:
 
 .. code-block:: bash
 
     withName: SPADES {
         container = "cimendes/spades:3.15.0-1"
     }
 
 .. warning:: You must ensure that the assembler executable is available in the $PATH and that ps is installed
-    in the container for it to work with LMAS.
+    in the container for it to work with LMAS or any other Nextflow workflow.
 
 Adding a new assembler
 -----------------------------------
 
 Some information is required:
 
 * Minimal command to execute the assembler with short-read paired-end sequencing datasets;
 * Parameters (such as k-mer lists) to be passed onto the assembler.
 
-By default, all assemblies are run with 4 CPUs and 16Gb of memory.
+By default, all assemblies are run with 8 CPUs and 32GB of memory.
 
-Add process to LMAS.nf manually
-:::::::::::::::::::::::::::::::::
+Add process to assembly.nf manually
+:::::::::::::::::::::::::::::::::::::::::::::
 
-To add a new assembler to LMAS, a few steps must be completed:
+To add a new assembler to LMAS, a few steps must be completed. All alterations
+needed will be performed in the `assembly.nf `_ file,
+the `params.config `_ file and the
+`containers.config `_.
 
-1. **Add a new channel for the FASTQ datasets**
+1. **Add the needed parameters**
 
-A new channel to provide the raw sequence data to the new assembler must be created.
-Simply add a new channel, named for example ``IN_``, to the ``into`` operator
-that splits the data in the ``IN_fastq_raw`` channel in this `line `_.
+In the `params.config `_ file,
+add a new key-value pair for any parameter necessary to run the assembler, such as the list of k-mer values to use.
+The fastq input data is passed through the main `--fastq` parameter so it should not be included.
 
-It should look like:
-
code-block:: bash +In the the `params.config `_ file, +add a new key-value pair for any parameter necessary to run the assembler, such as the list of k-mer values to use. +The fastq input data is passed through the main `--fastq` parameter so it should not be included. - // SET CHANNELS FOR ASSEMBLERS - IN_fastq_raw.into{ - IN_PROCESS_READS; - IN_BCALM2; - IN_GATB_MINIA_PIPELINE; - IN_MINIA; - IN_MEGAHIT; - IN_METASPADES; - IN_UNICYCLER; - IN_IDBA; - IN_SPADES; - IN_SKESA; - IN_VELVETOPTIMISER; - IN_NEW_ASSEMBLER; // new channel added - IN_TO_MAP} //mapping channel - minimap2 - -.. warning:: Make sure the channel name isn't used elsewhere. Otherwise Nextflow will throw an error. +.. warning:: All assemblers in LMAS are toggleble through a `--` parameter, and this should be included in this file. 2. **Add a new process with the assembler** -Parameters to be passed on to this new process can be added in the `params.config `_ file. -You can access this values in the ``.nf`` file with ``params.``. -For example: - -.. code-block:: bash - - IN_NEW_ASSEMBLER_kmers = Channel.value(params.newassemblerKmers) - -.. warning:: Parameters need to be passed into a process through a channel. +In the `assembly.nf `_ file, you need +to add the process to execute the new assembler in the section marked with `\\PROCESSES`. To create the new process, you can use the following template, substituting ``NEW_ASSEMBLER`` with the new assembler name: @@ -116,15 +98,19 @@ assembler name: process NEW_ASSEMBLER { tag { sample_id } + label 'process_assembly' publishDir 'results/assembly/NEW_ASSEMBLER/' input: - set sample_id, file(fastq_pair) from IN_NEW_ASSEMBLER + set sample_id, path(fastq) val kmers from IN_NEW_ASSEMBLER_kmers + when: + pararm.NEW_ASSEMBLER + output: - set sample_id, val("NEW_ASSEMBLER"), file('*.fasta') into OUT_NEW_ASSEMBLER - file(".*version") into NEW_ASSEMBLER_VERSION + set sample_id, val("NEW_ASSEMBLER"), file('*.fasta'), emit: assembly + file(".*version"), emit: version script: """ @@ -143,81 +129,74 @@ assembler name: .. warning:: You can access each of the fastq files with ${fastq_pair[1]} and ${fastq_pair[2]}. +You can access this values in the ``.nf`` file with ``params.``. +For example: -3. **Add version to main version collection** +.. code-block:: bash -The channel with the version information must be merged into the main version collection channel -for it to be processed accordingly in this `line `_. + IN_NEW_ASSEMBLER_kmers = Channel.value(params.newassemblerKmers) -It should look like: +.. warning:: Parameters need to be passed into a process through a channel. -.. code-block:: bash +This should be added inside the `assembly_wf` worflow in the end of the file. - // VERSION COLLECTION - BCALM2_VERSION.mix(GATB_VERSION, - MINIA_VERSION, - MEGAHIT_VERSION, - METASPADES_VERSION, - UNICYCLER_VERSION, - SPADES_VERSION, - SKESA_VERSION, - VELVETOPTIMISER_VERSION, - NEW_ASSEMBLER_VERSION, // new channel added - IDBA_VERSION).set{ALL_VERSIONS} +Additionally, The new process needs to be added in the `main:` section of the +workflow. -4. **Add assembly to main assembly collection** +3. **Add assembly to main assembly collection** -The channel with the assembly produced must be merged into the main assembly collection channel -for it to be processed. This is done in this `line `_. +The channel with the version information must be merged into the main assembly collection channel, emitted by the `assembly_wf` workflow. It should look like: .. 
code-block:: bash - // ASSEMBLY COLLECTION - OUT_BCALM2.mix(OUT_GATB, - OUT_MINIA, - OUT_MEGAHIT, - OUT_METASPADES, - OUT_UNICYCLER, - OUT_SPADES, - OUT_SKESA, - OUT_VELVETOPTIMISER, - OUT_NEW_ASSEMBLER, // new channel added - OUT_IDBA).set{ALL_ASSEMBLERS} + all_assemblies = ABYSS.out.assembly | mix(GATBMINIAPIPELINE.out.assembly, + IDBA.out.assembly, + MEGAHIT.out.assembly, + METAHIPMER2.out.assembly, + METASPADES.out.assembly, + MINIA.out.assembly, + NEW_ASSEMBLER.out.version, // new channel added + SKESA.out.assembly, + SPADES.out.assembly, + UNICYCLER.out.assembly, + VELVETOPTIMISER.out.assembly) -5. **Add the resources for the new assembler** +.. warning:: To facilitate reading, please respect the alphabetical order. -The resources for the new assembler need to be added to the ``resources.config`` file. +4. **Add version to main version collection** + +The channel with the version information must be merged into the main version collection channel, emitted by the `assembly_wf` workflow. It should look like: .. code-block:: bash - - withName: NEW_ASSEMBLER { - cpus = 4 - memory = {16.Gb*task.attempt} - } -6. **Add the container for the new assembler** + all_versions = ABYSS.out.version | mix(GATBMINIAPIPELINE.out.version, + IDBA.out.version, + MEGAHIT.out.version, + METAHIPMER2.out.version, + METASPADES.out.version, + MINIA.out.version, + NEW_ASSEMBLER.out.version, // new channel added + SKESA.out.version, + SPADES.out.version, + UNICYCLER.out.version, + VELVETOPTIMISER.out.version) | collect -The container for the new assembler need to be added to the ``resources.config`` file. +.. warning:: To facilitate reading, please respect the alphabetical order. -It should look like: -.. code-block:: bash - - withName: NEW_ASSEMBLER { - container = "username/NEW_ASSEMBLER:tag" - } - -7. (Optional) **Add parameters for the new assembler** +5. **Add the container for the new assembler** -Parameters that the new assembler requires for its execution need to be added to the ``params.config`` file. +The container for the new assembler need to be added to the ``container.config`` file +in the `conf/` directory. It should look like: .. code-block:: bash - - //NEW_ASSEMBLER - new_assembler_parameter = parameter + + withName: NEW_ASSEMBLER { + container = "/NEW_ASSEMBLER:" + } diff --git a/docs/dev/general.rst b/docs/dev/general.rst index 70f8f0a..8150115 100644 --- a/docs/dev/general.rst +++ b/docs/dev/general.rst @@ -1,7 +1,7 @@ General Orientation =================== -LMAS code is in two repositories: `source repository `_ and +LMAS code is in two repositories: `source repository `_ and `report repository `_. .. figure:: ../resources/LMAS_DIAGRAM_FULL.png @@ -11,47 +11,56 @@ LMAS code is in two repositories: `source repository `_. -The **main file** is `LMAS.nf `_, to be executed by Nextflow. +This repository contains the code for LMAS workflow. The workflow is developed in `Nextflow `_ DSL2. +The **main file** is `main.nf `_, to be executed by Nextflow. +A wrapper file `LMAS` is provided for running the workflow. If no options are provided it will output LMAS help information on how +to use the workflow, otherwise it will call `nextflow` with the main file. Configuration :::::::::::::: -The **main configuration** file is `nextflow.config `_ and contains -the main configuration parameters for the execution of LMAS. 
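A minimal sketch of the wiring referred to in step 2 above, assuming the ``assembly_wf`` workflow takes a single ``fastq`` channel; apart from ``NEW_ASSEMBLER``, ``IN_NEW_ASSEMBLER_kmers`` and ``params.newassemblerKmers``, the surrounding names are assumptions:

.. code-block:: bash

    // Sketch only -- invoking the new process inside assembly_wf (assembly.nf)
    workflow assembly_wf {
        take: fastq  // assumed name of the input read channel

        main:
        IN_NEW_ASSEMBLER_kmers = Channel.value(params.newassemblerKmers)
        NEW_ASSEMBLER(fastq, IN_NEW_ASSEMBLER_kmers)
        // ... existing assembler calls, plus the all_assemblies and
        // all_versions collections from steps 3 and 4 ...

        emit:
        assemblies = all_assemblies
        versions   = all_versions
    }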
diff --git a/docs/dev/general.rst b/docs/dev/general.rst
index 70f8f0a..8150115 100644
--- a/docs/dev/general.rst
+++ b/docs/dev/general.rst
@@ -1,7 +1,7 @@
 General Orientation
 ===================
 
-LMAS code is in two repositories: `source repository `_ and
+LMAS code is in two repositories: `source repository `_ and
 `report repository `_.
 
 .. figure:: ../resources/LMAS_DIAGRAM_FULL.png
@@ -11,47 +11,56 @@ LMAS code is in two repositories: `source repository `_.
 
-The **main file** is `LMAS.nf `_, to be executed by Nextflow.
+This repository contains the code for the LMAS workflow. The workflow is developed in `Nextflow `_ DSL2.
+The **main file** is `main.nf `_, to be executed by Nextflow.
+A wrapper file `LMAS` is provided for running the workflow. If no options are provided, it will output LMAS help information on how
+to use the workflow; otherwise it will call `nextflow` with the main file.
 
 Configuration
 ::::::::::::::
 
-The **main configuration** file is `nextflow.config `_ and contains
-the main configuration parameters for the execution of LMAS. The **parameters, containers and resources** configuration files,
-that can be altered by the user to adapt LMAS execution, are
-`params.config `_,
-`containers.config `_,
-and `resources.config `_, respectively.
+The **main configuration** file is `nextflow.config `_ and contains
+the main configuration parameters for the execution of LMAS. The **parameters, containers and resources** configuration files,
+located in the `conf` directory, can be altered by the user to adapt LMAS execution. A detailed description is available in
+the `User guide <../user/basic_usage.html>`_.
 
-Information on how to adjust these values is available `here <../user/parameters.html>`_.
+Information on how to customize these values is available `here <../user/parameters.html>`_.
+
+Modules
+::::::::::
+
+LMAS is split into 5 modules that are run in the main workflow and are responsible for preprocessing the input data,
+assembling the data in parallel, computing all the global and reference-dependent metrics, and compiling the final report.
+LMAS's modularity, thanks to its implementation in DSL2, facilitates maintenance and future updates. These modules are
+located in the `modules `_ folder (an include sketch is given after this diff).
 
 Templates
 ::::::::::
 
-The `templates `_ folder contains the custom python scripts used
+The `templates `_ folder contains the custom Python scripts used
 by LMAS to process the data and compute the evaluation metrics, and are called in the
-`LMAS.nf `_ file.
+`main.nf `_ file.
 
 Resources
 :::::::::
 
-The `resources `_ folder contains the compiled source code
+The `resources `_ folder contains the compiled source code
 for the LMAS report. The report code is available in the
 `report repository `_.
 
 Lib
 ::::
 
-The `lib `_ folder contains custom Groovy code used by LMAS for
-the ``--help`` function.
+The `lib `_ folder contains custom Groovy code used by LMAS for
+the ``--help`` function and parameter validation.
 
 Docker
 ::::::
 
 The Dockerfile for the main LMAS container, including all necessary Python dependencies for the custom code in the
-`templates `_ is available in the
-`docker folder `_
+`templates `_ is available in the
+`docker folder `_. Additionally, the Dockerfiles used to
+build the containers for all assemblers in LMAS are also available in the `docker` folder.
 
 
 LMAS report
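To illustrate the module layout described in the diff above, a sketch of how ``main.nf`` might include and call a DSL2 module; ``assembly_wf`` and ``modules/assembly.nf`` come from the developer guide, while ``reads`` is an assumed channel name:

.. code-block:: bash

    // Sketch only -- including a DSL2 module from the modules/ folder in main.nf
    include { assembly_wf } from './modules/assembly'

    workflow {
        reads = Channel.fromFilePairs(params.fastq)  // assumed input channel
        assembly_wf(reads)
    }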
diff --git a/docs/getting_started/installation.rst b/docs/getting_started/installation.rst
index 8e580d9..fccf4c0 100644
--- a/docs/getting_started/installation.rst
+++ b/docs/getting_started/installation.rst
@@ -1,31 +1,11 @@
 Installation
 ============
 
-LMAS can be installed through Github (https://github.com/cimendes/LMAS).
-It requires a `Nextflow `_ installation (version ≥ 21.04.1)
-and can be used on any POSIX compatible system (Linux, OS X, etc). All components of LMAS are executed in `Docker containers`_,
-being a container engine required.
-
-Nextflow allows integration with multiple alternatives, such as `Shifter `_ or
-`Singularity `_, so a particular one isn’t required.
+Container engine
+----------------------
 
-To ensure the robustness of custom python code for the quality assessment of assemblies, **continuous integration** of the python templates
-is performed with `pytest `_ and `GitHub Actions `_.
-
-Below it's a step by step guide on how to install LMAS and all its dependencies.
-
-Step 1. Nextflow
------------------
-
-`Nextflow `_ (version 20.01.0 or higher) can be used on any POSIX compatible system (Linux, OS X, etc).
-It requires BASH and Java 8 (or higher) to be installed.
-
-.. important::
-
-    Instructions on how to install Nextflow are available `here `_
-
-Step 2. Container engine
--------------------------
 All components of LMAS are executed in docker containers, which means that you’ll need to have a container engine
 installed. The container engines available are the ones supported by Nextflow:
@@ -37,7 +17,6 @@ installed. The container engines available are the ones supported by Nextflow:
 
 If you already have any one of these installed, you are good to go as the provided docker containers are
 compatible with all engines available. If not, you’ll need to install one.
-
 Singularity
 :::::::::::
@@ -61,8 +40,46 @@ https://www.docker.com/community-edition#/download.
 
 To run docker as a non-root user, you'll need to follow the instructions on the website:
 https://docs.docker.com/install/linux/linux-postinstall/#manage-docker-as-a-non-root-user
 
-Step 3. Clone LMAS
--------------------
+
+Installation through conda
+----------------------------
+
+LMAS can be easily installed through `Conda`_, an open-source package and environment management
+system that runs on Windows, macOS and Linux.
+After its installation, LMAS is available on `Bioconda`_ and can be easily installed with:
+
+.. code-block:: bash
+
+    conda install -c bioconda lmas
+
+Manual installation
+--------------------------
+
+LMAS can be installed through Github (https://github.com/cimendes/LMAS).
+It requires a `Nextflow `_ installation (version ≥ 21.04.1)
+and can be used on any POSIX compatible system (Linux, OS X, etc). All components of LMAS are executed in `Docker containers `_,
+so a container engine is required.
+
+Nextflow allows integration with multiple alternatives, such as `Shifter `_ or
+`Singularity `_, so a particular one isn’t required.
+
+To ensure the robustness of the LMAS workflow and the custom Python code for the quality assessment of assemblies, **continuous integration** of both the main workflow
+and the Python templates is performed with `GitHub Actions `_ and `pytest `_.
+
+Below is a step-by-step guide on how to install LMAS and all its dependencies.
+
+Step 1. Nextflow
+^^^^^^^^^^^^^^^^^
+
+`Nextflow `_ (version 21.04.1 or higher) can be used on any POSIX compatible system (Linux, OS X, etc).
+It requires BASH and Java 8 (or higher) to be installed.
+
+.. important::
+
+    Instructions on how to install Nextflow are available `here `_
+
+Step 2. Clone LMAS
+^^^^^^^^^^^^^^^^^^^^^^^^
 
 You can clone this repository with git.
 
@@ -71,4 +88,131 @@ You can clone this repository with git.
     git clone https://github.com/cimendes/LMAS.git
 
 All files will be on your local machine.
-The main execution file for Nextflow is ``LMAS.nf``.
\ No newline at end of file
+
+To run LMAS you can simply call it with:
+
+.. code-block:: bash
+
+    ./LMAS
+
+If no option, or ``--help``, is provided, LMAS will display its help message. Otherwise, the ``--fastq`` and ``--reference`` options are mandatory;
+they default to `'data/fastq/*_{1,2}.*'` and `'data/reference/*.fasta'`, respectively.
+
+The main execution file for Nextflow is ``main.nf``. Alternatively, you can call LMAS directly with Nextflow:
+
+.. code-block:: bash
+
+    nextflow run main.nf
+
+To use LMAS the following options are available:
+
+.. code-block:: bash
+
+       _        __  __   _     ___
+      /\︵︵/\    | |    | \/ |  /_\   / __|
+     (◕('人')◕)   | |__  | |\/| | / _ \  \__ \
+       |︶|      |____| |_|  |_|/_/ \_\ |___/
+
+            Last Metagenomic Assembler Standing
+
+    Input parameters:
+        --fastq                     Path expression to paired-end fastq files.
+                                    (default: data/fastq/*_{1,2}.*)
+        --reference                 Path to the genome reference fasta file.
+                                    (default: data/reference/*.fasta)
+        --md                        Path to markdown with input sample description for report (optional).
+                                    (default: data/*.md)
+
+    Mapping and filtering parameters:
+        --minLength                 Value for minimum contig length, in base pairs.
+                                    (default: 1000)
+        --mapped_reads_threshold    Value for the minimum percentage of a read aligning to the
+                                    contig to be considered as mapped.
+                                    (default: 0.75)
+
+    Assembly quality assessment parameters:
+        --n_target                  Target value for the N, NA and NG metrics, ranging from 0 to 1.
+                                    (default: 0.5)
+        --l_target                  Target value for the L metric, ranging from 0 to 1.
+                                    (default: 0.5)
+        --plot_scale                Scale of x-axis for the L, NA and NG metrics plots.
+                                    Allowed values: 'linear' or 'log'.
+                                    (default: log)
+
+    Assembly execution parameters:
+        --abyss                     Boolean controlling the execution of the ABySS assembler.
+                                    (default: true)
+        --abyssKmerSize             K-mer size for the ABySS assembler, as an integer.
+                                    (default: 96)
+        --abyssBloomSize            Bloom filter size for the ABySS assembler.
+                                    It must be a string with a value and a unit.
+                                    (default: 2G)
+        --gatb_minia                Boolean controlling the execution of the GATB Minia Pipeline assembler.
+                                    (default: true)
+        --gatbKmerSize              K-mer sizes for the GATB Minia Pipeline assembler.
+                                    It must be a string with the values separated by commas.
+                                    (default: 21,61,101,141,181)
+        --gatb_besst_iter           Number of iterations during BESST scaffolding for the
+                                    GATB Minia Pipeline assembler.
+                                    (default: 10000)
+        --gatb_error_correction     Boolean controlling whether to skip error correction for the
+                                    GATB Minia Pipeline assembler.
+                                    (default: false)
+        --idba                      Boolean controlling the execution of the IDBA-UD assembler.
+                                    (default: true)
+        --metahipmer2               Boolean controlling the execution of the MetaHipMer2 assembler.
+                                    (default: true)
+        --metahipmer2KmerSize       K-mer sizes for the MetaHipMer2 assembler.
+                                    It must be a string with the values separated by commas.
+                                    (default: 21,33,55,77,99)
+        --minia                     Boolean controlling the execution of the minia assembler.
+                                    (default: true)
+        --miniaKmerSize             K-mer size for the minia assembler, as an integer.
+                                    (default: 31)
+        --megahit                   Boolean controlling the execution of the MEGAHIT assembler.
+                                    (default: true)
+        --megahitKmerSize           K-mer sizes for the MEGAHIT assembler.
+                                    It must be a string with the values separated by commas.
+                                    (default: 21,29,39,59,79,99,119,141)
+        --metaspades                Boolean controlling the execution of the metaSPAdes assembler.
+                                    (default: true)
+        --metaspadesKmerSize        K-mer sizes for the metaSPAdes assembler.
+                                    It must be a string with 'auto' or the values separated by spaces.
+                                    (default: auto)
+        --spades                    Boolean controlling the execution of the SPAdes assembler.
+                                    (default: true)
+        --spadesKmerSize            K-mer sizes for the SPAdes assembler.
+                                    It must be a string with 'auto' or the values separated by spaces.
+                                    (default: auto)
+        --skesa                     Boolean controlling the execution of the SKESA assembler.
+                                    (default: true)
+        --unicycler                 Boolean controlling the execution of the Unicycler assembler.
+                                    (default: true)
+        --velvetoptimiser           Boolean controlling the execution of the VelvetOptimiser assembler.
+                                    (default: true)
+        --velvetoptimiser_hashs     Starting K-mer size for the VelvetOptimiser assembler, as an integer.
+                                    (default: 19)
+        --velvetoptimiser_hashe     End K-mer size for the VelvetOptimiser assembler, as an integer.
+                                    (default: 31)
+
+    Execution resources parameters:
+        --cpus                      Number of CPUs for the assembly and mapping processes, as an integer.
+                                    This resource is doubled for each retry until max_cpus is reached.
+                                    (default: 8)
+        --memory                    Memory for the assembly and mapping processes, in the format of
+                                    'value'.'unit'.
+                                    This resource is doubled for each retry until max_memory is reached.
+                                    (default: 32 GB)
+        --time                      Time limit for the assembly and mapping processes, in the format of
+                                    'value'.'unit'.
+                                    This resource is doubled for each retry until max_time is reached.
+                                    (default: 1d)
+        --max_cpus                  Maximum number of CPUs for the assembly and mapping processes,
+                                    as an integer. It overwrites the --cpus parameter.
+                                    (default: 32)
+        --max_memory                Maximum memory for the assembly and mapping processes, in the format of
+                                    'value'.'unit'. It overwrites the --memory parameter.
+                                    (default: 100 GB)
+        --max_time                  Maximum time for the assembly and mapping processes, in the format of
+                                    'value'.'unit'. It overwrites the --time parameter.
+                                    (default: 3d)
+
diff --git a/docs/getting_started/overview.rst b/docs/getting_started/overview.rst
index c618ae6..fd1c8ba 100644
--- a/docs/getting_started/overview.rst
+++ b/docs/getting_started/overview.rst
@@ -38,6 +38,8 @@ a **container engine installation**, such as `Docker `_ or
 `Singularity `_.
 
 The **local installation of the LMAS workflow**, including the Docker containers, requires 7.3 gigabytes (GB)
-of free disk space. The **default requirements to execute the workflow** are at least 20 GB of memory and 4 CPUs.
+of free disk space. The **default requirements to execute the workflow** are at least 32 GB of memory and 8 CPUs, with
+a maximum of 100 GB of memory and 32 CPUs. This can be easily adjusted but might compromise the performance of the
+assemblers contained in LMAS. The assemblers can be skipped individually through the use of parameters.
 
-The disk space required for execution depends greatly on the size of the input data but, in average, LMAS generates
+The disk space required for execution depends greatly on the size of the input data but, on average, LMAS generates
 approximately 17 GB of data per GB of input data.
diff --git a/docs/report/overview.rst b/docs/report/overview.rst
index 6cdc4e8..429a133 100644
--- a/docs/report/overview.rst
+++ b/docs/report/overview.rst
@@ -4,7 +4,7 @@ Overview
 LMAS creates an interactive HTML report, stored in the ``report/`` folder in the directory where the workflow was executed.
 To open the report simply click on the ``index.html`` file and the report will open on your default browser.
 
-The JavaScript source code for the interactive report comes bundled with LMAS but is freely available at https://github.com/cimendes/lmas_report.
+The JavaScript source code for the interactive report comes bundled with LMAS but is freely available at https://github.com/B-UMMI/LMAS.js.
 It was built with the JavaScript frameworks React (https://reactjs.org/, version 16.8.0) and Material-UI (https://material-ui.com/, version 4.11.00).
 All interactive charts were rendered with the graph visualization library Plotly.js (https://plotly.com/javascript/, version 1.57.1)
 through its React component, react-plotly (https://plotly.com/javascript/react/, version 2.5.0).
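As a worked example of the resource parameters above (the paths are the documented defaults and every flag appears in the help text), a run trimmed down for a smaller machine might look like:

.. code-block:: bash

    # Cap resources and skip one assembler on a smaller machine
    ./LMAS --fastq 'data/fastq/*_{1,2}.*' --reference 'data/reference/*.fasta' \
        --max_cpus 4 --max_memory 16.GB --metahipmer2 false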
diff --git a/docs/resources/LMAS_DIAGRAM_FULL.png b/docs/resources/LMAS_DIAGRAM_FULL.png
index 27cea06..d5d3208 100644
Binary files a/docs/resources/LMAS_DIAGRAM_FULL.png and b/docs/resources/LMAS_DIAGRAM_FULL.png differ
diff --git a/docs/resources/LMAS_ECCMID.png b/docs/resources/LMAS_ECCMID.png
index f75f382..93039ae 100644
Binary files a/docs/resources/LMAS_ECCMID.png and b/docs/resources/LMAS_ECCMID.png differ
diff --git a/docs/user/assemblers.rst b/docs/user/assemblers.rst
index 34066f4..ec0aef8 100644
--- a/docs/user/assemblers.rst
+++ b/docs/user/assemblers.rst
@@ -35,18 +35,6 @@ graph, reducing memory requirements for de novo assembly. **It's a traditional s
 * **Date of last release:** 22/04/2021
 * **Container:** `cimendes/abyss:2.3.1-1 `_
 
-BCALM2
-^^^^^^
-
-This assembler, published by `Chikhi et al, 2016 `_ in
-*Bioinformatics*, is a fast and low memory algorithm for graph compaction, consisting of three stages: careful distribution
-of input k-mers into buckets, parallel compaction of the buckets, and a parallel reunification step to glue together
-the compacted strings into unitigs. **It's a traditional single k-mer value De Bruijn assembler.**
-
-* **Source code:** https://github.com/GATB/bcalm
-* **Date of last release:** 22/05/2020
-* **Container:** `cimendes/bcalm:2.2.3-1 `_
-
 GATB-Minia Pipeline
 ^^^^^^^^^^^^^^^^^^^
diff --git a/docs/user/basic_usage.rst b/docs/user/basic_usage.rst
index 8b9a449..af81a77 100644
--- a/docs/user/basic_usage.rst
+++ b/docs/user/basic_usage.rst
@@ -23,56 +23,182 @@ When you clone it, LMAS has the following folder structure:
 
 .. code-block:: bash
 
     LMAS
-    ├── bin/
-    ├── containers.config
-    ├── docker/
-    ├── docs/
+    ├── CITATION.cff
+    ├── conf
+    ├── docker
+    ├── docs
     ├── get_data.sh
-    ├── lib/
+    ├── lib
     ├── LICENSE
-    ├── LMAS.nf
+    ├── LMAS
+    ├── main.nf
+    ├── modules
     ├── nextflow.config
-    ├── params.config
-    ├── profiles.config
     ├── README.md
-    ├── resources/
-    ├── resources.config
-    └── templates/
-
-* The ``LMAS.nf`` is the main execution file for LMAS.
-* The ``get_data.sh`` bash script file downloads the ZymoBIOMICS Microbial Community Standard data.
-* The ``containers.config``, ``nextflow.config``, ``params.config``, ``profiles.config`` and ``resources.config`` are LMAS configuration files.
-* The ``bin/`` and ``templates/`` folders contain custom LMAS code for data processing.
-* The ``docs/`` folder contains LMAS documentation source files.
-* The ``docker/`` folder contains the dockerfile for LMAS' base container.
+    ├── resources
+    ├── templates
+    └── test
+
+* The ``LMAS`` file is the main execution wrapper for LMAS.
+* The ``main.nf`` is the workflow execution file for `Nextflow `_.
+* The ``modules`` folder contains the LMAS `Nextflow `_ DSL2 modules.
+* The ``nextflow.config`` and the files in ``conf/`` are the LMAS configuration files.
+* The ``lib/`` and ``templates/`` folders contain custom LMAS code for data processing.
+* The ``docs/`` folder contains the LMAS documentation source files.
+* The ``docker/`` folder contains the Dockerfiles for the base container and all assemblers in LMAS.
 * The ``resources/`` folder contains the LMAS report compiled code.
+* The ``test/`` folder contains LMAS test data and files.
+* The ``get_data.sh`` bash script file downloads the ZymoBIOMICS Microbial Community Standard data.
 
 Customizing LMAS workflow configuration
 ---------------------------------------
 
 Users can customize the **workflow execution** either by using **command-line options**, with ``--