From 7406f97f72bb8f2e780af55194985ce0bff2562f Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Mon, 13 Mar 2023 18:51:24 +0000 Subject: [PATCH 01/28] update intended version number --- scripts/signal_postprocess.py | 2 +- signal.py | 41 ++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/scripts/signal_postprocess.py b/scripts/signal_postprocess.py index a0a0aba..20fbcde 100755 --- a/scripts/signal_postprocess.py +++ b/scripts/signal_postprocess.py @@ -17,7 +17,7 @@ assert long_git_id.startswith('$Id: ') #short_git_id = long_git_id[5:12] -short_git_id = "v1.5.9" +short_git_id = "v1.6.0" # Suppresses matplotlib warning (https://github.com/jaleezyy/covid-19-signal/issues/59) # Creates a small memory leak, but it's nontrivial to fix, and won't be a practical concern! diff --git a/signal.py b/signal.py index 00764cd..b5e13bc 100755 --- a/signal.py +++ b/signal.py @@ -9,7 +9,7 @@ from pathlib import Path def create_parser(): - allowed = {'all': False, 'postprocess': False, 'ncov_tools': False} + allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False} parser = argparse.ArgumentParser(prog='signal.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).") parser.add_argument('all', nargs='*', @@ -18,6 +18,8 @@ def create_parser(): help="Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided") parser.add_argument('ncov_tools', nargs='*', help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! 
'--configfile' is required but will be generated if '--directory' is provided") + parser.add_argument('install', nargs='*', + help="Install individual rule environments and ensure SIGNAL is functional") parser.add_argument('-c', '--configfile', type=check_file, default=None, help="Configuration file (i.e., config.yaml) for SIGNAL analysis") parser.add_argument('-d', '--directory', type=check_directory, default=None, @@ -28,16 +30,18 @@ def create_parser(): parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)") parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided") parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)") + parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'") parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run") parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run") parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. 
Force the re-run of all rules regardless of prior output') parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done') + ### add --quiet parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output") parser.add_argument('-v', '--version', action='store_true', help="Display version number") args, unknown = parser.parse_known_args() provided = [] - for opt in allowed: # ['all', 'postprocess', 'ncov_tools'] + for opt in allowed: # ['install', 'all', 'postprocess', 'ncov_tools'] if len(getattr(args, opt)) > 0: provided = provided + getattr(args, opt) getattr(args, opt).clear() @@ -122,8 +126,8 @@ def write_sample_table(sample_data, output_table): for sample in sample_data: out_fh.write(",".join(sample) + '\n') -def download_dependences(): - dir_name = 'data' +def download_dependences(data): + dir_name = data script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh') subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3']) @@ -135,7 +139,7 @@ def generate_sample_table(project_directory, project_name): out_table = project_name + "_sample_table.csv" subprocess.run(['bash', script, '-d', project_directory, '-n', out_table]) -def write_config_file(run_name, config_file, opt_tasks): +def write_config_file(run_name, config_file, data_directory, opt_tasks): ### opt_tasks = [args.breseq, args.freebayes, [args.neg_prefix]] - latter only applies to SIGNAL v1.5.8 and earlier config = f"""# This file contains a high-level summary of pipeline configuration and inputs. 
@@ -157,26 +161,26 @@ def write_config_file(run_name, config_file, opt_tasks): scheme_bed: 'resources/primer_schemes/artic_v3/nCoV-2019.bed' # Path from snakemake dir to bwa indexed human + viral reference genome -composite_reference: 'data/composite_human_viral_reference.fna' +composite_reference: "{data_directory}/composite_human_viral_reference.fna" # Used as bwa reference genome when removing host sequences. # Also used as 'ivar' reference genome in variant detection + consensus. # Used as -r,-g arguments to 'quast' # contig needed for hostremoval filtering script viral_reference_contig_name: 'MN908947.3' -viral_reference_genome: 'data/MN908947.3.fasta' -viral_reference_feature_coords: 'data/MN908947.3.gff3' +viral_reference_genome: "{data_directory}/MN908947.3.fasta" +viral_reference_feature_coords: "{data_directory}/MN908947.3.gff3" # breseq_reference must be defined if run_breseq == True run_breseq: {opt_tasks[0]} # Used as --reference argument to 'breseq' -breseq_reference: 'data/MN908947.3.gbk' +breseq_reference: "{data_directory}/MN908947.3.gbk" # run freebayes for variant and consensus calling (as well as ivar) run_freebayes: {opt_tasks[1]} # Used as --db argument to 'kraken2' -kraken2_db: 'data/Kraken2/db' +kraken2_db: "{data_directory}/Kraken2/db" # For Ivar's amplicon filter # https://github.com/andersen-lab/ivar/commit/7027563fd75581c78dabc6040ebffdee2b24abe6 @@ -226,7 +230,7 @@ def write_config_file(run_name, config_file, opt_tasks): amplicon_loc_bed: 'resources/primer_schemes/artic_v3/ncov-qc_V3.scheme.bed' # fasta of sequences to include with pangolin phylogeny -phylo_include_seqs: "data/blank.fasta" +phylo_include_seqs: "{data_directory}/blank.fasta" # List of negative control sample names or prefixes (i.e., ['Blank'] will cover Blank1, Blank2, etc.) 
negative_control_prefix: {opt_tasks[2]}""" @@ -234,11 +238,14 @@ def write_config_file(run_name, config_file, opt_tasks): with open(config_file, 'w') as fh: fh.write(config) +def test_signal(data): + pass + if __name__ == '__main__': # note: add root_dir to determine the root directory of SIGNAL script_path = os.path.join(os.path.abspath(sys.argv[0]).rsplit("/",1)[0]) args, allowed = create_parser() - version = 'v1.5.9' + version = 'v1.6.0' alt_options = [] if args.version: @@ -246,7 +253,7 @@ def write_config_file(run_name, config_file, opt_tasks): if args.dependencies: print("Downloading necessary reference and dependency files!") - download_dependences() + download_dependences(args.data) exit("Download complete!") if args.configfile is None: @@ -258,7 +265,7 @@ def write_config_file(run_name, config_file, opt_tasks): neg = [pre.replace(" ","") for pre in args.neg_prefix.split(",")] else: neg = [args.neg_prefix] - write_config_file(run_name, config_file, [args.add_breseq, args.remove_freebayes, neg]) + write_config_file(run_name, config_file, args.data, [args.add_breseq, args.remove_freebayes, neg]) if args.config_only: exit("Configuration file and sample table generated!") else: @@ -274,7 +281,7 @@ def write_config_file(run_name, config_file, opt_tasks): if args.rerun_incomplete: alt_options.append('--rerun-incomplete') opt = " ".join(alt_options) for task in allowed: - if allowed[task] is True: + if (allowed[task] is True) and (task != 'install'): print(f"Running SIGNAL {task}!") try: subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True) @@ -288,5 +295,9 @@ def write_config_file(run_name, config_file, opt_tasks): subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) except 
subprocess.CalledProcessError: exit(f"Something went wrong running SIGNAL {task}! Check input and try again!") + else: + print(f"Installing SIGNAL environments!") + + exit() exit("SIGNAL completed successfully!") From 1286eb68c05d36775b5507f6a8ed8bb12c169f81 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Mon, 13 Mar 2023 19:42:52 +0000 Subject: [PATCH 02/28] add expected file change rules --- Snakefile | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 76a4e5d..e8de728 100644 --- a/Snakefile +++ b/Snakefile @@ -129,21 +129,25 @@ rule clean_reads: rule consensus: input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names) +rule core_genomes: + input: 'all_genomes.fa' + rule ivar_variants: input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names) rule breseq: input: expand('{sn}/breseq/output/index.html', sn=sample_names) + rule freebayes: - input: + input: + 'all_freebayes_genomes.fa', expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names), expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names), 'freebayes_lineage_assignments.tsv', expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names), expand('{sn}/freebayes/{sn}_consensus_compare.vcf', sn=sample_names) - rule coverage: input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names) @@ -158,6 +162,7 @@ rule quast: rule lineages: input: + rules.core_genomes.input, 'input_pangolin_versions.txt', 'input_nextclade_versions.txt', 'lineage_assignments.tsv' @@ -769,6 +774,9 @@ rule run_quast_freebayes: 'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && ' 'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done' +rule collect_core_genomes: + + rule run_lineage_assignment: threads: 4 conda: 'conda_envs/assign_lineages.yaml' @@ -797,6 +805,9 @@ rule run_lineage_assignment: 'cat {input} > all_genomes.fa && ' 
'{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}' +rule collect_freebayes_genomes: + + rule run_lineage_assignment_freebayes: threads: 4 conda: 'conda_envs/assign_lineages.yaml' From b711850bc1d0584028def93bd73a2ddcb2d26c4a Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Wed, 15 Mar 2023 13:04:57 +0000 Subject: [PATCH 03/28] update dependencies --- resources/dependencies | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/resources/dependencies b/resources/dependencies index 5573562..b4f5097 100644 --- a/resources/dependencies +++ b/resources/dependencies @@ -4,6 +4,8 @@ rule all: "ivar", "snp_mapping", "trim_qc", + "assign_lineages.yaml", + "freebayes.yaml", "postprocessing" shell: "rm {input}" @@ -27,6 +29,16 @@ rule trim_qc: output: "trim_qc" shell: "touch {output}" +rule assign_lineages: + conda: "../conda_envs/assign_lineages.yaml" + output: "assign_lineages" + shell: "touch {output}" + +rule freebayes: + conda: "../conda_envs/freebayes.yaml" + output: "freebayes" + shell: "touch {output}" + rule postprocessing: conda: "../conda_envs/postprocessing.yaml" output: "postprocessing" From 6febff3872ac4f973106a78b5a1230fd12873975 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Wed, 15 Mar 2023 13:29:55 +0000 Subject: [PATCH 04/28] update rules for collating consensus genomes --- Snakefile | 56 ++++++++++++++++++++++++++++++++++++++++--------------- signal.py | 17 +++++++++-------- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/Snakefile b/Snakefile index e8de728..1df78d6 100644 --- a/Snakefile +++ b/Snakefile @@ -127,10 +127,9 @@ rule clean_reads: expand('{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz', sn=sample_names, r=[1,2]) rule consensus: - input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names) - -rule core_genomes: - input: 'all_genomes.fa' + input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names), + 'all_genomes.fa', +# 'failed_samples.log' rule ivar_variants: input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names) @@ -138,10 +137,10 @@ rule ivar_variants: rule breseq: input: expand('{sn}/breseq/output/index.html', sn=sample_names) - rule freebayes: input: 'all_freebayes_genomes.fa', +# 'failed_samples.log', expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names), expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names), 'freebayes_lineage_assignments.tsv', @@ -162,7 +161,6 @@ rule quast: rule lineages: input: - rules.core_genomes.input, 'input_pangolin_versions.txt', 'input_nextclade_versions.txt', 'lineage_assignments.tsv' @@ -332,7 +330,7 @@ rule raw_reads_composite_reference_bwa_map: shell: '(bwa mem -t {threads} {params.composite_index} ' '{input.raw_r1} {input.raw_r2} | ' - '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}' + "{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log} || echo '' > {output}" rule get_host_removed_reads: threads: 2 @@ -775,7 +773,15 @@ rule run_quast_freebayes: 'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done' rule collect_core_genomes: - + output: + all = "all_genomes.fa", + #failed = "failed_samples.log" + input: + 
expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names) + shell: + """ + cat {input} > {output.all} + """ rule run_lineage_assignment: threads: 4 @@ -785,7 +791,7 @@ rule run_lineage_assignment: nextclade_ver_out = 'input_nextclade_versions.txt', lin_out = 'lineage_assignments.tsv' input: - expand('{sn}/core/{sn}.consensus.fa', sn=sample_names) + 'all_genomes.fa' params: pangolin_ver = versions['pangolin'], pangolearn_ver = versions['pangolearn'], @@ -802,11 +808,32 @@ rule run_lineage_assignment: shell: "echo -e 'pangolin: {params.pangolin_ver}\nconstellations: {params.constellations_ver}\nscorpio: {params.scorpio_ver}\npangolearn: {params.pangolearn_ver}\npango-designation: {params.designation_ver}\npangolin-data: {params.data_ver}' > {output.pango_ver_out} && " "echo -e 'nextclade: {params.nextclade_ver}\nnextclade-dataset: {params.nextclade_data}\nnextclade-include-recomb: {params.nextclade_recomb}' > {output.nextclade_ver_out} && " - 'cat {input} > all_genomes.fa && ' - '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}' + '{params.assignment_script_path} -i {input} -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}' rule collect_freebayes_genomes: - + output: + "all_freebayes_genomes.fa", + input: + expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names), +# params: +# failed = "failed_samples.log" + shell: + """ + cat {input} > {output} + """ +# shell: +# """ +# samples=({input}) +# for file in $samples; do +# s=$(basename $file | cut -d. -f1) +# count=$(cat $file | grep -v '>' | grep -cv 'N') +# if [[ -f $file ]] && [[ ! 
$count -eq 0 ]]; then +# cat $file >> {output} +# else +# echo $s >> {params.failed} +# fi +# done +# """ rule run_lineage_assignment_freebayes: threads: 4 @@ -816,10 +843,9 @@ rule run_lineage_assignment_freebayes: input: p_vers = 'input_pangolin_versions.txt', n_vers = 'input_nextclade_versions.txt', - consensus = expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names) + consensus = 'all_freebayes_genomes.fa' params: analysis_mode = pango_speed, assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py') shell: - 'cat {input.consensus} > all_freebayes_genomes.fa && ' - '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip' + '{params.assignment_script_path} -i {input.consensus} -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip' diff --git a/signal.py b/signal.py index b5e13bc..0b39b82 100755 --- a/signal.py +++ b/signal.py @@ -28,14 +28,14 @@ def create_parser(): parser.add_argument('--config-only', action='store_true', help="Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required") parser.add_argument('--remove-freebayes', action='store_false', help="Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)") parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)") - parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. 
Will be left empty, if not provided") + parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided") parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)") parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'") parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run") parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run") parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output') parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done') - ### add --quiet + parser.add_argument('-q', '--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs") parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. 
Display snakemake debugging output") parser.add_argument('-v', '--version', action='store_true', help="Display version number") args, unknown = parser.parse_known_args() @@ -275,13 +275,17 @@ def test_signal(data): exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signal.py -h' for details!") else: if args.verbose: alt_options.append('--verbose') + if args.quiet: alt_options.append('--quiet') if args.unlock: alt_options.append('--unlock') if args.forceall: alt_options.append('--forceall') if args.dry_run: alt_options.append('--dry-run') if args.rerun_incomplete: alt_options.append('--rerun-incomplete') opt = " ".join(alt_options) for task in allowed: - if (allowed[task] is True) and (task != 'install'): + if allowed[task] is True: + if task == 'install': + print(f"Installing SIGNAL environments!") + exit() print(f"Running SIGNAL {task}!") try: subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True) @@ -294,10 +298,7 @@ def test_signal(data): print("Retrying...") subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) except subprocess.CalledProcessError: - exit(f"Something went wrong running SIGNAL {task}! Check input and try again!") - else: - print(f"Installing SIGNAL environments!") - + exit(f"Something went wrong running SIGNAL {task}! Check input and logs and try again!") exit() - exit("SIGNAL completed successfully!") + exit("SIGNAL run complete! Check corresponding snakemake logs for any details!") From 882b7afe880f1afa009f9971def8fd565cae4a63 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Wed, 15 Mar 2023 13:31:50 +0000 Subject: [PATCH 05/28] update dependencies --- resources/dependencies | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/dependencies b/resources/dependencies index b4f5097..e223532 100644 --- a/resources/dependencies +++ b/resources/dependencies @@ -4,8 +4,8 @@ rule all: "ivar", "snp_mapping", "trim_qc", - "assign_lineages.yaml", - "freebayes.yaml", + "assign_lineages", + "freebayes", "postprocessing" shell: "rm {input}" From c89bea97942f25fdf93c1c76ae524d2940586836 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 02:55:43 +0000 Subject: [PATCH 06/28] remove nodejs --- conda_envs/assign_lineages.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda_envs/assign_lineages.yaml b/conda_envs/assign_lineages.yaml index a5eecb8..fc4e8fd 100644 --- a/conda_envs/assign_lineages.yaml +++ b/conda_envs/assign_lineages.yaml @@ -11,7 +11,6 @@ dependencies: - python>=3.7 - snakemake-minimal - gofasta - - nodejs - usher - pandas - pysam==0.16.0.1 From c44fc96bfd779f7a31a052facdbd8ee3dd5f80ae Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 02:56:23 +0000 Subject: [PATCH 07/28] add conditionals if dependency found --- scripts/get_data_dependencies.sh | 66 ++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/scripts/get_data_dependencies.sh b/scripts/get_data_dependencies.sh index c525294..43bcf68 100755 --- a/scripts/get_data_dependencies.sh +++ b/scripts/get_data_dependencies.sh @@ -14,34 +14,42 @@ accession="MN908947.3" HELP=""" Flags: - -d : Directory to configure database within (~10GB) - -a : Accession to use as viral reference (default=MN908947.3) + -d : Directory to configure database within (~10GB) + -a : Accession to use as viral reference (default=MN908947.3) """ while getopts ":d:a:" option; do - case "${option}" in - d) database_dir=$OPTARG;; - a) accession=$OPTARG;; - esac + case "${option}" in + d) database_dir=$OPTARG;; + a) accession=$OPTARG;; + esac done if [ $database_dir = 0 ] ; then - echo "You must specify a data directory to install data dependencies." - echo "$HELP" - exit 1 + echo "You must specify a data directory to install data dependencies." + echo "$HELP" + exit 1 fi echo -e "Warning: \n - final databases require ~10GB of storage\n - building databases temporarily requires a peak of ~35GB of storage and ~4GB of memory \n - script takes up to ~1.5 hours (system depending)" # make database dir and get abspath to it -mkdir -p $database_dir +if [ ! 
-d $database_dir ]; then mkdir -p $database_dir; fi database_dir=$(realpath $database_dir) # use curl to grab "simple data dependencies" -curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa -curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk -curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3 -curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta +if [ ! -f $database_dir/'NexteraPE-PE.fa' ]; then + curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa +fi +if [ ! -f $database_dir/${accession}.gbk ]; then + curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk +fi +if [ ! -f $database_dir/${accession}.gff3 ]; then + curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3 +fi +if [ ! 
-f $database_dir/${accession}.fasta ]; then + curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta +fi # install and activate env for kraken/bwa to build their databases/index CONDA_BASE=$($CONDA_EXE info --base) @@ -51,19 +59,37 @@ conda activate data_dependencies # get the GRCh38 human genome # as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use -curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz -gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz +if [ ! -f $database_dir/"GRC38_no_alt_analysis_set.fna" ]; then + curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz + gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz +fi # create composite reference of human and virus for competitive bwt mapping # based host removal +if [ ! -f $database_dir/'composite_human_viral_reference.fna' ]; then cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna -bwa index $database_dir/composite_human_viral_reference.fna +fi +for file in $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}; do + if [ ! 
-f $file ]; then + bwa index $database_dir/composite_human_viral_reference.fna + break + else + continue + fi +done # get kraken2 viral db mkdir -p $database_dir/Kraken2/db -curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz -cd $database_dir/Kraken2/db -tar xvf k2_viral_20210517.tar.gz +for file in $database_dir/Kraken2/db/{hash,opts,taxo}.k2d; do + if [ ! -f $file ]; then + curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz + cd $database_dir/Kraken2/db + tar xvf k2_viral_20210517.tar.gz + break + else + continue + fi +done # create blank fasta for 'phylo_include_seqs' touch $database_dir/blank.fasta From 3b0a189caf0ea3c72ae4dd99c52b9120f3a16926 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 02:57:21 +0000 Subject: [PATCH 08/28] add install options and improved handling of failed samples --- Snakefile | 76 ++++++++++++++++++++++++++++++++++++++++--------------- signal.py | 59 +++++++++++++++++++++++++++--------------- 2 files changed, 93 insertions(+), 42 deletions(-) diff --git a/Snakefile b/Snakefile index 1df78d6..4c45f1d 100644 --- a/Snakefile +++ b/Snakefile @@ -378,7 +378,7 @@ rule run_trimgalore: shell: 'trim_galore --quality {params.min_qual} --length {params.min_len} ' ' -o {params.output_prefix} --cores {threads} --fastqc ' - '--paired {input.raw_r1} {input.raw_r2} 2> {log} || touch {output}' + "--paired {input.raw_r1} {input.raw_r2} 2> {log} || (echo -e 'Total reads processed: 0\nReads written (passing filters): 0 (0.0%)\nTotal basepairs processed: 0 bp\nTotal written (filtered): 0 bp (0.0%)' >> {log}; touch {output})" rule run_filtering_of_residual_adapters: threads: 2 @@ -774,13 +774,28 @@ rule run_quast_freebayes: rule collect_core_genomes: output: - all = "all_genomes.fa", - #failed = "failed_samples.log" + "all_genomes.fa" input: 
expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names) shell: """ - cat {input} > {output.all} + cat {input} > {output} + sample='' + count='' + echo "Samples that failed to assemble:" > failed_samples.log + while read -r line; + do + if [[ $line =~ '>' ]]; then + sample=$(echo $line | cut -d'.' -f1 | cut -d'_' -f2) + else + count=$(echo $line | wc -c) + if [[ $count -eq 1 ]]; then + echo $sample >> failed_samples.log + else + continue + fi + fi + done < {output} """ rule run_lineage_assignment: @@ -812,28 +827,47 @@ rule run_lineage_assignment: rule collect_freebayes_genomes: output: - "all_freebayes_genomes.fa", + "all_freebayes_genomes.fa" input: - expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names), -# params: -# failed = "failed_samples.log" + expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names) shell: """ cat {input} > {output} + sample='' + seq='' + count='' + out='' + if [[ -f 'failed_samples.log' ]]; then + out='.failed_freebayes_samples.tmp' + cat failed_samples.log | sed 1,1d > $out + echo "Samples that failed to assemble:" > failed_samples.log + else + out='failed_samples.log' + echo "Samples that failed to assemble:" > $out + fi + while read -r line; + do + if [[ $line =~ '>' ]]; then + if [[ $(echo $seq | wc -c) -eq 1 ]]; then # check if new seq + count=$(echo $seq | grep -vc 'N') + if [[ $count -eq 0 ]]; then + echo $sample >> $out + fi + sample=$(echo $line | cut -d'>' -f2) # start new seq + seq='' + else + sample=$(echo $line | cut -d'>' -f2) # first seq + fi + else + seq+=$line # append seq + fi + done < {output} + + if [[ ! $out == 'failed_samples.log' ]]; then + sort -b -d -f $out | uniq >> failed_samples.log + rm $out + fi """ -# shell: -# """ -# samples=({input}) -# for file in $samples; do -# s=$(basename $file | cut -d. -f1) -# count=$(cat $file | grep -v '>' | grep -cv 'N') -# if [[ -f $file ]] && [[ ! 
$count -eq 0 ]]; then -# cat $file >> {output} -# else -# echo $s >> {params.failed} -# fi -# done -# """ rule run_lineage_assignment_freebayes: threads: 4 diff --git a/signal.py b/signal.py index 0b39b82..1788cd0 100755 --- a/signal.py +++ b/signal.py @@ -19,7 +19,7 @@ def create_parser(): parser.add_argument('ncov_tools', nargs='*', help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--directory' is provided") parser.add_argument('install', nargs='*', - help="Install individual rule environments and ensure SIGNAL is functional") + help="Install individual rule environments and ensure SIGNAL is functional. The only parameters operable will be '--data' and '--skip-test'. Will override other operations!") parser.add_argument('-c', '--configfile', type=check_file, default=None, help="Configuration file (i.e., config.yaml) for SIGNAL analysis") parser.add_argument('-d', '--directory', type=check_directory, default=None, @@ -29,13 +29,15 @@ def create_parser(): parser.add_argument('--remove-freebayes', action='store_false', help="Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)") parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)") parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided") - parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. 
Note: Will override other flags! (~10 GB storage required)") - parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'") + parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)") + parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'") + parser.add_argument('--skip-test', action='store_true', help='SIGNAL install parameter. Skip SIGNAL testing after environment installation using curated test data') parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run") + parser.add_argument('-ii', '--ignore-incomplete', action='store_true', help='Snakemake parameter. Do not check for incomplete output files') parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run") parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output') parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. 
Do not execute anything and only display what would be done') - parser.add_argument('-q', '--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs") + parser.add_argument('--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs") parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output") parser.add_argument('-v', '--version', action='store_true', help="Display version number") args, unknown = parser.parse_known_args() @@ -53,16 +55,7 @@ def create_parser(): allowed[val.lower()] = True else: print(f"Ignoring unknown command: {val}") - - # Unknown - # for x in unknown: - # filter out unknown options (like -b or --b or alll) - # exit with error - # if x.startswith(('-', '--')): - # parser.error(f"unknown argument {x}") - # identify what belongs where - # getattr(result, 'provided').append(x) - + return args, allowed def check_directory(path: str) -> Path: @@ -126,8 +119,7 @@ def write_sample_table(sample_data, output_table): for sample in sample_data: out_fh.write(",".join(sample) + '\n') -def download_dependences(data): - dir_name = data +def download_dependences(dir_name): script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh') subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3']) @@ -238,8 +230,21 @@ def write_config_file(run_name, config_file, data_directory, opt_tasks): with open(config_file, 'w') as fh: fh.write(config) -def test_signal(data): - pass +def install_signal(data='data'): + """ + Install SIGNAL dependencies per rule and test using a sample dataset, if desired + """ + dep_snakefile = os.path.join(script_path, 'resources', 'dependancies') + assert os.path.exists(dep_snakefile) + try: + subprocess.run(f"snakemake -s 
{dep_snakefile} --conda-frontend mamba --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet") + except subprocess.CalledProcessError: # likely missing mamba + subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend conda --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet") + + # Test SIGNAL with data + if os.path.exists(data): + pass + if __name__ == '__main__': # note: add root_dir to determine the root directory of SIGNAL @@ -251,11 +256,15 @@ def test_signal(data): if args.version: exit(f"{version}") + if allowed['install']: + install_signal(args.data) + exit() + if args.dependencies: print("Downloading necessary reference and dependency files!") download_dependences(args.data) exit("Download complete!") - + if args.configfile is None: assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" run_name = args.directory.name @@ -280,9 +289,10 @@ def test_signal(data): if args.forceall: alt_options.append('--forceall') if args.dry_run: alt_options.append('--dry-run') if args.rerun_incomplete: alt_options.append('--rerun-incomplete') + if args.ignore_incomplete: alt_options.append('--ignore-incomplete') opt = " ".join(alt_options) for task in allowed: - if allowed[task] is True: + if (allowed[task] is True) and (task != 'install'): if task == 'install': print(f"Installing SIGNAL environments!") exit() @@ -298,7 +308,14 @@ def test_signal(data): print("Retrying...") subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) except subprocess.CalledProcessError: - exit(f"Something went wrong running SIGNAL {task}! Check input and logs and try again!") + if task == 'all': + print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! 
Otherwise, check your inputs and logs and try again!") + elif task == 'postprocess': + print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!") + elif task == 'ncov_tools': + print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!") + else: + print(f"Some jobs failed while running SIGNAL {task}! Check inputs and logs and try again!") exit() exit("SIGNAL run complete! Check corresponding snakemake logs for any details!") From 26e2e09193631991ea96521e661d3afcdcae7d80 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 14:44:50 +0000 Subject: [PATCH 09/28] update ncov_tools linking to remove failed samples --- Snakefile | 3 ++- scripts/ncov-tools.py | 35 +++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/Snakefile b/Snakefile index 4c45f1d..1c32c76 100644 --- a/Snakefile +++ b/Snakefile @@ -242,7 +242,8 @@ rule ncov_tools: negative_control_prefix = config['negative_control_prefix'], freebayes_run = config['run_freebayes'], pangolin = versions['pangolin'], - mode = pango_speed + mode = pango_speed, + failed = os.path.join(result_dir, 'failed_samples.log') input: consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names), primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names), diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py index 76e5201..dc86baf 100755 --- a/scripts/ncov-tools.py +++ b/scripts/ncov-tools.py @@ -6,17 +6,21 @@ import fileinput import glob -def link_ivar(root, replace=False): +def link_ivar(root, replace=False, neg, failed): print("Linking iVar files to ncov-tools!") for variants in snakemake.input['variants']: sample = variants.split('/')[0] + if (sample in failed) and (sample not in neg): + continue ln_path = f"{root}/{sample}.variants.tsv" if (not os.path.exists(ln_path)) 
or (replace is True): os.link(variants, ln_path) for consensus in snakemake.input['consensus']: sample = consensus.split('/')[0] + if (sample in failed) and (sample not in neg): + continue ln_path = f"{root}/{sample}.consensus.fasta" if (not os.path.exists(ln_path)) or (replace is True): os.link(consensus, ln_path) @@ -30,15 +34,17 @@ def link_ivar(root, replace=False): # take sample name from iVar results, redirect to where corresponding FreeBayes should be # if FreeBayes file cannot be found, break from loop, replace all with iVar -def link_freebayes(root): +def link_freebayes(root, neg, failed): print("Linking FreeBayes files to ncov-tools!") for variants in snakemake.input['variants']: sample = variants.split('/')[0] + if (sample in failed) and (sample not in neg): + continue expected_path = os.path.join(sample, 'freebayes', sample+'.variants.norm.vcf') if not os.path.exists(expected_path): print("Missing FreeBayes variant file! Switching to iVar input!") - link_ivar(root, True) + link_ivar(root, True, neg, failed) break else: ln_path = f"{root}/{sample}.variants.vcf" @@ -47,10 +53,12 @@ def link_freebayes(root): for consensus in snakemake.input['consensus']: sample = consensus.split('/')[0] + if (sample in failed) and (sample not in neg): + continue expected_path = os.path.join(sample, 'freebayes', sample+'.consensus.fasta') if not os.path.exists(expected_path): print("Missing FreeBayes variant file! 
Switching to iVar input!") - link_ivar(root, True) + link_ivar(root, True, neg, failed) break else: ln_path = f"{root}/{sample}.consensus.fasta" @@ -99,6 +107,12 @@ def set_up(): neg_list = list(neg_samples) print("Negative control samples found include: %s" %(neg_list)) +### Pull failed samples (SIGNAL log file: failed_samples.log) + if os.path.exists(snakemake.params['failed']): + with open(snakemake.params['failed']) as fail: + failed_list = [i.strip() for i in fail.readlines()[1:]] + else: + failed_list = [] ### config.yaml parameters config = {'data_root': f"'{data_root}'", @@ -126,21 +140,26 @@ def set_up(): print("Linking alignment BAMs to ncov-tools!") for bam in snakemake.input['bams']: sample = bam.split('/')[0] + # if sample failed and not a negative, skip linking + if (sample in failed_list) and (sample not in neg_list): + continue ln_path = f"{data_root}/{sample}.bam" - if (not os.path.exists(ln_path)) or (replace is True): + if not os.path.exists(ln_path): os.link(bam, ln_path) for primer_trimmed_bam in snakemake.input['primertrimmed_bams']: sample = primer_trimmed_bam.split('/')[0] + if (sample in failed_list) and (sample not in neg_list): + continue ln_path = f"{data_root}/{sample}.mapped.primertrimmed.sorted.bam" - if (not os.path.exists(ln_path)) or (replace is True): + if not os.path.exists(ln_path): os.link(primer_trimmed_bam, ln_path) if snakemake.params['freebayes_run']: - link_freebayes(data_root) + link_freebayes(data_root, neg_list, failed_list) config['variants_pattern'] = "'{data_root}/{sample}.variants.vcf'" else: - link_ivar(data_root) + link_ivar(data_root, neg_list, failed_list) with open(os.path.join(exec_dir, 'ncov-tools', 'config.yaml'), 'w') as fh: for key, value in config.items(): From dea7448c55d165f9a328e53acf326eddcaf79faf Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 22:47:23 +0000 Subject: [PATCH 10/28] update linking of SIGNAL results prior to ncov-tools --- Snakefile | 2 +- scripts/ncov-tools.py | 9 +++++---- scripts/run_ncov_tools.sh | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Snakefile b/Snakefile index 1c32c76..d807d96 100644 --- a/Snakefile +++ b/Snakefile @@ -243,7 +243,7 @@ rule ncov_tools: freebayes_run = config['run_freebayes'], pangolin = versions['pangolin'], mode = pango_speed, - failed = os.path.join(result_dir, 'failed_samples.log') + failed = 'failed_samples.log' input: consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names), primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names), diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py index dc86baf..e959778 100755 --- a/scripts/ncov-tools.py +++ b/scripts/ncov-tools.py @@ -6,7 +6,7 @@ import fileinput import glob -def link_ivar(root, replace=False, neg, failed): +def link_ivar(root, neg, failed, replace=False): print("Linking iVar files to ncov-tools!") for variants in snakemake.input['variants']: @@ -44,7 +44,7 @@ def link_freebayes(root, neg, failed): expected_path = os.path.join(sample, 'freebayes', sample+'.variants.norm.vcf') if not os.path.exists(expected_path): print("Missing FreeBayes variant file! Switching to iVar input!") - link_ivar(root, True, neg, failed) + link_ivar(root, neg, failed, replace=True) break else: ln_path = f"{root}/{sample}.variants.vcf" @@ -58,7 +58,7 @@ def link_freebayes(root, neg, failed): expected_path = os.path.join(sample, 'freebayes', sample+'.consensus.fasta') if not os.path.exists(expected_path): print("Missing FreeBayes variant file! 
Switching to iVar input!") - link_ivar(root, True, neg, failed) + link_ivar(root, neg, failed, replace=True) break else: ln_path = f"{root}/{sample}.consensus.fasta" @@ -113,6 +113,7 @@ def set_up(): failed_list = [i.strip() for i in fail.readlines()[1:]] else: failed_list = [] + print("Failed samples found include: %s" %(failed_list)) ### config.yaml parameters config = {'data_root': f"'{data_root}'", @@ -159,7 +160,7 @@ def set_up(): link_freebayes(data_root, neg_list, failed_list) config['variants_pattern'] = "'{data_root}/{sample}.variants.vcf'" else: - link_ivar(data_root, neg_list, failed_list) + link_ivar(data_root, neg_list, failed_list, replace=False) with open(os.path.join(exec_dir, 'ncov-tools', 'config.yaml'), 'w') as fh: for key, value in config.items(): diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh index 2a5f098..5b26293 100755 --- a/scripts/run_ncov_tools.sh +++ b/scripts/run_ncov_tools.sh @@ -48,7 +48,7 @@ RESULTS=$PWD cd ../ncov-tools # run ncov-tools -snakemake -s workflow/Snakefile --cores ${CORES} all +snakemake -k -s workflow/Snakefile --cores ${CORES} all # move ncovresults to SIGNAL results directory mv ${SIGNAL}'_ncovresults' ${RESULTS}/ncov-tools-results From ac8bfaef9f4b19c9372ccda5c233a51b4d926fe7 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 22:48:05 +0000 Subject: [PATCH 11/28] rename and restructure execution of snakemake + install --- signal.py => signalexe.py | 63 +++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 25 deletions(-) rename signal.py => signalexe.py (88%) diff --git a/signal.py b/signalexe.py similarity index 88% rename from signal.py rename to signalexe.py index 1788cd0..8d1006d 100755 --- a/signal.py +++ b/signalexe.py @@ -3,10 +3,16 @@ # v1.5.0+ # signal.py assumes Snakefile is in current working directory (i.e., SIGNAL root) +import signal import argparse import subprocess, os, sys import re from pathlib import Path +import platform + +# for compatibility between platforms +if platform.system() != 'Linux': + signal.SIGHUP = 1 def create_parser(): allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False} @@ -57,7 +63,14 @@ def create_parser(): print(f"Ignoring unknown command: {val}") return args, allowed - + +def check_frontend(): + try: + subprocess.check_call(['mamba', 'list'], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + return 'mamba' + except subprocess.CalledProcessError: + return 'conda' + def check_directory(path: str) -> Path: """ Check an input directory exists and is readable @@ -230,16 +243,16 @@ def write_config_file(run_name, config_file, data_directory, opt_tasks): with open(config_file, 'w') as fh: fh.write(config) -def install_signal(data='data'): +def install_signal(frontend, data='data'): """ Install SIGNAL dependencies per rule and test using a sample dataset, if desired """ - dep_snakefile = os.path.join(script_path, 'resources', 'dependancies') + dep_snakefile = os.path.join(script_path, 'resources', 'dependencies') assert os.path.exists(dep_snakefile) try: - subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend mamba --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet") + subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} 
--cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True) except subprocess.CalledProcessError: # likely missing mamba - subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend conda --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet") + exit("Installation of environments failed!") # Test SIGNAL with data if os.path.exists(data): @@ -256,8 +269,10 @@ def install_signal(data='data'): if args.version: exit(f"{version}") + conda_frontend = check_frontend() # 'mamba' or 'conda' + if allowed['install']: - install_signal(args.data) + install_signal(conda_frontend, args.data) exit() if args.dependencies: @@ -293,29 +308,27 @@ def install_signal(data='data'): opt = " ".join(alt_options) for task in allowed: if (allowed[task] is True) and (task != 'install'): - if task == 'install': - print(f"Installing SIGNAL environments!") - exit() print(f"Running SIGNAL {task}!") try: - subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True) - except subprocess.CalledProcessError: # likely missing mamba + subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True) + except subprocess.CalledProcessError: if task == "ncov_tools": check_submodule(os.getcwd()) - if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag - opt = " ".join(opt.split(" ")[:-1]) - try: - print("Retrying...") - subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) - except subprocess.CalledProcessError: - if task == 'all': - print(f"Some jobs failed while running SIGNAL {task}! 
Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!") - elif task == 'postprocess': - print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!") - elif task == 'ncov_tools': + if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag + opt = " ".join(opt.split(" ")[:-1]) + try: + print("Retrying...ncov-tools!") + subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) + except subprocess.CalledProcessError: print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!") - else: - print(f"Some jobs failed while running SIGNAL {task}! Check inputs and logs and try again!") - exit() + continue + elif task == 'all': + print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!") + continue + elif task == 'postprocess': + print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!") + continue + else: + print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs and results and try again!") exit("SIGNAL run complete! Check corresponding snakemake logs for any details!") From f59568c1a4dbc026fc3032c9da21cf816fbd7cf4 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 22:49:21 +0000 Subject: [PATCH 12/28] cleanup and replace script name --- signalexe.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/signalexe.py b/signalexe.py index 8d1006d..5a0a4b4 100755 --- a/signalexe.py +++ b/signalexe.py @@ -1,7 +1,7 @@ #!/usr/bin/env python -# v1.5.0+ -# signal.py assumes Snakefile is in current working directory (i.e., SIGNAL root) +# v1.6.0+ +# signalexe.py assumes Snakefile is in current working directory (i.e., SIGNAL root) import signal import argparse @@ -17,7 +17,7 @@ def create_parser(): allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False} - parser = argparse.ArgumentParser(prog='signal.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).") + parser = argparse.ArgumentParser(prog='signalexe.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).") parser.add_argument('all', nargs='*', help="Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--directory' required. The latter will automatically generate a configuration file and sample table. If both provided, then '--configfile' will take priority") parser.add_argument('postprocess', nargs='*', @@ -25,7 +25,7 @@ def create_parser(): parser.add_argument('ncov_tools', nargs='*', help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! 
'--configfile' is required but will be generated if '--directory' is provided") parser.add_argument('install', nargs='*', - help="Install individual rule environments and ensure SIGNAL is functional. The only parameters operable will be '--data' and '--skip-test'. Will override other operations!") + help="Install individual rule environments and ensure SIGNAL is functional. The only parameter operable will be '--data'. Will override other operations!") parser.add_argument('-c', '--configfile', type=check_file, default=None, help="Configuration file (i.e., config.yaml) for SIGNAL analysis") parser.add_argument('-d', '--directory', type=check_directory, default=None, @@ -36,8 +36,8 @@ def create_parser(): parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)") parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided") parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)") - parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'") - parser.add_argument('--skip-test', action='store_true', help='SIGNAL install parameter. 
Skip SIGNAL testing after environment installation using curated test data') + parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. (Upcoming feature): When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. Default = 'data'") + #parser.add_argument('--enable-test', action='store_true', help='SIGNAL install parameter. Add SIGNAL testing after environment installation using curated test data') parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run") parser.add_argument('-ii', '--ignore-incomplete', action='store_true', help='Snakemake parameter. Do not check for incomplete output files') parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. 
Remove a lock on the working directory after a failed run") @@ -251,10 +251,10 @@ def install_signal(frontend, data='data'): assert os.path.exists(dep_snakefile) try: subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True) - except subprocess.CalledProcessError: # likely missing mamba + except subprocess.CalledProcessError: exit("Installation of environments failed!") - # Test SIGNAL with data + ### TODO: Test SIGNAL with curated data if os.path.exists(data): pass @@ -273,7 +273,7 @@ def install_signal(frontend, data='data'): if allowed['install']: install_signal(conda_frontend, args.data) - exit() + exit("Installation of environments completed successfully!") if args.dependencies: print("Downloading necessary reference and dependency files!") @@ -296,7 +296,7 @@ def install_signal(frontend, data='data'): config_file = args.configfile if not any([allowed[x] for x in allowed]): - exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signal.py -h' for details!") + exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!") else: if args.verbose: alt_options.append('--verbose') if args.quiet: alt_options.append('--quiet') @@ -323,12 +323,12 @@ def install_signal(frontend, data='data'): print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!") continue elif task == 'all': - print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!") + print(f"Some jobs failed while running SIGNAL {task}! This does NOT necessarily mean your run was erroneous! Samples that failed assembly can be found in 'failed_samples.log'! 
If no such file exists or is blank, check your inputs and logs and try again!") continue elif task == 'postprocess': - print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!") + print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and logs and try again!") continue else: - print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs and results and try again!") + print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs, logs, and results and try again!") exit("SIGNAL run complete! Check corresponding snakemake logs for any details!") From 3343c73376b1df31bb782b1e63daaf660b81b4cf Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 22:58:34 +0000 Subject: [PATCH 13/28] correct exit codes --- signalexe.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/signalexe.py b/signalexe.py index 5a0a4b4..093659f 100755 --- a/signalexe.py +++ b/signalexe.py @@ -120,7 +120,8 @@ def check_submodule(exec_dir): print("Updating ncov-tools!") subprocess.run(['git', 'submodule', 'update', '--init', '--recursive']) except subprocess.CalledProcessError: - exit("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!") + print("Could not find nor update the required 'ncov-tools' directory! 
Manually download/update and try again!") + sys.exit(1) def write_sample_table(sample_data, output_table): """ @@ -252,7 +253,8 @@ def install_signal(frontend, data='data'): try: subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True) except subprocess.CalledProcessError: - exit("Installation of environments failed!") + print("Installation of environments failed!") + sys.exit(1) ### TODO: Test SIGNAL with curated data if os.path.exists(data): @@ -267,21 +269,26 @@ def install_signal(frontend, data='data'): alt_options = [] if args.version: - exit(f"{version}") + print(f"{version}") + sys.exit(0) conda_frontend = check_frontend() # 'mamba' or 'conda' if allowed['install']: install_signal(conda_frontend, args.data) - exit("Installation of environments completed successfully!") + print("Installation of environments completed successfully!") + sys.exit(0) if args.dependencies: print("Downloading necessary reference and dependency files!") download_dependences(args.data) - exit("Download complete!") + print("Complete!") + sys.exit(0) if args.configfile is None: - assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" + if args.directory is None, + print("Please provide '--directory' to proceed! 
('--configfile' if a configuration file already exists!") + sys.exit(1) run_name = args.directory.name generate_sample_table(args.directory, run_name) config_file = run_name + "_config.yaml" @@ -291,12 +298,14 @@ def install_signal(frontend, data='data'): neg = [args.neg_prefix] write_config_file(run_name, config_file, args.data, [args.add_breseq, args.remove_freebayes, neg]) if args.config_only: - exit("Configuration file and sample table generated!") + print("Configuration file and sample table generated!") + sys.exit(0) else: config_file = args.configfile if not any([allowed[x] for x in allowed]): - exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!") + print("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!") + sys.exit(1) else: if args.verbose: alt_options.append('--verbose') if args.quiet: alt_options.append('--quiet') @@ -331,4 +340,5 @@ def install_signal(frontend, data='data'): else: print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs, logs, and results and try again!") - exit("SIGNAL run complete! Check corresponding snakemake logs for any details!") + print("SIGNAL run complete! Check corresponding snakemake logs for any details!") + sys.exit(0) From 930bf1750a597f0e9dd807505f9e86f128a42c57 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 22:59:49 +0000 Subject: [PATCH 14/28] fix syntax --- signalexe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/signalexe.py b/signalexe.py index 093659f..570b885 100755 --- a/signalexe.py +++ b/signalexe.py @@ -286,7 +286,7 @@ def install_signal(frontend, data='data'): sys.exit(0) if args.configfile is None: - if args.directory is None, + if args.directory is None: print("Please provide '--directory' to proceed! 
('--configfile' if a configuration file already exists!") sys.exit(1) run_name = args.directory.name From 97d2a40ea30509f9016a29ea539898db3bf33820 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 23:04:58 +0000 Subject: [PATCH 15/28] fix assertion syntax --- signalexe.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/signalexe.py b/signalexe.py index 570b885..50dd081 100755 --- a/signalexe.py +++ b/signalexe.py @@ -283,11 +283,9 @@ def install_signal(frontend, data='data'): print("Downloading necessary reference and dependency files!") download_dependences(args.data) print("Complete!") - sys.exit(0) - if args.configfile is None: - if args.directory is None: - print("Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!") + if (args.configfile is None) and (not allowed['install']): + assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" sys.exit(1) run_name = args.directory.name generate_sample_table(args.directory, run_name) From 289d0d5d06d2373680fc2be014b3d6ef994aeb28 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 23:05:55 +0000 Subject: [PATCH 16/28] remove stale exit --- signalexe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/signalexe.py b/signalexe.py index 50dd081..6f6be4b 100755 --- a/signalexe.py +++ b/signalexe.py @@ -286,7 +286,6 @@ def install_signal(frontend, data='data'): if (args.configfile is None) and (not allowed['install']): assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" - sys.exit(1) run_name = args.directory.name generate_sample_table(args.directory, run_name) config_file = run_name + "_config.yaml" From 33d668b161e9951c101165f3e62234f391c14b48 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 23:12:02 +0000 Subject: [PATCH 17/28] fix typo leadingto unwanted AssertionError --- signalexe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/signalexe.py b/signalexe.py index 6f6be4b..1347b1e 100755 --- a/signalexe.py +++ b/signalexe.py @@ -133,7 +133,7 @@ def write_sample_table(sample_data, output_table): for sample in sample_data: out_fh.write(",".join(sample) + '\n') -def download_dependences(dir_name): +def download_dependencies(dir_name): script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh') subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3']) @@ -281,10 +281,10 @@ def install_signal(frontend, data='data'): if args.dependencies: print("Downloading necessary reference and dependency files!") - download_dependences(args.data) + download_dependencies(args.data) print("Complete!") - if (args.configfile is None) and (not allowed['install']): + if args.configfile is None: assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" run_name = args.directory.name generate_sample_table(args.directory, run_name) From 9fa7c7ac0e05ce41f0339f0d801e9545ce55e8d6 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Thu, 16 Mar 2023 23:16:40 +0000 Subject: [PATCH 18/28] add missing exit code --- signalexe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/signalexe.py b/signalexe.py index 1347b1e..3e94174 100755 --- a/signalexe.py +++ b/signalexe.py @@ -283,6 +283,7 @@ def install_signal(frontend, data='data'): print("Downloading necessary reference and dependency files!") download_dependencies(args.data) print("Complete!") + sys.exit(0) if args.configfile is None: assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)" From e716438e2dbf616142785bbc9e8899c367146dca Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Thu, 16 Mar 2023 23:33:14 +0000 Subject: [PATCH 19/28] make installation output verbose --- signalexe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/signalexe.py b/signalexe.py index 3e94174..85701d2 100755 --- a/signalexe.py +++ b/signalexe.py @@ -251,7 +251,7 @@ def install_signal(frontend, data='data'): dep_snakefile = os.path.join(script_path, 'resources', 'dependencies') assert os.path.exists(dep_snakefile) try: - subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True) + subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda", shell=True, check=True) except subprocess.CalledProcessError: print("Installation of environments failed!") sys.exit(1) From 055d11453a54545d95ca2feb074408a26506df10 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Fri, 17 Mar 2023 01:01:38 +0000 Subject: [PATCH 20/28] update ncov_tools re-run condition --- scripts/run_ncov_tools.sh | 6 +++--- signalexe.py | 21 ++++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh index 5b26293..48cb2a8 100755 --- a/scripts/run_ncov_tools.sh +++ b/scripts/run_ncov_tools.sh @@ -35,9 +35,9 @@ if [ $1 = 'help' ]; then fi if [ $SIGNAL = 0 ] ; then - echo "You must specify the name of the directory holding SIGNAL results." - echo "$HELP" - exit 1 + echo "You must specify the name of the directory holding SIGNAL results." 
+ echo "$HELP" + exit 1 fi # Start point for executing from ncov-tools.py is SIGNAL results directory diff --git a/signalexe.py b/signalexe.py index 85701d2..1d10df6 100755 --- a/signalexe.py +++ b/signalexe.py @@ -119,9 +119,12 @@ def check_submodule(exec_dir): try: print("Updating ncov-tools!") subprocess.run(['git', 'submodule', 'update', '--init', '--recursive']) + return True except subprocess.CalledProcessError: print("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!") sys.exit(1) + else: + return False def write_sample_table(sample_data, output_table): """ @@ -320,13 +323,17 @@ def install_signal(frontend, data='data'): subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True) except subprocess.CalledProcessError: if task == "ncov_tools": - check_submodule(os.getcwd()) - if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag - opt = " ".join(opt.split(" ")[:-1]) - try: - print("Retrying...ncov-tools!") - subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) - except subprocess.CalledProcessError: + mod = check_submodule(os.getcwd()) + if mod: + if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag + opt = " ".join(opt.split(" ")[:-1]) + try: + print("Retrying...ncov-tools!") + subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True) + except subprocess.CalledProcessError: + print(f"Some jobs failed while running SIGNAL {task}! 
Check snakemake logs and the ncov-tools directory for additional details!") + continue + else: print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!") continue elif task == 'all': From a5d9539e25bce92f72d6805b9a5ba0cb71f87a8e Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Fri, 17 Mar 2023 02:52:35 +0000 Subject: [PATCH 21/28] add script to pull select data --- scripts/get_signal_results.sh | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100755 scripts/get_signal_results.sh diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh new file mode 100755 index 0000000..a44825e --- /dev/null +++ b/scripts/get_signal_results.sh @@ -0,0 +1,114 @@ +#!/bin/env bash + +shopt -s extglob + +source=0 +destination=0 +move='false' + +HELP=""" +Usage: +bash get_signal_results.sh -s -d [-m] + +This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'. + +The following files will be transferred over to the specified destination directory (if found): +SIGNAL 'all' & 'postprocess': +-> signal-results//_sample.txt +-> signal-results//core/.consensus.fa +-> signal-results//core/_ivar_variants.tsv +-> signal-results//freebayes/.consensus.fasta +-> signal-results//freebayes/.variants.norm.vcf + +'ncov_tools': +-> ncov_tools-results/qc_annotation/.ann.vcf +-> ncov-tools-results/qc_reports/_ambiguous_position_report.tsv +-> ncov-tools-results/qc_reports/_mixture_report.tsv +-> ncov-tools-results/qc_reports/_ncov_watch_variants.tsv +-> ncov-tools-results/qc_reports/_negative_control_report.tsv +-> ncov-tools-results/qc_reports/_summary_qc.tsv + +Flags: + -s : SIGNAL results directory + -d : Directory where summary will be outputted + -m : Invoke 'mv' command instead of 'rsync' copying of results. 
Optional +""" + +while getopts ":s:d:m" option; do + case "${option}" in + s) source=$OPTARG;; + d) destination=$OPTARG;; + m) move='true';; + esac +done + + +if [ $source = 0 ] || [ $destination = 0 ] ; then + echo "You must specify both source and destination locations." + echo "$HELP" + exit 1 +fi + +if [ ! -d $destination ]; then + echo "Invalid destination directory!" + exit 1 +fi + +if [ ! -f $source/summary.html ] && [ ! -f $source/summary.zip ]; then + echo "Invalid SIGNAL directory! Make sure you've run SIGNAL 'all' and 'postprocess'!" + exit 1 +else + run_name=$(basename $source) + final_dir=${destination}/${run_name} + mkdir -p $final_dir/signal-results +fi + +if ${move}; then + cmd='mv' +else + cmd='rsync -avh' + # rsync -avh +fi + +echo -e "We will use ${cmd} for your files!" + +### SIGNAL results_dir +for file in $source/*; do + if [ -d $file ]; then # results_dir/sample + sample=$(basename $file) # sample name, within contain our files + sample_dest=${final_dir}/'signal-results'/${sample} + if [[ ! $sample == 'ncov-tools-results' ]]; then + mkdir -p $sample_dest + fi + for d in $file/*; do + name=$(basename $d) + if [ -d $d ] && [[ $name == 'core' ]]; then + mkdir -p $sample_dest/core + $cmd ${d}/${sample}.consensus.fa $sample_dest/core/${sample}.consensus.fa + $cmd ${d}/${sample}_ivar_variants.tsv $sample_dest/core/${sample}_ivar_variants.tsv + elif [ -d $d ] && [[ $name == 'freebayes' ]]; then + mkdir -p $sample_dest/freebayes + $cmd ${d}/${sample}.consensus.fasta $sample_dest/freebayes/${sample}.consensus.fasta + $cmd ${d}/${sample}.variants.norm.vcf $sample_dest/freebayes/${sample}.variants.norm.vcf + elif [ -f $d ] && [[ $name =~ '_sample.txt' ]]; then + $cmd ${d} $sample_dest/$(basename $d) + else + continue + fi + done + fi +done + +echo "Files from SIGNAL transferred!" + +### NCOV-TOOLS +if [ ! -d $source/ncov-tools-results ]; then + echo "No ncov-tools-results directory found!" 
+else + ncov_dest=${final_dir}/ncov-tools-results + mkdir -p $ncov_dest/qc_{annotation,reports} + $cmd $source/ncov-tools-results/qc_reports/* $ncov_dest/qc_reports + $cmd $source/ncov-tools-results/qc_annotation/*.ann.vcf $ncov_dest/qc_annotation + + echo "Files from ncov-tools transferred!" +fi \ No newline at end of file From 34da4219d366ed40d55e430a5a463544df334cbf Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Fri, 17 Mar 2023 03:19:42 +0000 Subject: [PATCH 22/28] add cp command option --- scripts/get_signal_results.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh index a44825e..428253c 100755 --- a/scripts/get_signal_results.sh +++ b/scripts/get_signal_results.sh @@ -5,14 +5,15 @@ shopt -s extglob source=0 destination=0 move='false' +copy='false' HELP=""" Usage: -bash get_signal_results.sh -s -d [-m] +bash get_signal_results.sh -s -d [-m] [-c] This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'. -The following files will be transferred over to the specified destination directory (if found): +The following files will be transferred over to the specified destination directory (if found): SIGNAL 'all' & 'postprocess': -> signal-results//_sample.txt -> signal-results//core/.consensus.fa @@ -20,7 +21,7 @@ SIGNAL 'all' & 'postprocess': -> signal-results//freebayes/.consensus.fasta -> signal-results//freebayes/.variants.norm.vcf -'ncov_tools': +SIGNAL 'ncov_tools': -> ncov_tools-results/qc_annotation/.ann.vcf -> ncov-tools-results/qc_reports/_ambiguous_position_report.tsv -> ncov-tools-results/qc_reports/_mixture_report.tsv @@ -31,14 +32,16 @@ SIGNAL 'all' & 'postprocess': Flags: -s : SIGNAL results directory -d : Directory where summary will be outputted - -m : Invoke 'mv' command instead of 'rsync' copying of results. 
Optional + -m : Invoke 'mv' move command instead of 'rsync' copying of results. Optional + -c : Invoke 'cp' copy command instead of 'rsync' copying of results. Optional """ -while getopts ":s:d:m" option; do +while getopts ":s:d:mc" option; do case "${option}" in s) source=$OPTARG;; d) destination=$OPTARG;; m) move='true';; + c) copy='true';; esac done @@ -63,8 +66,13 @@ else mkdir -p $final_dir/signal-results fi -if ${move}; then +if [ ${move} = true ] && [ ${copy} = true ]; then + echo -e "Only pick one of '-m' or '-c' depending on whether you wish to move or copy files, respectively" + exit +elif [ ${move} = true ] && [ ${copy} = false ]; then cmd='mv' +elif [ ${move} = false ] && [ ${copy} = true ]; then + cmd='cp' else cmd='rsync -avh' # rsync -avh From 57599751e7159c651cba48e8ab34852e3bfae0d3 Mon Sep 17 00:00:00 2001 From: "Jalees A. Nasir" Date: Fri, 17 Mar 2023 03:21:10 +0000 Subject: [PATCH 23/28] update help to include cp --- scripts/get_signal_results.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh index 428253c..0936c39 100755 --- a/scripts/get_signal_results.sh +++ b/scripts/get_signal_results.sh @@ -11,7 +11,7 @@ HELP=""" Usage: bash get_signal_results.sh -s -d [-m] [-c] -This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'. +This scripts aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'. The following files will be transferred over to the specified destination directory (if found): SIGNAL 'all' & 'postprocess': From 081c8438c77730f1fc8b03abf0c1b6e5fbf11086 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Fri, 17 Mar 2023 14:46:16 +0000 Subject: [PATCH 24/28] update nextclade dataset updating --- scripts/assign_lineages.py | 55 ++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/scripts/assign_lineages.py b/scripts/assign_lineages.py index 43f2bfd..0ba5d3c 100755 --- a/scripts/assign_lineages.py +++ b/scripts/assign_lineages.py @@ -8,6 +8,7 @@ import shutil import os, sys from datetime import datetime +import json def check_file(path: str) -> Path: @@ -136,27 +137,53 @@ def update_nextclade_dataset(vers, skip): # If specific tag requested, attempt to install, otherwise install latest accession = 'MN908947' + current_tag = None + if os.path.exists(os.path.join(output_dir, 'tag.json')): + j = open(os.path.join(output_dir, 'tag.json')) + data = json.load(j) + current_tag = data['tag'] + j.close() if requested is not None: + # check existing database, if found + if requested == current_tag: + print(f"Nextclade dataset {requested} already installed! Skipping update!") + else: + try: + print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!") + subprocess.run(f"nextclade dataset get " + f"--name '{dataset}' " + f"--reference '{accession}' " + f"--tag {requested} " + f"--output-dir '{output_dir}'", shell=True, check=True) + except subprocess.CalledProcessError: + print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...") + try: + subprocess.run(f"nextclade dataset get " + f"--name '{dataset}' " + f"--reference '{accession}' " + f"--output-dir '{output_dir}'", shell=True, check=True) + except subprocess.CalledProcessError: + if current_tag is not None: + print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!") + requested = current_tag + else: + print(f"Something went wrong updating the Nextclade dataset! 
No database could be found which may result in errors! Skipping update...") + requested = "Unknown" + else: try: - print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!") + print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!") subprocess.run(f"nextclade dataset get " f"--name '{dataset}' " f"--reference '{accession}' " - f"--tag {requested} " f"--output-dir '{output_dir}'", shell=True, check=True) except subprocess.CalledProcessError: - print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...") - subprocess.run(f"nextclade dataset get " - f"--name '{dataset}' " - f"--reference '{accession}' " - f"--output-dir '{output_dir}'", shell=True, check=True) - else: - print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!") - subprocess.run(f"nextclade dataset get " - f"--name '{dataset}' " - f"--reference '{accession}' " - f"--output-dir '{output_dir}'", shell=True, check=True) - + if current_tag is not None: + print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!") + requested = current_tag + else: + print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...") + requested = "Unknown" + # Obtain final version information for output nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower() if nextclade_version.startswith("nextclade"): From f31132224cb539e83f39d374f0522fbb2f360bfc Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Fri, 17 Mar 2023 15:30:07 +0000 Subject: [PATCH 25/28] update pangolin version pull from install rather than online --- scripts/ncov-tools.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py index e959778..48148bb 100755 --- a/scripts/ncov-tools.py +++ b/scripts/ncov-tools.py @@ -83,10 +83,26 @@ def set_up(): try: assert pangolin == "3" or pangolin == "4" # directly supported versions except AssertionError: - import urllib.request as web - commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl() - pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v") + # import urllib.request as web + # commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl() + # pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v") # latest version (should ensure temporary compatibility) + installed_versions = subprocess.run(["pangolin", "--all-versions"], + check=True, + stdout=subprocess.PIPE) + installed_versions = installed_versions.stdout.decode('utf-8') + installed_ver_dict = {} + for dep_ver in map(str.strip, installed_versions.split('\n')): + # skip empty line at end + if len(dep_ver) == 0: + continue + try: + dependency, version = dep_ver.split(': ') + except ValueError: + continue + if dependency == 'pangolin': + pangolin = str(version).split(".",1)[0].strip('v') + break ### Create data directory within ncov-tools data_root = os.path.abspath(os.path.join(exec_dir, 'ncov-tools', "%s" %(result_dir))) From 12732fcfbe2ec83e4e21b257f2d1cbeee65ea759 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Fri, 17 Mar 2023 16:56:38 +0000 Subject: [PATCH 26/28] add cleanup --- scripts/ncov-tools.py | 8 +++++--- scripts/run_ncov_tools.sh | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py index 48148bb..16cea46 100755 --- a/scripts/ncov-tools.py +++ b/scripts/ncov-tools.py @@ -182,7 +182,7 @@ def set_up(): for key, value in config.items(): fh.write(f"{key}: {value}\n") - return exec_dir, result_dir + return exec_dir, result_dir, data_root def run_all(): os.system(f"snakemake -s workflow/Snakefile --cores {snakemake.threads} all") @@ -221,12 +221,14 @@ def move(cwd, dest, prefix): print("Missing ncov-tools 'qc_analysis' directory") if __name__ == '__main__': - exec_dir, result_dir = set_up() + exec_dir, result_dir, data_root = set_up() run_script = os.path.join(exec_dir, 'scripts', 'run_ncov_tools.sh') #print("Don't forget to update the config.yaml file as needed prior to running ncov-tools.") print("Running ncov-tools using %s cores!" %(snakemake.threads)) subprocess.run([run_script, '-c', str(snakemake.threads), '-s', str(result_dir)]) - + + # clean up + shutil.rmtree(data_root) #run_all() #move(exec_dir, result_root, result_dir) diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh index 48cb2a8..5a68aad 100755 --- a/scripts/run_ncov_tools.sh +++ b/scripts/run_ncov_tools.sh @@ -50,7 +50,7 @@ cd ../ncov-tools # run ncov-tools snakemake -k -s workflow/Snakefile --cores ${CORES} all -# move ncovresults to SIGNAL results directory +# move ncovresults to SIGNAL results directory and clean up mv ${SIGNAL}'_ncovresults' ${RESULTS}/ncov-tools-results # return success From d793fba3cf5f7b01b69b65c369bdcc44d95d7f18 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Fri, 17 Mar 2023 17:39:19 +0000 Subject: [PATCH 27/28] update README --- README.md | 146 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index f9cbea7..546f62c 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ so alternatively install `mamba` and use that (snakemake has beta support for it conda install -c conda-forge mamba mamba create -c conda-forge -c bioconda -n signal snakemake pandas conda mamba conda activate signal + # mamba activate signal is equivalent Additional software dependencies are managed directly by `snakemake` using conda environment files: @@ -79,31 +80,27 @@ Additional software dependencies are managed directly by `snakemake` using conda ## SIGNAL Help Screen: -Using the provided `signal.py` script, the majority of SIGNAL functions can be accessed easily. +Using the provided `signalexe.py` script, the majority of SIGNAL functions can be accessed easily. To display the help screen: ``` -python signal.py -h +python signalexe.py -h -Output: -usage: signal.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq] - [-neg NEG_PREFIX] [--dependencies] [-ri] [--unlock] [-F] [-n] [--verbose] [-v] - [all ...] [postprocess ...] [ncov_tools ...] +usage: signalexe.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq] [-neg NEG_PREFIX] [--dependencies] [--data DATA] [-ri] [-ii] [--unlock] + [-F] [-n] [--quiet] [--verbose] [-v] + [all ...] [postprocess ...] [ncov_tools ...] [install ...] -SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + -variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome -Coronavirus 2 (SARS-CoV-2). 
+SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards +the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2). positional arguments: - all Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '-- - directory' required. The latter will automatically generate a configuration file and sample table. If - both provided, then '--configfile' will take priority - postprocess Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if ' - --directory' is provided - ncov_tools Generate configuration file and filesystem setup required and then execute ncov-tools quality control - assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '-- - directory' is provided + all Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--directory' required. The latter will automatically generate a + configuration file and sample table. If both provided, then '--configfile' will take priority + postprocess Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided + ncov_tools Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required + but will be generated if '--directory' is provided + install Install individual rule environments and ensure SIGNAL is functional. The only parameter operable will be '--data'. Will override other operations! optional arguments: -h, --help show this help message and exit @@ -113,49 +110,57 @@ optional arguments: Path to directory containing reads. 
Will be used to generate sample table and configuration file --cores CORES Number of cores. Default = 1 --config-only Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required - --remove-freebayes Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall - speed) - --add-breseq Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for - analysis to complete) + --remove-freebayes Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed) + --add-breseq Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete) -neg NEG_PREFIX, --neg-prefix NEG_PREFIX - Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or - prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will - be left empty, if not provided - --dependencies Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. - Note: Will override other flags! (~10 GB storage required) + Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. + Recommended if running ncov-tools. Will be left empty, if not provided + --dependencies Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required) + --data DATA SIGNAL install and data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '-- + config-only' or '--directory' is used, the value will be applied to the configuration file. 
(Upcoming feature): When used with 'SIGNAL install', any tests run will use the + dependencies located at this directory. Default = 'data' -ri, --rerun-incomplete Snakemake parameter. Re-run any incomplete samples from a previously failed run + -ii, --ignore-incomplete + Snakemake parameter. Do not check for incomplete output files --unlock Snakemake parameter. Remove a lock on the working directory after a failed run -F, --forceall Snakemake parameter. Force the re-run of all rules regardless of prior output -n, --dry-run Snakemake parameter. Do not execute anything and only display what would be done + --quiet Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs --verbose Snakemake parameter. Display snakemake debugging output -v, --version Display version number ``` ## Summary: -`signal.py` simplies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads. +`signalexe.py` simplies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads. 
``` # Download dependances (only needs to be run once; ~10GB of storage required) -python signal.py --dependencies +# --data flag allows you to rename and relocate dependencies directory +python signalexe.py --data data --dependencies -# Generate configuration file and sample table (--neg_prefix can be used to note negative controls) -python signal.py --config-only --directory /path/to/reads +# Generate configuration file and sample table +# --neg_prefix can be used to note negative controls +# --data can be used to specify location of data dependencies +python signalexe.py --config-only --directory /path/to/reads # Execute pipeline (step-by-step; --cores defaults to 1 if not provided) -python signal.py --configfile config.yaml --cores NCORES aLL -python signal.py --configfile config.yaml --cores NCORES postprocess -python signal.py --configfile config.yaml --cores NCORES ncov_tools +# --data can be used to specify location of data dependencies +python signalexe.py --configfile config.yaml --cores NCORES all +python signalexe.py --configfile config.yaml --cores NCORES postprocess +python signalexe.py --configfile config.yaml --cores NCORES ncov_tools # ALTERNATIVE # Execute pipeline (one line) -python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools +# --data can be used to specify location of data dependencies +python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools # ALTERNATIVE # Execute pipeline (one line; no prior configuration file or sample table steps) # --directory can be used in place of --configfile to automatically generate a configuration file -python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools +# --data can be used to specify location of data dependencies +python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools ``` Each of the steps in SIGNAL can be run **manually** by accessing the individual scripts or running snakemake. 
@@ -187,8 +192,9 @@ The pipeline requires: - kraken2 viral database - Human GRCh38 reference fasta (for composite human-viral BWA index) - python signal.py --dependencies + python signalexe.py --dependencies # defaults to a directory called `data` in repository root + # --data can be used to rename and relocate the resultant directory OR @@ -197,14 +203,24 @@ The pipeline requires: **Note: Downloading the database files requires ~10GB of storage, with up to ~35GB required for all temporary downloads!** +### 1.5 Prepare per-rule conda environments: + +SIGNAL uses controlled conda environments for individual steps in the workflow. These are generally produced upon first execution of SIGNAL with input data; however, an option to install the per-rule environments is available through the `signalexe.py` script. + + python signalexe.py install + + # Will install per-rule environments + # Later versions of SIGNAL will include a testing module with curated data to ensure function + ### 2. Generate configuration file: You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv` (see step 4). The directory provided will be used to auto-generate a name for the run. ``` -python signal.py --config-only --directory /path/to/reads +python signalexe.py --config-only --directory /path/to/reads # Outputs: 'reads_config.yaml' and 'reads_sample_table.csv' +# --data can be used to specify the location of data dependencies ``` You can also create the configuration file through modifying the `example_config.yaml` to suit your system. @@ -248,7 +264,7 @@ bash scripts/generate_sample_table.sh -d /path/to/more/reads -e sample_table.csv ### 4. Execute pipeline: -For the main `signal.py` script, positional arguments inform the rules of the pipeline to execute with flags supplementing input parameters. +For the main `signalexe.py` script, positional arguments inform the rules of the pipeline to execute with flags supplementing input parameters. 
The main rules of the pipeline are as followed: @@ -258,7 +274,7 @@ The main rules of the pipeline are as followed: The generated configuration file from the above steps can be used as input. To run the general pipeline: -`python signal.py --configfile config.yaml --cores 4 all` +`python signalexe.py --configfile config.yaml --cores 4 all` is equivalent to running @@ -268,7 +284,7 @@ You can run the snakemake command as written above, but note that if the `--cond Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script: -`python signal.py --directory /path/to/reads --cores 4 all` +`python signalexe.py --directory /path/to/reads --cores 4 all` A configuartion file and sample table will automatically be generated prior to running SIGNAL `all`. @@ -278,7 +294,7 @@ FreeBayes variant calling and BreSeq mutational analysis are technically optiona As with the general pipeline, the generated configuration file from the above steps can be used as input. To run `postprocess` which summarizes the SIGNAL results: -`python signal.py --configfile config.yaml --cores 1 postprocess` +`python signalexe.py --configfile config.yaml --cores 1 postprocess` is equivalent to running @@ -306,7 +322,7 @@ Related: because pipeline stages can fail, we run (and recommend running if usin Additionally, SIGNAL can prepare output and execute @jts' [ncov-tools](https://github.com/jts/ncov-tools) to generate phylogenies and alternative summaries. -`python signal.py --configfile config.yaml --cores 1 ncov_tools` +`python signalexe.py --configfile config.yaml --cores 1 ncov_tools` is equivalent to running @@ -318,17 +334,19 @@ SIGNAL will then execute ncov-tools and the **output will be found within the SI ### Multiple operations: -Using `signal.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession. 
+Using `signalexe.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession. -`python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools` +`python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools` In the above command, SIGNAL `all`, `postprocess`, and `ncov_tools` will run using the provided configuration file as input, which links to a sample table. **Note: Regardless of order for positional arguments, or placement of other parameter flags, SIGNAL will always run in the set order priority: `all` > `postprocess` > `ncov_tools`!** +**Note: If `install` is provided as input, it will override all other positional arguments!** + If no configuration file or sample table was generated for a run, you can provide `--directory` with the path to sequencing reads and SIGNAL will auto-generate both required inputs prior to running any rules. -`python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools` +`python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools` Overall, this simplifies executing SIGNAL to one line! @@ -359,6 +377,44 @@ Then execute the pipeline: - To generate summaries of BreSeq among many samples, see [how to summarize BreSeq results using gdtools](resources/dev_scripts/summaries/README.md) +### Convenient extraction script: + +SIGNAL produces several output files and directories on its own alongside the output for ncov-tools. Select files from the output can be copied or transferred for easier parsing using a provided convenience bash script: + +``` +bash scripts/get_signal_results.sh + +Usage: +bash get_signal_results.sh -s -d [-m] [-c] + +This script aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'. 
+ +The following files will be transferred over to the specified destination directory (if found): +SIGNAL 'all' & 'postprocess': +-> signal-results//_sample.txt +-> signal-results//core/.consensus.fa +-> signal-results//core/_ivar_variants.tsv +-> signal-results//freebayes/.consensus.fasta +-> signal-results//freebayes/.variants.norm.vcf + +SIGNAL 'ncov_tools': +-> ncov-tools-results/qc_annotation/.ann.vcf +-> ncov-tools-results/qc_reports/_ambiguous_position_report.tsv +-> ncov-tools-results/qc_reports/_mixture_report.tsv +-> ncov-tools-results/qc_reports/_ncov_watch_variants.tsv +-> ncov-tools-results/qc_reports/_negative_control_report.tsv +-> ncov-tools-results/qc_reports/_summary_qc.tsv + +Flags: + -s : SIGNAL results directory + -d : Directory where summary will be outputted + -m : Invoke 'mv' move command instead of 'rsync' copying of results. Optional + -c : Invoke 'cp' copy command instead of 'rsync' copying of results. Optional + +``` + +The script uses `rsync` to provide accurate copies of select output files organized into `signal-results` and `ncov-tools-results` within a provided destination directory (that must exist). If `-c` is provided, `cp` will be used instead of `rsync` to produce copies. Similarly, if `-m` is provided, `mv` will be used instead (**WARNING: Any interruption during `mv` could result in data loss.**) + ## Pipeline details: For a step-by-step walkthrough of the pipeline, see [pipeline/README.md](PIPELINE.md). From b6e5992758765dd6c02df11cd6ad8f0e2e32aa77 Mon Sep 17 00:00:00 2001 From: "Jalees A. 
Nasir" Date: Fri, 17 Mar 2023 17:53:55 +0000 Subject: [PATCH 28/28] update README --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 546f62c..a5ec410 100644 --- a/README.md +++ b/README.md @@ -203,18 +203,18 @@ The pipeline requires: **Note: Downloading the database files requires ~10GB of storage, with up to ~35GB required for all temporary downloads!** -### 1.5 Prepare per-rule conda environments: +### 1.5. Prepare per-rule conda environments (optional, but recommended): SIGNAL uses controlled conda environments for individual steps in the workflow. These are generally produced upon first execution of SIGNAL with input data; however, an option to install the per-rule environments is available through the `signalexe.py` script. python signalexe.py install # Will install per-rule environments - # Later versions of SIGNAL will include a testing module with curated data to ensure function + # Later versions of SIGNAL will include a testing module with curated data to ensure function ### 2. Generate configuration file: -You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv` (see step 4). The directory provided will be used to auto-generate a name for the run. +You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv`. The directory provided will be used to auto-generate a name for the run. ``` python signalexe.py --config-only --directory /path/to/reads @@ -231,7 +231,7 @@ You can also create the configuration file through modifying the `example_config See the example table `example_sample_table.csv` for an idea of how to organise this table. 
-**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 3) from a given directory path to reads.** +**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 2) from a given directory path to reads.** Alternatively, you can attempt to use `generate_sample_table.sh` to circumvent manual creation of the table. @@ -282,7 +282,7 @@ is equivalent to running You can run the snakemake command as written above, but note that if the `--conda-prefix` is not set as this (i.e., `$PWD/.snakemake/conda`), then all envs will be reinstalled for each time you change the `results_dir` in the `config.yaml`. -Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script: +Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script (see step 2): `python signalexe.py --directory /path/to/reads --cores 4 all`