From 7406f97f72bb8f2e780af55194985ce0bff2562f Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Mon, 13 Mar 2023 18:51:24 +0000
Subject: [PATCH 01/28] update intended version number

---
 scripts/signal_postprocess.py |  2 +-
 signal.py                     | 41 ++++++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/scripts/signal_postprocess.py b/scripts/signal_postprocess.py
index a0a0aba..20fbcde 100755
--- a/scripts/signal_postprocess.py
+++ b/scripts/signal_postprocess.py
@@ -17,7 +17,7 @@
 
 assert long_git_id.startswith('$Id: ')
 #short_git_id = long_git_id[5:12]
-short_git_id = "v1.5.9"
+short_git_id = "v1.6.0"
 
 # Suppresses matplotlib warning (https://github.com/jaleezyy/covid-19-signal/issues/59)
 # Creates a small memory leak, but it's nontrivial to fix, and won't be a practical concern!
diff --git a/signal.py b/signal.py
index 00764cd..b5e13bc 100755
--- a/signal.py
+++ b/signal.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 
 def create_parser():
-	allowed = {'all': False, 'postprocess': False, 'ncov_tools': False}
+	allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False}
 
 	parser = argparse.ArgumentParser(prog='signal.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).")
 	parser.add_argument('all', nargs='*',
@@ -18,6 +18,8 @@ def create_parser():
 						help="Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided")
 	parser.add_argument('ncov_tools', nargs='*',
 						help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--directory' is provided")
+	parser.add_argument('install', nargs='*',
+						help="Install individual rule environments and ensure SIGNAL is functional")
 	parser.add_argument('-c', '--configfile', type=check_file, default=None,
 						help="Configuration file (i.e., config.yaml) for SIGNAL analysis")
 	parser.add_argument('-d', '--directory', type=check_directory, default=None,
@@ -28,16 +30,18 @@ def create_parser():
 	parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)")
 	parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
 	parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)")
+	parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'")
 	parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run")
 	parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run")
 	parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output')
 	parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done')
+	### add --quiet
 	parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output")
 	parser.add_argument('-v', '--version', action='store_true', help="Display version number")
 	args, unknown = parser.parse_known_args()
 
 	provided = []
-	for opt in allowed: # ['all', 'postprocess', 'ncov_tools']
+	for opt in allowed: # ['install', 'all', 'postprocess', 'ncov_tools']
 		if len(getattr(args, opt)) > 0:
 			provided = provided + getattr(args, opt)
 			getattr(args, opt).clear()
@@ -122,8 +126,8 @@ def write_sample_table(sample_data, output_table):
 		for sample in sample_data:
 			out_fh.write(",".join(sample) + '\n')
 
-def download_dependences():
-	dir_name = 'data'
+def download_dependences(data):
+	dir_name = data
 	script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh')
 	subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3'])
 
@@ -135,7 +139,7 @@ def generate_sample_table(project_directory, project_name):
 	out_table = project_name + "_sample_table.csv"
 	subprocess.run(['bash', script, '-d', project_directory, '-n', out_table])
 
-def write_config_file(run_name, config_file, opt_tasks):
+def write_config_file(run_name, config_file, data_directory, opt_tasks):
 ### opt_tasks = [args.breseq, args.freebayes, [args.neg_prefix]] - latter only applies to SIGNAL v1.5.8 and earlier
 
 	config = f"""# This file contains a high-level summary of pipeline configuration and inputs.
@@ -157,26 +161,26 @@ def write_config_file(run_name, config_file, opt_tasks):
 scheme_bed: 'resources/primer_schemes/artic_v3/nCoV-2019.bed'
 
 # Path from snakemake dir to bwa indexed human + viral reference genome
-composite_reference: 'data/composite_human_viral_reference.fna'
+composite_reference: "{data_directory}/composite_human_viral_reference.fna"
 
 # Used as bwa reference genome when removing host sequences.
 # Also used as 'ivar' reference genome in variant detection + consensus.
 # Used as -r,-g arguments to 'quast'
 # contig needed for hostremoval filtering script
 viral_reference_contig_name: 'MN908947.3'
-viral_reference_genome: 'data/MN908947.3.fasta'
-viral_reference_feature_coords: 'data/MN908947.3.gff3'
+viral_reference_genome: "{data_directory}/MN908947.3.fasta"
+viral_reference_feature_coords: "{data_directory}/MN908947.3.gff3"
 
 # breseq_reference must be defined if run_breseq == True
 run_breseq: {opt_tasks[0]}
 # Used as --reference argument to 'breseq'
-breseq_reference: 'data/MN908947.3.gbk'
+breseq_reference: "{data_directory}/MN908947.3.gbk"
 
 # run freebayes for variant and consensus calling (as well as ivar)
 run_freebayes: {opt_tasks[1]}
 
 # Used as --db argument to 'kraken2'
-kraken2_db: 'data/Kraken2/db'
+kraken2_db: "{data_directory}/Kraken2/db"
 
 # For Ivar's amplicon filter 
 # https://github.com/andersen-lab/ivar/commit/7027563fd75581c78dabc6040ebffdee2b24abe6
@@ -226,7 +230,7 @@ def write_config_file(run_name, config_file, opt_tasks):
 amplicon_loc_bed: 'resources/primer_schemes/artic_v3/ncov-qc_V3.scheme.bed'
 
 # fasta of sequences to include with pangolin phylogeny
-phylo_include_seqs: "data/blank.fasta"
+phylo_include_seqs: "{data_directory}/blank.fasta"
 
 # List of negative control sample names or prefixes (i.e., ['Blank'] will cover Blank1, Blank2, etc.)
 negative_control_prefix: {opt_tasks[2]}"""
@@ -234,11 +238,14 @@ def write_config_file(run_name, config_file, opt_tasks):
 	with open(config_file, 'w') as fh:
 		fh.write(config)
 
+def test_signal(data):
+	pass
+
 if __name__ == '__main__':
 	# note: add root_dir to determine the root directory of SIGNAL
 	script_path = os.path.join(os.path.abspath(sys.argv[0]).rsplit("/",1)[0])
 	args, allowed = create_parser()
-	version = 'v1.5.9'
+	version = 'v1.6.0'
 	alt_options = []
 	
 	if args.version:
@@ -246,7 +253,7 @@ def write_config_file(run_name, config_file, opt_tasks):
 	
 	if args.dependencies:
 		print("Downloading necessary reference and dependency files!")
-		download_dependences()
+		download_dependences(args.data)
 		exit("Download complete!")
 	
 	if args.configfile is None:
@@ -258,7 +265,7 @@ def write_config_file(run_name, config_file, opt_tasks):
 			neg = [pre.replace(" ","") for pre in args.neg_prefix.split(",")]
 		else:
 			neg = [args.neg_prefix]
-		write_config_file(run_name, config_file, [args.add_breseq, args.remove_freebayes, neg])
+		write_config_file(run_name, config_file, args.data, [args.add_breseq, args.remove_freebayes, neg])
 		if args.config_only:
 			exit("Configuration file and sample table generated!")
 	else:
@@ -274,7 +281,7 @@ def write_config_file(run_name, config_file, opt_tasks):
 		if args.rerun_incomplete: alt_options.append('--rerun-incomplete')
 		opt = " ".join(alt_options)
 		for task in allowed:
-			if allowed[task] is True:
+			if (allowed[task] is True) and (task != 'install'):
 				print(f"Running SIGNAL {task}!")
 				try:
 					subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
@@ -288,5 +295,9 @@ def write_config_file(run_name, config_file, opt_tasks):
 						subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
 					except subprocess.CalledProcessError:
 						exit(f"Something went wrong running SIGNAL {task}! Check input and try again!")
+			else:
+				print(f"Installing SIGNAL environments!")
+				
+				exit()
 	
 	exit("SIGNAL completed successfully!")

From 1286eb68c05d36775b5507f6a8ed8bb12c169f81 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Mon, 13 Mar 2023 19:42:52 +0000
Subject: [PATCH 02/28] add expected file change rules

---
 Snakefile | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 76a4e5d..e8de728 100644
--- a/Snakefile
+++ b/Snakefile
@@ -129,21 +129,25 @@ rule clean_reads:
 rule consensus:
     input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
 
+rule core_genomes:
+    input: 'all_genomes.fa'
+
 rule ivar_variants:
     input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names)
 
 rule breseq:
     input: expand('{sn}/breseq/output/index.html', sn=sample_names)
 
+
 rule freebayes:
-    input: 
+    input:
+        'all_freebayes_genomes.fa',
         expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
         expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
         'freebayes_lineage_assignments.tsv',
         expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names),
         expand('{sn}/freebayes/{sn}_consensus_compare.vcf', sn=sample_names)
 
-    
 rule coverage:
     input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)
 
@@ -158,6 +162,7 @@ rule quast:
 
 rule lineages:
     input:
+        rules.core_genomes.input,
         'input_pangolin_versions.txt',
         'input_nextclade_versions.txt',
         'lineage_assignments.tsv'
@@ -769,6 +774,9 @@ rule run_quast_freebayes:
          'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
          'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'
 
+rule collect_core_genomes:
+    
+
 rule run_lineage_assignment:
     threads: 4
     conda: 'conda_envs/assign_lineages.yaml'
@@ -797,6 +805,9 @@ rule run_lineage_assignment:
         'cat {input} > all_genomes.fa && '
         '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
 
+rule collect_freebayes_genomes:
+    
+
 rule run_lineage_assignment_freebayes:
     threads: 4
     conda: 'conda_envs/assign_lineages.yaml'

From b711850bc1d0584028def93bd73a2ddcb2d26c4a Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Wed, 15 Mar 2023 13:04:57 +0000
Subject: [PATCH 03/28] update dependencies

---
 resources/dependencies | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/resources/dependencies b/resources/dependencies
index 5573562..b4f5097 100644
--- a/resources/dependencies
+++ b/resources/dependencies
@@ -4,6 +4,8 @@ rule all:
         "ivar",
         "snp_mapping",
         "trim_qc",
+        "assign_lineages.yaml",
+        "freebayes.yaml",
         "postprocessing"
     shell: "rm {input}"
 
@@ -27,6 +29,16 @@ rule trim_qc:
     output: "trim_qc"
     shell: "touch {output}"
 
+rule assign_lineages:
+    conda: "../conda_envs/assign_lineages.yaml"
+    output: "assign_lineages"
+    shell: "touch {output}"
+
+rule freebayes:
+    conda: "../conda_envs/freebayes.yaml"
+    output: "freebayes"
+    shell: "touch {output}"
+
 rule postprocessing:
     conda: "../conda_envs/postprocessing.yaml"
     output: "postprocessing"

From 6febff3872ac4f973106a78b5a1230fd12873975 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Wed, 15 Mar 2023 13:29:55 +0000
Subject: [PATCH 04/28] update rules for collating consensus genomes

---
 Snakefile | 56 ++++++++++++++++++++++++++++++++++++++++---------------
 signal.py | 17 +++++++++--------
 2 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/Snakefile b/Snakefile
index e8de728..1df78d6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -127,10 +127,9 @@ rule clean_reads:
        expand('{sn}/mapped_clean_reads/{sn}_R{r}.fastq.gz', sn=sample_names, r=[1,2])
 
 rule consensus:
-    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
-
-rule core_genomes:
-    input: 'all_genomes.fa'
+    input: expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
+           'all_genomes.fa',
+#           'failed_samples.log'
 
 rule ivar_variants:
     input: expand('{sn}/core/{sn}_ivar_variants.tsv', sn=sample_names)
@@ -138,10 +137,10 @@ rule ivar_variants:
 rule breseq:
     input: expand('{sn}/breseq/output/index.html', sn=sample_names)
 
-
 rule freebayes:
     input:
         'all_freebayes_genomes.fa',
+#        'failed_samples.log',
         expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
         expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
         'freebayes_lineage_assignments.tsv',
@@ -162,7 +161,6 @@ rule quast:
 
 rule lineages:
     input:
-        rules.core_genomes.input,
         'input_pangolin_versions.txt',
         'input_nextclade_versions.txt',
         'lineage_assignments.tsv'
@@ -332,7 +330,7 @@ rule raw_reads_composite_reference_bwa_map:
     shell:
         '(bwa mem -t {threads} {params.composite_index} '
         '{input.raw_r1} {input.raw_r2} | '
-        '{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log}'
+        "{params.script_path} -c {params.viral_contig_name} > {output}) 2> {log} || echo '' > {output}"
 
 rule get_host_removed_reads:
     threads: 2
@@ -775,7 +773,15 @@ rule run_quast_freebayes:
          'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'
 
 rule collect_core_genomes:
-    
+    output:
+        all = "all_genomes.fa",
+        #failed = "failed_samples.log"
+    input:
+        expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names)
+    shell:
+        """
+        cat {input} > {output.all}
+        """
 
 rule run_lineage_assignment:
     threads: 4
@@ -785,7 +791,7 @@ rule run_lineage_assignment:
         nextclade_ver_out = 'input_nextclade_versions.txt',
         lin_out = 'lineage_assignments.tsv'
     input:
-        expand('{sn}/core/{sn}.consensus.fa', sn=sample_names)
+        'all_genomes.fa'
     params:
         pangolin_ver = versions['pangolin'],
         pangolearn_ver = versions['pangolearn'],
@@ -802,11 +808,32 @@ rule run_lineage_assignment:
     shell:
         "echo -e 'pangolin: {params.pangolin_ver}\nconstellations: {params.constellations_ver}\nscorpio: {params.scorpio_ver}\npangolearn: {params.pangolearn_ver}\npango-designation: {params.designation_ver}\npangolin-data: {params.data_ver}' > {output.pango_ver_out} && "
         "echo -e 'nextclade: {params.nextclade_ver}\nnextclade-dataset: {params.nextclade_data}\nnextclade-include-recomb: {params.nextclade_recomb}' > {output.nextclade_ver_out} && "
-        'cat {input} > all_genomes.fa && '
-        '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
+        '{params.assignment_script_path} -i {input} -t {threads} -o {output.lin_out} -p {output.pango_ver_out} -n {output.nextclade_ver_out} --mode {params.analysis_mode}'
 
 rule collect_freebayes_genomes:
-    
+    output:
+        "all_freebayes_genomes.fa",
+    input:
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
+#    params:
+#        failed = "failed_samples.log"
+    shell:
+        """
+        cat {input} > {output}
+        """
+#    shell:
+#        """
+#        samples=({input})
+#        for file in $samples; do
+#            s=$(basename $file | cut -d. -f1)
+#            count=$(cat $file | grep -v '>' | grep -cv 'N')
+#            if [[ -f $file ]] && [[ ! $count -eq 0 ]]; then
+#                cat $file >> {output}
+#            else
+#                echo $s >> {params.failed}
+#            fi
+#        done
+#        """
 
 rule run_lineage_assignment_freebayes:
     threads: 4
@@ -816,10 +843,9 @@ rule run_lineage_assignment_freebayes:
     input:
         p_vers = 'input_pangolin_versions.txt',
         n_vers = 'input_nextclade_versions.txt',
-        consensus = expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+        consensus = 'all_freebayes_genomes.fa'
     params:
         analysis_mode = pango_speed,
         assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py')
     shell:
-        'cat {input.consensus} > all_freebayes_genomes.fa && '
-        '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
+        '{params.assignment_script_path} -i {input.consensus} -t {threads} -o {output} -p {input.p_vers} -n {input.n_vers} --mode {params.analysis_mode} --skip'
diff --git a/signal.py b/signal.py
index b5e13bc..0b39b82 100755
--- a/signal.py
+++ b/signal.py
@@ -28,14 +28,14 @@ def create_parser():
 	parser.add_argument('--config-only', action='store_true', help="Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required")
 	parser.add_argument('--remove-freebayes', action='store_false', help="Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)")
 	parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)")
-	parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
+	parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
 	parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)")
 	parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'")
 	parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run")
 	parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run")
 	parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output')
 	parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done')
-	### add --quiet
+	parser.add_argument('-q', '--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs")
 	parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output")
 	parser.add_argument('-v', '--version', action='store_true', help="Display version number")
 	args, unknown = parser.parse_known_args()
@@ -275,13 +275,17 @@ def test_signal(data):
 		exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signal.py -h' for details!")
 	else:
 		if args.verbose: alt_options.append('--verbose')
+		if args.quiet: alt_options.append('--quiet')
 		if args.unlock: alt_options.append('--unlock')
 		if args.forceall: alt_options.append('--forceall')
 		if args.dry_run: alt_options.append('--dry-run')
 		if args.rerun_incomplete: alt_options.append('--rerun-incomplete')
 		opt = " ".join(alt_options)
 		for task in allowed:
-			if (allowed[task] is True) and (task != 'install'):
+			if allowed[task] is True:
+				if task == 'install':
+					print(f"Installing SIGNAL environments!")
+					exit()
 				print(f"Running SIGNAL {task}!")
 				try:
 					subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
@@ -294,10 +298,7 @@ def test_signal(data):
 						print("Retrying...")
 						subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
 					except subprocess.CalledProcessError:
-						exit(f"Something went wrong running SIGNAL {task}! Check input and try again!")
-			else:
-				print(f"Installing SIGNAL environments!")
-				
+						exit(f"Something went wrong running SIGNAL {task}! Check input and logs and try again!")
 				exit()
 	
-	exit("SIGNAL completed successfully!")
+	exit("SIGNAL run complete! Check corresponding snakemake logs for any details!")

From 882b7afe880f1afa009f9971def8fd565cae4a63 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Wed, 15 Mar 2023 13:31:50 +0000
Subject: [PATCH 05/28] update dependencies

---
 resources/dependencies | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/dependencies b/resources/dependencies
index b4f5097..e223532 100644
--- a/resources/dependencies
+++ b/resources/dependencies
@@ -4,8 +4,8 @@ rule all:
         "ivar",
         "snp_mapping",
         "trim_qc",
-        "assign_lineages.yaml",
-        "freebayes.yaml",
+        "assign_lineages",
+        "freebayes",
         "postprocessing"
     shell: "rm {input}"
 

From c89bea97942f25fdf93c1c76ae524d2940586836 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 02:55:43 +0000
Subject: [PATCH 06/28] remove nodejs

---
 conda_envs/assign_lineages.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda_envs/assign_lineages.yaml b/conda_envs/assign_lineages.yaml
index a5eecb8..fc4e8fd 100644
--- a/conda_envs/assign_lineages.yaml
+++ b/conda_envs/assign_lineages.yaml
@@ -11,7 +11,6 @@ dependencies:
   - python>=3.7
   - snakemake-minimal
   - gofasta
-  - nodejs
   - usher
   - pandas
   - pysam==0.16.0.1

From c44fc96bfd779f7a31a052facdbd8ee3dd5f80ae Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 02:56:23 +0000
Subject: [PATCH 07/28] add conditionals if dependency found

---
 scripts/get_data_dependencies.sh | 66 ++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/scripts/get_data_dependencies.sh b/scripts/get_data_dependencies.sh
index c525294..43bcf68 100755
--- a/scripts/get_data_dependencies.sh
+++ b/scripts/get_data_dependencies.sh
@@ -14,34 +14,42 @@ accession="MN908947.3"
 
 HELP="""
 Flags:
-    -d  :  Directory to configure database within (~10GB)
-    -a  :  Accession to use as viral reference (default=MN908947.3)
+	-d  :  Directory to configure database within (~10GB)
+	-a  :  Accession to use as viral reference (default=MN908947.3)
 """
 
 while getopts ":d:a:" option; do
-    case "${option}" in
-        d) database_dir=$OPTARG;;
-        a) accession=$OPTARG;;
-    esac
+	case "${option}" in
+		d) database_dir=$OPTARG;;
+		a) accession=$OPTARG;;
+	esac
 done
 
 if [ $database_dir = 0 ] ; then
-    echo "You must specify a data directory to install data dependencies."
-    echo "$HELP"
-    exit 1
+	echo "You must specify a data directory to install data dependencies."
+	echo "$HELP"
+	exit 1
 fi
 
 echo -e "Warning: \n - final databases require ~10GB of storage\n - building databases temporarily requires a peak of ~35GB of storage and ~4GB of memory \n - script takes up to ~1.5 hours (system depending)"
 
 # make database dir and get abspath to it
-mkdir -p $database_dir
+if [ ! -d $database_dir ]; then mkdir -p $database_dir; fi
 database_dir=$(realpath $database_dir)
 
 # use curl to grab "simple data dependencies"
-curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
-curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
-curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
-curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
+if [ ! -f $database_dir/'NexteraPE-PE.fa' ]; then
+	curl -s "https://raw.githubusercontent.com/timflutre/trimmomatic/3694641a92d4dd9311267fed85b05c7a11141e7c/adapters/NexteraPE-PE.fa" > $database_dir/NexteraPE-PE.fa
+fi
+if [ ! -f $database_dir/${accession}.gbk ]; then
+	curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=gb&retmode=txt" > $database_dir/$accession.gbk
+fi
+if [ ! -f $database_dir/${accession}.gff3 ]; then
+	curl -s "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${accession}" > $database_dir/$accession.gff3
+fi
+if [ ! -f $database_dir/${accession}.fasta ]; then
+	curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${accession}&rettype=fasta&retmode=txt" > $database_dir/$accession.fasta
+fi
 
 # install and activate env for kraken/bwa to build their databases/index
 CONDA_BASE=$($CONDA_EXE info --base)
@@ -51,19 +59,37 @@ conda activate data_dependencies
 
 # get the GRCh38 human genome
 # as per https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
-curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
-gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+if [ ! -f $database_dir/"GRC38_no_alt_analysis_set.fna" ]; then
+	curl -s "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" > $database_dir/GRC38_no_alt_analysis_set.fna.gz
+	gunzip $database_dir/GRC38_no_alt_analysis_set.fna.gz
+fi
 
 # create composite reference of human and virus for competitive bwt mapping 
 # based host removal
+if [ ! -f $database_dir/'composite_human_viral_reference.fna' ]; then
 cat $database_dir/GRC38_no_alt_analysis_set.fna $database_dir/$accession.fasta > $database_dir/composite_human_viral_reference.fna
-bwa index $database_dir/composite_human_viral_reference.fna
+fi
+for file in $database_dir/composite_human_viral_reference.fna.{amb,ann,bwt,pac,sa}; do
+	if [ ! -f $file ]; then
+		bwa index $database_dir/composite_human_viral_reference.fna
+		break
+	else
+		continue
+	fi
+done
 
 # get kraken2 viral db
 mkdir -p $database_dir/Kraken2/db
-curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
-cd $database_dir/Kraken2/db
-tar xvf k2_viral_20210517.tar.gz
+for file in $database_dir/Kraken2/db/{hash,opts,taxo}.k2d; do
+	if [ ! -f $file ]; then
+		curl -s "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20210517.tar.gz" > $database_dir/Kraken2/db/k2_viral_20210517.tar.gz
+		cd $database_dir/Kraken2/db
+		tar xvf k2_viral_20210517.tar.gz
+		break
+	else
+		continue
+	fi
+done
 
 # create blank fasta for 'phylo_include_seqs'
 touch $database_dir/blank.fasta

From 3b0a189caf0ea3c72ae4dd99c52b9120f3a16926 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 02:57:21 +0000
Subject: [PATCH 08/28] add install options and improved handling of failed
 samples

---
 Snakefile | 76 ++++++++++++++++++++++++++++++++++++++++---------------
 signal.py | 59 +++++++++++++++++++++++++++---------------
 2 files changed, 93 insertions(+), 42 deletions(-)

diff --git a/Snakefile b/Snakefile
index 1df78d6..4c45f1d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -378,7 +378,7 @@ rule run_trimgalore:
     shell:
         'trim_galore --quality {params.min_qual} --length {params.min_len} '
         ' -o {params.output_prefix} --cores {threads} --fastqc '
-        '--paired {input.raw_r1} {input.raw_r2} 2> {log} || touch {output}'
+        "--paired {input.raw_r1} {input.raw_r2} 2> {log} || (echo -e 'Total reads processed:  0\nReads written (passing filters):  0 (0.0%)\nTotal basepairs processed:  0 bp\nTotal written (filtered):  0 bp (0.0%)' >> {log}; touch {output})"
 
 rule run_filtering_of_residual_adapters:
     threads: 2
@@ -774,13 +774,28 @@ rule run_quast_freebayes:
 
 rule collect_core_genomes:
     output:
-        all = "all_genomes.fa",
-        #failed = "failed_samples.log"
+        "all_genomes.fa"
     input:
         expand(['{sn}/core/{sn}.consensus.fa'], sn=sample_names)
     shell:
         """
-        cat {input} > {output.all}
+        cat {input} > {output}
+        sample=''
+        count=''
+        echo "Samples that failed to assemble:" > failed_samples.log
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                sample=$(echo $line | cut -d'.' -f1 | cut -d'_' -f2)
+            else
+                count=$(echo $line | wc -c)
+                if [[ $count -eq 1 ]]; then
+                    echo $sample >> failed_samples.log
+                else
+                    continue
+                fi
+            fi
+        done < {output}
         """
 
 rule run_lineage_assignment:
@@ -812,28 +827,47 @@ rule run_lineage_assignment:
 
 rule collect_freebayes_genomes:
     output:
-        "all_freebayes_genomes.fa",
+        "all_freebayes_genomes.fa"
     input:
-        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
-#    params:
-#        failed = "failed_samples.log"
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
     shell:
         """
         cat {input} > {output}
+        sample=''
+        seq=''
+        count=''
+        out=''
+        if [[ -f 'failed_samples.log' ]]; then
+            out='.failed_freebayes_samples.tmp'
+            cat failed_samples.log | sed 1,1d > $out
+            echo "Samples that failed to assemble:" > failed_samples.log
+        else
+            out='failed_samples.log'
+            echo "Samples that failed to assemble:" > $out
+        fi
+        while read -r line;
+        do
+            if [[ $line =~ '>' ]]; then
+                if [[ $(echo $seq | wc -c) -eq 1 ]]; then # check if new seq
+                    count=$(echo $seq | grep -vc 'N')
+                    if [[ $count -eq 0 ]]; then
+                        echo $sample >> $out
+                    fi
+                    sample=$(echo $line | cut -d'>' -f2) # start new seq
+                    seq=''
+                else
+                    sample=$(echo $line | cut -d'>' -f2) # first seq
+                fi
+            else
+                seq+=$line # append seq
+            fi
+        done < {output}
+        
+        if [[ ! $out == 'failed_samples.log' ]]; then
+            sort -b -d -f $out | uniq >> failed_samples.log
+            rm $out
+        fi
         """
-#    shell:
-#        """
-#        samples=({input})
-#        for file in $samples; do
-#            s=$(basename $file | cut -d. -f1)
-#            count=$(cat $file | grep -v '>' | grep -cv 'N')
-#            if [[ -f $file ]] && [[ ! $count -eq 0 ]]; then
-#                cat $file >> {output}
-#            else
-#                echo $s >> {params.failed}
-#            fi
-#        done
-#        """
 
 rule run_lineage_assignment_freebayes:
     threads: 4
diff --git a/signal.py b/signal.py
index 0b39b82..1788cd0 100755
--- a/signal.py
+++ b/signal.py
@@ -19,7 +19,7 @@ def create_parser():
 	parser.add_argument('ncov_tools', nargs='*',
 						help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--directory' is provided")
 	parser.add_argument('install', nargs='*',
-						help="Install individual rule environments and ensure SIGNAL is functional")
+						help="Install individual rule environments and ensure SIGNAL is functional. The only parameters operable will be '--data' and '--skip-test'. Will override other operations!")
 	parser.add_argument('-c', '--configfile', type=check_file, default=None,
 						help="Configuration file (i.e., config.yaml) for SIGNAL analysis")
 	parser.add_argument('-d', '--directory', type=check_directory, default=None,
@@ -29,13 +29,15 @@ def create_parser():
 	parser.add_argument('--remove-freebayes', action='store_false', help="Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)")
 	parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)")
 	parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
-	parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other flags! (~10 GB storage required)")
-	parser.add_argument('--data', default='data', help="Data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'")
+	parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)")
+	parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'")
+	parser.add_argument('--skip-test', action='store_true', help='SIGNAL install parameter. Skip SIGNAL testing after environment installation using curated test data')
 	parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run")
+	parser.add_argument('-ii', '--ignore-incomplete', action='store_true', help='Snakemake parameter. Do not check for incomplete output files')
 	parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run")
 	parser.add_argument('-F', '--forceall', action='store_true', help='Snakemake parameter. Force the re-run of all rules regardless of prior output')
 	parser.add_argument('-n', '--dry-run', action='store_true', help='Snakemake parameter. Do not execute anything and only display what would be done')
-	parser.add_argument('-q', '--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs")
+	parser.add_argument('--quiet', action='store_true', help="Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run`, it will just display a summary of the DAG of jobs")
 	parser.add_argument('--verbose', action='store_true', help="Snakemake parameter. Display snakemake debugging output")
 	parser.add_argument('-v', '--version', action='store_true', help="Display version number")
 	args, unknown = parser.parse_known_args()
@@ -53,16 +55,7 @@ def create_parser():
 			allowed[val.lower()] = True
 		else:
 			print(f"Ignoring unknown command: {val}")
-	
-	# Unknown
-	# for x in unknown:
-		# filter out unknown options (like -b or --b or alll)
-		# exit with error
-		# if x.startswith(('-', '--')):
-			# parser.error(f"unknown argument {x}")
-		# identify what belongs where
-		# getattr(result, 'provided').append(x)
-	
+
 	return args, allowed
 	
 def check_directory(path: str) -> Path:
@@ -126,8 +119,7 @@ def write_sample_table(sample_data, output_table):
 		for sample in sample_data:
 			out_fh.write(",".join(sample) + '\n')
 
-def download_dependences(data):
-	dir_name = data
+def download_dependences(dir_name):
 	script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh')
 	subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3'])
 
@@ -238,8 +230,21 @@ def write_config_file(run_name, config_file, data_directory, opt_tasks):
 	with open(config_file, 'w') as fh:
 		fh.write(config)
 
-def test_signal(data):
-	pass
+def install_signal(data='data'):
+	"""
+	Install SIGNAL dependencies per rule and test using a sample dataset, if desired
+	"""
+	dep_snakefile = os.path.join(script_path, 'resources', 'dependancies')
+	assert os.path.exists(dep_snakefile)
+	try:
+		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend mamba --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet")
+	except subprocess.CalledProcessError: # likely missing mamba 
+		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend conda --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet")
+	
+	# Test SIGNAL with data
+	if os.path.exists(data):
+		pass
+	
 
 if __name__ == '__main__':
 	# note: add root_dir to determine the root directory of SIGNAL
@@ -251,11 +256,15 @@ def test_signal(data):
 	if args.version:
 		exit(f"{version}")
 	
+	if allowed['install']:
+		install_signal(args.data)
+		exit()
+	
 	if args.dependencies:
 		print("Downloading necessary reference and dependency files!")
 		download_dependences(args.data)
 		exit("Download complete!")
-	
+		
 	if args.configfile is None:
 		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
 		run_name = args.directory.name
@@ -280,9 +289,10 @@ def test_signal(data):
 		if args.forceall: alt_options.append('--forceall')
 		if args.dry_run: alt_options.append('--dry-run')
 		if args.rerun_incomplete: alt_options.append('--rerun-incomplete')
+		if args.ignore_incomplete: alt_options.append('--ignore-incomplete')
 		opt = " ".join(alt_options)
 		for task in allowed:
-			if allowed[task] is True:
+			if (allowed[task] is True) and (task != 'install'):
 				if task == 'install':
 					print(f"Installing SIGNAL environments!")
 					exit()
@@ -298,7 +308,14 @@ def test_signal(data):
 						print("Retrying...")
 						subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
 					except subprocess.CalledProcessError:
-						exit(f"Something went wrong running SIGNAL {task}! Check input and logs and try again!")
+						if task == 'all':
+							print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!")
+						elif task == 'postprocess':
+							print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!")
+						elif task == 'ncov_tools':
+							print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
+						else:
+							print(f"Some jobs failed while running SIGNAL {task}! Check inputs and logs and try again!")
 				exit()
 	
 	exit("SIGNAL run complete! Check corresponding snakemake logs for any details!")

From 26e2e09193631991ea96521e661d3afcdcae7d80 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 14:44:50 +0000
Subject: [PATCH 09/28] update ncov_tools linking to remove failed samples

---
 Snakefile             |  3 ++-
 scripts/ncov-tools.py | 35 +++++++++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/Snakefile b/Snakefile
index 4c45f1d..1c32c76 100644
--- a/Snakefile
+++ b/Snakefile
@@ -242,7 +242,8 @@ rule ncov_tools:
         negative_control_prefix = config['negative_control_prefix'],
         freebayes_run = config['run_freebayes'],
         pangolin = versions['pangolin'],
-        mode = pango_speed
+        mode = pango_speed,
+        failed = os.path.join(result_dir, 'failed_samples.log') 
     input:
         consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
         primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names),
diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py
index 76e5201..dc86baf 100755
--- a/scripts/ncov-tools.py
+++ b/scripts/ncov-tools.py
@@ -6,17 +6,21 @@
 import fileinput
 import glob
 
-def link_ivar(root, replace=False):
+def link_ivar(root, replace=False, neg, failed):
 	print("Linking iVar files to ncov-tools!")
 
 	for variants in snakemake.input['variants']:
 		sample = variants.split('/')[0]
+		if (sample in failed) and (sample not in neg):
+			continue
 		ln_path = f"{root}/{sample}.variants.tsv"
 		if (not os.path.exists(ln_path)) or (replace is True):
 			os.link(variants, ln_path)
 
 	for consensus in snakemake.input['consensus']:
 		sample = consensus.split('/')[0]
+		if (sample in failed) and (sample not in neg):
+			continue
 		ln_path = f"{root}/{sample}.consensus.fasta"
 		if (not os.path.exists(ln_path)) or (replace is True):
 			os.link(consensus, ln_path)
@@ -30,15 +34,17 @@ def link_ivar(root, replace=False):
 
 # take sample name from iVar results, redirect to where corresponding FreeBayes should be
 # if FreeBayes file cannot be found, break from loop, replace all with iVar
-def link_freebayes(root):
+def link_freebayes(root, neg, failed):
 	print("Linking FreeBayes files to ncov-tools!")
 
 	for variants in snakemake.input['variants']:
 		sample = variants.split('/')[0]
+		if (sample in failed) and (sample not in neg):
+			continue
 		expected_path = os.path.join(sample, 'freebayes', sample+'.variants.norm.vcf')
 		if not os.path.exists(expected_path):
 			print("Missing FreeBayes variant file! Switching to iVar input!")
-			link_ivar(root, True)
+			link_ivar(root, True, neg, failed)
 			break
 		else:
 			ln_path = f"{root}/{sample}.variants.vcf"
@@ -47,10 +53,12 @@ def link_freebayes(root):
 
 	for consensus in snakemake.input['consensus']:
 		sample = consensus.split('/')[0]
+		if (sample in failed) and (sample not in neg):
+			continue
 		expected_path = os.path.join(sample, 'freebayes', sample+'.consensus.fasta')
 		if not os.path.exists(expected_path):
 			print("Missing FreeBayes variant file! Switching to iVar input!")
-			link_ivar(root, True)
+			link_ivar(root, True, neg, failed)
 			break
 		else:
 			ln_path = f"{root}/{sample}.consensus.fasta"
@@ -99,6 +107,12 @@ def set_up():
 	neg_list = list(neg_samples)
 	print("Negative control samples found include: %s" %(neg_list))
 
+### Pull failed samples (SIGNAL log file: failed_samples.log)
+	if os.path.exists(snakemake.params['failed']):
+		with open(snakemake.params['failed']) as fail:
+			failed_list = [i.strip() for i in fail.readlines()[1:]]
+	else:
+		failed_list = []
 
 ### config.yaml parameters
 	config = {'data_root': f"'{data_root}'",
@@ -126,21 +140,26 @@ def set_up():
 	print("Linking alignment BAMs to ncov-tools!")
 	for bam in snakemake.input['bams']:
 		sample = bam.split('/')[0]
+		# if sample failed and not a negative, skip linking
+		if (sample in failed_list) and (sample not in neg_list):
+			continue
 		ln_path = f"{data_root}/{sample}.bam"
-		if (not os.path.exists(ln_path)) or (replace is True):
+		if not os.path.exists(ln_path):
 			os.link(bam, ln_path)
 
 	for primer_trimmed_bam in snakemake.input['primertrimmed_bams']:
 		sample = primer_trimmed_bam.split('/')[0]
+		if (sample in failed_list) and (sample not in neg_list):
+			continue
 		ln_path = f"{data_root}/{sample}.mapped.primertrimmed.sorted.bam"
-		if (not os.path.exists(ln_path)) or (replace is True):
+		if not os.path.exists(ln_path):
 			os.link(primer_trimmed_bam, ln_path)
 			
 	if snakemake.params['freebayes_run']:
-		link_freebayes(data_root)
+		link_freebayes(data_root, neg_list, failed_list)
 		config['variants_pattern'] = "'{data_root}/{sample}.variants.vcf'"
 	else:
-		link_ivar(data_root)
+		link_ivar(data_root, neg_list, failed_list)
 
 	with open(os.path.join(exec_dir, 'ncov-tools', 'config.yaml'), 'w') as fh:
 		for key, value in config.items():

From dea7448c55d165f9a328e53acf326eddcaf79faf Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 22:47:23 +0000
Subject: [PATCH 10/28] update linking of SIGNAL results prior to ncov-tools

---
 Snakefile                 | 2 +-
 scripts/ncov-tools.py     | 9 +++++----
 scripts/run_ncov_tools.sh | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Snakefile b/Snakefile
index 1c32c76..d807d96 100644
--- a/Snakefile
+++ b/Snakefile
@@ -243,7 +243,7 @@ rule ncov_tools:
         freebayes_run = config['run_freebayes'],
         pangolin = versions['pangolin'],
         mode = pango_speed,
-        failed = os.path.join(result_dir, 'failed_samples.log') 
+        failed = 'failed_samples.log'
     input:
         consensus = expand('{sn}/core/{sn}.consensus.fa', sn=sample_names),
         primertrimmed_bams = expand("{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam", sn=sample_names),
diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py
index dc86baf..e959778 100755
--- a/scripts/ncov-tools.py
+++ b/scripts/ncov-tools.py
@@ -6,7 +6,7 @@
 import fileinput
 import glob
 
-def link_ivar(root, replace=False, neg, failed):
+def link_ivar(root, neg, failed, replace=False):
 	print("Linking iVar files to ncov-tools!")
 
 	for variants in snakemake.input['variants']:
@@ -44,7 +44,7 @@ def link_freebayes(root, neg, failed):
 		expected_path = os.path.join(sample, 'freebayes', sample+'.variants.norm.vcf')
 		if not os.path.exists(expected_path):
 			print("Missing FreeBayes variant file! Switching to iVar input!")
-			link_ivar(root, True, neg, failed)
+			link_ivar(root, neg, failed, replace=True)
 			break
 		else:
 			ln_path = f"{root}/{sample}.variants.vcf"
@@ -58,7 +58,7 @@ def link_freebayes(root, neg, failed):
 		expected_path = os.path.join(sample, 'freebayes', sample+'.consensus.fasta')
 		if not os.path.exists(expected_path):
 			print("Missing FreeBayes variant file! Switching to iVar input!")
-			link_ivar(root, True, neg, failed)
+			link_ivar(root, neg, failed, replace=True)
 			break
 		else:
 			ln_path = f"{root}/{sample}.consensus.fasta"
@@ -113,6 +113,7 @@ def set_up():
 			failed_list = [i.strip() for i in fail.readlines()[1:]]
 	else:
 		failed_list = []
+	print("Failed samples found include: %s" %(failed_list))
 
 ### config.yaml parameters
 	config = {'data_root': f"'{data_root}'",
@@ -159,7 +160,7 @@ def set_up():
 		link_freebayes(data_root, neg_list, failed_list)
 		config['variants_pattern'] = "'{data_root}/{sample}.variants.vcf'"
 	else:
-		link_ivar(data_root, neg_list, failed_list)
+		link_ivar(data_root, neg_list, failed_list, replace=False)
 
 	with open(os.path.join(exec_dir, 'ncov-tools', 'config.yaml'), 'w') as fh:
 		for key, value in config.items():
diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh
index 2a5f098..5b26293 100755
--- a/scripts/run_ncov_tools.sh
+++ b/scripts/run_ncov_tools.sh
@@ -48,7 +48,7 @@ RESULTS=$PWD
 cd ../ncov-tools
 
 # run ncov-tools
-snakemake -s workflow/Snakefile --cores ${CORES} all
+snakemake -k -s workflow/Snakefile --cores ${CORES} all
 
 # move ncovresults to SIGNAL results directory
 mv ${SIGNAL}'_ncovresults' ${RESULTS}/ncov-tools-results

From ac8bfaef9f4b19c9372ccda5c233a51b4d926fe7 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 22:48:05 +0000
Subject: [PATCH 11/28] rename and restructure execution of snakemake + install

---
 signal.py => signalexe.py | 63 +++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 25 deletions(-)
 rename signal.py => signalexe.py (88%)

diff --git a/signal.py b/signalexe.py
similarity index 88%
rename from signal.py
rename to signalexe.py
index 1788cd0..8d1006d 100755
--- a/signal.py
+++ b/signalexe.py
@@ -3,10 +3,16 @@
 # v1.5.0+
 # signal.py assumes Snakefile is in current working directory (i.e., SIGNAL root)
 
+import signal
 import argparse
 import subprocess, os, sys
 import re
 from pathlib import Path
+import platform
+
+# for compatibility between platforms
+if platform.system() != 'Linux':
+	signal.SIGHUP = 1
 
 def create_parser():
 	allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False}
@@ -57,7 +63,14 @@ def create_parser():
 			print(f"Ignoring unknown command: {val}")
 
 	return args, allowed
-	
+
+def check_frontend():
+	try:
+		subprocess.check_call(['mamba', 'list'], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+		return 'mamba'
+	except subprocess.CalledProcessError:
+		return 'conda'
+
 def check_directory(path: str) -> Path:
 	"""
 	Check an input directory exists and is readable
@@ -230,16 +243,16 @@ def write_config_file(run_name, config_file, data_directory, opt_tasks):
 	with open(config_file, 'w') as fh:
 		fh.write(config)
 
-def install_signal(data='data'):
+def install_signal(frontend, data='data'):
 	"""
 	Install SIGNAL dependencies per rule and test using a sample dataset, if desired
 	"""
-	dep_snakefile = os.path.join(script_path, 'resources', 'dependancies')
+	dep_snakefile = os.path.join(script_path, 'resources', 'dependencies')
 	assert os.path.exists(dep_snakefile)
 	try:
-		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend mamba --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet")
+		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True)
 	except subprocess.CalledProcessError: # likely missing mamba 
-		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend conda --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet")
+		exit("Installation of environments failed!")
 	
 	# Test SIGNAL with data
 	if os.path.exists(data):
@@ -256,8 +269,10 @@ def install_signal(data='data'):
 	if args.version:
 		exit(f"{version}")
 	
+	conda_frontend = check_frontend() # 'mamba' or 'conda'
+	
 	if allowed['install']:
-		install_signal(args.data)
+		install_signal(conda_frontend, args.data)
 		exit()
 	
 	if args.dependencies:
@@ -293,29 +308,27 @@ def install_signal(data='data'):
 		opt = " ".join(alt_options)
 		for task in allowed:
 			if (allowed[task] is True) and (task != 'install'):
-				if task == 'install':
-					print(f"Installing SIGNAL environments!")
-					exit()
 				print(f"Running SIGNAL {task}!")
 				try:
-					subprocess.run(f"snakemake --conda-frontend mamba --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
-				except subprocess.CalledProcessError: # likely missing mamba 
+					subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
+				except subprocess.CalledProcessError:
 					if task == "ncov_tools":
 						check_submodule(os.getcwd())
-					if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
-						opt = " ".join(opt.split(" ")[:-1])
-					try:
-						print("Retrying...")
-						subprocess.run(f"snakemake --conda-frontend conda --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
-					except subprocess.CalledProcessError:
-						if task == 'all':
-							print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!")
-						elif task == 'postprocess':
-							print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!")
-						elif task == 'ncov_tools':
+						if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
+							opt = " ".join(opt.split(" ")[:-1])
+						try:
+							print("Retrying...ncov-tools!")
+							subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
+						except subprocess.CalledProcessError:
 							print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
-						else:
-							print(f"Some jobs failed while running SIGNAL {task}! Check inputs and logs and try again!")
-				exit()
+							continue
+					elif task == 'all':
+						print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!")
+						continue
+					elif task == 'postprocess':
+						print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!")
+						continue
+					else:
+						print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs and results and try again!")
 	
 	exit("SIGNAL run complete! Check corresponding snakemake logs for any details!")

From f59568c1a4dbc026fc3032c9da21cf816fbd7cf4 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 22:49:21 +0000
Subject: [PATCH 12/28] cleanup and replace script name

---
 signalexe.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/signalexe.py b/signalexe.py
index 8d1006d..5a0a4b4 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
-# v1.5.0+
-# signal.py assumes Snakefile is in current working directory (i.e., SIGNAL root)
+# v1.6.0+
+# signalexe.py assumes Snakefile is in current working directory (i.e., SIGNAL root)
 
 import signal
 import argparse
@@ -17,7 +17,7 @@
 def create_parser():
 	allowed = {'install': False, 'all': False, 'postprocess': False, 'ncov_tools': False}
 
-	parser = argparse.ArgumentParser(prog='signal.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).")
+	parser = argparse.ArgumentParser(prog='signalexe.py', description="SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).")
 	parser.add_argument('all', nargs='*',
 						help="Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--directory' required. The latter will automatically generate a configuration file and sample table. If both provided, then '--configfile' will take priority")
 	parser.add_argument('postprocess', nargs='*',
@@ -25,7 +25,7 @@ def create_parser():
 	parser.add_argument('ncov_tools', nargs='*',
 						help="Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--directory' is provided")
 	parser.add_argument('install', nargs='*',
-						help="Install individual rule environments and ensure SIGNAL is functional. The only parameters operable will be '--data' and '--skip-test'. Will override other operations!")
+						help="Install individual rule environments and ensure SIGNAL is functional. The only parameter operable will be '--data'. Will override other operations!")
 	parser.add_argument('-c', '--configfile', type=check_file, default=None,
 						help="Configuration file (i.e., config.yaml) for SIGNAL analysis")
 	parser.add_argument('-d', '--directory', type=check_directory, default=None,
@@ -36,8 +36,8 @@ def create_parser():
 	parser.add_argument('--add-breseq', action='store_true', help="Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)")
 	parser.add_argument('-neg', '--neg-prefix', default=None, help="Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will be left empty, if not provided")
 	parser.add_argument('--dependencies', action='store_true', help="Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)")
-	parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. Default = 'data'")
-	parser.add_argument('--skip-test', action='store_true', help='SIGNAL install parameter. Skip SIGNAL testing after environment installation using curated test data')
+	parser.add_argument('--data', default='data', help="SIGNAL install and data dependencies parameter. Set location for data dependancies. If '--dependancies' is run, a folder will be created in the specified directory. If '--config-only' or '--directory' is used, the value will be applied to the configuration file. (Upcoming feature): When used with 'SIGNAL install', any tests run will use the dependencies located at this directory. Default = 'data'")
+	#parser.add_argument('--enable-test', action='store_true', help='SIGNAL install parameter. Add SIGNAL testing after environment installation using curated test data')
 	parser.add_argument('-ri', '--rerun-incomplete', action='store_true', help="Snakemake parameter. Re-run any incomplete samples from a previously failed run")
 	parser.add_argument('-ii', '--ignore-incomplete', action='store_true', help='Snakemake parameter. Do not check for incomplete output files')
 	parser.add_argument('--unlock', action='store_true', help="Snakemake parameter. Remove a lock on the working directory after a failed run")
@@ -251,10 +251,10 @@ def install_signal(frontend, data='data'):
 	assert os.path.exists(dep_snakefile)
 	try:
 		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True)
-	except subprocess.CalledProcessError: # likely missing mamba 
+	except subprocess.CalledProcessError: 
 		exit("Installation of environments failed!")
 	
-	# Test SIGNAL with data
+	### TODO: Test SIGNAL with curated data
 	if os.path.exists(data):
 		pass
 	
@@ -273,7 +273,7 @@ def install_signal(frontend, data='data'):
 	
 	if allowed['install']:
 		install_signal(conda_frontend, args.data)
-		exit()
+		exit("Installation of environments completed successfully!")
 	
 	if args.dependencies:
 		print("Downloading necessary reference and dependency files!")
@@ -296,7 +296,7 @@ def install_signal(frontend, data='data'):
 		config_file = args.configfile
 	
 	if not any([allowed[x] for x in allowed]):
-		exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signal.py -h' for details!")
+		exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!")
 	else:
 		if args.verbose: alt_options.append('--verbose')
 		if args.quiet: alt_options.append('--quiet')
@@ -323,12 +323,12 @@ def install_signal(frontend, data='data'):
 							print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
 							continue
 					elif task == 'all':
-						print(f"Some jobs failed while running SIGNAL {task}! Samples that failed assembly can be found in 'failed_samples.log'! Otherwise, check your inputs and logs and try again!")
+						print(f"Some jobs failed while running SIGNAL {task}! This does NOT necessarily mean your run was erroneous! Samples that failed assembly can be found in 'failed_samples.log'! If no such file exists or is blank, check your inputs and logs and try again!")
 						continue
 					elif task == 'postprocess':
-						print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and try again!")
+						print(f"Some jobs failed while running SIGNAL {task}! Some output files may be missing! Check SIGNAL results and logs and try again!")
 						continue
 					else:
-						print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs and results and try again!")
+						print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs, logs, and results and try again!")
 	
 	exit("SIGNAL run complete! Check corresponding snakemake logs for any details!")

From 3343c73376b1df31bb782b1e63daaf660b81b4cf Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 22:58:34 +0000
Subject: [PATCH 13/28] correct exit codes

---
 signalexe.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/signalexe.py b/signalexe.py
index 5a0a4b4..093659f 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -120,7 +120,8 @@ def check_submodule(exec_dir):
 			print("Updating ncov-tools!")
 			subprocess.run(['git', 'submodule', 'update', '--init', '--recursive'])
 		except subprocess.CalledProcessError:
-			exit("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!")
+			print("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!")
+			sys.exit(1)
 		
 def write_sample_table(sample_data, output_table):
 	"""
@@ -252,7 +253,8 @@ def install_signal(frontend, data='data'):
 	try:
 		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True)
 	except subprocess.CalledProcessError: 
-		exit("Installation of environments failed!")
+		print("Installation of environments failed!")
+		sys.exit(1)
 	
 	### TODO: Test SIGNAL with curated data
 	if os.path.exists(data):
@@ -267,21 +269,26 @@ def install_signal(frontend, data='data'):
 	alt_options = []
 	
 	if args.version:
-		exit(f"{version}")
+		print(f"{version}")
+		sys.exit(0)
 	
 	conda_frontend = check_frontend() # 'mamba' or 'conda'
 	
 	if allowed['install']:
 		install_signal(conda_frontend, args.data)
-		exit("Installation of environments completed successfully!")
+		print("Installation of environments completed successfully!")
+		sys.exit(0)
 	
 	if args.dependencies:
 		print("Downloading necessary reference and dependency files!")
 		download_dependences(args.data)
-		exit("Download complete!")
+		print("Complete!")
+		sys.exit(0)
 		
 	if args.configfile is None:
-		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
+		if args.directory is None, 
+			print("Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!")
+			sys.exit(1)
 		run_name = args.directory.name
 		generate_sample_table(args.directory, run_name)
 		config_file = run_name + "_config.yaml"
@@ -291,12 +298,14 @@ def install_signal(frontend, data='data'):
 			neg = [args.neg_prefix]
 		write_config_file(run_name, config_file, args.data, [args.add_breseq, args.remove_freebayes, neg])
 		if args.config_only:
-			exit("Configuration file and sample table generated!")
+			print("Configuration file and sample table generated!")
+			sys.exit(0)
 	else:
 		config_file = args.configfile
 	
 	if not any([allowed[x] for x in allowed]):
-		exit("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!")
+		print("No task specified! Please provide at least one of 'all', 'postprocess', or 'ncov_tools'! See 'signalexe.py -h' for details!")
+		sys.exit(1)
 	else:
 		if args.verbose: alt_options.append('--verbose')
 		if args.quiet: alt_options.append('--quiet')
@@ -331,4 +340,5 @@ def install_signal(frontend, data='data'):
 					else:
 						print(f"Some jobs failed while running SIGNAL {task}! Check SIGNAL inputs, logs, and results and try again!")
 	
-	exit("SIGNAL run complete! Check corresponding snakemake logs for any details!")
+	print("SIGNAL run complete! Check corresponding snakemake logs for any details!")
+	sys.exit(0)

From 930bf1750a597f0e9dd807505f9e86f128a42c57 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 22:59:49 +0000
Subject: [PATCH 14/28] fix syntax

---
 signalexe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signalexe.py b/signalexe.py
index 093659f..570b885 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -286,7 +286,7 @@ def install_signal(frontend, data='data'):
 		sys.exit(0)
 		
 	if args.configfile is None:
-		if args.directory is None, 
+		if args.directory is None: 
 			print("Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!")
 			sys.exit(1)
 		run_name = args.directory.name

From 97d2a40ea30509f9016a29ea539898db3bf33820 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 23:04:58 +0000
Subject: [PATCH 15/28] fix assertion syntax

---
 signalexe.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/signalexe.py b/signalexe.py
index 570b885..50dd081 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -283,11 +283,9 @@ def install_signal(frontend, data='data'):
 		print("Downloading necessary reference and dependency files!")
 		download_dependences(args.data)
 		print("Complete!")
-		sys.exit(0)
 		
-	if args.configfile is None:
-		if args.directory is None: 
-			print("Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!")
+	if (args.configfile is None) and (not allowed['install']):
+		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
 			sys.exit(1)
 		run_name = args.directory.name
 		generate_sample_table(args.directory, run_name)

From 289d0d5d06d2373680fc2be014b3d6ef994aeb28 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 23:05:55 +0000
Subject: [PATCH 16/28] remove stale exit

---
 signalexe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/signalexe.py b/signalexe.py
index 50dd081..6f6be4b 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -286,7 +286,6 @@ def install_signal(frontend, data='data'):
 		
 	if (args.configfile is None) and (not allowed['install']):
 		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
-			sys.exit(1)
 		run_name = args.directory.name
 		generate_sample_table(args.directory, run_name)
 		config_file = run_name + "_config.yaml"

From 33d668b161e9951c101165f3e62234f391c14b48 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 23:12:02 +0000
Subject: [PATCH 17/28] fix typo leading to unwanted AssertionError

---
 signalexe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/signalexe.py b/signalexe.py
index 6f6be4b..1347b1e 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -133,7 +133,7 @@ def write_sample_table(sample_data, output_table):
 		for sample in sample_data:
 			out_fh.write(",".join(sample) + '\n')
 
-def download_dependences(dir_name):
+def download_dependencies(dir_name):
 	script = os.path.join(script_path, 'scripts', 'get_data_dependencies.sh')
 	subprocess.run(['bash', script, '-d', dir_name, '-a', 'MN908947.3'])
 
@@ -281,10 +281,10 @@ def install_signal(frontend, data='data'):
 	
 	if args.dependencies:
 		print("Downloading necessary reference and dependency files!")
-		download_dependences(args.data)
+		download_dependencies(args.data)
 		print("Complete!")
 		
-	if (args.configfile is None) and (not allowed['install']):
+	if args.configfile is None:
 		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"
 		run_name = args.directory.name
 		generate_sample_table(args.directory, run_name)

From 9fa7c7ac0e05ce41f0339f0d801e9545ce55e8d6 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 23:16:40 +0000
Subject: [PATCH 18/28] add missing exit code

---
 signalexe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/signalexe.py b/signalexe.py
index 1347b1e..3e94174 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -283,6 +283,7 @@ def install_signal(frontend, data='data'):
 		print("Downloading necessary reference and dependency files!")
 		download_dependencies(args.data)
 		print("Complete!")
+		sys.exit(0)
 		
 	if args.configfile is None:
 		assert args.directory is not None, "Please provide '--directory' to proceed! ('--configfile' if a configuration file already exists!)"

From e716438e2dbf616142785bbc9e8899c367146dca Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Thu, 16 Mar 2023 23:33:14 +0000
Subject: [PATCH 19/28] make installation output verbose

---
 signalexe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signalexe.py b/signalexe.py
index 3e94174..85701d2 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -251,7 +251,7 @@ def install_signal(frontend, data='data'):
 	dep_snakefile = os.path.join(script_path, 'resources', 'dependencies')
 	assert os.path.exists(dep_snakefile)
 	try:
-		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda --quiet", shell=True, check=True)
+		subprocess.run(f"snakemake -s {dep_snakefile} --conda-frontend {frontend} --cores 1 --use-conda --conda-prefix=$PWD/.snakemake/conda", shell=True, check=True)
 	except subprocess.CalledProcessError: 
 		print("Installation of environments failed!")
 		sys.exit(1)

From 055d11453a54545d95ca2feb074408a26506df10 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 01:01:38 +0000
Subject: [PATCH 20/28] update ncov_tools re-run condition

---
 scripts/run_ncov_tools.sh |  6 +++---
 signalexe.py              | 21 ++++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh
index 5b26293..48cb2a8 100755
--- a/scripts/run_ncov_tools.sh
+++ b/scripts/run_ncov_tools.sh
@@ -35,9 +35,9 @@ if [ $1 = 'help' ]; then
 fi
 
 if [ $SIGNAL = 0 ] ; then
-    echo "You must specify the name of the directory holding SIGNAL results."
-    echo "$HELP"
-    exit 1
+	echo "You must specify the name of the directory holding SIGNAL results."
+	echo "$HELP"
+	exit 1
 fi
 
 # Start point for executing from ncov-tools.py is SIGNAL results directory
diff --git a/signalexe.py b/signalexe.py
index 85701d2..1d10df6 100755
--- a/signalexe.py
+++ b/signalexe.py
@@ -119,9 +119,12 @@ def check_submodule(exec_dir):
 		try:
 			print("Updating ncov-tools!")
 			subprocess.run(['git', 'submodule', 'update', '--init', '--recursive'])
+			return True
 		except subprocess.CalledProcessError:
 			print("Could not find nor update the required 'ncov-tools' directory! Manually download/update and try again!")
 			sys.exit(1)
+	else:
+		return False
 		
 def write_sample_table(sample_data, output_table):
 	"""
@@ -320,13 +323,17 @@ def install_signal(frontend, data='data'):
 					subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp {opt}", shell=True, check=True)
 				except subprocess.CalledProcessError:
 					if task == "ncov_tools":
-						check_submodule(os.getcwd())
-						if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
-							opt = " ".join(opt.split(" ")[:-1])
-						try:
-							print("Retrying...ncov-tools!")
-							subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
-						except subprocess.CalledProcessError:
+						mod = check_submodule(os.getcwd())
+						if mod:
+							if opt.split(" ")[-1] == '--rerun-incomplete': # remove redundant flag
+								opt = " ".join(opt.split(" ")[:-1])
+							try:
+								print("Retrying...ncov-tools!")
+								subprocess.run(f"snakemake --conda-frontend {conda_frontend} --configfile {config_file} --cores={args.cores} --use-conda --conda-prefix=$PWD/.snakemake/conda {task} -kp --rerun-incomplete {opt}", shell=True, check=True)
+							except subprocess.CalledProcessError:
+								print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
+								continue
+						else:
 							print(f"Some jobs failed while running SIGNAL {task}! Check snakemake logs and the ncov-tools directory for additional details!")
 							continue
 					elif task == 'all':

From a5d9539e25bce92f72d6805b9a5ba0cb71f87a8e Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 02:52:35 +0000
Subject: [PATCH 21/28] add script to pull select data

---
 scripts/get_signal_results.sh | 114 ++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100755 scripts/get_signal_results.sh

diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh
new file mode 100755
index 0000000..a44825e
--- /dev/null
+++ b/scripts/get_signal_results.sh
@@ -0,0 +1,114 @@
+#!/bin/env bash
+
+shopt -s extglob
+
+source=0
+destination=0
+move='false'
+
+HELP="""
+Usage:
+bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m]
+
+This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
+
+The following files will be transferred over to the specified  destination directory (if found):
+SIGNAL 'all' & 'postprocess':
+-> signal-results/<sample>/<sample>_sample.txt
+-> signal-results/<sample>/core/<sample>.consensus.fa
+-> signal-results/<sample>/core/<sample>_ivar_variants.tsv
+-> signal-results/<sample>/freebayes/<sample>.consensus.fasta
+-> signal-results/<sample>/freebayes/<sample>.variants.norm.vcf
+
+'ncov_tools':
+-> ncov_tools-results/qc_annotation/<sample>.ann.vcf
+-> ncov-tools-results/qc_reports/<run_name>_ambiguous_position_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_mixture_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_ncov_watch_variants.tsv
+-> ncov-tools-results/qc_reports/<run_name>_negative_control_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_summary_qc.tsv
+
+Flags:
+	-s  :  SIGNAL results directory
+	-d  :  Directory where summary will be outputted
+	-m  :  Invoke 'mv' command instead of 'rsync' copying of results. Optional
+"""
+
+while getopts ":s:d:m" option; do
+	case "${option}" in
+		s) source=$OPTARG;;
+		d) destination=$OPTARG;;
+		m) move='true';;
+	esac
+done
+
+
+if [ $source = 0 ] || [ $destination = 0 ] ; then
+	echo "You must specify both source and destination locations."
+	echo "$HELP"
+	exit 1
+fi
+
+if [ ! -d $destination ]; then
+	echo "Invalid destination directory!"
+	exit 1
+fi
+
+if [ ! -f $source/summary.html ] && [ ! -f $source/summary.zip ]; then
+	echo "Invalid SIGNAL directory! Make sure you've run SIGNAL 'all' and 'postprocess'!"
+	exit 1
+else
+	run_name=$(basename $source)
+	final_dir=${destination}/${run_name}
+	mkdir -p $final_dir/signal-results
+fi
+	
+if ${move}; then
+	cmd='mv'
+else
+	cmd='rsync -avh'
+	# rsync -avh
+fi
+
+echo -e "We will use ${cmd} for your files!"
+
+### SIGNAL results_dir
+for file in $source/*; do 
+	if [ -d $file ]; then # results_dir/sample
+		sample=$(basename $file) # sample name, within contain our files
+		sample_dest=${final_dir}/'signal-results'/${sample}
+		if [[ ! $sample == 'ncov-tools-results' ]]; then
+			mkdir -p $sample_dest
+		fi
+		for d in $file/*; do
+			name=$(basename $d)
+			if [ -d $d ] && [[ $name == 'core' ]]; then
+				mkdir -p $sample_dest/core
+				$cmd ${d}/${sample}.consensus.fa $sample_dest/core/${sample}.consensus.fa
+				$cmd ${d}/${sample}_ivar_variants.tsv $sample_dest/core/${sample}_ivar_variants.tsv
+			elif [ -d $d ] && [[ $name == 'freebayes' ]]; then
+				mkdir -p $sample_dest/freebayes
+				$cmd ${d}/${sample}.consensus.fasta $sample_dest/freebayes/${sample}.consensus.fasta 
+				$cmd ${d}/${sample}.variants.norm.vcf $sample_dest/freebayes/${sample}.variants.norm.vcf
+			elif [ -f $d ] && [[ $name =~ '_sample.txt' ]]; then
+				$cmd ${d} $sample_dest/$(basename $d)
+			else
+				continue
+			fi
+		done
+	fi
+done
+
+echo "Files from SIGNAL transferred!"
+
+### NCOV-TOOLS
+if [ ! -d $source/ncov-tools-results ]; then
+	echo "No ncov-tools-results directory found!"
+else
+	ncov_dest=${final_dir}/ncov-tools-results
+	mkdir -p $ncov_dest/qc_{annotation,reports}
+	$cmd $source/ncov-tools-results/qc_reports/* $ncov_dest/qc_reports
+	$cmd $source/ncov-tools-results/qc_annotation/*.ann.vcf  $ncov_dest/qc_annotation
+	
+	echo "Files from ncov-tools transferred!"
+fi
\ No newline at end of file

From 34da4219d366ed40d55e430a5a463544df334cbf Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 03:19:42 +0000
Subject: [PATCH 22/28] add cp command option

---
 scripts/get_signal_results.sh | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh
index a44825e..428253c 100755
--- a/scripts/get_signal_results.sh
+++ b/scripts/get_signal_results.sh
@@ -5,14 +5,15 @@ shopt -s extglob
 source=0
 destination=0
 move='false'
+copy='false'
 
 HELP="""
 Usage:
-bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m]
+bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m] [-c]
 
 This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
 
-The following files will be transferred over to the specified  destination directory (if found):
+The following files will be transferred over to the specified destination directory (if found):
 SIGNAL 'all' & 'postprocess':
 -> signal-results/<sample>/<sample>_sample.txt
 -> signal-results/<sample>/core/<sample>.consensus.fa
@@ -20,7 +21,7 @@ SIGNAL 'all' & 'postprocess':
 -> signal-results/<sample>/freebayes/<sample>.consensus.fasta
 -> signal-results/<sample>/freebayes/<sample>.variants.norm.vcf
 
-'ncov_tools':
+SIGNAL 'ncov_tools':
 -> ncov_tools-results/qc_annotation/<sample>.ann.vcf
 -> ncov-tools-results/qc_reports/<run_name>_ambiguous_position_report.tsv
 -> ncov-tools-results/qc_reports/<run_name>_mixture_report.tsv
@@ -31,14 +32,16 @@ SIGNAL 'all' & 'postprocess':
 Flags:
 	-s  :  SIGNAL results directory
 	-d  :  Directory where summary will be outputted
-	-m  :  Invoke 'mv' command instead of 'rsync' copying of results. Optional
+	-m  :  Invoke 'mv' move command instead of 'rsync' copying of results. Optional
+	-c  :  Invoke 'cp' copy command instead of 'rsync' copying of results. Optional
 """
 
-while getopts ":s:d:m" option; do
+while getopts ":s:d:mc" option; do
 	case "${option}" in
 		s) source=$OPTARG;;
 		d) destination=$OPTARG;;
 		m) move='true';;
+		c) copy='true';;
 	esac
 done
 
@@ -63,8 +66,13 @@ else
 	mkdir -p $final_dir/signal-results
 fi
 	
-if ${move}; then
+if [ ${move} = true ] && [ ${copy} = true ]; then
+	echo -e "Only pick one of '-m' or '-c' depending on whether you wish to move or copy files, respectively"
+	exit
+elif [ ${move} = true ] && [ ${copy} = false ]; then
 	cmd='mv'
+elif [ ${move} = false ] && [ ${copy} = true ]; then
+	cmd='cp'
 else
 	cmd='rsync -avh'
 	# rsync -avh

From 57599751e7159c651cba48e8ab34852e3bfae0d3 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 03:21:10 +0000
Subject: [PATCH 23/28] update help to include cp

---
 scripts/get_signal_results.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get_signal_results.sh b/scripts/get_signal_results.sh
index 428253c..0936c39 100755
--- a/scripts/get_signal_results.sh
+++ b/scripts/get_signal_results.sh
@@ -11,7 +11,7 @@ HELP="""
 Usage:
 bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m] [-c]
 
-This scripts aims to copy (rsync by default) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
+This scripts aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
 
 The following files will be transferred over to the specified destination directory (if found):
 SIGNAL 'all' & 'postprocess':

From 081c8438c77730f1fc8b03abf0c1b6e5fbf11086 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 14:46:16 +0000
Subject: [PATCH 24/28] update nextclade dataset updating

---
 scripts/assign_lineages.py | 55 ++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/scripts/assign_lineages.py b/scripts/assign_lineages.py
index 43f2bfd..0ba5d3c 100755
--- a/scripts/assign_lineages.py
+++ b/scripts/assign_lineages.py
@@ -8,6 +8,7 @@
 import shutil
 import os, sys
 from datetime import datetime
+import json
 
 
 def check_file(path: str) -> Path:
@@ -136,27 +137,53 @@ def update_nextclade_dataset(vers, skip):
 
 	# If specific tag requested, attempt to install, otherwise install latest
 	accession = 'MN908947'
+	current_tag = None
+	if os.path.exists(os.path.join(output_dir, 'tag.json')):
+		j = open(os.path.join(output_dir, 'tag.json'))
+		data = json.load(j)
+		current_tag = data['tag']
+		j.close()
 	if requested is not None:
+		# check existing database, if found
+			if requested == current_tag:
+				print(f"Nextclade dataset {requested} already installed! Skipping update!")
+			else:
+				try:
+					print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+					subprocess.run(f"nextclade dataset get "
+								f"--name '{dataset}' "
+								f"--reference '{accession}' "
+								f"--tag {requested} "
+								f"--output-dir '{output_dir}'", shell=True, check=True)
+				except subprocess.CalledProcessError:
+					print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
+					try:
+						subprocess.run(f"nextclade dataset get "
+									f"--name '{dataset}' "
+									f"--reference '{accession}' "
+									f"--output-dir '{output_dir}'", shell=True, check=True)
+					except subprocess.CalledProcessError:
+						if current_tag is not None:
+							print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+							requested = current_tag
+						else:
+							print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+							requested = "Unknown"
+	else:
 		try:
-			print(f"\nDownloading Nextclade {dataset} dataset tagged {requested} for reference {accession}!")
+			print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
 			subprocess.run(f"nextclade dataset get "
 						f"--name '{dataset}' "
 						f"--reference '{accession}' "
-						f"--tag {requested} "
 						f"--output-dir '{output_dir}'", shell=True, check=True)
 		except subprocess.CalledProcessError:
-			print(f"\nDatabase not found! Please check whether {requested} tag exists! Downloading latest Nextclade {dataset} dataset for reference {accession}...")
-			subprocess.run(f"nextclade dataset get "
-						f"--name '{dataset}' "
-						f"--reference '{accession}' "
-						f"--output-dir '{output_dir}'", shell=True, check=True)
-	else:
-		print(f"\nDownloading latest Nextclade {dataset} dataset for reference {accession}!")
-		subprocess.run(f"nextclade dataset get "
-					f"--name '{dataset}' "
-					f"--reference '{accession}' "
-					f"--output-dir '{output_dir}'", shell=True, check=True)
-
+			if current_tag is not None:
+				print(f"Something went wrong updating the Nextclade dataset, using {current_tag} instead!")
+				requested = current_tag
+			else:
+				print(f"Something went wrong updating the Nextclade dataset! No database could be found which may result in errors! Skipping update...")
+				requested = "Unknown"
+	
 	# Obtain final version information for output
 	nextclade_version = subprocess.run(f"nextclade --version".split(), stdout=subprocess.PIPE).stdout.decode('utf-8').strip().lower()
 	if nextclade_version.startswith("nextclade"):

From f31132224cb539e83f39d374f0522fbb2f360bfc Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 15:30:07 +0000
Subject: [PATCH 25/28] update pangolin version pull from install rather than
 online

---
 scripts/ncov-tools.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py
index e959778..48148bb 100755
--- a/scripts/ncov-tools.py
+++ b/scripts/ncov-tools.py
@@ -83,10 +83,26 @@ def set_up():
 	try:
 		assert pangolin == "3" or pangolin == "4" # directly supported versions
 	except AssertionError:
-		import urllib.request as web
-		commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl()
-		pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v") 
+		# import urllib.request as web
+		# commit_url = web.urlopen(f"https://github.com/cov-lineages/pangolin/releases/latest").geturl()
+		# pangolin = commit_url.split("/")[-1].split(".")[0].lower().strip("v") 
 		# latest version (should ensure temporary compatibility)
+		installed_versions = subprocess.run(["pangolin", "--all-versions"],
+								check=True,
+								stdout=subprocess.PIPE)
+		installed_versions = installed_versions.stdout.decode('utf-8')
+		installed_ver_dict = {}
+		for dep_ver in map(str.strip, installed_versions.split('\n')):
+		# skip empty line at end
+			if len(dep_ver) == 0:
+				continue
+			try:
+				dependency, version = dep_ver.split(': ')
+			except ValueError:
+				continue
+			if dependency == 'pangolin':
+				pangolin = str(version).split(".",1)[0].strip('v')
+				break
 
 ### Create data directory within ncov-tools
 	data_root = os.path.abspath(os.path.join(exec_dir, 'ncov-tools', "%s" %(result_dir)))

From 12732fcfbe2ec83e4e21b257f2d1cbeee65ea759 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 16:56:38 +0000
Subject: [PATCH 26/28] add cleanup

---
 scripts/ncov-tools.py     | 8 +++++---
 scripts/run_ncov_tools.sh | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/scripts/ncov-tools.py b/scripts/ncov-tools.py
index 48148bb..16cea46 100755
--- a/scripts/ncov-tools.py
+++ b/scripts/ncov-tools.py
@@ -182,7 +182,7 @@ def set_up():
 		for key, value in config.items():
 			fh.write(f"{key}: {value}\n")
 			
-	return exec_dir, result_dir
+	return exec_dir, result_dir, data_root
 
 def run_all():
 	os.system(f"snakemake -s workflow/Snakefile --cores {snakemake.threads} all")
@@ -221,12 +221,14 @@ def move(cwd, dest, prefix):
 		print("Missing ncov-tools 'qc_analysis' directory")
 
 if __name__ == '__main__':
-	exec_dir, result_dir = set_up()
+	exec_dir, result_dir, data_root = set_up()
 	run_script = os.path.join(exec_dir, 'scripts', 'run_ncov_tools.sh')
 	#print("Don't forget to update the config.yaml file as needed prior to running ncov-tools.")
 	print("Running ncov-tools using %s cores!" %(snakemake.threads))
 
 	subprocess.run([run_script, '-c', str(snakemake.threads), '-s', str(result_dir)])
-
+	
+	# clean up
+	shutil.rmtree(data_root)
 	#run_all()
 	#move(exec_dir, result_root, result_dir)
diff --git a/scripts/run_ncov_tools.sh b/scripts/run_ncov_tools.sh
index 48cb2a8..5a68aad 100755
--- a/scripts/run_ncov_tools.sh
+++ b/scripts/run_ncov_tools.sh
@@ -50,7 +50,7 @@ cd ../ncov-tools
 # run ncov-tools
 snakemake -k -s workflow/Snakefile --cores ${CORES} all
 
-# move ncovresults to SIGNAL results directory
+# move ncovresults to SIGNAL results directory and clean up
 mv ${SIGNAL}'_ncovresults' ${RESULTS}/ncov-tools-results
 
 # return success

From d793fba3cf5f7b01b69b65c369bdcc44d95d7f18 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 17:39:19 +0000
Subject: [PATCH 27/28] update README

---
 README.md | 146 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 101 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index f9cbea7..546f62c 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ so alternatively install `mamba` and use that (snakemake has beta support for it
         conda install -c conda-forge mamba
         mamba create -c conda-forge -c bioconda -n signal snakemake pandas conda mamba
         conda activate signal
+        # mamba activate signal is equivalent
 
 Additional software dependencies are managed directly by `snakemake` using conda environment files:
 
@@ -79,31 +80,27 @@ Additional software dependencies are managed directly by `snakemake` using conda
 
 ## SIGNAL Help Screen:
 
-Using the provided `signal.py` script, the majority of SIGNAL functions can be accessed easily.
+Using the provided `signalexe.py` script, the majority of SIGNAL functions can be accessed easily.
 
 To display the help screen:
 
 ```
-python signal.py -h
+python signalexe.py -h
 
-Output:
-usage: signal.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq]
-                 [-neg NEG_PREFIX] [--dependencies] [-ri] [--unlock] [-F] [-n] [--verbose] [-v]
-                 [all ...] [postprocess ...] [ncov_tools ...]
+usage: signalexe.py [-h] [-c CONFIGFILE] [-d DIRECTORY] [--cores CORES] [--config-only] [--remove-freebayes] [--add-breseq] [-neg NEG_PREFIX] [--dependencies] [--data DATA] [-ri] [-ii] [--unlock]
+                    [-F] [-n] [--quiet] [--verbose] [-v]
+                    [all ...] [postprocess ...] [ncov_tools ...] [install ...]
 
-SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly +
-variant calling for ongoing surveillance and research efforts towards the emergent coronavirus: Severe Acute Respiratory Syndrome
-Coronavirus 2 (SARS-CoV-2).
+SARS-CoV-2 Illumina GeNome Assembly Line (SIGNAL) aims to take Illumina short-read sequences and perform consensus assembly + variant calling for ongoing surveillance and research efforts towards
+the emergent coronavirus: Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2).
 
 positional arguments:
-  all                   Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--
-                        directory' required. The latter will automatically generate a configuration file and sample table. If
-                        both provided, then '--configfile' will take priority
-  postprocess           Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '
-                        --directory' is provided
-  ncov_tools            Generate configuration file and filesystem setup required and then execute ncov-tools quality control
-                        assessment. Requires 'ncov-tools' submodule! '--configfile' is required but will be generated if '--
-                        directory' is provided
+  all                   Run SIGNAL with all associated assembly rules. Does not include postprocessing '--configfile' or '--directory' required. The latter will automatically generate a
+                        configuration file and sample table. If both provided, then '--configfile' will take priority
+  postprocess           Run SIGNAL postprocessing on completed SIGNAL run. '--configfile' is required but will be generated if '--directory' is provided
+  ncov_tools            Generate configuration file and filesystem setup required and then execute ncov-tools quality control assessment. Requires 'ncov-tools' submodule! '--configfile' is required
+                        but will be generated if '--directory' is provided
+  install               Install individual rule environments and ensure SIGNAL is functional. The only parameter operable will be '--data'. Will override other operations!
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -113,49 +110,57 @@ optional arguments:
                         Path to directory containing reads. Will be used to generate sample table and configuration file
   --cores CORES         Number of cores. Default = 1
   --config-only         Generate sample table and configuration file (i.e., config.yaml) and exit. '--directory' required
-  --remove-freebayes    Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall
-                        speed)
-  --add-breseq          Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for
-                        analysis to complete)
+  --remove-freebayes    Configuration file generator parameter. Set flag to DISABLE freebayes variant calling (improves overall speed)
+  --add-breseq          Configuration file generator parameter. Set flag to ENABLE optional breseq step (will take more time for analysis to complete)
   -neg NEG_PREFIX, --neg-prefix NEG_PREFIX
-                        Configuration file generator parameter. Comma-separated list of negative sontrol sample name(s) or
-                        prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc. Recommended if running ncov-tools. Will
-                        be left empty, if not provided
-  --dependencies        Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit.
-                        Note: Will override other flags! (~10 GB storage required)
+                        Configuration file generator parameter. Comma-separated list of negative control sample name(s) or prefix(es). For example, 'Blank' will cover Blank1, Blank2, etc.
+                        Recommended if running ncov-tools. Will be left empty, if not provided
+  --dependencies        Download data dependencies (under a created 'data' directory) required for SIGNAL analysis and exit. Note: Will override other parameters! (~10 GB storage required)
+  --data DATA           SIGNAL install and data dependencies parameter. Set location for data dependencies. If '--dependencies' is run, a folder will be created in the specified directory. If '--
+                        config-only' or '--directory' is used, the value will be applied to the configuration file. (Upcoming feature): When used with 'SIGNAL install', any tests run will use the
+                        dependencies located at this directory. Default = 'data'
   -ri, --rerun-incomplete
                         Snakemake parameter. Re-run any incomplete samples from a previously failed run
+  -ii, --ignore-incomplete
+                        Snakemake parameter. Do not check for incomplete output files
   --unlock              Snakemake parameter. Remove a lock on the working directory after a failed run
   -F, --forceall        Snakemake parameter. Force the re-run of all rules regardless of prior output
   -n, --dry-run         Snakemake parameter. Do not execute anything and only display what would be done
+  --quiet               Snakemake parameter. Do not output any progress or rule information. If used with '--dry-run', it will just display a summary of the DAG of jobs
   --verbose             Snakemake parameter. Display snakemake debugging output
   -v, --version         Display version number
 ```
 
 ## Summary:
 
-`signal.py` simplies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads.
+`signalexe.py` simplifies the execution of all functions of SIGNAL. At its simplest, SIGNAL can be run with one line, provided only the directory of sequencing reads.
 
 ```
 # Download dependances (only needs to be run once; ~10GB of storage required)
-python signal.py --dependencies
+# --data flag allows you to rename and relocate dependencies directory
+python signalexe.py --data data --dependencies
 
-# Generate configuration file and sample table (--neg_prefix can be used to note negative controls)
-python signal.py --config-only --directory /path/to/reads
+# Generate configuration file and sample table
+# --neg-prefix can be used to note negative controls
+# --data can be used to specify location of data dependencies
+python signalexe.py --config-only --directory /path/to/reads
 
 # Execute pipeline (step-by-step; --cores defaults to 1 if not provided)
-python signal.py --configfile config.yaml --cores NCORES aLL
-python signal.py --configfile config.yaml --cores NCORES postprocess
-python signal.py --configfile config.yaml --cores NCORES ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --configfile config.yaml --cores NCORES all
+python signalexe.py --configfile config.yaml --cores NCORES postprocess
+python signalexe.py --configfile config.yaml --cores NCORES ncov_tools
 
 # ALTERNATIVE
 # Execute pipeline (one line)
-python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools
 
 # ALTERNATIVE
 # Execute pipeline (one line; no prior configuration file or sample table steps)
 # --directory can be used in place of --configfile to automatically generate a configuration file
-python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools
+# --data can be used to specify location of data dependencies
+python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools
 ```
 
 Each of the steps in SIGNAL can be run **manually** by accessing the individual scripts or running snakemake.
@@ -187,8 +192,9 @@ The pipeline requires:
 - kraken2 viral database
 - Human GRCh38 reference fasta (for composite human-viral BWA index)
 
-       python signal.py --dependencies
+       python signalexe.py --dependencies
        # defaults to a directory called `data` in repository root
+       # --data can be used to rename and relocate the resultant directory
 
        OR
 
@@ -197,14 +203,24 @@ The pipeline requires:
 
 **Note: Downloading the database files requires ~10GB of storage, with up to ~35GB required for all temporary downloads!**
 
+### 1.5 Prepare per-rule conda environments:
+
+SIGNAL uses controlled conda environments for individual steps in the workflow. These are generally produced upon first execution of SIGNAL with input data; however, an option to install the per-rule environments is available through the `signalexe.py` script.
+
+       python signalexe.py install
+
+       # Will install per-rule environments
+       # Later versions of SIGNAL will include a testing module with curated data to ensure  function
+
 ### 2. Generate configuration file:
 
 You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv` (see step 4). The directory provided will be used to auto-generate a name for the run.
 
 ```
-python signal.py --config-only --directory /path/to/reads
+python signalexe.py --config-only --directory /path/to/reads
 
 # Outputs: 'reads_config.yaml' and 'reads_sample_table.csv'
+# --data can be used to specify the location of data dependencies
 ```
 
 You can also create the configuration file through modifying the `example_config.yaml` to suit your system.
@@ -248,7 +264,7 @@ bash scripts/generate_sample_table.sh -d /path/to/more/reads -e sample_table.csv
 
 ### 4. Execute pipeline:
 
-For the main `signal.py` script, positional arguments inform the rules of the pipeline to execute with flags supplementing input parameters.
+For the main `signalexe.py` script, positional arguments inform the rules of the pipeline to execute with flags supplementing input parameters.
 
 The main rules of the pipeline are as followed:
 
@@ -258,7 +274,7 @@ The main rules of the pipeline are as followed:
 
 The generated configuration file from the above steps can be used as input. To run the general pipeline:
 
-`python signal.py --configfile config.yaml --cores 4 all`
+`python signalexe.py --configfile config.yaml --cores 4 all`
 
 is equivalent to running
 
@@ -268,7 +284,7 @@ You can run the snakemake command as written above, but note that if the `--cond
 
 Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script:
 
-`python signal.py --directory /path/to/reads --cores 4 all`
+`python signalexe.py --directory /path/to/reads --cores 4 all`
 
 A configuartion file and sample table will automatically be generated prior to running SIGNAL `all`.
 
@@ -278,7 +294,7 @@ FreeBayes variant calling and BreSeq mutational analysis are technically optiona
 
 As with the general pipeline, the generated configuration file from the above steps can be used as input. To run `postprocess` which summarizes the SIGNAL results:
 
-`python signal.py --configfile config.yaml --cores 1 postprocess`
+`python signalexe.py --configfile config.yaml --cores 1 postprocess`
 
 is equivalent to running
 
@@ -306,7 +322,7 @@ Related: because pipeline stages can fail, we run (and recommend running if usin
 Additionally, SIGNAL can prepare output and execute @jts' [ncov-tools](https://github.com/jts/ncov-tools)
 to generate phylogenies and alternative summaries.
 
-`python signal.py --configfile config.yaml --cores 1 ncov_tools`
+`python signalexe.py --configfile config.yaml --cores 1 ncov_tools`
 
 is equivalent to running
 
@@ -318,17 +334,19 @@ SIGNAL will then execute ncov-tools and the **output will be found within the SI
 
 ### Multiple operations:
 
-Using `signal.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession.
+Using `signalexe.py` positional arguments, you can specify SIGNAL to perform multiple rules in succession.
 
-`python signal.py --configfile config.yaml --cores NCORES all postprocess ncov_tools`
+`python signalexe.py --configfile config.yaml --cores NCORES all postprocess ncov_tools`
 
 In the above command, SIGNAL `all`, `postprocess`, and `ncov_tools` will run using the provided configuration file as input, which links to a sample table.
 
 **Note: Regardless of order for positional arguments, or placement of other parameter flags, SIGNAL will always run in the set order priority: `all` > `postprocess` > `ncov_tools`!**
 
+**Note: If `install` is provided as input, it will override all other positional arguments!**
+
 If no configuration file or sample table was generated for a run, you can provide `--directory` with the path to sequencing reads and SIGNAL will auto-generate both required inputs prior to running any rules.
 
-`python signal.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools`
+`python signalexe.py --directory /path/to/reads --cores NCORES all postprocess ncov_tools`
 
 Overall, this simplifies executing SIGNAL to one line!
 
@@ -359,6 +377,44 @@ Then execute the pipeline:
 
 - To generate summaries of BreSeq among many samples, see [how to summarize BreSeq results using gdtools](resources/dev_scripts/summaries/README.md)
 
+### Convenient extraction script:
+
+SIGNAL produces several output files and directories on its own alongside the output for ncov-tools. Select files from the output can be copied or transferred for easier parsing using a provided convenience bash script:
+
+```
+bash scripts/get_signal_results.sh
+
+Usage:
+bash get_signal_results.sh -s <SIGNAL_results_dir> -d <destination_dir> [-m] [-c]
+
+This script aims to copy (rsync by default, or cp) or move (mv) select output from SIGNAL 'all', 'postprocess', and 'ncov_tools'.
+
+The following files will be transferred over to the specified destination directory (if found):
+SIGNAL 'all' & 'postprocess':
+-> signal-results/<sample>/<sample>_sample.txt
+-> signal-results/<sample>/core/<sample>.consensus.fa
+-> signal-results/<sample>/core/<sample>_ivar_variants.tsv
+-> signal-results/<sample>/freebayes/<sample>.consensus.fasta
+-> signal-results/<sample>/freebayes/<sample>.variants.norm.vcf
+
+SIGNAL 'ncov_tools':
+-> ncov-tools-results/qc_annotation/<sample>.ann.vcf
+-> ncov-tools-results/qc_reports/<run_name>_ambiguous_position_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_mixture_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_ncov_watch_variants.tsv
+-> ncov-tools-results/qc_reports/<run_name>_negative_control_report.tsv
+-> ncov-tools-results/qc_reports/<run_name>_summary_qc.tsv
+
+Flags:
+        -s  :  SIGNAL results directory
+        -d  :  Directory where summary will be outputted
+        -m  :  Invoke 'mv' move command instead of 'rsync' copying of results. Optional
+        -c  :  Invoke 'cp' copy command instead of 'rsync' copying of results. Optional
+
+```
+
+The script uses `rsync` to provide accurate copies of select output files organized into `signal-results` and `ncov-tools-results` within a provided destination directory (that must exist). If `-c` is provided, `cp` will be used instead of `rsync` to produce copies. Similarly, if `-m` is provided, `mv` will be used instead (**WARNING: Any interruption during `mv` could result in data loss.**)
+
 ## Pipeline details:
 
 For a step-by-step walkthrough of the pipeline, see [pipeline/README.md](PIPELINE.md).

From b6e5992758765dd6c02df11cd6ad8f0e2e32aa77 Mon Sep 17 00:00:00 2001
From: "Jalees A. Nasir" <jalees_nasir@hotmail.com>
Date: Fri, 17 Mar 2023 17:53:55 +0000
Subject: [PATCH 28/28] update README

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 546f62c..a5ec410 100644
--- a/README.md
+++ b/README.md
@@ -203,18 +203,18 @@ The pipeline requires:
 
 **Note: Downloading the database files requires ~10GB of storage, with up to ~35GB required for all temporary downloads!**
 
-### 1.5 Prepare per-rule conda environments:
+### 1.5. Prepare per-rule conda environments (optional, but recommended):
 
 SIGNAL uses controlled conda environments for individual steps in the workflow. These are generally produced upon first execution of SIGNAL with input data; however, an option to install the per-rule environments is available through the `signalexe.py` script.
 
        python signalexe.py install
 
        # Will install per-rule environments
-       # Later versions of SIGNAL will include a testing module with curated data to ensure  function
+       # Later versions of SIGNAL will include a testing module with curated data to ensure function
 
 ### 2. Generate configuration file:
 
-You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv` (see step 4). The directory provided will be used to auto-generate a name for the run.
+You can use the `--config-only` flag to generate both `config.yaml` and `sample_table.csv`. The directory provided will be used to auto-generate a name for the run.
 
 ```
 python signalexe.py --config-only --directory /path/to/reads
@@ -231,7 +231,7 @@ You can also create the configuration file through modifying the `example_config
 
 See the example table `example_sample_table.csv` for an idea of how to organise this table.
 
-**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 3) from a given directory path to reads.**
+**Using the `--config-only` flag, both configuration file and sample table will be generated (see above in step 2) from a given directory path to reads.**
 
 Alternatively, you can attempt to use `generate_sample_table.sh` to circumvent manual creation of the table.
 
@@ -282,7 +282,7 @@ is equivalent to running
 
 You can run the snakemake command as written above, but note that if the `--conda-prefix` is not set as this (i.e., `$PWD/.snakemake/conda`), then all envs will be reinstalled for each time you change the `results_dir` in the `config.yaml`.
 
-Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script:
+Alternatively, you can skip the above configuration and sample table generation steps by simply providing the directory of reads to the main script (see step 2):
 
 `python signalexe.py --directory /path/to/reads --cores 4 all`