diff --git a/aviary/aviary.py b/aviary/aviary.py
index 1e88a6e1..0364ab62 100755
--- a/aviary/aviary.py
+++ b/aviary/aviary.py
@@ -1078,14 +1078,14 @@ def main():
         aviary batch -f batch_file.tsv -t 32 -o batch_test
 
-        An example batch file can be found at:
+        An example batch file can be found at: https://rhysnewell.github.io/aviary/examples
 
     ''')
 
     batch_options.add_argument(
        '-f', '--batch_file', '--batch-file',
        help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
-            'An example batch file can be found at XXX. The heading line is required. \n'
+            'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
             'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
             'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
             'comma separated batch file) within the specific read column.',
diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py
index bba9dd66..57df8d40 100644
--- a/aviary/modules/processor.py
+++ b/aviary/modules/processor.py
@@ -43,6 +43,8 @@
 from snakemake.io import load_configfile
 from ruamel.yaml import YAML # used for yaml reading with comments
 
+BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']
+
 # Debug
 debug={1:logging.CRITICAL,
        2:logging.ERROR,
@@ -468,6 +470,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
             resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else ""
         )
 
+        logging.debug(f"Command: {cmd}")
+
         if write_to_script is not None:
             write_to_script.append(cmd)
             continue
@@ -495,20 +499,28 @@ def process_batch(args, prefix):
     logging.info(f"Reading batch file: {args.batch_file}")
 
-    header=0
+    header=None
+    separator=' '
     with open(args.batch_file, mode='r') as check_batch:
         for line in check_batch.readlines():
-            if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \
-                or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \
-                or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line \
-                or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line:
-                header=1
-                logging.debug("Inferred header")
-            else:
-                logging.debug("No heading inferred.")
+            line = line.strip()
+            for sep in ['\t', ',', ' ']:
+                separated = line.split(sep)
+                if separated == BATCH_HEADER:
+                    header=0
+                    separator=sep
+                    logging.debug("Inferred header")
+                    break
+                elif len(separated) >= 7:
+                    header=None
+                    separator=sep
+                    logging.debug("Inferred no header")
+                    break
+            if header is None:
+                logging.debug("No header found")
             break
 
-    batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header)
+    batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
     if len(batch.columns) != 7:
         logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
         logging.critical(f"Current columns: {batch.columns}")
@@ -525,10 +537,12 @@ def process_batch(args, prefix):
 
     try:
         script_file = args.write_script
-        write_to_script = []
     except AttributeError:
         script_file = None
-        write_to_script = None
+
+    write_to_script = None
+    if script_file is not None:
+        write_to_script = []
 
     runs = []
     args.interleaved = "none" # hacky solution to skip attribute error
diff --git a/docs/_include/examples/example_batch.tsv b/docs/_include/examples/example_batch.tsv
index 1749ed9c..a8288358 100755
--- a/docs/_include/examples/example_batch.tsv
+++ b/docs/_include/examples/example_batch.tsv
@@ -1,5 +1,5 @@
-sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
-sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
-sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
-sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
-sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
\ No newline at end of file
+sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
+sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
+sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
+sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
+sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
\ No newline at end of file
diff --git a/test/data/example_batch.tsv b/test/data/example_batch.tsv
new file mode 100755
index 00000000..e014b39a
--- /dev/null
+++ b/test/data/example_batch.tsv
@@ -0,0 +1,3 @@
+sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
+sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
+sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA
\ No newline at end of file
diff --git a/test/test_integration.py b/test/test_integration.py
index 687143dd..fe17f77f 100755
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self):
         cmd = (
             f"aviary recover "
-            f"--output {output_dir} "
             f"-o {output_dir}/aviary_out "
             f"-1 {data}/wgsim.1.fq.gz "
             f"-2 {data}/wgsim.2.fq.gz "
@@ -180,6 +179,41 @@
             num_lines = sum(1 for _ in f)
         self.assertEqual(num_lines, 3)
 
+    def test_batch_recovery(self):
+        output_dir = os.path.join("example", "test_batch_recovery")
+        self.setup_output_dir(output_dir)
+        cmd = (
+            f"aviary batch "
+            f"-o {output_dir}/aviary_out "
+            f"-f {data}/example_batch.tsv "
+            f"--conda-prefix {path_to_conda} "
+            f"--skip-binners rosella vamb metabat "
+            f"--skip-qc "
+            f"--refinery-max-iterations 0 "
+            f"--min-read-size 10 --min-mean-q 1 "
+            f"-n 32 -t 32 "
+        )
+        subprocess.run(cmd, shell=True, check=True)
+
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta"))
+        self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta"))
+
+        bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv"
+        bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv"
+        self.assertTrue(os.path.isfile(bin_info_path_1))
+        self.assertTrue(os.path.isfile(bin_info_path_2))
+        with open(bin_info_path_1) as f:
+            num_lines = sum(1 for _ in f)
+        self.assertEqual(num_lines, 3)
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99"))
+
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes"))
+        self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes"))
+
 if __name__ == "__main__":
     unittest.main()
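For reference, below is a minimal, standalone sketch of the header/separator inference that process_batch() now performs, mirroring the hunk above. The temporary file and its contents are hypothetical illustrations, not part of the repository.

# Sketch of the batch-file header/separator inference added in process_batch().
# The temporary file written here is a hypothetical, headerless, comma-separated input.
import tempfile
import pandas as pd

BATCH_HEADER = ['sample', 'short_reads_1', 'short_reads_2', 'long_reads',
                'long_read_type', 'assembly', 'coassemble']

def read_batch(batch_file):
    header = None
    separator = ' '
    with open(batch_file) as check_batch:
        for line in check_batch.readlines():
            line = line.strip()
            for sep in ['\t', ',', ' ']:
                separated = line.split(sep)
                if separated == BATCH_HEADER:
                    # First line is exactly the expected column names:
                    # treat it as a header row and remember the separator.
                    header = 0
                    separator = sep
                    break
                elif len(separated) >= 7:
                    # First line already looks like a data row (>= 7 fields):
                    # no header, but the separator is now known.
                    header = None
                    separator = sep
                    break
            break  # only the first line is inspected
    # Column names are always supplied, so headerless files still get them;
    # header=0 tells pandas to skip the detected heading line.
    return pd.read_csv(batch_file, sep=separator, engine='python',
                       names=BATCH_HEADER, header=header)

# A headerless, comma-separated batch file is parsed the same way as the
# tab-separated example files that carry a heading line.
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as tmp:
    tmp.write("sample_1,pe1.fq.gz,pe2.fq.gz,nanopore.fq.gz,ont,NA,NA\n")
    path = tmp.name

print(read_batch(path).columns.tolist())  # the seven BATCH_HEADER names
print(len(read_batch(path)))              # 1 row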
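The comma-separated form that the --batch_file help text describes would look roughly as follows (read file names borrowed from the docs example and purely illustrative); note the double quotes around comma-separated read lists so those fields survive CSV parsing:

sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble
sample_1,pe1.fq.gz,pe2.fq.gz,nanopore.fq.gz,ont,NA,NA
sample_3,"pe1.1.fq.gz,pe1.2.fq.gz","pe2.1.fq.gz,pe2.2.fq.gz","n1.fq.gz,n2.fq.gz,n3.fq.gz",ont-hq,NA,True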
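On the write_to_script change in process_batch(): with the old placement, write_to_script became an empty list whenever args.write_script merely existed, even when it was None, so run_workflow() would collect commands rather than execute them. A small sketch of the behaviour after the change, using hypothetical args values:

# Sketch of the reworked write_to_script initialisation (hypothetical args namespaces).
from types import SimpleNamespace

def init_write_to_script(args):
    try:
        script_file = args.write_script
    except AttributeError:
        script_file = None

    write_to_script = None
    if script_file is not None:
        write_to_script = []  # commands are only collected when a script path was requested
    return script_file, write_to_script

print(init_write_to_script(SimpleNamespace(write_script=None)))      # (None, None)
print(init_write_to_script(SimpleNamespace(write_script="run.sh")))  # ('run.sh', [])
print(init_write_to_script(SimpleNamespace()))                       # (None, None)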