Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: batch submission now does not require user to write to script first #199

Merged
merged 5 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,14 +1078,14 @@ def main():

aviary batch -f batch_file.tsv -t 32 -o batch_test

An example batch file can be found at:
An example batch file can be found at: https://rhysnewell.github.io/aviary/examples

''')

batch_options.add_argument(
'-f', '--batch_file', '--batch-file',
help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
'An example batch file can be found at XXX. The heading line is required. \n'
'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
'comma separated batch file) within the specific read column.',
Expand Down
38 changes: 26 additions & 12 deletions aviary/modules/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
from snakemake.io import load_configfile
from ruamel.yaml import YAML # used for yaml reading with comments

BATCH_HEADER=['sample', 'short_reads_1', 'short_reads_2', 'long_reads', 'long_read_type', 'assembly', 'coassemble']

# Debug
debug={1:logging.CRITICAL,
2:logging.ERROR,
Expand Down Expand Up @@ -468,6 +470,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
resources=f"--resources mem_mb={int(self.max_memory)*1024} {self.resources}" if not dryrun else ""
)

logging.debug(f"Command: {cmd}")

if write_to_script is not None:
write_to_script.append(cmd)
continue
Expand Down Expand Up @@ -495,20 +499,28 @@ def process_batch(args, prefix):

logging.info(f"Reading batch file: {args.batch_file}")

header=0
header=None
separator=' '
with open(args.batch_file, mode='r') as check_batch:
for line in check_batch.readlines():
if "sample\tshort_reads_1\tshort_reads_2\tlong_reads\tlong_read_type\tassembly\tcoassemble" in line \
or "sample,short_reads_1,short_reads_2,long_reads,long_read_type,assembly,coassemble" in line \
or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line \
or "sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble" in line:
header=1
logging.debug("Inferred header")
else:
logging.debug("No heading inferred.")
line = line.strip()
for sep in ['\t', ',', ' ']:
separated = line.split(sep)
if separated == BATCH_HEADER:
header=0
separator=sep
logging.debug("Inferred header")
break
elif len(separated) >= 7:
header=None
separator=sep
logging.debug("Inferred no header")
break
if header is None:
logging.debug("No header found")
break

batch = pd.read_csv(args.batch_file, sep=None, engine='python', skiprows=header)
batch = pd.read_csv(args.batch_file, sep=separator, engine='python', names=BATCH_HEADER, header=header)
if len(batch.columns) != 7:
logging.critical(f"Batch file contains incorrect number of columns ({len(batch.columns)}). Should contain 7.")
logging.critical(f"Current columns: {batch.columns}")
Expand All @@ -525,10 +537,12 @@ def process_batch(args, prefix):

try:
script_file = args.write_script
write_to_script = []
except AttributeError:
script_file = None
write_to_script = None

write_to_script = None
if script_file is not None:
write_to_script = []

runs = []
args.interleaved = "none" # hacky solution to skip attribute error
Expand Down
10 changes: 5 additions & 5 deletions docs/_include/examples/example_batch.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 pe1.fq.gz pe2.fq.gz nanopore.fq.gz ont NA NA
sample_2 interleaved.fq.gz NA pacbio.fq.gz ccs NA NA
sample_3 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq NA True
sample_4 pe1.1.fq.gz,pe1.2.fq.gz pe2.1.fq.gz,pe2.2.fq.gz n1.fq.gz,n2.fq.gz,n3.fq.gz ont-hq assembly.fasta False
3 changes: 3 additions & 0 deletions test/data/example_batch.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample short_reads_1 short_reads_2 long_reads long_read_type assembly coassemble
sample_1 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz NA ont NA NA
sample_2 test/data/wgsim.1.fq.gz test/data/wgsim.2.fq.gz test/data/pbsim.fq.gz ont NA NA
36 changes: 35 additions & 1 deletion test/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def test_short_read_recovery_queue_submission(self):

cmd = (
f"aviary recover "
f"--output {output_dir} "
f"-o {output_dir}/aviary_out "
f"-1 {data}/wgsim.1.fq.gz "
f"-2 {data}/wgsim.2.fq.gz "
Expand All @@ -180,6 +179,41 @@ def test_short_read_recovery_queue_submission(self):
num_lines = sum(1 for _ in f)
self.assertEqual(num_lines, 3)

def test_batch_recovery(self):
    """Integration test for `aviary batch`: run the pipeline on the two-sample
    example batch file and verify that per-sample assembly, binning, and
    cross-sample cluster/pangenome outputs are produced.

    NOTE(review): relies on module-level names not visible here — `data`
    (test data directory), `path_to_conda`, and `self.setup_output_dir`;
    presumably defined earlier in this test module — confirm.
    """
    output_dir = os.path.join("example", "test_batch_recovery")
    # Fresh working directory for this test run.
    self.setup_output_dir(output_dir)
    # Build the CLI invocation. QC, the refinery iterations, and most
    # binners are skipped to keep the integration test runtime down.
    cmd = (
        f"aviary batch "
        f"-o {output_dir}/aviary_out "
        f"-f {data}/example_batch.tsv "
        f"--conda-prefix {path_to_conda} "
        f"--skip-binners rosella vamb metabat "
        f"--skip-qc "
        f"--refinery-max-iterations 0 "
        f"--min-read-size 10 --min-mean-q 1 "
        f"-n 32 -t 32 "
    )
    # check=True raises CalledProcessError if the pipeline exits non-zero.
    subprocess.run(cmd, shell=True, check=True)

    # Each sample listed in the batch file should yield its own assembly.
    self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_1/data/final_contigs.fasta"))
    self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/sample_2/data/final_contigs.fasta"))

    # Binning summaries must exist for both samples.
    bin_info_path_1 = f"{output_dir}/aviary_out/sample_1/bins/bin_info.tsv"
    bin_info_path_2 = f"{output_dir}/aviary_out/sample_2/bins/bin_info.tsv"
    self.assertTrue(os.path.isfile(bin_info_path_1))
    self.assertTrue(os.path.isfile(bin_info_path_2))
    # Expect header + 2 bins = 3 lines in sample_1's bin summary.
    with open(bin_info_path_1) as f:
        num_lines = sum(1 for _ in f)
    self.assertEqual(num_lines, 3)

    # Cross-sample dereplication output at each ANI threshold.
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95"))
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97"))
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99"))

    # Pangenome directories should be built within each cluster output.
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.95/pangenomes"))
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.97/pangenomes"))
    self.assertTrue(os.path.isdir(f"{output_dir}/aviary_out/aviary_cluster_ani_0.99/pangenomes"))


# Allow running this test module directly (python test_integration.py).
if __name__ == "__main__":
    unittest.main()
Loading