From c6b65f5d5c21e84e9ca3189fb24ce34cddde685e Mon Sep 17 00:00:00 2001 From: AroneyS Date: Fri, 6 Sep 2024 09:28:44 +1000 Subject: [PATCH] change coassembly_0 names for single-assembly --- binchicken/workflow/coassemble.smk | 1 + binchicken/workflow/scripts/cluster_graph.py | 9 +++- test/test_cluster_graph.py | 45 ++++++++++++++++++-- test/test_coassemble.py | 8 ++-- test/test_single.py | 36 ++++++++-------- 5 files changed, 71 insertions(+), 28 deletions(-) diff --git a/binchicken/workflow/coassemble.smk b/binchicken/workflow/coassemble.smk index 98ed5777..81a9eda8 100644 --- a/binchicken/workflow/coassemble.smk +++ b/binchicken/workflow/coassemble.smk @@ -473,6 +473,7 @@ checkpoint cluster_graph: max_recovery_samples = config["max_recovery_samples"], coassembly_samples = config["coassembly_samples"], exclude_coassemblies = config["exclude_coassemblies"], + single_assembly = config["single_assembly"], threads: 64 resources: mem_mb=get_mem_mb, diff --git a/binchicken/workflow/scripts/cluster_graph.py b/binchicken/workflow/scripts/cluster_graph.py index fa03f9c7..a6638365 100644 --- a/binchicken/workflow/scripts/cluster_graph.py +++ b/binchicken/workflow/scripts/cluster_graph.py @@ -108,7 +108,8 @@ def pipeline( MIN_CLUSTER_TARGETS=1, MAX_SAMPLES_COMBINATIONS=100, COASSEMBLY_SAMPLES=[], - EXCLUDE_COASSEMBLIES=[]): + EXCLUDE_COASSEMBLIES=[], + single_assembly=False): logging.info(f"Polars using {str(pl.thread_pool_size())} threads") @@ -308,7 +309,9 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE): .with_row_index("coassembly") .select( "samples", "length", "total_targets", "total_size", "recover_samples", - coassembly = pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8) + coassembly = pl.when(single_assembly) + .then(pl.col("samples")) + .otherwise(pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8)) ) ) @@ -333,6 +336,7 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE): MAX_RECOVERY_SAMPLES = snakemake.params.max_recovery_samples COASSEMBLY_SAMPLES = snakemake.params.coassembly_samples EXCLUDE_COASSEMBLIES = snakemake.params.exclude_coassemblies + single_assembly = snakemake.params.single_assembly elusive_edges_path = snakemake.input.elusive_edges read_size_path = snakemake.input.read_size weightings_path = snakemake.input.targets_weighted @@ -363,5 +367,6 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE): EXCLUDE_COASSEMBLIES=EXCLUDE_COASSEMBLIES, MIN_CLUSTER_TARGETS=min_cluster_targets, MAX_SAMPLES_COMBINATIONS=100, + single_assembly=single_assembly, ) clusters.write_csv(elusive_clusters_path, separator="\t") diff --git a/test/test_cluster_graph.py b/test/test_cluster_graph.py index 0e9b1596..f03f5ad4 100644 --- a/test/test_cluster_graph.py +++ b/test/test_cluster_graph.py @@ -54,8 +54,8 @@ } class Tests(unittest.TestCase): - def assertDataFrameEqual(self, a, b): - assert_frame_equal(a, b, check_dtypes=False) + def assertDataFrameEqual(self, a, b, check_row_order=True): + assert_frame_equal(a, b, check_dtypes=False, check_row_order=check_row_order) def assertSeriesEqual(self, a, b): assert_series_equal(a, b, check_dtypes=False) @@ -291,6 +291,42 @@ def test_cluster_no_edges(self): observed = pipeline(elusive_edges, read_size) self.assertDataFrameEqual(expected, observed) + def test_cluster_single_assembly(self): + elusive_edges = pl.DataFrame([ + ["match", 2, "1,2", "1"], + ["match", 2, "1,3", "1,2"], + ["match", 2, "2,3", "1,2,3"], + ["match", 2, "4,5", "4,5,6,7"], + ["match", 2, "4,6", "4,5,6,7,8"], + ["match", 2, "5,6", "4,5,6,7,8,9"], + ], orient="row", schema=ELUSIVE_EDGES_COLUMNS) + read_size = pl.DataFrame([ + ["1", 1000], + ["2", 1000], + ["3", 1000], + ["4", 1000], + ["5", 1000], + ["6", 1000], + ], orient="row", schema=READ_SIZE_COLUMNS) + + expected = pl.DataFrame([ + ["5", 1, 6, 1000, "4,5,6", "5"], + ["6", 1, 6, 1000, "4,5,6", "6"], + ["4", 1, 5, 1000, "4,5,6", "4"], + ["2", 1, 3, 1000, "1,2,3", "2"], + ["3", 1, 3, 1000, "1,2,3", "3"], + ["1", 1, 2, 1000, "1,2,3", "1"], + ], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS) + observed = pipeline( + elusive_edges, + read_size, + MAX_COASSEMBLY_SAMPLES=1, + MIN_COASSEMBLY_SAMPLES=1, + MAX_RECOVERY_SAMPLES=4, + single_assembly=True + ) + self.assertDataFrameEqual(expected, observed, check_row_order=False) + def test_cluster_only_large_clusters(self): elusive_edges = pl.DataFrame([ ["match", 2, "1,2", "9,10"], @@ -635,8 +671,8 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self): ], orient="row", schema=READ_SIZE_COLUMNS) expected = pl.DataFrame([ - ["2", 1, 6, 1000, "1,2,3", "coassembly_0"], - ["1", 1, 5, 1000, "1,2,3", "coassembly_1"], + ["2", 1, 6, 1000, "1,2,3", "2"], + ["1", 1, 5, 1000, "1,2,3", "1"], ], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS) observed = pipeline( elusive_edges, @@ -645,6 +681,7 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self): MIN_COASSEMBLY_SAMPLES=1, MAX_RECOVERY_SAMPLES=4, COASSEMBLY_SAMPLES=["1", "2"], + single_assembly=True, ) self.assertDataFrameEqual(expected, observed) diff --git a/test/test_coassemble.py b/test/test_coassemble.py index 4e1944f2..bbdf3764 100644 --- a/test/test_coassemble.py +++ b/test/test_coassemble.py @@ -1590,7 +1590,7 @@ def test_coassemble_preclustered_single_assembly(self): "3", "3624", "sample_3,sample_5", - "coassembly_0" + "sample_5" ]), "\t".join([ "sample_3", @@ -1598,7 +1598,7 @@ def test_coassemble_preclustered_single_assembly(self): "2", "3624", "sample_3,sample_5", - "coassembly_1" + "sample_3" ]), "\t".join([ "sample_2", @@ -1606,7 +1606,7 @@ def test_coassemble_preclustered_single_assembly(self): "2", "3926", "sample_1,sample_2", - "coassembly_2" + "sample_2" ]), "\t".join([ "sample_1", @@ -1614,7 +1614,7 @@ def test_coassemble_preclustered_single_assembly(self): "2", "4832", "sample_1,sample_2", - "coassembly_3" + "sample_1" ]), "" ] diff --git a/test/test_single.py b/test/test_single.py index 74680f79..21d58c87 100644 --- a/test/test_single.py +++ b/test/test_single.py @@ -68,7 +68,7 @@ def test_single(self): "4", "4832", "sample_1,sample_2,sample_3", - "coassembly_0" + "sample_1" ]), "\t".join([ "sample_2", @@ -76,7 +76,7 @@ def test_single(self): "3", "3926", "sample_1,sample_2,sample_3", - "coassembly_1" + "sample_2" ]), "" ] @@ -94,9 +94,9 @@ def test_single(self): SAMPLE_READS_FORWARD.split(" ")[0], "-2", SAMPLE_READS_REVERSE.split(" ")[0], - "--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "assemble"), + "--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "assemble"), "-n 64 -t 64 -m 500 --skip-qc &>", - os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_0_assemble.log"), + os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_1_assemble.log"), "" ]), " ".join([ @@ -104,9 +104,9 @@ def test_single(self): SAMPLE_READS_FORWARD.split(" ")[1], "-2", SAMPLE_READS_REVERSE.split(" ")[1], - "--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "assemble"), + "--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "assemble"), "-n 64 -t 64 -m 500 --skip-qc &>", - os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_1_assemble.log"), + os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_2_assemble.log"), "" ]), "" @@ -120,27 +120,27 @@ def test_single(self): expected = "\n".join( [ " ".join([ - "aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "assemble", "assembly", "final_contigs.fasta"), + "aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "assemble", "assembly", "final_contigs.fasta"), "-1", SAMPLE_READS_FORWARD, "-2", SAMPLE_READS_REVERSE, - "--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "recover"), + "--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "recover"), "--binning-only --refinery-max-iterations 0 " "-n 32 -t 32 -m 250 --skip-qc &>", - os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_0_recover.log"), + os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_1_recover.log"), "" ]), " ".join([ - "aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "assemble", "assembly", "final_contigs.fasta"), + "aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "assemble", "assembly", "final_contigs.fasta"), "-1", SAMPLE_READS_FORWARD, "-2", SAMPLE_READS_REVERSE, - "--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "recover"), + "--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "recover"), "--binning-only --refinery-max-iterations 0 " "-n 32 -t 32 -m 250 --skip-qc &>", - os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_1_recover.log"), + os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_2_recover.log"), "" ]), "" @@ -161,14 +161,14 @@ def test_single(self): "total_size", ]), "\t".join([ - "coassembly_0", + "sample_1", "sample_1", "1", "4", "4832", ]), "\t".join([ - "coassembly_1", + "sample_2", "sample_2", "1", "3", @@ -231,7 +231,7 @@ def test_single_preclustered(self): "3", "3624", "sample_3,sample_5", - "coassembly_0" + "sample_5" ]), "\t".join([ "sample_3", @@ -239,7 +239,7 @@ def test_single_preclustered(self): "2", "3624", "sample_3,sample_5", - "coassembly_1" + "sample_3" ]), "\t".join([ "sample_2", @@ -247,7 +247,7 @@ def test_single_preclustered(self): "2", "3926", "sample_1,sample_2", - "coassembly_2" + "sample_2" ]), "\t".join([ "sample_1", @@ -255,7 +255,7 @@ def test_single_preclustered(self): "2", "4832", "sample_1,sample_2", - "coassembly_3" + "sample_1" ]), "" ]