Skip to content

Commit

Permalink
change coassembly_0 names for single-assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
AroneyS committed Sep 5, 2024
1 parent b59935f commit c6b65f5
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 28 deletions.
1 change: 1 addition & 0 deletions binchicken/workflow/coassemble.smk
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ checkpoint cluster_graph:
max_recovery_samples = config["max_recovery_samples"],
coassembly_samples = config["coassembly_samples"],
exclude_coassemblies = config["exclude_coassemblies"],
single_assembly = config["single_assembly"],
threads: 64
resources:
mem_mb=get_mem_mb,
Expand Down
9 changes: 7 additions & 2 deletions binchicken/workflow/scripts/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def pipeline(
MIN_CLUSTER_TARGETS=1,
MAX_SAMPLES_COMBINATIONS=100,
COASSEMBLY_SAMPLES=[],
EXCLUDE_COASSEMBLIES=[]):
EXCLUDE_COASSEMBLIES=[],
single_assembly=False):

logging.info(f"Polars using {str(pl.thread_pool_size())} threads")

Expand Down Expand Up @@ -308,7 +309,9 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
.with_row_index("coassembly")
.select(
"samples", "length", "total_targets", "total_size", "recover_samples",
coassembly = pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8)
coassembly = pl.when(single_assembly)
.then(pl.col("samples"))
.otherwise(pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8))
)
)

Expand All @@ -333,6 +336,7 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
MAX_RECOVERY_SAMPLES = snakemake.params.max_recovery_samples
COASSEMBLY_SAMPLES = snakemake.params.coassembly_samples
EXCLUDE_COASSEMBLIES = snakemake.params.exclude_coassemblies
single_assembly = snakemake.params.single_assembly
elusive_edges_path = snakemake.input.elusive_edges
read_size_path = snakemake.input.read_size
weightings_path = snakemake.input.targets_weighted
Expand Down Expand Up @@ -363,5 +367,6 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
EXCLUDE_COASSEMBLIES=EXCLUDE_COASSEMBLIES,
MIN_CLUSTER_TARGETS=min_cluster_targets,
MAX_SAMPLES_COMBINATIONS=100,
single_assembly=single_assembly,
)
clusters.write_csv(elusive_clusters_path, separator="\t")
45 changes: 41 additions & 4 deletions test/test_cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
}

class Tests(unittest.TestCase):
# Earlier form of the helper: compares frames a and b via assert_frame_equal
# with dtype checking disabled. NOTE(review): this appears to be the pre-change
# side of the diff hunk (superseded by the definition that follows) -- confirm.
def assertDataFrameEqual(self, a, b):
assert_frame_equal(a, b, check_dtypes=False)
# Current form of the helper: same comparison with dtype checking disabled,
# plus a check_row_order flag (default True) that is forwarded unchanged to
# assert_frame_equal so a caller can pass False to opt out of row-order
# comparison.
def assertDataFrameEqual(self, a, b, check_row_order=True):
assert_frame_equal(a, b, check_dtypes=False, check_row_order=check_row_order)

# Assert that series a and b are equal, delegating to assert_series_equal
# with dtype checking disabled (check_dtypes=False).
# NOTE(review): assert_series_equal is imported outside this view; presumably
# it is the polars.testing helper -- confirm.
def assertSeriesEqual(self, a, b):
assert_series_equal(a, b, check_dtypes=False)
Expand Down Expand Up @@ -291,6 +291,42 @@ def test_cluster_no_edges(self):
observed = pipeline(elusive_edges, read_size)
self.assertDataFrameEqual(expected, observed)

def test_cluster_single_assembly(self):
    """Check cluster naming when ``single_assembly=True``.

    With one sample per assembly (MIN/MAX_COASSEMBLY_SAMPLES both 1),
    every expected row carries the sample's own name in its last column
    instead of a generated ``coassembly_<n>`` label. Row order is not
    part of the contract here, so the comparison ignores it.
    """
    edge_rows = [
        ["match", 2, "1,2", "1"],
        ["match", 2, "1,3", "1,2"],
        ["match", 2, "2,3", "1,2,3"],
        ["match", 2, "4,5", "4,5,6,7"],
        ["match", 2, "4,6", "4,5,6,7,8"],
        ["match", 2, "5,6", "4,5,6,7,8,9"],
    ]
    edges = pl.DataFrame(edge_rows, orient="row", schema=ELUSIVE_EDGES_COLUMNS)

    # Samples "1".."6", each with a read size of 1000.
    size_rows = [[str(sample_id), 1000] for sample_id in range(1, 7)]
    sizes = pl.DataFrame(size_rows, orient="row", schema=READ_SIZE_COLUMNS)

    # (sample, third-column value, recovery-sample list) per expected row.
    # NOTE(review): the third column is presumably total_targets, going by
    # the pipeline's select(...) column order -- confirm against
    # ELUSIVE_CLUSTERS_COLUMNS.
    per_sample = [
        ("5", 6, "4,5,6"),
        ("6", 6, "4,5,6"),
        ("4", 5, "4,5,6"),
        ("2", 3, "1,2,3"),
        ("3", 3, "1,2,3"),
        ("1", 2, "1,2,3"),
    ]
    # The first and last columns are both the sample name: that is the
    # behaviour under test.
    expected_rows = [
        [sample, 1, targets, 1000, recover, sample]
        for sample, targets, recover in per_sample
    ]
    expected = pl.DataFrame(
        expected_rows, orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS
    )

    observed = pipeline(
        edges,
        sizes,
        MAX_COASSEMBLY_SAMPLES=1,
        MIN_COASSEMBLY_SAMPLES=1,
        MAX_RECOVERY_SAMPLES=4,
        single_assembly=True,
    )

    self.assertDataFrameEqual(expected, observed, check_row_order=False)

def test_cluster_only_large_clusters(self):
elusive_edges = pl.DataFrame([
["match", 2, "1,2", "9,10"],
Expand Down Expand Up @@ -635,8 +671,8 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
], orient="row", schema=READ_SIZE_COLUMNS)

expected = pl.DataFrame([
["2", 1, 6, 1000, "1,2,3", "coassembly_0"],
["1", 1, 5, 1000, "1,2,3", "coassembly_1"],
["2", 1, 6, 1000, "1,2,3", "2"],
["1", 1, 5, 1000, "1,2,3", "1"],
], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS)
observed = pipeline(
elusive_edges,
Expand All @@ -645,6 +681,7 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
MIN_COASSEMBLY_SAMPLES=1,
MAX_RECOVERY_SAMPLES=4,
COASSEMBLY_SAMPLES=["1", "2"],
single_assembly=True,
)
self.assertDataFrameEqual(expected, observed)

Expand Down
8 changes: 4 additions & 4 deletions test/test_coassemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -1590,31 +1590,31 @@ def test_coassemble_preclustered_single_assembly(self):
"3",
"3624",
"sample_3,sample_5",
"coassembly_0"
"sample_5"
]),
"\t".join([
"sample_3",
"1",
"2",
"3624",
"sample_3,sample_5",
"coassembly_1"
"sample_3"
]),
"\t".join([
"sample_2",
"1",
"2",
"3926",
"sample_1,sample_2",
"coassembly_2"
"sample_2"
]),
"\t".join([
"sample_1",
"1",
"2",
"4832",
"sample_1,sample_2",
"coassembly_3"
"sample_1"
]),
""
]
Expand Down
36 changes: 18 additions & 18 deletions test/test_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,15 @@ def test_single(self):
"4",
"4832",
"sample_1,sample_2,sample_3",
"coassembly_0"
"sample_1"
]),
"\t".join([
"sample_2",
"1",
"3",
"3926",
"sample_1,sample_2,sample_3",
"coassembly_1"
"sample_2"
]),
""
]
Expand All @@ -94,19 +94,19 @@ def test_single(self):
SAMPLE_READS_FORWARD.split(" ")[0],
"-2",
SAMPLE_READS_REVERSE.split(" ")[0],
"--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "assemble"),
"--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "assemble"),
"-n 64 -t 64 -m 500 --skip-qc &>",
os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_0_assemble.log"),
os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_1_assemble.log"),
""
]),
" ".join([
"aviary assemble --coassemble -1",
SAMPLE_READS_FORWARD.split(" ")[1],
"-2",
SAMPLE_READS_REVERSE.split(" ")[1],
"--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "assemble"),
"--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "assemble"),
"-n 64 -t 64 -m 500 --skip-qc &>",
os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_1_assemble.log"),
os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_2_assemble.log"),
""
]),
""
Expand All @@ -120,27 +120,27 @@ def test_single(self):
expected = "\n".join(
[
" ".join([
"aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "assemble", "assembly", "final_contigs.fasta"),
"aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "assemble", "assembly", "final_contigs.fasta"),
"-1",
SAMPLE_READS_FORWARD,
"-2",
SAMPLE_READS_REVERSE,
"--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_0", "recover"),
"--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_1", "recover"),
"--binning-only --refinery-max-iterations 0 "
"-n 32 -t 32 -m 250 --skip-qc &>",
os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_0_recover.log"),
os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_1_recover.log"),
""
]),
" ".join([
"aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "assemble", "assembly", "final_contigs.fasta"),
"aviary recover --assembly", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "assemble", "assembly", "final_contigs.fasta"),
"-1",
SAMPLE_READS_FORWARD,
"-2",
SAMPLE_READS_REVERSE,
"--output", os.path.join(test_dir, "coassemble", "coassemble", "coassembly_1", "recover"),
"--output", os.path.join(test_dir, "coassemble", "coassemble", "sample_2", "recover"),
"--binning-only --refinery-max-iterations 0 "
"-n 32 -t 32 -m 250 --skip-qc &>",
os.path.join(test_dir, "coassemble", "coassemble", "logs", "coassembly_1_recover.log"),
os.path.join(test_dir, "coassemble", "coassemble", "logs", "sample_2_recover.log"),
""
]),
""
Expand All @@ -161,14 +161,14 @@ def test_single(self):
"total_size",
]),
"\t".join([
"coassembly_0",
"sample_1",
"sample_1",
"1",
"4",
"4832",
]),
"\t".join([
"coassembly_1",
"sample_2",
"sample_2",
"1",
"3",
Expand Down Expand Up @@ -231,31 +231,31 @@ def test_single_preclustered(self):
"3",
"3624",
"sample_3,sample_5",
"coassembly_0"
"sample_5"
]),
"\t".join([
"sample_3",
"1",
"2",
"3624",
"sample_3,sample_5",
"coassembly_1"
"sample_3"
]),
"\t".join([
"sample_2",
"1",
"2",
"3926",
"sample_1,sample_2",
"coassembly_2"
"sample_2"
]),
"\t".join([
"sample_1",
"1",
"2",
"4832",
"sample_1,sample_2",
"coassembly_3"
"sample_1"
]),
""
]
Expand Down

0 comments on commit c6b65f5

Please sign in to comment.