Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add separate subcommand for single-assembly #140

Merged
merged 4 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions binchicken/binchicken.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,12 @@ def main():
"binchicken coassemble --forward reads_1.1.fq ... --reverse reads_1.2.fq ... --single-assembly"
),
],
"single": [
btu.Example(
"find relevant samples for differential coverage binning (no coassembly)",
"binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ..."
),
],
"evaluate": [
btu.Example(
"evaluate a completed coassembly",
Expand Down Expand Up @@ -1358,6 +1364,11 @@ def add_coassemble_arguments(parser):

###########################################################################

single_parser = main_parser.new_subparser("single", "Single-assembly of reads with binning samples chosen by unbinned single-copy marker genes")
add_coassemble_arguments(single_parser)

###########################################################################

evaluate_parser = main_parser.new_subparser("evaluate", "Evaluate coassembled bins")
# Base arguments
evaluate_base = evaluate_parser.add_argument_group("Base input arguments")
Expand Down Expand Up @@ -1534,6 +1545,11 @@ def coassemble_output_argument_verification(args, evaluate=False):
coassemble_argument_verification(args)
coassemble(args)

if args.subparser_name == "single":
args.single_assembly = True
coassemble_argument_verification(args)
coassemble(args)

elif args.subparser_name == "evaluate":
coassemble_output_argument_verification(args, evaluate=True)
if not args.singlem_metapackage and not os.environ['SINGLEM_METAPACKAGE_PATH']:
Expand Down
2 changes: 2 additions & 0 deletions binchicken/workflow/coassemble.smk
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ checkpoint cluster_graph:
max_recovery_samples = config["max_recovery_samples"],
coassembly_samples = config["coassembly_samples"],
exclude_coassemblies = config["exclude_coassemblies"],
single_assembly = config["single_assembly"],
threads: 64
resources:
mem_mb=get_mem_mb,
Expand Down Expand Up @@ -855,6 +856,7 @@ rule aviary_recover:
"-1 {params.reads_1} "
"-2 {params.reads_2} "
"--output {params.output} "
"--checkm2-db-path {params.checkm2} "
"{params.fast} "
"-n {params.threads} "
"-t {params.threads} "
Expand Down
9 changes: 7 additions & 2 deletions binchicken/workflow/scripts/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def pipeline(
MIN_CLUSTER_TARGETS=1,
MAX_SAMPLES_COMBINATIONS=100,
COASSEMBLY_SAMPLES=[],
EXCLUDE_COASSEMBLIES=[]):
EXCLUDE_COASSEMBLIES=[],
single_assembly=False):

logging.info(f"Polars using {str(pl.thread_pool_size())} threads")

Expand Down Expand Up @@ -308,7 +309,9 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
.with_row_index("coassembly")
.select(
"samples", "length", "total_targets", "total_size", "recover_samples",
coassembly = pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8)
coassembly = pl.when(single_assembly)
.then(pl.col("samples"))
.otherwise(pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8))
)
)

Expand All @@ -333,6 +336,7 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
MAX_RECOVERY_SAMPLES = snakemake.params.max_recovery_samples
COASSEMBLY_SAMPLES = snakemake.params.coassembly_samples
EXCLUDE_COASSEMBLIES = snakemake.params.exclude_coassemblies
single_assembly = snakemake.params.single_assembly
elusive_edges_path = snakemake.input.elusive_edges
read_size_path = snakemake.input.read_size
weightings_path = snakemake.input.targets_weighted
Expand Down Expand Up @@ -363,5 +367,6 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
EXCLUDE_COASSEMBLIES=EXCLUDE_COASSEMBLIES,
MIN_CLUSTER_TARGETS=min_cluster_targets,
MAX_SAMPLES_COMBINATIONS=100,
single_assembly=single_assembly,
)
clusters.write_csv(elusive_clusters_path, separator="\t")
3 changes: 2 additions & 1 deletion build_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_version(relpath):
logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

subdir_and_commands = [
["tools", ["coassemble", "evaluate", "iterate", "update"]],
["tools", ["coassemble", "single", "evaluate", "iterate", "update"]],
]

for subdir, commands in subdir_and_commands:
Expand All @@ -56,6 +56,7 @@ def get_version(relpath):
# Remove everything before the options section
splitters = {
"coassemble": "OPTIONS",
"single": "OPTIONS",
"evaluate": "OPTIONS",
"iterate": "OPTIONS",
"update": "OPTIONS",
Expand Down
4 changes: 2 additions & 2 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ Bin Chicken is a tool that performs targeted recovery of low abundance metagenom
```bash
# Assemble and recover from each sample individually
# 20 samples used for differential abundance binning
binchicken coassemble \
binchicken single \
--forward-list samples_forward.txt --reverse-list samples_reverse.txt \
--run-aviary --single-assembly \
--run-aviary \
--cores 64 --output binchicken_single_assembly

# Assemble and recover from 2-sample coassemblies
Expand Down
41 changes: 41 additions & 0 deletions docs/preludes/single_prelude.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

Snakemake pipeline to discover samples for differential abundance binning for single-sample assembly based on co-occurrence of single-copy marker genes, excluding those genes present in reference genomes (e.g. previously recovered genomes).

```bash
# Example: find relevant samples for differential coverage binning (no coassembly)
binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ...

# Example: run proposed assemblies through aviary with cluster submission
# Create snakemake profile at ~/.config/snakemake/qsub with cluster, cluster-status, cluster-cancel, etc.
# See https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles
binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ... --run-aviary \
--snakemake-profile qsub --cluster-submission --local-cores 64 --cores 64
```

Important options:

- Maximum number of recovery samples for differential-abundance binning can be specified (`--max-recovery-samples`, default 20)
- Genomes can be provided and matching marker genes will be excluded (`--genomes`)
- Reads can be mapped to the matched bins with only unmapped reads being assembled (`--assemble-unmapped`).
- Assembly and recovery running options:
- Run directly through Aviary (`--run-aviary`)
- Run Aviary commands manually (see `coassemble/commands` in output)
- Run assemblies with differential-abundance-binning samples with the tool of your choice (see `coassemble/target/elusive_clusters.tsv` in output)
- The taxa of the considered sequences can be filtered to target a specific taxon (e.g. `--taxa-of-interest "p__Planctomycetota"`).

Paired-end reads of the form reads_1.1.fq, reads_1_1.fq and reads_1_R1.fq, where reads_1 is the sample name, are automatically detected and matched to their basename.
Most intermediate files can be provided to skip intermediate steps (e.g. SingleM otu tables, read sizes or genome transcripts; see `binchicken coassemble --full-help`).

## Kmer preclustering

Clustering groups of more than 1000 samples quickly leads to memory issues due to combinatorics.
Kmer preclustering can be used (default if >1000 samples are provided, or use `--kmer-precluster always`) to reduce the number of combinations that are considered.
This greatly reduces memory usage and allows scaling up to at least 250k samples.
Kmer preclustering can be disabled with `--kmer-precluster never`.

## Cluster submission

Snakemake profiles can be used to automatically submit jobs to HPC clusters (`--snakemake-profile`).
Note that Aviary assemble commands are submitted to the cluster, while Aviary recover commands are run locally such that Aviary handles cluster submission.
The `--cluster-submission` flag sets the local Aviary recover thread usage to 1, to enable multiple runs in parallel by setting `--local-cores` to greater than 1.
This is required to prevent `--local-cores` from limiting the number of threads per submitted job.
3 changes: 2 additions & 1 deletion docs/preludes/update_prelude.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@

Applies further processing to a previous Bin Chicken coassemble run.
Note that all coassemblies can be run by rerunning the `coassemble` command unchanged except for adding `--run-aviary`.

Any combinations of the following:

- Generating unmapped reads files (`--assemble-unmapped`)
- Running assembly/recovery through Aviary (`--run-aviary`)
- Running assembly/recovery for all/specific coassemblies through Aviary (`--run-aviary`, `--coassemblies`)
- Downloading SRA reads (`--sra`)

```bash
Expand Down
1 change: 1 addition & 0 deletions doctave.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ navigation:
- path: docs/tools
children:
- path: docs/tools/coassemble.md
- path: docs/tools/single.md
- path: docs/tools/update.md
- path: docs/tools/iterate.md
- path: docs/tools/evaluate.md
45 changes: 41 additions & 4 deletions test/test_cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
}

class Tests(unittest.TestCase):
def assertDataFrameEqual(self, a, b):
assert_frame_equal(a, b, check_dtypes=False)
def assertDataFrameEqual(self, a, b, check_row_order=True):
    """Assert that two polars DataFrames are equal.

    Dtypes are never compared; row order is compared unless
    check_row_order is False.
    """
    assert_frame_equal(
        a,
        b,
        check_dtypes=False,
        check_row_order=check_row_order,
    )

def assertSeriesEqual(self, a, b):
    """Assert that two polars Series are equal, ignoring dtypes."""
    assert_series_equal(
        a,
        b,
        check_dtypes=False,
    )
Expand Down Expand Up @@ -291,6 +291,42 @@ def test_cluster_no_edges(self):
observed = pipeline(elusive_edges, read_size)
self.assertDataFrameEqual(expected, observed)

def test_cluster_single_assembly(self):
    """Single-assembly mode: each sample forms its own one-sample cluster.

    With single_assembly=True the coassembly name column is expected to
    be the sample name itself rather than a generated "coassembly_N"
    label (compare the final column of `expected` below).
    """
    # Two disjoint groups of co-occurring samples, {1,2,3} and {4,5,6},
    # with pairwise edges sharing increasing numbers of targets.
    elusive_edges = pl.DataFrame([
        ["match", 2, "1,2", "1"],
        ["match", 2, "1,3", "1,2"],
        ["match", 2, "2,3", "1,2,3"],
        ["match", 2, "4,5", "4,5,6,7"],
        ["match", 2, "4,6", "4,5,6,7,8"],
        ["match", 2, "5,6", "4,5,6,7,8,9"],
    ], orient="row", schema=ELUSIVE_EDGES_COLUMNS)
    # Uniform read sizes so cluster ordering is driven by target counts.
    read_size = pl.DataFrame([
        ["1", 1000],
        ["2", 1000],
        ["3", 1000],
        ["4", 1000],
        ["5", 1000],
        ["6", 1000],
    ], orient="row", schema=READ_SIZE_COLUMNS)

    # One row per sample; recover_samples come from each sample's group,
    # and the name column equals the sample name (single-assembly naming).
    expected = pl.DataFrame([
        ["5", 1, 6, 1000, "4,5,6", "5"],
        ["6", 1, 6, 1000, "4,5,6", "6"],
        ["4", 1, 5, 1000, "4,5,6", "4"],
        ["2", 1, 3, 1000, "1,2,3", "2"],
        ["3", 1, 3, 1000, "1,2,3", "3"],
        ["1", 1, 2, 1000, "1,2,3", "1"],
    ], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS)
    # MAX/MIN_COASSEMBLY_SAMPLES=1 restricts clusters to single samples.
    observed = pipeline(
        elusive_edges,
        read_size,
        MAX_COASSEMBLY_SAMPLES=1,
        MIN_COASSEMBLY_SAMPLES=1,
        MAX_RECOVERY_SAMPLES=4,
        single_assembly=True
    )
    # Row order of single-sample clusters is not guaranteed — compare unordered.
    self.assertDataFrameEqual(expected, observed, check_row_order=False)


def test_cluster_only_large_clusters(self):
elusive_edges = pl.DataFrame([
["match", 2, "1,2", "9,10"],
Expand Down Expand Up @@ -635,8 +671,8 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
], orient="row", schema=READ_SIZE_COLUMNS)

expected = pl.DataFrame([
["2", 1, 6, 1000, "1,2,3", "coassembly_0"],
["1", 1, 5, 1000, "1,2,3", "coassembly_1"],
["2", 1, 6, 1000, "1,2,3", "2"],
["1", 1, 5, 1000, "1,2,3", "1"],
], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS)
observed = pipeline(
elusive_edges,
Expand All @@ -645,6 +681,7 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
MIN_COASSEMBLY_SAMPLES=1,
MAX_RECOVERY_SAMPLES=4,
COASSEMBLY_SAMPLES=["1", "2"],
single_assembly=True,
)
self.assertDataFrameEqual(expected, observed)

Expand Down
12 changes: 6 additions & 6 deletions test/test_coassemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,15 +751,15 @@ def test_coassemble_single_assembly(self):
"4",
"4832",
"sample_1,sample_2,sample_3",
"coassembly_0"
"sample_1"
]),
"\t".join([
"sample_2",
"1",
"3",
"3926",
"sample_1,sample_2,sample_3",
"coassembly_1"
"sample_2"
]),
""
]
Expand Down Expand Up @@ -1590,31 +1590,31 @@ def test_coassemble_preclustered_single_assembly(self):
"3",
"3624",
"sample_3,sample_5",
"coassembly_0"
"sample_5"
]),
"\t".join([
"sample_3",
"1",
"2",
"3624",
"sample_3,sample_5",
"coassembly_1"
"sample_3"
]),
"\t".join([
"sample_2",
"1",
"2",
"3926",
"sample_1,sample_2",
"coassembly_2"
"sample_2"
]),
"\t".join([
"sample_1",
"1",
"2",
"4832",
"sample_1,sample_2",
"coassembly_3"
"sample_1"
]),
""
]
Expand Down
Loading
Loading