Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add separate subcommand for single-assembly #140

Merged
merged 4 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions binchicken/binchicken.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,12 @@ def main():
"binchicken coassemble --forward reads_1.1.fq ... --reverse reads_1.2.fq ... --single-assembly"
),
],
"single": [
btu.Example(
"find relevant samples for differential coverage binning (no coassembly)",
"binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ..."
),
],
"evaluate": [
btu.Example(
"evaluate a completed coassembly",
Expand Down Expand Up @@ -1358,6 +1364,11 @@ def add_coassemble_arguments(parser):

###########################################################################

single_parser = main_parser.new_subparser("single", "Single-assembly of reads with binning samples chosen by unbinned single-copy marker genes")
add_coassemble_arguments(single_parser)

###########################################################################

evaluate_parser = main_parser.new_subparser("evaluate", "Evaluate coassembled bins")
# Base arguments
evaluate_base = evaluate_parser.add_argument_group("Base input arguments")
Expand Down Expand Up @@ -1534,6 +1545,11 @@ def coassemble_output_argument_verification(args, evaluate=False):
coassemble_argument_verification(args)
coassemble(args)

if args.subparser_name == "single":
args.single_assembly = True
coassemble_argument_verification(args)
coassemble(args)

elif args.subparser_name == "evaluate":
coassemble_output_argument_verification(args, evaluate=True)
if not args.singlem_metapackage and not os.environ['SINGLEM_METAPACKAGE_PATH']:
Expand Down
2 changes: 2 additions & 0 deletions binchicken/workflow/coassemble.smk
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ checkpoint cluster_graph:
max_recovery_samples = config["max_recovery_samples"],
coassembly_samples = config["coassembly_samples"],
exclude_coassemblies = config["exclude_coassemblies"],
single_assembly = config["single_assembly"],
threads: 64
resources:
mem_mb=get_mem_mb,
Expand Down Expand Up @@ -855,6 +856,7 @@ rule aviary_recover:
"-1 {params.reads_1} "
"-2 {params.reads_2} "
"--output {params.output} "
"--checkm2-db-path {params.checkm2} "
"{params.fast} "
"-n {params.threads} "
"-t {params.threads} "
Expand Down
9 changes: 7 additions & 2 deletions binchicken/workflow/scripts/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def pipeline(
MIN_CLUSTER_TARGETS=1,
MAX_SAMPLES_COMBINATIONS=100,
COASSEMBLY_SAMPLES=[],
EXCLUDE_COASSEMBLIES=[]):
EXCLUDE_COASSEMBLIES=[],
single_assembly=False):

logging.info(f"Polars using {str(pl.thread_pool_size())} threads")

Expand Down Expand Up @@ -308,7 +309,9 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
.with_row_index("coassembly")
.select(
"samples", "length", "total_targets", "total_size", "recover_samples",
coassembly = pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8)
coassembly = pl.when(single_assembly)
.then(pl.col("samples"))
.otherwise(pl.lit("coassembly_") + pl.col("coassembly").cast(pl.Utf8))
)
)

Expand All @@ -333,6 +336,7 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
MAX_RECOVERY_SAMPLES = snakemake.params.max_recovery_samples
COASSEMBLY_SAMPLES = snakemake.params.coassembly_samples
EXCLUDE_COASSEMBLIES = snakemake.params.exclude_coassemblies
single_assembly = snakemake.params.single_assembly
elusive_edges_path = snakemake.input.elusive_edges
read_size_path = snakemake.input.read_size
weightings_path = snakemake.input.targets_weighted
Expand Down Expand Up @@ -363,5 +367,6 @@ def filter_max_coassembly_size(df, MAX_COASSEMBLY_SIZE):
EXCLUDE_COASSEMBLIES=EXCLUDE_COASSEMBLIES,
MIN_CLUSTER_TARGETS=min_cluster_targets,
MAX_SAMPLES_COMBINATIONS=100,
single_assembly=single_assembly,
)
clusters.write_csv(elusive_clusters_path, separator="\t")
3 changes: 2 additions & 1 deletion build_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_version(relpath):
logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

subdir_and_commands = [
["tools", ["coassemble", "evaluate", "iterate", "update"]],
["tools", ["coassemble", "single", "evaluate", "iterate", "update"]],
]

for subdir, commands in subdir_and_commands:
Expand All @@ -56,6 +56,7 @@ def get_version(relpath):
# Remove everything before the options section
splitters = {
"coassemble": "OPTIONS",
"single": "OPTIONS",
"evaluate": "OPTIONS",
"iterate": "OPTIONS",
"update": "OPTIONS",
Expand Down
4 changes: 2 additions & 2 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ Bin Chicken is a tool that performs targeted recovery of low abundance metagenom
```bash
# Assemble and recover from each sample individually
# 20 samples used for differential abundance binning
binchicken coassemble \
binchicken single \
--forward-list samples_forward.txt --reverse-list samples_reverse.txt \
--run-aviary --single-assembly \
--run-aviary \
--cores 64 --output binchicken_single_assembly

# Assemble and recover from 2-sample coassemblies
Expand Down
41 changes: 41 additions & 0 deletions docs/preludes/single_prelude.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

Snakemake pipeline to discover samples for differential abundance binning for single-sample assembly based on co-occurrence of single-copy marker genes, excluding those genes present in reference genomes (e.g. previously recovered genomes).

```bash
# Example: find relevant samples for differential coverage binning (no coassembly)
binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ...

# Example: run proposed assemblies through aviary with cluster submission
# Create snakemake profile at ~/.config/snakemake/qsub with cluster, cluster-status, cluster-cancel, etc.
# See https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles
binchicken single --forward reads_1.1.fq ... --reverse reads_1.2.fq ... --run-aviary \
--snakemake-profile qsub --cluster-submission --local-cores 64 --cores 64
```

Important options:

- Maximum number of recovery samples for differential-abundance binning can be specified (`--max-recovery-samples`, default 20)
- Genomes can be provided and matching marker genes will be excluded (`--genomes`)
- Reads can be mapped to the matched bins with only unmapped reads being assembled (`--assemble-unmapped`).
- Assembly and recovery running options:
- Run directly through Aviary (`--run-aviary`)
- Run Aviary commands manually (see `coassemble/commands` in output)
- Run assemblies with differential-abundance-binning samples with the tool of your choice (see `coassemble/target/elusive_clusters.tsv` in output)
- The taxa of the considered sequences can be filtered to target a specific taxon (e.g. `--taxa-of-interest "p__Planctomycetota"`).

Paired-end reads of the form reads_1.1.fq, reads_1_1.fq and reads_1_R1.fq, where reads_1 is the sample name, are automatically detected and matched to their basename.
Most intermediate files can be provided to skip intermediate steps (e.g. SingleM otu tables, read sizes or genome transcripts; see `binchicken coassemble --full-help`).

## Kmer preclustering

Clustering groups of more than 1000 samples quickly leads to memory issues due to combinatorics.
Kmer preclustering can be used (default if >1000 samples are provided, or use `--kmer-precluster always`) to reduce the number of combinations that are considered.
This greatly reduces memory usage and allows scaling up to at least 250k samples.
Kmer preclustering can be disabled with `--kmer-precluster never`.

## Cluster submission

Snakemake profiles can be used to automatically submit jobs to HPC clusters (`--snakemake-profile`).
Note that Aviary assemble commands are submitted to the cluster, while Aviary recover commands are run locally such that Aviary handles cluster submission.
The `--cluster-submission` flag sets the local Aviary recover thread usage to 1, to enable multiple runs in parallel by setting `--local-cores` to greater than 1.
This is required to prevent `--local-cores` from limiting the number of threads per submitted job.
3 changes: 2 additions & 1 deletion docs/preludes/update_prelude.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@

Applies further processing to a previous Bin Chicken coassemble run.
Note that all coassemblies can be run by rerunning the `coassemble` command unchanged except for adding `--run-aviary`.

Any combinations of the following:

- Generating unmapped reads files (`--assemble-unmapped`)
- Running assembly/recovery through Aviary (`--run-aviary`)
- Running assembly/recovery for all/specific coassemblies through Aviary (`--run-aviary`, `--coassemblies`)
- Downloading SRA reads (`--sra`)

```bash
Expand Down
1 change: 1 addition & 0 deletions doctave.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ navigation:
- path: docs/tools
children:
- path: docs/tools/coassemble.md
- path: docs/tools/single.md
- path: docs/tools/update.md
- path: docs/tools/iterate.md
- path: docs/tools/evaluate.md
45 changes: 41 additions & 4 deletions test/test_cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
}

class Tests(unittest.TestCase):
def assertDataFrameEqual(self, a, b):
assert_frame_equal(a, b, check_dtypes=False)
def assertDataFrameEqual(self, a, b, check_row_order=True):
    """Assert that two polars DataFrames are equal.

    Dtypes are never compared; row order is compared unless
    check_row_order is False.
    """
    assert_frame_equal(
        a,
        b,
        check_dtypes=False,
        check_row_order=check_row_order,
    )

def assertSeriesEqual(self, a, b):
    """Assert that two polars Series are equal, ignoring dtypes."""
    assert_series_equal(
        a,
        b,
        check_dtypes=False,
    )
Expand Down Expand Up @@ -291,6 +291,42 @@ def test_cluster_no_edges(self):
observed = pipeline(elusive_edges, read_size)
self.assertDataFrameEqual(expected, observed)

def test_cluster_single_assembly(self):
    """Single-assembly mode: each sample forms its own one-sample cluster.

    With single_assembly=True the coassembly name column is expected to
    be the sample name itself rather than a generated "coassembly_N"
    label (compare the final column of `expected` below).
    """
    # Two disjoint groups of co-occurring samples, {1,2,3} and {4,5,6},
    # with pairwise edges sharing increasing numbers of targets.
    elusive_edges = pl.DataFrame([
        ["match", 2, "1,2", "1"],
        ["match", 2, "1,3", "1,2"],
        ["match", 2, "2,3", "1,2,3"],
        ["match", 2, "4,5", "4,5,6,7"],
        ["match", 2, "4,6", "4,5,6,7,8"],
        ["match", 2, "5,6", "4,5,6,7,8,9"],
    ], orient="row", schema=ELUSIVE_EDGES_COLUMNS)
    # Uniform read sizes so cluster ordering is driven by target counts.
    read_size = pl.DataFrame([
        ["1", 1000],
        ["2", 1000],
        ["3", 1000],
        ["4", 1000],
        ["5", 1000],
        ["6", 1000],
    ], orient="row", schema=READ_SIZE_COLUMNS)

    # One row per sample; recover_samples come from each sample's group,
    # and the name column equals the sample name (single-assembly naming).
    expected = pl.DataFrame([
        ["5", 1, 6, 1000, "4,5,6", "5"],
        ["6", 1, 6, 1000, "4,5,6", "6"],
        ["4", 1, 5, 1000, "4,5,6", "4"],
        ["2", 1, 3, 1000, "1,2,3", "2"],
        ["3", 1, 3, 1000, "1,2,3", "3"],
        ["1", 1, 2, 1000, "1,2,3", "1"],
    ], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS)
    # MAX/MIN_COASSEMBLY_SAMPLES=1 restricts clusters to single samples.
    observed = pipeline(
        elusive_edges,
        read_size,
        MAX_COASSEMBLY_SAMPLES=1,
        MIN_COASSEMBLY_SAMPLES=1,
        MAX_RECOVERY_SAMPLES=4,
        single_assembly=True
    )
    # Row order of single-sample clusters is not guaranteed — compare unordered.
    self.assertDataFrameEqual(expected, observed, check_row_order=False)


def test_cluster_only_large_clusters(self):
elusive_edges = pl.DataFrame([
["match", 2, "1,2", "9,10"],
Expand Down Expand Up @@ -635,8 +671,8 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
], orient="row", schema=READ_SIZE_COLUMNS)

expected = pl.DataFrame([
["2", 1, 6, 1000, "1,2,3", "coassembly_0"],
["1", 1, 5, 1000, "1,2,3", "coassembly_1"],
["2", 1, 6, 1000, "1,2,3", "2"],
["1", 1, 5, 1000, "1,2,3", "1"],
], orient="row", schema=ELUSIVE_CLUSTERS_COLUMNS)
observed = pipeline(
elusive_edges,
Expand All @@ -645,6 +681,7 @@ def test_cluster_restrict_coassembly_samples_single_assembly(self):
MIN_COASSEMBLY_SAMPLES=1,
MAX_RECOVERY_SAMPLES=4,
COASSEMBLY_SAMPLES=["1", "2"],
single_assembly=True,
)
self.assertDataFrameEqual(expected, observed)

Expand Down
12 changes: 6 additions & 6 deletions test/test_coassemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,15 +751,15 @@ def test_coassemble_single_assembly(self):
"4",
"4832",
"sample_1,sample_2,sample_3",
"coassembly_0"
"sample_1"
]),
"\t".join([
"sample_2",
"1",
"3",
"3926",
"sample_1,sample_2,sample_3",
"coassembly_1"
"sample_2"
]),
""
]
Expand Down Expand Up @@ -1590,31 +1590,31 @@ def test_coassemble_preclustered_single_assembly(self):
"3",
"3624",
"sample_3,sample_5",
"coassembly_0"
"sample_5"
]),
"\t".join([
"sample_3",
"1",
"2",
"3624",
"sample_3,sample_5",
"coassembly_1"
"sample_3"
]),
"\t".join([
"sample_2",
"1",
"2",
"3926",
"sample_1,sample_2",
"coassembly_2"
"sample_2"
]),
"\t".join([
"sample_1",
"1",
"2",
"4832",
"sample_1,sample_2",
"coassembly_3"
"sample_1"
]),
""
]
Expand Down
Loading
Loading