Skip to content

Commit

Permalink
Merge pull request #192 from rhysnewell/ISS-185
Browse files Browse the repository at this point in the history
  • Loading branch information
rhysnewell authored Mar 12, 2024
2 parents 5eb3ed4 + 276f6c0 commit 7195d25
Show file tree
Hide file tree
Showing 17 changed files with 724 additions and 179 deletions.
1 change: 1 addition & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ authors:
orcid: https://orcid.org/0000-0003-0670-7480
title: "Aviary: Hybrid assembly and genome recovery from metagenomes with Aviary"
version: 0.8.3
doi: 10.5281/zenodo.10158087
date-released: 2023-11-20
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/aviary/README.html)
![](https://anaconda.org/bioconda/aviary/badges/license.svg)
![](https://anaconda.org/bioconda/aviary/badges/version.svg)
![](https://anaconda.org/bioconda/aviary/badges/latest_release_relative_date.svg)
![](https://anaconda.org/bioconda/aviary/badges/platforms.svg)
[![DOI](https://zenodo.org/badge/271448699.svg)](https://zenodo.org/doi/10.5281/zenodo.10158086)


![](docs/_include/images/aviary_logo.png)

# Aviary
Expand Down Expand Up @@ -110,7 +118,7 @@ ask you to set these environment variables upon first running and if they are no
the `aviary configure` subcommand to reset the environment variables:

```commandline
aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --download
aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --singlem-metapackage-path /shared/db/singlem/ --download
```

This command will check if the databases exist at those given locations, if they don't then aviary will download and change
Expand Down
2 changes: 1 addition & 1 deletion aviary/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3"
__version__ = "0.9.0"
54 changes: 48 additions & 6 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,23 +598,65 @@ def main():
default=3
)

binning_group.add_argument(
'--extra-binners', '--extra_binners', '--extra-binner', '--extra_binner',
help='Optional list of extra binning algorithms to run. Can be any combination of: \n'
'maxbin, maxbin2, concoct \n'
'These binners are skipped by default as they can have long runtimes \n'
'N.B. specifying "maxbin" and "maxbin2" are equivalent \n',
dest='extra_binners',
nargs='*',
choices=["maxbin", "maxbin2", "concoct"]
)

binning_group.add_argument(
'--skip-binners', '--skip_binners', '--skip_binner', '--skip-binner',
help='Optional list of binning algorithms to skip. Can be any combination of: \n'
'rosella, semibin, metabat1, metabat2, metabat, vamb, concoct, maxbin2, maxbin \n'
'Capitals will be auto-corrected. N.B. specifying "metabat" will skip both \n'
'MetaBAT1 and MetaBAT2.',
'rosella, semibin, metabat1, metabat2, metabat, vamb \n'
'N.B. specifying "metabat" will skip both MetaBAT1 and MetaBAT2. \n',
dest='skip_binners',
nargs='*'
# default=["maxbin2"]
nargs='*',
choices=["rosella", "semibin", "metabat1", "metabat2", "metabat", "vamb"]
)

binning_group.add_argument(
'--binning-only', '--binning_only',
help='Only run up to the binning stage. Do not run SingleM, GTDB-tk, or CoverM',
type=str2bool,
nargs='?',
const=True,
dest='binning_only',
default=False,
)

binning_group.add_argument(
'--skip-abundances', '--skip_abundances',
help='Skip CoverM post-binning abundance calculations.',
dest='skip_abundances',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
'--skip-taxonomy', '--skip_taxonomy',
help='Skip GTDB-tk post-binning taxonomy assignment.',
dest='skip_taxonomy',
type=str2bool,
nargs='?',
const=True,
default=False,
)

binning_group.add_argument(
'--skip-singlem', '--skip_singlem',
help='Skip SingleM post-binning recovery assessment.',
dest='skip_singlem',
type=str2bool,
nargs='?',
const=True,
default=False,
action="store_true",
)

####################################################################
Expand Down
18 changes: 9 additions & 9 deletions aviary/envs/checkm2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ channels:
- bioconda
- defaults
dependencies:
- python>=3.6, <3.9
- scikit-learn=0.23.2
- h5py=2.10.0
- numpy=1.21.6
- diamond=2.0.4
- tensorflow >= 2.1.0, <=2.6
- python >=3.7, <3.9
- scikit-learn =0.23.2
- h5py =2.10.0
- numpy =1.19.2
- diamond =2.0.4
- tensorflow >= 2.2.0, <2.6.0
- lightgbm =3.2.1
- pandas >=1.4.0, <2.0
- scipy
- prodigal >=2.6.3
- pandas =1.4.0
- scipy =1.8.0
- prodigal =2.6.3
- setuptools
- requests
- packaging
Expand Down
4 changes: 2 additions & 2 deletions aviary/modules/annotation/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ rule download_gtdb:

# Uncompress and pipe output to TQDM
'echo "[INFO] - Extracting archive..."; '
'tar xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; '
'tar -xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; '

# Remove the file after successful extraction
'rm "$TARGET_TAR"; '
Expand All @@ -122,7 +122,7 @@ rule download_singlem_metapackage:
'logs/download_singlem.log'
shell:
'singlem data --output-directory {params.metapackage_folder}_tmp 2> {log} && '
'mv {params.metapackage_folder}_tmp/*.smpkg.zb/payload_directory {params.metapackage_folder}'
'mv {params.metapackage_folder}_tmp/*.smpkg.zb {params.metapackage_folder}'

rule download_checkm2:
params:
Expand Down
69 changes: 28 additions & 41 deletions aviary/modules/binning/binning.smk
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ rule finalise_stats:
checkm1_done = "bins/checkm.out",
checkm2_done = "bins/checkm2_output/quality_report.tsv",
coverage_file = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
gtdbtk_done = "data/gtdbtk/done"
gtdbtk_done = "data/gtdbtk/done" if not config["skip_taxonomy"] else []
output:
bin_stats = "bins/bin_info.tsv",
checkm_minimal = "bins/checkm_minimal.tsv"
Expand Down Expand Up @@ -732,89 +732,76 @@ rule checkm_das_tool:
rule singlem_pipe_reads:
output:
"data/singlem_out/metagenome.combined_otu_table.csv"
params:
package_path = os.environ["SINGLEM_METAPACKAGE_PATH"]
threads: min(config["max_threads"], 48)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 8*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
log:
"data/singlem_out/singlem_reads_log.txt"
"logs/singlem_pipe_reads_log.txt"
conda:
"../../envs/singlem.yaml"
script:
"../../scripts/singlem_reads.py"

rule singlem_appraise:
input:
metagenome = "data/singlem_out/metagenome.combined_otu_table.csv",
gtdbtk_done = "data/gtdbtk/done",
pipe_results = "data/singlem_out/metagenome.combined_otu_table.csv",
assembly = config["fasta"],
# gtdbtk_done = "data/gtdbtk/done",
bins_complete = "bins/checkm.out"
output:
"data/singlem_out/singlem_appraisal.tsv"
binned = "data/singlem_out/binned.otu_table.csv",
unbinned = "data/singlem_out/unbinned.otu_table.csv",
plot = "data/singlem_out/singlem_appraise.svg",
assembled = "data/singlem_out/assembled.otu_table.csv",
singlem = "data/singlem_out/singlem_appraisal.tsv"
params:
pplacer_threads = config['pplacer_threads'],
fasta = config['fasta']
package_path = os.environ["SINGLEM_METAPACKAGE_PATH"],
genomes_folder = "data/refined_bins/final_bins/"
threads: min(config["max_threads"], 48)
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 8*1024*attempt),
runtime = lambda wildcards, attempt: 12*60*attempt,
conda:
"../../envs/singlem.yaml"
log:
"data/singlem_out/singlem_log.txt"
shell:
# We use bash -c so that a non-zero exitstatus of the non-final commands doesn't cause the rule (and therefore aviary) to fail
"bash -c 'singlem pipe --threads {threads} --genome-fasta-file bins/final_bins/*.fna --otu-table data/singlem_out/genomes.otu_table.csv && "
"singlem pipe --threads {threads} --genome-fasta-file {params.fasta} --otu-table data/singlem_out/assembly.otu_table.csv && "
"singlem appraise --metagenome-otu-tables {input.metagenome} --genome-otu-tables data/singlem_out/genomes.otu_table.csv "
"--assembly-otu-table data/singlem_out/assembly.otu_table.csv "
"--plot data/singlem_out/singlem_appraise.svg --output-binned-otu-table data/singlem_out/binned.otu_table.csv "
"--output-unbinned-otu-table data/singlem_out/unbinned.otu_table.csv > data/singlem_out/singlem_appraisal.tsv' 2> {log} || "
"echo 'SingleM Errored, please check data/singlem_out/singlem_log.txt'; touch data/singlem_out/singlem_appraisal.tsv"

"logs/singlem_appraise_log.txt"
script:
"../../scripts/singlem_appraise.py"

rule recover_mags:
input:
final_bins = "bins/bin_info.tsv",
gtdbtk = "data/gtdbtk/done",
gtdbtk = "data/gtdbtk/done" if not config["skip_taxonomy"] else [],
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
singlem = "data/singlem_out/singlem_appraisal.tsv"
contig_coverage = "data/coverm.cov",
singlem = "data/singlem_out/singlem_appraisal.tsv" if not config["skip_singlem"] else [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
diversity = 'diversity/done'
bins = "bins/done"
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"ln -sr data/singlem_out/ diversity || echo 'SingleM linked'; "
"ln -sr data/gtdbtk taxonomy || echo 'GTDB-tk linked'; "
"touch bins/done; "
"touch diversity/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_recovery.py"

rule recover_mags_no_singlem:
input:
final_bins = "bins/bin_info.tsv",
gtdbtk = [],
coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
contig_coverage = "data/coverm.cov",
singlem = [],
conda:
"../../envs/coverm.yaml"
output:
bins = "bins/done",
threads:
config["max_threads"]
shell:
"cd bins/; "
"ln -s ../data/coverm_abundances.tsv ./; "
"ln -s ../data/coverm.cov ./; "
"cd ../; "
"touch bins/done; "
"rm -f data/binning_bams/*bam; "
"rm -f data/binning_bams/*bai; "
script:
"scripts/finalise_recovery.py"

# Special rule to help out with a buggy output
rule dereplicate_and_get_abundances_paired:
Expand Down
27 changes: 2 additions & 25 deletions aviary/modules/binning/envs/rosella.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,8 @@ channels:
- conda-forge
- numba
- bioconda
- defaults
dependencies:
- python >= 3.8, <= 3.10
- gcc
- cxx-compiler
- rosella >= 0.5.2
- numba >= 0.53, <= 0.57
- numpy <= 1.24
- joblib >= 1.1.0, <= 1.3
- scikit-bio >= 0.5.7
- umap-learn >= 0.5.3
- scipy <= 1.11
- pandas >= 1.3
- pynndescent >= 0.5.7
- hdbscan >= 0.8.28
- scikit-learn >= 1.0.2, <= 1.1
- flight-genome >= 1.6.1
- rosella >= 0.5.3
- flight-genome >= 1.6.3
- coverm >= 0.6.1
- seaborn
- imageio
- matplotlib
- tqdm
- tbb
- joblib
- pebble
- threadpoolctl
- biopython
- checkm-genome==1.1.3
44 changes: 44 additions & 0 deletions aviary/modules/binning/scripts/finalise_recovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
from pathlib import Path
import glob

def check_and_remove_base_file(file_path) -> str:
    """Remove any existing entry named after the last component of *file_path*.

    Only the base name of *file_path* is used, and it is looked up relative
    to the current working directory, so the caller must already have
    changed into the directory where a new link is about to be created.

    Args:
        file_path: Path whose final component names the entry to clear.

    Returns:
        The base name of *file_path*, whether or not anything was removed.
    """
    file_name = os.path.basename(file_path)
    # lexists() is also true for a symlink whose target no longer exists;
    # exists() follows the link and would report such a stale link as absent,
    # leaving it in place so that a later os.symlink() raises FileExistsError.
    if os.path.lexists(file_name):
        os.remove(file_name)

    return file_name


if __name__ == '__main__':
    # `snakemake` is not defined in this file; it is expected to be supplied
    # by Snakemake when a rule runs this file through its `script:` directive.
    rule_inputs = snakemake.input
    final_bins = rule_inputs.final_bins  # read from the rule inputs, not used below
    abundance_table = rule_inputs.coverm
    contig_cov_table = rule_inputs.contig_coverage
    gtdbtk_marker = rule_inputs.gtdbtk
    singlem_table = rule_inputs.singlem

    # Inside bins/, link each coverage table that was actually provided.
    # An empty input (length 0) means that step was skipped.
    os.chdir('bins/')
    for table in (abundance_table, contig_cov_table):
        if len(table) == 0:
            continue
        link_name = check_and_remove_base_file(table)
        os.symlink(f"../{table}", f"{link_name}")
    os.chdir('..')

    # Back in the starting directory, link the directory that holds each
    # provided result under a fixed name.
    for marker, link_name in ((gtdbtk_marker, "taxonomy"), (singlem_table, "diversity")):
        if len(marker) == 0:
            continue
        check_and_remove_base_file(link_name)
        os.symlink(f"{os.path.dirname(marker)}", link_name)

    # Delete every file matching the binning BAM glob pattern.
    for leftover in glob.glob('data/binning_bams/*.ba*'):
        os.remove(leftover)

    # Create the completion marker file.
    Path('bins/done').touch()
9 changes: 6 additions & 3 deletions aviary/modules/binning/scripts/finalise_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,12 @@ def get_taxonomy(rename_columns="Bin Id"):
taxa.append(df_arc)
except (FileNotFoundError, IndexError) as e:
pass

taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)

try:
taxa = pd.concat(taxa)
taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1)
except ValueError:
taxa = pd.DataFrame(columns=[rename_columns])
return taxa


Expand Down
Loading

0 comments on commit 7195d25

Please sign in to comment.