🐛 🎨 🍏 Fix kingdom-handling and mounting NCBI databases into docker container #229

Merged 12 commits on Jan 30, 2022

16 changes: 14 additions & 2 deletions autometa/binning/recursive_dbscan.py
@@ -878,7 +878,6 @@ def main():

logger.info(f"Selected clustering method: {args.clustering_method}")

# Perform clustering w/o taxonomy
if args.taxonomy:
main_out = taxon_guided_binning(
main=main_df,
@@ -894,6 +893,7 @@ def main():
verbose=args.verbose,
)
else:
# Perform clustering w/o taxonomy
main_out = get_clusters(
main=main_df,
markers_df=markers_df,
@@ -914,4 +914,16 @@ def main():


if __name__ == "__main__":
main()
import sys

# Using an http error status code...
# From: https://kinsta.com/blog/http-status-codes/#200-status-codes
# 204: “No Content.”
# This code means that the server has successfully processed the request
# but is not going to return any content.

try:
main()
except (TableFormatError, BinningError) as err:
logger.warn(err)
sys.exit(204)
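The new `__main__` guard converts expected binning failures (`TableFormatError`, `BinningError`) into exit status 204 ("No Content"), which the Nextflow `BINNING` process further down ignores via its `errorStrategy`, so a sample with no recoverable bins no longer terminates the whole pipeline. Below is a minimal sketch of how a caller could honor that convention, assuming the console script is named `autometa-binning` (flags elided; see the argparse options defined in `main()`):

```python
import subprocess
import sys

def run_binning(cmd: list) -> None:
    """Illustrative wrapper around the exit-code convention above:
    0 -> binning produced results, 204 -> ran fine but nothing to report,
    anything else -> a genuine failure."""
    proc = subprocess.run(cmd)
    if proc.returncode == 0:
        print("binning finished with results")
    elif proc.returncode == 204:
        print("no bins recovered; downstream steps may safely skip this sample")
    else:
        sys.exit(proc.returncode)

# Hypothetical invocation (the real flags are defined in main()'s argparse):
# run_binning(["autometa-binning"])
```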
2 changes: 1 addition & 1 deletion autometa/binning/summary.py
@@ -186,7 +186,7 @@ def get_metabin_stats(

df = bin_df[metabin_stat_cols].fillna(value={cluster_col: "unclustered"}).copy()

clusters = df.join(markers_df, how="outer").groupby("cluster")
clusters = df.join(markers_df, how="outer").groupby(cluster_col)

percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
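This one-line fix matters whenever the binning column is not literally named `cluster`, e.g. if the summary is run with `binning_column` set to `recruited_cluster` after unclustered recruitment; grouping by the hard-coded name would raise a `KeyError`. A toy pandas sketch (made-up contigs and a single marker column) of the corrected grouping:

```python
import pandas as pd

# Made-up data illustrating why the groupby key must match cluster_col.
cluster_col = "recruited_cluster"  # e.g. summarizing unclustered-recruitment results
bin_df = pd.DataFrame(
    {"length": [4000, 6000, 1000], cluster_col: ["bin_0001", "bin_0001", None]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)
markers_df = pd.DataFrame({"PF00119.20": [1, 0, 1]}, index=bin_df.index)

df = bin_df.fillna(value={cluster_col: "unclustered"}).copy()
clusters = df.join(markers_df, how="outer").groupby(cluster_col)  # groupby("cluster") would KeyError here
percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
print(percent_metagenome_size)  # bin_0001 ~90.9%, unclustered ~9.1%
```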
43 changes: 29 additions & 14 deletions autometa/binning/unclustered_recruitment.py
@@ -58,7 +58,6 @@
from sklearn.tree import DecisionTreeClassifier

from autometa.common.markers import load as load_markers
from autometa.common.exceptions import BinningError


logger = logging.getLogger(__name__)
@@ -239,7 +238,7 @@ def train_test_split_and_subset(

Returns
-------
(TrainingData, pd.DataFrame)
Tuple(TrainingData, pd.DataFrame)
0th: Features and bin labels split and subset by clustered/unclustered contigs
1st: Unclustered contigs features
"""
@@ -251,7 +250,7 @@
labels = get_labels(clustered)
# Finally retrieve features for the subset of clustered contigs
clustered_features_index = features.index.isin(clustered.index)
clustered_features = features[clustered_features_index]
clustered_features = features.loc[clustered_features_index].copy()
# Store features, targets and target_names in TrainingData for namespace lookup later.
train_data = TrainingData(
features=clustered_features,
@@ -261,7 +260,7 @@
# Now retrieve features for unclustered contigs
unclustered = binning[binning.cluster.isnull()]
unclustered_features_index = features.index.isin(unclustered.index)
unclustered_features = features[unclustered_features_index]
unclustered_features = features.loc[unclustered_features_index].copy()
return train_data, unclustered_features


@@ -518,6 +517,11 @@ def main():
help="Path to write Autometa main table used during/after unclustered recruitment.",
required=False,
)
parser.add_argument(
"--output-features",
help="Path to write Autometa features table used during unclustered recruitment.",
required=False,
)
parser.add_argument("--taxonomy", help="Path to taxonomy table.")
parser.add_argument(
"--taxa-dimensions",
@@ -576,7 +580,7 @@ def main():
prev_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]
if not prev_num_unclustered:
logger.warning("No unclustered contigs are available to recruit!")
sys.exit(0)
sys.exit(204)
markers_df = load_markers(fpath=args.markers, format="wide")

logger.debug(
@@ -619,22 +623,33 @@ def main():
f"unclustered {prev_num_unclustered} -> {now_num_unclustered} (recruited {n_recruited} contigs) in {n_runs} runs"
)
# Re-read in the binning dataframe to merge with the newly recruited contigs
prev_bin_df = pd.read_csv(
args.binning, sep="\t", index_col="contig", usecols=["contig", "cluster"]
)
prev_bin_df = pd.read_csv(args.binning, sep="\t", index_col="contig")
bin_df.rename(columns={"cluster": "recruited_cluster"}, inplace=True)
main_df = pd.merge(
prev_bin_df, bin_df[["recruited_cluster"]], left_index=True, right_index=True
binning_df = pd.merge(
prev_bin_df[["cluster"]],
bin_df[["recruited_cluster"]],
left_index=True,
right_index=True,
)
# Write unclustered recruitment results into main bin df
# index = 'contig', cols = [..., 'cluster', 'recruited_cluster', ...]
main_df.to_csv(
# Write unclustered recruitment results into binning df
# index = 'contig', cols = ['cluster', 'recruited_cluster']
binning_df.to_csv(
args.output_binning, sep="\t", index=True, header=True, float_format="%.5f"
)
if args.output_main:
main_df = pd.merge(
prev_bin_df,
bin_df[["recruited_cluster"]],
left_index=True,
right_index=True,
)
main_df.to_csv(
args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
)
if args.output_features:
# Outputs features matrix used as input to recruitment algorithm
features.to_csv(
args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
args.output_features, sep="\t", index=True, header=True, float_format="%.5f"
)


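With these changes, the table written to `--output-binning` is trimmed to the contig index plus the `cluster` and `recruited_cluster` columns; the full merged table is only produced when `--output-main` is requested; and the features matrix now goes to the new `--output-features` path instead of overwriting the `--output-main` file. A toy pandas sketch (made-up contigs) of the merge behind the binning output:

```python
import pandas as pd

# Toy illustration of the table written to --output-binning:
# index = 'contig', cols = ['cluster', 'recruited_cluster']
prev_bin_df = pd.DataFrame(
    {"cluster": ["bin_0001", None, None], "coverage": [12.3, 8.1, 9.9]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)
# bin_df holds the recruitment results ('cluster' already renamed to 'recruited_cluster')
bin_df = pd.DataFrame(
    {"recruited_cluster": ["bin_0001", "bin_0001", None]}, index=prev_bin_df.index
)
binning_df = pd.merge(
    prev_bin_df[["cluster"]],
    bin_df[["recruited_cluster"]],
    left_index=True,
    right_index=True,
)
print(binning_df.columns.tolist())  # ['cluster', 'recruited_cluster']
```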
48 changes: 36 additions & 12 deletions autometa/common/kmers.py
@@ -621,7 +621,6 @@ def main():
datefmt="%m/%d/%Y %I:%M:%S %p",
level=logger.DEBUG,
)
skip_desc = "(will skip if file exists)"
cpus = mp.cpu_count()
parser = argparse.ArgumentParser(
description="Count k-mer frequencies of given `fasta`",
@@ -631,20 +630,18 @@
"--fasta",
help="Metagenomic assembly fasta file",
metavar="filepath",
required=True,
)
parser.add_argument(
"--kmers",
help=f"K-mers frequency tab-delimited table {skip_desc}",
help=f"K-mers frequency tab-delimited table (will skip if file exists)",
metavar="filepath",
required=True,
)
parser.add_argument(
"--size", help="k-mer size in bp", default=5, metavar="int", type=int
)
parser.add_argument(
"--norm-output",
help=f"Path to normalized kmers table {skip_desc}",
help=f"Path to normalized kmers table (will skip if file exists)",
metavar="filepath",
)
parser.add_argument(
@@ -666,7 +663,7 @@
)
parser.add_argument(
"--embedding-output",
help=f"Path to write embedded kmers table {skip_desc}",
help=f"Path to write embedded kmers table (will skip if file exists)",
metavar="filepath",
)
parser.add_argument(
Expand Down Expand Up @@ -704,23 +701,50 @@ def main():
)
args = parser.parse_args()

if os.path.exists(args.kmers) and not args.force:
df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
if not args.fasta and not args.kmers and not args.norm_output:
raise ValueError(
"At least one of --fasta, --kmers or --norm-output are required!"
)

norm_df = pd.DataFrame()

if (
args.norm_output
and not os.path.exists(args.norm_output)
and not args.fasta
and not args.kmers
):
# only normalized kmers were provided
raise FileNotFoundError(args.norm_output)
elif args.kmers and not os.path.exists(args.kmers) and not args.fasta:
# only kmer counts were provided
raise FileNotFoundError(args.kmers)
elif args.norm_output and os.path.exists(args.norm_output) and not args.force:
# We already have the normalized kmers
norm_df = pd.read_csv(args.norm_output, sep="\t", index_col="contig")
elif args.kmers and os.path.exists(args.kmers) and not args.force:
# We already have the kmer counts
kmers_df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
else:
df = count(
# Start with counting kmers
kmers_df = count(
assembly=args.fasta,
size=args.size,
out=args.kmers,
force=args.force,
cpus=args.cpus,
)

if args.norm_output:
df = normalize(
df=df, method=args.norm_method, out=args.norm_output, force=args.force
if args.norm_output and norm_df.empty:
norm_df = normalize(
df=kmers_df,
method=args.norm_method,
out=args.norm_output,
force=args.force,
)

if args.embedding_output:
df = kmers_df if norm_df.empty else norm_df
embedded_df = embed(
kmers=df,
out=args.embedding_output,
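Spelled out, the reworked argument handling lets the k-mer step resume from whichever artifact already exists (normalized table, then raw counts, then the assembly) and fail fast when the only requested starting point is missing; with `--force`, the branching falls through to re-counting from the assembly. A simplified, hypothetical restatement of that precedence as a standalone function (not part of the Autometa API):

```python
import os

def resolve_kmer_start(fasta=None, kmers=None, norm_output=None, force=False):
    """Return which artifact the step above would start from (illustration only)."""
    if not fasta and not kmers and not norm_output:
        raise ValueError("At least one of --fasta, --kmers or --norm-output are required!")
    if norm_output and not os.path.exists(norm_output) and not fasta and not kmers:
        raise FileNotFoundError(norm_output)  # only normalized kmers requested, but missing
    if kmers and not os.path.exists(kmers) and not fasta:
        raise FileNotFoundError(kmers)  # only kmer counts requested, but missing
    if norm_output and os.path.exists(norm_output) and not force:
        return "existing normalized table"  # reuse args.norm_output
    if kmers and os.path.exists(kmers) and not force:
        return "existing kmer counts"  # reuse args.kmers, normalize if requested
    return "count k-mers from the assembly"  # fall through to count(assembly=args.fasta, ...)
```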
23 changes: 15 additions & 8 deletions conf/modules.config
@@ -21,17 +21,17 @@
*/
params {
modules {
'analyze_kmers_options' {
'count_kmers_options' {
publish_by_meta = ['id']
publish_dir = "kmer_analysis"
publish_dir = "count_kmer_analysis"
}
'binning_options' {
'normalize_kmers_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/bins"
publish_dir = "normalize_kmer_analysis"
}
'binning_summary_options' {
'embed_kmers_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/binning_summary"
publish_dir = "embed_kmer_analysis"
}
'diamond_blastp_options' {
args = "--evalue 1e-5 --max-target-seqs 200 -b 6 --outfmt 6"
@@ -112,10 +112,17 @@ params {
'taxon_assignment' {
publish_by_meta = ['id']
}
'binning_options' {
publish_by_meta = ['id']
publish_dir = "binning"
}
'unclustered_recruitment_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/unclustered_recruitment_results"

publish_dir = "unclustered_recruitment"
}
'binning_summary_options' {
publish_by_meta = ['id']
publish_dir = "binning_summary"
}
}
}
6 changes: 0 additions & 6 deletions modules.json
@@ -6,12 +6,6 @@
"bowtie2/align": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"fastqc": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"multiqc": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"prodigal": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
}
7 changes: 5 additions & 2 deletions modules/local/bin_contigs.nf → modules/local/binning.nf
@@ -4,8 +4,8 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

process BIN_CONTIGS {
tag "Performing Autometa binning on ${meta.id}"
process BINNING {
tag "sample:${meta.id}, clustering:${params.clustering_method}, completeness:${params.completeness}, purity:${params.purity}, cov.std.dev.:${params.cov_stddev_limit}, gc.std.dev.:${params.gc_stddev_limit}"
label 'process_high'
publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode

@@ -16,6 +16,9 @@ process BIN_CONTIGS {
container "jasonkwan/autometa:${params.autometa_image_tag}"
}

// No markers were annotated for contigs in the table
errorStrategy { task.exitStatus in 204 ? 'ignore' : 'terminate' }

input:
tuple val(meta), path(kmers), path(coverage), path(gc_content), path(markers), path(taxonomy)

5 changes: 2 additions & 3 deletions modules/local/binning_summary.nf
@@ -4,8 +4,6 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

params.taxdump_tar_gz_dir = [:]

process BINNING_SUMMARY {
tag "Gathering binning summary for ${meta.id}"
label 'process_high'
@@ -22,6 +20,7 @@ process BINNING_SUMMARY {
input:
tuple val(meta), path(binning_main), path(markers), path(metagenome)
val(binning_column)
path(ncbi)

output:
tuple val(meta), path("metabin_stats.tsv") , emit: stats
@@ -33,7 +32,7 @@ process BINNING_SUMMARY {
def software = getSoftwareName(task.process)
"""
autometa-binning-summary \\
--ncbi ${params.taxdump_tar_gz_dir} \\
--ncbi $ncbi \\
--binning-main $binning_main \\
--markers $markers \\
--metagenome $metagenome \\
38 changes: 38 additions & 0 deletions modules/local/count_kmers.nf
@@ -0,0 +1,38 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process COUNT_KMERS {
tag "Counting ${params.kmer_size}-mers for ${meta.id}"
label 'process_medium'
publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode

conda (params.enable_conda ? "autometa" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/autometa"
} else {
container "jasonkwan/autometa:${params.autometa_image_tag}"
}

input:
tuple val(meta), path(metagenome)

output:
tuple val(meta), path("kmers.tsv") , emit: counts
path '*.version.txt' , emit: version

script:
def software = getSoftwareName(task.process)
"""
autometa-kmers \\
--fasta $metagenome \\
--kmers "kmers.tsv" \\
--size "${params.kmer_size}" \\
--cpus "${task.cpus}" \\
--seed 42

echo "TODO" > autometa.version.txt
"""
}