
Resolving issues #16, #17, #18, #21 with updates to the Autometa API and Logger #25

Merged
merged 20 commits into from Mar 26, 2020
Changes from 17 commits
20 commits
e16b4b8
updated add_contig_taxonomy.py to merge any nodes if databases nodes.…
evanroyrees Feb 19, 2020
2889600
added filepath handling to merged.dmp
evanroyrees Feb 19, 2020
6505fda
resolved merged_fpath variable
evanroyrees Feb 19, 2020
7d16d90
added extraction of merged.dmp from taxdump.tar.gz
evanroyrees Mar 2, 2020
faf8243
Merge pull request #9 from WiscEvan/master
jason-c-kwan Mar 2, 2020
3c2ef35
resolved #10 Contributors added and copyright year updated to 2020.
evanroyrees Mar 5, 2020
1d8982c
Merge branch 'master' of https://github.com/KwanLab/Autometa into dev
evanroyrees Mar 5, 2020
d53fb03
Resolves KwanLab/Autometa#16, Resolves KwanLab/Autometa#17 and simpli…
evanroyrees Mar 11, 2020
31e0e2f
fixed merge conflicts
evanroyrees Mar 11, 2020
b71e813
updates to project configuration handling metagenome numbering. Now r…
evanroyrees Mar 12, 2020
2b812da
significant simplification in API. Created Databases class in databas…
evanroyrees Mar 13, 2020
caeb6f9
updates to check dependencies and control of debugging information wh…
evanroyrees Mar 15, 2020
138c273
updated 'get_versions' function to return the version string if a pro…
evanroyrees Mar 15, 2020
ea282d1
hotfix to case where new project does not contain any metagenomes. sk…
evanroyrees Mar 15, 2020
ee1fbec
Changed OSError to subclass ChildProcessError in prodigal.py. This is…
evanroyrees Mar 16, 2020
bbfaecd
mostly resolves KwanLab/Autometa#21 and resolves KwanLab/Autometa#18.
evanroyrees Mar 17, 2020
5c73579
fix to extract contigs from orf_ids using specific prodigal version. …
evanroyrees Mar 19, 2020
bd007a8
updated pandas numpy module call for nan to pd.NA from pandas version…
evanroyrees Mar 23, 2020
4688efb
update to docstrings added new file key in config and comma-delimited…
evanroyrees Mar 24, 2020
406041b
returning from main rather than unnecessary sys import.
evanroyrees Mar 24, 2020
287 changes: 95 additions & 192 deletions autometa.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright 2020 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal,
@@ -27,221 +27,124 @@
import os
import sys

import multiprocessing as mp

from autometa.config.user import AutometaUser
from autometa.config import PROJECTS_DIR
from autometa.config import parse_config
from autometa.common.utilities import timeit
from autometa.common.metagenome import Metagenome

logger = logging.getLogger('autometa')


logger = logging.getLogger(__name__)
def init_logger(fpath=None, level=logging.INFO):
"""Initialize logger.

@timeit
def run(mgargs):
"""Run autometa.
By default will initialize streaming logger with DEBUG level messages.
If `fpath` is provided, will write DEBUG level messages to `fpath` and
set streaming messages to INFO.

Parameters
----------
mgargs : argparse.Namespace
metagenome args
fpath : str
</path/to/file.log>
level : int
Overwrite default logging level behavior with provided `level`.
This must be a constant from logging levels.
See https://docs.python.org/3/library/logging.html#levels for details.
e.g. logging.DEBUG, logging.INFO, etc. translate to 10, 20, etc.

Returns
-------
None
Description of returned object.
logging.Logger
logging's Logger object to emit messages via methods:
'warn','info','debug','error','exception','critical','fatal'

Raises
-------
ExceptionName
Why the exception is raised.

TypeError
`level` must be an int
ValueError
`level` must be one of 0, 10, 20, 30, 40, 50
"""
mg = Metagenome(
assembly=mgargs.files.metagenome,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
taxonomy_fpath=mgargs.files.taxonomy,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
taxon_method=mgargs.parameters.taxon_method)
try:
# Original (raw) file should not be manipulated so return new object
mg = mg.length_filter(
out=mgargs.files.length_filtered,
cutoff=mgargs.parameters.length_cutoff)
except FileExistsError as err:
logger.debug(f'{mgargs.files.length_filtered} already exists. Continuing..')
mg = Metagenome(
assembly=mgargs.files.length_filtered,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
taxonomy_fpath=mgargs.files.taxonomy,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
taxon_method=mgargs.parameters.taxon_method)
# I.e. asynchronous execution here (work-queue tasks)
mg.get_kmers(
kmer_size=mgargs.parameters.kmer_size,
normalized=mgargs.files.kmer_normalized,
out=mgargs.files.kmer_counts,
multiprocess=mgargs.parameters.kmer_multiprocess,
nproc=mgargs.parameters.cpus,
force=mgargs.parameters.force)

coverages = mg.get_coverages(
out=mgargs.files.coverages,
from_spades=mgargs.parameters.cov_from_spades,
sam=mgargs.files.sam,
bam=mgargs.files.bam,
lengths=mgargs.files.lengths,
bed=mgargs.files.bed)
# Filter by Kingdom
kingdoms = mg.get_kingdoms(
ncbi=mgargs.databases.ncbi,
usepickle=mgargs.parameters.usepickle,
blast=mgargs.files.blastp,
hits=mgargs.files.blastp_hits,
force=mgargs.parameters.force,
cpus=mgargs.parameters.cpus)

if not mgargs.parameters.kingdom in kingdoms:
raise KeyError(f'{mgargs.parameters.kingdom} not recovered in dataset. Recovered: {", ".join(kingdoms.keys())}')
mag = kingdoms.get(mgargs.parameters.kingdom)
bins_df = mag.get_binning(
method=mgargs.parameters.binning_method,
kmers=mgargs.files.kmer_counts,
embedded=mgargs.files.kmer_embedded,
do_pca=mgargs.parameters.do_pca,
pca_dims=mgargs.parameters.pca_dims,
embedding_method=mgargs.parameters.embedding_method,
coverage=coverages,
domain=mgargs.parameters.kingdom,
taxonomy=mgargs.files.taxonomy,
reverse=mgargs.parameters.reversed,
)
binning_cols = ['cluster','completeness','purity']
bins_df[binning_cols].to_csv(
mgargs.files.binning,
sep='\t',
index=True,
header=True)
# TODO: Refine bins by connection mapping, taxon, or other methods
# mag.refine(by='connections')
# mag.refine(by='taxa')
levels = {
logging.NOTSET,
logging.DEBUG,
logging.INFO,
logging.WARNING,
logging.ERROR,
logging.CRITICAL}
if type(level) is not int:
raise TypeError(f'{level} must be an int! {type(level)}')
if level and level not in levels:
raise ValueError(f'{level} not in levels: {levels}!')
formatter = logging.Formatter(
fmt='[%(asctime)s %(levelname)s] %(name)s: %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p')
# Construct file/stream logging handlers
streamhandler = logging.StreamHandler()
streamhandler.setFormatter(formatter)
if fpath:
filehandler = logging.FileHandler(fpath)
filehandler.setFormatter(formatter)
logger.addHandler(filehandler)

streamhandler.setLevel(level)
logger.addHandler(streamhandler)
logger.setLevel(logging.DEBUG)
return logger
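
# Illustration (not part of this diff): minimal usage of init_logger, per the docstring above.
# The log path is hypothetical.
example_logger = init_logger(fpath='autometa.log')  # DEBUG and up to the file; stream stays at INFO
example_logger = init_logger(level=logging.DEBUG)   # or: stream DEBUG messages to the terminal
example_logger.debug('now visible on the stream handler')
# init_logger(level=15) raises ValueError: 15 is not one of 0, 10, 20, 30, 40, 50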

def main(args):
if not args.metagenomes_configs and not args.metagenomes and not args.resume:
raise ValueError('Must provide metagenomes-configs or metagenomes')
if args.config:
user = AutometaUser(args.config, dryrun=args.dryrun)
else:
user = AutometaUser(dryrun=args.dryrun)
# Configure environment and databases
user.configure(nproc=args.cpus)
# Workflow control...
# TODO: WorkQueue handling. to process multiple metagenomes at once.
if args.resume:
mg_configs = user.get_mgargs(
projects_dir=args.projects,
project_num=args.project,
metagenome_num=args.resume)
elif args.metagenomes_configs:
try:
mg_configs = user.add_metagenomes(args.metagenomes_configs)
except FileNotFoundError as err:
project_configs = user.new_project(args)
mg_configs = user.add_metagenomes(args.metagenomes_configs)
# Setup logger
# timestamp = time.strftime("%Y-%m-%d_%H-%M-%S",time.gmtime())
# log_fpath = args.log if args.log else f'{timestamp}_autometa.log'
if args.debug:
logger = init_logger(fpath=args.log, level=logging.DEBUG)
else:
project_configs = user.new_project(args)
mg_configs = project_configs.get('metagenomes')
# Run autometa on workflow metagenome args...
for metagenome,mgargs in mg_configs.items():
run(mgargs)
# user.bin_metagenome(metagenome_config)
# TODO: Construct pangenomes from multiple datasets
# get_pangenomes()
logger = init_logger(fpath=args.log)
# Configure AutometaUser
# TODO: master from WorkQueue is AutometaUser
user = AutometaUser(dryrun=args.check_dependencies, nproc=args.cpus)

for config in args.config:
# TODO: Add directions to master from WorkQueue
mgargs = user.prepare_binning_args(config)
user.run_binning(mgargs)
# user.refine_binning()
# user.process_binning()
# user.get_pangenomes()

if __name__ == '__main__':
import argparse
import logging as logger
logger.basicConfig(
format='%(asctime)s : %(name)s : %(levelname)s : %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logger.DEBUG)

###############################
# AutometaUser Project(s) API #
###############################

parser = argparse.ArgumentParser('Main script to run Autometa')
parser.add_argument('--projects',
help=f'</path/autometa/projects/dir> (Default is {PROJECTS_DIR}).',
default=PROJECTS_DIR,
required=False)
parser.add_argument('--project',
help='project number for which to resume autometa binning (required with `--resume` and --add-metagenome).',
type=int)
parser.add_argument('--resume',
help='metagenome number for which to resume autometa binning (`--project` num is required).',
type=int,
default=0)
parser.add_argument('--metagenomes-configs',
import time
cpus = mp.cpu_count()
parser = argparse.ArgumentParser(description='Main script to run Autometa pipeline.')
parser.add_argument('config',
help='</path/to/metagenome.config>',
nargs='*')
parser.add_argument('--dryrun',
help='whether to perform database updating/construction',
parser.add_argument('--cpus',
help=f'Num. cpus to use when updating/constructing databases (default: {cpus} cpus)',
type=int,
default=cpus)
parser.add_argument('--debug',
help='Stream debugging information to terminal',
action='store_true',
default=False)

#######################
# Autometa Parameters #
#######################

parser.add_argument('metagenomes', nargs='*')
parser.add_argument('--length-cutoff', default=3000, type=int)
parser.add_argument('--cov-from-spades',
help='retrieve coverage from SPAdes headers (may only be used when SPAdes assemblies are provided)',
parser.add_argument('--log', help='</path/to/autometa.log>', type=str)
parser.add_argument('--check-dependencies',
help='Check user executables and databases accessible to Autometa and exit.',
action='store_true',
default=False)
parser.add_argument(
'--kmer-size',
help='size of k-mer to calculate frequencies.',
default=5, type=int)
parser.add_argument(
'--kmer-multiprocess',
help='use multiprocessing to count k-mers.',
action='store_true', default=False)
parser.add_argument(
'--kmer-normalize',
help='Perform CLR transform on k-mer frequencies.',
action='store_true', default=False)
parser.add_argument('--do-pca',
help='Perform PCA prior to running embedding method', default=False, action='store_true')
parser.add_argument(
'--pca-dims',
help='Number of dimensions to reduce k-mer frequencies using PCA',
default=50, type=int)
parser.add_argument(
'--embedding-method',
help='Embedding method for dimension reduction of contig k-mer frequencies',
default='UMAP',
choices=['TSNE','UMAP'])
parser.add_argument('--taxon-method', default='majority_vote', choices=['majority_vote'])
parser.add_argument('--kingdom',default='bacteria',choices=['bacteria','archaea'])
parser.add_argument('--reversed', help='Reverse order at which taxonomic ranks are clustered', default=True, action='store_false')
parser.add_argument('--binning-method',
default='recursive_dbscan',
choices=['recursive_dbscan'])
parser.add_argument('--completeness', type=float, default=20.)
parser.add_argument('--purity', type=float, default=90.)
parser.add_argument('--verbose', action='store_true', default=False)
parser.add_argument('--force', action='store_true', default=False)
parser.add_argument('--usepickle', action='store_true', default=False)
parser.add_argument('--parallel', help="Use GNU parallel",
action='store_true', default=False)
parser.add_argument('--cpus',default=1, type=int)
parser.add_argument('--config',help='user defined config file')
args = parser.parse_args()
main(args)
try:
main(args)
except KeyboardInterrupt:
logger.info('User cancelled run. Exiting...')
sys.exit(1)
except Exception as err:
issue_request = '''

Please help us fix your problem!

You may file an issue with us at https://github.com/KwanLab/Autometa/issues/new
'''
err.issue_request = issue_request
logger.exception(err)
logger.info(err.issue_request)
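
Distilled from the diff above, the reworked entry point now amounts to a short flow. A sketch under assumed values (eight CPUs and two hypothetical config paths):

logger = init_logger(fpath='autometa.log')       # with --debug: init_logger(fpath=..., level=logging.DEBUG)
user = AutometaUser(dryrun=False, nproc=8)       # --check-dependencies sets dryrun=True
for config in ['metagenome_0.config', 'metagenome_1.config']:
    mgargs = user.prepare_binning_args(config)   # resolve per-metagenome parameters, files and databases
    user.run_binning(mgargs)                     # length-filter, k-mers, coverage, taxonomy, binning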
14 changes: 13 additions & 1 deletion autometa/binning/recursive_dbscan.py
@@ -30,6 +30,7 @@

from autometa.common.markers import Markers
from autometa.common import kmers
from autometa.common.exceptions import RecursiveDBSCANError
# TODO: This should be
# from autometa.common.kmers import Kmers
# So later we can simply/and more clearly do Kmers.load(kmers_fpath).embed(method)
@@ -331,7 +332,16 @@ def binning(master, markers, domain='bacteria', completeness=20., purity=90.,
pd.DataFrame
master with ['cluster','completeness','purity'] columns added

Raises
-------
RecursiveDBSCANError
No marker information is available for contigs to be binned.
"""
# First check needs to ensure we have markers available to check binning quality...
if master.loc[master.index.isin(markers.index)].empty:
err = 'No markers for contigs in table. Unable to assess binning quality'
raise RecursiveDBSCANError(err)

if not taxonomy:
return get_clusters(
master,
@@ -356,6 +366,9 @@ def binning(master, markers, domain='bacteria', completeness=20., purity=90.,
for rank in ranks:
# TODO: We should account for novel taxa here instead of removing 'unclassified'
unclassified_filter = master[rank] != 'unclassified'
n_contigs_in_taxa = master.loc[unclassified_filter].groupby(rank)[rank].count().sum()
n_taxa = master.loc[unclassified_filter].groupby(rank)[rank].count().index.nunique()
logger.info(f'Examining {rank}: {n_taxa:,} unique taxa ({n_contigs_in_taxa:,} contigs)')
# Group contigs by rank and find best clusters within subset
for rank_name_txt, dff in master.loc[unclassified_filter].groupby(rank):
if dff.empty:
@@ -390,7 +403,6 @@ def binning(master, markers, domain='bacteria', completeness=20., purity=90.,
clustered.cluster = clustered.cluster.map(rename_cluster)
num_clusters += clustered.cluster.nunique()
clusters.append(clustered)

clustered_df = pd.concat(clusters, sort=True)
unclustered_df = master.loc[~master.index.isin(clustered_df.index)]
unclustered_df.loc[:,'cluster'] = pd.np.nan
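
The new guard at the top of binning() fails fast when none of the contigs to be binned carry marker information. A self-contained sketch with toy inputs (the exception class here stands in for the one imported from autometa.common.exceptions):

import pandas as pd

class RecursiveDBSCANError(Exception):
    """Stand-in for autometa.common.exceptions.RecursiveDBSCANError."""

master = pd.DataFrame(index=['contig_1', 'contig_2', 'contig_3'])   # contigs to bin
markers = pd.DataFrame(index=['contig_9'])                          # marker hits; no overlap

if master.loc[master.index.isin(markers.index)].empty:
    raise RecursiveDBSCANError(
        'No markers for contigs in table. Unable to assess binning quality')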