introduce light db type #95
oschwengers committed Feb 22, 2023
1 parent 15dcb78 commit a96e545
Showing 9 changed files with 394 additions and 129 deletions.
55 changes: 37 additions & 18 deletions bakta/db.py
@@ -43,7 +43,6 @@
'pfam.h3i',
'pfam.h3m',
'pfam.h3p',
'psc.dmnd',
'rfam-go.tsv',
'rRNA.i1f',
'rRNA.i1i',
@@ -76,20 +75,22 @@ def check(db_path: Path) -> dict:
log.exception('could not parse database version file!')
sys.exit('ERROR: could not parse database version file!')

for key in ['date', 'major', 'minor']:
for key in ['date', 'major', 'minor', 'type']:
if(key not in db_info):
log.error('wrong db version info file content! missed key=%s', key)
sys.exit(f"ERROR: wrong db version info file format! Missed key '{key}' in JSON structure.")

log.info('detected: major=%i, minor=%i, date=%s', db_info['major'], db_info['minor'], db_info['date'])
log.info('detected: major=%i, minor=%i, type=%s, date=%s', db_info['major'], db_info['minor'], db_info['type'], db_info['date'])
if(db_info['major'] < bakta.__db_schema_version__):
log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update the database from https://doi.org/10.5281/zenodo.4247253")
elif(db_info['major'] > bakta.__db_schema_version__):
log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update Bakta or download a compatible database version from https://doi.org/10.5281/zenodo.4247253")

for file_name in FILE_NAMES:
required_db_files = FILE_NAMES + ['psc.dmnd' if db_info['type'] == 'full' else 'pscc.dmnd']  # copy, don't mutate the module-level FILE_NAMES
for file_name in required_db_files:
path = db_path.joinpath(file_name)
if(not os.access(str(path), os.R_OK) or not path.is_file()):
log.error('file not readable! file=%s', file_name)
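For reference, a version file that satisfies the extended check() might look as follows — a minimal sketch, assuming the file is named version.json and contains only the four validated keys:

import json
from pathlib import Path

# hypothetical content of <db_path>/version.json; 'type' is the key introduced here
db_info = {
    'date': '2023-02-20',
    'major': 5,
    'minor': 0,
    'type': 'light'  # either 'full' or 'light'
}
Path('db-light').mkdir(exist_ok=True)
Path('db-light/version.json').write_text(json.dumps(db_info))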
@@ -155,6 +156,7 @@ def main():
parser_download = subparsers.add_parser('download', help='Download a database') # add download sub-command options
parser_download.add_argument('--output', '-o', action='store', default=Path.cwd(), help='output directory (default = current working directory)')
parser_download.add_argument('--minor', '-n', action='store', type=int, default=0, help='Database minor version (default = most recent db minor version)')
parser_download.add_argument('--type', choices=['full', 'light'], default='full', help='Database type (default = full)')

parser_update = subparsers.add_parser('update', help='Update an existing database to the most recent compatible version') # add download sub-command options
parser_update.add_argument('--db', '-d', action='store', default=None, help='Current database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.')
@@ -194,32 +196,45 @@ def main():
else:
compatible_sorted = sorted(compatible_versions, key=lambda v: v['minor'], reverse=True)
required_version = compatible_sorted[0]

tarball_path = output_path.joinpath('db.tar.gz')
db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
# required_version = {
# "date": "2023-02-20",
# "major": 5,
# "minor": 0,
# "doi": "10.5281/zenodo.test",
# "record": "test",
# "md5": "",
# "md5-light": "a1a4137ea1bb5e35b86d64edc1988ce7"
# }

tarball_path = output_path.joinpath(f"{'db-light' if args.type == 'light' else 'db'}.tar.gz")
db_url = f"https://zenodo.org/record/{required_version['record']}/files/{'db-light' if args.type == 'light' else 'db'}.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, type={db_old_info['type']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
download(db_url, tarball_path)
print('\t... done')

print('check MD5 sum...')
md5_sum = calc_md5_sum(tarball_path)
if(md5_sum == required_version['md5']):
required_md5 = required_version['md5-light' if args.type == 'light' else 'md5']
if(md5_sum == required_md5):
print(f'\t...database file OK: {md5_sum}')
else:
sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")
sys.exit(f"Error: corrupt database file! MD5 should be '{required_md5}' but is '{md5_sum}'")

print(f'extract DB tarball: file={tarball_path}, output={output_path}')
untar(tarball_path, output_path)
tarball_path.unlink()

db_path = output_path.joinpath('db')
db_path = output_path.joinpath('db-light' if args.type == 'light' else 'db')
db_info = check(db_path)
if(db_info['major'] != required_version['major']):
sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_info['major']}")
elif(db_info['minor'] != required_version['minor']):
sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_info['minor']}")
elif(db_info['type'] != args.type):
sys.exit(f"ERROR: wrong db type detected! required={args.type}, detected={db_info['type']}")
print('successfully downloaded Bakta database!')
print(f"\tversion: {required_version['major']}.{required_version['minor']}")
print(f"\tType: {args.type}")
print(f"\tDOI: {required_version['doi']}")
print(f'\tpath: {db_path}')

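calc_md5_sum() itself is not shown in this diff; a minimal sketch of a chunked MD5 helper matching its usage here (the buffer size is an assumption):

import hashlib
from pathlib import Path

def calc_md5_sum(tarball_path: Path, buffer_size: int = 1024 * 1024) -> str:
    # hash the tarball in chunks so multi-GB downloads never have to fit in memory
    md5 = hashlib.md5()
    with tarball_path.open('rb') as fh:
        for chunk in iter(lambda: fh.read(buffer_size), b''):
            md5.update(chunk)
    return md5.hexdigest()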
@@ -240,7 +255,7 @@ def main():
tmp_path = cfg.check_tmp_path(args)
db_old_path = cfg.check_db_path(args)
db_old_info = check(db_old_path)
print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}")
print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}, type={db_old_info['type']}")
print('fetch DB versions...')
versions = fetch_db_versions()
compatible_versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]
Expand All @@ -255,31 +270,35 @@ def main():
sys.exit()
required_version = compatible_sorted[0]

tarball_path = tmp_path.joinpath('db.tar.gz')
db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
tarball_path = tmp_path.joinpath(f"{'db-light' if db_old_info['type'] == 'light' else 'db'}.tar.gz")
db_url = f"https://zenodo.org/record/{required_version['record']}/files/{'db-light' if db_old_info['type'] == 'light' else 'db'}.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, type={db_old_info['type']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
download(db_url, tarball_path)
print('\t... done')

print('check MD5 sum...')
md5_sum = calc_md5_sum(tarball_path)
if(md5_sum == required_version['md5']):
required_md5 = required_version['md5-light' if db_old_info['type'] == 'light' else 'md5']
if(md5_sum == required_md5):
print(f'\t...database file OK: {md5_sum}')
else:
sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")
sys.exit(f"Error: corrupt database file! MD5 should be '{required_md5}' but is '{md5_sum}'")

print(f'extract DB tarball: file={tarball_path}, output-directory={tmp_path}')
untar(tarball_path, tmp_path)
tarball_path.unlink()

db_new_path = tmp_path.joinpath('db')
db_new_path = tmp_path.joinpath('db-light' if db_old_info['type'] == 'light' else 'db')
db_new_info = check(db_new_path)
if(db_new_info['major'] != required_version['major']):
sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_new_info['major']}")
elif(db_new_info['minor'] != required_version['minor']):
sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_new_info['minor']}")
elif(db_new_info['type'] != db_old_info['type']):
sys.exit(f"ERROR: wrong db type detected! required={db_old_info['type']}, detected={db_new_info['type']}")
print('successfully downloaded Bakta DB:')
print(f"\tversion: {required_version['major']}.{required_version['minor']}")
print(f"\tType: {args.type}")
print(f"\tDOI: {required_version['doi']}")
print(f'\tpath: {db_new_path}')
print('remove old database...')
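With these changes both database flavors can be fetched explicitly, e.g. bakta_db download --output <dir> --type light, while bakta_db update sticks to the type of the database that is already installed (assuming bakta_db is the console entry point mapped to bakta/db.py:main).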
2 changes: 1 addition & 1 deletion bakta/io/insdc.py
@@ -148,7 +148,7 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat
psc_subject_id = feature['psc']['uniref90_id']
inference.append(f'similar to AA sequence:{bc.DB_XREF_UNIREF}:{psc_subject_id}')
elif('uniref50_id' in feature.get('pscc', {})):
pscc_subject_id = feature['psc']['uniref50_id']
pscc_subject_id = feature['pscc']['uniref50_id']
inference.append(f'similar to AA sequence:{bc.DB_XREF_UNIREF}:{pscc_subject_id}')
qualifiers['inference'] = inference
if(cfg.compliant):
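The one-line insdc.py fix matters because a CDS annotated only at PSCC level carries no 'psc' key at all, so the old lookup raised a KeyError. A minimal reproduction with a hypothetical feature dict:

feature = {'pscc': {'uniref50_id': 'UniRef50_P12345'}}  # no 'psc' key present
# before: feature['psc']['uniref50_id']  ->  KeyError: 'psc'
pscc_subject_id = feature['pscc']['uniref50_id']  # after: 'UniRef50_P12345'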
40 changes: 22 additions & 18 deletions bakta/main.py
@@ -60,7 +60,7 @@ def main():
print(f'Bakta v{bakta.__version__}')
print('Options and arguments:')
print(f'\tinput: {cfg.genome_path}')
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}")
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
print(f'\toutput: {cfg.output_path}')
print(f'\ttmp directory: {cfg.tmp_path}')
@@ -247,10 +247,15 @@ def main():
print(f'\tdetected IPSs: {len(cdss_ips)}')

if(len(cdss_not_found) > 0):
log.debug('search CDS PSC')
cdss_psc, cdss_pscc, cdss_not_found = psc.search(cdss_not_found)
print(f'\tfound PSCs: {len(cdss_psc)}')
print(f'\tfound PSCCs: {len(cdss_pscc)}')
if(cfg.db_info['type'] == 'full'):
log.debug('search CDS PSC')
cdss_psc, cdss_pscc, cdss_not_found = psc.search(cdss_not_found)
print(f'\tfound PSCs: {len(cdss_psc)}')
print(f'\tfound PSCCs: {len(cdss_pscc)}')
else:
log.debug('search CDS PSCC')
cdss_pscc, cdss_not_found = pscc.search(cdss_not_found)
print(f'\tfound PSCCs: {len(cdss_pscc)}')
print('\tlookup annotations...')
log.debug('lookup CDS PSCs')
psc.lookup(cdss) # lookup PSC info
@@ -284,18 +289,17 @@ def main():
anno.combine_annotation(cds) # combine IPS & PSC annotations and mark hypotheticals

hypotheticals = [cds for cds in cdss if 'hypothetical' in cds and 'edge' not in cds and cds.get('start_type', 'Edge') != 'Edge']
if(len(hypotheticals) > 0):
if(not cfg.skip_pseudo):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
if(len(hypotheticals) > 0 and not cfg.skip_pseudo and cfg.db_info['type'] == 'full'):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
hypotheticals = [cds for cds in cdss if 'hypothetical' in cds]
if(len(hypotheticals) > 0):
log.debug('analyze hypotheticals')
@@ -345,7 +349,7 @@ def main():
print(f'\tdetected IPSs: {len(sorf_ipss)}')

sorf_pscs = []
if(len(sorfs_not_found) > 0):
if(len(sorfs_not_found) > 0 and cfg.db_info['type'] == 'full'):
log.debug('search sORF PSC')
tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs.extend(tmp)
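Note that pseudogene detection is now also gated on the full database type: detect_pseudogenes() and the following psc.lookup() presumably rely on PSC data (psc.dmnd plus the accompanying records) that the light database does not ship, so the whole block is skipped in light mode.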
4 changes: 2 additions & 2 deletions bakta/psc.py
@@ -28,10 +28,10 @@


def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequence[dict]]:
"""Conduct homology search of CDSs against PCS db."""
"""Conduct homology search of CDSs against PSC db."""
cds_aa_path = cfg.tmp_path.joinpath('cds.psc.faa')
orf.write_internal_faa(cdss, cds_aa_path)
diamond_output_path = cfg.tmp_path.joinpath('diamond.cds.tsv')
diamond_output_path = cfg.tmp_path.joinpath('diamond.psc.tsv')
diamond_db_path = cfg.db_path.joinpath('psc.dmnd')
cmd = [
'diamond',
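Besides fixing the docstring typo (PCS → PSC), renaming the DIAMOND output to diamond.psc.tsv keeps it unambiguous next to the diamond.pscc.tsv file written by the new pscc.search() in the same tmp directory.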
74 changes: 73 additions & 1 deletion bakta/pscc.py
@@ -1,11 +1,13 @@
import logging
import subprocess as sp
import sqlite3

from concurrent.futures import ThreadPoolExecutor
from typing import Sequence
from typing import Sequence, Tuple

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


############################################################################
@@ -18,6 +20,74 @@
log = logging.getLogger('PSCC')


def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequence[dict]]:
"""Conduct homology search of CDSs against PSCC db."""
cds_aa_path = cfg.tmp_path.joinpath('cds.pscc.faa')
orf.write_internal_faa(cdss, cds_aa_path)
diamond_output_path = cfg.tmp_path.joinpath('diamond.pscc.tsv')
diamond_db_path = cfg.db_path.joinpath('pscc.dmnd')
cmd = [
'diamond',
'blastp',
'--db', str(diamond_db_path),
'--query', str(cds_aa_path),
'--out', str(diamond_output_path),
'--id', str(int(bc.MIN_PSCC_IDENTITY * 100)), # '50',
'--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)), # '80'
'--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)), # '80'
'--max-target-seqs', '1', # single best output
'--outfmt', '6',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '3', # slightly increase block size for faster executions
'--fast'
]
log.debug('cmd=%s', cmd)
proc = sp.run(
cmd,
cwd=str(cfg.tmp_path),
env=cfg.env,
stdout=sp.PIPE,
stderr=sp.PIPE,
universal_newlines=True
)
if(proc.returncode != 0):
log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
log.warning('PSCC failed! diamond-error-code=%d', proc.returncode)
raise Exception(f'diamond error! error code: {proc.returncode}')

cds_by_hexdigest = orf.get_orf_dictionary(cdss)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
align_gaps, query_start, query_end, subject_start, subject_end,
evalue, bitscore) = line.split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
identity = float(identity) / 100
if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
cds['pscc'] = {
DB_PSCC_COL_UNIREF50: cluster_id,
'query_cov': query_cov,
'identity': identity,
'valid': identity >= bc.MIN_PSC_IDENTITY
}
log.debug(
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef50=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
)

psccs_found = []
cds_not_found = []
for cds in cdss:
if('pscc' in cds):
psccs_found.append(cds)
else:
cds_not_found.append(cds)
log.info('found: PSCC=%i', len(psccs_found))
return psccs_found, cds_not_found


def lookup(features: Sequence[dict], pseudo: bool = False):
"""Lookup PSCC information"""
no_pscc_lookups = 0
@@ -35,6 +105,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
else:
if('psc' in feature):
uniref50_id = feature['psc'].get(DB_PSCC_COL_UNIREF50, None)
elif('pscc' in feature):
uniref50_id = feature['pscc'].get(DB_PSCC_COL_UNIREF50, None)
if(uniref50_id is not None):
if(bc.DB_PREFIX_UNIREF_50 in uniref50_id):
uniref50_id = uniref50_id[9:] # remove 'UniRef50_' prefix
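A worked example of the outfmt-6 filtering above, with made-up alignment values; the thresholds are taken from the inline comments (bc.MIN_PSCC_IDENTITY = 0.5, bc.MIN_PSC_COVERAGE = 0.8) while bc.MIN_PSC_IDENTITY is assumed to be the stricter PSC cutoff (e.g. 0.9):

# one hypothetical DIAMOND outfmt-6 line: qseqid, sseqid, pident, length, ...
line = 'aa_1a2b3c\tUniRef50_P12345\t61.3\t240\t90\t3\t1\t240\t5\t244\t1.0e-80\t250'
columns = line.split('\t')
identity = float(columns[2]) / 100            # 0.613
alignment_length = int(columns[3])            # 240
aa_length = 250                               # len(cds['aa']), hypothetical
query_cov = alignment_length / aa_length      # 0.96 -> passes coverage cutoff (0.8)
# identity 0.613 >= MIN_PSCC_IDENTITY (0.5): the CDS gets a 'pscc' entry,
# but its 'valid' flag stays False because 0.613 < MIN_PSC_IDENTITY (assumed 0.9)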
2 changes: 1 addition & 1 deletion bakta/utils.py
@@ -237,7 +237,7 @@ def test_dependencies():
], capture_output=True)
if('No valid AMRFinder database found' in process.stderr.decode()):
log.error('AMRFinderPlus database not installed')
sys.exit(f"ERROR: AMRFinderPlus database not installed! Please, install AMRFinderPlus's internal database by executing: 'amrfinder_update --database ${amrfinderplus_db_path}'. This must be done only once.")
sys.exit(f"ERROR: AMRFinderPlus database not installed! Please, install AMRFinderPlus's internal database by executing: 'amrfinder_update --database {amrfinderplus_db_path}'. This must be done only once.")

if((cfg.skip_cds is not None and cfg.skip_cds is False) or (cfg.skip_sorf is not None and cfg.skip_sorf is False)):
test_dependency(DEPENDENCY_HMMSEARCH)
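The utils.py change fixes the hint printed when the AMRFinderPlus database is missing: inside an f-string, ${amrfinderplus_db_path} interpolates the variable but keeps the literal $, so the suggested command previously contained a stray $ in front of the path.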