introduce light db type #95
oschwengers committed Feb 22, 2023
1 parent 15dcb78 commit a96e545
Showing 9 changed files with 394 additions and 129 deletions.
55 changes: 37 additions & 18 deletions bakta/db.py
@@ -43,7 +43,6 @@
'pfam.h3i',
'pfam.h3m',
'pfam.h3p',
'psc.dmnd',
'rfam-go.tsv',
'rRNA.i1f',
'rRNA.i1i',
@@ -76,20 +75,22 @@ def check(db_path: Path) -> dict:
log.exception('could not parse database version file!')
sys.exit('ERROR: could not parse database version file!')

for key in ['date', 'major', 'minor']:
for key in ['date', 'major', 'minor', 'type']:
if(key not in db_info):
log.error('wrong db version info file content! missed key=%s', key)
sys.exit(f"ERROR: wrong db version info file format! Missed key '{key}' in JSON structure.")

log.info('detected: major=%i, minor=%i, date=%s', db_info['major'], db_info['minor'], db_info['date'])
log.info('detected: major=%i, minor=%i, type=%s, date=%s', db_info['major'], db_info['minor'], db_info['type'], db_info['date'])
if(db_info['major'] < bakta.__db_schema_version__):
log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update the database from https://doi.org/10.5281/zenodo.4247253")
elif(db_info['major'] > bakta.__db_schema_version__):
log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update Bakta or download a compatible database version from https://doi.org/10.5281/zenodo.4247253")

for file_name in FILE_NAMES:
required_db_files = FILE_NAMES + ['psc.dmnd' if db_info['type'] == 'full' else 'pscc.dmnd']  # copy, don't mutate the module-level FILE_NAMES
for file_name in required_db_files:
path = db_path.joinpath(file_name)
if(not os.access(str(path), os.R_OK) or not path.is_file()):
log.error('file not readable! file=%s', file_name)
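For reference, a version file that satisfies the extended check() might look as follows — a minimal sketch, assuming the file is named version.json and contains only the four validated keys:

import json
from pathlib import Path

# hypothetical content of <db_path>/version.json; 'type' is the key introduced here
db_info = {
    'date': '2023-02-20',
    'major': 5,
    'minor': 0,
    'type': 'light'  # either 'full' or 'light'
}
Path('db-light').mkdir(exist_ok=True)
Path('db-light/version.json').write_text(json.dumps(db_info))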
@@ -155,6 +156,7 @@ def main():
parser_download = subparsers.add_parser('download', help='Download a database') # add download sub-command options
parser_download.add_argument('--output', '-o', action='store', default=Path.cwd(), help='output directory (default = current working directory)')
parser_download.add_argument('--minor', '-n', action='store', type=int, default=0, help='Database minor version (default = most recent db minor version)')
parser_download.add_argument('--type', choices=['full', 'light'], default='full', help='Database type (default = full)')

parser_update = subparsers.add_parser('update', help='Update an existing database to the most recent compatible version') # add download sub-command options
parser_update.add_argument('--db', '-d', action='store', default=None, help='Current database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.')
@@ -194,32 +196,45 @@ def main():
else:
compatible_sorted = sorted(compatible_versions, key=lambda v: v['minor'], reverse=True)
required_version = compatible_sorted[0]

tarball_path = output_path.joinpath('db.tar.gz')
db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
# required_version = {
# "date": "2023-02-20",
# "major": 5,
# "minor": 0,
# "doi": "10.5281/zenodo.test",
# "record": "test",
# "md5": "",
# "md5-light": "a1a4137ea1bb5e35b86d64edc1988ce7"
# }

tarball_path = output_path.joinpath(f"{'db-light' if args.type == 'light' else 'db'}.tar.gz")
db_url = f"https://zenodo.org/record/{required_version['record']}/files/{'db-light' if args.type == 'light' else 'db'}.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, type={db_old_info['type']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
download(db_url, tarball_path)
print('\t... done')

print('check MD5 sum...')
md5_sum = calc_md5_sum(tarball_path)
if(md5_sum == required_version['md5']):
required_md5 = required_version['md5-light' if args.type == 'light' else 'md5']
if(md5_sum == required_md5):
print(f'\t...database file OK: {md5_sum}')
else:
sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")
sys.exit(f"Error: corrupt database file! MD5 should be '{required_md5}' but is '{md5_sum}'")

print(f'extract DB tarball: file={tarball_path}, output={output_path}')
untar(tarball_path, output_path)
tarball_path.unlink()

db_path = output_path.joinpath('db')
db_path = output_path.joinpath('db-light' if args.type == 'light' else 'db')
db_info = check(db_path)
if(db_info['major'] != required_version['major']):
sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_info['major']}")
elif(db_info['minor'] != required_version['minor']):
sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_info['minor']}")
elif(db_info['type'] != args.type):
sys.exit(f"ERROR: wrong db type detected! required={args.type}, detected={db_info['type']}")
print('successfully downloaded Bakta database!')
print(f"\tversion: {required_version['major']}.{required_version['minor']}")
print(f"\tType: {args.type}")
print(f"\tDOI: {required_version['doi']}")
print(f'\tpath: {db_path}')

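calc_md5_sum() itself is not shown in this diff; a minimal sketch of a chunked MD5 helper matching its usage here (the buffer size is an assumption):

import hashlib
from pathlib import Path

def calc_md5_sum(tarball_path: Path, buffer_size: int = 1024 * 1024) -> str:
    # hash the tarball in chunks so multi-GB downloads never have to fit in memory
    md5 = hashlib.md5()
    with tarball_path.open('rb') as fh:
        for chunk in iter(lambda: fh.read(buffer_size), b''):
            md5.update(chunk)
    return md5.hexdigest()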
@@ -240,7 +255,7 @@ def main():
tmp_path = cfg.check_tmp_path(args)
db_old_path = cfg.check_db_path(args)
db_old_info = check(db_old_path)
print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}")
print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}, type={db_old_info['type']}")
print('fetch DB versions...')
versions = fetch_db_versions()
compatible_versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]
Expand All @@ -255,31 +270,35 @@ def main():
sys.exit()
required_version = compatible_sorted[0]

tarball_path = tmp_path.joinpath('db.tar.gz')
db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
tarball_path = tmp_path.joinpath(f"{'db-light' if db_old_info['type'] == 'light' else 'db'}.tar.gz")
db_url = f"https://zenodo.org/record/{required_version['record']}/files/{'db-light' if db_old_info['type'] == 'light' else 'db'}.tar.gz"
print(f"download database: v{required_version['major']}.{required_version['minor']}, type={db_old_info['type']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
download(db_url, tarball_path)
print('\t... done')

print('check MD5 sum...')
md5_sum = calc_md5_sum(tarball_path)
if(md5_sum == required_version['md5']):
required_md5 = required_version['md5-light' if db_old_info['type'] == 'light' else 'md5']
if(md5_sum == required_md5):
print(f'\t...database file OK: {md5_sum}')
else:
sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")
sys.exit(f"Error: corrupt database file! MD5 should be '{required_md5}' but is '{md5_sum}'")

print(f'extract DB tarball: file={tarball_path}, output-directory={tmp_path}')
untar(tarball_path, tmp_path)
tarball_path.unlink()

db_new_path = tmp_path.joinpath('db')
db_new_path = tmp_path.joinpath('db-light' if db_old_info['type'] == 'light' else 'db')
db_new_info = check(db_new_path)
if(db_new_info['major'] != required_version['major']):
sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_new_info['major']}")
elif(db_new_info['minor'] != required_version['minor']):
sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_new_info['minor']}")
elif(db_new_info['type'] != db_old_info['type']):
sys.exit(f"ERROR: wrong db type detected! required={db_old_info['type']}, detected={db_new_info['type']}")
print('successfully downloaded Bakta DB:')
print(f"\tversion: {required_version['major']}.{required_version['minor']}")
print(f"\tType: {args.type}")
print(f"\tDOI: {required_version['doi']}")
print(f'\tpath: {db_new_path}')
print('remove old database...')
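With these changes both database flavors can be fetched explicitly, e.g. bakta_db download --output <dir> --type light, while bakta_db update sticks to the type of the database that is already installed (assuming bakta_db is the console entry point mapped to bakta/db.py:main).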
2 changes: 1 addition & 1 deletion bakta/io/insdc.py
@@ -148,7 +148,7 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat
psc_subject_id = feature['psc']['uniref90_id']
inference.append(f'similar to AA sequence:{bc.DB_XREF_UNIREF}:{psc_subject_id}')
elif('uniref50_id' in feature.get('pscc', {})):
pscc_subject_id = feature['psc']['uniref50_id']
pscc_subject_id = feature['pscc']['uniref50_id']
inference.append(f'similar to AA sequence:{bc.DB_XREF_UNIREF}:{pscc_subject_id}')
qualifiers['inference'] = inference
if(cfg.compliant):
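The one-line insdc.py fix matters because a CDS annotated only at PSCC level carries no 'psc' key at all, so the old lookup raised a KeyError. A minimal reproduction with a hypothetical feature dict:

feature = {'pscc': {'uniref50_id': 'UniRef50_P12345'}}  # no 'psc' key present
# before: feature['psc']['uniref50_id']  ->  KeyError: 'psc'
pscc_subject_id = feature['pscc']['uniref50_id']  # after: 'UniRef50_P12345'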
40 changes: 22 additions & 18 deletions bakta/main.py
@@ -60,7 +60,7 @@ def main():
print(f'Bakta v{bakta.__version__}')
print('Options and arguments:')
print(f'\tinput: {cfg.genome_path}')
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}")
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
print(f'\toutput: {cfg.output_path}')
print(f'\ttmp directory: {cfg.tmp_path}')
@@ -247,10 +247,15 @@ def main():
print(f'\tdetected IPSs: {len(cdss_ips)}')

if(len(cdss_not_found) > 0):
log.debug('search CDS PSC')
cdss_psc, cdss_pscc, cdss_not_found = psc.search(cdss_not_found)
print(f'\tfound PSCs: {len(cdss_psc)}')
print(f'\tfound PSCCs: {len(cdss_pscc)}')
if(cfg.db_info['type'] == 'full'):
log.debug('search CDS PSC')
cdss_psc, cdss_pscc, cdss_not_found = psc.search(cdss_not_found)
print(f'\tfound PSCs: {len(cdss_psc)}')
print(f'\tfound PSCCs: {len(cdss_pscc)}')
else:
log.debug('search CDS PSCC')
cdss_pscc, cdss_not_found = pscc.search(cdss_not_found)
print(f'\tfound PSCCs: {len(cdss_pscc)}')
print('\tlookup annotations...')
log.debug('lookup CDS PSCs')
psc.lookup(cdss) # lookup PSC info
@@ -284,18 +289,17 @@ def main():
anno.combine_annotation(cds) # combine IPS & PSC annotations and mark hypotheticals

hypotheticals = [cds for cds in cdss if 'hypothetical' in cds and 'edge' not in cds and cds.get('start_type', 'Edge') != 'Edge']
if(len(hypotheticals) > 0):
if(not cfg.skip_pseudo):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
if(len(hypotheticals) > 0 and not cfg.skip_pseudo and cfg.db_info['type'] == 'full'):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
hypotheticals = [cds for cds in cdss if 'hypothetical' in cds]
if(len(hypotheticals) > 0):
log.debug('analyze hypotheticals')
@@ -345,7 +349,7 @@ def main():
print(f'\tdetected IPSs: {len(sorf_ipss)}')

sorf_pscs = []
if(len(sorfs_not_found) > 0):
if(len(sorfs_not_found) > 0 and cfg.db_info['type'] == 'full'):
log.debug('search sORF PSC')
tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs.extend(tmp)
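Note that pseudogene detection is now also gated on the full database type: detect_pseudogenes() and the following psc.lookup() presumably rely on PSC data (psc.dmnd plus the accompanying records) that the light database does not ship, so the whole block is skipped in light mode.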
4 changes: 2 additions & 2 deletions bakta/psc.py
@@ -28,10 +28,10 @@


def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequence[dict]]:
"""Conduct homology search of CDSs against PCS db."""
"""Conduct homology search of CDSs against PSC db."""
cds_aa_path = cfg.tmp_path.joinpath('cds.psc.faa')
orf.write_internal_faa(cdss, cds_aa_path)
diamond_output_path = cfg.tmp_path.joinpath('diamond.cds.tsv')
diamond_output_path = cfg.tmp_path.joinpath('diamond.psc.tsv')
diamond_db_path = cfg.db_path.joinpath('psc.dmnd')
cmd = [
'diamond',
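Besides fixing the docstring typo (PCS → PSC), renaming the DIAMOND output to diamond.psc.tsv keeps it unambiguous next to the diamond.pscc.tsv file written by the new pscc.search() in the same tmp directory.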
74 changes: 73 additions & 1 deletion bakta/pscc.py
@@ -1,11 +1,13 @@
import logging
import subprocess as sp
import sqlite3

from concurrent.futures import ThreadPoolExecutor
from typing import Sequence
from typing import Sequence, Tuple

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


############################################################################
@@ -18,6 +20,74 @@
log = logging.getLogger('PSCC')


def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequence[dict]]:
"""Conduct homology search of CDSs against PSCC db."""
cds_aa_path = cfg.tmp_path.joinpath('cds.pscc.faa')
orf.write_internal_faa(cdss, cds_aa_path)
diamond_output_path = cfg.tmp_path.joinpath('diamond.pscc.tsv')
diamond_db_path = cfg.db_path.joinpath('pscc.dmnd')
cmd = [
'diamond',
'blastp',
'--db', str(diamond_db_path),
'--query', str(cds_aa_path),
'--out', str(diamond_output_path),
'--id', str(int(bc.MIN_PSCC_IDENTITY * 100)), # '50',
'--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)), # '80'
'--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)), # '80'
'--max-target-seqs', '1', # single best output
'--outfmt', '6',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '3', # slightly increase block size for faster executions
'--fast'
]
log.debug('cmd=%s', cmd)
proc = sp.run(
cmd,
cwd=str(cfg.tmp_path),
env=cfg.env,
stdout=sp.PIPE,
stderr=sp.PIPE,
universal_newlines=True
)
if(proc.returncode != 0):
log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
log.warning('PSCC failed! diamond-error-code=%d', proc.returncode)
raise Exception(f'diamond error! error code: {proc.returncode}')

cds_by_hexdigest = orf.get_orf_dictionary(cdss)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
align_gaps, query_start, query_end, subject_start, subject_end,
evalue, bitscore) = line.split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
identity = float(identity) / 100
if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
cds['pscc'] = {
DB_PSCC_COL_UNIREF50: cluster_id,
'query_cov': query_cov,
'identity': identity,
'valid': identity >= bc.MIN_PSC_IDENTITY
}
log.debug(
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef50=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
)

psccs_found = []
cds_not_found = []
for cds in cdss:
if('pscc' in cds):
psccs_found.append(cds)
else:
cds_not_found.append(cds)
log.info('found: PSCC=%i', len(psccs_found))
return psccs_found, cds_not_found


def lookup(features: Sequence[dict], pseudo: bool = False):
"""Lookup PSCC information"""
no_pscc_lookups = 0
@@ -35,6 +105,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
else:
if('psc' in feature):
uniref50_id = feature['psc'].get(DB_PSCC_COL_UNIREF50, None)
elif('pscc' in feature):
uniref50_id = feature['pscc'].get(DB_PSCC_COL_UNIREF50, None)
if(uniref50_id is not None):
if(bc.DB_PREFIX_UNIREF_50 in uniref50_id):
uniref50_id = uniref50_id[9:] # remove 'UniRef50_' prefix
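A worked example of the outfmt-6 filtering above, with made-up alignment values; the thresholds are taken from the inline comments (bc.MIN_PSCC_IDENTITY = 0.5, bc.MIN_PSC_COVERAGE = 0.8) while bc.MIN_PSC_IDENTITY is assumed to be the stricter PSC cutoff (e.g. 0.9):

# one hypothetical DIAMOND outfmt-6 line: qseqid, sseqid, pident, length, ...
line = 'aa_1a2b3c\tUniRef50_P12345\t61.3\t240\t90\t3\t1\t240\t5\t244\t1.0e-80\t250'
columns = line.split('\t')
identity = float(columns[2]) / 100            # 0.613
alignment_length = int(columns[3])            # 240
aa_length = 250                               # len(cds['aa']), hypothetical
query_cov = alignment_length / aa_length      # 0.96 -> passes coverage cutoff (0.8)
# identity 0.613 >= MIN_PSCC_IDENTITY (0.5): the CDS gets a 'pscc' entry,
# but its 'valid' flag stays False because 0.613 < MIN_PSC_IDENTITY (assumed 0.9)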
2 changes: 1 addition & 1 deletion bakta/utils.py
@@ -237,7 +237,7 @@ def test_dependencies():
], capture_output=True)
if('No valid AMRFinder database found' in process.stderr.decode()):
log.error('AMRFinderPlus database not installed')
sys.exit(f"ERROR: AMRFinderPlus database not installed! Please, install AMRFinderPlus's internal database by executing: 'amrfinder_update --database ${amrfinderplus_db_path}'. This must be done only once.")
sys.exit(f"ERROR: AMRFinderPlus database not installed! Please, install AMRFinderPlus's internal database by executing: 'amrfinder_update --database {amrfinderplus_db_path}'. This must be done only once.")

if((cfg.skip_cds is not None and cfg.skip_cds is False) or (cfg.skip_sorf is not None and cfg.skip_sorf is False)):
test_dependency(DEPENDENCY_HMMSEARCH)
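The utils.py change fixes the hint printed when the AMRFinderPlus database is missing: inside an f-string, ${amrfinderplus_db_path} interpolates the variable but keeps the literal $, so the suggested command previously contained a stray $ in front of the path.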