diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 28e1a24..7358aaf 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -19,8 +19,12 @@ RE_PROTEIN_NODE = re.compile(r'NODE_', flags=re.IGNORECASE) RE_PROTEIN_POTENTIAL_CONTIG_NAME = re.compile(r'(genome|shotgun)', flags=re.IGNORECASE) RE_PROTEIN_DOMAIN_CONTAINING = re.compile(r'domain-containing protein', flags=re.IGNORECASE) +RE_PROTEIN_REMNANT = re.compile(r'Remnant of ', flags=re.IGNORECASE) +RE_PROTEIN_TMRNA = re.compile(r'TmRNA', flags=re.IGNORECASE) RE_PROTEIN_NO_LETTERS = re.compile(r'[^A-Za-z]') -RE_PROTEIN_SUSPECT_CHARS = re.compile(r'[.@=?%]') +RE_PROTEIN_SUSPECT_CHARS_DISCARD = re.compile(r'[.#]') +RE_PROTEIN_SUSPECT_CHARS_REPLACE = re.compile(r'[@=?%]') +RE_PROTEIN_SUSPECT_CHARS_BEGINNING = '_\\-+.:,;/\'' # plain character set (not a regex): a product's first character is dropped if it is one of these RE_PROTEIN_PERIOD_SEPARATOR = re.compile(r'([a-zA-Z0-9]+)\.([a-zA-Z0-9]+)') RE_PROTEIN_WRONG_PRIMES = re.compile(r'[\u2032\u0060\u00B4]') # prime (′), grave accent (`), acute accent (´) RE_PROTEIN_WEIGHT = re.compile(r' [0-9]+(?:\.[0-9]+)? k?da ', flags=re.IGNORECASE) @@ -536,9 +540,19 @@ def revise_cds_product(product: str): product = re.sub(RE_PROTEIN_PERIOD_SEPARATOR, r'\1-\2', product) # replace separator periods if(product != old_product): log.info('fix product: replace separator periods. new=%s, old=%s', product, old_product) + + old_product = product + if(product and product[0] in RE_PROTEIN_SUSPECT_CHARS_BEGINNING): # remove suspect first character; the emptiness guard avoids an IndexError on '' + product = product[1:] + log.info('fix product: replace invalid first character. new=%s, old=%s', product, old_product) + + old_product = product + product = RE_PROTEIN_SUSPECT_CHARS_DISCARD.sub('', product) # remove suspect characters + if(product != old_product): + log.info('fix product: replace invalid characters. 
new=%s, old=%s', product, old_product) old_product = product - product = RE_PROTEIN_SUSPECT_CHARS.sub('', product) # remove suspect characters + product = RE_PROTEIN_SUSPECT_CHARS_REPLACE.sub(' ', product) # replace suspect characters by single whitespace if(product != old_product): log.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product) @@ -552,6 +566,11 @@ def revise_cds_product(product: str): if(product != old_product): log.info('fix product: replace FOG ids. new=%s, old=%s', product, old_product) + old_product = product + product = RE_PROTEIN_REMNANT.sub('', product) # remove 'Remnant of's + if(product != old_product): + log.info('fix product: replace remnant ofs. new=%s, old=%s', product, old_product) + old_product = product dufs = [] # replace DUF-containing products for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product): @@ -593,6 +612,11 @@ def revise_cds_product(product: str): product = product.replace('_', '-') if(product != old_product): log.info('fix product: replace domain name underscores. new=%s, old=%s', product, old_product) + + old_product = product + if(RE_PROTEIN_TMRNA.fullmatch(product)): + product = '' + log.info('fix product: discard pure tmRNA product descriptions. new=%s, old=%s', product, old_product) old_product = product if(