Skip to content

Commit

Permalink
Add mutational parsing for AMRFinderPlus #70
Browse files Browse the repository at this point in the history
  • Loading branch information
fmaguire committed Aug 17, 2022
1 parent 2f571f5 commit fb977c2
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ Parsers needing tested (both automated and just sanity checking output), see [te
`
1. [abricate](hAMRonization/AbricateIO.py)
2. [ariba](hAMRonization/AribaIO.py)
3. [NCBI AMRFinderPlus](hAMRonization/AmrFinderPlusIO.py)
3. [NCBI AMRFinderPlus](hAMRonization/AmrFinderPlusIO.py): last updated for v3.10.40
4. [RGI](hAMRonization/RgiIO.py) (includes RGI-BWT)
5. [resfinder](hAMRonization/ResFinderIO.py)
6. [resfinder4](hAMRonization/ResFinder4IO.py)
Expand Down
60 changes: 42 additions & 18 deletions hAMRonization/AmrFinderPlusIO.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import csv
import re
import warnings
from .Interfaces import hAMRonizedResultIterator

Expand All @@ -14,14 +15,11 @@ class AmrFinderPlusIterator(hAMRonizedResultIterator):
def __init__(self, source, metadata):
metadata['analysis_software_name'] = 'amrfinderplus'
metadata['reference_database_name'] = 'NCBI Reference Gene Database'
metadata['genetic_variation_type'] = 'Gene presence detected'
self.metadata = metadata

# check source for whether AMFP has been run in protein or nt mode
with open(source) as fh:
header = next(fh)
if 'Contig id' in header.strip().split('\t'):
self.field_mapping = {

nucleotide_field_mapping = {
'Protein identifier': None,
'Contig id': 'input_sequence_id',
'Start': 'input_gene_start',
Expand All @@ -43,11 +41,12 @@ def __init__(self, source, metadata):
'Accession of closest sequence': 'reference_accession',
'Name of closest sequence': None,
'HMM id': None,
'HMM description': None
'HMM description': None,
'AA Mutation': 'amino_acid_mutation',
'Nucleotide Mutation': 'nucleotide_mutation',
'genetic_variation_type': 'genetic_variation_type'
}
else:
self.field_mapping = {
'Protein identifier': 'input_sequence_id',
protein_field_mapping = {'Protein identifier': 'input_sequence_id',
'Gene symbol': 'gene_symbol',
'Sequence name': 'gene_name',
'Scope': None,
Expand All @@ -65,29 +64,54 @@ def __init__(self, source, metadata):
'Name of closest sequence': None,
'HMM id': None,
'HMM description': None,
'AA Mutation': 'amino_acid_mutation',
'genetic_variation_type': 'genetic_variation_type'
}

with open(source) as fh:
header = next(fh)
try:
first_result = next(fh)
if first_result.strip().split('\t')[0] == 'NA':
self.field_mapping = nucleotide_field_mapping
else:
self.field_mapping = protein_field_mapping
except StopIteration:
# doesn't really matter which mapping as this error indicates
# this is an empty results file
self.field_mapping = nucleotide_field_mapping

super().__init__(source, self.field_mapping, self.metadata)


def parse(self, handle):
"""
Read each and return it
"""
# skip any manually specified fields for later
skipped_mutational = 0
reader = csv.DictReader(handle, delimiter='\t')
for result in reader:
# replace NA value with None for consitency
for field, value in result.items():
if value == "NA":
result[field] = None

# "point" for mutational variants in this field, homolog are "AMR"
if result['Element subtype'] != 'AMR':
skipped_mutational += 1
# "POINT" indicates mutational resistance
# amrfinderplus has no special fields but the mutation itself is
# appended to the symbol name so we want to split this
result['AA Mutation'] = None
result['Nucleotide Mutation'] = None
result['genetic_variation_type'] = 'Gene presence detected'

yield self.hAMRonize(result, self.metadata)
if result['Element subtype'] == 'POINT':
result['genetic_variation_type'] = 'Mutation variation detected'
gene_symbol, mutation = result["Gene symbol"].rsplit('_', 1)
result['Gene symbol'] = gene_symbol
_, ref, pos, alt, _ = re.split(r"(\D+)(\d+)(\D+)", mutation)
# this means it is a protein mutation
if result['Method'] in ['POINTX', 'POINTP']:
result['AA Mutation'] = f"p.{pos}{ref}>{alt}"
elif result['Method'] == 'POINTN':
# e.g., 23S_G2032G ampC_C-11C -> c.2032G>G
result['Nucleotide Mutation'] = f"c.{pos}{ref}>{alt}"

if skipped_mutational > 0:
warnings.warn(f"Skipping {skipped_mutational} mutational AMR "
f"records from {self.metadata['input_file_name']}")
yield self.hAMRonize(result, self.metadata)
10 changes: 0 additions & 10 deletions test/data/raw_outputs/amrfinder/report_nucleotide.tsv

This file was deleted.

11 changes: 0 additions & 11 deletions test/data/raw_outputs/amrfinder/report_protein.tsv

This file was deleted.

Loading

0 comments on commit fb977c2

Please sign in to comment.