Add mutational parsing for AMRFinderPlus #70

pha4ge · Aug 17, 2022 · fb977c2 · fb977c2
1 parent 2f571f5
commit fb977c2
Show file tree

Hide file tree

Showing 7 changed files with 149 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -175,7 +175,7 @@ Parsers needing tested (both automated and just sanity checking output), see [te
 `
 1. [abricate](hAMRonization/AbricateIO.py)
 2. [ariba](hAMRonization/AribaIO.py)
-3. [NCBI AMRFinderPlus](hAMRonization/AmrFinderPlusIO.py)
+3. [NCBI AMRFinderPlus](hAMRonization/AmrFinderPlusIO.py): last updated for v3.10.40
 4. [RGI](hAMRonization/RgiIO.py) (includes RGI-BWT)
 5. [resfinder](hAMRonization/ResFinderIO.py)
 6. [resfinder4](hAMRonization/ResFinder4IO.py)

diff --git a/hAMRonization/AmrFinderPlusIO.py b/hAMRonization/AmrFinderPlusIO.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import csv
+import re
 import warnings
 from .Interfaces import hAMRonizedResultIterator
 
@@ -14,14 +15,11 @@ class AmrFinderPlusIterator(hAMRonizedResultIterator):
     def __init__(self, source, metadata):
         metadata['analysis_software_name'] = 'amrfinderplus'
         metadata['reference_database_name'] = 'NCBI Reference Gene Database'
-        metadata['genetic_variation_type'] = 'Gene presence detected'
         self.metadata = metadata
 
         # check source for whether AMFP has been run in protein or nt mode
-        with open(source) as fh:
-            header = next(fh)
-            if 'Contig id' in header.strip().split('\t'):
-                self.field_mapping = {
+
+        nucleotide_field_mapping = {
                     'Protein identifier': None,
                     'Contig id': 'input_sequence_id',
                     'Start': 'input_gene_start',
@@ -43,11 +41,12 @@ def __init__(self, source, metadata):
                     'Accession of closest sequence': 'reference_accession',
                     'Name of closest sequence': None,
                     'HMM id': None,
-                    'HMM description': None
+                    'HMM description': None,
+                    'AA Mutation': 'amino_acid_mutation',
+                    'Nucleotide Mutation': 'nucleotide_mutation',
+                    'genetic_variation_type': 'genetic_variation_type'
                }
-            else:
-                self.field_mapping = {
-                    'Protein identifier': 'input_sequence_id',
+        protein_field_mapping = {'Protein identifier': 'input_sequence_id',
                     'Gene symbol': 'gene_symbol',
                     'Sequence name': 'gene_name',
                     'Scope': None,
@@ -65,29 +64,54 @@ def __init__(self, source, metadata):
                     'Name of closest sequence': None,
                     'HMM id': None,
                     'HMM description': None,
+                    'AA Mutation': 'amino_acid_mutation',
+                    'genetic_variation_type': 'genetic_variation_type'
                     }
 
+        with open(source) as fh:
+            header = next(fh)
+            try:
+                first_result = next(fh)
+                if first_result.strip().split('\t')[0] == 'NA':
+                    self.field_mapping = nucleotide_field_mapping
+                else:
+                    self.field_mapping = protein_field_mapping
+            except StopIteration:
+                # doesn't really matter which mapping as this error indicates
+                # this is an empty results file
+                self.field_mapping = nucleotide_field_mapping
+
         super().__init__(source, self.field_mapping, self.metadata)
 
+
     def parse(self, handle):
         """
         Read each and return it
         """
-        # skip any manually specified fields for later
-        skipped_mutational = 0
         reader = csv.DictReader(handle, delimiter='\t')
         for result in reader:
             # replace NA value with None for consitency
             for field, value in result.items():
                 if value == "NA":
                     result[field] = None
 
-            # "point" for mutational variants in this field, homolog are "AMR"
-            if result['Element subtype'] != 'AMR':
-                skipped_mutational += 1
+            # "POINT" indicates mutational resistance
+            # amrfinderplus has no special fields but the mutation itself is
+            # appended to the symbol name so we want to split this
+            result['AA Mutation'] = None
+            result['Nucleotide Mutation'] = None
+            result['genetic_variation_type'] = 'Gene presence detected'
 
-            yield self.hAMRonize(result, self.metadata)
+            if result['Element subtype'] == 'POINT':
+                result['genetic_variation_type'] = 'Mutation variation detected'
+                gene_symbol, mutation = result["Gene symbol"].rsplit('_', 1)
+                result['Gene symbol'] = gene_symbol
+                _, ref, pos, alt, _ = re.split(r"(\D+)(\d+)(\D+)", mutation)
+                # this means it is a protein mutation
+                if result['Method'] in ['POINTX', 'POINTP']:
+                    result['AA Mutation'] = f"p.{pos}{ref}>{alt}"
+                elif result['Method'] == 'POINTN':
+                    # e.g., 23S_G2032G ampC_C-11C -> c.2032G>G
+                    result['Nucleotide Mutation'] = f"c.{pos}{ref}>{alt}"
 
-        if skipped_mutational > 0:
-            warnings.warn(f"Skipping {skipped_mutational} mutational AMR "
-                          f"records from {self.metadata['input_file_name']}")
+            yield self.hAMRonize(result, self.metadata)
diff --git a/test/data/raw_outputs/amrfinder/report_nucleotide.tsv b/test/data/raw_outputs/amrfinder/report_nucleotide.tsv
diff --git a/test/data/raw_outputs/amrfinder/report_protein.tsv b/test/data/raw_outputs/amrfinder/report_protein.tsv