ENH handle AROs as string rather than int in get_aro_mapping_table() (#…

…56) AROs were previously handled as int in the get_aro_mapping_table() function, which caused problems for ARO numbers with leading zeros such as '0010004' (the leading zeros were cut). To fix this, AROs are now treated as strings so that leading zeros are preserved.
BigDataBiology · Jun 24, 2024 · cf5c459 · cf5c459
1 parent 1e8b215
commit cf5c459
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,9 @@
 - resfinder_curation: grdA_1_QJX10702 -> 3007380 & EstDL136_1_JN242251 -> 3000557
 - megares_curation: MEG_2865|Drugs|Phenicol|Chloramphenicol_hydrolase|ESTD -> 3000557
 
+### Handle AROs as string rather than int in get_aro_mapping_table()
+AROs were previously handled as 'int' in the get_aro_mapping_table() function, which caused problems for ARO numbers with leading zeros such as 'ARO:0010004' (the leading zeros were cut). To fix this, AROs are now treated as strings so that leading zeros are preserved.
+
 ## 0.4.0 - 10 June
 
 - Bundle a specific version of ARO with the package instead of downloading it from the internet (ensures reproducibility)

diff --git a/argnorm/lib.py b/argnorm/lib.py
@@ -42,18 +42,18 @@ def get_aro_mapping_table(database):
 
     aro_mapping_table = pd.read_csv(
             os.path.join(_ROOT, 'data', f'{database}_ARO_mapping.tsv'),
-            sep='\t')
+            sep='\t', dtype={'ARO': str})
     aro_mapping_table.drop_duplicates(subset=['Original ID'], inplace=True)
     aro_mapping_table.set_index('Original ID', inplace=True)
 
     manual_curation = pd.read_csv(
                     os.path.join(_ROOT, 'data/manual_curation', f'{database}_curation.tsv'),
-                    sep='\t', index_col=0)
+                    sep='\t', index_col=0, dtype={'ARO': str})
     manual_curation['Database'] = aro_mapping_table['Database'].iloc[0]
     aro_mapping_table.drop(index=set(manual_curation.index) & set(aro_mapping_table.index), inplace=True)
     aro_mapping_table = pd.concat([aro_mapping_table, manual_curation])
 
-    aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{int(a)}', na_action='ignore')
+    aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{a}', na_action='ignore')
     return aro_mapping_table
 
 def map_to_aro(gene, database):

diff --git a/tests/test_lib.py b/tests/test_lib.py
@@ -9,7 +9,8 @@ def test_map_to_aro():
         ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'],
         ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"],
         ["(Phe)cpt_strepv:U09991:AAB36569:1412-1948:537", "argannot"],
-        ["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"]
+        ["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"],
+        ["gi:447201629:ref:WP_001278885.1:|FEATURES|cob(I)alamin_adenolsyltransferase|unclassified|cob(I)alamin_adenolsyltransferase", "deeparg"]
     ]
 
     ARO = lib.get_aro_ontology()
@@ -19,7 +20,8 @@ def test_map_to_aro():
         ARO.get_term('ARO:3000249'),
         ARO.get_term('ARO:3000318'),
         ARO.get_term('ARO:3000249'),
-        None
+        None,
+        ARO.get_term('ARO:0010004')
     ]
 
     for t, e in zip(test_cases, expected_output):