Skip to content

Commit

Permalink
ENH handle AROs as string rather than int in get_aro_mapping_table() (#…
Browse files Browse the repository at this point in the history
…56)

AROs were previously handled as int in the get_aro_mapping_table() function, which caused problems for ARO numbers with leading zeros such as '0010004', because converting them to int drops the leading zeros. To fix this, AROs are now treated as strings so that leading zeros are preserved.
  • Loading branch information
Vedanth-Ramji authored Jun 24, 2024
1 parent 1e8b215 commit cf5c459
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
- resfinder_curation: grdA_1_QJX10702 -> 3007380 & EstDL136_1_JN242251 -> 3000557
- megares_curation: MEG_2865|Drugs|Phenicol|Chloramphenicol_hydrolase|ESTD -> 3000557

### Handle AROs as string rather than int in get_aro_mapping_table()
AROs were previously handled as 'int' in the get_aro_mapping_table() function, which caused problems for ARO numbers with leading zeros such as 'ARO:0010004', because converting them to 'int' drops the leading zeros. To fix this, AROs are now treated as strings so that leading zeros are preserved.

## 0.4.0 - 10 June

- Bundle a specific version of ARO with the package instead of downloading it from the internet (ensures reproducibility)
Expand Down
6 changes: 3 additions & 3 deletions argnorm/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,18 @@ def get_aro_mapping_table(database):

aro_mapping_table = pd.read_csv(
os.path.join(_ROOT, 'data', f'{database}_ARO_mapping.tsv'),
sep='\t')
sep='\t', dtype={'ARO': str})
aro_mapping_table.drop_duplicates(subset=['Original ID'], inplace=True)
aro_mapping_table.set_index('Original ID', inplace=True)

manual_curation = pd.read_csv(
os.path.join(_ROOT, 'data/manual_curation', f'{database}_curation.tsv'),
sep='\t', index_col=0)
sep='\t', index_col=0, dtype={'ARO': str})
manual_curation['Database'] = aro_mapping_table['Database'].iloc[0]
aro_mapping_table.drop(index=set(manual_curation.index) & set(aro_mapping_table.index), inplace=True)
aro_mapping_table = pd.concat([aro_mapping_table, manual_curation])

aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{int(a)}', na_action='ignore')
aro_mapping_table['ARO'] = aro_mapping_table['ARO'].map(lambda a: f'ARO:{a}', na_action='ignore')
return aro_mapping_table

def map_to_aro(gene, database):
Expand Down
6 changes: 4 additions & 2 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def test_map_to_aro():
["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'],
["gb|AAG57600.1|ARO:3000318|mphB", "sarg"],
["(Phe)cpt_strepv:U09991:AAB36569:1412-1948:537", "argannot"],
["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"]
["MEG_4060|Metals|Multi-metal_resistance|Multi-metal_resistance_protein|MREA", "megares"],
["gi:447201629:ref:WP_001278885.1:|FEATURES|cob(I)alamin_adenolsyltransferase|unclassified|cob(I)alamin_adenolsyltransferase", "deeparg"]
]

ARO = lib.get_aro_ontology()
Expand All @@ -19,7 +20,8 @@ def test_map_to_aro():
ARO.get_term('ARO:3000249'),
ARO.get_term('ARO:3000318'),
ARO.get_term('ARO:3000249'),
None
None,
ARO.get_term('ARO:0010004')
]

for t, e in zip(test_cases, expected_output):
Expand Down

0 comments on commit cf5c459

Please sign in to comment.