-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
31 lines (27 loc) · 1.3 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
'''
Tokenizer for the ProGen model
'''
import os
import pickle
class Tokenizer:
    '''
    Lookup tables used to tokenize inputs for the ProGen model.

    On construction, six pickled mappings are read from ``mapping_dir`` and
    stored as attributes; four of them are additionally inverted:

    Loaded as-is (attribute <- file):
        taxa_to_lineage    <- taxa_to_lineage.p
        taxa_to_ctrl_idx   <- taxa_to_ctrl_idx.p
        kw_to_ctrl_idx     <- kw_to_ctrl_idx.p
        aa_to_ctrl_idx     <- aa_to_ctrl_idx.p
        kw_to_name         <- kw_to_name.p2
        aa_to_probs_index  <- probs_to_aa.p

    Derived by swapping keys and values:
        ctrl_idx_to_aa     (from aa_to_ctrl_idx)
        ctrl_idx_to_kw     (from kw_to_ctrl_idx)
        ctrl_idx_to_taxa   (from taxa_to_ctrl_idx)
        probs_index_to_aa  (from aa_to_probs_index)

    Args:
        mapping_dir: Directory containing the pickle files. Defaults to
            ``'mapping_files/'``, which is resolved relative to the current
            working directory (the historical behaviour).

    Raises:
        OSError: If any mapping file is missing or unreadable.
        pickle.UnpicklingError: If a file is not a valid pickle.
    '''

    # (attribute name, file name) for every mapping read from disk.
    # NOTE(review): 'probs_to_aa.p' is stored as `aa_to_probs_index`, i.e. the
    # file name and the attribute name suggest opposite directions. The
    # original pairing is kept unchanged - confirm which name is accurate.
    # NOTE: a load of 'taxid_to_name.p2' existed only as commented-out code
    # and is intentionally not performed here.
    _MAPPING_FILES = (
        ('taxa_to_lineage', 'taxa_to_lineage.p'),
        ('taxa_to_ctrl_idx', 'taxa_to_ctrl_idx.p'),
        ('kw_to_ctrl_idx', 'kw_to_ctrl_idx.p'),
        ('aa_to_ctrl_idx', 'aa_to_ctrl_idx.p'),
        ('kw_to_name', 'kw_to_name.p2'),
        ('aa_to_probs_index', 'probs_to_aa.p'),
    )

    # (derived attribute, source attribute) for every inverted mapping.
    _INVERTED_MAPPINGS = (
        ('ctrl_idx_to_aa', 'aa_to_ctrl_idx'),
        ('ctrl_idx_to_kw', 'kw_to_ctrl_idx'),
        ('ctrl_idx_to_taxa', 'taxa_to_ctrl_idx'),
        ('probs_index_to_aa', 'aa_to_probs_index'),
    )

    def __init__(self, mapping_dir='mapping_files/'):
        for attr_name, file_name in self._MAPPING_FILES:
            path = os.path.join(mapping_dir, file_name)
            setattr(self, attr_name, self._load_pickle(path))

        for inverted_name, source_name in self._INVERTED_MAPPINGS:
            forward = getattr(self, source_name)
            setattr(self, inverted_name, self._flipdict(forward))

    @staticmethod
    def _load_pickle(path):
        '''Return the object unpickled from the file at ``path``.'''
        # SECURITY: pickle.load can execute arbitrary code while
        # deserializing. Only point this at mapping files from a trusted
        # source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _flipdict(my_map):
        '''
        Return a new dict with the keys and values of ``my_map`` swapped.

        Values must be hashable. If several keys share the same value, only
        the last one encountered survives in the result.
        '''
        return {v: k for k, v in my_map.items()}