Rework of #9. #15

Merged
merged 3 commits on Dec 5, 2018
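This PR reworks #9: the Python 3.6-style f-strings in the resource-compilation scripts and in the khaiii Python wrapper are replaced with equivalent str.format() calls, and compile_restore.py gains a from __future__ import print_function line, presumably so the code also runs on interpreters that lack f-string support. Below is a minimal sketch of the conversion pattern applied throughout; the variable values are illustrative, not taken from the diff.

rsc_src, model_size = 'rsc/src', 'base'  # example values, not from the PR

# f-string form (requires Python >= 3.6); the style being removed:
path_fstr = f'{rsc_src}/{model_size}.config.json'

# equivalent str.format() form, which also runs on older interpreters:
path_fmt = '{}/{}.config.json'.format(rsc_src, model_size)

assert path_fstr == path_fmt  # both yield 'rsc/src/base.config.json'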
10 changes: 5 additions & 5 deletions rsc/bin/compile_errpatch.py
@@ -205,7 +205,7 @@ def _load_entries(args):
"""
good_entries = []
bad_entries = []
- for file_path in glob.glob(f'{args.rsc_src}/{args.model_size}.errpatch.*'):
+ for file_path in glob.glob('{}/{}.errpatch.*'.format(args.rsc_src, args.model_size)):
file_name = os.path.basename(file_path)
logging.info(file_name)
for line_num, line in enumerate(open(file_path, 'r', encoding='UTF-8'), start=1):
@@ -285,9 +285,9 @@ def _save_trie(rsc_dir, entries):
entry.right_align)
rights.append(entry.right_align)
total_patch += 1
- trie.save(f'{rsc_dir}/errpatch.tri')
+ trie.save('{}/errpatch.tri'.format(rsc_dir))

- len_file = f'{rsc_dir}/errpatch.len'
+ len_file = '{}/errpatch.len'.format(rsc_dir)
with open(len_file, 'wb') as fout:
fout.write(struct.pack('B', 0)) # the index starts at 1, so put one dummy entry at the very front
for idx, right in enumerate(rights, start=1):
@@ -296,7 +296,7 @@ def _save_trie(rsc_dir, entries):
logging.info('length saved: %s', len_file)
logging.info('expected size: %d', len(rights)+1)

- val_file = f'{rsc_dir}/errpatch.val'
+ val_file = '{}/errpatch.val'.format(rsc_dir)
with open(val_file, 'wb') as fout:
fout.write(struct.pack('h', 0)) # the index starts at 1, so put one dummy entry at the very front
for idx, right in enumerate(rights, start=1):
@@ -316,7 +316,7 @@ def run(args):
args: program arguments
"""
aligner = Aligner(args.rsc_src)
- restore_dic = load_restore_dic(f'{args.rsc_src}/restore.dic')
+ restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
if not restore_dic:
sys.exit(1)
vocab_out = load_vocab_out(args.rsc_src)
14 changes: 7 additions & 7 deletions rsc/bin/compile_model.py
@@ -37,7 +37,7 @@ def _load_cfg_rsc(rsc_src, model_size):
Returns:
(config, resource) pair
"""
- file_path = f'{rsc_src}/{model_size}.config.json'
+ file_path = '{}/{}.config.json'.format(rsc_src, model_size)
cfg_dic = json.load(open(file_path, 'r', encoding='UTF-8'))
logging.info('config: %s', json.dumps(cfg_dic, indent=2))
cfg = argparse.Namespace()
@@ -101,7 +101,7 @@ def _write_config(cfg, rsc, rsc_dir):
cfg_dic['class_num'] = len(rsc.vocab_out)
cfg_dic['conv_kernels'] = [2, 3, 4, 5]
pathlib.Path(rsc_dir).mkdir(parents=True, exist_ok=True)
- config_json = f'{rsc_dir}/config.json'
+ config_json = '{}/config.json'.format(rsc_dir)
with open(config_json, 'w', encoding='UTF-8') as fout:
json.dump(cfg_dic, fout, indent=2, sort_keys=True)

@@ -171,22 +171,22 @@ def _write_data(rsc, state_dict, rsc_dir):
state_dict: state dictionary of model
rsc_dir: target resource directory
"""
- with open(f'{rsc_dir}/embed.bin', 'wb') as fout:
+ with open('{}/embed.bin'.format(rsc_dir), 'wb') as fout:
# key: [input vocab(char)] * 4(float)
# val: [input vocab(char)] * embed_dim * 4(float)
_write_embedding(rsc, state_dict, fout)

for kernel in range(2, 6):
# weight: [output chan(embed_dim)] * kernel * [input chan(embed_dim)] * 4
# bias: [output chan] * 4
- _write_conv('convs', kernel, state_dict, f'{rsc_dir}/conv.{kernel}.fil')
+ _write_conv('convs', kernel, state_dict, '{}/conv.{}.fil'.format(rsc_dir, kernel))
# weight: hidden_dim * [cnn layers * output chan(embed_dim)] * 4
# bias: hidden_dim * 4
- _write_linear('conv2hidden', state_dict, f'{rsc_dir}/cnv2hdn.lin'.format(rsc_dir))
+ _write_linear('conv2hidden', state_dict, '{}/cnv2hdn.lin'.format(rsc_dir))

# weight: [output vocab(tag)] * hidden_dim * 4
# bias: [output vocab(tag)] * 4
- _write_linear('hidden2tag', state_dict, f'{rsc_dir}/hdn2tag.lin')
+ _write_linear('hidden2tag', state_dict, '{}/hdn2tag.lin'.format(rsc_dir))


def run(args):
@@ -196,7 +196,7 @@ def run(args):
args: program arguments
"""
cfg, rsc = _load_cfg_rsc(args.rsc_src, args.model_size)
- state_dict = torch.load(f'{args.rsc_src}/{args.model_size}.model.state',
+ state_dict = torch.load('{}/{}.model.state'.format(args.rsc_src, args.model_size),
map_location=lambda storage, loc: storage)
_validate_state_dict(cfg, rsc, state_dict)
_write_config(cfg, rsc, args.rsc_dir)
8 changes: 4 additions & 4 deletions rsc/bin/compile_preanal.py
@@ -117,7 +117,7 @@ def _load_entries(args):
"""
good_entries = []
bad_entries = []
- for file_path in glob.glob(f'{args.rsc_src}/preanal.*'):
+ for file_path in glob.glob('{}/preanal.*'.format(args.rsc_src)):
file_name = os.path.basename(file_path)
logging.info(file_name)
for line_num, line in enumerate(open(file_path, 'r', encoding='UTF-8'), start=1):
@@ -256,9 +256,9 @@ def _save_trie(rsc_dir, entries):
val += 1 if entry.is_pfx else 0 # odd value for prefix-matching patterns
trie.insert(entry.word, val)
total_tag_nums += len(entry.tag_nums)
- trie.save(f'{rsc_dir}/preanal.tri')
+ trie.save('{}/preanal.tri'.format(rsc_dir))

- val_file = f'{rsc_dir}/preanal.val'
+ val_file = '{}/preanal.val'.format(rsc_dir)
with open(val_file, 'wb') as fout:
fout.write(struct.pack('H', 0)) # the index starts at 1, so put one dummy entry at the very front
for idx, entry in enumerate(entries, start=1):
@@ -277,7 +277,7 @@ def run(args):
args: program arguments
"""
aligner = Aligner(args.rsc_src)
- restore_dic = load_restore_dic(f'{args.rsc_src}/restore.dic')
+ restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
if not restore_dic:
sys.exit(1)
restore_new = defaultdict(dict)
18 changes: 10 additions & 8 deletions rsc/bin/compile_restore.py
@@ -12,6 +12,8 @@
###########
# imports #
###########
+ from __future__ import print_function
+
import argparse
from collections import defaultdict
import logging
@@ -67,11 +69,11 @@ def load_vocab_out(rsc_src):
Returns:
output tag vocabulary
"""
- file_path = f'{rsc_src}/vocab.out'
+ file_path = '{}/vocab.out'.format(rsc_src)
vocab_out = [line.strip() for line in open(file_path, 'r', encoding='UTF-8')
if line.strip()]
vocab_out_more = []
- file_path = f'{rsc_src}/vocab.out.more'
+ file_path = '{}/vocab.out.more'.format(rsc_src)
if os.path.exists(file_path):
vocab_out_more = [line.strip() for line in open(file_path, 'r', encoding='UTF-8')
if line.strip()]
@@ -87,14 +89,14 @@ def append_new_entries(rsc_src, restore_new, vocab_new):
vocab_new: entries to add to the output tag vocabulary
"""
if restore_new:
- with open(f'{rsc_src}/restore.dic', 'a', encoding='UTF-8') as fout:
+ with open('{}/restore.dic'.format(rsc_src), 'a', encoding='UTF-8') as fout:
for (char, tag_out), tag_num_mrp_chr_dic in restore_new.items():
for tag_num, mrp_chr in tag_num_mrp_chr_dic.items():
new_entry_str = '{}/{}:{}\t{}'.format(char, tag_out, tag_num, mrp_chr)
logging.info('[RESTORE] %s', new_entry_str)
print(new_entry_str, file=fout)
if vocab_new:
- with open(f'{rsc_src}/vocab.out.more', 'a', encoding='UTF-8') as fout:
+ with open('{}/vocab.out.more'.format(rsc_src), 'a', encoding='UTF-8') as fout:
new_tags = sorted([(num, tag) for tag, num in vocab_new.items()])
for _, tag in new_tags:
logging.info('[TAG] %s', tag)
@@ -146,8 +148,8 @@ def _save_restore_dic(rsc_dir, bin_dic):
bin_dic: binary dictionary
"""
os.makedirs(rsc_dir, exist_ok=True)
- with open(f'{rsc_dir}/restore.key', 'wb') as fkey:
- with open(f'{rsc_dir}/restore.val', 'wb') as fval:
+ with open('{}/restore.key'.format(rsc_dir), 'wb') as fkey:
+ with open('{}/restore.val'.format(rsc_dir), 'wb') as fval:
for key, vals in sorted(bin_dic.items()):
logging.debug('\t0x%08x => %s', key, ' '.join(['0x%08x' % val for val in vals]))
fkey.write(struct.pack('I', key))
@@ -168,7 +170,7 @@ def _save_restore_one(rsc_dir, vocab_out, vocab_new):
idx_tags = sorted([(idx, tag) for tag, idx
in list(vocab_out.items()) + list(vocab_new.items())])
os.makedirs(rsc_dir, exist_ok=True)
- with open(f'{rsc_dir}/restore.one', 'wb') as fone:
+ with open('{}/restore.one'.format(rsc_dir), 'wb') as fone:
fone.write(struct.pack('B', 0)) # index 0 is empty(filling) byte
for idx, out_tag in idx_tags:
one_tag = out_tag.split(':')[0]
@@ -186,7 +188,7 @@ def run(args):
Args:
args: program arguments
"""
- restore_dic = load_restore_dic(f'{args.rsc_src}/restore.dic')
+ restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
if not restore_dic:
sys.exit(1)
vocab_out = load_vocab_out(args.rsc_src)
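The from __future__ import print_function line added at the top of compile_restore.py above is presumably what keeps the print(new_entry_str, file=fout) call in append_new_entries working on Python 2 as well, where print is otherwise a statement. A minimal sketch of the pattern, with an illustrative file name not taken from the PR:

from __future__ import print_function  # on Python 2, makes print() a function

# write one line per entry, as append_new_entries does with restore.dic
with open('out.txt', 'w') as fout:  # 'out.txt' is an illustrative path
    print('some entry', file=fout)  # the file= keyword requires the function form of print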
2 changes: 1 addition & 1 deletion rsc/lib/char_align.py
@@ -84,7 +84,7 @@ def _open(self, rsc_dir):
Args:
rsc_dir: resource dir
"""
- file_path = f'{rsc_dir}/char_align.map'
+ file_path = '{}/char_align.map'.format(rsc_dir)
file_name = os.path.basename(file_path)
for line_num, line in enumerate(codecs.open(file_path, 'r', encoding='UTF-8'), start=1):
line = line.rstrip('\r\n')
6 changes: 3 additions & 3 deletions rsc/lib/resource.py
@@ -41,11 +41,11 @@ def __init__(self, cfg):
"""
:param cfg: config
"""
- vocab_in_path = f'{cfg.rsc_src}/vocab.in'
+ vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src)
self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS)
- vocab_out_path = f'{cfg.rsc_src}/vocab.out'
+ vocab_out_path = '{}/vocab.out'.format(cfg.rsc_src)
self.vocab_out = Vocabulary(vocab_out_path, 0, None)
- restore_dic_path = f'{cfg.rsc_src}/restore.dic'
+ restore_dic_path = '{}/restore.dic'.format(cfg.rsc_src)
self.restore_dic = self._load_restore_dic(restore_dic_path)

@classmethod
15 changes: 7 additions & 8 deletions src/main/python/khaiii/khaiii.py
@@ -76,7 +76,7 @@ def __init__(self):
self.reserved = b''

def __str__(self):
- return f'{self.lex}/{self.tag}'
+ return '{}/{}'.format(self.lex, self.tag)

def set(self, morph: ctypes.POINTER(_khaiii_morph_t), align: list):
"""
@@ -108,7 +108,7 @@ def __init__(self):

def __str__(self):
morphs_str = ' + '.join([str(m) for m in self.morphs])
- return f'{self.lex}\t{morphs_str}'
+ return '{}\t{}'.format(self.lex, morphs_str)

def set(self, word: ctypes.POINTER(_khaiii_word_t), in_str: str, align: list):
"""
@@ -157,16 +157,15 @@ def __init__(self, lib_path: str = ''):
"""
self._handle = -1
if not lib_path:
- ext = 'dylib' if platform.system() == 'Darwin' else 'so'
- lib_name = f'libkhaiii.{ext}'
- lib_dir = f'{os.path.dirname(__file__)}/lib'
- lib_path = f'{lib_dir}/{lib_name}'
+ lib_name = 'libkhaiii.dylib' if platform.system() == 'Darwin' else 'libkhaiii.so'
+ lib_dir = os.path.join(os.path.dirname(__file__), 'lib')
+ lib_path = '{}/{}'.format(lib_dir, lib_name)
if not os.path.exists(lib_path):
lib_path = find_library(lib_name)
if not lib_path:
logging.error('current working directory: %s', os.getcwd())
logging.error('library directory: %s', lib_dir)
- raise KhaiiiExcept(f'fail to find library: {lib_name}')
+ raise KhaiiiExcept('fail to find library: {}'.format(lib_name))
logging.debug('khaiii library path: %s', lib_path)
self._lib = ctypes.CDLL(lib_path)
self._set_arg_res_types()
@@ -191,7 +190,7 @@ def open(self, rsc_dir: str = '', opt_str: str = ''):
"""
self.close()
if not rsc_dir:
- rsc_dir = f'{os.path.dirname(__file__)}/share/khaiii'
+ rsc_dir = os.path.join(os.path.dirname(__file__), 'share/khaiii')
self._handle = self._lib.khaiii_open(rsc_dir.encode('UTF-8'), opt_str.encode('UTF-8'))
if self._handle < 0:
raise KhaiiiExcept(self._last_error())