forked from guillaumegenthial/sequence_tagging
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
factorization and harmonization with other models for future api
- Loading branch information
1 parent
b6cb907
commit 8c94537
Showing
15 changed files
with
855 additions
and
611 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,55 @@ | ||
from config import Config | ||
from data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \ | ||
from model.config import Config | ||
from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \ | ||
get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \ | ||
export_trimmed_glove_vectors, get_processing_word | ||
|
||
|
||
def main():
    """Build the vocabulary and trimmed-embedding files for the model.

    You MUST RUN this procedure before training. It walks the whole
    dataset (train, dev and test), collects the word, tag and character
    vocabularies, and writes each of them to a file — the line number of
    a word in its file becomes that word's id. It then extracts the
    matching GloVe vectors and stores them in a numpy array whose i-th
    row corresponds to the i-th word of the vocabulary.
    """
    # Config with load=False: the vocab files do not exist yet, so the
    # Config must not try to read them. The word preprocessor only
    # lowercases while scanning the corpora.
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Lazy generators over the three splits.
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Word/tag vocabularies from the corpora, plus the set of words that
    # actually have a GloVe vector.
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Keep only words covered by GloVe; always include the special
    # unknown-word and number tokens.
    vocab = (vocab_words & vocab_glove) | {UNK, NUM}

    # Persist the vocabularies (line order defines the word ids).
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Re-load to obtain the word -> id mapping implied by line order,
    # then dump only the GloVe rows we actually need.
    word_ids = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(word_ids, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary is built from the raw (unprocessed) training
    # words so character case is preserved.
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    write_vocab(vocab_chars, config.filename_chars)
||
# Script entry point: build the vocab/embedding files once, up front.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from model.data_utils import CoNLLDataset | ||
from model.ner_model import NERModel | ||
from model.config import Config | ||
|
||
|
||
def align_data(data):
    """Given a dict of equal-length token lists, create aligned strings.

    Adapted from Assignment 3 of CS224N.

    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]

    Returns:
        data_aligned: (dict) data_aligned["x"] = "I love you "
                             data_aligned["y"] = "O O    O   "
                      Each token is padded to the widest token at its
                      position (across all sequences) plus one space.
    """
    # Robustness: the original indexed data[list(data.keys())[0]], which
    # raised on an empty dict. An empty input has nothing to align.
    if not data:
        return {}

    # Column width = widest token at each position across all sequences.
    # zip(*...) also tolerates unequal sequence lengths by truncating to
    # the shortest, instead of raising IndexError.
    spacings = [max(len(token) for token in column)
                for column in zip(*data.values())]

    # Pad every token to its column width plus one separating space.
    data_aligned = dict()
    for key, seq in data.items():
        data_aligned[key] = "".join(
            token + " " * (width - len(token) + 1)
            for token, width in zip(seq, spacings))

    return data_aligned
|
||
|
||
|
||
def interactive_shell(model):
    """Run a read-predict-print loop on stdin.

    Args:
        model: instance of NERModel
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> I love Paris""")

    while True:
        try:
            sentence = raw_input("input> ")  # python 2
        except NameError:
            sentence = input("input> ")      # python 3

        words_raw = sentence.strip().split(" ")

        # Sentinel: a line consisting of exactly "exit" ends the session.
        if words_raw == ["exit"]:
            break

        # Predict tags and print tokens/tags column-aligned.
        preds = model.predict(words_raw)
        aligned = align_data({"input": words_raw, "output": preds})
        for line in aligned.values():
            model.logger.info(line)
||
def main():
    """Restore a trained NER model, score it on the test set, then
    drop into the interactive shell."""
    config = Config()

    # Rebuild the graph and load the trained weights from disk.
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # Test split, preprocessed exactly as during training.
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # Report test-set metrics, then hand control to the user.
    model.evaluate(test)
    interactive_shell(model)
|
||
|
||
# Script entry point: evaluate the saved model and start the shell.
if __name__ == "__main__":
    main()
Oops, something went wrong.