Skip to content

Commit 8c94537

Browse files
factorization and harmonization with other models for future api
1 parent b6cb907 commit 8c94537

15 files changed

+855
-611
lines changed

README.md

+27-9
Original file line numberDiff line numberDiff line change
@@ -29,37 +29,55 @@ Similar to [Lample et al.](https://arxiv.org/abs/1603.01360) and [Ma and Hovy](h
2929

3030
## Getting started
3131

32+
3233
1. Download the GloVe vectors with
3334

3435
```
3536
make glove
3637
```
3738

38-
Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `glove_filename` entry in `config.py`
39+
Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `filename_glove` entry in `model/config.py`. You can also choose not to load pretrained word vectors by changing the entry `use_pretrained` to `False` in `model/config.py`.
40+
41+
2. Build the training data, train and evaluate the model with
42+
```
43+
make run
44+
```
45+
46+
47+
## Details
48+
3949

40-
2. Build vocab from the data and extract trimmed glove vectors according to the config in `config.py`.
50+
Here is the breakdown of the commands executed in `make run`:
51+
52+
1. [DO NOT MISS THIS STEP] Build vocab from the data and extract trimmed glove vectors according to the config in `model/config.py`.
4153

4254
```
4355
python build_data.py
4456
```
4557

46-
3. Train and test model with
58+
2. Train the model with
4759

4860
```
49-
python main.py
61+
python train.py
5062
```
5163

52-
Data iterators and utils are in `data_utils.py` and the model with training/test procedures is in `model.py`
5364

54-
Training time on NVidia Tesla K80 is 110 seconds per epoch on CoNLL train set using characters embeddings and CRF.
65+
3. Evaluate and interact with the model with
66+
```
67+
python evaluate.py
68+
```
5569

5670

71+
Data iterators and utils are in `model/data_utils.py` and the model with training/test procedures is in `model/ner_model.py`.
72+
73+
Training time on an NVIDIA Tesla K80 is 110 seconds per epoch on the CoNLL train set using character embeddings and CRF.
74+
5775

5876

59-
## Data
77+
## Training Data
6078

6179

62-
The training data must be in the following format (identical to the CoNLL2003 dataset).
80+
The training data must be in the following format (identical to the CoNLL2003 dataset).
6381

6482
A default test file is provided to help you get started.
6583

@@ -91,7 +109,7 @@ train_filename = "data/coNLL/eng/eng.train.iob"
91109

92110

93111

94-
## License
112+
## License
95113

96114
This project is licensed under the terms of the Apache 2.0 license (as TensorFlow and derivatives). If used for research, citation would be appreciated.
97115

build_data.py

+28-22
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,55 @@
1-
from config import Config
2-
from data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
1+
from model.config import Config
2+
from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
33
get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
44
export_trimmed_glove_vectors, get_processing_word
55

66

7-
def build_data(config):
8-
"""
9-
Procedure to build data
7+
def main():
8+
"""Procedure to build data
9+
10+
You MUST RUN this procedure. It iterates over the whole dataset (train,
11+
dev and test) and extracts the vocabularies in terms of words, tags, and
12+
characters. Having built the vocabularies it writes them in a file. The
13+
writing of vocabulary in a file assigns an id (the line #) to each word.
14+
It then extracts the relevant GloVe vectors and stores them in a np array
15+
such that the i-th entry corresponds to the i-th word in the vocabulary.
16+
1017
1118
Args:
12-
config: defines attributes needed in the function
13-
Returns:
14-
creates vocab files from the datasets
15-
creates a npz embedding file from trimmed glove vectors
19+
config: (instance of Config) has attributes like hyper-params...
20+
1621
"""
22+
# get config and processing of words
23+
config = Config(load=False)
1724
processing_word = get_processing_word(lowercase=True)
1825

1926
# Generators
20-
dev = CoNLLDataset(config.dev_filename, processing_word)
21-
test = CoNLLDataset(config.test_filename, processing_word)
22-
train = CoNLLDataset(config.train_filename, processing_word)
27+
dev = CoNLLDataset(config.filename_dev, processing_word)
28+
test = CoNLLDataset(config.filename_test, processing_word)
29+
train = CoNLLDataset(config.filename_train, processing_word)
2330

2431
# Build Word and Tag vocab
2532
vocab_words, vocab_tags = get_vocabs([train, dev, test])
26-
vocab_glove = get_glove_vocab(config.glove_filename)
33+
vocab_glove = get_glove_vocab(config.filename_glove)
2734

2835
vocab = vocab_words & vocab_glove
2936
vocab.add(UNK)
3037
vocab.add(NUM)
3138

3239
# Save vocab
33-
write_vocab(vocab, config.words_filename)
34-
write_vocab(vocab_tags, config.tags_filename)
40+
write_vocab(vocab, config.filename_words)
41+
write_vocab(vocab_tags, config.filename_tags)
3542

3643
# Trim GloVe Vectors
37-
vocab = load_vocab(config.words_filename)
38-
export_trimmed_glove_vectors(vocab, config.glove_filename,
39-
config.trimmed_filename, config.dim)
44+
vocab = load_vocab(config.filename_words)
45+
export_trimmed_glove_vectors(vocab, config.filename_glove,
46+
config.filename_trimmed, config.dim_word)
4047

4148
# Build and save char vocab
42-
train = CoNLLDataset(config.train_filename)
49+
train = CoNLLDataset(config.filename_train)
4350
vocab_chars = get_char_vocab(train)
44-
write_vocab(vocab_chars, config.chars_filename)
51+
write_vocab(vocab_chars, config.filename_chars)
4552

4653

4754
if __name__ == "__main__":
48-
config = Config()
49-
build_data(config)
55+
main()

config.py

-64
This file was deleted.

data/test.txt

+7-7
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ York I-LOC
3535
The O
3636
European B-ORG
3737
Union I-ORG
38-
is O
39-
a O
38+
is O
39+
a O
4040
political O
4141
and O
4242
economic O
@@ -61,8 +61,8 @@ York I-LOC
6161
The O
6262
European B-ORG
6363
Union I-ORG
64-
is O
65-
a O
64+
is O
65+
a O
6666
political O
6767
and O
6868
economic O
@@ -87,8 +87,8 @@ York I-LOC
8787
The O
8888
European B-ORG
8989
Union I-ORG
90-
is O
91-
a O
90+
is O
91+
a O
9292
political O
9393
and O
9494
economic O
@@ -100,4 +100,4 @@ American I-MISC
100100
actor O
101101
won O
102102
an O
103-
oscar O
103+
oscar O

evaluate.py

+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
from model.data_utils import CoNLLDataset
2+
from model.ner_model import NERModel
3+
from model.config import Config
4+
5+
6+
def align_data(data):
    """Build column-aligned strings from parallel token lists.

    Adapted from Assignment 3 of CS224N

    Every token is right-padded with spaces up to the width of the widest
    token found at the same position across all sequences, plus one
    separating space, so that the i-th tokens of all sequences start at the
    same column.

    Args:
        data: (dict) maps a name to a list of string tokens, e.g.
            data["x"] = ["I", "love", "you"]
            data["y"] = ["O", "O", "O"]

    Returns:
        data_aligned: (dict) same keys, each mapped to one padded string, e.g.
            data_aligned["x"] = "I love you "
            data_aligned["y"] = "O O    O   "
        An empty input dict yields an empty dict.

    """
    # nothing to align: avoids indexing into an empty key list
    if not data:
        return dict()

    # zip(*...) walks the sequences position by position and stops at the
    # shortest one, so every column holds one token per sequence
    spacings = [max(len(token) for token in column)
                for column in zip(*data.values())]

    # for each entry, create aligned string
    data_aligned = dict()
    for key, seq in data.items():
        # ljust(spacing + 1) pads to the column width plus one separator
        data_aligned[key] = "".join(token.ljust(spacing + 1)
                                    for token, spacing in zip(seq, spacings))

    return data_aligned
33+
34+
35+
36+
def interactive_shell(model):
    """Creates interactive shell to play with model

    Reads one sentence per prompt, splits it on whitespace, passes the words
    to model.predict and logs the words and the predictions as aligned
    strings. The loop stops when the user enters 'exit' or when the input
    stream is closed (EOF). Blank lines are ignored.

    Args:
        model: instance of NERModel; only its `logger` and `predict`
            attributes are used here

    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> I love Paris""")

    # resolve the line reader once: raw_input only exists on python 2
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        try:
            sentence = read_line("input> ")
        except EOFError:
            # stdin closed (Ctrl-D or end of piped input): leave the shell
            break

        # split() without argument also discards the empty tokens that
        # consecutive spaces would produce with split(" ")
        words_raw = sentence.split()

        if not words_raw:
            # blank line: nothing to tag, prompt again
            continue

        if words_raw == ["exit"]:
            break

        preds = model.predict(words_raw)
        to_print = align_data({"input": words_raw, "output": preds})

        for seq in to_print.values():
            model.logger.info(seq)
67+
68+
69+
def main():
    """Evaluate the model on the test file, then start the interactive shell.

    Builds an NERModel from a fresh Config, restores the session stored under
    config.dir_model, runs model.evaluate on the dataset read from
    config.filename_test and finally hands the model to interactive_shell.

    """
    settings = Config()

    # model: construct, build, then restore from the configured directory
    ner = NERModel(settings)
    ner.build()
    ner.restore_session(settings.dir_model)

    # test dataset, using the word/tag processing and max_iter from the config
    test_set = CoNLLDataset(
        settings.filename_test,
        settings.processing_word,
        settings.processing_tag,
        settings.max_iter,
    )

    # evaluation first, interactive session afterwards
    ner.evaluate(test_set)
    interactive_shell(ner)
85+
86+
87+
if __name__ == "__main__":
88+
main()

main.py

-46
This file was deleted.

0 commit comments

Comments
 (0)