
Commit

factorization and harmonization with other models for future api
guillaumegenthial committed Sep 14, 2017
1 parent b6cb907 commit 8c94537
Showing 15 changed files with 855 additions and 611 deletions.
36 changes: 27 additions & 9 deletions README.md
@@ -29,37 +29,55 @@ Similar to [Lample et al.](https://arxiv.org/abs/1603.01360) and [Ma and Hovy](h
 
 ## Getting started
 
+
 1. Download the GloVe vectors with
 
 ```
 make glove
 ```
 
-Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `glove_filename` entry in `config.py`
+Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `glove_filename` entry in `config.py`. You can also choose not to load pretrained word vectors by changing the entry `use_pretrained` to `False` in `model/config.py`.
 
-2. Build vocab from the data and extract trimmed glove vectors according to the config in `config.py`.
+2. Build the training data, train and evaluate the model with
+```
+make run
+```
+
+
+## Details
+
+Here is the breakdown of the commands executed in `make run`:
+
+1. [DO NOT MISS THIS STEP] Build vocab from the data and extract trimmed glove vectors according to the config in `model/config.py`.
 
 ```
 python build_data.py
 ```
 
-3. Train and test model with
+2. Train the model with
 
 ```
-python main.py
+python train.py
 ```
 
-Data iterators and utils are in `data_utils.py` and the model with training/test procedures is in `model.py`
-
-Training time on NVidia Tesla K80 is 110 seconds per epoch on CoNLL train set using characters embeddings and CRF.
+3. Evaluate and interact with the model with
+```
+python evaluate.py
+```
+
+
+Data iterators and utils are in `model/data_utils.py` and the model with training/test procedures is in `model/ner_model.py`
+
+Training time on NVidia Tesla K80 is 110 seconds per epoch on CoNLL train set using characters embeddings and CRF.
 
 
 
-## Data
+## Training Data
 
 
-The training data must be in the following format (identical to the CoNLL2003 dataset).
+The training data must be in the following format (identical to the CoNLL2003 dataset).
 
 A default test file is provided to help you getting started.
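
For reference, a sentence in this format uses one token and its IOB tag per line, with a blank line separating sentences. A made-up illustration (not the contents of the default test file) looks like this:

```
John B-PER
lives O
in O
New B-LOC
York I-LOC
. O
```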

@@ -91,7 +109,7 @@ train_filename = "data/coNLL/eng/eng.train.iob"
 
 
 
-## License
+## License
 
 This project is licensed under the terms of the apache 2.0 license (as Tensorflow and derivatives). If used for research, citation would be appreciated.
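
The `glove_filename` and `use_pretrained` entries mentioned in step 1 of "Getting started" live in the config module. A rough sketch of what they might look like (the attribute names and values here are assumptions for illustration; this commit renames several config attributes, e.g. `build_data.py` below reads `config.filename_glove`, so check `model/config.py` for the exact names):

```python
# Illustrative config entries; names and values are assumptions,
# not necessarily what model/config.py actually contains.
dim_word = 300                  # dimension of the pretrained word vectors
use_pretrained = True           # set to False to train without GloVe vectors
glove_filename = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
```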

50 changes: 28 additions & 22 deletions build_data.py
@@ -1,49 +1,55 @@
-from config import Config
-from data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
+from model.config import Config
+from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
     get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
     export_trimmed_glove_vectors, get_processing_word
 
 
-def build_data(config):
-    """
-    Procedure to build data
+def main():
+    """Procedure to build data
+    You MUST RUN this procedure. It iterates over the whole dataset (train,
+    dev and test) and extract the vocabularies in terms of words, tags, and
+    characters. Having built the vocabularies it writes them in a file. The
+    writing of vocabulary in a file assigns an id (the line #) to each word.
+    It then extract the relevant GloVe vectors and stores them in a np array
+    such that the i-th entry corresponds to the i-th word in the vocabulary.
     Args:
-        config: defines attributes needed in the function
-    Returns:
-        creates vocab files from the datasets
-        creates a npz embedding file from trimmed glove vectors
+        config: (instance of Config) has attributes like hyper-params...
     """
+    # get config and processing of words
+    config = Config(load=False)
     processing_word = get_processing_word(lowercase=True)
 
     # Generators
-    dev = CoNLLDataset(config.dev_filename, processing_word)
-    test = CoNLLDataset(config.test_filename, processing_word)
-    train = CoNLLDataset(config.train_filename, processing_word)
+    dev = CoNLLDataset(config.filename_dev, processing_word)
+    test = CoNLLDataset(config.filename_test, processing_word)
+    train = CoNLLDataset(config.filename_train, processing_word)
 
     # Build Word and Tag vocab
     vocab_words, vocab_tags = get_vocabs([train, dev, test])
-    vocab_glove = get_glove_vocab(config.glove_filename)
+    vocab_glove = get_glove_vocab(config.filename_glove)
 
     vocab = vocab_words & vocab_glove
     vocab.add(UNK)
     vocab.add(NUM)
 
     # Save vocab
-    write_vocab(vocab, config.words_filename)
-    write_vocab(vocab_tags, config.tags_filename)
+    write_vocab(vocab, config.filename_words)
+    write_vocab(vocab_tags, config.filename_tags)
 
     # Trim GloVe Vectors
-    vocab = load_vocab(config.words_filename)
-    export_trimmed_glove_vectors(vocab, config.glove_filename,
-                                 config.trimmed_filename, config.dim)
+    vocab = load_vocab(config.filename_words)
+    export_trimmed_glove_vectors(vocab, config.filename_glove,
+                                 config.filename_trimmed, config.dim_word)
 
     # Build and save char vocab
-    train = CoNLLDataset(config.train_filename)
+    train = CoNLLDataset(config.filename_train)
     vocab_chars = get_char_vocab(train)
-    write_vocab(vocab_chars, config.chars_filename)
+    write_vocab(vocab_chars, config.filename_chars)
 
 
 if __name__ == "__main__":
-    config = Config()
-    build_data(config)
+    main()
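
The docstring above pins down two contracts: a word's id is its line number in the vocab file, and the i-th row of the trimmed embedding array is the vector of word i. A minimal sketch of helpers satisfying those contracts (an illustration under those assumptions, not the repo's actual `model/data_utils.py`):

```python
import numpy as np


def write_vocab_sketch(vocab, filename):
    """Write one word per line; a word's id is its 0-indexed line number."""
    with open(filename, "w") as f:
        f.write("\n".join(vocab))


def load_vocab_sketch(filename):
    """Invert write_vocab_sketch: map each word to its line number."""
    with open(filename) as f:
        return {word.strip(): idx for idx, word in enumerate(f)}


def export_trimmed_sketch(vocab, glove_filename, trimmed_filename, dim):
    """Keep only the GloVe rows for in-vocab words, so that row i of the
    saved array is the vector of the word whose id is i."""
    embeddings = np.zeros((len(vocab), dim))
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(" ")
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)
```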
64 changes: 0 additions & 64 deletions config.py

This file was deleted.

14 changes: 7 additions & 7 deletions data/test.txt (the changed lines differ only in whitespace, which the plain-text rendering below cannot show)
@@ -35,8 +35,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -61,8 +61,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -87,8 +87,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -100,4 +100,4 @@ American I-MISC
 actor O
 won O
 an O
-oscar O
+oscar O
88 changes: 88 additions & 0 deletions evaluate.py
@@ -0,0 +1,88 @@
from model.data_utils import CoNLLDataset
from model.ner_model import NERModel
from model.config import Config


def align_data(data):
    """Given dict with lists, creates aligned strings
    Adapted from Assignment 3 of CS224N
    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]
    Returns:
        data_aligned: (dict) data_align["x"] = "I love you"
                             data_align["y"] = "O O O "
    """
    spacings = [max([len(seq[i]) for seq in data.values()])
                for i in range(len(data[list(data.keys())[0]]))]
    data_aligned = dict()

    # for each entry, create aligned string
    for key, seq in data.items():
        str_aligned = ""
        for token, spacing in zip(seq, spacings):
            str_aligned += token + " " * (spacing - len(token) + 1)

        data_aligned[key] = str_aligned

    return data_aligned


def interactive_shell(model):
    """Creates interactive shell to play with model
    Args:
        model: instance of NERModel
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> I love Paris""")

    while True:
        try:
            # for python 2
            sentence = raw_input("input> ")
        except NameError:
            # for python 3
            sentence = input("input> ")

        words_raw = sentence.strip().split(" ")

        if words_raw == ["exit"]:
            break

        preds = model.predict(words_raw)
        to_print = align_data({"input": words_raw, "output": preds})

        for key, seq in to_print.items():
            model.logger.info(seq)


def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)
    interactive_shell(model)


if __name__ == "__main__":
    main()
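
To make `align_data` concrete: each token is padded to the width of the widest token in its column plus one space, so the two rows line up when printed on consecutive lines. A hand-traced example with hypothetical inputs:

```python
# Hypothetical sentence and tags; align_data is defined above.
aligned = align_data({"input": ["I", "love", "Paris"],
                      "output": ["O", "O", "B-LOC"]})
print(aligned["input"])   # "I love Paris "
print(aligned["output"])  # "O O    B-LOC "
```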
46 changes: 0 additions & 46 deletions main.py

This file was deleted.

