
Commit

factorization and harmonization with other models for future api
guillaumegenthial committed Sep 14, 2017
1 parent b6cb907 commit 8c94537
Showing 15 changed files with 855 additions and 611 deletions.
36 changes: 27 additions & 9 deletions README.md
@@ -29,37 +29,55 @@ Similar to [Lample et al.](https://arxiv.org/abs/1603.01360) and [Ma and Hovy](h
 
 ## Getting started
 
+
 1. Download the GloVe vectors with
 
 ```
 make glove
 ```
 
-Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `glove_filename` entry in `config.py`
+Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `glove_filename` entry in `config.py`. You can also choose not to load pretrained word vectors by changing the entry `use_pretrained` to `False` in `model/config.py`.
 
-2. Build vocab from the data and extract trimmed glove vectors according to the config in `config.py`.
+2. Build the training data, train and evaluate the model with
+```
+make run
+```
+
+
+## Details
+
+Here is the breakdown of the commands executed in `make run`:
+
+1. [DO NOT MISS THIS STEP] Build vocab from the data and extract trimmed glove vectors according to the config in `model/config.py`.
 
 ```
 python build_data.py
 ```
 
-3. Train and test model with
+2. Train the model with
 
 ```
-python main.py
+python train.py
 ```
 
-Data iterators and utils are in `data_utils.py` and the model with training/test procedures is in `model.py`
-
-Training time on NVidia Tesla K80 is 110 seconds per epoch on CoNLL train set using characters embeddings and CRF.
+3. Evaluate and interact with the model with
+```
+python evaluate.py
+```
+
+
+Data iterators and utils are in `model/data_utils.py` and the model with training/test procedures is in `model/ner_model.py`
+
+Training time on NVidia Tesla K80 is 110 seconds per epoch on CoNLL train set using characters embeddings and CRF.
 
 
 
-## Data
+## Training Data
 
 
-The training data must be in the following format (identical to the CoNLL2003 dataset).
+The training data must be in the following format (identical to the CoNLL2003 dataset).
 
 A default test file is provided to help you getting started.
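
For reference, a sentence in this format uses one token and its IOB tag per line, with a blank line separating sentences. A made-up illustration (not the contents of the default test file) looks like this:

```
John B-PER
lives O
in O
New B-LOC
York I-LOC
. O
```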

@@ -91,7 +109,7 @@ train_filename = "data/coNLL/eng/eng.train.iob"
 
 
 
-## License
+## License
 
 This project is licensed under the terms of the apache 2.0 license (as Tensorflow and derivatives). If used for research, citation would be appreciated.
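
The `glove_filename` and `use_pretrained` entries mentioned in step 1 of "Getting started" live in the config module. A rough sketch of what they might look like (the attribute names and values here are assumptions for illustration; this commit renames several config attributes, e.g. `build_data.py` below reads `config.filename_glove`, so check `model/config.py` for the exact names):

```python
# Illustrative config entries; names and values are assumptions,
# not necessarily what model/config.py actually contains.
dim_word = 300                  # dimension of the pretrained word vectors
use_pretrained = True           # set to False to train without GloVe vectors
glove_filename = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
```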

50 changes: 28 additions & 22 deletions build_data.py
@@ -1,49 +1,55 @@
-from config import Config
-from data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
+from model.config import Config
+from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
     get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
     export_trimmed_glove_vectors, get_processing_word
 
 
-def build_data(config):
-    """
-    Procedure to build data
+def main():
+    """Procedure to build data
+    You MUST RUN this procedure. It iterates over the whole dataset (train,
+    dev and test) and extract the vocabularies in terms of words, tags, and
+    characters. Having built the vocabularies it writes them in a file. The
+    writing of vocabulary in a file assigns an id (the line #) to each word.
+    It then extract the relevant GloVe vectors and stores them in a np array
+    such that the i-th entry corresponds to the i-th word in the vocabulary.
     Args:
-        config: defines attributes needed in the function
-    Returns:
-        creates vocab files from the datasets
-        creates a npz embedding file from trimmed glove vectors
+        config: (instance of Config) has attributes like hyper-params...
     """
+    # get config and processing of words
+    config = Config(load=False)
     processing_word = get_processing_word(lowercase=True)
 
     # Generators
-    dev = CoNLLDataset(config.dev_filename, processing_word)
-    test = CoNLLDataset(config.test_filename, processing_word)
-    train = CoNLLDataset(config.train_filename, processing_word)
+    dev = CoNLLDataset(config.filename_dev, processing_word)
+    test = CoNLLDataset(config.filename_test, processing_word)
+    train = CoNLLDataset(config.filename_train, processing_word)
 
     # Build Word and Tag vocab
     vocab_words, vocab_tags = get_vocabs([train, dev, test])
-    vocab_glove = get_glove_vocab(config.glove_filename)
+    vocab_glove = get_glove_vocab(config.filename_glove)
 
     vocab = vocab_words & vocab_glove
     vocab.add(UNK)
     vocab.add(NUM)
 
     # Save vocab
-    write_vocab(vocab, config.words_filename)
-    write_vocab(vocab_tags, config.tags_filename)
+    write_vocab(vocab, config.filename_words)
+    write_vocab(vocab_tags, config.filename_tags)
 
     # Trim GloVe Vectors
-    vocab = load_vocab(config.words_filename)
-    export_trimmed_glove_vectors(vocab, config.glove_filename,
-                                 config.trimmed_filename, config.dim)
+    vocab = load_vocab(config.filename_words)
+    export_trimmed_glove_vectors(vocab, config.filename_glove,
+                                 config.filename_trimmed, config.dim_word)
 
     # Build and save char vocab
-    train = CoNLLDataset(config.train_filename)
+    train = CoNLLDataset(config.filename_train)
     vocab_chars = get_char_vocab(train)
-    write_vocab(vocab_chars, config.chars_filename)
+    write_vocab(vocab_chars, config.filename_chars)
 
 
 if __name__ == "__main__":
-    config = Config()
-    build_data(config)
+    main()
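
The docstring above pins down two contracts: a word's id is its line number in the vocab file, and the i-th row of the trimmed embedding array is the vector of word i. A minimal sketch of helpers satisfying those contracts (an illustration under those assumptions, not the repo's actual `model/data_utils.py`):

```python
import numpy as np


def write_vocab_sketch(vocab, filename):
    """Write one word per line; a word's id is its 0-indexed line number."""
    with open(filename, "w") as f:
        f.write("\n".join(vocab))


def load_vocab_sketch(filename):
    """Invert write_vocab_sketch: map each word to its line number."""
    with open(filename) as f:
        return {word.strip(): idx for idx, word in enumerate(f)}


def export_trimmed_sketch(vocab, glove_filename, trimmed_filename, dim):
    """Keep only the GloVe rows for in-vocab words, so that row i of the
    saved array is the vector of the word whose id is i."""
    embeddings = np.zeros((len(vocab), dim))
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(" ")
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)
```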
64 changes: 0 additions & 64 deletions config.py

This file was deleted.

14 changes: 7 additions & 7 deletions data/test.txt (the changed lines differ only in whitespace, which the plain-text rendering below cannot show)
@@ -35,8 +35,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -61,8 +61,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -87,8 +87,8 @@ York I-LOC
 The O
 European B-ORG
 Union I-ORG
-is O
-a O
+is O
+a O
 political O
 and O
 economic O
@@ -100,4 +100,4 @@ American I-MISC
 actor O
 won O
 an O
-oscar O
+oscar O
88 changes: 88 additions & 0 deletions evaluate.py
@@ -0,0 +1,88 @@
from model.data_utils import CoNLLDataset
from model.ner_model import NERModel
from model.config import Config


def align_data(data):
    """Given dict with lists, creates aligned strings
    Adapted from Assignment 3 of CS224N
    Args:
        data: (dict) data["x"] = ["I", "love", "you"]
              (dict) data["y"] = ["O", "O", "O"]
    Returns:
        data_aligned: (dict) data_align["x"] = "I love you"
                             data_align["y"] = "O O O "
    """
    spacings = [max([len(seq[i]) for seq in data.values()])
                for i in range(len(data[list(data.keys())[0]]))]
    data_aligned = dict()

    # for each entry, create aligned string
    for key, seq in data.items():
        str_aligned = ""
        for token, spacing in zip(seq, spacings):
            str_aligned += token + " " * (spacing - len(token) + 1)

        data_aligned[key] = str_aligned

    return data_aligned


def interactive_shell(model):
    """Creates interactive shell to play with model
    Args:
        model: instance of NERModel
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> I love Paris""")

    while True:
        try:
            # for python 2
            sentence = raw_input("input> ")
        except NameError:
            # for python 3
            sentence = input("input> ")

        words_raw = sentence.strip().split(" ")

        if words_raw == ["exit"]:
            break

        preds = model.predict(words_raw)
        to_print = align_data({"input": words_raw, "output": preds})

        for key, seq in to_print.items():
            model.logger.info(seq)


def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)
    interactive_shell(model)


if __name__ == "__main__":
    main()
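
To make `align_data` concrete: each token is padded to the width of the widest token in its column plus one space, so the two rows line up when printed on consecutive lines. A hand-traced example with hypothetical inputs:

```python
# Hypothetical sentence and tags; align_data is defined above.
aligned = align_data({"input": ["I", "love", "Paris"],
                      "output": ["O", "O", "B-LOC"]})
print(aligned["input"])   # "I love Paris "
print(aligned["output"])  # "O O    B-LOC "
```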
46 changes: 0 additions & 46 deletions main.py

This file was deleted.

