Merge pull request #2435 from flairNLP/dictionary-unking

Add UNKs to label dictionaries by default
flairNLP · Sep 14, 2021 · e94d530
2 parents 7b44afd + b81f611
commit e94d530
Show file tree

Hide file tree

Showing 10 changed files with 213 additions and 21 deletions.
diff --git a/flair/data.py b/flair/data.py
@@ -6,6 +6,7 @@
 from operator import itemgetter
 from typing import List, Dict, Union, Optional
 
+from deprecated import deprecated
 import torch
 from torch.utils.data import Dataset
 from torch.utils.data.dataset import ConcatDataset, Subset
@@ -147,7 +148,7 @@ def load(cls, name: str):
         return Dictionary.load_from_file(name)
 
     def __str__(self):
-        tags = ', '.join(self.get_item_for_index(i) for i in range(min(len(self), 30)))
+        tags = ', '.join(self.get_item_for_index(i) for i in range(min(len(self), 50)))
         return f"Dictionary with {len(self)} tags: {tags}"
 
 
@@ -1383,7 +1384,7 @@ def make_label_dictionary(self, label_type: str) -> Dictionary:
         Creates a dictionary of all labels assigned to the sentences in the corpus.
         :return: dictionary of labels
         """
-        label_dictionary: Dictionary = Dictionary(add_unk=False)
+        label_dictionary: Dictionary = Dictionary(add_unk=True)
         label_dictionary.multi_label = False
 
         from flair.datasets import DataLoader
@@ -1458,6 +1459,7 @@ def get_all_sentences(self) -> Dataset:
         if self.test: parts.append(self.test)
         return ConcatDataset(parts)
 
+    @deprecated(version="0.8", reason="Use 'make_label_dictionary' instead.")
     def make_tag_dictionary(self, tag_type: str) -> Dictionary:
 
         # Make the tag dictionary

diff --git a/flair/hyperparameter/param_selection.py b/flair/hyperparameter/param_selection.py
@@ -207,7 +207,7 @@ def __init__(
         )
 
         self.tag_type = tag_type
-        self.tag_dictionary = self.corpus.make_tag_dictionary(self.tag_type)
+        self.tag_dictionary = self.corpus.make_label_dictionary(self.tag_type)
 
     def _set_up_model(self, params: dict):
         sequence_tagger_params = {

diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py
@@ -196,10 +196,10 @@ def add_and_switch_to_new_task(self,
             if type(label_dictionary) == str:
                 label_dictionary = [label_dictionary]
 
-            # prepare dictionary of tags (without B- I- prefixes)
+            # prepare dictionary of tags (without B- I- prefixes and without UNK)
             tag_dictionary = Dictionary(add_unk=False)
             for tag in label_dictionary:
-                if tag == 'O': continue
+                if tag == '<unk>' or tag == 'O': continue
                 if tag[1] == "-":
                     tag = tag[2:]
                     tag_dictionary.add_item(tag)

diff --git a/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md b/resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md
@@ -19,7 +19,7 @@ corpus = NCBI_DISEASE()
 print(corpus)
 
 # 2. make the tag dictionary from the corpus
-tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
+tag_dictionary = corpus.make_label_dictionary(label_type="ner")
 
 # 3. initialize embeddings
 from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
@@ -134,7 +134,7 @@ embedding_types = [
 embeddings = StackedEmbeddings(embeddings=embedding_types)
 
 # 3. initialize sequence tagger
-tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
+tag_dictionary = corpus.make_label_dictionary(label_type="ner")
 
 tagger = SequenceTagger(
     hidden_size=256,

diff --git a/tests/resources/tasks/fashion_disjunct/eng.testa b/tests/resources/tasks/fashion_disjunct/eng.testa
@@ -0,0 +1,21 @@
+Most _ _ O
+wedding _ _ B-Occasion
+dresses _ _ B-NominalProduct
+, _ _ O
+for _ _ O
+example _ _ O
+, _ _ O
+are _ _ O
+simply _ _ O
+too _ _ O
+enormous _ _ O
+and _ _ O
+terrifyingly _ _ B-CreativeWord
+loaded _ _ O
+with _ _ O
+sentimental _ _ O
+value _ _ O
+for _ _ O
+DIY _ _ B-ProductDesign
+dyeing _ _ I-ProductDesign
+. _ _ O
diff --git a/tests/resources/tasks/fashion_disjunct/eng.testb b/tests/resources/tasks/fashion_disjunct/eng.testb
@@ -0,0 +1,20 @@
+For _ _ O
+my _ _ O
+Nurse _ _ B-NamedOrganizationBrand
+Ratched _ _ I-NamedOrganizationBrand
+dress _ _ B-NominalProduct
+, _ _ O
+I _ _ O
+had _ _ O
+brought _ _ O
+two _ _ O
+dyeing _ _ O
+options _ _ O
+— _ _ O
+one _ _ O
+more _ _ O
+ambitious _ _ B-Ambitiousness
+than _ _ O
+the _ _ O
+other _ _ O
+. _ _ O
diff --git a/tests/resources/tasks/fashion_disjunct/eng.train b/tests/resources/tasks/fashion_disjunct/eng.train
@@ -0,0 +1,122 @@
+From _ _ O
+the _ _ O
+charming _ _ O
+Arlésienne _ _ B-NamedPerson
+to _ _ O
+the _ _ O
+shepherdess _ _ O
+in _ _ O
+a _ _ O
+fairy _ _ O
+tale _ _ O
+, _ _ O
+with _ _ O
+faille _ _ B-ProductPart
+, _ _ O
+piqué _ _ B-ProductPart
+, _ _ O
+taffeta _ _ B-ProductPart
+, _ _ O
+tulle _ _ B-ProductPart
+, _ _ O
+embroidery _ _ B-ProductPart
+, _ _ O
+lace _ _ B-ProductPart
+, _ _ O
+the _ _ O
+repertoire _ _ O
+is _ _ O
+inexhaustible _ _ O
+. _ _ O
+
+
+
+
+Subscribe _ _ O
+to _ _ O
+Highsnobiety _ _ B-NamedOrganizationPublisher
+on _ _ O
+YouTube _ _ B-NamedOrganizationOther
+Eric _ _ B-NamedPerson
+Schoenborn _ _ I-NamedPerson
+and _ _ O
+Ed _ _ B-NamedPerson
+Selego _ _ I-NamedPerson
+have _ _ O
+joined _ _ O
+forces _ _ O
+with _ _ O
+Nocturnal _ _ B-NamedOrganizationBrand
+skate _ _ B-Activity
+shop _ _ O
+to _ _ O
+turn _ _ O
+Drexel _ _ B-NamedLocation
+University _ _ I-NamedLocation
+’ _ _ O
+s _ _ O
+Leonard _ _ B-NamedLocation
+Pearlstein _ _ I-NamedLocation
+Gallery _ _ I-NamedLocation
+into _ _ O
+an _ _ O
+interactive _ _ O
+skate _ _ B-Activity
+pop _ _ O
+- _ _ O
+up _ _ O
+park _ _ O
+. _ _ O
+
+Philly _ _ B-NamedPerson
+Radness _ _ I-NamedPerson
+accounts _ _ O
+for _ _ O
+the _ _ O
+second _ _ O
+installment _ _ O
+in _ _ O
+the _ _ O
+Phenomenal _ _ O
+Radness _ _ O
+project _ _ O
+, _ _ O
+after _ _ O
+its _ _ O
+debut _ _ O
+in _ _ O
+Miami _ _ B-NamedLocation
+a _ _ O
+few _ _ O
+years _ _ O
+ago _ _ O
+. _ _ O
+
+Milan _ _ B-NamedLocation
+was _ _ O
+all _ _ O
+the _ _ O
+really _ _ O
+big _ _ O
+girls _ _ O
+. _ _ O
+
+It _ _ O
+was _ _ O
+the _ _ O
+best _ _ O
+! _ _ O
+
+We _ _ O
+go _ _ O
+to _ _ O
+flea _ _ O
+markets _ _ O
+together _ _ O
+when _ _ O
+we _ _ O
+' _ _ O
+re _ _ O
+in _ _ O
+LA _ _ B-NamedLocation
+. _ _ O
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -589,8 +589,8 @@ def test_tagged_corpus_make_label_dictionary():
 
     label_dict = corpus.make_label_dictionary('label')
 
-    assert 2 == len(label_dict)
-    assert "<unk>" not in label_dict.get_items()
+    assert 3 == len(label_dict)
+    assert "<unk>" in label_dict.get_items()
     assert "class_1" in label_dict.get_items()
     assert "class_2" in label_dict.get_items()
 

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -24,7 +24,7 @@ def test_sequence_tagger_no_crf(results_base_path, tasks_base_path):
     corpus: Corpus = ColumnCorpus(
         data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -77,7 +77,7 @@ def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
     corpus: Corpus = ColumnCorpus(
         data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -130,7 +130,7 @@ def test_sequence_tagger_stacked(results_base_path, tasks_base_path):
     corpus: Corpus = ColumnCorpus(
         data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(
@@ -183,7 +183,7 @@ def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path
     corpus: Corpus = ColumnCorpus(
         data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     # tagger without CRF
     tagger: SequenceTagger = SequenceTagger(

diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
@@ -61,7 +61,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
         data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -104,7 +104,7 @@ def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
         data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -142,10 +142,37 @@ def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
     del loaded_model
 
 
+@pytest.mark.integration
+def test_train_load_use_tagger_disjunct_tags(results_base_path, tasks_base_path):
+    corpus = flair.datasets.ColumnCorpus(
+        data_folder=tasks_base_path / "fashion_disjunct", column_format={0: "text", 3: "ner"}
+    )
+    tag_dictionary = corpus.make_label_dictionary("ner")
+
+    tagger: SequenceTagger = SequenceTagger(
+        hidden_size=64,
+        embeddings=turian_embeddings,
+        tag_dictionary=tag_dictionary,
+        tag_type="ner",
+        use_crf=False,
+    )
+
+    # initialize trainer
+    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
+
+    trainer.train(
+        results_base_path,
+        learning_rate=0.1,
+        mini_batch_size=2,
+        max_epochs=2,
+        shuffle=False,
+    )
+
+
 @pytest.mark.integration
 def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
     corpus = flair.datasets.UD_ENGLISH().downsample(0.05)
-    tag_dictionary = corpus.make_tag_dictionary("pos")
+    tag_dictionary = corpus.make_label_dictionary("pos")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -188,7 +215,7 @@ def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_pa
     corpus = flair.datasets.ColumnCorpus(
         data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -231,7 +258,7 @@ def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
         data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -277,7 +304,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
     corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path)
 
     corpus = MultiCorpus([corpus_1, corpus_2])
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -323,7 +350,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
     corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path)
 
     corpus = MultiCorpus([corpus_1, corpus_2])
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     model: SequenceTagger = SequenceTagger(
         hidden_size=64,
@@ -351,7 +378,7 @@ def test_find_learning_rate(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
         data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
-    tag_dictionary = corpus.make_tag_dictionary("ner")
+    tag_dictionary = corpus.make_label_dictionary("ner")
 
     tagger: SequenceTagger = SequenceTagger(
         hidden_size=64,