Skip to content

Commit

Permalink
Merge pull request #2435 from flairNLP/dictionary-unking
Browse files (browse the repository at this point in the history)
Add UNKs to label dictionaries by default
  • Loading branch information
alanakbik authored Sep 14, 2021
2 parents 7b44afd + b81f611 commit e94d530
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 21 deletions.
6 changes: 4 additions & 2 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from operator import itemgetter
from typing import List, Dict, Union, Optional

from deprecated import deprecated
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataset import ConcatDataset, Subset
Expand Down Expand Up @@ -147,7 +148,7 @@ def load(cls, name: str):
return Dictionary.load_from_file(name)

def __str__(self):
tags = ', '.join(self.get_item_for_index(i) for i in range(min(len(self), 30)))
tags = ', '.join(self.get_item_for_index(i) for i in range(min(len(self), 50)))
return f"Dictionary with {len(self)} tags: {tags}"


Expand Down Expand Up @@ -1383,7 +1384,7 @@ def make_label_dictionary(self, label_type: str) -> Dictionary:
Creates a dictionary of all labels assigned to the sentences in the corpus.
:return: dictionary of labels
"""
label_dictionary: Dictionary = Dictionary(add_unk=False)
label_dictionary: Dictionary = Dictionary(add_unk=True)
label_dictionary.multi_label = False

from flair.datasets import DataLoader
Expand Down Expand Up @@ -1458,6 +1459,7 @@ def get_all_sentences(self) -> Dataset:
if self.test: parts.append(self.test)
return ConcatDataset(parts)

@deprecated(version="0.8", reason="Use 'make_label_dictionary' instead.")
def make_tag_dictionary(self, tag_type: str) -> Dictionary:

# Make the tag dictionary
Expand Down
2 changes: 1 addition & 1 deletion flair/hyperparameter/param_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def __init__(
)

self.tag_type = tag_type
self.tag_dictionary = self.corpus.make_tag_dictionary(self.tag_type)
self.tag_dictionary = self.corpus.make_label_dictionary(self.tag_type)

def _set_up_model(self, params: dict):
sequence_tagger_params = {
Expand Down
4 changes: 2 additions & 2 deletions flair/models/tars_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,10 @@ def add_and_switch_to_new_task(self,
if type(label_dictionary) == str:
label_dictionary = [label_dictionary]

# prepare dictionary of tags (without B- I- prefixes)
# prepare dictionary of tags (without B- I- prefixes and without UNK)
tag_dictionary = Dictionary(add_unk=False)
for tag in label_dictionary:
if tag == 'O': continue
if tag == '<unk>' or tag == 'O': continue
if tag[1] == "-":
tag = tag[2:]
tag_dictionary.add_item(tag)
Expand Down
4 changes: 2 additions & 2 deletions resources/docs/HUNFLAIR_TUTORIAL_2_TRAINING.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ corpus = NCBI_DISEASE()
print(corpus)

# 2. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
tag_dictionary = corpus.make_label_dictionary(label_type="ner")

# 3. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
Expand Down Expand Up @@ -134,7 +134,7 @@ embedding_types = [
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. initialize sequence tagger
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
tag_dictionary = corpus.make_label_dictionary(label_type="ner")

tagger = SequenceTagger(
hidden_size=256,
Expand Down
21 changes: 21 additions & 0 deletions tests/resources/tasks/fashion_disjunct/eng.testa
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Most _ _ O
wedding _ _ B-Occasion
dresses _ _ B-NominalProduct
, _ _ O
for _ _ O
example _ _ O
, _ _ O
are _ _ O
simply _ _ O
too _ _ O
enormous _ _ O
and _ _ O
terrifyingly _ _ B-CreativeWord
loaded _ _ O
with _ _ O
sentimental _ _ O
value _ _ O
for _ _ O
DIY _ _ B-ProductDesign
dyeing _ _ I-ProductDesign
. _ _ O
20 changes: 20 additions & 0 deletions tests/resources/tasks/fashion_disjunct/eng.testb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
For _ _ O
my _ _ O
Nurse _ _ B-NamedOrganizationBrand
Ratched _ _ I-NamedOrganizationBrand
dress _ _ B-NominalProduct
, _ _ O
I _ _ O
had _ _ O
brought _ _ O
two _ _ O
dyeing _ _ O
options _ _ O
— _ _ O
one _ _ O
more _ _ O
ambitious _ _ B-Ambitiousness
than _ _ O
the _ _ O
other _ _ O
. _ _ O
122 changes: 122 additions & 0 deletions tests/resources/tasks/fashion_disjunct/eng.train
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
From _ _ O
the _ _ O
charming _ _ O
Arlésienne _ _ B-NamedPerson
to _ _ O
the _ _ O
shepherdess _ _ O
in _ _ O
a _ _ O
fairy _ _ O
tale _ _ O
, _ _ O
with _ _ O
faille _ _ B-ProductPart
, _ _ O
piqué _ _ B-ProductPart
, _ _ O
taffeta _ _ B-ProductPart
, _ _ O
tulle _ _ B-ProductPart
, _ _ O
embroidery _ _ B-ProductPart
, _ _ O
lace _ _ B-ProductPart
, _ _ O
the _ _ O
repertoire _ _ O
is _ _ O
inexhaustible _ _ O
. _ _ O




Subscribe _ _ O
to _ _ O
Highsnobiety _ _ B-NamedOrganizationPublisher
on _ _ O
YouTube _ _ B-NamedOrganizationOther
Eric _ _ B-NamedPerson
Schoenborn _ _ I-NamedPerson
and _ _ O
Ed _ _ B-NamedPerson
Selego _ _ I-NamedPerson
have _ _ O
joined _ _ O
forces _ _ O
with _ _ O
Nocturnal _ _ B-NamedOrganizationBrand
skate _ _ B-Activity
shop _ _ O
to _ _ O
turn _ _ O
Drexel _ _ B-NamedLocation
University _ _ I-NamedLocation
’ _ _ O
s _ _ O
Leonard _ _ B-NamedLocation
Pearlstein _ _ I-NamedLocation
Gallery _ _ I-NamedLocation
into _ _ O
an _ _ O
interactive _ _ O
skate _ _ B-Activity
pop _ _ O
- _ _ O
up _ _ O
park _ _ O
. _ _ O

Philly _ _ B-NamedPerson
Radness _ _ I-NamedPerson
accounts _ _ O
for _ _ O
the _ _ O
second _ _ O
installment _ _ O
in _ _ O
the _ _ O
Phenomenal _ _ O
Radness _ _ O
project _ _ O
, _ _ O
after _ _ O
its _ _ O
debut _ _ O
in _ _ O
Miami _ _ B-NamedLocation
a _ _ O
few _ _ O
years _ _ O
ago _ _ O
. _ _ O

Milan _ _ B-NamedLocation
was _ _ O
all _ _ O
the _ _ O
really _ _ O
big _ _ O
girls _ _ O
. _ _ O

It _ _ O
was _ _ O
the _ _ O
best _ _ O
! _ _ O

We _ _ O
go _ _ O
to _ _ O
flea _ _ O
markets _ _ O
together _ _ O
when _ _ O
we _ _ O
' _ _ O
re _ _ O
in _ _ O
LA _ _ B-NamedLocation
. _ _ O
4 changes: 2 additions & 2 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,8 @@ def test_tagged_corpus_make_label_dictionary():

label_dict = corpus.make_label_dictionary('label')

assert 2 == len(label_dict)
assert "<unk>" not in label_dict.get_items()
assert 3 == len(label_dict)
assert "<unk>" in label_dict.get_items()
assert "class_1" in label_dict.get_items()
assert "class_2" in label_dict.get_items()

Expand Down
8 changes: 4 additions & 4 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_sequence_tagger_no_crf(results_base_path, tasks_base_path):
corpus: Corpus = ColumnCorpus(
data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -77,7 +77,7 @@ def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
corpus: Corpus = ColumnCorpus(
data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -130,7 +130,7 @@ def test_sequence_tagger_stacked(results_base_path, tasks_base_path):
corpus: Corpus = ColumnCorpus(
data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down Expand Up @@ -183,7 +183,7 @@ def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path
corpus: Corpus = ColumnCorpus(
data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={0: "text", 1: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

# tagger without CRF
tagger: SequenceTagger = SequenceTagger(
Expand Down
43 changes: 35 additions & 8 deletions tests/test_sequence_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -104,7 +104,7 @@ def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -142,10 +142,37 @@ def test_train_load_use_tagger_empty_tags(results_base_path, tasks_base_path):
del loaded_model


@pytest.mark.integration
def test_train_load_use_tagger_disjunct_tags(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion_disjunct", column_format={0: "text", 3: "ner"}
)
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
embeddings=turian_embeddings,
tag_dictionary=tag_dictionary,
tag_type="ner",
use_crf=False,
)

# initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train(
results_base_path,
learning_rate=0.1,
mini_batch_size=2,
max_epochs=2,
shuffle=False,
)


@pytest.mark.integration
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
corpus = flair.datasets.UD_ENGLISH().downsample(0.05)
tag_dictionary = corpus.make_tag_dictionary("pos")
tag_dictionary = corpus.make_label_dictionary("pos")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -188,7 +215,7 @@ def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_pa
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -231,7 +258,7 @@ def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -277,7 +304,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path)

corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -323,7 +350,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path)

corpus = MultiCorpus([corpus_1, corpus_2])
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

model: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down Expand Up @@ -351,7 +378,7 @@ def test_find_learning_rate(results_base_path, tasks_base_path):
corpus = flair.datasets.ColumnCorpus(
data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
)
tag_dictionary = corpus.make_tag_dictionary("ner")
tag_dictionary = corpus.make_label_dictionary("ner")

tagger: SequenceTagger = SequenceTagger(
hidden_size=64,
Expand Down

0 comments on commit e94d530

Please sign in to comment.