Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add German MobIE NER Dataset #3351

Merged
merged 5 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@
NER_GERMAN_EUROPARL,
NER_GERMAN_GERMEVAL,
NER_GERMAN_LEGAL,
NER_GERMAN_MOBIE,
NER_GERMAN_POLITICS,
NER_HIPE_2022,
NER_HUNGARIAN,
Expand Down Expand Up @@ -469,6 +470,7 @@
"NER_GERMAN_EUROPARL",
"NER_GERMAN_GERMEVAL",
"NER_GERMAN_LEGAL",
"NER_GERMAN_MOBIE",
"NER_GERMAN_POLITICS",
"NER_HIPE_2022",
"NER_HUNGARIAN",
Expand Down
47 changes: 47 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4764,6 +4764,53 @@ def __init__(
)


class NER_GERMAN_MOBIE(ColumnCorpus):
    def __init__(
        self,
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
        """Initialize the German MobIE NER dataset.

        The German MobIE Dataset was introduced in the MobIE paper (https://aclanthology.org/2021.konvens-1.22/).

        This is a German-language dataset that has been human-annotated with 20 coarse- and fine-grained entity types,
        and it includes entity linking information for geographically linkable entities. The dataset comprises 3,232
        social media texts and traffic reports, totaling 91K tokens, with 20.5K annotated entities, of which 13.1K are
        linked to a knowledge base. In total, 20 different named entities are annotated.

        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
            to point to a different folder but typically this should not be necessary.
        :param in_memory: If True, keeps the dataset in memory, giving speedups in training at the cost of higher
            RAM usage.
        :param corpusargs: Additional keyword arguments forwarded unchanged to the ColumnCorpus constructor.
        """
        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
        dataset_name = self.__class__.__name__.lower()
        data_folder = base_path / dataset_name

        # Column 0 is read as the token text and column 3 as the NER tag.
        columns = {0: "text", 3: "ner"}

        # Check for and unpack the data in the very folder the corpus is loaded from. Previously the archive was
        # always extracted below flair.cache_root, so a custom base_path pointed the loader at an empty folder.
        train_data_file = data_folder / "train.conll2003"
        if not train_data_file.is_file():
            # The downloaded zip archive itself is still kept in the flair cache.
            temp_file = cached_path(
                "https://github.com/DFKI-NLP/MobIE/raw/master/v1_20210811/ner_conll03_formatted.zip",
                Path("datasets") / dataset_name,
            )
            from zipfile import ZipFile

            with ZipFile(temp_file, "r") as zip_file:
                zip_file.extractall(path=data_folder)

        super().__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            comment_symbol=None,
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )


class MASAKHA_POS(MultiCorpus):
def __init__(
self,
Expand Down
23 changes: 23 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,29 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag
check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version)


def test_german_mobie(tasks_base_path):
    """Check that the parsed MobIE corpus matches the sentence and token counts reported in the paper."""
    corpus = flair.datasets.NER_GERMAN_MOBIE()

    # See MobIE paper (https://aclanthology.org/2021.konvens-1.22/), table 2
    ref_sentences = 7_077
    ref_tokens = 90_971

    # Walk over all three splits once, skipping the document separator sentences.
    actual_sentences = 0
    actual_tokens = 0
    for sentence in corpus.train + corpus.dev + corpus.test:
        if sentence[0].text == "-DOCSTART-":
            continue
        actual_sentences += 1
        actual_tokens += len(sentence)

    assert ref_sentences == actual_sentences, (
        f"Number of parsed sentences ({actual_sentences}) does not match with "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert (
        ref_tokens == actual_tokens
    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down