diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 8100e4821e..2052a79987 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -194,6 +194,7 @@
     NER_GERMAN_EUROPARL,
     NER_GERMAN_GERMEVAL,
     NER_GERMAN_LEGAL,
+    NER_GERMAN_MOBIE,
     NER_GERMAN_POLITICS,
     NER_HIPE_2022,
     NER_HUNGARIAN,
@@ -469,6 +470,7 @@
     "NER_GERMAN_EUROPARL",
     "NER_GERMAN_GERMEVAL",
     "NER_GERMAN_LEGAL",
+    "NER_GERMAN_MOBIE",
     "NER_GERMAN_POLITICS",
     "NER_HIPE_2022",
     "NER_HUNGARIAN",
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index b3e40342ba..c91b1b5f5e 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -4764,6 +4764,53 @@ def __init__(
         )


+class NER_GERMAN_MOBIE(ColumnCorpus):
+    def __init__(
+        self,
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the German MobIE NER dataset.
+
+        The German MobIE Dataset was introduced in the MobIE paper (https://aclanthology.org/2021.konvens-1.22/).
+
+        This is a German-language dataset that has been human-annotated with 20 coarse- and fine-grained entity types,
+        and it includes entity linking information for geographically linkable entities. The dataset comprises 3,232
+        social media texts and traffic reports, totaling 91K tokens, with 20.5K annotated entities, of which 13.1K are
+        linked to a knowledge base. In total, 20 different entity types are annotated.
+        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
+            to point to a different folder, but typically this should not be necessary.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
+ """ + base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) + dataset_name = self.__class__.__name__.lower() + data_folder = base_path / dataset_name + data_path = flair.cache_root / "datasets" / dataset_name + + columns = {0: "text", 3: "ner"} + + train_data_file = data_path / "train.conll2003" + if not train_data_file.is_file(): + temp_file = cached_path( + "https://github.com/DFKI-NLP/MobIE/raw/master/v1_20210811/ner_conll03_formatted.zip", + Path("datasets") / dataset_name, + ) + from zipfile import ZipFile + + with ZipFile(temp_file, "r") as zip_file: + zip_file.extractall(path=data_path) + + super().__init__( + data_folder, + columns, + in_memory=in_memory, + comment_symbol=None, + document_separator_token="-DOCSTART-", + **corpusargs, + ) + + class MASAKHA_POS(MultiCorpus): def __init__( self, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 56d524d041..080b6e3d46 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -872,6 +872,29 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) +def test_german_mobie(tasks_base_path): + corpus = flair.datasets.NER_GERMAN_MOBIE() + + # See MobIE paper (https://aclanthology.org/2021.konvens-1.22/), table 2 + ref_sentences = 7_077 + ref_tokens = 90_971 + + actual_sentences = sum( + [1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"] + ) + actual_tokens = sum( + [len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"] + ) + + assert ref_sentences == actual_sentences, ( + f"Number of parsed sentences ({actual_sentences}) does not match with " + f"reported number of sentences ({ref_sentences})!" + ) + assert ( + ref_tokens == actual_tokens + ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!" + + def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): corpus = MultiFileJsonlCorpus( train_files=[tasks_base_path / "jsonl/train.jsonl"],