diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 31157839a2..48c751f3d9 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1,3 +1,4 @@ +import copy import json import logging import os @@ -4118,10 +4119,13 @@ def __init__( } # v2.0 only adds new language and splits for AJMC dataset - hipe_available_splits["v2.0"] = hipe_available_splits["v1.0"].copy() + hipe_available_splits["v2.0"] = copy.deepcopy(hipe_available_splits["v1.0"]) hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]} - hipe_available_splits["v2.1"] = hipe_available_splits["v2.0"].copy() + hipe_available_splits["v2.1"] = copy.deepcopy(hipe_available_splits["v2.0"]) + for dataset_name_values in hipe_available_splits["v2.1"].values(): + for splits in dataset_name_values.values(): + splits.append("test") # test datasets are only available for >= v2.1 eos_marker = "EndOfSentence" document_separator = "# hipe2022:document_id" @@ -4141,10 +4145,6 @@ def __init__( dataset_splits = hipe_available_splits[version][dataset_name][language] - if version == "v2.1": - # test datasets are only available for >= v2.1 - dataset_splits.append("test") - for split in dataset_splits: cached_path( f"{data_url}/HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv", data_folder / "original" @@ -4160,11 +4160,12 @@ def __init__( new_data_folder = new_data_folder / "with_doc_seperator" new_data_folder.mkdir(parents=True, exist_ok=True) - dev_path = new_data_folder / dev_file - self.preproc_fn = self._prepare_corpus if not preproc_fn else preproc_fn - if not dev_path.exists(): + if not all( # Only reprocess if some files are not there yet + split_path.exists() + for split_path in [new_data_folder / f"{split_file}.txt" for split_file in dataset_splits] + ): for split in dataset_splits: original_filename = f"HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv" self.preproc_fn( diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 4e4f52fe4c..7b649a8587 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -422,8 +422,9 @@ def test_load_universal_dependencies_conllu_corpus(tasks_base_path): def test_hipe_2022_corpus(tasks_base_path): """ - This test covers the complete v1.0 version of the HIPE 2022, - including the version with document separator. + This test covers the complete HIPE 2022 dataset. + https://github.com/hipe-eval/HIPE-2022-data + Includes variant with document separator, and all versions of the dataset. """ # We have manually checked, that these numbers are correct: