Skip to content

Commit

Permalink
Merge pull request #2827 from Lingepumpe/fix_hipe2022_dataset_creation
Browse files Browse the repository at this point in the history
Fix NER_HIPE_2022 corpus to run prepare_corpus when test.txt is missing
  • Loading branch information
alanakbik authored Jun 27, 2022
2 parents d04bb1e + d71c0f5 commit 3f8f611
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 11 deletions.
19 changes: 10 additions & 9 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import json
import logging
import os
Expand Down Expand Up @@ -4118,10 +4119,13 @@ def __init__(
}

# v2.0 only adds new language and splits for AJMC dataset
hipe_available_splits["v2.0"] = hipe_available_splits["v1.0"].copy()
hipe_available_splits["v2.0"] = copy.deepcopy(hipe_available_splits["v1.0"])
hipe_available_splits["v2.0"]["ajmc"] = {"de": ["train", "dev"], "en": ["train", "dev"], "fr": ["train", "dev"]}

hipe_available_splits["v2.1"] = hipe_available_splits["v2.0"].copy()
hipe_available_splits["v2.1"] = copy.deepcopy(hipe_available_splits["v2.0"])
for dataset_name_values in hipe_available_splits["v2.1"].values():
for splits in dataset_name_values.values():
splits.append("test") # test datasets are only available for >= v2.1

eos_marker = "EndOfSentence"
document_separator = "# hipe2022:document_id"
Expand All @@ -4141,10 +4145,6 @@ def __init__(

dataset_splits = hipe_available_splits[version][dataset_name][language]

if version == "v2.1":
# test datasets are only available for >= v2.1
dataset_splits.append("test")

for split in dataset_splits:
cached_path(
f"{data_url}/HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv", data_folder / "original"
Expand All @@ -4160,11 +4160,12 @@ def __init__(
new_data_folder = new_data_folder / "with_doc_seperator"
new_data_folder.mkdir(parents=True, exist_ok=True)

dev_path = new_data_folder / dev_file

self.preproc_fn = self._prepare_corpus if not preproc_fn else preproc_fn

if not dev_path.exists():
if not all( # Only reprocess if some files are not there yet
split_path.exists()
for split_path in [new_data_folder / f"{split_file}.txt" for split_file in dataset_splits]
):
for split in dataset_splits:
original_filename = f"HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv"
self.preproc_fn(
Expand Down
5 changes: 3 additions & 2 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,9 @@ def test_load_universal_dependencies_conllu_corpus(tasks_base_path):

def test_hipe_2022_corpus(tasks_base_path):
"""
This test covers the complete v1.0 version of the HIPE 2022,
including the version with document separator.
This test covers the complete HIPE 2022 dataset.
https://github.com/hipe-eval/HIPE-2022-data
Includes the variant with document separator and all versions of the dataset.
"""

# We have manually checked that these numbers are correct:
Expand Down

0 comments on commit 3f8f611

Please sign in to comment.