Showing 1 changed file with 349 additions and 0 deletions.
@@ -0,0 +1,349 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" | ||
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the | ||
Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio | ||
de Sanidad, Consumo y Bienestar Social. | ||
The dataset contains questions about following topics: medicine, nursing, psychology, chemistry, | ||
pharmacology and biology. | ||
Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py | ||
""" | ||

import json
import os
from typing import Dict, List, Tuple

import datasets

from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks
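# NOTE: `schemas`, `BigBioConfig` and `Tasks` are shared helpers from the
# BigBio repository's `utils` package; this script is written to run from
# within that repository layout.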

_CITATION = """\
@inproceedings{vilares-gomez-rodriguez-2019-head,
    title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
    author = "Vilares, David and
      G{\'o}mez-Rodr{\'i}guez, Carlos",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/P19-1092",
    doi = "10.18653/v1/P19-1092",
    pages = "960--966"}
"""

_DATASETNAME = "head_qa"

_DESCRIPTION = """\
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
de Sanidad, Consumo y Bienestar Social.
The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry,
pharmacology and biology.
"""

_HOMEPAGE = "https://aghie.github.io/head-qa/"

_LICENSE = "MIT License"

_URLS = {
    _DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t",
}

_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"


class HeadQADataset(datasets.GeneratorBasedBuilder):
    """HEAD-QA: A Healthcare Dataset for Complex Reasoning"""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

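    # Six configurations: a combined bilingual source schema, per-language
    # source schemas, one translation (text-to-text) config, and per-language
    # question-answering configs.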
    BUILDER_CONFIGS = [
        BigBioConfig(
            name="head_qa_source",
            version=SOURCE_VERSION,
            description="HeadQA both languages source schema",
            schema="source",
            subset_id="head_qa",
        ),
        BigBioConfig(
            name="head_qa_en_source",
            version=SOURCE_VERSION,
            description="HeadQA English source schema",
            schema="source",
            subset_id="head_qa_en",
        ),
        BigBioConfig(
            name="head_qa_es_source",
            version=SOURCE_VERSION,
            description="HeadQA Spanish source schema",
            schema="source",
            subset_id="head_qa_es",
        ),
        BigBioConfig(
            name="head_qa_bigbio_t2t",
            version=BIGBIO_VERSION,
            description="HeadQA Translation BigBio schema",
            schema="bigbio_t2t",
            subset_id="head_qa",
        ),
        BigBioConfig(
            name="head_qa_en_bigbio_qa",
            version=BIGBIO_VERSION,
            description="HeadQA English Question Answering BigBio schema",
            schema="bigbio_qa",
            subset_id="head_qa_en",
        ),
        BigBioConfig(
            name="head_qa_es_bigbio_qa",
            version=BIGBIO_VERSION,
            description="HeadQA Spanish Question Answering BigBio schema",
            schema="bigbio_qa",
            subset_id="head_qa_es",
        ),
    ]

    DEFAULT_CONFIG_NAME = "head_qa_en_source"

    def _info(self) -> datasets.DatasetInfo:

        if self.config.schema == "source" and self.config.subset_id == "head_qa":
            features = datasets.Features(
                {
                    "name": datasets.Value("string"),
                    "year": datasets.Value("string"),
                    "category": datasets.Value("string"),
                    "qid": datasets.Value("int32"),
                    "qtext": {
                        "en": datasets.Value("string"),
                        "es": datasets.Value("string"),
                    },
                    "ra": datasets.Value("int32"),
                    "image": datasets.Image(),
                    "answers": [
                        {
                            "aid": datasets.Value("int32"),
                            "atext": {
                                "en": datasets.Value("string"),
                                "es": datasets.Value("string"),
                            },
                        }
                    ],
                }
            )
        elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]:
            features = datasets.Features(
                {
                    "name": datasets.Value("string"),
                    "year": datasets.Value("string"),
                    "category": datasets.Value("string"),
                    "qid": datasets.Value("int32"),
                    "qtext": datasets.Value("string"),
                    "ra": datasets.Value("int32"),
                    "image": datasets.Image(),
                    "answers": [
                        {
                            "aid": datasets.Value("int32"),
                            "atext": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_t2t":
            features = schemas.text2text_features
        elif self.config.schema == "bigbio_qa":
            features = schemas.qa_features
        else:
            # Guard against an unbound `features` for an unrecognized config.
            raise NotImplementedError(f"Schema '{self.config.schema}' is not supported")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""

        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

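        # The extracted archive contains two parallel directories: HEAD (the
        # original Spanish exams) and HEAD_EN (their English translations),
        # each with train/dev/test JSON files.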
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": data_dir,
                    "en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"),
                    "es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": data_dir,
                    "en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"),
                    "es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": data_dir,
                    "en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"),
                    "es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"),
                },
            ),
        ]

    def _generate_examples(self, data_dir, en_path, es_path) -> Tuple[str, Dict]:
        """Yields examples as (key, example) tuples."""

        if self.config.schema == "source" and self.config.subset_id == "head_qa":
            for key, example in self._merge_documents(
                self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path)
            ):
                yield key, example

        elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]:
            if self.config.subset_id == "head_qa_en":
                filepath = en_path
            elif self.config.subset_id == "head_qa_es":
                filepath = es_path
            for key, example in self._generate_source_documents(data_dir, filepath):
                yield key, example

        elif self.config.schema == "bigbio_t2t":
            for key, example in self._merge_documents(
                self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path)
            ):
                for key_t2t, example_t2t in self._generate_source_to_t2t(example):
                    yield key_t2t, example_t2t

        elif self.config.schema == "bigbio_qa":
            if self.config.subset_id == "head_qa_en":
                filepath = en_path
            elif self.config.subset_id == "head_qa_es":
                filepath = es_path
            for key, example in self._generate_source_documents(data_dir, filepath):
                yield key, self._source_to_qa(example)

    def _generate_source_documents(self, data_dir, filepath):

        with open(filepath, encoding="utf-8") as f:
            head_qa = json.load(f)

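        # The raw JSON maps exam identifiers to exam objects; each exam has a
        # name, year, category and a list of questions under "data". Numeric
        # fields such as "qid" and "ra" are stored as strings in the raw files.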
        for exam_id, exam in enumerate(head_qa["exams"]):
            content = head_qa["exams"][exam]
            name = content["name"].strip()
            year = content["year"].strip()
            category = content["category"].strip()
            for question in content["data"]:
                qid = int(question["qid"].strip())
                qtext = question["qtext"].strip()
                ra = int(question["ra"].strip())
                image_path = question["image"].strip()

                aids = [answer["aid"] for answer in question["answers"]]
                atexts = [answer["atext"].strip() for answer in question["answers"]]
                answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)]

                id_ = f"{exam_id}_{qid}"
                yield id_, {
                    "name": name,
                    "year": year,
                    "category": category,
                    "qid": qid,
                    "qtext": qtext,
                    "ra": ra,
                    "image": os.path.join(data_dir, image_path) if image_path else None,
                    "answers": answers,
                }

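    # NOTE: merging relies on the English and Spanish files listing the same
    # exams and questions in the same order; the asserts below verify this.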
    def _merge_documents(self, gen_en, gen_es):
        for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es):
            assert doc_en_id == doc_es_id, f"Document id mismatch: {doc_en_id} != {doc_es_id}"
            self._assert_eq_doc(doc_en, doc_es)

            doc_merge = doc_en.copy()
            doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]}
            answers = []
            for answer_en, answer_es in zip(doc_en["answers"], doc_es["answers"]):
                assert answer_en["aid"] == answer_es["aid"], f"Answer id mismatch in document {doc_en_id}"
                answers.append(
                    {
                        "aid": answer_en["aid"],
                        "atext": {
                            "en": answer_en["atext"],
                            "es": answer_es["atext"],
                        },
                    }
                )
            doc_merge["answers"] = answers
            yield doc_en_id, doc_merge

    def _assert_eq_doc(self, doc1, doc2):
        doc1 = doc1.copy()
        doc2 = doc2.copy()
        doc1.pop("qtext")
        doc1.pop("answers")
        doc2.pop("qtext")
        doc2.pop("answers")
        assert doc1 == doc2, f"Documents differ in language-independent fields: {doc1} != {doc2}"

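    # Maps a source-schema question onto the shared BigBio QA schema; the
    # correct answer is looked up by matching "ra" against the answer ids.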
    def _source_to_qa(self, example):
        example_ = {}
        example_["id"] = example["name"] + "_qid_" + str(example["qid"])
        example_["question_id"] = example["qid"]
        example_["document_id"] = ""
        example_["question"] = example["qtext"]
        example_["type"] = example["category"]
        example_["choices"] = [answer["atext"] for answer in example["answers"]]
        example_["context"] = ""
        example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]]

        return example_

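    # Emits one translation pair per question and one per answer option, so a
    # single merged source example fans out into several text-to-text examples.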
    def _generate_source_to_t2t(self, example):
        id_ = example["name"] + "_qid_" + str(example["qid"])
        example_ = {
            "id": id_,
            "document_id": "",
            "text_1": example["qtext"]["en"],
            "text_2": example["qtext"]["es"],
            "text_1_name": "en",
            "text_2_name": "es",
        }
        yield id_, example_

        for answer in example["answers"]:
            id_ = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"])
            example_ = {
                "id": id_,
                "document_id": "",
                "text_1": answer["atext"]["en"],
                "text_2": answer["atext"]["es"],
                "text_1_name": "en",
                "text_2_name": "es",
            }
            yield id_, example_
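
A minimal usage sketch, not part of this commit: assuming the script above is saved as head_qa.py inside a BigBio repository checkout (so the `utils` imports resolve), the configs defined in BUILDER_CONFIGS can be loaded with the standard `datasets` API. The config name and field names below are taken directly from the script.

import datasets

# Load the English question-answering view of HEAD-QA from the local script.
ds = datasets.load_dataset("head_qa.py", name="head_qa_en_bigbio_qa")

# Each example follows the BigBio QA schema produced by _source_to_qa().
example = ds["train"][0]
print(example["question"])
print(example["choices"])
print(example["answer"])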