Commit de3f664

Initial head qa dataset

nomisto committed Apr 22, 2022
1 parent 06f1a34 commit de3f664
Showing 1 changed file with 349 additions and 0 deletions: biodatasets/head_qa/head_qa.py

# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
de Sanidad, Consumo y Bienestar Social.
The dataset contains questions about following topics: medicine, nursing, psychology, chemistry,
pharmacology and biology.
Original code: https://huggingface.co/datasets/head_qa/blob/main/head_qa.py
"""

import json
import os
from typing import Dict, Iterator, List, Tuple

import datasets

from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks

_CITATION = """\
@inproceedings{vilares-gomez-rodriguez-2019-head,
title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
author = "Vilares, David and
G{\'o}mez-Rodr{\'i}guez, Carlos",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1092",
doi = "10.18653/v1/P19-1092",
pages = "960--966"}
"""

_DATASETNAME = "head_qa"

_DESCRIPTION = """\
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams used to access a specialized position
in the Spanish healthcare system, and they are challenging even for highly specialized humans. The exams are
designed by the Ministerio de Sanidad, Consumo y Bienestar Social.
The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry,
pharmacology and biology.
"""

_HOMEPAGE = "https://aghie.github.io/head-qa/"

_LICENSE = "MIT License"

_URLS = {
_DATASETNAME: "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t",
}

_SUPPORTED_TASKS = [Tasks.TRANSLATION, Tasks.QUESTION_ANSWERING]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"


class HeadQADataset(datasets.GeneratorBasedBuilder):
"""HEAD-QA: A Healthcare Dataset for Complex Reasoning"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

BUILDER_CONFIGS = [
BigBioConfig(
name="head_qa_source",
version=SOURCE_VERSION,
description="HeadQA both languages source schema",
schema="source",
subset_id="head_qa",
),
BigBioConfig(
name="head_qa_en_source",
version=SOURCE_VERSION,
description="HeadQA English source schema",
schema="source",
subset_id="head_qa_en",
),
BigBioConfig(
name="head_qa_es_source",
version=SOURCE_VERSION,
description="HeadQA Spanish source schema",
schema="source",
subset_id="head_qa_es",
),
BigBioConfig(
name="head_qa_bigbio_t2t",
version=BIGBIO_VERSION,
description="HeadQA Translation BigBio schema",
schema="bigbio_t2t",
subset_id="head_qa",
),
BigBioConfig(
name="head_qa_en_bigbio_qa",
version=BIGBIO_VERSION,
description="HeadQA English Question Answering BigBio schema",
schema="bigbio_qa",
subset_id="head_qa_en",
),
BigBioConfig(
name="head_qa_es_bigbio_qa",
version=BIGBIO_VERSION,
description="HeadQA Spanish Question Answering BigBio schema",
schema="bigbio_qa",
subset_id="head_qa_es",
),
]
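
    # A minimal usage sketch (the local path below is hypothetical): any of the
    # config names above can be passed as `name` to `datasets.load_dataset`, e.g.
    #
    #   from datasets import load_dataset
    #   ds = load_dataset("biodatasets/head_qa/head_qa.py", name="head_qa_en_bigbio_qa")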

DEFAULT_CONFIG_NAME = "head_qa_en_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source" and self.config.subset_id == "head_qa":
features = datasets.Features(
{
"name": datasets.Value("string"),
"year": datasets.Value("string"),
"category": datasets.Value("string"),
"qid": datasets.Value("int32"),
"qtext": {
"en": datasets.Value("string"),
"es": datasets.Value("string"),
},
"ra": datasets.Value("int32"),
"image": datasets.Image(),
"answers": [
{
"aid": datasets.Value("int32"),
"atext": {
"en": datasets.Value("string"),
"es": datasets.Value("string"),
},
}
],
}
)
elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]:
features = datasets.Features(
{
"name": datasets.Value("string"),
"year": datasets.Value("string"),
"category": datasets.Value("string"),
"qid": datasets.Value("int32"),
"qtext": datasets.Value("string"),
"ra": datasets.Value("int32"),
"image": datasets.Image(),
"answers": [
{
"aid": datasets.Value("int32"),
"atext": datasets.Value("string"),
}
],
}
)
elif self.config.schema == "bigbio_t2t":
features = schemas.text2text_features
elif self.config.schema == "bigbio_qa":
features = schemas.qa_features

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""

urls = _URLS[_DATASETNAME]
data_dir = dl_manager.download_and_extract(urls)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"data_dir": data_dir,
"en_path": os.path.join(data_dir, "HEAD_EN", "train_HEAD_EN.json"),
"es_path": os.path.join(data_dir, "HEAD", "train_HEAD.json"),
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"data_dir": data_dir,
"en_path": os.path.join(data_dir, "HEAD_EN", "test_HEAD_EN.json"),
"es_path": os.path.join(data_dir, "HEAD", "test_HEAD.json"),
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"data_dir": data_dir,
"en_path": os.path.join(data_dir, "HEAD_EN", "dev_HEAD_EN.json"),
"es_path": os.path.join(data_dir, "HEAD", "dev_HEAD.json"),
},
),
]
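
    # Layout of the extracted archive assumed by the paths above:
    #   HEAD/     train_HEAD.json, dev_HEAD.json, test_HEAD.json          (Spanish)
    #   HEAD_EN/  train_HEAD_EN.json, dev_HEAD_EN.json, test_HEAD_EN.json (English)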

    def _generate_examples(self, data_dir, en_path, es_path) -> Iterator[Tuple[str, Dict]]:
"""Yields examples as (key, example) tuples."""

if self.config.schema == "source" and self.config.subset_id == "head_qa":
for key, example in self._merge_documents(
self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path)
):
yield key, example

elif self.config.schema == "source" and self.config.subset_id in ["head_qa_en", "head_qa_es"]:
if self.config.subset_id == "head_qa_en":
filepath = en_path
elif self.config.subset_id == "head_qa_es":
filepath = es_path
for key, example in self._generate_source_documents(data_dir, filepath):
yield key, example

elif self.config.schema == "bigbio_t2t":
for key, example in self._merge_documents(
self._generate_source_documents(data_dir, en_path), self._generate_source_documents(data_dir, es_path)
):
for key_t2t, example_t2t in self._generate_source_to_t2t(example):
yield key_t2t, example_t2t

elif self.config.schema == "bigbio_qa":
if self.config.subset_id == "head_qa_en":
filepath = en_path
elif self.config.subset_id == "head_qa_es":
filepath = es_path
for key, example in self._generate_source_documents(data_dir, filepath):
yield key, self._source_to_qa(example)

def _generate_source_documents(self, data_dir, filepath):
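        """Parses one raw HEAD-QA JSON file and yields one source-schema example per question."""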

with open(filepath, encoding="utf-8") as f:
head_qa = json.load(f)

for exam_id, exam in enumerate(head_qa["exams"]):
content = head_qa["exams"][exam]
name = content["name"].strip()
year = content["year"].strip()
category = content["category"].strip()
for question in content["data"]:
qid = int(question["qid"].strip())
qtext = question["qtext"].strip()
ra = int(question["ra"].strip())
image_path = question["image"].strip()

aids = [answer["aid"] for answer in question["answers"]]
atexts = [answer["atext"].strip() for answer in question["answers"]]
answers = [{"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts)]

id_ = f"{exam_id}_{qid}"
yield id_, {
"name": name,
"year": year,
"category": category,
"qid": qid,
"qtext": qtext,
"ra": ra,
"image": os.path.join(data_dir, image_path) if image_path else None,
"answers": answers,
}
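
    # Shape of the raw HEAD-QA JSON assumed by the parser above (a sketch
    # reconstructed from the field accesses; the exam key is illustrative):
    #
    #   {"exams": {"Cuaderno_2013_1_B": {
    #       "name": "...", "year": "2013", "category": "biology",
    #       "data": [{"qid": "1", "qtext": "...", "ra": "3", "image": "",
    #                 "answers": [{"aid": 1, "atext": "..."}, ...]}]}}}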

def _merge_documents(self, gen_en, gen_es):
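        """Zips the English and Spanish generators and merges each aligned pair into one bilingual document."""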
for (doc_en_id, doc_en), (doc_es_id, doc_es) in zip(gen_en, gen_es):
            assert doc_en_id == doc_es_id, f"Mismatched document ids: {doc_en_id} != {doc_es_id}"
self._assert_eq_doc(doc_en, doc_es)

doc_merge = doc_en.copy()
doc_merge["qtext"] = {"en": doc_en["qtext"], "es": doc_es["qtext"]}
answers = []
for answer_en, answer_es in zip(doc_en["answers"], doc_es["answers"]):
                assert answer_en["aid"] == answer_es["aid"], f"Mismatched answer ids: {answer_en['aid']} != {answer_es['aid']}"
answers.append(
{
"aid": answer_en["aid"],
"atext": {
"en": answer_en["atext"],
"es": answer_es["atext"],
},
}
)
doc_merge["answers"] = answers
yield doc_en_id, doc_merge

def _assert_eq_doc(self, doc1, doc2):
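        """Checks that two aligned documents agree on all metadata, ignoring the language-specific qtext and answers."""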
doc1 = doc1.copy()
doc2 = doc2.copy()
doc1.pop("qtext")
doc1.pop("answers")
doc2.pop("qtext")
doc2.pop("answers")
        assert doc1 == doc2, f"Documents differ in shared metadata: {doc1} != {doc2}"

def _source_to_qa(self, example):
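        """Converts a source-schema example into a BigBio QA schema record."""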
example_ = {}
example_["id"] = example["name"] + "_qid_" + str(example["qid"])
example_["question_id"] = example["qid"]
example_["document_id"] = ""
example_["question"] = example["qtext"]
example_["type"] = example["category"]
example_["choices"] = [answer["atext"] for answer in example["answers"]]
example_["context"] = ""
example_["answer"] = [next(filter(lambda answer: answer["aid"] == example["ra"], example["answers"]))["atext"]]

return example_
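
    # Example of the QA record produced above (field values are illustrative):
    #
    #   {"id": "Cuaderno_2013_1_B_qid_1", "question_id": 1, "document_id": "",
    #    "question": "...", "type": "biology", "choices": ["...", "...", "...", "..."],
    #    "context": "", "answer": ["..."]}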

def _generate_source_to_t2t(self, example):
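        """Yields one BigBio text-to-text (en/es) pair for the question text and one for each answer option."""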
        id_ = example["name"] + "_qid_" + str(example["qid"])
        example_ = {
            "id": id_,
            "document_id": "",
            "text_1": example["qtext"]["en"],
            "text_2": example["qtext"]["es"],
            "text_1_name": "en",
            "text_2_name": "es",
        }
        yield id_, example_

for answer in example["answers"]:
            id_ = example["name"] + "_qid_" + str(example["qid"]) + "_aid_" + str(answer["aid"])
            example_ = {
                "id": id_,
                "document_id": "",
                "text_1": answer["atext"]["en"],
                "text_2": answer["atext"]["es"],
                "text_1_name": "en",
                "text_2_name": "es",
            }
            yield id_, example_
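
# Local smoke test (a convention used in BigBio loader scripts rather than part
# of the dataset itself): running `python head_qa.py` builds the default config.
if __name__ == "__main__":
    datasets.load_dataset(__file__)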
