From 83809c8ed76635d07a4bbbe97404071b34e2fcf5 Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 20 Feb 2024 07:38:51 +0200
Subject: [PATCH] filter on loading rather than increase loading limit

Signed-off-by: dafnapension
---
 prepare/cards/cohere_for_ai.py                     | 25 ++++++++-----------
 .../aya_human_annotated/deu.json                   | 10 ++------
 .../aya_human_annotated/eng.json                   | 10 ++------
 .../aya_human_annotated/fra.json                   | 10 ++------
 .../aya_human_annotated/jpn.json                   | 10 ++------
 .../aya_human_annotated/por.json                   | 10 ++------
 .../aya_human_annotated/spa.json                   | 10 ++------
 .../cohere_for_ai/dolly_human_edited/fra.json      | 10 ++------
 .../cohere_for_ai/dolly_human_edited/spa.json      | 10 ++------
 .../dolly_machine_translated/deu.json              | 10 ++------
 .../dolly_machine_translated/eng.json              | 10 ++------
 .../dolly_machine_translated/fra.json              | 10 ++------
 .../dolly_machine_translated/jpn.json              | 10 ++------
 .../dolly_machine_translated/por.json              | 10 ++------
 .../dolly_machine_translated/spa.json              | 10 ++------
 src/unitxt/loaders.py                              | 19 +++++++++++++-
 16 files changed, 56 insertions(+), 128 deletions(-)

diff --git a/prepare/cards/cohere_for_ai.py b/prepare/cards/cohere_for_ai.py
index 92865e174b..ed3ebc4a32 100644
--- a/prepare/cards/cohere_for_ai.py
+++ b/prepare/cards/cohere_for_ai.py
@@ -6,17 +6,10 @@
 )
 from src.unitxt.catalog import add_to_catalog
 from src.unitxt.logging_utils import get_logger
-from src.unitxt.operators import FilterByCondition, ListFieldValues
-from src.unitxt.settings_utils import get_settings
+from src.unitxt.operators import ListFieldValues
 from src.unitxt.standard import StandardRecipeWithIndexes
 from src.unitxt.test_utils.card import test_card
 
-settings = get_settings()
-orig_settings = settings.global_loader_limit
-settings.global_loader_limit = 25000  # to ensure language is encountered
-
-logger = get_logger()
-
 dataset_name = "CohereForAI/aya_evaluation_suite"
 subsets = ["aya_human_annotated", "dolly_machine_translated", "dolly_human_edited"]
 langs = ["eng", "fra", "deu", "spa", "por", "jpn"]
@@ -29,12 +22,16 @@
 for subset in subsets:
     for lang in subset_to_langs[subset]:
         card = TaskCard(
-            loader=LoadHF(path=dataset_name, name=subset, streaming=True),
+            loader=LoadHF(
+                path=dataset_name,
+                name=subset,
+                streaming=True,
+                filtering_lambda=f'lambda instance: instance["language"]=="{lang}"',
+            ),
             preprocess_steps=[
                 SplitRandomMix(
                     {"train": "test[90%]", "validation": "test[5%]", "test": "test[5%]"}
                 ),
-                FilterByCondition(values={"language": lang}, condition="eq"),
                 RenameFields(
                     field_to_field={"inputs": "question", "targets": "answers"}
                 ),
@@ -44,16 +41,15 @@
         templates="templates.qa.open.all",
     )
     if lang == subset_to_langs[subset][0]:
-        test_card(
-            card, debug=False, loader_limit=25000, strict=False
-        )  # 25000 to reach every language
+        test_card(card, debug=False, strict=False)
     add_to_catalog(card, f"cards.cohere_for_ai.{subset}.{lang}", overwrite=True)
 
 ######################## to remove once done ############################
+logger = get_logger()
 recipe = StandardRecipeWithIndexes(
     template_card_index=1,
     card=f"cards.cohere_for_ai.{subsets[0]}.{langs[0]}",
-    num_demos=1,
+    num_demos=2,
     demos_pool_size=10,
 )
 ms = recipe()
@@ -73,4 +69,3 @@
 logger.info("+++++++++++done+++++++++++++++")
 logger.info("done")
 ############# end of to remove once done ##################
-settings.global_loader_limit = orig_settings
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
index 32515ed7ec..1608447c5f 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
index eba48ec427..b46f2d7c03 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
index 0889eb1099..8adcb1d12a 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
index 1f0d7883a3..ad6aca7d63 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
index ee65a90559..1ee5d24b34 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
index d0bf4714cf..9515b8c4ce 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
index 66f9f3740c..29a5b61bcd 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
index 79287474ea..6bc8df4ffd 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
index aa14112fe4..a001f77855 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
index 51eeb13295..21907e4788 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
index 85ff3314ee..a6cb526d8f 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
index 29cf829dc9..35d5273b80 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
index abe3574d74..a27cc51095 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
index 41820ee369..7f48eab580 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py
index e503d94735..0509bc3cc7 100644
--- a/src/unitxt/loaders.py
+++ b/src/unitxt/loaders.py
@@ -48,7 +48,7 @@ class Loader(SourceOperator):
     # It is usually provided to the loader via the recipe (see standard.py)
     # The loader can use this value to limit the amount of data downloaded from the source
    # to reduce loading time. However, this may not always be possible, so the
-    # loader may ingore this. In any case, the recipe, will limit the number of instances in the returned
+    # loader may ignore this. In any case, the recipe, will limit the number of instances in the returned
     # stream, after load is complete.
     loader_limit: int = None
     streaming: bool = False
@@ -84,8 +84,18 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     streaming: bool = True
+    filtering_lambda: Optional[str] = None
     _cache: dict = InternalField(default=None)
 
+    def filtered_load(self, dataset):
+        logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
+        return MultiStream(
+            {
+                name: dataset[name].filter(eval(self.filtering_lambda))
+                for name in dataset
+            }
+        )
+
     def stream_dataset(self):
         if self._cache is None:
             with tempfile.TemporaryDirectory() as dir_to_be_deleted:
@@ -106,6 +116,9 @@ def stream_dataset(self):
                             f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                         ) from e
 
+            if self.filtering_lambda is not None:
+                dataset = self.filtered_load(dataset)
+
             if self.split is not None:
                 dataset = {self.split: dataset}
 
@@ -135,6 +148,10 @@ def load_dataset(self):
                     raise ValueError(
                         f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                     ) from e
+
+        if self.filtering_lambda is not None:
+            dataset = self.filtered_load(dataset)
+
         if self.split is None:
             for split in dataset.keys():
                 dataset[split] = dataset[split].to_iterable_dataset()
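
Note on the mechanism (an illustrative sketch, not part of the patch): LoadHF.filtered_load turns the string-valued filtering_lambda into a callable with eval and applies it to every split through the Hugging Face datasets .filter() method. The standalone sketch below shows the same idea; the dataset, config, split, and "language" column are taken from the cards above, and are the only assumptions made here.

    # Standalone sketch of filtering while loading; not the unitxt code itself.
    from datasets import load_dataset

    filtering_lambda = 'lambda instance: instance["language"]=="spa"'
    predicate = eval(filtering_lambda)  # string -> callable, as LoadHF.filtered_load does

    # Stream the split and keep only matching instances, so no loader limit
    # has to be raised just to encounter a given language.
    stream = load_dataset(
        "CohereForAI/aya_evaluation_suite",
        name="dolly_human_edited",
        split="test",
        streaming=True,
    )
    filtered = stream.filter(predicate)

    print(next(iter(filtered))["language"])  # expected: "spa"

Because instances are filtered while the stream is read, each per-language card only ever materializes matching rows. That is why the FilterByCondition preprocessing step and the temporary global_loader_limit = 25000 override ("to ensure language is encountered") can both be dropped.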