From 83809c8ed76635d07a4bbbe97404071b34e2fcf5 Mon Sep 17 00:00:00 2001
From: dafnapension
Date: Tue, 20 Feb 2024 07:38:51 +0200
Subject: [PATCH] filter on loading rather than increase loading limit

Signed-off-by: dafnapension
---
 prepare/cards/cohere_for_ai.py                     | 25 ++++++++-----------
 .../aya_human_annotated/deu.json                   | 10 ++------
 .../aya_human_annotated/eng.json                   | 10 ++------
 .../aya_human_annotated/fra.json                   | 10 ++------
 .../aya_human_annotated/jpn.json                   | 10 ++------
 .../aya_human_annotated/por.json                   | 10 ++------
 .../aya_human_annotated/spa.json                   | 10 ++------
 .../cohere_for_ai/dolly_human_edited/fra.json      | 10 ++------
 .../cohere_for_ai/dolly_human_edited/spa.json      | 10 ++------
 .../dolly_machine_translated/deu.json              | 10 ++------
 .../dolly_machine_translated/eng.json              | 10 ++------
 .../dolly_machine_translated/fra.json              | 10 ++------
 .../dolly_machine_translated/jpn.json              | 10 ++------
 .../dolly_machine_translated/por.json              | 10 ++------
 .../dolly_machine_translated/spa.json              | 10 ++------
 src/unitxt/loaders.py                              | 19 +++++++++++++-
 16 files changed, 56 insertions(+), 128 deletions(-)

diff --git a/prepare/cards/cohere_for_ai.py b/prepare/cards/cohere_for_ai.py
index 92865e174b..ed3ebc4a32 100644
--- a/prepare/cards/cohere_for_ai.py
+++ b/prepare/cards/cohere_for_ai.py
@@ -6,17 +6,10 @@
 )
 from src.unitxt.catalog import add_to_catalog
 from src.unitxt.logging_utils import get_logger
-from src.unitxt.operators import FilterByCondition, ListFieldValues
-from src.unitxt.settings_utils import get_settings
+from src.unitxt.operators import ListFieldValues
 from src.unitxt.standard import StandardRecipeWithIndexes
 from src.unitxt.test_utils.card import test_card
 
-settings = get_settings()
-orig_settings = settings.global_loader_limit
-settings.global_loader_limit = 25000  # to ensure language is encountered
-
-logger = get_logger()
-
 dataset_name = "CohereForAI/aya_evaluation_suite"
 subsets = ["aya_human_annotated", "dolly_machine_translated", "dolly_human_edited"]
 langs = ["eng", "fra", "deu", "spa", "por", "jpn"]
@@ -29,12 +22,16 @@
 for subset in subsets:
     for lang in subset_to_langs[subset]:
         card = TaskCard(
-            loader=LoadHF(path=dataset_name, name=subset, streaming=True),
+            loader=LoadHF(
+                path=dataset_name,
+                name=subset,
+                streaming=True,
+                filtering_lambda=f'lambda instance: instance["language"]=="{lang}"',
+            ),
             preprocess_steps=[
                 SplitRandomMix(
                     {"train": "test[90%]", "validation": "test[5%]", "test": "test[5%]"}
                 ),
-                FilterByCondition(values={"language": lang}, condition="eq"),
                 RenameFields(
                     field_to_field={"inputs": "question", "targets": "answers"}
                 ),
@@ -44,16 +41,15 @@
         templates="templates.qa.open.all",
     )
     if lang == subset_to_langs[subset][0]:
-        test_card(
-            card, debug=False, loader_limit=25000, strict=False
-        )  # 25000 to reach every language
+        test_card(card, debug=False, strict=False)
     add_to_catalog(card, f"cards.cohere_for_ai.{subset}.{lang}", overwrite=True)
 
 ######################## to remove once done ############################
+logger = get_logger()
 recipe = StandardRecipeWithIndexes(
     template_card_index=1,
     card=f"cards.cohere_for_ai.{subsets[0]}.{langs[0]}",
-    num_demos=1,
+    num_demos=2,
     demos_pool_size=10,
 )
 ms = recipe()
@@ -73,4 +69,3 @@
 logger.info("+++++++++++done+++++++++++++++")
 logger.info("done")
 ############# end of to remove once done ##################
-settings.global_loader_limit = orig_settings
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
index 32515ed7ec..1608447c5f 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
index eba48ec427..b46f2d7c03 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
index 0889eb1099..8adcb1d12a 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
index 1f0d7883a3..ad6aca7d63 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
index ee65a90559..1ee5d24b34 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
index d0bf4714cf..9515b8c4ce 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
index 66f9f3740c..29a5b61bcd 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
index 79287474ea..6bc8df4ffd 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
index aa14112fe4..a001f77855 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
index 51eeb13295..21907e4788 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
index 85ff3314ee..a6cb526d8f 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
index 29cf829dc9..35d5273b80 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
index abe3574d74..a27cc51095 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
index 41820ee369..7f48eab580 100644
--- a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
+++ b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {
diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py
index e503d94735..0509bc3cc7 100644
--- a/src/unitxt/loaders.py
+++ b/src/unitxt/loaders.py
@@ -48,7 +48,7 @@ class Loader(SourceOperator):
     # It is usually provided to the loader via the recipe (see standard.py)
     # The loader can use this value to limit the amount of data downloaded from the source
    # to reduce loading time. However, this may not always be possible, so the
-    # loader may ingore this. In any case, the recipe, will limit the number of instances in the returned
+    # loader may ignore this. In any case, the recipe, will limit the number of instances in the returned
     # stream, after load is complete.
     loader_limit: int = None
     streaming: bool = False
@@ -84,8 +84,18 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     streaming: bool = True
+    filtering_lambda: Optional[str] = None
     _cache: dict = InternalField(default=None)
 
+    def filtered_load(self, dataset):
+        logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
+        return MultiStream(
+            {
+                name: dataset[name].filter(eval(self.filtering_lambda))
+                for name in dataset
+            }
+        )
+
     def stream_dataset(self):
         if self._cache is None:
             with tempfile.TemporaryDirectory() as dir_to_be_deleted:
@@ -106,6 +116,9 @@ def stream_dataset(self):
                             f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                         ) from e
 
+            if self.filtering_lambda is not None:
+                dataset = self.filtered_load(dataset)
+
             if self.split is not None:
                 dataset = {self.split: dataset}
 
@@ -135,6 +148,10 @@ def load_dataset(self):
                     raise ValueError(
                         f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                     ) from e
+
+        if self.filtering_lambda is not None:
+            dataset = self.filtered_load(dataset)
+
         if self.split is None:
             for split in dataset.keys():
                 dataset[split] = dataset[split].to_iterable_dataset()
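
Note on the mechanism (an illustrative sketch, not part of the patch): LoadHF.filtered_load turns the string-valued filtering_lambda into a callable with eval and applies it to every split through the Hugging Face datasets .filter() method. The standalone sketch below shows the same idea; the dataset, config, split, and "language" column are taken from the cards above, and are the only assumptions made here.

    # Standalone sketch of filtering while loading; not the unitxt code itself.
    from datasets import load_dataset

    filtering_lambda = 'lambda instance: instance["language"]=="spa"'
    predicate = eval(filtering_lambda)  # string -> callable, as LoadHF.filtered_load does

    # Stream the split and keep only matching instances, so no loader limit
    # has to be raised just to encounter a given language.
    stream = load_dataset(
        "CohereForAI/aya_evaluation_suite",
        name="dolly_human_edited",
        split="test",
        streaming=True,
    )
    filtered = stream.filter(predicate)

    print(next(iter(filtered))["language"])  # expected: "spa"

Because instances are filtered while the stream is read, each per-language card only ever materializes matching rows. That is why the FilterByCondition preprocessing step and the temporary global_loader_limit = 25000 override ("to ensure language is encountered") can both be dropped.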