Filter on loading rather than increasing the loader limit

Signed-off-by: dafnapension <dafnashein@yahoo.com>
IBM · Feb 28, 2024 · 83809c8 · 83809c8
1 parent 9b640fe
commit 83809c8
Show file tree

Hide file tree

Showing 16 changed files with 56 additions and 128 deletions.
diff --git a/prepare/cards/cohere_for_ai.py b/prepare/cards/cohere_for_ai.py
@@ -6,17 +6,10 @@
 )
 from src.unitxt.catalog import add_to_catalog
 from src.unitxt.logging_utils import get_logger
-from src.unitxt.operators import FilterByCondition, ListFieldValues
-from src.unitxt.settings_utils import get_settings
+from src.unitxt.operators import ListFieldValues
 from src.unitxt.standard import StandardRecipeWithIndexes
 from src.unitxt.test_utils.card import test_card
 
-settings = get_settings()
-orig_settings = settings.global_loader_limit
-settings.global_loader_limit = 25000  # to ensure language is encountered
-
-logger = get_logger()
-
 dataset_name = "CohereForAI/aya_evaluation_suite"
 subsets = ["aya_human_annotated", "dolly_machine_translated", "dolly_human_edited"]
 langs = ["eng", "fra", "deu", "spa", "por", "jpn"]
@@ -29,12 +22,16 @@
 for subset in subsets:
     for lang in subset_to_langs[subset]:
         card = TaskCard(
-            loader=LoadHF(path=dataset_name, name=subset, streaming=True),
+            loader=LoadHF(
+                path=dataset_name,
+                name=subset,
+                streaming=True,
+                filtering_lambda=f'lambda instance: instance["language"]=="{lang}"',
+            ),
             preprocess_steps=[
                 SplitRandomMix(
                     {"train": "test[90%]", "validation": "test[5%]", "test": "test[5%]"}
                 ),
-                FilterByCondition(values={"language": lang}, condition="eq"),
                 RenameFields(
                     field_to_field={"inputs": "question", "targets": "answers"}
                 ),
@@ -44,16 +41,15 @@
             templates="templates.qa.open.all",
         )
         if lang == subset_to_langs[subset][0]:
-            test_card(
-                card, debug=False, loader_limit=25000, strict=False
-            )  # 25000 to reach every language
+            test_card(card, debug=False, strict=False)
         add_to_catalog(card, f"cards.cohere_for_ai.{subset}.{lang}", overwrite=True)
 
 ########################  to remove once done ############################
+logger = get_logger()
 recipe = StandardRecipeWithIndexes(
     template_card_index=1,
     card=f"cards.cohere_for_ai.{subsets[0]}.{langs[0]}",
-    num_demos=1,
+    num_demos=2,
     demos_pool_size=10,
 )
 ms = recipe()
@@ -73,4 +69,3 @@
 logger.info("+++++++++++done+++++++++++++++")
 logger.info("done")
 ############# end of to remove once done ##################
-settings.global_loader_limit = orig_settings
diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/aya_human_annotated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "aya_human_annotated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_human_edited/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_human_edited",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/deu.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "deu"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/eng.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "eng"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/fra.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "fra"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/jpn.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "jpn"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/por.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "por"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {

diff --git a/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json b/src/unitxt/catalog/cards/cohere_for_ai/dolly_machine_translated/spa.json
@@ -4,7 +4,8 @@
         "type": "load_hf",
         "path": "CohereForAI/aya_evaluation_suite",
         "name": "dolly_machine_translated",
-        "streaming": true
+        "streaming": true,
+        "filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
     },
     "preprocess_steps": [
         {
@@ -15,13 +16,6 @@
                 "test": "test[5%]"
             }
         },
-        {
-            "type": "filter_by_condition",
-            "values": {
-                "language": "spa"
-            },
-            "condition": "eq"
-        },
         {
             "type": "rename_fields",
             "field_to_field": {