Skip to content

Commit

Permalink
Filter on loading rather than increasing the loading limit
Browse files (browse the repository at this point in the history)
Signed-off-by: dafnapension <dafnashein@yahoo.com>
  • Loading branch information
dafnapension committed Feb 28, 2024
1 parent 9b640fe commit 83809c8
Show file tree
Hide file tree
Showing 16 changed files with 56 additions and 128 deletions.
25 changes: 10 additions & 15 deletions prepare/cards/cohere_for_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,10 @@
)
from src.unitxt.catalog import add_to_catalog
from src.unitxt.logging_utils import get_logger
from src.unitxt.operators import FilterByCondition, ListFieldValues
from src.unitxt.settings_utils import get_settings
from src.unitxt.operators import ListFieldValues
from src.unitxt.standard import StandardRecipeWithIndexes
from src.unitxt.test_utils.card import test_card

settings = get_settings()
orig_settings = settings.global_loader_limit
settings.global_loader_limit = 25000 # to ensure language is encountered

logger = get_logger()

dataset_name = "CohereForAI/aya_evaluation_suite"
subsets = ["aya_human_annotated", "dolly_machine_translated", "dolly_human_edited"]
langs = ["eng", "fra", "deu", "spa", "por", "jpn"]
Expand All @@ -29,12 +22,16 @@
for subset in subsets:
for lang in subset_to_langs[subset]:
card = TaskCard(
loader=LoadHF(path=dataset_name, name=subset, streaming=True),
loader=LoadHF(
path=dataset_name,
name=subset,
streaming=True,
filtering_lambda=f'lambda instance: instance["language"]=="{lang}"',
),
preprocess_steps=[
SplitRandomMix(
{"train": "test[90%]", "validation": "test[5%]", "test": "test[5%]"}
),
FilterByCondition(values={"language": lang}, condition="eq"),
RenameFields(
field_to_field={"inputs": "question", "targets": "answers"}
),
Expand All @@ -44,16 +41,15 @@
templates="templates.qa.open.all",
)
if lang == subset_to_langs[subset][0]:
test_card(
card, debug=False, loader_limit=25000, strict=False
) # 25000 to reach every language
test_card(card, debug=False, strict=False)
add_to_catalog(card, f"cards.cohere_for_ai.{subset}.{lang}", overwrite=True)

######################## to remove once done ############################
logger = get_logger()
recipe = StandardRecipeWithIndexes(
template_card_index=1,
card=f"cards.cohere_for_ai.{subsets[0]}.{langs[0]}",
num_demos=1,
num_demos=2,
demos_pool_size=10,
)
ms = recipe()
Expand All @@ -73,4 +69,3 @@
logger.info("+++++++++++done+++++++++++++++")
logger.info("done")
############# end of to remove once done ##################
settings.global_loader_limit = orig_settings
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "deu"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "eng"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "fra"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "jpn"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "por"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "aya_human_annotated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "spa"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_human_edited",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "fra"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_human_edited",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "spa"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"deu\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "deu"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"eng\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "eng"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"fra\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "fra"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"jpn\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "jpn"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"por\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "por"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"type": "load_hf",
"path": "CohereForAI/aya_evaluation_suite",
"name": "dolly_machine_translated",
"streaming": true
"streaming": true,
"filtering_lambda": "lambda instance: instance[\"language\"]==\"spa\""
},
"preprocess_steps": [
{
Expand All @@ -15,13 +16,6 @@
"test": "test[5%]"
}
},
{
"type": "filter_by_condition",
"values": {
"language": "spa"
},
"condition": "eq"
},
{
"type": "rename_fields",
"field_to_field": {
Expand Down
Loading

0 comments on commit 83809c8

Please sign in to comment.