Update arabic_evals.py: Fix custom arabic tasks [2nd attempt] (#444)

Fix the alghafa prompt function by explicitly determining the list of choices based on task_name (not all subsets of AlGhafa Native share the same columns).
---------
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
huggingface · Dec 20, 2024 · fbca143 · fbca143
1 parent a1c610d
commit fbca143
Show file tree

Hide file tree

Showing 5 changed files with 9 additions and 14 deletions.
diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
         fetch-depth: 0
     - name: Secret Scanning
       uses: trufflesecurity/trufflehog@main
-
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
@@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None):
         choices=valid_keys_arabic,  # Return only valid choices (Arabic keys)
         gold_index=answer_index,  # Correct index in the valid Arabic keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_arabic[answer_index],  # Correct answer in Arabic form
     )
 
 
@@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None):
         choices=[str(i) for i in range(1, len(choices) + 1)],  # List of strings instead of ints
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=str(answer_index),  # Assuming it's sorted based on the number
     )
 
 
@@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:3],
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index],
     )
 
 
@@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None):
 def alghafa_pfn(line, task_name: str = None):
     question = line["query"]
     answer_index = int(line["label"])
-    choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]]
+    allowed_keys = [f"sol{i}" for i in range(1, 6)]
+    choices = [line[key] for key in allowed_keys if key in line]
 
     instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
     query = f"{instruction}السؤال: {question}\n"
@@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,  # Correct index in the valid keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_latin[answer_index],  # Correct answer in Latin form
     )
 
 

diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
@@ -92,4 +92,3 @@ if __name__ == "__main__":
 
 You can then give your custom metric to lighteval by using `--custom-tasks
 path_to_your_file` when launching it.
-
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -8,7 +8,7 @@ We welcome translations in your language!
 
 To contribute, you'll need to
 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
-2. Edit the file to add or expand the literal for your language of interest. 
+2. Edit the file to add or expand the literal for your language of interest.
 
 ```python
     Language.ENGLISH: TranslationLiterals(
@@ -42,7 +42,7 @@ To contribute, you'll need to
 
 ## Contributing a new multilingual task
 
-You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. 
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
 
 Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations, the `prompt_function` should be implemented by a language-adapted template. The template will take care of correct formatting, as well as correct and consistent usage of language-adjusted prompt anchors (e.g. Question/Answer) and punctuation.
 
@@ -58,7 +58,7 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed 
+        # The evaluation is community contributed
         suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
@@ -72,7 +72,7 @@ your_tasks = [
         # In this function, you choose which template to follow and for which language and formulation
         prompt_function=get_template_prompt_function(
             language=language,
-            # then use the adapter to define the mapping between the 
+            # then use the adapter to define the mapping between the
             # keys of the template (left), and the keys of your dataset
             # (right)
             # To know which template keys are required and available,
@@ -83,9 +83,9 @@ your_tasks = [
             },
             formulation=formulation,
         ),
-        # You can also add specific filters to remove irrelevant samples 
+        # You can also add specific filters to remove irrelevant samples
         hf_filter=lambda line: line["label"] in <condition>,
-        # You then select your huggingface dataset as well as 
+        # You then select your huggingface dataset as well as
         # the splits available for evaluation
         hf_repo=<dataset>,
         hf_subset=<subset>,

diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
@@ -35,7 +35,7 @@ def main():
         env_config=EnvConfig(cache_dir="tmp/"),
         # Remove the 2 parameters below once your configuration is tested
         override_batch_size=1,
-        max_samples=10 
+        max_samples=10
     )
 
     model_config = VLLMModelConfig(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,4 +16,3 @@ jobs:
		fetch-depth: 0
		- name: Secret Scanning
		uses: trufflesecurity/trufflehog@main
Original file line number	Diff line number	Diff line change
Expand Up		@@ -92,4 +92,3 @@ if __name__ == "__main__":

		You can then give your custom metric to lighteval by using `--custom-tasks
		path_to_your_file` when launching it.