qcri · fdalvi · Aug 9, 2023 · Jun 26, 2023 · Aug 7, 2023 · Aug 8, 2023
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py
@@ -0,0 +1,174 @@
+import os
+import re
+
+from arabic_llm_benchmark.datasets import NameInfoDataset
+from arabic_llm_benchmark.models import BLOOMPetalModel
+from arabic_llm_benchmark.tasks import DemographyNameInfoTask
+
+
def config():
    """Return the benchmark configuration for the NameInfo zero-shot BLOOMZ run.

    The returned dict wires together the dataset, task and model classes and
    supplies their arguments. The model endpoint is read from the ``API_URL``
    environment variable at call time.

    Returns:
        dict: keys ``dataset``, ``dataset_args``, ``task``, ``task_args``,
        ``model``, ``model_args`` and ``general_args``.

    Raises:
        KeyError: if the ``API_URL`` environment variable is not set.
    """
    # Two-letter lower-case country labels accepted as valid predictions.
    country_codes = [
        "gb", "us", "cl", "fr", "ru", "pl", "in", "it", "kr", "gh",
        "ca", "sa", "at", "de", "cn", "br", "dk", "se", "bd", "cu",
        "jp", "be", "es", "co", "id", "iq", "pk", "tr", "il", "ch",
        "ar", "ro", "nl", "ps", "ug", "ir", "cg", "do", "ee", "tn",
        "gr", "np", "ie", "sy", "hu", "eg", "ma", "ve", "ph", "no",
        "bg", "si", "ke", "au", "et", "py", "af", "pt", "th", "bo",
        "mx", "lb", "za", "fi", "hr", "vn", "ly", "nz", "qa", "kh",
        "ci", "ng", "sg", "cm", "dz", "tz", "ae", "pe", "az", "lu",
        "ec", "cz", "ua", "uy", "sd", "ao", "my", "lv", "kw", "tw",
        "bh", "lk", "ye", "cr", "jo", "pa", "om", "uz", "by", "kz",
    ]

    return {
        "dataset": NameInfoDataset,
        "dataset_args": {},
        "task": DemographyNameInfoTask,
        "task_args": {},
        "model": BLOOMPetalModel,
        "model_args": {
            "api_url": os.environ["API_URL"],
            "class_labels": country_codes,
            "max_tries": 3,
        },
        "general_args": {
            "data_path": "data/demographic_attributes/name_info/wikidata_test.txt"
        },
    }
+
+
def prompt(input_sample):
    """Build the zero-shot prompt asking for the country code of a name.

    Args:
        input_sample: The person name to classify; it is interpolated into
            the prompt text as-is.

    Returns:
        dict: a single-entry mapping ``{"prompt": <prompt text>}``.
    """
    # Fixed instruction header; only the trailing query depends on the input.
    instructions = (
        "You are an expert annotator who can identify the country of a person based on name.\n"
        "Label the country of the following person 'name'. Write ONLY the country code in ISO 3166-1 alpha-2 format.\n"
        "Provide only label.\n\n"
    )
    query = f"name: {input_sample}\ncountry: \n"

    return {"prompt": instructions + query}
+
+
def post_process(response):
    """Extract a validated country label from a raw model response.

    The text in ``response["outputs"]`` is stripped, cleared of the ``<s>`` /
    ``</s>`` markers and of "ISO 3166-1" boilerplate, and lower-cased. If a
    ``country:`` or ``country_code:`` prefix is present, only the text after
    it (up to the end of that line) is kept; otherwise the whole cleaned text
    is used as the candidate label.

    Args:
        response: Mapping whose ``"outputs"`` entry holds the generated text.

    Returns:
        The candidate label if it is one of the configured ``class_labels``,
        otherwise ``None`` (refusals, free text, empty output).
    """
    label = (
        response["outputs"]
        .strip()
        .replace("<s>", "")
        .replace("</s>", "")
        .replace("ISO 3166-1:", "")
        .replace("ISO 3166-1", "")
        .lower()
    )
    label_list = config()["model_args"]["class_labels"]

    # Keep only what follows an explicit "country:" / "country_code:" prefix.
    match = re.search(r"(country|country_code):\s*(.*)", label)
    if match:
        label = match.group(2)

    # Removing the markers above can leave stray whitespace around a bare code.
    label = label.strip()

    # A response without the prefix used to fall through to an unbound
    # variable; it is now validated like any other candidate.
    return label if label in label_list else None
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py
@@ -133,10 +133,16 @@ def config():
 
 
 def few_shot_prompt(input_sample, base_prompt, examples):
- out_prompt = base_prompt + "\n\n"
- for example in examples:
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "Here are some examples:\n\n"
+
+ for index, example in enumerate(examples):
  out_prompt = (
  out_prompt
+ + "Example "
+ + str(index)
+ + ":"
+ + "\n"
  + "name: "
  + example["input"]
  + "\ncountry: "

diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py
@@ -120,27 +120,47 @@ def config():
  "by",
  "kz",
  ],
- "max_tries": 3,
+ "max_tries": 30,
  },
  "general_args": {
- "data_path": "data/demographic_attributes/name_info/wikidata_test.txt",
+ "data_path": "data/demographic_attributes/name_info/wikidata_test.txt"
  },
  }
 
 
 def prompt(input_sample):
     """Build the chat messages asking for the country code of a name.

     Args:
         input_sample: The person name to classify; it is interpolated into
             the user message as-is.

     Returns:
         list: a system message followed by a user message, each a dict with
         ``role`` and ``content`` keys.
     """
     system_message = {
         "role": "system",
         "content": "You are an expert annotator who can identify the country of a person based on name.",
     }

     # Fixed task description; only the trailing query depends on the input.
     task_description = "Label the country of the following person 'name'. Write ONLY the country code in ISO 3166-1 alpha-2 format.\n\n"
     user_message = {
         "role": "user",
         "content": task_description + f"name: {input_sample}\ncountry: \n",
     }

     return [system_message, user_message]
 
 
 def post_process(response):
     """Map a chat-completion response to a country label.

     Args:
         response: Mapping shaped so that
             ``response["choices"][0]["message"]["content"]`` is the
             generated text.

     Returns:
         - If the text contains ``"name: "``, that marker is removed and the
           remainder is returned stripped and lower-cased (not validated
           against the label list).
         - Otherwise the stripped, lower-cased text if it is one of the
           configured ``class_labels``.
         - ``None`` for anything else (refusals, free text, empty output).
     """
     label = response["choices"][0]["message"]["content"]
     label_list = config()["model_args"]["class_labels"]

     if "name: " in label:
         return label.replace("name: ", "").strip().lower()

     # Strip before the lookup so answers such as "US\n" or " us" still count.
     candidate = label.strip().lower()
     return candidate if candidate in label_list else None