stanford-crfm · yifanmai · Sep 30, 2023 · Aug 17, 2023 · Aug 18, 2023 · Aug 18, 2023
diff --git a/src/helm/benchmark/augmentations/cleva_perturbation.py b/src/helm/benchmark/augmentations/cleva_perturbation.py
@@ -43,7 +43,15 @@ class Description(PerturbationDescription):
     name: str = "chinese_typos"
 
     # For downloading resources
-    ASSET_URL = "https://drive.google.com/uc?id=1p5mldLpKxI-63H8YEruGJghtD1dZJI8k"
+    ASSET_URL = "http://39.108.215.175/assets/butter_finger"
+    FILE_NAMES: List[str] = [
+        "pinyin_to_char.json",
+        "toneless_pinyin_to_char.json",
+        "pinyin_to_common_char.json",
+        "toneless_pinyin_to_common_char.json",
+        "pinyin_to_word.json",
+        "toneless_pinyin_to_word.json",
+    ]
 
     def __init__(
         self,
@@ -62,8 +70,11 @@ def __init__(
 
         # Ensure all necessary data are downloaded
         output_dir = os.path.join("benchmark_output", "perturbations", self.name)
-        ensure_directory_exists(os.path.dirname(output_dir))
-        ensure_file_downloaded(source_url=self.ASSET_URL, target_path=output_dir, unpack=True, unpack_type="unzip")
+        ensure_directory_exists(output_dir)  # each resource file is saved directly under output_dir
+        for filename in self.FILE_NAMES:
+            target_path = os.path.join(output_dir, filename)
+            source_url: str = f"{self.ASSET_URL}/{filename}"  # per-file URL (a local, so lowercase)
+            ensure_file_downloaded(source_url=source_url, target_path=target_path)
 
         # Load the data for the perturbation
         with open(
@@ -285,7 +296,7 @@ class Description(PerturbationDescription):
     name: str = "chinese_synonym"
 
     # For downloading resources
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1gXyZjoUw6yRjrsrh9ERzB_gxVluMTvij"
+    SOURCE_URL: str = "http://39.108.215.175/assets/synonyms.json"
 
     def __init__(self, prob: float, trial_num: int = 10):
         # Assign parameters to instance variables
@@ -294,7 +305,7 @@ def __init__(self, prob: float, trial_num: int = 10):
 
         target_dir = os.path.join("benchmark_output", "perturbations", self.name, "synonyms.json")
         ensure_directory_exists(os.path.dirname(target_dir))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_dir)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_dir)
         with open(os.path.join(target_dir)) as f:
             self.synonym_dict: Dict[str, List[str]] = json.load(f)
 
@@ -377,7 +388,7 @@ class ChineseGenderPerturbation(Perturbation):
     MODES = [GENDER_TERM, GENDER_PRONOUN]
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1tJ5GLKboQrpzzBYTnFxeRuCOBxYhjFLp"
+    SOURCE_URL: str = "http://39.108.215.175/assets/gender_term.txt"
 
     @dataclass(frozen=True)
     class Description(PerturbationDescription):
@@ -424,7 +435,7 @@ class must be one of the genders in it. If not, it must be
 
             target_path = os.path.join("benchmark_output", "perturbations", self.name, "gender_term.txt")
             ensure_directory_exists(os.path.dirname(target_path))
-            ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+            ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
             with open(target_path) as fin:
                 for line in fin.readlines():
                     splits: List[str] = line.strip("\n").split(" ")
@@ -480,7 +491,7 @@ class ChinesePersonNamePerturbation(Perturbation):
     should_perturb_references: bool = True
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1nKnfsxREkScrNOyhqiFxP5F1SjRgk6r8"
+    SOURCE_URL: str = "http://39.108.215.175/assets/chinese_name_gender.json"
     OUTPUT_PATH = os.path.join("benchmark_output", "perturbations", name)
 
     """ Gender categories """
@@ -545,7 +556,7 @@ def __init__(
 
         target_path = os.path.join("benchmark_output", "perturbations", self.name, "chinese_name_gender.json")
         ensure_directory_exists(os.path.dirname(target_path))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(os.path.join(target_path), "r", encoding="utf-8") as f:
             self.gender2name: Dict[str, List[str]] = json.load(f)
             del self.gender2name["unknown"]
@@ -715,7 +726,7 @@ class MandarinToCantonesePerturbation(Perturbation):
     should_perturb_references: bool = True
 
     """ Resources """
-    SOURCE_URI: str = "https://drive.google.com/uc?id=1vljbwq0hTm7W1tz74gjPnONWJ6kSEwK2"
+    SOURCE_URL: str = "http://39.108.215.175/assets/conversion.json"
 
     @property
     def description(self) -> PerturbationDescription:
@@ -733,7 +744,7 @@ def __init__(
 
         target_path = os.path.join("benchmark_output", "perturbations", self.name, "conversion.json")
         ensure_directory_exists(os.path.dirname(target_path))
-        ensure_file_downloaded(source_url=self.SOURCE_URI, target_path=target_path)
+        ensure_file_downloaded(source_url=self.SOURCE_URL, target_path=target_path)
         with open(target_path) as fin:
             self.phrase_table = json.load(fin)
 

diff --git a/src/helm/benchmark/metrics/classification_metrics.py b/src/helm/benchmark/metrics/classification_metrics.py
@@ -98,10 +98,6 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
             y_pred.append(pred)
 
         return [
-            Stat(MetricName("multiple_choice_classification_macro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, average="macro")
-            ),
-            Stat(MetricName("multiple_choice_classification_micro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, average="micro")
-            ),
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
         ]
diff --git a/src/helm/benchmark/metrics/cleva_harms_metrics.py b/src/helm/benchmark/metrics/cleva_harms_metrics.py
@@ -36,15 +36,16 @@ class CLEVABiasMetric(BiasMetric):
     name: str = "chinese_bias"
 
     """ Chinese resources. """
-    FILE_TO_URL: Dict[str, str] = {
-        "asian_word_list.txt": "https://drive.google.com/uc?id=1Enm7x1nj5o5DMXdSD3WFqr47F09QgFaM",
-        "hispanic_word_list.txt": "https://drive.google.com/uc?id=1-JAUBxA0IGiJ0j9nL6xJ8wCP9o-Gh8sS",
-        "white_word_list.txt": "https://drive.google.com/uc?id=1jbKhE4stKs8VyYmvQjx4aXq_m6oSccHp",
-        "male_word_list.txt": "https://drive.google.com/uc?id=1xWEsIYzXvQvOlpcID_zYBqfItIrVDIam",
-        "female_word_list.txt": "https://drive.google.com/uc?id=1Q5e18NfMCqPxdy7mBIteMqPHrJmOs7s_",
-        "profession_word_list.txt": "https://drive.google.com/uc?id=1baBsev6ippugLwUCQ8lHnLaOSBDstsj3",
-        "adjective_word_list.txt": "https://drive.google.com/uc?id=1s-jgE6OW-btc921GX9Aos0EhIbYwmBAT",
-    }
+    ASSET_URL = "http://39.108.215.175/assets"
+    FILE_NAMES: List[str] = [
+        "asian_word_list.txt",
+        "hispanic_word_list.txt",
+        "white_word_list.txt",
+        "male_word_list.txt",
+        "female_word_list.txt",
+        "profession_word_list.txt",
+        "adjective_word_list.txt",
+    ]
 
     def __repr__(self):
         return (
@@ -71,9 +72,9 @@ def __init__(self, mode: str, demographic_category: str, target_category: Option
         # Ensure all necessary data are downloaded
         self.output_dir = os.path.join("benchmark_output", "metrics", self.name)
         ensure_directory_exists(self.output_dir)
-        for FILENAME, URL in self.FILE_TO_URL.items():
-            target_path = os.path.join(self.output_dir, FILENAME)
-            ensure_file_downloaded(source_url=URL, target_path=target_path)
+        for word_list_file in self.FILE_NAMES:  # every list lives directly under ASSET_URL
+            target_path = os.path.join(self.output_dir, word_list_file)
+            ensure_file_downloaded(source_url=f"{self.ASSET_URL}/{word_list_file}", target_path=target_path)
 
         # Overwrite inherited mappings
         self.build_mappings()

diff --git a/src/helm/benchmark/presentation/run_specs_cleva_v1.conf b/src/helm/benchmark/presentation/run_specs_cleva_v1.conf
@@ -217,9 +217,9 @@ entries: [
     {description: "cleva:model=text,task=commonsense_reasoning,subtask=textual_entailment,prompt_id=5,version=v1,data_augmentation=cleva", priority: 1}
     {description: "cleva:model=full_functionality_text,task=commonsense_reasoning,subtask=commonsense_question_answering,prompt_id=0,version=v1,data_augmentation=cleva", priority: 1}
 
-    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_world_problem,prompt_id=0,version=v1,data_augmentation=cleva", priority: 1}
-    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_world_problem,prompt_id=1,version=v1,data_augmentation=cleva", priority: 1}
-    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_world_problem,prompt_id=2,version=v1,data_augmentation=cleva", priority: 1}
+    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_word_problem,prompt_id=0,version=v1,data_augmentation=cleva", priority: 1}
+    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_word_problem,prompt_id=1,version=v1,data_augmentation=cleva", priority: 1}
+    {description: "cleva:model=text,task=mathematical_reasoning,subtask=math_word_problem,prompt_id=2,version=v1,data_augmentation=cleva", priority: 1}
 
     {description: "cleva:model=text,task=inductive_reasoning,subtask=add,prompt_id=0,version=v1,data_augmentation=cleva", priority: 1}
     {description: "cleva:model=text,task=inductive_reasoning,subtask=add,prompt_id=1,version=v1,data_augmentation=cleva", priority: 1}

diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
@@ -749,7 +749,7 @@ def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **k
         "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
         "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
         "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
-        "mathematical_reasoning:math_world_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]),
+        "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]),
     }
 
     key: str = task
@@ -2380,7 +2380,7 @@ def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
 def get_cleva_spec(task: str, version: str, subtask: str = None, prompt_id: int = 0) -> RunSpec:
     from .scenarios.cleva_scenario import CLEVAScenario  # noqa
 
-    CLEVAScenario.download_dataset()
+    CLEVAScenario.download_dataset(task, version)
 
     _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
     inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
@@ -2430,7 +2430,7 @@ def get_cleva_spec(task: str, version: str, subtask: str = None, prompt_id: int
                     output_suffix=prompt_setting.output_suffix,
                     max_train_instances=inference_parameters.get("max_train_instances", 5),
                     num_outputs=inference_parameters.get("num_outputs", 5),
-                    max_tokens=inference_parameters.get("max_tokens", 5),
+                    max_tokens=inference_parameters.get("max_tokens", 1),
                     temperature=inference_parameters.get("temperature", 0.0),
                     stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
                     sample_train=inference_parameters.get("sample_train", True),

diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py
@@ -16,7 +16,7 @@
 from .code_scenario import CodeReference, CodeInstance
 
 
-CLEVA_DATA_URL = "https://drive.google.com/uc?id=1uteSvq2dOgsmutOOwEziQd_d9i5Ypan6&confirm=t"
+CLEVA_DATA_URL = "http://39.108.215.175/data"
 CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"
 
 
@@ -410,10 +410,11 @@ def task(self) -> str:
         pass
 
     @classmethod
-    def download_dataset(cls):
-        target_dir = os.path.join(CLEVA_DATA_PATH, "data")
-        ensure_directory_exists(CLEVA_DATA_PATH)
-        ensure_file_downloaded(source_url=CLEVA_DATA_URL, target_path=target_dir, unpack=True, unpack_type="untar")
+    def download_dataset(cls, task: str, version: str):
+        archive_url: str = f"{CLEVA_DATA_URL}/{version}/{task}.zip"  # one zip archive per task and version
+        version_dir: str = os.path.join(CLEVA_DATA_PATH, "data", version)
+        ensure_directory_exists(version_dir)
+        ensure_file_downloaded(source_url=archive_url, target_path=os.path.join(version_dir, task), unpack=True)
 
     def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
         data_dir: str = os.path.join(CLEVA_DATA_PATH, "data", self.version, self.task)
@@ -1483,7 +1484,7 @@ class CLEVAMathematicalReasoningScenario(CLEVAScenario):
     For example, we use "所以答案是（只给出数字即可）" (English: Thus, the answer is:) before the answer,
     and remove line breaks within the answer.
 
-    An example of the math_world_problem subtask is:
+    An example of the math_word_problem subtask is:
         回答以下数学问题
 
         问题：甲数是168，乙数是甲数的4倍，乙数=？请一步一步给出推理过程。