[ENH] add mistral v0.3, Qwen2 72b, gpt-4o mini (#393)

* [ENH] add mistral v0.3, Qwen2 72b, gpt-4o mini * [BUG] fix test
tatsu-lab · Aug 17, 2024 · 4e3b281 · 4e3b281
1 parent c4def44
commit 4e3b281
Show file tree

Hide file tree

Showing 17 changed files with 211,475 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -915,9 +915,9 @@ See [this config](https://github.com/tatsu-lab/alpaca_eval/blob/main/src/alpaca_
 <img align="center" alt="verified.png" src="figures/verified.png" width="500"/>
 </p>
 
-A verified result in AlpacaEval indicates that a core maintainer has decoded the outputs from the model and performed the evaluation. Unfortunately, we, the AlpacaEval maintainers, lack the resources to verify all the models and so we will only do that for models that are in the top-10 of the leaderboard. We apologize for any inconvenience this may cause and appreciate your understanding. To have your model verified, please follow the steps below:
+A verified result in AlpacaEval indicates that a core maintainer has decoded the outputs from the model and performed the evaluation. Unfortunately, we, the AlpacaEval maintainers, lack the resources to verify every model, so we only verify models that are in the top-5 of the leaderboard. We apologize for any inconvenience this may cause and appreciate your understanding. To have your model verified, please follow the steps below:
 
-1. Contact `@yann` or `@rtaori` on Discord, or email us if you have our email, providing a brief rationale for why your model should be verified.
+1. Contact `@yann` on Discord, or email us if you have our email, providing a brief rationale for why your model should be verified.
 2. Await our response and approval before proceeding.
 3. Prepare a script to decode from your model that does not require a GPU, typically the same script used for your model contribution. It should run using `alpaca_eval evaluate_from_model --model_configs '<your_model_name>'` without requiring a local GPU.
 4. Generate temporary API keys for running the script and share them with us. Specifically, we need the keys for both decoding your model and for evaluation (e.g., OpenAI or Anthropic key).

diff --git a/results/Mistral-7B-Instruct-v0.3/model_outputs.json b/results/Mistral-7B-Instruct-v0.3/model_outputs.json
diff --git a/results/Mistral-7B-Instruct-v0.3/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/Mistral-7B-Instruct-v0.3/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/results/Qwen2-72B-Instruct/model_outputs.json b/results/Qwen2-72B-Instruct/model_outputs.json
diff --git a/results/Qwen2-72B-Instruct/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/Qwen2-72B-Instruct/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/results/gpt-4o-mini-2024-07-18/model_outputs.json b/results/gpt-4o-mini-2024-07-18/model_outputs.json
diff --git a/results/gpt-4o-mini-2024-07-18/weighted_alpaca_eval_gpt4_turbo/annotations.json b/results/gpt-4o-mini-2024-07-18/weighted_alpaca_eval_gpt4_turbo/annotations.json
diff --git a/src/alpaca_eval/annotators/base.py b/src/alpaca_eval/annotators/base.py
@@ -617,6 +617,9 @@ def __init__(
         processors_to_kwargs: Optional[dict[str, dict]] = None,
         is_add_default_processors: bool = True,
         completion_key: str = "completions",
+        # The following two keys are only for the documentation
+        pretty_name: Optional[str] = None,
+        link: Optional[str] = None,
     ):
         self.base_dir = Path(base_dir)
         self.prompt_template = self._get_prompt_template(prompt_template)

diff --git a/src/alpaca_eval/annotators/pairwise_evaluator.py b/src/alpaca_eval/annotators/pairwise_evaluator.py
@@ -346,6 +346,7 @@ def __init__(
         random_seed_column: Sequence[str] = ("instruction",),
         processors_to_kwargs: Optional[dict[str, dict]] = None,
         is_randomize_output_order: bool = True,
+        fn_completion_parser: Optional[Union[Callable, str]] = "regex_parser",
         **kwargs,
     ):
         processors_to_kwargs = processors_to_kwargs or {}
@@ -369,7 +370,11 @@ def _fn_replace_if_switch(df: pd.DataFrame) -> pd.DataFrame:
             )
 
         super().__init__(
-            *args, annotation_column=annotation_column, processors_to_kwargs=processors_to_kwargs, **kwargs
+            *args,
+            annotation_column=annotation_column,
+            processors_to_kwargs=processors_to_kwargs,
+            fn_completion_parser=fn_completion_parser,
+            **kwargs,
         )
         self.random_seed_column = list(random_seed_column)
 

diff --git a/...lpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/...lpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv
@@ -16,6 +16,7 @@ Llama-3-Instruct-8B-WPO-HB-v2,57.33198613024009,1.4953200715726744,469,336,0,805
 claude-3-5-sonnet-20240620,40.56021409682828,1.4679655403720542,312,493,0,805,38.75776397515528,community,1488,52.36675427146999,
 yi-large-preview,57.46724251946292,1.4305696667082746,463,338,4,805,57.7639751552795,verified,2335,51.894415134099546,
 gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65.96273291925466,dev,2402,51.57500797967598,
+gpt-4o-mini-2024-07-18,44.65413862507926,1.4572395578449813,350,451,4,805,43.72670807453416,minimal,1861,50.727144855901976,0.8284734951761676
 Storm-7B,50.26886905528583,1.4728176780737183,397,408,0,805,49.31677018633541,community,2045,50.45110959343775,
 gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0,
 Infinity-Instruct-7M-Gen-Llama3_1-70B,37.46327383827497,1.4734130373862548,299,501,5,805,37.453416149068325,community,1654,46.10043331712677,0.822439983375277
@@ -27,10 +28,11 @@ gpt4_1106_preview_concise,22.92019444047205,1.232517714329424,172,622,11,805,22.
 aligner-2b_claude-3-opus-20240229,34.46337362321739,1.314666526302454,225,475,105,805,34.47204968944099,community,1669,41.823071715247664,
 Nanbeige2-16B-Chat,37.03608605005168,1.4340261272580377,288,514,3,805,35.962732919254655,community,1867,40.591286349562864,0.8504106275373426
 claude-3-opus-20240229,29.10526953334248,1.3941539442369442,223,579,3,805,27.888198757763977,minimal,1388,40.5095080124761,
-Infinity-Instruct-7M-Gen-mistral-7B,34.347412485016434,1.412595625747994,263,541,1,805,32.732919254658384,community,1742,39.669499648314385,0.8048310993594987
-Meta-Llama-3.1-405B-Instruct-Turbo,39.10666895419877,1.4335939943941904,305,497,3,805,38.07453416149068,minimal,1988,39.257327499617425,0.9064666759144326
+Infinity-Instruct-7M-Gen-mistral-7B,34.347412485016434,1.412595625747994,263,541,1,805,32.732919254658384,community,1742,39.66949964831439,0.8048310993594987
+Meta-Llama-3.1-405B-Instruct-Turbo,39.10666895419877,1.4335939943941904,305,497,3,805,38.07453416149068,minimal,1988,39.25732749961743,0.9064666759144326
 SPPO-Llama-3-Instruct-8B-PairRM,39.67286090605648,1.424722356202499,310,494,1,805,38.57142857142858,community,2066,38.56280663670214,0.8694594533275739
 gpt4,23.576789314782605,1.275704201206918,179,618,8,805,22.732919254658384,verified,1365,38.12808974440021,
+Qwen2-72B-Instruct,29.8527557752399,1.3690032071830978,231,569,5,805,29.006211180124225,verified,1626,38.07461345451606,0.8956826164517345
 Meta-Llama-3.1-70B-Instruct-Turbo,39.12691443804968,1.4277422726408466,306,496,3,805,38.19875776397515,minimal,2044,38.05512453607286,0.9009912768416926
 Infinity-Instruct-3M-0625-Llama3-70B,24.277231851026183,1.3152941480778837,188,613,4,805,23.60248447204969,community,1294,37.97881098506053,0.8189316873655579
 aligner-2b_qwen1.5-72b-chat,31.773037737123104,1.2392772646245978,180,473,152,805,31.801242236024844,community,1812,36.725868878524274,
@@ -90,6 +92,7 @@ gpt4_0613_concise,9.400320574596272,0.901021275896262,71,729,5,805,9.13043478260
 pairrm-tulu-2-70b,18.638962967441,1.1924966700012911,140,665,0,805,17.391304347826086,community,1607,21.428403975507223,
 tulu-2-dpo-70b,15.982854374136648,1.1457861368237434,119,683,3,805,14.96894409937888,verified,1418,21.238610038371124,
 Meta-Llama-3.1-8B-Instruct-Turbo,21.841523410839937,1.2489757978275888,168,632,2,802,21.07231920199501,minimal,2181,20.85398744758185,0.7027366237502348
+Mistral-7B-Instruct-v0.3,16.693179605176876,1.114857061574165,120,684,1,805,14.96894409937888,verified,1581,20.61004837179779,0.7774158380429053
 Mistral-7B-ReMax-v0.1,15.999331369031056,1.1288683901451453,120,683,2,805,15.031055900621118,community,1478,20.55136770233589,
 Infinity-Instruct-3M-0625-Yi-1.5-9B,16.203844277153284,1.1057840624447524,123,681,1,805,15.341614906832298,community,1449,20.538372631222003,0.6401022229216694
 Starling-LM-7B-alpha-ExPO,18.17975592036216,1.2498324795896385,148,657,0,805,18.385093167701864,community,1821,19.4741654606294,

diff --git a/...ights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv b/...ights/weighted_alpaca_eval_gpt4_turbo/length_controlled_v1/baseline_gpt4_1106_preview.csv
@@ -180,3 +180,6 @@ Meta-Llama-3.1-8B-Instruct-Turbo,-1.2423693493207115,0.8445600798571650,-1.64863
 Meta-Llama-3.1-405B-Instruct-Turbo,-1.2804290170826127,0.9380588083567104,-0.4617925323625654
 gemma-2-9b-it-WPO-HB,-0.8469718533914929,0.5520236303826254,1.3895698020985046
 blendaxai-gm-l3-v35,-1.4053055748469363,0.8041023110396046,1.3538386279295684
+Qwen2-72B-Instruct,-1.6674930210615639,0.9244007518196494,-0.5299232192745307
+gpt-4o-mini-2024-07-18,-1.4396243284854136,0.8239981543339437,0.1463734386267150
+Mistral-7B-Instruct-v0.3,-1.5007159011881868,0.9845683091847074,-1.7652759895328634
diff --git a/src/alpaca_eval/models_configs/Meta-Llama-3.1-405B-Instruct-Turbo/configs.yaml b/src/alpaca_eval/models_configs/Meta-Llama-3.1-405B-Instruct-Turbo/configs.yaml
@@ -3,7 +3,7 @@ Meta-Llama-3.1-405B-Instruct-Turbo:
   fn_completions: "openai_completions"
   completions_kwargs:
     model_name: "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
-    max_tokens: 3072
+    max_tokens: 4096
     requires_chatml: True
     client_kwargs:
       base_url: 'https://api.together.xyz'

diff --git a/src/alpaca_eval/models_configs/Mistral-7B-Instruct-v0.3/configs.yaml b/src/alpaca_eval/models_configs/Mistral-7B-Instruct-v0.3/configs.yaml
@@ -0,0 +1,11 @@
+Mistral-7B-Instruct-v0.3:
+  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt" # together already deals with prompt
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "mistralai/Mistral-7B-Instruct-v0.3"
+    max_tokens: 4096
+    requires_chatml: True
+    client_kwargs:
+      base_url: 'https://api.together.xyz'
+  pretty_name: "Mistral 7B v0.3"
+  link: "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3"
diff --git a/src/alpaca_eval/models_configs/Qwen2-72B-Instruct/configs.yaml b/src/alpaca_eval/models_configs/Qwen2-72B-Instruct/configs.yaml
@@ -0,0 +1,17 @@
+Qwen2-72B-Instruct:
+  prompt_template: "Qwen2-72B-Instruct/prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "Qwen/Qwen2-72B-Instruct"
+    max_tokens: 4096
+    requires_chatml: True
+    client_kwargs:
+      base_url: 'https://api.together.xyz'
+  pretty_name: "Qwen2 72B Instruct"
+  link: "https://huggingface.co/Qwen/Qwen2-72B-Instruct"
+
+
+
+
+
+
diff --git a/src/alpaca_eval/models_configs/Qwen2-72B-Instruct/prompt.txt b/src/alpaca_eval/models_configs/Qwen2-72B-Instruct/prompt.txt
@@ -0,0 +1,3 @@
+<|im_start|>user
+{instruction}
+<|im_end|>
diff --git a/src/alpaca_eval/models_configs/gpt-4o-2024-08-06/configs.yaml b/src/alpaca_eval/models_configs/gpt-4o-2024-08-06/configs.yaml
@@ -0,0 +1,7 @@
+gpt-4o-2024-08-06:
+  prompt_template: "gpt4_1106_preview/chatml_prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "gpt-4o-2024-08-06"
+    max_tokens: 4096
+  pretty_name: "GPT-4 Omni (08/06)"
diff --git a/src/alpaca_eval/models_configs/gpt-4o-mini-2024-07-18/configs.yaml b/src/alpaca_eval/models_configs/gpt-4o-mini-2024-07-18/configs.yaml
@@ -0,0 +1,7 @@
+gpt-4o-mini-2024-07-18:
+  prompt_template: "gpt4_1106_preview/chatml_prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "gpt-4o-mini-2024-07-18"
+    max_tokens: 4096
+  pretty_name: "GPT-4o Mini (07/18)"