Commit

add mistral / mixtral / gemini
YannDubs committed Jan 3, 2024
1 parent c4def7d commit b4f37e9
Showing 16 changed files with 250 additions and 17 deletions.
9 changes: 8 additions & 1 deletion setup.py
@@ -23,7 +23,14 @@
]
PACKAGES_ANALYSIS = ["seaborn", "matplotlib", "jupyterlab"]
PACKAGES_LOCAL = ["accelerate", "transformers", "bitsandbytes", "xformers", "peft", "optimum", "einops"]
PACKAGES_ALL_API = ["anthropic>=0.3.3", "huggingface_hub", "cohere", "replicate", "boto3>=1.28.58"]
PACKAGES_ALL_API = [
    "anthropic>=0.3.3",
    "huggingface_hub",
    "cohere",
    "replicate",
    "boto3>=1.28.58",
    "google-generativeai",
]
PACKAGES_ALL = PACKAGES_LOCAL + PACKAGES_ALL_API + PACKAGES_ANALYSIS + PACKAGES_DEV

setuptools.setup(
17 changes: 14 additions & 3 deletions src/alpaca_eval/constants.py
@@ -10,6 +10,8 @@
BASE_DIR = Path(__file__).parents[2]

### API specific ###
API_MAX_CONCURRENCY = int(os.environ.get("API_MAX_CONCURRENCY", 5))

OPENAI_MAX_CONCURRENCY = int(os.environ.get("OPENAI_MAX_CONCURRENCY", 5))
OPENAI_CLIENT_CONFIG_PATH = os.environ.get("OPENAI_CLIENT_CONFIG_PATH", BASE_DIR / "client_configs/openai_configs.yaml")
# the following is for backward compatibility, the recommended way is to use OPENAI_CLIENT_CONFIG_PATH
@@ -22,7 +24,9 @@
#

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None)
ANTHROPIC_MAX_CONCURRENCY = int(os.environ.get("ANTHROPIC_MAX_CONCURRENCY", 1))
ANTHROPIC_MAX_CONCURRENCY = int(os.environ.get("ANTHROPIC_MAX_CONCURRENCY", API_MAX_CONCURRENCY))

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", None)

COHERE_API_KEY = os.environ.get("COHERE_API_KEY", None)

@@ -33,7 +37,7 @@

IS_ALPACA_EVAL_2 = ast.literal_eval(os.environ.get("IS_ALPACA_EVAL_2", "True"))
ANNOTATOR_CONFIG_AE1 = "alpaca_eval_gpt4"
ANNOTATOR_CONFIG_AE2 = "weighted_alpaca_eval_gpt4_turbo"
ANNOTATOR_CONFIG_AE2 = "alpaca_eval_gpt4_turbo_fn" # "weighted_alpaca_eval_gpt4_turbo"
DEFAULT_ANNOTATOR_CONFIG = ANNOTATOR_CONFIG_AE2 if IS_ALPACA_EVAL_2 else ANNOTATOR_CONFIG_AE1
DEFAULT_CACHE_DIR = None
EVALUATORS_CONFIG_DIR = CURRENT_DIR / "evaluators_configs"
@@ -131,6 +135,10 @@ def ALPACAFARM_GOLD_ANNOTATIONS():
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_1), "chatgpt_fn"): ALPACAEVAL_1_LEADERBOARD_PATHS / "chatgpt_fn_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), ANNOTATOR_CONFIG_AE2): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"{ANNOTATOR_CONFIG_AE2}_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), "weighted_alpaca_eval_gpt4_turbo"): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"weighted_alpaca_eval_gpt4_turbo_leaderboard.csv",
    (str(ALPACAEVAL_REFERENCE_OUTPUTS_2), "alpaca_eval_cot_gpt4_turbo_fn"): ALPACAEVAL_2_LEADERBOARD_PATHS
    / f"alpaca_eval_cot_gpt4_turbo_fn_leaderboard.csv",
    # needs to add the non default config. ie either with or without the logprob
}

@@ -172,8 +180,11 @@ def ALPACAFARM_GOLD_ANNOTATIONS():
"Yi-34B-Chat",
"llama-2-70b-chat-hf",
"claude-2",
# "cohere",
"cohere",
"chatgpt",
"gemini-pro",
"Mixtral-8x7B-Instruct-v0.1",
"Mistral-7B-Instruct-v0.2"
# "vicuna-33b-v1.3",
# "llama-2-13b-chat-hf",
# "llama-2-7b-chat-hf",
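Because the new constants are read straight from the environment, no code change is needed to configure the Gemini decoder or the shared concurrency default. A minimal sketch (the values are placeholders, not from this commit):

import os

# Placeholders: set these before importing alpaca_eval so constants.py picks them up.
os.environ.setdefault("GOOGLE_API_KEY", "<your-gemini-api-key>")
os.environ.setdefault("API_MAX_CONCURRENCY", "8")          # shared default for API decoders
os.environ.setdefault("ANTHROPIC_MAX_CONCURRENCY", "4")    # otherwise falls back to API_MAX_CONCURRENCY

from alpaca_eval import constants

print(constants.API_MAX_CONCURRENCY, constants.GOOGLE_API_KEY is not None)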
10 changes: 10 additions & 0 deletions src/alpaca_eval/decoders/__init__.py
@@ -17,6 +17,16 @@ def get_fn_completions(name: Union[str, Callable]) -> Callable:

        return anthropic_completions

    elif name == "google_completions":
        try:
            from .google import google_completions
        except ImportError as e:
            packages = ["google.generativeai"]
            logging.exception(f"You need {packages} to use google_completions. Error:")
            raise e

        return google_completions

    elif name == "openai_completions":
        try:
            from .openai import openai_completions
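A short usage sketch for the new registry entry (assumes `google-generativeai` is installed and `GOOGLE_API_KEY` is set; the prompt and kwargs are made up for illustration):

from alpaca_eval.decoders import get_fn_completions

fn_completions = get_fn_completions("google_completions")  # raises with a hint if the package is missing
out = fn_completions(["Reply with one word: hello?"], model_name="gemini-pro", max_output_tokens=32)
print(out["completions"], out["price_per_example"])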
2 changes: 1 addition & 1 deletion src/alpaca_eval/decoders/anthropic.py
@@ -43,7 +43,7 @@ def anthropic_completions(
    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return []
        return dict(completions=[], price_per_example=[], time_per_example=[], completions_all=[])
    else:
        to_log = f"Using `anthropic_completions` on {n_examples} prompts using {model_name} and num_procs={num_procs}."
        logging.info(to_log)
2 changes: 1 addition & 1 deletion src/alpaca_eval/decoders/cohere.py
@@ -96,6 +96,6 @@ def _cohere_completion_helper(
            return text, num_tokens

        except CohereError as e:
            print(f"Try #{trynum+1}/{max_tries}: Error running prompt {repr(prompt)}: {e}")
            logging.info(f"Try #{trynum+1}/{max_tries}: Error running prompt {repr(prompt)}: {e}")

    return " ", 0  # placeholder response for errors, doesn't allow empty string
148 changes: 148 additions & 0 deletions src/alpaca_eval/decoders/google.py
@@ -0,0 +1,148 @@
import functools
import logging
import multiprocessing
import random
import time
from typing import Optional, Sequence, Union

import google.generativeai as genai
import numpy as np
import tqdm

from .. import constants, utils

__all__ = ["google_completions"]


def google_completions(
    prompts: Sequence[str],
    max_output_tokens: Union[int, Sequence[int]] = 2048,
    model_name="gemini-pro",
    num_procs: int = constants.API_MAX_CONCURRENCY,
    **decoding_kwargs,
) -> dict[str, list]:
    """Decode with the Google Generative AI (Gemini) API.

    Parameters
    ----------
    prompts : list of str
        Prompts to get completions for.
    max_output_tokens : int or list of int, optional
        Number of tokens to sample for each prompt. If a list, must be the same length as `prompts`.
    model_name : str, optional
        Name of the model to use for decoding.
    num_procs : int, optional
        Number of parallel processes to use for decoding.
    decoding_kwargs :
        Additional kwargs to pass to `genai.types.GenerationConfig`.
    """
    num_procs = num_procs or constants.API_MAX_CONCURRENCY

    n_examples = len(prompts)
    if n_examples == 0:
        logging.info("No samples to annotate.")
        return dict(completions=[], price_per_example=[], time_per_example=[], completions_all=[])
    else:
        to_log = f"Using `google_completions` on {n_examples} prompts using {model_name} and num_procs={num_procs}."
        logging.info(to_log)

    if isinstance(max_output_tokens, int):
        max_output_tokens = [max_output_tokens] * n_examples

    inputs = zip(prompts, max_output_tokens)

    kwargs = dict(model_name=model_name, **decoding_kwargs)
    kwargs_to_log = {k: v for k, v in kwargs.items() if "api_key" not in k}
    logging.info(f"Kwargs to completion: {kwargs_to_log}")
    with utils.Timer() as t:
        if num_procs == 1:
            responses = [_google_completion_helper(inp, **kwargs) for inp in tqdm.tqdm(inputs, desc="prompts")]
        else:
            with multiprocessing.Pool(num_procs) as p:
                partial_completion_helper = functools.partial(_google_completion_helper, **kwargs)
                responses = list(
                    tqdm.tqdm(
                        p.imap(partial_completion_helper, inputs),
                        desc="prompts",
                        total=len(prompts),
                    )
                )
    logging.info(f"Completed {n_examples} examples in {t}.")

    # the Google API does not return token counts here, so the price is estimated from character counts
    price = [_get_price(len(p), len(r), model_name) for p, r in zip(prompts, responses)]

    avg_time = [t.duration / n_examples] * len(responses)

    return dict(completions=responses, price_per_example=price, time_per_example=avg_time, completions_all=responses)


def _google_completion_helper(
    args: tuple[str, int],
    sleep_time: int = 2,
    temperature: Optional[float] = 0.7,
    model_name: str = "gemini-pro",
    google_api_keys: Optional[Sequence[str]] = None,
    max_tries=10,
    **kwargs,
):
    prompt, max_output_tokens = args

    google_api_keys = google_api_keys or (constants.GOOGLE_API_KEY,)
    google_api_key = random.choice(google_api_keys)

    genai.configure(api_key=google_api_key)
    model = genai.GenerativeModel(model_name)
    n_tries = 0

    while True:
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=temperature,
                    max_output_tokens=max_output_tokens,
                    **kwargs,
                ),
                # don't block anything for evaluation
                safety_settings={
                    "HARM_CATEGORY_HARASSMENT": "block_none",
                    "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none",
                    "HARM_CATEGORY_HATE_SPEECH": "block_none",
                    "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
                },
            )
            text = response.text
            # num_tokens = model.count_tokens(text)

            return text

        # error code 429 is rate limit
        except Exception as e:
            if "429" in str(e):
                logging.info(f"Rate limit reached. Sleeping {sleep_time} seconds.")
                time.sleep(sleep_time)

            else:
                # TODO: better catching of errors when rate limits
                logging.exception(f"Unknown error, so we are retrying. Retry #{n_tries}/{max_tries}. Error:")
                time.sleep(sleep_time)
                n_tries += 1
                if n_tries > max_tries:
                    break

    return ""


def _get_price(n_in_char: int, n_out_char: int, model: str) -> float:
    """Returns the price (in USD) for one example, estimated from input/output character counts."""
    if model == "gemini-pro":
        return (n_in_char * 0.00025 + n_out_char * 0.0005) / 1000

    else:
        logging.warning(f"Unknown model {model} for computing price per token.")
        return np.nan
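As a sanity check on the character-based pricing above (hypothetical lengths; the per-1k-character rates are the ones hard-coded in `_get_price`):

# 1,000 input characters and 500 output characters with gemini-pro:
n_in_char, n_out_char = 1_000, 500
price = (n_in_char * 0.00025 + n_out_char * 0.0005) / 1000
assert abs(price - 0.0005) < 1e-12  # i.e. $0.0005 for this made-up example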
23 changes: 17 additions & 6 deletions src/alpaca_eval/decoders/openai.py
@@ -30,6 +30,7 @@ def openai_completions(
    is_strip: bool = True,
    num_procs: Optional[int] = constants.OPENAI_MAX_CONCURRENCY,
    batch_size: Optional[int] = None,
    price_per_token: Optional[float] = None,
    **decoding_kwargs,
) -> dict[str, list]:
    r"""Get openai completions for the given prompts. Allows additional parameters such as tokens to avoid and
@@ -58,6 +59,9 @@
    is_strip : bool, optional
        Whether to strip trailing and leading spaces from the prompts.

    price_per_token : float, optional
        Price per token for the model. If not provided, we will try to infer it from the model name.

    decoding_kwargs :
        Additional kwargs to pass to `openai.Completion` or `openai.ChatCompletion`.
@@ -159,7 +163,7 @@ def openai_completions(
    completions_text = [completion["text"] for completion in completions_all]

    price = [
        completion["total_tokens"] * _get_price_per_token(model_name)
        completion["total_tokens"] * _get_price_per_token(model_name, price_per_token)
        for completion_batch in completions
        for completion in completion_batch
    ]
@@ -185,17 +189,22 @@ def _openai_completion_helper(
    openai_api_keys: Optional[Sequence[str]] = constants.OPENAI_API_KEYS,
    openai_api_base: Optional[str] = os.getenv("OPENAI_API_BASE") if os.getenv("OPENAI_API_BASE") else openai.base_url,
    ############################
    client_kwargs: Optional[dict[str, Any]] = None,
    **kwargs,
):
    client_kwargs = client_kwargs or dict()
    prompt_batch, max_tokens = args
    all_clients = utils.get_all_clients(
        client_config_path,
        model_name=kwargs["model"],
        get_backwards_compatible_configs=_get_backwards_compatible_configs,
        default_client_class="openai.OpenAI",
        openai_organization_ids=openai_organization_ids,
        openai_api_keys=openai_api_keys,
        openai_api_base=openai_api_base,
        backward_compatibility_kwargs=dict(
            openai_organization_ids=openai_organization_ids,
            openai_api_keys=openai_api_keys,
            openai_api_base=openai_api_base,
        ),
        **client_kwargs,
    )

    # randomly select the client
@@ -337,9 +346,11 @@ def _string_to_dict(to_convert):
    return {s.split("=", 1)[0]: s.split("=", 1)[1] for s in to_convert.split(" ") if len(s) > 0}


def _get_price_per_token(model):
def _get_price_per_token(model, price_per_token=None):
    """Returns the price per token for a given model"""
    if "gpt-4-1106" in model:
    if price_per_token is not None:
        return float(price_per_token)
    elif "gpt-4-1106" in model:
        return (
            0.01 / 1000
        )  # that's not completely true because decoding is 0.03 but close enough given that most is context
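To illustrate the new override with hypothetical numbers: when a model config supplies `price_per_token`, the per-example cost is simply total tokens times that rate, bypassing the name-based lookup.

# e.g. the Together-hosted Mixtral config further down sets price_per_token: 6e-7
total_tokens = 1_500
price_per_token = 6e-7
print(total_tokens * price_per_token)  # 0.0009 USD for this made-up completion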
11 changes: 8 additions & 3 deletions src/alpaca_eval/main.py
@@ -127,9 +127,14 @@ def evaluate(

if max_instances is not None:
# first we shuffle both outputs with a fix seed => more representative
seed = 123
model_outputs = model_outputs.sample(frac=1, random_state=seed)
reference_outputs = reference_outputs.sample(frac=1, random_state=seed)
if len(model_outputs) != len(reference_outputs):
logging.warning(
"model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
)
else:
seed = 123
model_outputs = model_outputs.sample(frac=1, random_state=seed)
reference_outputs = reference_outputs.sample(frac=1, random_state=seed)

model_outputs = model_outputs[:max_instances]
reference_outputs = reference_outputs[:max_instances]
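A toy sketch of the guarded shuffle-then-truncate behaviour above (the frames and values are made up):

import pandas as pd

model_outputs = pd.DataFrame({"output": list("abcdef")})
reference_outputs = pd.DataFrame({"output": list("uvwxyz")})
seed, max_instances = 123, 3

if len(model_outputs) == len(reference_outputs):
    # same length => shuffle both with the same seed so the pairs stay aligned
    model_outputs = model_outputs.sample(frac=1, random_state=seed)
    reference_outputs = reference_outputs.sample(frac=1, random_state=seed)

model_outputs = model_outputs[:max_instances]
reference_outputs = reference_outputs[:max_instances]
print(model_outputs["output"].tolist(), reference_outputs["output"].tolist())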
2 changes: 2 additions & 0 deletions src/alpaca_eval/metrics.py
@@ -79,6 +79,8 @@ def describe_head2head(self, predictions: npt.ArrayLike) -> dict[str, float]:
            n_wins=n_wins,
            n_wins_base=n_wins_base,
            n_draws=n_draws,
            # note that n_draws will happen more often for weighted win rate because you can get 1.5 somewhat often due
            # to float precision
            n_total=n_total,
        )

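To make the draw comment concrete (hypothetical preference values; treating an exact 1.5 as a draw follows the comment above, while the 1-vs-2 win convention is an assumption about the surrounding code):

import numpy as np

preferences = np.array([2.0, 1.0, 1.5, 1.8, 1.5])
n_draws = int((preferences == 1.5).sum())      # 2: exact ties, more common with weighted win rates
n_wins = int((preferences > 1.5).sum())        # annotator preferred the model
n_wins_base = int((preferences < 1.5).sum())   # annotator preferred the baseline
print(n_wins, n_wins_base, n_draws, len(preferences))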
12 changes: 12 additions & 0 deletions src/alpaca_eval/models_configs/Mistral-7B-Instruct-v0.2/configs.yaml
@@ -0,0 +1,12 @@
Mistral-7B-Instruct-v0.2:
  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt" # together already deals with prompt
  fn_completions: "openai_completions"
  completions_kwargs:
    model_name: "mistralai/Mistral-7B-Instruct-v0.2"
    max_tokens: 4096
    requires_chatml: True
    price_per_token: 2e-7
    client_kwargs:
      base_url: 'https://api.together.xyz'
  pretty_name: "Mistral 7B v0.2"
  link: "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
12 changes: 12 additions & 0 deletions src/alpaca_eval/models_configs/Mixtral-8x7B-Instruct-v0.1/configs.yaml
@@ -0,0 +1,12 @@
Mixtral-8x7B-Instruct-v0.1:
  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt" # together already deals with prompt
  fn_completions: "openai_completions"
  completions_kwargs:
    model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
    max_tokens: 4096
    requires_chatml: True
    price_per_token: 6e-7
    client_kwargs:
      base_url: 'https://api.together.xyz'
  pretty_name: "Mixtral 8x7B v0.1"
  link: "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"
3 changes: 3 additions & 0 deletions src/alpaca_eval/models_configs/Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt
@@ -0,0 +1,3 @@
<|im_start|>user
{instruction}
<|im_end|>
@@ -8,5 +8,7 @@ Yi-34B-Chat-Verified:
    num_procs: 1
    max_tokens: 3500
    top_p: 0.8
    client_kwargs:
      base_url: "http://api.01ww.xyz/v1"
  pretty_name: "Yi 34B Chat"
  link: "https://huggingface.co/01-ai/Yi-34B-Chat"
8 changes: 8 additions & 0 deletions src/alpaca_eval/models_configs/gemini-pro/configs.yaml
@@ -0,0 +1,8 @@
gemini-pro:
  prompt_template: "gemini-pro/prompt.txt"
  fn_completions: "google_completions"
  completions_kwargs:
    model_name: "gemini-pro"
    max_output_tokens: 2048
    candidate_count: 1
  pretty_name: "Gemini Pro"
1 change: 1 addition & 0 deletions src/alpaca_eval/models_configs/gemini-pro/prompt.txt
@@ -0,0 +1 @@
{instruction}