diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index 1c4ba9c38..e3a08698c 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -1,4 +1,4 @@ -import argparse, json, os +import argparse, json, os, time from tqdm import tqdm from model_handler.handler_map import handler_map from model_handler.model_style import ModelStyle @@ -91,6 +91,10 @@ def collect_test_cases(test_filename_total, model_name): def generate_results(args, model_name, test_cases_total): + RETRY_LIMIT = 3 + # 60s for the timer to complete. But often we find that even with 60 there is a conflict. So 65 is a safe no. + RETRY_DELAY = 65 # Delay in seconds + handler = build_handler(model_name, args.temperature, args.top_p, args.max_tokens) if handler.model_style == ModelStyle.OSSMODEL: @@ -114,9 +118,31 @@ def generate_results(args, model_name, test_cases_total): if type(functions) is dict or type(functions) is str: functions = [functions] - result, metadata = handler.inference( - user_question, functions, test_category - ) + retry_count = 0 + + while retry_count < RETRY_LIMIT: + try: + result, metadata = handler.inference( + user_question, functions, test_category + ) + break # Success, exit the loop + except Exception as e: + # TODO: It might be better to handle the exception in the handler itself rather than a universal catch block here, as each handler use different ways to call the endpoint. + # OpenAI has openai.RateLimitError while Anthropic has anthropic.RateLimitError. It would be more robust in the long run. + if "rate limit reached" in str(e).lower() or ( + hasattr(e, "status_code") + and ( + e.status_code == 429 + or e.status_code == 503 + or e.status_code == 500 + ) + ): + print(f"Rate limit reached. Sleeping for 65 seconds. Retry {retry_count + 1}/{RETRY_LIMIT}") + time.sleep(RETRY_DELAY) + retry_count += 1 + else: + print("Maximum retries reached or other error encountered.") + raise e # Rethrow the last caught exception result_to_write = { "id": test_case["id"], "result": result,