Skip to content

Commit

Permalink
[BFCL] Chore: Improve Inference Log Readability (#746)
Browse files Browse the repository at this point in the history
This PR improves the readability of the inference log, making it read
more like a conversation. Instructions on how to
interpret/understand the log are also included. The inference log is
included as part of the result file.

The score file is also improved in structure for better readability.

This will not change the leaderboard score. 





Here is an example inference log.
```
"inference_log": [
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "First cd within the 'ResearchDocs' directory and then locate any files titled 'report.csv'."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "cd": "{\"folder\":\"ResearchDocs\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "cd(folder='ResearchDocs')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"current_working_directory\": \"ResearchDocs\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": [
                    {
                        "find": "{\"path\":\"ResearchDocs\",\"name\":\"report.csv\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "find(path='ResearchDocs',name='report.csv')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"matches\": [\"ResearchDocs/report.csv\"]}"
            }
        ],
        "step_2": [
            {
                "role": "assistant",
                "content": "The file titled `report.csv` has been located in the `ResearchDocs` directory."
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Once you locate 'report.csv' in the 'ResearchDocs' directory, delve into it to find lines that reference the 'Quarterly Financial Overview'."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "grep": "{\"file_name\":\"report.csv\",\"pattern\":\"Quarterly Financial Overview\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "grep(file_name='report.csv',pattern='Quarterly Financial Overview')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"matching_lines\": [\"Line 1: Introduction Line 2: Quarterly Financial Overview Line 3: Details Line 4: More Details Line 5: Quarterly Financial Overview Line 6: Conclusion Line 7: Quarterly Financial Overview Line 8: Quarter has been successful. Line 9: Quarterly Financial Overview Line 10: Final Thoughts\"]}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": "The following lines in `report.csv` reference the \"Quarterly Financial Overview\":\n\n- Line 2: Quarterly Financial Overview\n- Line 5: Quarterly Financial Overview\n- Line 7: Quarterly Financial Overview\n- Line 9: Quarterly Financial Overview"
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Extract and display the last five lines from the 'report.csv' file in the 'ResearchDocs' directory, as these need to be emphasized for recent updates."
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "tail": "{\"file_name\":\"report.csv\",\"lines\":5}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "tail(file_name='report.csv',lines=5)"
                ]
            },
            {
                "role": "tool",
                "content": "{\"last_lines\": \"Line 1: Introduction Line 2: Quarterly Financial Overview Line 3: Details Line 4: More Details Line 5: Quarterly Financial Overview Line 6: Conclusion Line 7: Quarterly Financial Overview Line 8: Quarter has been successful. Line 9: Quarterly Financial Overview Line 10: Final Thoughts\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": "The last five lines from the `report.csv` file are as follows:\n\n- Line 6: Conclusion\n- Line 7: Quarterly Financial Overview\n- Line 8: Quarter has been successful.\n- Line 9: Quarterly Financial Overview\n- Line 10: Final Thoughts"
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    },
    {
        "begin_of_turn_query": [
            {
                "role": "user",
                "content": "Logging in as USR001. Lastly, upon completion of our file review, kindly message my colleague, John Levy, add him as new contact, that 'Latest Quarter Performance has been well.'"
            }
        ],
        "step_0": [
            {
                "role": "assistant",
                "content": [
                    {
                        "message_login": "{\"user_id\": \"USR001\"}"
                    },
                    {
                        "add_contact": "{\"user_name\": \"John Levy\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "message_login(user_id='USR001')",
                    "add_contact(user_name='John Levy')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"login_status\": true, \"message\": \"User 'USR001' logged in successfully.\"}"
            },
            {
                "role": "tool",
                "content": "{\"added_status\": true, \"user_id\": \"USR005\", \"message\": \"Contact 'John Levy' added successfully.\"}"
            }
        ],
        "step_1": [
            {
                "role": "assistant",
                "content": [
                    {
                        "get_user_id": "{\"user\":\"John Levy\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "get_user_id(user='John Levy')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"user_id\": \"USR005\"}"
            }
        ],
        "step_2": [
            {
                "role": "assistant",
                "content": [
                    {
                        "send_message": "{\"receiver_id\":\"USR005\",\"message\":\"Latest Quarter Performance has been well.\"}"
                    }
                ]
            },
            {
                "role": "handler_log",
                "content": "Successfully decoded model response.",
                "model_response_decoded": [
                    "send_message(receiver_id='USR005',message='Latest Quarter Performance has been well.')"
                ]
            },
            {
                "role": "tool",
                "content": "{\"sent_status\": true, \"message_id\": {\"new_id\": 67410}, \"message\": \"Message sent to 'USR005' successfully.\"}"
            }
        ],
        "step_3": [
            {
                "role": "assistant",
                "content": "You have been successfully logged in as **USR001**, and your colleague **John Levy** has been added as a new contact. The message \"Latest Quarter Performance has been well.\" has been sent to him successfully."
            },
            {
                "role": "handler_log",
                "content": "Error decoding the model response. Proceed to next turn.",
                "error": "'str' object has no attribute 'items'"
            }
        ]
    }
]
```
  • Loading branch information
HuanzhiMao authored Nov 11, 2024
1 parent 52cf553 commit a5f32a8
Show file tree
Hide file tree
Showing 13 changed files with 405 additions and 132 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,8 @@ berkeley-function-call-leaderboard/score/
berkeley-function-call-leaderboard/.env
!berkeley-function-call-leaderboard/.env.example

# Ignore multi turn ground truth conversation log
berkeley-function-call-leaderboard/utils/ground_truth_conversation/

.direnv/
.venv
.venv
25 changes: 25 additions & 0 deletions berkeley-function-call-leaderboard/LOG_GUIDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Guide to Inference Logs

> An inference log is included along with the LLM response in the results file to help you analyze and debug the model's performance, and to better understand the model's behavior. To enable a more detailed log, use the `--include-state-log` flag and/or the `--include-input-log` flag in the generation command.
## Log Structure

The log is structured as a list, representing a conversational interaction between the model, system, and user. There are five types of roles in the log:

1. **`user`**: Represents the user's input or query.
2. **`assistant`**: Represents the model's raw response.
3. **`tool`**: Represents the output of a function execution, if the model makes a valid function call. Each function call results in a separate `tool` entry.
4. **`state_info`**: Represents the state of the backend API system at the end of each turn. The initial state is also included at the beginning of the log. This entry is available only if the `--include-state-log` flag is set in the generation command.
5. **`handler_log`**: Represents internal logs from the inference pipeline. These entries indicate various stages and events within the pipeline, including:

- **inference_input**: Snapshot of the fully-transformed input just before it's sent to the model API endpoint. Useful for debugging input integrity and format.
- Available only if the `--include-input-log` flag is set in the generation command.
- This section can be verbose and may affect log readability; it is generally not necessary for most analyses.
- **decode_success**: Indicates the successful decoding of the model's raw response, with the decoded response included in the `model_response_decoded` field. Following this, any function calls are executed, and the current turn continues.
- **empty_response**: Indicates that the model handler returned an empty response (e.g., no function call) based on the decoding strategy. When this occurs, the pipeline proceeds to the next turn.
- **decode_failure**: Indicates a failure in decoding the raw model response, with the raw response included in the `model_response_decoded` field. The pipeline then proceeds to the next turn.
  - **force_quit**: Indicates that the model handler has forcefully ended the conversation after the model made 20 unsuccessful attempts (i.e., steps) within one turn or task; the count resets at the beginning of each turn. No further turns are processed for this entry.

## Single Turn Categories

For single-turn categories, the only log entry available is the inference input (under the `handler_log` role), because there is no multi-step interaction with the backend system.
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the

If no `MODEL_NAME` is provided, the model `gorilla-openfunctions-v2` will be used by default. If no `TEST_CATEGORY` is provided, all test categories will be run by default.

> An inference log will be included along with the LLM response to help you analyze and debug the model's performance, and to better understand the model's behavior. To see a more verbose log, you can set the `--include-state-log` and/or the `--include-input-log` flags in the generation command.
> Please refer to the `LOG_GUIDE.md` file for more information on how to interpret the inference logs and what each flag does.
#### For API-hosted models:

```bash
Expand Down
24 changes: 12 additions & 12 deletions berkeley-function-call-leaderboard/bfcl/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,18 @@ def generate(
test_category: List[str] = typer.Option(
["all"], help="A list of test categories to run the evaluation on."
),
api_sanity_check: bool = typer.Option(
False,
"--api-sanity-check",
"-c",
help="Perform the REST API status sanity check before running the evaluation.",
),
temperature: float = typer.Option(
0.001, help="The temperature parameter for the model."
),
include_debugging_log: bool = typer.Option(
include_input_log: bool = typer.Option(
False,
"--include-input-log",
help="Include the fully-transformed input to the model inference endpoint in the inference log; only relevant for debugging input integrity and format.",
),
include_state_log: bool = typer.Option(
False,
help="Include debugging log in the response file to see model's interaction with the state machine.",
"--include-state-log",
help="Include info about the state of each API system after each turn in the inference log; only relevant for multi-turn categories.",
),
num_gpus: int = typer.Option(1, help="The number of GPUs to use."),
num_threads: int = typer.Option(1, help="The number of threads to use."),
Expand All @@ -98,9 +98,9 @@ def generate(
[
"model",
"test_category",
"api_sanity_check",
"temperature",
"include_debugging_log",
"include_input_log",
"include_state_log",
"num_gpus",
"num_threads",
"gpu_memory_utilization",
Expand All @@ -113,9 +113,9 @@ def generate(
generationArgs(
model=model,
test_category=test_category,
api_sanity_check=api_sanity_check,
temperature=temperature,
include_debugging_log=include_debugging_log,
include_input_log=include_input_log,
include_state_log=include_state_log,
num_gpus=num_gpus,
num_threads=num_threads,
gpu_memory_utilization=gpu_memory_utilization,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def get_args():

# Parameters for the model that you want to test.
parser.add_argument("--temperature", type=float, default=0.001)
parser.add_argument("--include-debugging-log", "-d", action="store_true", default=False)
parser.add_argument("--include-input-log", action="store_true", default=False)
parser.add_argument("--include-state-log", action="store_true", default=False)
parser.add_argument("--num-threads", default=1, type=int)
parser.add_argument("--num-gpus", default=1, type=int)
parser.add_argument("--backend", default="vllm", type=str, choices=["vllm", "sglang"])
Expand Down Expand Up @@ -153,7 +154,7 @@ def process_multi_turn_test_case(test_cases, test_category):
return test_cases


def multi_threaded_inference(handler, test_case, include_debugging_log):
def multi_threaded_inference(handler, test_case, include_input_log, include_state_log):

assert type(test_case["function"]) is list

Expand All @@ -162,7 +163,7 @@ def multi_threaded_inference(handler, test_case, include_debugging_log):
while True:
try:
result, metadata = handler.inference(
copy.deepcopy(test_case), include_debugging_log
copy.deepcopy(test_case), include_input_log, include_state_log
)
break # Success, exit the loop
except Exception as e:
Expand Down Expand Up @@ -215,7 +216,8 @@ def generate_results(args, model_name, test_cases_total):
num_gpus=args.num_gpus,
gpu_memory_utilization=args.gpu_memory_utilization,
backend=args.backend,
include_debugging_log=args.include_debugging_log,
include_input_log=args.include_input_log,
include_state_log=args.include_state_log,
)

else:
Expand All @@ -230,7 +232,8 @@ def generate_results(args, model_name, test_cases_total):
multi_threaded_inference,
handler,
test_case,
args.include_debugging_log,
args.include_input_log,
args.include_state_log,
)
futures.append(future)

Expand Down
2 changes: 2 additions & 0 deletions berkeley-function-call-leaderboard/bfcl/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
POSSIBLE_ANSWER_PATH = "../data/possible_answer/"
SCORE_PATH = "../score/"
DOTENV_PATH = "../.env"
UTILS_PATH = "../utils/"

VERSION_PREFIX = "BFCL_v3"

Expand Down Expand Up @@ -196,6 +197,7 @@
POSSIBLE_ANSWER_PATH = (script_dir / POSSIBLE_ANSWER_PATH).resolve()
SCORE_PATH = (script_dir / SCORE_PATH).resolve()
DOTENV_PATH = (script_dir / DOTENV_PATH).resolve()
UTILS_PATH = (script_dir / UTILS_PATH).resolve()

RESULT_PATH.mkdir(parents=True, exist_ok=True)
SCORE_PATH.mkdir(parents=True, exist_ok=True)
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def multi_turn_runner(
temp["model_result_raw"] = multi_turn_model_result_list
temp["model_result_decoded"] = multi_turn_model_result_list_decoded
temp["possible_answer"] = multi_turn_ground_truth_list
temp["inference_log"] = model_result[i].get("inference_log", "")
result.append(temp)
else:
correct_count += 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def multi_turn_checker(

# Note that we combine all the sub-step results into a single list, for easier comparison
single_turn_model_execution_results = []
single_turn_model_execution_results_uncombined = []
single_turn_ground_truth_execution_results = []
model_instances = {} # Will be overwritten in the for loop
single_step_model_execution_results = [] # Will be overwritten in the for loop
Expand All @@ -51,6 +52,7 @@ def multi_turn_checker(
)
)
single_turn_model_execution_results.extend(single_step_model_execution_results)
single_turn_model_execution_results_uncombined.append(single_step_model_execution_results)

# Execute the ground truth function calls
single_turn_ground_truth_execution_results, ground_truth_instances = (
Expand All @@ -70,7 +72,7 @@ def multi_turn_checker(
all_turn_model_execution_results.extend(single_turn_model_execution_results)
execution_results.append(
{
"model": single_turn_model_execution_results,
"model": single_turn_model_execution_results_uncombined,
"ground_truth": single_turn_ground_truth_execution_results,
}
)
Expand Down Expand Up @@ -210,8 +212,8 @@ def response_checker(
"error_type": "multi_turn:execution_response_mismatch",
"details": {
"missing_items": missing_items,
"model_response": model_response_list,
"ground_truth_response": ground_truth_response_list,
"model_response (including all previous turns)": model_response_list,
"ground_truth_response (only the current turn)": ground_truth_response_list,
},
}

Expand Down
Loading

0 comments on commit a5f32a8

Please sign in to comment.