diff --git a/README.md b/README.md index 2c191c1..7724514 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## X-MAS-Bench -1. Specify your model configs in `./configs/X-MAS_Bench_config.json`: +### 1. Specify your model configs in `./configs/X-MAS_Bench_config.json`: ``` "gpt-4o-mini-2024-07-18": { "model_list": [ @@ -17,16 +17,22 @@ } ``` -2. Inference on a dataset (the outputs will be saved under "./X-MAS-Bench/results/") +### 2. Inference on a dataset (the outputs will be saved under "./X-MAS-Bench/results/") ``` # bash scripts/infer_X-MAS_Bench.sh -python X-MAS-Bench/infer_direct.py --model_name --model_config --test_dataset_name + +python X-MAS-Bench/infer_qa.py --model_name --model_config --test_dataset_name ``` +If you want to infer with **revise, aggregation, or evaluation**, please make sure you have replaced the content of "./X-MAS-Bench/results/" with the **source files** in [Google Drive](https://drive.google.com/file/d/1ijQCzujXdYZDV95vWJHUpvOQFWVgUbXF/view?usp=sharing) first. +You can download the .zip file named results.zip to the "./X-MAS-Bench/results/" path and unzip it. -3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Bench/results/") +If you want to infer with **planning**, please make sure you have loaded the default source models **"llama-3.1-8b-instruct", "qwen-2.5-7b-instruct" and "qwen-2.5-14b-instruct"**. + +### 3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Bench/results/") ``` # bash scripts/eval_X-MAS_Bench.sh -python X-MAS-Bench/eval_bench.py --model_name --model_config --dataset_name --infer_name --eval_mode bench-test + +python X-MAS-Bench/eval_bench.py --eval_model_name --model_config --dataset_name --model_name --function_name --eval_mode bench-test # We use llama-3.1-70b-instruct as the default evaluation model ``` @@ -35,7 +41,7 @@ You can download the .zip file named results.zip to the "./X-MAS-Bench/results/" ## X-MAS-Design -1. Specify your model configs in `./configs/X-MAS_Design_config.json`: +### 1. Specify your model configs in `./configs/X-MAS_Design_config.json`: ``` "gpt-4o-mini-2024-07-18": { "model_list": [ @@ -45,21 +51,37 @@ You can download the .zip file named results.zip to the "./X-MAS-Bench/results/" } ``` -2. Inference on a dataset (the outputs will be saved under "./X-MAS-Design/results/") +### 2. Inference on a dataset (the outputs will be saved under "./X-MAS-Design/results/") ``` # bash scripts/infer_X-MAS_Design.sh # (Parallel) -python X-MAS-Design/inference_X-MAS.py --method_name --model_name --test_dataset_name --model_api_config +python X-MAS-Design/inference_mas.py --model_api_config --method_name --test_dataset_name # Or (Sequential) -python X-MAS-Design/inference_X-MAS.py --method_name --model_name --test_dataset_name --model_api_config --sequential +python X-MAS-Design/inference_mas.py --model_api_config --method_name --test_dataset_name --sequential ``` -3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Design/results/") +You can change the default model list in the method_config file for the corresponding method, e.g., the model list in "./X-MAS-Design/methods/llm_debate/configs/config_main.yaml" for "./X-MAS-Design/methods/llm_debate/llm_debate_main.py". + +If you want to run a method like llm_debate with your own config or settings, you can modify them accordingly and run the following commands.
+ ``` -bash scripts/eval_X-MAS_Design.sh +# bash scripts/infer_X-MAS_Design.sh + +# (Parallel) +python X-MAS-Design/inference_mas.py --model_api_config --method_name --method_config_name --test_dataset_name + +# Or (Sequential) +python X-MAS-Design/inference_mas.py --model_api_config --method_name --method_config_name --test_dataset_name --sequential +``` + +### 3. Evaluate on a dataset (the outputs will be saved under "./X-MAS-Design/results/") +``` +# bash scripts/eval_X-MAS_Design.sh + +python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config --method_name --method_config_name --test_dataset_name --eval_mode bench-test ``` ## Citation diff --git a/X-MAS-Bench/eval_bench.py b/X-MAS-Bench/eval_bench.py index fa73641..388ff67 100644 --- a/X-MAS-Bench/eval_bench.py +++ b/X-MAS-Bench/eval_bench.py @@ -658,7 +658,7 @@ def load_source_data(source_dir): print(f"Loaded {len(source_map)} source data items.") return source_map -def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_name, max_workers=4, sequential=False): +def get_evaluation(eval_data, model_url_list, model_name, dataset_name, function_name, infer_name, max_workers=4, sequential=False): """ 批量评分函数,使用并行处理来加速评分过程。 @@ -668,7 +668,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na """ # print(eval_data) if "evaluate" in infer_name: - source_dir = f"./X-MAS-Bench/results/{dataset_name}/qwen2.5-32b-instruct_direct_eval.json" + source_dir = f"./X-MAS-Bench/results/{dataset_name}/qa/qwen2.5-32b-instruct_qa_eval.json" source_map = load_source_data(source_dir) else: source_map = {} @@ -704,7 +704,8 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement") + parser.add_argument("--eval_model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement") + parser.add_argument("--model_name", type=str, default="qwen-2.5-32b-instruct", help="the LLM to be judged") parser.add_argument("--function_name", type=str, default="direct", help="the function for judgement") parser.add_argument("--model_config", type=str, default="") parser.add_argument("--dataset_name", type=str, default="MedMCQA") @@ -713,7 +714,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na parser.add_argument("--dry_run", action="store_true") parser.add_argument("--sequential", action="store_true") args = parser.parse_args() - # args.infer_name = f"{args.model_name}_{args.function_name}.jsonl" + args.infer_name = f"{args.model_name}_{args.function_name}.jsonl" print("="*50) print(json.dumps(vars(args), indent=4)) @@ -735,8 +736,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na # print('-'*20 + f"\n>> Evaluating {i}-th dataset: {dataset_name}") if args.eval_mode == "bench-test": - infer_path = f"./X-MAS-Bench/results/{dataset_name}/{args.infer_name}" - # infer_path = f"./results/{dataset_name}/{args.infer_name}" + infer_path = f"./X-MAS-Bench/results/{dataset_name}/{args.function_name}/{args.infer_name}" save_eval_path = infer_path.replace(".jsonl", "_eval.json") @@ -782,7 +782,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na print(f">> Running Loaded {len(eval_data)} samples") - eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.model_name, 
dataset_name, args.infer_name, max_workers, args.sequential) + eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.model_name, dataset_name, args.function_name, args.infer_name, max_workers, args.sequential) # mapping the response back to the original query for i, eval_content, score in zip(range(len(eval_data)), eval_content_list, score_list): diff --git a/X-MAS-Bench/infer_aggregate.py b/X-MAS-Bench/infer_aggregation.py similarity index 94% rename from X-MAS-Bench/infer_aggregate.py rename to X-MAS-Bench/infer_aggregation.py index 5951726..e6363a4 100644 --- a/X-MAS-Bench/infer_aggregate.py +++ b/X-MAS-Bench/infer_aggregation.py @@ -47,7 +47,7 @@ def aggregate_init_answers(query, answer_dict, shuffle_index): def get_sample_pool(test_dataset_name, aggregate_model_names): query_dict = defaultdict(dict) for model_name in aggregate_model_names: - with open(f"X-MAS-Bench/results/{test_dataset_name}/{model_name}_direct.jsonl", "r") as f: + with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/{model_name}_qa.jsonl", "r") as f: for line in f: sample = json.loads(line) query = sample["query"] @@ -55,7 +55,7 @@ def get_sample_pool(test_dataset_name, aggregate_model_names): shuffle_matrix = create_shuffled_matrix(len(query_dict), len(aggregate_model_names)) sample_pool = [] - with open(f"X-MAS-Bench/results/{test_dataset_name}/{aggregate_model_names[0]}_direct.jsonl", "r") as f: + with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/{aggregate_model_names[0]}_qa.jsonl", "r") as f: for i, line in enumerate(f): sample = json.loads(line) query = sample["query"] @@ -130,8 +130,8 @@ def process_sample(sample): try: # ================== Define the output files ================== - output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_aggregate.txt" - output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_aggregate.jsonl" + output_logging = f"X-MAS-Bench/results/{test_dataset_name}/aggregation/log/{args.model_name}_aggregation.txt" + output_json = f"X-MAS-Bench/results/{test_dataset_name}/aggregation/{args.model_name}_aggregation.jsonl" output_dir_log = os.path.dirname(output_logging) output_dir_json = os.path.dirname(output_json) os.makedirs(output_dir_log, exist_ok=True) diff --git a/X-MAS-Bench/infer_evaluate.py b/X-MAS-Bench/infer_evaluation.py similarity index 95% rename from X-MAS-Bench/infer_evaluate.py rename to X-MAS-Bench/infer_evaluation.py index 0d5264a..064f1ec 100644 --- a/X-MAS-Bench/infer_evaluate.py +++ b/X-MAS-Bench/infer_evaluation.py @@ -43,7 +43,7 @@ def evaluate_init_answers(query, answer): def get_sample_pool(test_dataset_name): sample_pool = [] - with open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f: + with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/qwen2.5-32b-instruct_qa.jsonl", "r") as f: for i, line in enumerate(f): sample = json.loads(line) @@ -113,8 +113,8 @@ def process_sample(sample): # ================== Define the output files ================== - output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_evaluate.txt" - output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_evaluate.jsonl" + output_logging = f"X-MAS-Bench/results/{test_dataset_name}/evaluation/log/{args.model_name}_evaluation.txt" + output_json = f"X-MAS-Bench/results/{test_dataset_name}/evaluation/{args.model_name}_evaluation.jsonl" output_dir_log = os.path.dirname(output_logging) output_dir_json = 
os.path.dirname(output_json) os.makedirs(output_dir_log, exist_ok=True) diff --git a/X-MAS-Bench/infer_plan.py b/X-MAS-Bench/infer_planning.py similarity index 94% rename from X-MAS-Bench/infer_plan.py rename to X-MAS-Bench/infer_planning.py index 22d7376..7c39975 100644 --- a/X-MAS-Bench/infer_plan.py +++ b/X-MAS-Bench/infer_planning.py @@ -74,16 +74,17 @@ def plan_init_answers(query): return str -def get_sample_pool(test_dataset_name): +def get_sample_pool(test_dataset_name): + test_data_path = f"X-MAS-Bench/benchmarks/{test_dataset_name}.json" + with open(test_data_path, "r") as f: + samples = json.load(f) + sample_pool = [] - with open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f: - for i, line in enumerate(f): - sample = json.loads(line) - query = sample["query"] - sample_copy = deepcopy(sample) - sample_copy["plan_query"] = plan_init_answers(query) - del sample_copy["generated_output"] - sample_pool.append(sample_copy) + for i,sample in enumerate(samples): + query = sample["query"] + sample_copy = deepcopy(sample) + sample_copy["plan_query"] = plan_init_answers(query) + sample_pool.append(sample_copy) return sample_pool @@ -193,8 +194,8 @@ def output_trans(response): # ================== Define the output files ================== - output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_plan.txt" - output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_plan.jsonl" + output_logging = f"X-MAS-Bench/results/{test_dataset_name}/planning/log/{args.model_name}_planning.txt" + output_json = f"X-MAS-Bench/results/{test_dataset_name}/planning/{args.model_name}_planning.jsonl" output_dir_log = os.path.dirname(output_logging) output_dir_json = os.path.dirname(output_json) os.makedirs(output_dir_log, exist_ok=True) diff --git a/X-MAS-Bench/infer_direct.py b/X-MAS-Bench/infer_qa.py similarity index 95% rename from X-MAS-Bench/infer_direct.py rename to X-MAS-Bench/infer_qa.py index 6275ac9..536d862 100644 --- a/X-MAS-Bench/infer_direct.py +++ b/X-MAS-Bench/infer_qa.py @@ -72,8 +72,8 @@ def process_sample(sample): test_dataset_name = args.test_dataset_name try: # ================== Define the output files ================== - output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_direct.txt" - output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_direct.jsonl" + output_logging = f"X-MAS-Bench/results/{test_dataset_name}/qa/log/{args.model_name}_qa.txt" + output_json = f"X-MAS-Bench/results/{test_dataset_name}/qa/{args.model_name}_qa.jsonl" test_data_path = f"X-MAS-Bench/benchmarks/{test_dataset_name}.json" output_dir_log = os.path.dirname(output_logging) @@ -116,4 +116,4 @@ def process_sample(sample): for _ in tqdm(executor.map(process_sample, sample_pool), total=len(sample_pool), desc=f"Processing queries with {args.model_name} on {test_dataset_name}"): pass except Exception as e: - print(f"direct Traceback: {traceback.format_exc()}") \ No newline at end of file + print(f"qa Traceback: {traceback.format_exc()}") \ No newline at end of file diff --git a/X-MAS-Bench/infer_revise.py b/X-MAS-Bench/infer_revise.py index ccf0ef0..689828f 100644 --- a/X-MAS-Bench/infer_revise.py +++ b/X-MAS-Bench/infer_revise.py @@ -44,7 +44,7 @@ def revise_init_answers(query, answer): def get_sample_pool(test_dataset_name): # query_dict = defaultdict(dict) sample_pool = [] - with 
open(f"X-MAS-Bench/results/{test_dataset_name}/qwen2.5-32b-instruct/direct/qwen2.5-32b-instruct_direct.jsonl", "r") as f: + with open(f"X-MAS-Bench/results/{test_dataset_name}/qa/qwen2.5-32b-instruct_qa.jsonl", "r") as f: for i, line in enumerate(f): sample = json.loads(line) query = sample["query"] @@ -106,8 +106,8 @@ def process_sample(sample): # ================== Define the output files ================== - output_logging = f"X-MAS-Bench/results/{test_dataset_name}/log/{args.model_name}_revise.txt" - output_json = f"X-MAS-Bench/results/{test_dataset_name}/{args.model_name}_revise.jsonl" + output_logging = f"X-MAS-Bench/results/{test_dataset_name}/revise/log/{args.model_name}_revise.txt" + output_json = f"X-MAS-Bench/results/{test_dataset_name}/revise/{args.model_name}_revise.jsonl" output_dir_log = os.path.dirname(output_logging) output_dir_json = os.path.dirname(output_json) os.makedirs(output_dir_log, exist_ok=True) diff --git a/X-MAS-Design/eval_mas.py b/X-MAS-Design/eval_mas.py index 03fe7e7..ba6ffd2 100644 --- a/X-MAS-Design/eval_mas.py +++ b/X-MAS-Design/eval_mas.py @@ -665,11 +665,7 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na :return: 返回评分列表 """ # print(eval_data) - if "evaluate" in infer_name: - source_dir = f"./X-MAS-Design/results/{dataset_name}/qwen2.5-32b-instruct_direct_eval.json" - source_map = load_source_data(source_dir) - else: - source_map = {} + source_map = {} eval_content_list, scores = [None] * len(eval_data), [None] * len(eval_data) @@ -702,95 +698,94 @@ def get_evaluation(eval_data, model_url_list, model_name, dataset_name, infer_na if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement") - parser.add_argument("--model_config", type=str, default="config_.json") - parser.add_argument("--dataset_names", type=str, nargs='+', default=["MATH", "GSM8K", "AQUA-RAT", "MedMCQA"]) + parser.add_argument("--eval_model_name", type=str, default="llama-3.1-70b-instruct", help="the LLM for judgement") + parser.add_argument("--model_api_config", type=str, default="configs/X-MAS_Design_config.json") + parser.add_argument("--method_name", type=str, default="vanilla", help="MAS name.") + parser.add_argument("--method_config_name", type=str, default="config_main", help="The method config file name. 
If None, the default config file will be used.") + parser.add_argument("--test_dataset_name", type=str, default="example_math", help="The dataset to be used for testing.") parser.add_argument("--eval_mode", type=str, choices=["test", "train", "bench-test"], required=True) parser.add_argument("--infer_name", type=str, default="mas_3_5cot-sc_general_infer.jsonl") parser.add_argument("--dry_run", action="store_true") parser.add_argument("--sequential", action="store_true") args = parser.parse_args() - + args.infer_name = f"{args.method_name}_{args.method_config_name}.jsonl" print("="*50) print(json.dumps(vars(args), indent=4)) # ================== Define the model list ================== - with open(args.model_config, "r") as f: + with open(args.model_api_config, "r") as f: config = json.load(f) model_dict = config["model_dict"] - worker_dict = config["worker_dict"] - model_list = model_dict[args.model_name] - model_url_list = [item[1] for item in model_list] - max_workers = worker_dict[args.model_name] * len(model_list) + + model_list = model_dict[args.eval_model_name]["model_list"] + model_url_list = [item["model_url"] for item in model_list] + max_workers = model_dict[args.eval_model_name]["max_workers_per_model"] * len(model_list) print(f">> {len(model_url_list)} models will be used for evaluation") # ============== main ============== - for i, dataset_name in enumerate(args.dataset_names): - if dataset_name in ["IFEval"]: - continue - - print('-'*20 + f"\n>> Evaluating {i}-th dataset: {dataset_name}") - if args.eval_mode == "bench-test": - infer_path = f"./X-MAS-Design/results/{dataset_name}/{args.infer_name}" - - - save_eval_path = infer_path.replace(".jsonl", "_eval.json") - - eval_data, existing_eval_data = [], [] - - try: - print(infer_path) - eval_data = [] - with open(infer_path, "r") as f: - tmp = f.readlines() - # eval_data = [json.loads(line) for line in tmp] + dataset_name = args.test_dataset_name + + print('-'*20 + f"\n>> Evaluating dataset: {dataset_name}") + if args.eval_mode == "bench-test": + infer_path = f"./X-MAS-Design/results/{dataset_name}/{args.method_name}/{args.infer_name}" - for line in tmp: - try: - eval_data.append(json.loads(line)) - except Exception as e: - print(line) - print(f"{e}") - print(f">> Before filtering: {len(eval_data)} samples") + save_eval_path = infer_path.replace(".jsonl", "_eval.json") - if os.path.exists(save_eval_path): - with open(save_eval_path, "r") as f: - existing_eval_data = json.load(f) + eval_data, existing_eval_data = [], [] - # 获取已评估过的样本的 query 和 mas_name 的组合,mas_name 不存在时只用 query - evaluated_pairs = { - (item['query'], item['mas_name']) if 'mas_name' in item else item['query'] - for item in existing_eval_data if 'gt_score' in item - } + try: + print(infer_path) + eval_data = [] + with open(infer_path, "r") as f: + tmp = f.readlines() + # eval_data = [json.loads(line) for line in tmp] + + for line in tmp: + try: + eval_data.append(json.loads(line)) + except Exception as e: + print(line) + print(f"{e}") + + print(f">> Before filtering: {len(eval_data)} samples") + + if os.path.exists(save_eval_path): + with open(save_eval_path, "r") as f: + existing_eval_data = json.load(f) + + # 获取已评估过的样本的 query 和 mas_name 的组合,mas_name 不存在时只用 query + evaluated_pairs = { + (item['query'], item['mas_name']) if 'mas_name' in item else item['query'] + for item in existing_eval_data if 'gt_score' in item + } - # 筛选出那些没有被评估过的样本 - eval_data = [ - item for item in eval_data - if ('mas_name' in item and (item['query'], item['mas_name']) not in 
evaluated_pairs) or - ('mas_name' not in item and item['query'] not in evaluated_pairs) - ] + # 筛选出那些没有被评估过的样本 + eval_data = [ + item for item in eval_data + if ('mas_name' in item and (item['query'], item['mas_name']) not in evaluated_pairs) or + ('mas_name' not in item and item['query'] not in evaluated_pairs) + ] - print(f">> After filtering: {len(eval_data)} samples") + print(f">> After filtering: {len(eval_data)} samples") - eval_data = eval_data[1:3] if args.dry_run else eval_data + eval_data = eval_data[1:3] if args.dry_run else eval_data - print(f">> Running Loaded {len(eval_data)} samples") + print(f">> Running Loaded {len(eval_data)} samples") - eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.model_name, dataset_name, args.infer_name, max_workers, args.sequential) + eval_content_list, score_list = get_evaluation(eval_data, model_url_list, args.eval_model_name, dataset_name, args.infer_name, max_workers, args.sequential) - # mapping the response back to the original query - for i, eval_content, score in zip(range(len(eval_data)), eval_content_list, score_list): - - # 将评分加入到responses中 - eval_data[i]['eval_content'] = eval_content - eval_data[i]['gt_score'] = score - existing_eval_data.append(eval_data[i]) + # mapping the response back to the original query + for i, eval_content, score in zip(range(len(eval_data)), eval_content_list, score_list): + + # 将评分加入到responses中 + eval_data[i]['eval_content'] = eval_content + eval_data[i]['gt_score'] = score + existing_eval_data.append(eval_data[i]) - print(f">> Finished evaluating {len(eval_data)} samples") + print(f">> Finished evaluating {len(eval_data)} samples") - with open(save_eval_path, "w") as f: - json.dump(existing_eval_data, f, ensure_ascii=False, indent=4) - except Exception as e: - print(f"Error occurred during evaluation: {e}") - continue + with open(save_eval_path, "w") as f: + json.dump(existing_eval_data, f, ensure_ascii=False, indent=4) + except Exception as e: + print(f"Error occurred during evaluation: {e}") diff --git a/X-MAS-Design/inference_mas.py b/X-MAS-Design/inference_mas.py index 60de659..c2032a1 100644 --- a/X-MAS-Design/inference_mas.py +++ b/X-MAS-Design/inference_mas.py @@ -56,10 +56,10 @@ def reserve_unprocessed(output_json, test_dataset): parser = argparse.ArgumentParser() # args related to the method parser.add_argument("--method_name", type=str, default="vanilla", help="MAS name.") - parser.add_argument("--method_config_name", type=str, default=None, help="The config file name. If None, the default config file will be used.") + parser.add_argument("--method_config_name", type=str, default="config_main", help="The method config file name. 
If None, the default config file will be used.") # args related to the model - parser.add_argument("--model_name", type=str, default="gpt-4o-mini-2024-07-18", help="The agent backend to be used for inference.") + # parser.add_argument("--model_name", type=str, default="gpt-4o-mini-2024-07-18", help="The agent backend to be used for inference.") parser.add_argument("--model_api_config", type=str, default="configs/X-MAS_Design_config.json") parser.add_argument("--model_temperature", type=float, default=0.5, help="Temperature for sampling.") parser.add_argument("--model_max_tokens", type=int, default=2048, help="Maximum tokens for sampling.") @@ -80,7 +80,7 @@ def reserve_unprocessed(output_json, test_dataset): model_api_config = load_model_api_config(args.model_api_config) general_config.update({"model_api_config": model_api_config}) print("-"*50, f"\n>> Model API config: {general_config['model_api_config']}") - print("-"*50, f"\n>> Model API config for X-MAS: {model_api_config[args.model_name]}") + print("-"*50, f"\n>> method config for X-MAS: {args.method_config_name}") if args.debug: # MAS inference @@ -104,7 +104,7 @@ def reserve_unprocessed(output_json, test_dataset): test_dataset = json.load(f) # get output path - output_path = args.output_path if args.output_path is not None else f"./X-MAS-Design/results/{args.test_dataset_name}/{args.method_name}_manual_4m_infer.jsonl" + output_path = args.output_path if args.output_path is not None else f"./X-MAS-Design/results/{args.test_dataset_name}/{args.method_name}/{args.method_name}_{args.method_config_name}.jsonl" os.makedirs(os.path.dirname(output_path), exist_ok=True) # reserve unprocessed samples @@ -122,7 +122,7 @@ def reserve_unprocessed(output_json, test_dataset): for sample in test_dataset: process_sample(args, general_config, sample, output_path) else: - max_workers = model_api_config[args.model_name]["max_workers"] + max_workers = model_api_config["gpt-4o-mini-2024-07-18"]["max_workers"] with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: for _ in tqdm(executor.map(lambda sample: process_sample(args, general_config, sample, output_path, lock), test_dataset), total=len(test_dataset), desc=f"Processing queries with {args.method_name} on {args.test_dataset_name}"): pass \ No newline at end of file diff --git a/X-MAS-Design/methods/llm_debate/configs/config_general.yaml b/X-MAS-Design/methods/llm_debate/configs/config_main.yaml similarity index 100% rename from X-MAS-Design/methods/llm_debate/configs/config_general.yaml rename to X-MAS-Design/methods/llm_debate/configs/config_main.yaml diff --git a/X-MAS-Design/methods/llm_debate/llm_debate_main.py b/X-MAS-Design/methods/llm_debate/llm_debate_main.py index 94155ce..e52a4ce 100644 --- a/X-MAS-Design/methods/llm_debate/llm_debate_main.py +++ b/X-MAS-Design/methods/llm_debate/llm_debate_main.py @@ -3,7 +3,7 @@ from ..utils import load_config class LLMDebate(MAS): - def __init__(self, general_config, method_config_name="config_general"): + def __init__(self, general_config, method_config_name="config_main"): super().__init__(general_config) self.method_config = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", f"{method_config_name}.yaml")) diff --git a/X-MAS-Design/methods/mas_base.py b/X-MAS-Design/methods/mas_base.py index d4f9cc9..d3aebc8 100644 --- a/X-MAS-Design/methods/mas_base.py +++ b/X-MAS-Design/methods/mas_base.py @@ -12,15 +12,12 @@ class MAS(): def __init__(self, general_config, method_config_name=None): 
self.model_api_config = general_config["model_api_config"] - self.model_name = general_config["model_name"] self.model_temperature = general_config["model_temperature"] self.model_max_tokens = general_config["model_max_tokens"] self.model_timeout = general_config["model_timeout"] # Tracking compute costs - self.token_stats = { - self.model_name: {"num_llm_calls": 0, "prompt_tokens": 0, "completion_tokens": 0} - } + self.token_stats = {} self.memory_bank = {} self.tools = {} @@ -35,7 +32,7 @@ def inference(self, query): @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(5), retry_error_callback=handle_retry_error) def call_llm(self, prompt=None, system_prompt=None, messages=None, model_name=None, temperature=None): - model_name = model_name if model_name is not None else self.model_name + model_name = model_name if model_name is not None else "gpt-4o-mini-2024-07-18" try: model_dict = random.choice(self.model_api_config[model_name]["model_list"]) except Exception as e: diff --git a/configs/X-MAS_Bench_config.json b/configs/X-MAS_Bench_config.json index 9655fdf..7df61d0 100644 --- a/configs/X-MAS_Bench_config.json +++ b/configs/X-MAS_Bench_config.json @@ -2,20 +2,20 @@ "model_dict": { "qwen-2.5-32b-instruct": { "model_list": [ - {"model_name": "qwen-2.5-32b-instruct", "model_url": "http://:8090/v1", "api_key": "EMPTY"} + {"model_name": "qwen-2.5-32b-instruct", "model_url": "http://a.b.c.d:e/v1", "api_key": "EMPTY"} ], "max_workers_per_model": 25 }, "gpt-4o-mini-2024-07-18": { "model_list": [ + {"model_name": "gpt-4o-mini-2024-07-18", "model_url": "", "api_key": ""}, {"model_name": "gpt-4o-mini-2024-07-18", "model_url": "", "api_key": ""} ], "max_workers_per_model": 25 }, "llama-3.1-70b-instruct": { "model_list": [ - {"model_name": "llama-3.1-70b-instruct", "model_url": "http://../v1", "api_key": "EMPTY"}, - {"model_name": "llama-3.1-70b-instruct", "model_url": "http://../v1", "api_key": "EMPTY"} + {"model_name": "llama-3.1-70b-instruct", "model_url": "http://a.b.c.d:e/v1", "api_key": "EMPTY"} ], "max_workers_per_model": 25 } diff --git a/configs/X-MAS_Design_config.json b/configs/X-MAS_Design_config.json index ec66f03..7df61d0 100644 --- a/configs/X-MAS_Design_config.json +++ b/configs/X-MAS_Design_config.json @@ -2,22 +2,22 @@ "model_dict": { "qwen-2.5-32b-instruct": { "model_list": [ - {"model_name": "qwen-2.5-32b-instruct", "model_url": "http://:8090/v1", "api_key": "EMPTY"} + {"model_name": "qwen-2.5-32b-instruct", "model_url": "http://a.b.c.d:e/v1", "api_key": "EMPTY"} ], "max_workers_per_model": 25 }, "gpt-4o-mini-2024-07-18": { "model_list": [ + {"model_name": "gpt-4o-mini-2024-07-18", "model_url": "", "api_key": ""}, {"model_name": "gpt-4o-mini-2024-07-18", "model_url": "", "api_key": ""} ], "max_workers_per_model": 25 }, "llama-3.1-70b-instruct": { "model_list": [ - {"model_name": "llama-3.1-70b-instruct", "model_url": "http://../v1", "api_key": "EMPTY"}, - {"model_name": "llama-3.1-70b-instruct", "model_url": "http://../v1", "api_key": "EMPTY"} + {"model_name": "llama-3.1-70b-instruct", "model_url": "http://a.b.c.d:e/v1", "api_key": "EMPTY"} ], "max_workers_per_model": 25 } } -} +} \ No newline at end of file diff --git a/scripts/eval_X-MAS_Bench.sh b/scripts/eval_X-MAS_Bench.sh index 1c5e13c..8f07f64 100644 --- a/scripts/eval_X-MAS_Bench.sh +++ b/scripts/eval_X-MAS_Bench.sh @@ -1,32 +1,18 @@ #!/bin/bash config_path="./configs/X-MAS_Bench_config.json" -TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" 
"HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" "SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") -# TEST_DATASET_NAMES=("AIME-2024" "FinanceBench" "FinQA" "FPB" "MBPP-Plus" "MBPP") -# TEST_DATASET_NAMES=("AIME-2024") +# TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" "HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" "SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") +TEST_DATASET_NAMES=("AIME-2024") +MODEL_NAMES=("qwen-2.5-32b-instruct") +FUNCTION_NAMES=("qa" "aggregation" "planning" "revise" "evaluation") -infer_names=( - "qwen-2.5-7b-instruct_direct.jsonl" - "qwen-2.5-14b-instruct_direct.jsonl" - "qwen-2.5-32b-instruct_direct.jsonl" - "qwen-2.5-72b-instruct_direct.jsonl" - "qwen-2.5-coder-7b-instruct_direct.jsonl" - "qwen-2.5-coder-14b-instruct_direct.jsonl" - "qwen-2.5-coder-32b-instruct_direct.jsonl" - "qwen-2.5-math-7b-instruct_direct.jsonl" - "qwen-2.5-math-72b-instruct_direct.jsonl" - "llama-3.1-8b-instruct_direct.jsonl" - "llama-3.1-70b-instruct_direct.jsonl" - "chemdfm-v1.5-8b_direct.jsonl" - "llama3-xuanyuan3-70b-chat_direct.jsonl" - "llama3-openbiollm-70b_direct.jsonl" -) - for dataset_name in "${TEST_DATASET_NAMES[@]}"; do - for infer_name in "${infer_names[@]}"; do - python X-MAS-Bench/eval_bench.py --model_name llama-3.1-70b-instruct --model_config $config_path --dataset_name $dataset_name --infer_name $infer_name --eval_mode bench-test + for model_name in "${MODEL_NAMES[@]}"; do + for function_name in "${FUNCTION_NAMES[@]}"; do + python X-MAS-Bench/eval_bench.py --eval_model_name llama-3.1-70b-instruct --model_config $config_path --dataset_name $dataset_name --model_name $model_name --function_name $function_name --eval_mode bench-test + done done done # wait diff --git a/scripts/eval_X-MAS_Design.sh b/scripts/eval_X-MAS_Design.sh index 85e5d92..5d744ea 100644 --- a/scripts/eval_X-MAS_Design.sh +++ b/scripts/eval_X-MAS_Design.sh @@ -1,31 +1,37 @@ #!/bin/bash -config_path="./configs/X-MAS_Bench_config.json" -TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" "HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" "SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") -# TEST_DATASET_NAMES=("AIME-2024" "FinanceBench" "FinQA" "FPB" "MBPP-Plus" "MBPP") -# TEST_DATASET_NAMES=("AIME-2024") - - -infer_names=( - "qwen-2.5-7b-instruct_direct.jsonl" - "qwen-2.5-14b-instruct_direct.jsonl" - "qwen-2.5-32b-instruct_direct.jsonl" - "qwen-2.5-72b-instruct_direct.jsonl" - "qwen-2.5-coder-7b-instruct_direct.jsonl" - "qwen-2.5-coder-14b-instruct_direct.jsonl" - "qwen-2.5-coder-32b-instruct_direct.jsonl" - "qwen-2.5-math-7b-instruct_direct.jsonl" - "qwen-2.5-math-72b-instruct_direct.jsonl" - "llama-3.1-8b-instruct_direct.jsonl" - "llama-3.1-70b-instruct_direct.jsonl" - "chemdfm-v1.5-8b_direct.jsonl" - "llama3-xuanyuan3-70b-chat_direct.jsonl" - "llama3-openbiollm-70b_direct.jsonl" -) - - -for infer_name in "${infer_names[@]}"; do - python X-MAS-Design/eval_mas.py.py --model_name llama-3.1-70b-instruct --model_config $config_path --dataset_names "${TEST_DATASET_NAMES[@]}" --infer_name $infer_name --eval_mode bench-test +model_api_config="./configs/X-MAS_Bench_config.json" +# TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" "HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" 
"SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") +TEST_DATASET_NAMES=("AIME-2024") +METHOD_NAME_LIST=( + x_mas_proto + llm_debate + # dylan + # agentverse +) + +for test_dataset_name in "${TEST_DATASET_NAMES[@]}"; do + for method_name in "${METHOD_NAME_LIST[@]}"; do + python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config $model_api_config --method_name $method_name --test_dataset_name $test_dataset_name --eval_mode bench-test + done done -# wait + +wait + +# ================================== +# If you want to eval llm_debate with your own config or settings, you can uncomment the following lines and modify them accordingly. +# ================================== + +# model_api_config=./configs/X-MAS_Design_config.json + +# method_config_name=config_math + +# test_dataset_name="math-500" + +# method_name="llm_debate" + + +# python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config $model_api_config --method_name $method_name --method_config_name $method_config_name --test_dataset_name $test_dataset_name --eval_mode bench-test + +# wait \ No newline at end of file diff --git a/scripts/infer_X-MAS_Bench.sh b/scripts/infer_X-MAS_Bench.sh index 8a5406a..5356369 100644 --- a/scripts/infer_X-MAS_Bench.sh +++ b/scripts/infer_X-MAS_Bench.sh @@ -1,19 +1,24 @@ #!/bin/bash +# ================================== +# If you want to infer with **revise, aggregation or evaluation**, please make sure you have replace the content of "./X-MAS-Bench/results/" with **source file** in [Google Drive](https://drive.google.com/file/d/1oukYZLDOuc98i-ICkoZ6OYME9a7-AuH1/view?usp=drive_link) first. You can download the .zip file named results.zip to the "./X-MAS-Bench/results/" path and unzip it. + +#If you want to infer with **planning**, please make sure you have load the default source model **"llama-3.1-8b-instruct", "qwen-2.5-7b-instruct" and "qwen-2.5-14b-instruct"**. 
+ +# ================================== + config_path="./configs/X-MAS_Bench_config.json" -# TEST_DATASET_NAMES=("MedQA" "MedMCQA") -# model_names=("deepseek-r1-distill-qwen-32b" "llama-3.3-70b-instruct" "qwen2.5-32b-instruct") TEST_DATASET_NAMES=("AIME-2024") model_names=("qwen-2.5-32b-instruct") -run_direct() { +run_qa() { local model_name=$1 dataset_name=$2 - python X-MAS-Bench/infer_direct.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name + python X-MAS-Bench/infer_qa.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name } -run_aggregate() { +run_aggregation() { local model_name=$1 dataset_name=$2 - python X-MAS-Bench/infer_aggregate.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name + python X-MAS-Bench/infer_aggregation.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name } @@ -22,24 +27,25 @@ run_revise() { python X-MAS-Bench/infer_revise.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name } -run_evaluate() { +run_evaluation() { local model_name=$1 dataset_name=$2 - python X-MAS-Bench/infer_evaluate.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name + python X-MAS-Bench/infer_evaluation.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name } -run_plan() { +run_planning() { local model_name=$1 dataset_name=$2 - python X-MAS-Bench/infer_plan.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name + python X-MAS-Bench/infer_planning.py --model_name $model_name --model_config $config_path --test_dataset_name $dataset_name } run_function() { local model_name=$1 dataset_name=$2 - run_direct "$model_name" "$dataset_name" - run_aggregate "$model_name" "$dataset_name" - run_plan "$model_name" "$dataset_name" + run_qa "$model_name" "$dataset_name" + run_aggregation "$model_name" "$dataset_name" + run_planning "$model_name" "$dataset_name" run_revise "$model_name" "$dataset_name" - run_evaluate "$model_name" "$dataset_name" + run_evaluation "$model_name" "$dataset_name" } + for dataset_name in "${TEST_DATASET_NAMES[@]}"; do for model_name in "${model_names[@]}"; do run_function "$model_name" "$dataset_name" & diff --git a/scripts/infer_X-MAS_Design.sh b/scripts/infer_X-MAS_Design.sh index 052f7d6..d7377c9 100644 --- a/scripts/infer_X-MAS_Design.sh +++ b/scripts/infer_X-MAS_Design.sh @@ -1,23 +1,36 @@ model_api_config=./configs/X-MAS_Design_config.json -model_name=qwen-2.5-32b-instruct - -# ================================== -# TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" "HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" "SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") TEST_DATASET_NAMES=("AIME-2024") +# TEST_DATASET_NAMES=("GSM-Hard" "MATH-500" "AQUA-RAT" "AIME-2024" "MBPP-Plus" "MBPP" "HumanEval" "HumanEval-Plus" "MedQA" "MedMCQA" "PubMedQA" "FinanceBench" "FinQA" "FPB" "SciEval" "SciKnowEval" "SciBench" "GPQA" "GPQA-Diamond" "MMLU-Pro" "MMLU") METHOD_NAME_LIST=( x_mas_proto - # llm_debate + llm_debate # dylan # agentverse ) for test_dataset_name in "${TEST_DATASET_NAMES[@]}"; do for method_name in "${METHOD_NAME_LIST[@]}"; do - python X-MAS-Design/inference_mas.py --method_name $method_name --model_name $model_name --test_dataset_name $test_dataset_name --model_api_config $model_api_config + 
python X-MAS-Design/inference_mas.py --model_api_config $model_api_config --method_name $method_name --test_dataset_name $test_dataset_name done done wait +# ================================== +# If you want to run llm_debate with your own config or settings, you can uncomment the following lines and modify them accordingly. +# ================================== + +# model_api_config=./configs/X-MAS_Design_config.json + +# method_config_name=config_math + +# test_dataset_name="math-500" + +# method_name="llm_debate" + + +# python X-MAS-Design/inference_mas.py --model_api_config $model_api_config --method_name $method_name --method_config_name $method_config_name --test_dataset_name $test_dataset_name + +# wait
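Putting the X-MAS-Bench renames together, the sketch below is a minimal end-to-end pass under the new function naming (`qa`, `aggregation`, `planning`, `revise`, `evaluation`) and the new `results/<dataset>/<function>/` layout introduced in this diff. The dataset and model names are illustrative values taken from the example scripts above, and it assumes the results.zip source files and the planning source models mentioned in the README are already in place.

```
#!/bin/bash
# Minimal X-MAS-Bench walkthrough (illustrative; script names and flags follow this diff).
config_path="./configs/X-MAS_Bench_config.json"
dataset="AIME-2024"               # illustrative dataset from the scripts above
model="qwen-2.5-32b-instruct"     # illustrative model from the scripts above

# 1. QA inference -> ./X-MAS-Bench/results/AIME-2024/qa/qwen-2.5-32b-instruct_qa.jsonl
python X-MAS-Bench/infer_qa.py --model_name "$model" --model_config "$config_path" --test_dataset_name "$dataset"

# 2. Function-specific inference; aggregation/revise/evaluation build on existing qa outputs,
#    while planning reads the benchmark JSON directly.
for fn in aggregation planning revise evaluation; do
    python "X-MAS-Bench/infer_${fn}.py" --model_name "$model" --model_config "$config_path" --test_dataset_name "$dataset"
done

# 3. Judge every function's outputs; eval_bench.py now derives the input file as
#    results/<dataset>/<function>/<model>_<function>.jsonl and writes *_eval.json next to it.
for fn in qa aggregation planning revise evaluation; do
    python X-MAS-Bench/eval_bench.py --eval_model_name llama-3.1-70b-instruct --model_config "$config_path" \
        --dataset_name "$dataset" --model_name "$model" --function_name "$fn" --eval_mode bench-test
done
```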
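Similarly for X-MAS-Design, here is a minimal sketch of the renamed inference/evaluation pair. The method, config name, and dataset are illustrative values from the scripts above, and the per-method model list is assumed to be set in that method's `config_main.yaml`.

```
#!/bin/bash
# Minimal X-MAS-Design walkthrough (illustrative; entry points and flags follow this diff).
model_api_config="./configs/X-MAS_Design_config.json"
dataset="AIME-2024"     # illustrative
method="llm_debate"     # illustrative; its model list lives in methods/llm_debate/configs/config_main.yaml

# Inference writes ./X-MAS-Design/results/AIME-2024/llm_debate/llm_debate_config_main.jsonl
python X-MAS-Design/inference_mas.py --model_api_config "$model_api_config" \
    --method_name "$method" --method_config_name config_main --test_dataset_name "$dataset"

# Evaluation rebuilds the same file name from --method_name/--method_config_name
# and writes llm_debate_config_main_eval.json alongside it.
python X-MAS-Design/eval_mas.py --eval_model_name llama-3.1-70b-instruct --model_api_config "$model_api_config" \
    --method_name "$method" --method_config_name config_main --test_dataset_name "$dataset" --eval_mode bench-test
```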