Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix 1514 pr #1572

Merged
merged 3 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions metagpt/ext/sela/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@ You can either download the datasets from the link or prepare the datasets from
- **Download Datasets:** [Dataset Link](https://drive.google.com/drive/folders/151FIZoLygkRfeJgSI9fNMiLsixh1mK0r?usp=sharing)
- **Download and prepare datasets from scratch:**
```bash
cd data
python dataset.py --save_analysis_pool
python hf_data.py --save_analysis_pool
python data/dataset.py --save_analysis_pool
python data/hf_data.py --save_analysis_pool
```

## 2. Configurations
Expand Down
6 changes: 4 additions & 2 deletions metagpt/ext/sela/data/custom_task.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
from pathlib import Path

from metagpt.ext.sela.data.dataset import SPECIAL_INSTRUCTIONS
from metagpt.ext.sela.runner.mle_bench.instructions import (
ADDITIONAL_NOTES,
INSTRUCTIONS,
INSTRUCTIONS_OBFUSCATED,
)
from metagpt.ext.sela.utils import mcts_logger

MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]

Expand Down Expand Up @@ -61,7 +63,7 @@ def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, ob
instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
task_file = "description.md"

with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
with open(Path(dataset_dir) / task_file, encoding="utf-8") as f:
task_description = f.read()
mle_requirement = MLE_REQUIREMENTS.format(
instructions=instructions,
Expand All @@ -70,5 +72,5 @@ def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, ob
output_dir=output_dir,
special_instruction=special_instruction,
)
print(mle_requirement)
mcts_logger.info(mle_requirement)
return mle_requirement
38 changes: 18 additions & 20 deletions metagpt/ext/sela/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.model_selection import train_test_split

from metagpt.ext.sela.insights.solution_designer import SolutionDesigner
from metagpt.ext.sela.utils import DATA_CONFIG
from metagpt.ext.sela.utils import DATA_CONFIG, mcts_logger

BASE_USER_REQUIREMENT = """
This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
Expand Down Expand Up @@ -113,15 +113,15 @@ def get_split_dataset_path(dataset_name, config):
datasets_dir = config["datasets_dir"]
if dataset_name in config["datasets"]:
dataset = config["datasets"][dataset_name]
data_path = os.path.join(datasets_dir, dataset["dataset"])
data_path = Path(datasets_dir) / dataset["dataset"]
split_datasets = {
"train": os.path.join(data_path, "split_train.csv"),
"dev": os.path.join(data_path, "split_dev.csv"),
"dev_wo_target": os.path.join(data_path, "split_dev_wo_target.csv"),
"dev_target": os.path.join(data_path, "split_dev_target.csv"),
"test": os.path.join(data_path, "split_test.csv"),
"test_wo_target": os.path.join(data_path, "split_test_wo_target.csv"),
"test_target": os.path.join(data_path, "split_test_target.csv"),
"train": data_path / "split_train.csv",
"dev": data_path / "split_dev.csv",
"dev_wo_target": data_path / "split_dev_wo_target.csv",
"dev_target": data_path / "split_dev_target.csv",
"test": data_path / "split_test.csv",
"test_wo_target": data_path / "split_test_wo_target.csv",
"test_target": data_path / "split_test_target.csv",
}
return split_datasets
else:
Expand All @@ -131,10 +131,8 @@ def get_split_dataset_path(dataset_name, config):


def get_user_requirement(task_name, config):
# datasets_dir = config["datasets_dir"]
if task_name in config["datasets"]:
dataset = config["datasets"][task_name]
# data_path = os.path.join(datasets_dir, dataset["dataset"])
user_requirement = dataset["user_requirement"]
return user_requirement
else:
Expand Down Expand Up @@ -191,7 +189,7 @@ def generate_task_requirement(task_name, data_config, is_di=True, special_instru
additional_instruction=additional_instruction,
data_info_path=data_info_path,
)
print(user_requirement)
mcts_logger.info(user_requirement)
return user_requirement


Expand Down Expand Up @@ -220,22 +218,22 @@ def check_dataset_exists(self):
"split_test_target.csv",
]
for fname in fnames:
if not os.path.exists(Path(self.dataset_dir, self.name, fname)):
if not Path(self.dataset_dir, self.name, fname).exists():
return False
return True

def check_datasetinfo_exists(self):
return os.path.exists(Path(self.dataset_dir, self.name, "dataset_info.json"))
return Path(self.dataset_dir, self.name, "dataset_info.json").exists()

def get_raw_dataset(self):
raw_dir = Path(self.dataset_dir, self.name, "raw")
train_df = None
test_df = None
if not os.path.exists(Path(raw_dir, "train.csv")):
if not Path(raw_dir, "train.csv").exists():
raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
else:
train_df = pd.read_csv(Path(raw_dir, "train.csv"))
if os.path.exists(Path(raw_dir, "test.csv")):
if Path(raw_dir, "test.csv").exists():
test_df = pd.read_csv(Path(raw_dir, "test.csv"))
return train_df, test_df

Expand Down Expand Up @@ -286,16 +284,16 @@ def create_base_requirement(self):
def save_dataset(self, target_col):
df, test_df = self.get_raw_dataset()
if not self.check_dataset_exists() or self.force_update:
print(f"Saving Dataset {self.name} in {self.dataset_dir}")
mcts_logger.info(f"Saving Dataset {self.name} in {self.dataset_dir}")
self.split_and_save(df, target_col, test_df=test_df)
else:
print(f"Dataset {self.name} already exists")
mcts_logger.info(f"Dataset {self.name} already exists")
if not self.check_datasetinfo_exists() or self.force_update:
print(f"Saving Dataset info for {self.name}")
mcts_logger.info(f"Saving Dataset info for {self.name}")
dataset_info = self.get_dataset_info()
self.save_datasetinfo(dataset_info)
else:
print(f"Dataset info for {self.name} already exists")
mcts_logger.info(f"Dataset info for {self.name} already exists")

def save_datasetinfo(self, dataset_info):
with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w", encoding="utf-8") as file:
Expand Down
5 changes: 2 additions & 3 deletions metagpt/ext/sela/data/hf_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import asyncio
import io
import os
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -63,7 +62,7 @@ def get_raw_dataset(self):
raw_dir = Path(self.dataset_dir, self.name, "raw")
raw_dir.mkdir(parents=True, exist_ok=True)

if os.path.exists(Path(raw_dir, "train.csv")):
if Path(raw_dir, "train.csv").exists():
df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
else:
df = self.dataset["train"].to_pandas()
Expand All @@ -73,7 +72,7 @@ def get_raw_dataset(self):

df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")

if os.path.exists(Path(raw_dir, "test.csv")):
if Path(raw_dir, "test.csv").exists():
test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8")
else:
if self.dataset and "test" in self.dataset:
Expand Down
3 changes: 2 additions & 1 deletion metagpt/ext/sela/evaluation/visualize_mcts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import networkx as nx

from metagpt.ext.sela.search.tree_search import Node
from metagpt.ext.sela.utils import mcts_logger

NODE_TEMPLATE = """\
[Node {id}]
Expand Down Expand Up @@ -139,7 +140,7 @@ def build_tree_recursive(graph, parent_id, node, node_order, start_task_id=2):
instruction = "\n\n".join([role.planner.plan.tasks[i].instruction for i in range(start_task_id)])
else:
instruction = role.planner.plan.tasks[depth + start_task_id - 1].instruction
print(instruction)
mcts_logger.info(instruction)
# Add the current node with attributes to the graph
dev_score = node.raw_reward.get("dev_score", 0) * 100
avg_score = node.avg_value() * 100
Expand Down
8 changes: 4 additions & 4 deletions metagpt/ext/sela/experimenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import asyncio
import json
import os
from pathlib import Path

from pydantic import model_validator

Expand Down Expand Up @@ -133,9 +133,9 @@ def set_plan_and_tool(self) -> "Interpreter":
if self.planner.plan.goal != "":
self.set_actions([WriteAnalysisCode])
self._set_state(0)
print("Plan already exists, skipping initialization.")
mcts_logger.info("Plan already exists, skipping initialization.")
return self
print("Initializing plan and tool...")
mcts_logger.info("Initializing plan and tool...")
return super().set_plan_and_tool()

async def _act_on_task(self, current_task: Task) -> TaskResult:
Expand Down Expand Up @@ -172,7 +172,7 @@ def save_state(self, static_save=False):
mcts_logger.log("MCTS", "Static Saving")
stg_path = self.role_dir
name = self.get_node_name()
role_path = os.path.join(stg_path, f"{name}.json")
role_path = Path(stg_path) / f"{name}.json"
# save state as json file
write_json_file(role_path, self.model_dump())

Expand Down
8 changes: 5 additions & 3 deletions metagpt/ext/sela/runner/aide.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import aide

from metagpt.ext.sela.utils import mcts_logger

os.environ["OPENAI_API_KEY"] = "sk-xxx"
os.environ["OPENAI_BASE_URL"] = "your url"

Expand All @@ -27,9 +29,9 @@

best_solution = exp.run(steps=10)

print(f"Best solution has validation metric: {best_solution.valid_metric}")
print(f"Best solution code: {best_solution.code}")
mcts_logger.info(f"Best solution has validation metric: {best_solution.valid_metric}")
mcts_logger.info(f"Best solution code: {best_solution.code}")
end_time = time.time()
execution_time = end_time - start_time

print(f"run time : {execution_time} seconds")
mcts_logger.info(f"run time : {execution_time} seconds")
15 changes: 7 additions & 8 deletions metagpt/ext/sela/runner/autogluon.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
from datetime import datetime
from pathlib import Path

import pandas as pd

from metagpt.ext.sela.runner.custom import CustomRunner
from metagpt.ext.sela.utils import DATA_CONFIG


class AGRunner:
Expand Down Expand Up @@ -80,7 +81,7 @@ def load_split_dataset(self, train_path, dev_path, dev_wo_target_path, test_wo_t
"""

# Define the root path to append
root_folder = os.path.join("F:/Download/Dataset/", self.state["task"])
root_folder = Path(DATA_CONFIG["datasets_dir"]) / self.state["task"]

# Load the datasets
train_data = pd.read_csv(train_path)
Expand All @@ -92,12 +93,10 @@ def load_split_dataset(self, train_path, dev_path, dev_wo_target_path, test_wo_t
image_column = train_data.columns[0]

# Append root folder path to the image column in each dataset
train_data[image_column] = train_data[image_column].apply(lambda x: os.path.join(root_folder, x))
dev_data[image_column] = dev_data[image_column].apply(lambda x: os.path.join(root_folder, x))
dev_wo_target_data[image_column] = dev_wo_target_data[image_column].apply(
lambda x: os.path.join(root_folder, x)
)
test_data[image_column] = test_data[image_column].apply(lambda x: os.path.join(root_folder, x))
train_data[image_column] = train_data[image_column].apply(lambda x: Path(root_folder) / x)
dev_data[image_column] = dev_data[image_column].apply(lambda x: Path(root_folder) / x)
dev_wo_target_data[image_column] = dev_wo_target_data[image_column].apply(lambda x: Path(root_folder) / x)
test_data[image_column] = test_data[image_column].apply(lambda x: Path(root_folder) / x)

return train_data, dev_data, dev_wo_target_data, test_data

Expand Down
4 changes: 2 additions & 2 deletions metagpt/ext/sela/runner/custom.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
from pathlib import Path

import pandas as pd

Expand Down Expand Up @@ -47,7 +47,7 @@ def evaluate_pred_files(self, dev_pred_path, test_pred_path):

def evaluate_predictions(self, preds, split):
metric = self.state["dataset_config"]["metric"]
gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
gt_path = Path(self.state["datasets_dir"][f"{split}_target"])
gt = pd.read_csv(gt_path)["target"]
score = evaluate_score(preds, gt, metric)
return score
Expand Down
3 changes: 2 additions & 1 deletion metagpt/ext/sela/runner/mcts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from metagpt.ext.sela.evaluation.visualize_mcts import get_tree_text
from metagpt.ext.sela.runner.runner import Runner
from metagpt.ext.sela.search.search_algorithm import MCTS, Greedy, Random
from metagpt.ext.sela.utils import mcts_logger


class MCTSRunner(Runner):
Expand Down Expand Up @@ -46,7 +47,7 @@ async def run_experiment(self):
text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
text += f"Grader score: {additional_scores['grader']}\n"
print(text)
mcts_logger.info(text)
results = [
{
"best_node": best_node.id,
Expand Down
4 changes: 2 additions & 2 deletions metagpt/ext/sela/runner/random_search.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from metagpt.ext.sela.experimenter import Experimenter
from metagpt.ext.sela.insights.instruction_generator import InstructionGenerator
from metagpt.ext.sela.runner.runner import Runner
from metagpt.ext.sela.utils import get_exp_pool_path
from metagpt.ext.sela.utils import get_exp_pool_path, mcts_logger

EXPS_PROMPT = """
When doing the tasks, you can refer to the insights below:
Expand Down Expand Up @@ -37,7 +37,7 @@ async def run_experiment(self):
di = Experimenter(node_id=str(i), use_reflection=self.args.reflection, role_timeout=self.args.role_timeout)
di.role_dir = f"{di.role_dir}_{self.args.task}"
requirement = user_requirement + EXPS_PROMPT.format(experience=exps[i])
print(requirement)
mcts_logger.info(requirement)
score_dict = await self.run_di(di, requirement, run_idx=i)
results.append(
{
Expand Down
11 changes: 6 additions & 5 deletions metagpt/ext/sela/runner/runner.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import datetime
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

from metagpt.ext.sela.evaluation.evaluation import evaluate_score
from metagpt.ext.sela.experimenter import Experimenter
from metagpt.ext.sela.search.tree_search import create_initial_state
from metagpt.ext.sela.utils import DATA_CONFIG, save_notebook
from metagpt.ext.sela.utils import DATA_CONFIG, mcts_logger, save_notebook


class Runner:
Expand Down Expand Up @@ -38,7 +39,7 @@ async def run_di(self, di, user_requirement, run_idx):
score_dict = self.evaluate(score_dict, self.state)
run_finished = True
except Exception as e:
print(f"Error: {e}")
mcts_logger.info(f"Error: {e}")
num_runs += 1
# save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
save_name = self.get_save_name()
Expand Down Expand Up @@ -94,10 +95,10 @@ async def run_experiment(self):
self.save_result(results)

def evaluate_prediction(self, split, state):
pred_path = os.path.join(state["work_dir"], state["task"], f"{split}_predictions.csv")
pred_path = Path(state["work_dir"]) / state["task"] / f"{split}_predictions.csv"
os.makedirs(state["node_dir"], exist_ok=True)
pred_node_path = os.path.join(state["node_dir"], f"{self.start_time}-{split}_predictions.csv")
gt_path = os.path.join(state["datasets_dir"][f"{split}_target"])
pred_node_path = Path(state["node_dir"]) / f"{self.start_time}-{split}_predictions.csv"
gt_path = Path(state["datasets_dir"]) / f"{split}_target.csv"
preds = pd.read_csv(pred_path)
preds = preds[preds.columns.tolist()[-1]]
preds.to_csv(pred_node_path, index=False)
Expand Down
Loading
Loading