1 change: 0 additions & 1 deletion instruct/code_eval/evalplus/.gitignore
@@ -164,7 +164,6 @@ cython_debug/
.vscode/

# EvalPlus specific
-EvalPlus/
backup/
passrate.p*
min_cov_dir/
4 changes: 4 additions & 0 deletions instruct/code_eval/evalplus/evalplus/__init__.py
@@ -0,0 +1,4 @@
try:
from evalplus._version import __version__, __version_tuple__
except ImportError:
__version__ = "local-dev"
287 changes: 287 additions & 0 deletions instruct/code_eval/evalplus/evalplus/codegen.py
@@ -0,0 +1,287 @@
import json
import os
from typing import Dict, List, Optional

from evalplus.data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
from evalplus.provider import DecoderBase, make_model
from evalplus.sanitize import sanitize
from evalplus.utils import progress


def codegen(
target_path: str,
model: DecoderBase,
dataset: Dict,
greedy=False,
n_samples=1,
id_range=None,
resume=True,
):
    # Count existing samples per task so resume mode can skip completed work
    task2nexist = {}
if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
task_id = json.loads(line)["task_id"]
task2nexist[task_id] = task2nexist.get(task_id, 0) + 1

if target_path.endswith(".jsonl"):
raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
else:
raw_target_path = target_path + ".raw"
        # Create the raw-output root in directory mode (the sanitized root is
        # created by the per-task makedirs below)
        os.makedirs(raw_target_path, exist_ok=True)

print(f"Sanitized code outputs will be saved to {target_path}")
print(f"Raw outputs will be saved to {raw_target_path}")

backend_type: str = type(model).__name__
with progress(backend_type) as p:
for task_id, task in p.track(dataset.items()):
if id_range is not None:
id_num = int(task_id.split("/")[1])
low, high = id_range
if id_num < low or id_num >= high:
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
continue

if not target_path.endswith(".jsonl"):
p_name = task_id.replace("/", "_")
                os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
                # Also create the matching raw-output directory, which the
                # raw writes below expect to exist
                os.makedirs(os.path.join(raw_target_path, p_name), exist_ok=True)
task2nexist[task_id] = len(
[
f
for f in os.listdir(os.path.join(target_path, p_name))
if f.endswith(".py")
]
)

n_more_samples = n_samples
log = f"Codegen: {task_id} @ {model}"
if resume and task2nexist.get(task_id, 0) > 0:
log += f" (resuming from {task2nexist[task_id]})"
n_more_samples -= task2nexist[task_id]

p.console.print(log)

            sidx = n_samples - n_more_samples  # index of the next sample to generate
while sidx < n_samples:
prompt = task["prompt"].strip() + "\n"
outputs = model.codegen(
prompt,
do_sample=not greedy,
num_samples=n_samples - sidx,
)
assert outputs, "No outputs from model!"
for impl in outputs:
solution = prompt + impl if model.is_direct_completion() else impl
sanitized_solution = sanitize(
solution, entrypoint=task["entry_point"]
)
if target_path.endswith(".jsonl"):
# Writing the sanitized version
with open(target_path, "a") as f:
f.write(
json.dumps(
{"task_id": task_id, "solution": sanitized_solution}
)
+ "\n"
)

# Writing the raw version
with open(raw_target_path, "a") as f:
f.write(
json.dumps({"task_id": task_id, "solution": solution})
+ "\n"
)
else:
# Writing the sanitized version
with open(
os.path.join(target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(sanitized_solution)

# Writing the raw version
with open(
os.path.join(raw_target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(solution)
sidx += 1


def run_codegen(
model: str,
dataset: str,
root: str = "evalplus_results",
bs: Optional[int] = None,
n_samples: int = 1,
temperature: float = 0.0,
resume: bool = True,
greedy: bool = False,
    id_range: Optional[List] = None,
version: str = "default",
backend: str = "vllm",
force_base_prompt: bool = False,
    base_url: Optional[str] = None,
tp: int = 1,
    evalperf_type: Optional[str] = None,  # For EvalPerf
jsonl_fmt: bool = True,
attn_implementation: str = "eager",
device_map: Optional[str] = None,
trust_remote_code: bool = False,
enable_prefix_caching: bool = False,
enable_chunked_prefill: bool = False,
dtype: str = "bfloat16",
gptqmodel_backend: str = "auto", # For GPTQModel
gguf_file: Optional[str] = None,
**kwargs,
):
assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
assert evalperf_type is None or evalperf_type in [
"instruct",
"perf-instruct",
"perf-CoT",
]

# Make dir for codes generated by each model
identifier = model.replace("/home/ndfl4zki/ndfl4zkiuser04", "")
identifier = identifier.replace("/ckpts/rl/diff-rl", "")
identifier = identifier.replace("/actor/huggingface", "")
identifier = (
identifier.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
)
if "max_new_tokens" in kwargs:
identifier += f"_len_{kwargs['max_new_tokens']}"
if kwargs.get("alg", None) is not None:
identifier += f"_alg_{kwargs['alg']}"
if evalperf_type:
identifier += f"-{evalperf_type}"

if kwargs.get("fast_dllm", False):
identifier += "-fast_dllm"

print(identifier)
target_path = os.path.join(root, dataset, identifier)
if jsonl_fmt:
target_path += ".jsonl"
else:
os.makedirs(target_path, exist_ok=True)

if dataset == "humaneval":
dataset_dict = get_human_eval_plus(version=version)
elif dataset == "mbpp":
dataset_dict = get_mbpp_plus(version=version)
elif dataset == "evalperf":
original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
assert id_range is None, "id_range not supported for evalperf"
else:
raise ValueError(f"Invalid dataset {dataset}")

all_tasks_complete = False
if jsonl_fmt and os.path.isfile(target_path):
task_counts = {}
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
data = json.loads(line)
task_id = data["task_id"]
task_counts[task_id] = task_counts.get(task_id, 0) + 1

all_tasks_complete = all(
task_counts.get(task_id, 0) >= n_samples
for task_id in dataset_dict.keys()
)

if all_tasks_complete:
print("All samples are already cached. Skipping codegen.")
return target_path

if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
temperature = 0.0
bs = 1
n_samples = 1
print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")

if id_range is not None:
assert len(id_range) == 2, "id_range must be a list of length 2"
assert id_range[0] < id_range[1], "id_range must be increasing"
id_range = tuple(id_range)

if bs is None:
bs = min(n_samples, 32)
print(f"Setting batch size to {bs}")

# Make project dir
os.makedirs(root, exist_ok=True)
# Make dataset dir
os.makedirs(os.path.join(root, dataset), exist_ok=True)

# Model instructions
instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"

if evalperf_type == "perf-instruct":
instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type == "perf-CoT":
instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type is not None and evalperf_type != "instruct":
raise ValueError(f"Invalid evalperf_type: {evalperf_type}")

# Model creation
model_runner = make_model(
model=model,
backend=backend,
batch_size=bs,
temperature=temperature,
force_base_prompt=force_base_prompt,
dataset=dataset,
base_url=base_url,
tp=tp,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
device_map=device_map,
attn_implementation=attn_implementation,
trust_remote_code=trust_remote_code,
enable_prefix_caching=enable_prefix_caching,
enable_chunked_prefill=enable_chunked_prefill,
dtype=dtype,
gptqmodel_backend=gptqmodel_backend,
gguf_file=gguf_file,
**kwargs,
)

codegen(
target_path=target_path,
dataset=dataset_dict,
greedy=greedy,
model=model_runner,
n_samples=n_samples,
resume=resume,
id_range=id_range,
)

# force shutdown the model runner
del model_runner
import gc

gc.collect()

return target_path


def main():
from fire import Fire

Fire(run_codegen)


if __name__ == "__main__":
main()
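
Since main() simply hands run_codegen to Fire, every keyword argument of run_codegen doubles as a command-line flag. A hypothetical invocation (the model name and flag values are illustrative, not taken from this PR):

python -m evalplus.codegen --model "bigcode/starcoder2-7b" --dataset humaneval --backend vllm --greedy
python -m evalplus.codegen --model "bigcode/starcoder2-7b" --dataset mbpp --n_samples 20 --temperature 0.8 --bs 16

Re-running the same command resumes from the cached .jsonl file and returns early once every task already has n_samples entries.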
16 changes: 16 additions & 0 deletions instruct/code_eval/evalplus/evalplus/config.py
@@ -0,0 +1,16 @@
## EvalPlus
DEFAULT_GT_TIME_LIMIT_FACTOR = 4.0
DEFAULT_MIN_TIME_LIMIT = 4.0

## EvalPerf

### General
PERF_PROFILE_ROUNDS = 1
PERF_RAM_GB_PER_PROC = 12

### Evaluation Phase
PERF_EVAL_TIMEOUT_SECOND = 45

### Curation Phase
PERF_CURATE_TIMEOUT_SECOND = 20
PREF_CURATE_MIN_INSTRUCTION = 10000
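
For context, a minimal sketch of how the two EvalPlus limits above are typically combined into a per-test timeout; the helper below is hypothetical, and only the two constants come from this file:

from evalplus.config import DEFAULT_GT_TIME_LIMIT_FACTOR, DEFAULT_MIN_TIME_LIMIT

def per_test_time_limit(gt_runtime_s: float) -> float:
    # Hypothetical helper: scale the ground-truth runtime by the safety
    # factor, never dropping below the minimum time limit.
    return max(DEFAULT_MIN_TIME_LIMIT, DEFAULT_GT_TIME_LIMIT_FACTOR * gt_runtime_s)

print(per_test_time_limit(0.5))  # 4.0: the 4-second floor dominates fast tests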