1 change: 0 additions & 1 deletion instruct/code_eval/evalplus/.gitignore
@@ -164,7 +164,6 @@ cython_debug/
.vscode/

# EvalPlus specific
-EvalPlus/
backup/
passrate.p*
min_cov_dir/
4 changes: 4 additions & 0 deletions instruct/code_eval/evalplus/evalplus/__init__.py
@@ -0,0 +1,4 @@
try:
from evalplus._version import __version__, __version_tuple__
except ImportError:
__version__ = "local-dev"
287 changes: 287 additions & 0 deletions instruct/code_eval/evalplus/evalplus/codegen.py
@@ -0,0 +1,287 @@
import json
import os
from typing import Dict, List, Optional

from evalplus.data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
from evalplus.provider import DecoderBase, make_model
from evalplus.sanitize import sanitize
from evalplus.utils import progress


def codegen(
target_path: str,
model: DecoderBase,
dataset: Dict,
greedy=False,
n_samples=1,
id_range=None,
resume=True,
):
    # Count existing samples per task so resume mode can skip completed work
    task2nexist = {}
if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
task_id = json.loads(line)["task_id"]
task2nexist[task_id] = task2nexist.get(task_id, 0) + 1

if target_path.endswith(".jsonl"):
raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
else:
raw_target_path = target_path + ".raw"
        # Create the raw-output root in directory mode (the sanitized root is
        # created by the per-task makedirs below)
        os.makedirs(raw_target_path, exist_ok=True)

print(f"Sanitized code outputs will be saved to {target_path}")
print(f"Raw outputs will be saved to {raw_target_path}")

backend_type: str = type(model).__name__
with progress(backend_type) as p:
for task_id, task in p.track(dataset.items()):
if id_range is not None:
id_num = int(task_id.split("/")[1])
low, high = id_range
if id_num < low or id_num >= high:
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
continue

if not target_path.endswith(".jsonl"):
p_name = task_id.replace("/", "_")
                os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
                # Also create the matching raw-output directory, which the
                # raw writes below expect to exist
                os.makedirs(os.path.join(raw_target_path, p_name), exist_ok=True)
task2nexist[task_id] = len(
[
f
for f in os.listdir(os.path.join(target_path, p_name))
if f.endswith(".py")
]
)

n_more_samples = n_samples
log = f"Codegen: {task_id} @ {model}"
if resume and task2nexist.get(task_id, 0) > 0:
log += f" (resuming from {task2nexist[task_id]})"
n_more_samples -= task2nexist[task_id]

p.console.print(log)

            sidx = n_samples - n_more_samples  # index of the next sample to generate
while sidx < n_samples:
prompt = task["prompt"].strip() + "\n"
outputs = model.codegen(
prompt,
do_sample=not greedy,
num_samples=n_samples - sidx,
)
assert outputs, "No outputs from model!"
for impl in outputs:
solution = prompt + impl if model.is_direct_completion() else impl
sanitized_solution = sanitize(
solution, entrypoint=task["entry_point"]
)
if target_path.endswith(".jsonl"):
# Writing the sanitized version
with open(target_path, "a") as f:
f.write(
json.dumps(
{"task_id": task_id, "solution": sanitized_solution}
)
+ "\n"
)

# Writing the raw version
with open(raw_target_path, "a") as f:
f.write(
json.dumps({"task_id": task_id, "solution": solution})
+ "\n"
)
else:
# Writing the sanitized version
with open(
os.path.join(target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(sanitized_solution)

# Writing the raw version
with open(
os.path.join(raw_target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(solution)
sidx += 1


def run_codegen(
model: str,
dataset: str,
root: str = "evalplus_results",
bs: Optional[int] = None,
n_samples: int = 1,
temperature: float = 0.0,
resume: bool = True,
greedy: bool = False,
    id_range: Optional[List] = None,
version: str = "default",
backend: str = "vllm",
force_base_prompt: bool = False,
    base_url: Optional[str] = None,
tp: int = 1,
    evalperf_type: Optional[str] = None,  # For EvalPerf
jsonl_fmt: bool = True,
attn_implementation: str = "eager",
device_map: Optional[str] = None,
trust_remote_code: bool = False,
enable_prefix_caching: bool = False,
enable_chunked_prefill: bool = False,
dtype: str = "bfloat16",
gptqmodel_backend: str = "auto", # For GPTQModel
gguf_file: Optional[str] = None,
**kwargs,
):
assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
assert evalperf_type is None or evalperf_type in [
"instruct",
"perf-instruct",
"perf-CoT",
]

# Make dir for codes generated by each model
identifier = model.replace("/home/ndfl4zki/ndfl4zkiuser04", "")
identifier = identifier.replace("/ckpts/rl/diff-rl", "")
identifier = identifier.replace("/actor/huggingface", "")
identifier = (
identifier.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
)
if "max_new_tokens" in kwargs:
identifier += f"_len_{kwargs['max_new_tokens']}"
if kwargs.get("alg", None) is not None:
identifier += f"_alg_{kwargs['alg']}"
if evalperf_type:
identifier += f"-{evalperf_type}"

if kwargs.get("fast_dllm", False):
identifier += "-fast_dllm"

print(identifier)
target_path = os.path.join(root, dataset, identifier)
if jsonl_fmt:
target_path += ".jsonl"
else:
os.makedirs(target_path, exist_ok=True)

if dataset == "humaneval":
dataset_dict = get_human_eval_plus(version=version)
elif dataset == "mbpp":
dataset_dict = get_mbpp_plus(version=version)
elif dataset == "evalperf":
original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
assert id_range is None, "id_range not supported for evalperf"
else:
raise ValueError(f"Invalid dataset {dataset}")

all_tasks_complete = False
if jsonl_fmt and os.path.isfile(target_path):
task_counts = {}
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
data = json.loads(line)
task_id = data["task_id"]
task_counts[task_id] = task_counts.get(task_id, 0) + 1

all_tasks_complete = all(
task_counts.get(task_id, 0) >= n_samples
for task_id in dataset_dict.keys()
)

if all_tasks_complete:
print("All samples are already cached. Skipping codegen.")
return target_path

if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
temperature = 0.0
bs = 1
n_samples = 1
print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")

if id_range is not None:
assert len(id_range) == 2, "id_range must be a list of length 2"
assert id_range[0] < id_range[1], "id_range must be increasing"
id_range = tuple(id_range)

if bs is None:
bs = min(n_samples, 32)
print(f"Setting batch size to {bs}")

# Make project dir
os.makedirs(root, exist_ok=True)
# Make dataset dir
os.makedirs(os.path.join(root, dataset), exist_ok=True)

# Model instructions
instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"

if evalperf_type == "perf-instruct":
instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type == "perf-CoT":
instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type is not None and evalperf_type != "instruct":
raise ValueError(f"Invalid evalperf_type: {evalperf_type}")

# Model creation
model_runner = make_model(
model=model,
backend=backend,
batch_size=bs,
temperature=temperature,
force_base_prompt=force_base_prompt,
dataset=dataset,
base_url=base_url,
tp=tp,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
device_map=device_map,
attn_implementation=attn_implementation,
trust_remote_code=trust_remote_code,
enable_prefix_caching=enable_prefix_caching,
enable_chunked_prefill=enable_chunked_prefill,
dtype=dtype,
gptqmodel_backend=gptqmodel_backend,
gguf_file=gguf_file,
**kwargs,
)

codegen(
target_path=target_path,
dataset=dataset_dict,
greedy=greedy,
model=model_runner,
n_samples=n_samples,
resume=resume,
id_range=id_range,
)

# force shutdown the model runner
del model_runner
import gc

gc.collect()

return target_path


def main():
from fire import Fire

Fire(run_codegen)


if __name__ == "__main__":
main()
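
Since main() simply hands run_codegen to Fire, every keyword argument of run_codegen doubles as a command-line flag. A hypothetical invocation (the model name and flag values are illustrative, not taken from this PR):

python -m evalplus.codegen --model "bigcode/starcoder2-7b" --dataset humaneval --backend vllm --greedy
python -m evalplus.codegen --model "bigcode/starcoder2-7b" --dataset mbpp --n_samples 20 --temperature 0.8 --bs 16

Re-running the same command resumes from the cached .jsonl file and returns early once every task already has n_samples entries.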
16 changes: 16 additions & 0 deletions instruct/code_eval/evalplus/evalplus/config.py
@@ -0,0 +1,16 @@
## EvalPlus
DEFAULT_GT_TIME_LIMIT_FACTOR = 4.0
DEFAULT_MIN_TIME_LIMIT = 4.0

## EvalPerf

### General
PERF_PROFILE_ROUNDS = 1
PERF_RAM_GB_PER_PROC = 12

### Evaluation Phase
PERF_EVAL_TIMEOUT_SECOND = 45

### Curation Phase
PERF_CURATE_TIMEOUT_SECOND = 20
PREF_CURATE_MIN_INSTRUCTION = 10000
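
For context, a minimal sketch of how the two EvalPlus limits above are typically combined into a per-test timeout; the helper below is hypothetical, and only the two constants come from this file:

from evalplus.config import DEFAULT_GT_TIME_LIMIT_FACTOR, DEFAULT_MIN_TIME_LIMIT

def per_test_time_limit(gt_runtime_s: float) -> float:
    # Hypothetical helper: scale the ground-truth runtime by the safety
    # factor, never dropping below the minimum time limit.
    return max(DEFAULT_MIN_TIME_LIMIT, DEFAULT_GT_TIME_LIMIT_FACTOR * gt_runtime_s)

print(per_test_time_limit(0.5))  # 4.0: the 4-second floor dominates fast tests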