Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
aidando73 committed Nov 23, 2024
1 parent 2f72bce commit 5a5bcf7
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 4 deletions.
12 changes: 12 additions & 0 deletions tools/benchmarks/llm_eval_harness/meta_eval/COMMANDS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

```bash
source $STORAGE_DIR/llama-recipes/.venv/bin/activate
```

```bash
huggingface-cli login
```

```bash
python prepare_meta_eval.py --config_path ./eval_config.yaml
```
8 changes: 4 additions & 4 deletions tools/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."
model_name: "meta-llama/Llama-3.2-1B" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub."

evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
evals_dataset: "meta-llama/Llama-3.2-1B-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
# Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals"]

tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.

tensor_parallel_size: 1 # The VLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for a model copy.

data_parallel_size: 4 # The VLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
data_parallel_size: 1 # The VLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.

gpu_memory_utilization: 0.9 # The VLLM argument that specifies the GPU memory utilization; the rest will be reserved for the KV cache.

Expand All @@ -19,7 +19,7 @@ batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is s
output_path: "eval_results" # the output folder to store all the eval results and samples.

#limit: 12 # Limit number of examples per task, set 'null' to run all.
limit: null # Limit number of examples per task, set 'null' to run all.
limit: 5 # Limit number of examples per task, set 'null' to run all.

verbosity: "INFO" #Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
task: meta_mmlu
dataset_path: meta-llama/Llama-3.2-1B-evals
dataset_name: Llama-3.2-1B-evals__mmlu__details
test_split: latest
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: ["A", "B", "C", "D"]
# 5-shot prompts are already included in the dataset
# So do not need to generate them
num_fewshot: 0
metadata:
version: 1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import string
import datasets

def doc_to_text(doc: dict) -> str:
    """Return the pre-built evaluation prompt for this document.

    The stored prompt ends with a space plus the answer (per the
    dataset format), so the trailing two characters are dropped to
    leave the model-facing prompt only.
    """
    prompt = doc["input_final_prompts"][0]
    return prompt[:-2]

def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Prepare the raw Meta-eval MMLU dataset for lm-eval-harness.

    Keeps only the columns the task needs, renames ``is_correct`` to
    ``previously_is_correct`` (it records the reference model's result,
    not this run's), and derives per-example fields:
    ``problem`` (the question text) and ``gold`` (the 0-indexed choice
    index of the correct answer).
    """

    def _process_doc(doc: dict) -> dict:
        # input_correct_responses[0] looks like "Answer: B"; the last
        # character is the answer letter. Assumes the choice order is
        # always A: 0, B: 1, C: 2, D: 3.
        answer = doc["input_correct_responses"][0]
        answer_index = string.ascii_uppercase.index(answer[-1])
        return {
            "problem": doc["input_question"],
            # The answer is the index of the correct response (0-indexed)
            "gold": answer_index,
        }

    dataset = dataset.select_columns(
        ["input_question", "input_correct_responses", "input_final_prompts",
         "is_correct", "input_question_hash", "input_choice_list"])
    dataset = dataset.rename_column("is_correct", "previously_is_correct")
    # Fix: the original mapped _process_doc twice (once into a local,
    # then again on the mapped result); a single map yields the same
    # columns and avoids the redundant pass over the dataset.
    return dataset.map(_process_doc)

def doc_to_target(doc: dict) -> int:
    """Return the gold answer as the 0-indexed choice position.

    Fix: the original annotation claimed ``-> str``, but ``gold`` is the
    integer index computed by ``process_docs`` (via
    ``string.ascii_uppercase.index``), so the annotation is ``int``.
    """
    return doc["gold"]

0 comments on commit 5a5bcf7

Please sign in to comment.