This script can be used to evaluate the perplexity of a float or Neuron LLM using a configurable dataset (WikiText by default).
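For reference, a minimal sketch of how the script might be invoked once saved to disk (the filename perplexity.py is an assumption, as the file path is not shown here); the positional model argument and the --seed flag match the parser defined in main() below:

    # evaluate a local float checkpoint or a compiled Neuron model directory
    python perplexity.py /path/to/model --seed 1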
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import importlib
import sys

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


if importlib.util.find_spec("optimum.neuron") is not None:
    from optimum.neuron import NeuronModelForCausalLM


class Perplexity:
    """
    A class for calculating the perplexity of a language model.
    """

    def __init__(self, model, tokenizer, dataset_path="wikitext", dataset_name=None, split="test", text_column="text"):
        """
        Calculate perplexity using the same method as seen in llama.cpp.

        Parameters
        ----------
        model : AutoModelForCausalLM
            The language model for which the perplexity is calculated.
        tokenizer : AutoTokenizer
            The tokenizer corresponding to the model.
        dataset_path : str, optional
            The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
        dataset_name : str, optional
            The name of the dataset. Default is None.
        split : str, optional
            The split of the dataset to use. Default is 'test'.
        text_column : str, optional
            The name of the column in the dataset that contains the text data. Default is 'text'.
        """
        self._model = model
        if hasattr(model.config, "neuron"):
            if not model.model.neuron_config.output_all_logits:
                raise ValueError(
                    "To evaluate the perplexity of a Neuron model, it must be configured to return all logits during export."
                )
            self._neuron = True
        else:
            self._neuron = False
        self._tokenizer = tokenizer
        self._dataset_path = dataset_path
        self._dataset_name = dataset_name
        self._split = split
        self._text_column = text_column
        self._text = self._prepare_data()

    def _prepare_data(self):
        """
        Prepares the dataset by loading and formatting.

        Returns
        -------
        str
            The formatted dataset as a single string.
        """
        if self._dataset_path == "wikitext":
            self._dataset_name = "wikitext-2-raw-v1"

        # Load the dataset
        data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
        # Format the text column of the dataset
        text_list = [" \n" if s == "" else s for s in data[self._text_column]]
        return "".join(text_list)

    @staticmethod
    def softmax(logits):
        """
        Static method for applying the softmax function.

        Parameters
        ----------
        logits : np.ndarray
            The input to the softmax function.

        Returns
        -------
        np.ndarray
            The output of the softmax function.
        """
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0)

    def calculate_perplexity(self, n_ctx=512, n_batch=512):
        """
        Calculates the perplexity of the language model.

        Parameters
        ----------
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.

        Returns
        -------
        list
            The list of perplexity scores calculated.
        """
        # Tokenize the text
        self._tokenizer.model_max_length = sys.maxsize
        tokens = self._tokenizer(self._text, truncation=False, return_tensors="pt").input_ids.to(self._model.device)

        nll = 0.0  # Negative log likelihood
        count = 0  # Counter for processed tokens
        curr_ppl = 0
        all_perplexity = []

        with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
            for i in progress:
                # Process each batch of tokens
                nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)

                # Calculate and display the current perplexity
                curr_ppl = np.exp(nll / count)
                all_perplexity.append(curr_ppl)
                progress.set_description(f"Perplexity: {curr_ppl:.4f}")

        return all_perplexity

    def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
        """
        Processes each batch of tokens.

        Parameters
        ----------
        i : int
            The batch index.
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.
        tokens : torch.Tensor
            The tokenized text.
        nll : float
            The current negative log likelihood.
        count : int
            The current count of processed tokens.

        Returns
        -------
        float
            The updated negative log likelihood.
        int
            The updated count of processed tokens.
        """
        start = i * n_ctx
        end = start + n_ctx

        num_batches = (n_ctx + n_batch - 1) // n_batch

        logits = []

        for j in range(num_batches):
            batch_start = start + j * n_batch
            batch_size = min(end - batch_start, n_batch)

            token_org = tokens[0][batch_start].item()

            if j == 0:
                # Replace the first token with the BOS token
                tokens[0][batch_start] = self._tokenizer.bos_token_id

            # Compute the logits for the current batch of tokens
            batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)

            tokens[0][batch_start] = token_org

            logits.append(batch_logits)

        # We rely on the fact that attention in the forward pass only looks at previous
        # tokens here, so the logits returned for each token are an accurate representation
        # of what the model would have predicted at that point.
        #
        # For example, with a context window of 512, perplexity is computed for each of the
        # last 256 tokens in the window. The input is split into context-window-sized chunks
        # so that the entire prompt is processed.

        for j in range(min(512, n_ctx // 2), n_ctx - 1):
            tok_logits = logits[0][0][j].cpu().numpy()
            # Compute the probability of the next token
            prob = self.softmax(tok_logits)[tokens[0][start + j + 1]]

            # Update the negative log likelihood and the count of processed tokens
            nll += -np.log(prob, where=prob > 0)
            count += 1

        return nll, count

    def _compute_batch_logits(self, tokens, batch_start, batch_size):
        """
        Computes the logits for a batch of tokens.

        Parameters
        ----------
        tokens : torch.Tensor
            The tokenized text.
        batch_start : int
            The start index of the batch.
        batch_size : int
            The size of the batch.

        Returns
        -------
        torch.Tensor
            The logits for the batch of tokens.
        """
        input_ids = tokens[:, batch_start : batch_start + batch_size]
        # Compute the logits without keeping track of gradients
        with torch.no_grad():
            if self._neuron:
                attention_mask = torch.ones_like(input_ids)
                model_inputs = self._model.prepare_inputs_for_prefill(input_ids, attention_mask)
            else:
                model_inputs = {"input_ids": input_ids}
            outputs = self._model.forward(**model_inputs)
        return outputs.logits.detach()


def perplexity(
    model,
    tokenizer,
    stride: int = 512,
):
    print("Evaluating perplexity")
    ppl = Perplexity(model, tokenizer)
    return np.mean(ppl.calculate_perplexity(n_ctx=stride))


def main():
    parser = argparse.ArgumentParser(description="Evaluate neuron model perplexity")
    parser.add_argument("model", type=str, help="The path to the model to evaluate.")
    parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    model_path = args.model
    config = AutoConfig.from_pretrained(model_path)
    model_name = config._name_or_path
    if hasattr(config, "neuron"):
        model_type = "neuron"
        model = NeuronModelForCausalLM.from_pretrained(model_path)
    else:
        device = torch.device("cpu")
        if torch.cuda.is_available():
            device = torch.device("cuda")
        model_type = "torch"
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", low_cpu_mem_usage=True).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    print(f"{model_name}: evaluating perplexity of {model_type} model")
    ppl = perplexity(model, tokenizer)
    print(ppl)


if __name__ == "__main__":
    main()
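As a rough illustration of driving the Perplexity class directly instead of going through the CLI (a minimal sketch, assuming the class defined above has been imported; the "gpt2" checkpoint is only a placeholder, and the default WikiText-2 test split is assumed):

    import numpy as np
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Placeholder checkpoint; any causal LM supported by transformers should work here.
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    ppl = Perplexity(model, tokenizer)            # defaults to wikitext-2-raw-v1, test split
    scores = ppl.calculate_perplexity(n_ctx=512)  # one score per processed context window
    print(f"mean perplexity: {np.mean(scores):.4f}")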