Migrate metric to Evaluate in Pytorch examples #18369

Merged · 2 commits · Aug 1, 2022
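The change is mechanical and the same in every script: metrics previously loaded through datasets.load_metric now come from the standalone evaluate package (added to the test requirements below). A minimal before/after sketch, assuming evaluate is installed (pip install evaluate):

# old pattern, deprecated in datasets:
# from datasets import load_metric
# metric = load_metric("accuracy")

# new pattern used throughout these examples:
import evaluate

metric = evaluate.load("accuracy")
print(metric.compute(predictions=[0, 1, 1], references=[0, 1, 0]))  # {'accuracy': 0.666...}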
1 change: 1 addition & 0 deletions examples/pytorch/_tests_requirements.txt
@@ -23,3 +23,4 @@ torchvision
jiwer
librosa
torch < 1.12
evaluate
examples/pytorch/audio-classification/run_audio_classification.py
@@ -26,6 +26,7 @@
import numpy as np
from datasets import DatasetDict, load_dataset

import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -315,7 +316,7 @@ def val_transforms(batch):
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
metric = datasets.load_metric("accuracy")
metric = evaluate.load("accuracy")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with
# `predictions` and `label_ids` fields) and has to return a dictionary string to float.
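For context, a compute_metrics like the one described in the comment above typically looks as follows with the migrated loader; this is an illustrative sketch, not part of the diff:

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred is an EvalPrediction namedtuple with `predictions` (logits) and `label_ids`
    preds = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=preds, references=eval_pred.label_ids)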
examples/pytorch/image-classification/run_image_classification.py
@@ -19,7 +19,6 @@
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
import torch
from datasets import load_dataset
@@ -34,6 +33,7 @@
ToTensor,
)

import evaluate
import transformers
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
@@ -252,7 +252,7 @@ def main():
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
metric = datasets.load_metric("accuracy")
metric = evaluate.load("accuracy")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -22,7 +22,7 @@

import datasets
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader
from torchvision.transforms import (
CenterCrop,
@@ -35,6 +35,7 @@
)
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -415,7 +416,7 @@ def collate_fn(examples):
accelerator.init_trackers("image_classification_no_trainer", experiment_config)

# Get the metric function
metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
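In the no_trainer scripts the metric object is filled batch by batch during the evaluation loop and aggregated once at the end. A self-contained sketch of that add_batch/compute usage (the tensors stand in for model outputs; the cross-process gathering that Accelerate adds is omitted):

import torch
import evaluate

metric = evaluate.load("accuracy")

# pretend these are per-batch logits and labels coming out of an evaluation loop
for logits, labels in [
    (torch.tensor([[0.1, 0.9], [0.8, 0.2]]), torch.tensor([1, 0])),
    (torch.tensor([[0.3, 0.7], [0.6, 0.4]]), torch.tensor([1, 1])),
]:
    metric.add_batch(predictions=logits.argmax(dim=-1), references=labels)

print(metric.compute())  # {'accuracy': 0.75}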
5 changes: 3 additions & 2 deletions examples/pytorch/language-modeling/run_clm.py
@@ -30,8 +30,9 @@
from typing import Optional

import datasets
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -492,7 +493,7 @@ def preprocess_logits_for_metrics(logits, labels):
logits = logits[0]
return logits.argmax(dim=-1)

metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
5 changes: 3 additions & 2 deletions examples/pytorch/language-modeling/run_mlm.py
@@ -30,8 +30,9 @@
from typing import Optional

import datasets
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -515,7 +516,7 @@ def preprocess_logits_for_metrics(logits, labels):
logits = logits[0]
return logits.argmax(dim=-1)

metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
5 changes: 3 additions & 2 deletions examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -31,10 +31,11 @@

import datasets
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -514,7 +515,7 @@ def preprocess_function(examples):
accelerator.init_trackers("swag_no_trainer", experiment_config)

# Metrics
metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa.py
@@ -25,8 +25,9 @@
from typing import Optional

import datasets
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
@@ -593,7 +594,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa_beam_search.py
@@ -25,8 +25,9 @@
from typing import Optional

import datasets
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
@@ -625,7 +626,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -29,10 +29,11 @@
import datasets
import numpy as np
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -680,7 +681,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
metric = evaluate.load("squad_v2" if args.version_2_with_negative else "squad")

def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
"""
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -29,10 +29,11 @@
import datasets
import numpy as np
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -696,7 +697,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
metric = evaluate.load("squad_v2" if args.version_2_with_negative else "squad")

# Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -25,8 +25,9 @@
from typing import List, Optional, Tuple

import datasets
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
from transformers import (
@@ -581,7 +582,7 @@ def preprocess_validation_function(examples):
pad_to_multiple_of=8 if training_args.fp16 else None,
)

metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -21,7 +21,6 @@
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
import torch
from datasets import load_dataset
@@ -30,6 +29,7 @@
from torchvision import transforms
from torchvision.transforms import functional

import evaluate
import transformers
from huggingface_hub import hf_hub_download
from transformers import (
@@ -337,7 +337,7 @@ def main():
label2id = {v: str(k) for k, v in id2label.items()}

# Load the mean IoU metric from the datasets package
metric = datasets.load_metric("mean_iou")
metric = evaluate.load("mean_iou")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -24,13 +24,14 @@
import datasets
import numpy as np
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import functional
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -500,7 +501,7 @@ def preprocess_val(example_batch):
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Instantiate metric
metric = load_metric("mean_iou")
metric = evaluate.load("mean_iou")

# We need to initialize the trackers we use, and also store our configuration.
# We initialize the trackers only on main process because `accelerator.log`
examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -28,8 +28,9 @@
import datasets
import numpy as np
import torch
from datasets import DatasetDict, load_dataset, load_metric
from datasets import DatasetDict, load_dataset

import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -643,7 +644,7 @@ def is_audio_in_length_range(length):
# instantiate a data collator and the trainer

# Define evaluation metrics during training, *i.e.* word error rate, character error rate
eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
eval_metrics = {metric: evaluate.load(metric) for metric in data_args.eval_metrics}

# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will mostly likely
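Each entry in eval_metrics above is an independent metric object, so word error rate and character error rate can be computed from the same decoded strings. A small self-contained sketch (the strings are placeholders, not data from the example):

import evaluate

eval_metrics = {name: evaluate.load(name) for name in ["wer", "cer"]}

pred_str = ["hello world", "good morning"]   # placeholder decoded predictions
label_str = ["hello word", "good morning"]   # placeholder reference transcriptions

results = {name: metric.compute(predictions=pred_str, references=label_str) for name, metric in eval_metrics.items()}
print(results)  # e.g. {'wer': 0.25, 'cer': ...}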
examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -27,8 +27,9 @@

import datasets
import torch
from datasets import DatasetDict, load_dataset, load_metric
from datasets import DatasetDict, load_dataset

import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -425,7 +426,7 @@ def is_audio_in_length_range(length):
return

# 8. Load Metric
metric = load_metric("wer")
metric = evaluate.load("wer")

def compute_metrics(pred):
pred_ids = pred.predictions
5 changes: 3 additions & 2 deletions examples/pytorch/summarization/run_summarization.py
@@ -27,8 +27,9 @@
import datasets
import nltk # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from filelock import FileLock
from transformers import (
@@ -598,7 +599,7 @@ def preprocess_function(examples):
)

# Metric
metric = load_metric("rouge")
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
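A minimal usage sketch for the migrated ROUGE metric; the strings are placeholders, and use_stemmer=True mirrors the option the summarization examples pass to compute:

import evaluate

metric = evaluate.load("rouge")
result = metric.compute(
    predictions=["the cat sat on the mat"],  # placeholder generated summaries
    references=["a cat sat on the mat"],     # placeholder reference summaries
    use_stemmer=True,
)
print(result)  # maps ROUGE variants (rouge1, rouge2, rougeL, rougeLsum) to scores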
examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -30,10 +30,11 @@
import nltk
import numpy as np
import torch
from datasets import load_dataset, load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -583,7 +584,7 @@ def postprocess_text(preds, labels):
accelerator.init_trackers("summarization_no_trainer", experiment_config)

# Metric
metric = load_metric("rouge")
metric = evaluate.load("rouge")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
7 changes: 4 additions & 3 deletions examples/pytorch/text-classification/run_glue.py
@@ -25,8 +25,9 @@

import datasets
import numpy as np
from datasets import load_dataset, load_metric
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -480,9 +481,9 @@ def preprocess_function(examples):

# Get the metric function
if data_args.task_name is not None:
metric = load_metric("glue", data_args.task_name)
metric = evaluate.load("glue", data_args.task_name)
else:
metric = load_metric("accuracy")
metric = evaluate.load("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
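For GLUE the loader takes the task name as a second argument, and compute_metrics follows the same shape as in the other examples. An illustrative sketch, simplified from the script (the task name is chosen arbitrarily; the regression task STS-B would squeeze the predictions instead of taking an argmax):

import numpy as np
import evaluate

metric = evaluate.load("glue", "mrpc")  # task name chosen for illustration

def compute_metrics(eval_pred):
    # classification tasks take the argmax over the logits
    preds = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=preds, references=eval_pred.label_ids)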