Commit
Migrate metric to Evaluate in Pytorch examples (#18369)
* Migrate metric to Evaluate in pytorch examples

* Remove unused imports
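Every example follows the same migration pattern: the metric previously loaded with `load_metric` from the `datasets` package is now loaded through the standalone `evaluate` package. A minimal before/after sketch (assuming `evaluate` is installed, e.g. via `pip install evaluate`; the `accuracy` metric is just one of the metrics touched in the diff below):

```python
# Before: metric loading went through the datasets package
# from datasets import load_metric
# metric = load_metric("accuracy")

# After: the same metric is loaded from the evaluate package
import evaluate

metric = evaluate.load("accuracy")
# The compute interface stays the same, e.g.:
# results = metric.compute(predictions=preds, references=labels)
```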
atturaioe authored Aug 1, 2022
1 parent 25ec12e commit 1f84399
Showing 25 changed files with 72 additions and 49 deletions.
1 change: 1 addition & 0 deletions examples/pytorch/_tests_requirements.txt
@@ -23,3 +23,4 @@ torchvision
jiwer
librosa
torch < 1.12
+evaluate
@@ -26,6 +26,7 @@
import numpy as np
from datasets import DatasetDict, load_dataset

+import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -315,7 +316,7 @@ def val_transforms(batch):
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
-metric = datasets.load_metric("accuracy")
+metric = evaluate.load("accuracy")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with
# `predictions` and `label_ids` fields) and has to return a dictionary string to float.
@@ -19,7 +19,6 @@
from dataclasses import dataclass, field
from typing import Optional

-import datasets
import numpy as np
import torch
from datasets import load_dataset
@@ -34,6 +33,7 @@
ToTensor,
)

+import evaluate
import transformers
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
@@ -252,7 +252,7 @@ def main():
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
-metric = datasets.load_metric("accuracy")
+metric = evaluate.load("accuracy")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
@@ -22,7 +22,7 @@

import datasets
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from torch.utils.data import DataLoader
from torchvision.transforms import (
CenterCrop,
@@ -35,6 +35,7 @@
)
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -415,7 +416,7 @@ def collate_fn(examples):
accelerator.init_trackers("image_classification_no_trainer", experiment_config)

# Get the metric function
-metric = load_metric("accuracy")
+metric = evaluate.load("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
5 changes: 3 additions & 2 deletions examples/pytorch/language-modeling/run_clm.py
@@ -30,8 +30,9 @@
from typing import Optional

import datasets
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -492,7 +493,7 @@ def preprocess_logits_for_metrics(logits, labels):
logits = logits[0]
return logits.argmax(dim=-1)

-metric = load_metric("accuracy")
+metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
5 changes: 3 additions & 2 deletions examples/pytorch/language-modeling/run_mlm.py
@@ -30,8 +30,9 @@
from typing import Optional

import datasets
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
@@ -515,7 +516,7 @@ def preprocess_logits_for_metrics(logits, labels):
logits = logits[0]
return logits.argmax(dim=-1)

-metric = load_metric("accuracy")
+metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
preds, labels = eval_preds
5 changes: 3 additions & 2 deletions examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -31,10 +31,11 @@

import datasets
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -514,7 +515,7 @@ def preprocess_function(examples):
accelerator.init_trackers("swag_no_trainer", experiment_config)

# Metrics
-metric = load_metric("accuracy")
+metric = evaluate.load("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa.py
@@ -25,8 +25,9 @@
from typing import Optional

import datasets
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
@@ -593,7 +594,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

-metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa_beam_search.py
@@ -25,8 +25,9 @@
from typing import Optional

import datasets
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
@@ -625,7 +626,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

-metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
@@ -29,10 +29,11 @@
import datasets
import numpy as np
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -680,7 +681,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

-metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
+metric = evaluate.load("squad_v2" if args.version_2_with_negative else "squad")

def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
"""
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -29,10 +29,11 @@
import datasets
import numpy as np
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -696,7 +697,7 @@ def post_processing_function(examples, features, predictions, stage="eval"):
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

-metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
+metric = evaluate.load("squad_v2" if args.version_2_with_negative else "squad")

# Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
5 changes: 3 additions & 2 deletions examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -25,8 +25,9 @@
from typing import List, Optional, Tuple

import datasets
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
from transformers import (
@@ -581,7 +582,7 @@ def preprocess_validation_function(examples):
pad_to_multiple_of=8 if training_args.fp16 else None,
)

-metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
@@ -21,7 +21,6 @@
from dataclasses import dataclass, field
from typing import Optional

-import datasets
import numpy as np
import torch
from datasets import load_dataset
@@ -30,6 +29,7 @@
from torchvision import transforms
from torchvision.transforms import functional

+import evaluate
import transformers
from huggingface_hub import hf_hub_download
from transformers import (
@@ -337,7 +337,7 @@ def main():
label2id = {v: str(k) for k, v in id2label.items()}

# Load the mean IoU metric from the datasets package
-metric = datasets.load_metric("mean_iou")
+metric = evaluate.load("mean_iou")

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
@@ -24,13 +24,14 @@
import datasets
import numpy as np
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import functional
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -500,7 +501,7 @@ def preprocess_val(example_batch):
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Instantiate metric
-metric = load_metric("mean_iou")
+metric = evaluate.load("mean_iou")

# We need to initialize the trackers we use, and also store our configuration.
# We initialize the trackers only on main process because `accelerator.log`
@@ -28,8 +28,9 @@
import datasets
import numpy as np
import torch
-from datasets import DatasetDict, load_dataset, load_metric
+from datasets import DatasetDict, load_dataset

+import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -643,7 +644,7 @@ def is_audio_in_length_range(length):
# instantiate a data collator and the trainer

# Define evaluation metrics during training, *i.e.* word error rate, character error rate
-eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}
+eval_metrics = {metric: evaluate.load(metric) for metric in data_args.eval_metrics}

# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will mostly likely
@@ -27,8 +27,9 @@

import datasets
import torch
-from datasets import DatasetDict, load_dataset, load_metric
+from datasets import DatasetDict, load_dataset

+import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -425,7 +426,7 @@ def is_audio_in_length_range(length):
return

# 8. Load Metric
-metric = load_metric("wer")
+metric = evaluate.load("wer")

def compute_metrics(pred):
pred_ids = pred.predictions
5 changes: 3 additions & 2 deletions examples/pytorch/summarization/run_summarization.py
@@ -27,8 +27,9 @@
import datasets
import nltk # Here to have a nice missing dependency error message early on
import numpy as np
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from filelock import FileLock
from transformers import (
@@ -598,7 +599,7 @@ def preprocess_function(examples):
)

# Metric
-metric = load_metric("rouge")
+metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
@@ -30,10 +30,11 @@
import nltk
import numpy as np
import torch
-from datasets import load_dataset, load_metric
+from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

+import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
@@ -583,7 +584,7 @@ def postprocess_text(preds, labels):
accelerator.init_trackers("summarization_no_trainer", experiment_config)

# Metric
-metric = load_metric("rouge")
+metric = evaluate.load("rouge")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
7 changes: 4 additions & 3 deletions examples/pytorch/text-classification/run_glue.py
@@ -25,8 +25,9 @@

import datasets
import numpy as np
-from datasets import load_dataset, load_metric
+from datasets import load_dataset

+import evaluate
import transformers
from transformers import (
AutoConfig,
@@ -480,9 +481,9 @@ def preprocess_function(examples):

# Get the metric function
if data_args.task_name is not None:
-metric = load_metric("glue", data_args.task_name)
+metric = evaluate.load("glue", data_args.task_name)
else:
-metric = load_metric("accuracy")
+metric = evaluate.load("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
