
support pp accuracy calculation #9379

Merged
merged 11 commits on Nov 29, 2024
39 changes: 38 additions & 1 deletion paddlenlp/trainer/trainer.py
@@ -42,6 +42,8 @@
from packaging import version
from paddle import framework

from paddlenlp.utils import infohub

try:
from paddle.base import core
except:
@@ -3172,7 +3174,13 @@ def evaluation_loop(

# Metrics!
if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
# all_labels may be a tuple when prediction_step outputs a label_mask
if isinstance(all_labels, (list, tuple)):
# compute_metrics in train.py
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels[0]))
else:
# compute_metrics in modeling.py
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
else:
metrics = {}
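
For reference, a minimal compute_metrics of the kind consumed here might look like the sketch below. This is a hypothetical example, not the actual function from train.py or modeling.py; it assumes predictions are token logits and that ignored positions carry the label -100.

import numpy as np

from paddlenlp.trainer import EvalPrediction


def compute_metrics(eval_preds: EvalPrediction):
    # Hypothetical accuracy metric: argmax over the vocab axis,
    # scoring only positions whose label is not the ignore index.
    preds = np.argmax(eval_preds.predictions, axis=-1)
    labels = eval_preds.label_ids
    mask = labels != -100
    return {"accuracy": float((preds[mask] == labels[mask]).mean())}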

@@ -3268,6 +3276,17 @@ def prediction_pipeline_step(
labels = None
inputs = inputs.pop("input_ids")

# Evaluation does not support drop_last, so set
# `accumulate_steps` to the actual eval batch size.
model_config_backup = model.accumulate_steps

Collaborator: Isn't this name a bit nonstandard? This clearly isn't a model config.

if isinstance(inputs, tuple):
input_ids = inputs[0]
else:
input_ids = inputs

model.accumulate_steps = input_ids.shape[0]

Contributor: Or else set model.micro_batch_size directly to 1.


with paddle.no_grad():
if has_labels:
with self.autocast_smart_context_manager():
@@ -3276,9 +3295,25 @@
loss = loss.mean().detach()
else:
raise ValueError("pipeline mode eval need label!")
# Restore the backed-up `accumulate_steps`.
model.accumulate_steps = model_config_backup

return (loss, None, labels)
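
One way to address the naming concern raised above (a hypothetical sketch, not part of this PR) is to wrap the save/override/restore of accumulate_steps in a context manager, which also guarantees restoration if the eval step raises:

from contextlib import contextmanager


@contextmanager
def override_accumulate_steps(model, steps):
    # Hypothetical helper: temporarily set `accumulate_steps` to the
    # actual eval batch size, always restoring the saved value.
    saved = model.accumulate_steps
    model.accumulate_steps = steps
    try:
        yield
    finally:
        model.accumulate_steps = saved

Inside prediction_pipeline_step this would read roughly as `with override_accumulate_steps(model, input_ids.shape[0]): ...` around the no_grad block.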

def prediction_pipeline_step_with_logits_acc(
self,
*args,
**kwargs,
):
loss, _, labels = self.prediction_pipeline_step(*args, **kwargs)
logits = None
if "pp_logits" in infohub:
logits = paddle.concat(infohub["pp_logits"], axis=0)
logits = logits._copy_to(paddle.framework._current_expected_place(), False)

Collaborator: Is the copy here because pp_logits lives in CPU memory or CUDA pinned memory?

Collaborator (Author): Yes. If the logits were not kept in CPU or pinned memory here, the concat would add a peak of twice the logits' size in GPU memory and cause an OOM.

infohub["pp_logits"] = []

return (loss, logits, labels)
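
The method above consumes logits that something upstream has stashed in infohub["pp_logits"]; per the author's reply, they are kept off the GPU until the final copy back. A hedged sketch of that producer side follows; the helper name and call site are assumptions, and only the pinned-memory pattern is taken from the discussion above.

import paddle

from paddlenlp.utils import infohub


def stash_pp_logits(micro_logits):
    # Hypothetical producer (e.g. called from the last pipeline stage's
    # criterion): park each micro-batch's logits in CUDA pinned host
    # memory so the GPU never holds all micro-batches at once and the
    # concat above runs off-device.
    if "pp_logits" not in infohub:
        infohub["pp_logits"] = []
    infohub["pp_logits"].append(micro_logits._copy_to(paddle.CUDAPinnedPlace(), False))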

def prediction_step(
self,
model: nn.Layer,
@@ -3312,6 +3347,8 @@ def prediction_step(
if self.args.pipeline_parallel_degree > 1:
# hack for pipeline mode
inputs = self._prepare_inputs(inputs)
if self.args.metric_for_best_model == "accuracy":

Contributor: Suggest not putting this in Trainer; placing it in SFTTrainer would be more reasonable.

return self.prediction_pipeline_step_with_logits_acc(model, inputs, prediction_loss_only, ignore_keys)
return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys)

has_labels = all(inputs.get(k) is not None for k in self.label_names)
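
To illustrate the reviewer's suggestion above, the accuracy routing could live in a subclass instead of the base Trainer. A hypothetical sketch (the SFTTrainer placement and exact override are assumptions, not code from this PR):

class SFTTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Hypothetical override: keep the logits-accumulating pipeline
        # path out of the base Trainer, as the reviewer suggests.
        if (
            self.args.pipeline_parallel_degree > 1
            and self.args.metric_for_best_model == "accuracy"
        ):
            inputs = self._prepare_inputs(inputs)
            return self.prediction_pipeline_step_with_logits_acc(
                model, inputs, prediction_loss_only, ignore_keys
            )
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
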
1 change: 1 addition & 0 deletions tests/trainer/test_unified_checkpoint.py
@@ -81,6 +81,7 @@
"fp16_opt_level": "O2",
"max_grad_norm": 1.0,
"dataloader_num_workers": 0,
"metric_for_best_model": "accuracy",

Contributor: Follow up by adapting this for the open-source models as well.

"continue_training": 0,
"do_train": "true",
"do_eval": "false",