-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
support pp accuracy calculation #9379
Changes from 6 commits
c749b46
eb799c8
b1e3d1a
eb6c4eb
47d6472
6c8adfc
91a2234
0780d2a
f9dd719
9b901e6
c0645e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,8 @@ | |
from packaging import version | ||
from paddle import framework | ||
|
||
from paddlenlp.utils import infohub | ||
|
||
try: | ||
from paddle.base import core | ||
except: | ||
|
@@ -3172,7 +3174,13 @@ def evaluation_loop( | |
|
||
# Metrics! | ||
if self.compute_metrics is not None and all_preds is not None and all_labels is not None: | ||
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) | ||
# all_labels may be a tuple when prediction_step outputs label_mask | ||
if isinstance(all_labels, (list, tuple)): | ||
# compute_metrics in train.py | ||
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels[0])) | ||
else: | ||
# compute_metrics in modeling.py | ||
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) | ||
else: | ||
metrics = {} | ||
|
||
|
@@ -3268,6 +3276,17 @@ def prediction_pipeline_step( | |
labels = None | ||
inputs = inputs.pop("input_ids") | ||
|
||
# evaluation doesn't support drop_last, | ||
# so set `accumulate_steps` to the actual | ||
# eval batch size. | ||
model_config_backup = model.accumulate_steps | ||
if isinstance(inputs, tuple): | ||
input_ids = inputs[0] | ||
else: | ||
input_ids = inputs | ||
|
||
model.accumulate_steps = input_ids.shape[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 要不就要把model.micro_batch_size直接设为1 |
||
|
||
with paddle.no_grad(): | ||
if has_labels: | ||
with self.autocast_smart_context_manager(): | ||
|
@@ -3276,9 +3295,25 @@ def prediction_pipeline_step( | |
loss = loss.mean().detach() | ||
else: | ||
raise ValueError("pipeline mode eval need label!") | ||
# reset the `accumulate_steps`. | ||
model.accumulate_steps = model_config_backup | ||
|
||
return (loss, None, labels) | ||
|
||
def prediction_pipeline_step_with_logits_acc( | ||
self, | ||
*args, | ||
**kwargs, | ||
): | ||
loss, _, labels = self.prediction_pipeline_step(*args, **kwargs) | ||
logits = None | ||
if "pp_logits" in infohub: | ||
logits = paddle.concat(infohub["pp_logits"], axis=0) | ||
logits = logits._copy_to(paddle.framework._current_expected_place(), False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里拷贝的原因是pp_logits是放在cpu memory 或者 cuda pin memory? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 是的,如果这里不放在cpu或者 pin memory 会在 concat 的时候造成增加两倍 logits 大小的峰值显存,导致 OOM |
||
infohub["pp_logits"] = [] | ||
|
||
return (loss, logits, labels) | ||
|
||
def prediction_step( | ||
self, | ||
model: nn.Layer, | ||
|
@@ -3312,6 +3347,8 @@ def prediction_step( | |
if self.args.pipeline_parallel_degree > 1: | ||
# hack for pipeline mode | ||
inputs = self._prepare_inputs(inputs) | ||
if self.args.metric_for_best_model == "accuracy": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个建议不要放在trainer,放在SFTTrainer更加合理 |
||
return self.prediction_pipeline_step_with_logits_acc(model, inputs, prediction_loss_only, ignore_keys) | ||
return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys) | ||
|
||
has_labels = all(inputs.get(k) is not None for k in self.label_names) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,6 +81,7 @@ | |
"fp16_opt_level": "O2", | ||
"max_grad_norm": 1.0, | ||
"dataloader_num_workers": 0, | ||
"metric_for_best_model": "accuracy", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 后续也在开源模型适配 |
||
"continue_training": 0, | ||
"do_train": "true", | ||
"do_eval": "false", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个命名是不是不太规范? 很明显这个又不是一个model config
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
不能只修改accumulate_steps而不修改micro_batch_size,
https://github.com/PaddlePaddle/Paddle/blob/52f55e159fd8235c841985578e380b9d9dc3a220/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py#L271C1-L275C31