
Support ratios for logging_steps, eval_steps, and save_steps #23235

Merged
10 commits, merged on May 9, 2023
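What the change does, in user terms: `logging_steps`, `eval_steps`, and `save_steps` now also accept a float in the range `[0,1)`, which the `Trainer` resolves into an absolute number of update steps once the total length of training is known. A minimal usage sketch of the merged behavior (the `output_dir` and step values are illustrative, not taken from this PR):

    from transformers import TrainingArguments

    # With max_steps resolved to 1000 update steps, these ratios become
    # logging every 50 steps, evaluating every 100, and saving every 200.
    args = TrainingArguments(
        output_dir="out",                 # placeholder path
        max_steps=1000,
        logging_strategy="steps",
        logging_steps=0.05,               # ratio of total training steps
        evaluation_strategy="steps",
        eval_steps=0.1,                   # ratio of total training steps
        save_strategy="steps",
        save_steps=0.2,                   # ratio of total training steps
    )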
8 changes: 8 additions & 0 deletions src/transformers/trainer.py
@@ -1712,6 +1712,14 @@ def _inner_training_loop(
                     f" {args.max_steps}"
                 )
 
+        # Compute absolute values for logging, eval, and save if given as ratio
+        if args.logging_steps and args.logging_steps < 1:
+            args.logging_steps = math.ceil(max_steps * args.logging_steps)
+        if args.eval_steps and args.eval_steps < 1:
+            args.eval_steps = math.ceil(max_steps * args.eval_steps)
+        if args.save_steps and args.save_steps < 1:
+            args.save_steps = math.ceil(max_steps * args.save_steps)
+
         if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
             if self.args.n_gpu > 1:
                 # nn.DataParallel(model) replicates the model, creating new variables and module
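The conversion above happens inside `_inner_training_loop`, once `max_steps` is known; `math.ceil` guarantees the resolved interval is at least 1. A standalone sketch of the same arithmetic (`resolve_steps` is a hypothetical helper for illustration, not part of the PR):

    import math

    def resolve_steps(steps, max_steps):
        # Values in (0, 1) are treated as a ratio of total training steps,
        # rounded up so the interval never becomes 0; others pass through.
        if steps and steps < 1:
            return math.ceil(max_steps * steps)
        return int(steps)

    assert resolve_steps(0.05, 1000) == 50   # 5% of 1000 update steps
    assert resolve_steps(0.001, 128) == 1    # ceil keeps the interval >= 1
    assert resolve_steps(500, 1000) == 500   # absolute values are unchanged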
72 changes: 63 additions & 9 deletions src/transformers/training_args.py
@@ -251,8 +251,9 @@ class TrainingArguments:
 
         logging_first_step (`bool`, *optional*, defaults to `False`):
             Whether to log and evaluate the first `global_step` or not.
-        logging_steps (`int`, *optional*, defaults to 500):
-            Number of update steps between two logs if `logging_strategy="steps"`.
+        logging_steps (`int` or `float`, *optional*, defaults to 500):
+            Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in
+            range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
         logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
             Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan`
             or `inf` is filtered and the average loss of the current logging window is taken instead.
@@ -270,8 +271,9 @@
                 - `"no"`: No save is done during training.
                 - `"epoch"`: Save is done at the end of each epoch.
                 - `"steps"`: Save is done every `save_steps`.
-        save_steps (`int`, *optional*, defaults to 500):
-            Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
+        save_steps (`int` or `float`, *optional*, defaults to 500):
+            Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a
+            float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
         save_total_limit (`int`, *optional*):
             If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
             `output_dir`.
@@ -332,9 +334,10 @@ class TrainingArguments:
         dataloader_drop_last (`bool`, *optional*, defaults to `False`):
             Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
             or not.
-        eval_steps (`int`, *optional*):
+        eval_steps (`int` or `float`, *optional*):
             Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
-            value as `logging_steps` if not set.
+            value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1,
+            will be interpreted as ratio of total training steps.
         dataloader_num_workers (`int`, *optional*, defaults to 0):
             Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
             main process.
@@ -721,13 +724,29 @@ class TrainingArguments:
         metadata={"help": "The logging strategy to use."},
     )
     logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
-    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    logging_steps: float = field(
+        default=500,
+        metadata={
+            "help": (
+                "Log every X updates steps. Should be an integer or a float in range `[0,1)`. "
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
     save_strategy: Union[IntervalStrategy, str] = field(
         default="steps",
         metadata={"help": "The checkpoint save strategy to use."},
     )
-    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    save_steps: float = field(
+        default=500,
+        metadata={
+            "help": (
+                "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`. "
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     save_total_limit: Optional[int] = field(
         default=None,
         metadata={
@@ -854,7 +873,15 @@ class TrainingArguments:
     dataloader_drop_last: bool = field(
         default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
     )
-    eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    eval_steps: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`. "
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     dataloader_num_workers: int = field(
         default=0,
         metadata={
@@ -1186,6 +1213,19 @@ def __post_init__(self):
         if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0:
             raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps")
 
+        if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1:
+            if self.logging_steps != int(self.logging_steps):
+                raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}")
+            self.logging_steps = int(self.logging_steps)
+        if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1:
+            if self.eval_steps != int(self.eval_steps):
+                raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}")
+            self.eval_steps = int(self.eval_steps)
+        if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1:
+            if self.save_steps != int(self.save_steps):
+                raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}")
+            self.save_steps = int(self.save_steps)
+
         # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
         if self.load_best_model_at_end:
             if self.evaluation_strategy != self.save_strategy:
                 raise ValueError(
                     "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation "
                     f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
                 )
             if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+                if self.eval_steps < 1 or self.save_steps < 1:
+                    if not (self.eval_steps < 1 and self.save_steps < 1):
+                        raise ValueError(
+                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
+                            "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps "
+                            f"{self.save_steps} and eval_steps {self.eval_steps}."
+                        )
+                    # Work around floating point precision issues
+                    LARGE_MULTIPLIER = 1_000_000
+                    if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0:
+                        raise ValueError(
+                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
+                            f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}."
+                        )
                 raise ValueError(
                     "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
                     f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
                 )
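The `LARGE_MULTIPLIER` workaround above exists because a plain modulo on two ratios is unreliable in binary floating point: values that are exact multiples on paper can leave a nonzero remainder. A small sketch of the failure mode it targets (the ratio values are illustrative):

    # Save every 50% of training, evaluate every 10%: 0.5 is an exact
    # multiple of 0.1 on paper, but 0.1 has no exact binary representation.
    save_steps, eval_steps = 0.5, 0.1

    print(save_steps % eval_steps)  # 0.09999999999999998 -> naive check fails

    # Scaling both operands first yields exactly representable whole-number
    # magnitudes, so the remainder test behaves as intended.
    LARGE_MULTIPLIER = 1_000_000
    print((save_steps * LARGE_MULTIPLIER) % (eval_steps * LARGE_MULTIPLIER))  # 0.0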