[trainer] allow processor instead of tokenizer #30864

Closed
@@ -584,7 +584,7 @@ def compute_metrics(pred):
        args=training_args,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-       tokenizer=feature_extractor,
+       processor=processor,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
    )
@@ -709,7 +709,7 @@ def compute_metrics(pred):
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-       tokenizer=feature_extractor,
+       processor=processor,
        optimizers=optimizers,
    )
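For reference, a minimal sketch of the call pattern these example scripts are moving to, assuming this PR's processor argument is available. The checkpoint name and output directory are placeholders, not values from the PR:

from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Placeholder checkpoint; any multimodal checkpoint with a processor works the same way.
checkpoint = "openai/whisper-tiny"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(output_dir="./finetuned", predict_with_generate=True)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processor=processor,  # new in this PR: the scripts previously passed tokenizer=feature_extractor
)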
4 changes: 2 additions & 2 deletions examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -844,7 +844,7 @@ def compute_classification_metric(pred):
            compute_metrics=compute_asr_metric if training_args.predict_with_generate else None,
            train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
            eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-           tokenizer=feature_extractor,
+           processor=processor,
        )
    else:
        trainer = Trainer(
@@ -855,7 +855,7 @@ def compute_classification_metric(pred):
            compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric,
            train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
            eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-           tokenizer=feature_extractor,
+           processor=processor,
        )

    # 8. Finally, we can start training
11 changes: 10 additions & 1 deletion src/transformers/trainer.py
@@ -69,6 +69,7 @@
    MODEL_MAPPING_NAMES,
)
from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
from .pytorch_utils import (
    ALL_LAYERNORM_LAYERS,
    is_torch_greater_or_equal_than_1_13,
@@ -318,6 +319,10 @@ class Trainer:
            The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
            maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
            interrupted training or reuse the fine-tuned model.
+       processor ([`ProcessorMixin`], *optional*):
+           The processor used to pre- and post-process the data for multimodal models. If provided, will be used to
+           automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the
+           model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
        model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
            from a new instance of the model as given by this function.
@@ -375,6 +380,7 @@ def __init__(
        train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+       processor: Optional[ProcessorMixin] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
@@ -510,6 +516,10 @@ def __init__(
        ):
            self.place_model_on_device = False

+       self.tokenizer = processor if processor is not None else tokenizer
Collaborator:

Instead of setting self.tokenizer to processor, we should:

  • update all the references to self.tokenizer in the trainer to self.processor. This removes ambiguity for anyone reading the code
  • if tokenizer is passed in as an argument, raise a warning saying it's deprecated in favour of processor
  • add a tokenizer property which returns self.processor, alongside a warning saying self.tokenizer is deprecated
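A minimal sketch of the deprecation pattern described above; this is not part of the PR, and the warning wording is illustrative:

import warnings


class TrainerSketch:
    # Simplified stand-in for Trainer, showing only the proposed argument handling.
    def __init__(self, processor=None, tokenizer=None):
        if tokenizer is not None:
            warnings.warn(
                "`tokenizer` is deprecated in favour of `processor` and will be removed in a future version.",
                FutureWarning,
            )
            processor = processor if processor is not None else tokenizer
        self.processor = processor

    @property
    def tokenizer(self):
        # Backwards-compatible accessor so existing code reading `trainer.tokenizer` keeps working.
        warnings.warn(
            "`Trainer.tokenizer` is deprecated; use `Trainer.processor` instead.",
            FutureWarning,
        )
        return self.processor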

Contributor Author (@sanchit-gandhi, May 16, 2024):

Note that tokenizer is not deprecated. If we're fine-tuning LLMs, there's no notion of a processor, only a tokenizer. The processor is only relevant when we're training a multimodal model, such as an ASR model.

This is why we maintain the tokenizer attribute in the Trainer. What I propose we do is have two attributes:

  • self.tokenizer -> used for LLMs, where there is only a tokenizer. Will be None for multimodal models
  • self.processor -> used for multimodal models, where there is a processor. Will be None for LLMs
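A sketch of how the two-attribute layout could look; illustrative only, and the processing_class helper name below is hypothetical rather than something in the PR:

class TrainerSketch:
    # Simplified stand-in for Trainer.__init__ under the two-attribute proposal:
    # exactly one of the two attributes is set, the other stays None.
    def __init__(self, tokenizer=None, processor=None):
        # Text-only (LLM) fine-tuning: only `tokenizer` is set, `processor` is None.
        # Multimodal (e.g. ASR) fine-tuning: only `processor` is set, `tokenizer` is None.
        self.tokenizer = tokenizer if processor is None else None
        self.processor = processor

    @property
    def processing_class(self):
        # Hypothetical convenience: "whichever object preprocesses the data and gets saved".
        return self.processor if self.processor is not None else self.tokenizer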

Contributor:

I would much rather have self.processor :) Or, to be even clearer: self.multimodal_processor

+       if processor is not None and hasattr(processor, "feature_extractor"):
+           tokenizer = processor.feature_extractor
Contributor:

We should add a check here: if the user has passed in both tokenizer and processor, raise an error saying that only one of the two may be passed.
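A sketch of that guard; placement and error wording are illustrative:

def check_exclusive_preprocessor_args(tokenizer=None, processor=None):
    # Hypothetical helper that Trainer.__init__ could call before anything else.
    if tokenizer is not None and processor is not None:
        raise ValueError(
            "Both `tokenizer` and `processor` were passed to the Trainer. Please pass only one "
            "of the two; a processor already bundles a tokenizer and/or feature extractor."
        )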

Contributor Author:

IMO it's ok to let the user pass both and have the processor take precedence (there's no harm in this for the user).

Contributor:

However, we never save their original tokenizer. This can lead to confusion down the road because their tokenizer is essentially never used. I'd rather guard this explicitly.

Collaborator (@amyeroberts, May 16, 2024):

> IMO it's ok to let the user pass both and have the processor take precedence (there's no harm in this for the user)

I disagree here; it makes the behaviour ambiguous. In effect, this PR means we're deprecating the use of the tokenizer argument, so we should make it clear which argument is preferred and push the user towards that, or at least throw a warning.

Collaborator:

This is super audio-specific and can create surprising behaviour. If I passed in processor=processor, I would expect the processor to be used, not the feature extractor. Instead, if the previous scripts, e.g. examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py, want just the feature extractor to be passed in, then that should be specified when calling the trainer, i.e. processor=feature_extractor.

Contributor Author (@sanchit-gandhi, May 16, 2024):

Note that all this is doing is setting the padding method in the default data collator:

default_collator = (
    DataCollatorWithPadding(tokenizer)
    if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
    else default_data_collator
)

There's no pad method defined for processors, so the processor cannot be used here. Only sequence feature extractors and tokenizers have a pad method defined, so they are the only two viable options.

This is why we look for the corresponding attributes in the processor:

if hasattr(processor, "feature_extractor"):
    tokenizer = processor.feature_extractor
elif hasattr(processor, "tokenizer"):
    tokenizer = processor.tokenizer

Collaborator (@amyeroberts, May 16, 2024):

It doesn't just define padding behaviour. If tokenizer is set, then this object is also uploaded on push_to_hub calls. If we do tokenizer = processor.feature_extractor, then a user specifies a processor but only the feature extractor is uploaded.

Contributor Author:

Note here that we're setting:

tokenizer = processor.feature_extractor

Not:

self.tokenizer = processor.feature_extractor

=> the feature extractor is strictly used for padding purposes in the data collator, and is not set as an attribute of the trainer.

In fact, since we set:

self.tokenizer = processor

the processor is correctly the attribute that is both saved and pushed to the Hub.
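For the save/push behaviour being discussed, a simplified sketch of the relevant logic, not the exact Trainer._save code: whatever object ends up on self.tokenizer is the one whose save_pretrained runs, and therefore what gets pushed to the Hub.

def save_processing_object(trainer, output_dir: str) -> None:
    # Simplified: with this PR, `trainer.tokenizer` holds the full processor when one was passed,
    # so the processor (feature extractor + tokenizer) is what gets saved and uploaded,
    # not just the feature extractor used for collator padding.
    if trainer.tokenizer is not None:
        trainer.tokenizer.save_pretrained(output_dir)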

Contributor Author:

Agree, though, that this behaviour is somewhat "silent" to the user and can be improved on (will iterate on this once we have a design established).


        default_collator = (
            DataCollatorWithPadding(tokenizer)
            if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
@@ -518,7 +528,6 @@ def __init__(
        self.data_collator = data_collator if data_collator is not None else default_collator
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
-       self.tokenizer = tokenizer

        # Bnb Quantized models doesn't support `.to` operation.
        if (
3 changes: 3 additions & 0 deletions src/transformers/trainer_seq2seq.py
@@ -30,6 +30,7 @@
if TYPE_CHECKING:
    from .data.data_collator import DataCollator
    from .modeling_utils import PreTrainedModel
+   from .processing_utils import ProcessorMixin
    from .tokenization_utils_base import PreTrainedTokenizerBase
    from .trainer_callback import TrainerCallback
    from .trainer_utils import EvalPrediction, PredictionOutput
@@ -48,6 +49,7 @@ def __init__(
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+       processor: Optional["ProcessorMixin"] = None,
        model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
        compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
        callbacks: Optional[List["TrainerCallback"]] = None,
@@ -61,6 +63,7 @@ def __init__(
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
+           processor=processor,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,