From 8a6a2672cc029cd295acd3854189713d06b12cc0 Mon Sep 17 00:00:00 2001 From: Hannah Date: Tue, 10 Aug 2021 17:32:47 +0200 Subject: [PATCH 01/29] Implement AdapterTRainer with callbacks --- src/transformers/adapters/trainer.py | 230 +++++++++++++++++++++++++ src/transformers/data/datasets/glue.py | 62 +++---- src/transformers/trainer.py | 134 +++----------- tests/test_adapter_trainer.py | 3 +- tests/test_adapter_training.py | 5 +- 5 files changed, 291 insertions(+), 143 deletions(-) create mode 100644 src/transformers/adapters/trainer.py diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py new file mode 100644 index 0000000000..391356d338 --- /dev/null +++ b/src/transformers/adapters/trainer.py @@ -0,0 +1,230 @@ +import os +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data.dataset import Dataset + +from transformers import Trainer, PreTrainedModel +from ..configuration_utils import PretrainedConfig +from ..data.data_collator import DataCollator +from ..file_utils import ( + CONFIG_NAME, + WEIGHTS_NAME, + is_sagemaker_mp_enabled, +) +from ..modeling_utils import PreTrainedModel +from ..optimization import Adafactor, AdamW +from ..tokenization_utils_base import PreTrainedTokenizerBase +from ..trainer_callback import ( + TrainerCallback, + TrainerControl, + TrainerState, +) +from ..trainer_pt_utils import ( + get_parameter_names, +) +from ..trainer_utils import ( + EvalPrediction, + ShardedDDPOption, +) +from ..training_args import TrainingArguments + + +# Integrations must be imported before ML frameworks: + + +class AdapterTrainer(Trainer): + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + ): + # TODO allow additional callbacks + super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer=tokenizer, model_init=model_init, compute_metrics=compute_metrics, callbacks=[AdapterTrainerCallback(self)], optimizers=optimizers) + + if adapter_names is not None: + self.model.set_active_adapters(adapter_names) + # Set the defaults for loading/ saving model & adapters + if isinstance(self.model, PreTrainedModel): + model_freezed = getattr(self.model.base_model, "model_freezed", False) + else: + model_freezed = False + if model_freezed and self.model.active_adapters: + self.do_save_full_model = False + self.do_save_adapters = True + self.do_save_adapter_fusion = True + else: + self.do_save_full_model = True + self.do_save_adapters = False + self.do_save_adapter_fusion = False + # override with explicit setting + if do_save_full_model is not None: + self.do_save_full_model = do_save_full_model + if do_save_adapters is not None: + self.do_save_adapters = do_save_adapters + if do_save_adapter_fusion is not None: + self.do_save_adapter_fusion = do_save_adapter_fusion + + def 
create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. + """ + if self.optimizer is None: + decay_parameters = get_parameter_names(self.model, [nn.LayerNorm]) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + if hasattr(self.model, "config") and hasattr(self.model.config, "adapter_fusion_models"): + no_decay = [f"adapter_fusion_layer.{n}.value" for n in self.model.config.adapter_fusion_models] + decay_parameters = [name for name in decay_parameters if name not in no_decay] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in self.model.named_parameters() if n in decay_parameters], + "weight_decay": self.args.weight_decay, + }, + { + "params": [p for n, p in self.model.named_parameters() if n not in decay_parameters], + "weight_decay": 0.0, + }, + ] + optimizer_cls = Adafactor if self.args.adafactor else AdamW + if self.args.adafactor: + optimizer_cls = Adafactor + optimizer_kwargs = {"scale_parameter": False, "relative_step": False} + else: + optimizer_cls = AdamW + optimizer_kwargs = { + "betas": (self.args.adam_beta1, self.args.adam_beta2), + "eps": self.args.adam_epsilon, + } + optimizer_kwargs["lr"] = self.args.learning_rate + if self.sharded_ddp == ShardedDDPOption.SIMPLE: + self.optimizer = OSS( + params=optimizer_grouped_parameters, + optim=optimizer_cls, + **optimizer_kwargs, + ) + else: + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + + if is_sagemaker_mp_enabled(): + self.optimizer = smp.DistributedOptimizer(self.optimizer) + + +class AdapterTrainerCallback(TrainerCallback): + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + if self.trainer.do_save_adapters: + self.trainer.model.save_all_adapters(args.output_dir) + if self.trainer.do_save_adapter_fusion: + self.trainer.model.save_all_adapter_fusions(args.output_dir) + + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + model = kwargs.pop("model") + if args.load_best_model_at_end and state.best_model_checkpoint is not None: + if self.trainer.do_save_full_model: + logger.info( + f"Loading best model from {state.best_model_checkpoint} (score: {state.best_metric})." + ) + + best_model_path = os.path.join(state.best_model_checkpoint, WEIGHTS_NAME) + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(best_model_path, map_location="cpu") + # If the model is on the GPU, it still works! + self.trainer._load_state_dict_in_model(state_dict) + else: + logger.warn( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." + ) + if self.trainer.do_save_adapters: + #ToDo enable logger + # logger.info( + # f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." 
+ # ) + # attempt to re-load all adapters from checkpoint + for adapter in model.config.adapters.adapters: + adapter_dir = os.path.join(state.best_model_checkpoint, adapter) + if os.path.exists(adapter_dir): + model.load_adapter(adapter_dir) + if self.trainer.do_save_adapter_fusion: + logger.info( + f"Loading best adapter fusion(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + ) + # attempt to re-load all adapter fusions from checkpoint + fusion_models = getattr(self.model.config, "adapter_fusion_models", []) + for fusion in fusion_models: + fusion_dir = os.path.join(self.state.best_model_checkpoint, fusion) + if os.path.exists(fusion_dir): + self.model.load_adapter_fusion(fusion_dir) + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # apply adapter fusion weight regularization on the value matrix + model = kwargs.pop("model") + if ( + hasattr(model.config, "adapter_fusion") + and model.config.adapter_fusion["regularization"] + ): + fusion_reg_loss = model.base_model.get_fusion_regularization_loss() + fusion_reg_loss.backward() + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + if args.resume_from_checkpoint is not None: + if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + logger.info(f"Loading model from {resume_from_checkpoint}).") + elif self.do_save_full_model: + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): + config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) + checkpoint_version = config.transformers_version + if checkpoint_version is not None and checkpoint_version != __version__: + logger.warn( + f"You are resuming training from a checkpoint trained with {checkpoint_version} of " + f"Transformers but your current version is {__version__}. This is not recommended and could " + "yield to errors or unwanted behaviors." + ) + + if args.deepspeed: + # will be resumed in deepspeed_init + pass + else: + if self.do_save_full_model: + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + if self.do_save_adapters: + adapter_loaded = False + if os.path.isdir(resume_from_checkpoint): + for file_name in os.listdir(resume_from_checkpoint): + if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if "," in file_name: + self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) + adapter_loaded = True + else: + self.model.load_adapter( + os.path.join(os.path.join(resume_from_checkpoint, file_name)) + ) + adapter_loaded = True + + if not adapter_loaded: + raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 1ba786c384..9909807d17 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,38 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. 
lock_path = cached_features_file + ".lock" - with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) + # with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) - else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 8f67ae3b87..b4dba1724f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -177,6 +177,7 @@ if is_training_run_on_sagemaker(): logging.add_handler(StreamHandler(sys.stdout)) + if TYPE_CHECKING: import optuna @@ -271,10 +272,6 @@ def __init__( model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): if args is None: @@ -399,29 +396,6 @@ def __init__( if self.args.should_save: os.makedirs(self.args.output_dir, exist_ok=True) - if adapter_names is not None: - self.model.set_active_adapters(adapter_names) - # Set the defaults for loading/ saving model & adapters - if isinstance(self.model, PreTrainedModel): - model_freezed = getattr(self.model.base_model, "model_freezed", False) - else: - model_freezed = False - if model_freezed and self.model.active_adapters: - self.do_save_full_model = False - self.do_save_adapters = True - self.do_save_adapter_fusion = True - else: - self.do_save_full_model = True - self.do_save_adapters = False - self.do_save_adapter_fusion = False - # override with explicit setting - if do_save_full_model is not None: - self.do_save_full_model = do_save_full_model - if do_save_adapters is not None: - self.do_save_adapters = do_save_adapters - if do_save_adapter_fusion is not None: - self.do_save_adapter_fusion = do_save_adapter_fusion - if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)): raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).") @@ -806,9 +780,6 @@ def create_optimizer(self): if self.optimizer is None: decay_parameters = get_parameter_names(self.model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] - if hasattr(self.model, "config") and hasattr(self.model.config, "adapter_fusion_models"): - no_decay = [f"adapter_fusion_layer.{n}.value" for n in self.model.config.adapter_fusion_models] - decay_parameters = [name for name in decay_parameters if name not in no_decay] optimizer_grouped_parameters = [ { "params": [p for n, p in self.model.named_parameters() if n in decay_parameters], @@ -1076,11 +1047,11 @@ def train( raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") if resume_from_checkpoint is not None: - if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - logger.info(f"Loading model from {resume_from_checkpoint}).") - elif self.do_save_full_model: + if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + logger.info(f"Loading model from {resume_from_checkpoint}).") + if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) checkpoint_version = config.transformers_version @@ -1095,27 +1066,10 @@ def train( # will be 
resumed in deepspeed_init pass else: - if self.do_save_full_model: - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) - if self.do_save_adapters: - adapter_loaded = False - if os.path.isdir(resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," in file_name: - self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) - adapter_loaded = True - else: - self.model.load_adapter( - os.path.join(os.path.join(resume_from_checkpoint, file_name)) - ) - adapter_loaded = True - - if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) # If model was re-initialized, put it on the right device and update self.model_wrapped if model_reloaded: @@ -1324,14 +1278,6 @@ def train( steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch ): - # apply adapter fusion weight regularization on the value matrix - if ( - hasattr(self.model.config, "adapter_fusion") - and self.model.config.adapter_fusion["regularization"] - ): - fusion_reg_loss = self.model.base_model.get_fusion_regularization_loss() - fusion_reg_loss.backward() - # Gradient clipping if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: # deepspeed does its own clipping @@ -1408,43 +1354,21 @@ def train( elif args.local_rank != -1: dist.barrier() - if self.do_save_full_model: - logger.info( - f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - ) + logger.info( + f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + ) - best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) - if os.path.exists(best_model_path): - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(best_model_path, map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) - else: - logger.warn( - f"Could not locate the best model at {best_model_path}, if you are running a distributed training " - "on multiple nodes, you should activate `--save_on_each_node`." - ) - if self.do_save_adapters: - logger.info( - f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - ) - # attempt to re-load all adapters from checkpoint - for adapter in self.model.config.adapters.adapters: - adapter_dir = os.path.join(self.state.best_model_checkpoint, adapter) - if os.path.exists(adapter_dir): - self.model.load_adapter(adapter_dir) - if self.do_save_adapter_fusion: - logger.info( - f"Loading best adapter fusion(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. 
+ state_dict = torch.load(best_model_path, map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + else: + logger.warn( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." ) - # attempt to re-load all adapter fusions from checkpoint - fusion_models = getattr(self.model.config, "adapter_fusion_models", []) - for fusion in fusion_models: - fusion_dir = os.path.join(self.state.best_model_checkpoint, fusion) - if os.path.exists(fusion_dir): - self.model.load_adapter_fusion(fusion_dir) - if self.place_model_on_device: - self.model = self.model.to(self.args.device) if self.deepspeed: self.deepspeed.load_checkpoint( @@ -1973,12 +1897,7 @@ def _save_tpu(self, output_dir: Optional[str] = None): state_dict = self.model.state_dict() xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - if self.do_save_adapters: - self.model.save_all_adapters(output_dir) - if self.do_save_adapter_fusion: - self.model.save_all_adapter_fusions(output_dir) - if self.do_save_full_model: - self.model.save_pretrained(output_dir, save_config=self.args.should_save, save_function=xm.save) + self.model.save_pretrained(output_dir, save_config=self.args.should_save, save_function=xm.save) if self.tokenizer is not None and self.args.should_save: self.tokenizer.save_pretrained(output_dir) @@ -2000,12 +1919,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): state_dict = self.model.state_dict() torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - if self.do_save_adapters: - self.model.save_all_adapters(output_dir) - if self.do_save_adapter_fusion: - self.model.save_all_adapter_fusions(output_dir) - if self.do_save_full_model: - self.model.save_pretrained(output_dir, state_dict=state_dict) + self.model.save_pretrained(output_dir, state_dict=state_dict) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -2215,6 +2129,7 @@ def evaluation_loop( # if eval is called w/o train init deepspeed here if self.args.deepspeed and not self.deepspeed: + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) @@ -2624,6 +2539,7 @@ def prediction_loop( # if eval is called w/o train init deepspeed here if self.args.deepspeed and not self.deepspeed: + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 7914f45c2b..d1fc8c6165 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -9,9 +9,10 @@ BertForSequenceClassification, GlueDataset, GlueDataTrainingArguments, - Trainer, + #Trainer, TrainingArguments, ) +from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.composition import Fuse from transformers.testing_utils import slow diff --git a/tests/test_adapter_training.py b/tests/test_adapter_training.py index 4eef4815e8..69208137cc 100644 --- a/tests/test_adapter_training.py +++ b/tests/test_adapter_training.py @@ -8,9 +8,10 @@ AutoTokenizer, GlueDataset, GlueDataTrainingArguments, - Trainer, + #Trainer, TrainingArguments, ) +from 
transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.composition import Fuse from transformers.testing_utils import require_torch @@ -20,7 +21,7 @@ def filter_parameters(model, filter_string): @require_torch -class AdapterTrainingTestMixin: +class AdapterTrainingT1estMixin: def test_train_single_adapter(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: From dc1e1d86583ed7f995ee0381d112b1c31a6e6e53 Mon Sep 17 00:00:00 2001 From: Hannah Date: Tue, 10 Aug 2021 18:45:56 +0200 Subject: [PATCH 02/29] Fix typo --- src/transformers/data/datasets/glue.py | 62 +++++++++++++------------- tests/test_adapter_training.py | 2 +- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 9909807d17..1ba786c384 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,38 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" - # with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) - else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) + with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) + else: + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) diff --git a/tests/test_adapter_training.py b/tests/test_adapter_training.py index 69208137cc..36f249347b 100644 --- a/tests/test_adapter_training.py +++ b/tests/test_adapter_training.py @@ -21,7 +21,7 @@ def filter_parameters(model, filter_string): @require_torch -class AdapterTrainingT1estMixin: +class AdapterTrainingTestMixin: def test_train_single_adapter(self): tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False) if tokenizer.pad_token is None: From 3315d005560dda5d4193a1cb31e25cc06ab1b495 Mon Sep 17 00:00:00 2001 From: Hannah Date: Tue, 10 Aug 2021 18:55:28 +0200 Subject: [PATCH 03/29] Adjust import --- src/transformers/adapters/hub_mixin.py | 39 ++++++------- src/transformers/adapters/trainer.py | 78 ++++++++++++-------------- tests/test_adapter_hub.py | 4 +- tests/test_adapter_trainer.py | 5 +- tests/test_adapter_training.py | 5 +- 5 files changed, 60 insertions(+), 71 deletions(-) diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index 2325b4dcfa..46985393dd 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -10,35 +10,29 @@ DEFAULT_TEXT = "" ADAPTER_CARD_TEMPLATE = """ ---- -tags: -{tags} ---- +--- tags: {tags} --- # Adapter `{adapter_repo_name}` for {model_name} -An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} dataset{head_info}. +An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} +dataset{head_info}. -This adapter was created for usage with the **[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. +This adapter was created for usage with the +**[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. ## Usage First, install `adapter-transformers`: -``` -pip install -U adapter-transformers -``` -_Note: adapter-transformers is a fork of transformers that acts as a drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ +``` pip install -U adapter-transformers ``` _Note: adapter-transformers is a fork of transformers that acts as a +drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ Now, the adapter can be loaded and activated like this: -```python -from transformers import AutoModelWithHeads +```python from transformers import AutoModelWithHeads -model = AutoModelWithHeads.from_pretrained("{model_name}") -adapter_name = model.load_adapter("{adapter_repo_name}") -model.active_adapters = adapter_name -``` +model = AutoModelWithHeads.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}") +model.active_adapters = adapter_name ``` ## Architecture & Training @@ -124,7 +118,8 @@ def push_adapter_to_hub( use_auth_token: Union[bool, str] = True, overwrite_adapter_card: bool = False, ): - """Upload an adapter to HuggingFace's Model Hub. + """ + Upload an adapter to HuggingFace's Model Hub. Args: repo_name (str): The name of the repository on the model hub to upload to. @@ -132,10 +127,11 @@ def push_adapter_to_hub( organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. 
adapterhub_tag (str, optional): Tag of the format `/` for categorization on https://adapterhub.ml/explore/. - See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. - If not specified, `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. + See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, + `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. - If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to + None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. commit_message (:obj:`str`, `optional`): @@ -145,7 +141,8 @@ def push_adapter_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (:obj:`bool` or :obj:`str`, `optional`): The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token - generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to True. + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to + True. overwrite_adapter_card (bool, optional): Overwrite an existing adapter card with a newly generated one. If set to `False`, will only generate an adapter card, if none exists. Defaults to False. diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 391356d338..db7a52e7af 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -5,29 +5,17 @@ from torch import nn from torch.utils.data.dataset import Dataset -from transformers import Trainer, PreTrainedModel +from transformers import PreTrainedModel, Trainer + from ..configuration_utils import PretrainedConfig from ..data.data_collator import DataCollator -from ..file_utils import ( - CONFIG_NAME, - WEIGHTS_NAME, - is_sagemaker_mp_enabled, -) +from ..file_utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled from ..modeling_utils import PreTrainedModel from ..optimization import Adafactor, AdamW from ..tokenization_utils_base import PreTrainedTokenizerBase -from ..trainer_callback import ( - TrainerCallback, - TrainerControl, - TrainerState, -) -from ..trainer_pt_utils import ( - get_parameter_names, -) -from ..trainer_utils import ( - EvalPrediction, - ShardedDDPOption, -) +from ..trainer_callback import TrainerCallback, TrainerControl, TrainerState +from ..trainer_pt_utils import get_parameter_names +from ..trainer_utils import EvalPrediction, ShardedDDPOption from ..training_args import TrainingArguments @@ -36,24 +24,35 @@ class AdapterTrainer(Trainer): def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - 
do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): # TODO allow additional callbacks - super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer=tokenizer, model_init=model_init, compute_metrics=compute_metrics, callbacks=[AdapterTrainerCallback(self)], optimizers=optimizers) + super().__init__( + model, + args, + data_collator, + train_dataset, + eval_dataset, + tokenizer=tokenizer, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=[AdapterTrainerCallback(self)], + optimizers=optimizers, + ) if adapter_names is not None: self.model.set_active_adapters(adapter_names) @@ -140,9 +139,7 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr model = kwargs.pop("model") if args.load_best_model_at_end and state.best_model_checkpoint is not None: if self.trainer.do_save_full_model: - logger.info( - f"Loading best model from {state.best_model_checkpoint} (score: {state.best_metric})." - ) + logger.info(f"Loading best model from {state.best_model_checkpoint} (score: {state.best_metric}).") best_model_path = os.path.join(state.best_model_checkpoint, WEIGHTS_NAME) if os.path.exists(best_model_path): @@ -156,7 +153,7 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr "on multiple nodes, you should activate `--save_on_each_node`." ) if self.trainer.do_save_adapters: - #ToDo enable logger + # ToDo enable logger # logger.info( # f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." 
# ) @@ -179,10 +176,7 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): # apply adapter fusion weight regularization on the value matrix model = kwargs.pop("model") - if ( - hasattr(model.config, "adapter_fusion") - and model.config.adapter_fusion["regularization"] - ): + if hasattr(model.config, "adapter_fusion") and model.config.adapter_fusion["regularization"]: fusion_reg_loss = model.base_model.get_fusion_regularization_loss() fusion_reg_loss.backward() diff --git a/tests/test_adapter_hub.py b/tests/test_adapter_hub.py index 66ad8a82a5..0299591b2c 100644 --- a/tests/test_adapter_hub.py +++ b/tests/test_adapter_hub.py @@ -3,7 +3,7 @@ import numpy as np -from transformers import ( # get_adapter_config_hash, +from transformers import ( # get_adapter_config_hash,; Trainer, ADAPTER_CONFIG_MAP, AdapterConfig, AutoModel, @@ -12,11 +12,11 @@ BertModelWithHeads, GlueDataset, GlueDataTrainingArguments, - Trainer, TrainingArguments, get_adapter_config_hash, glue_compute_metrics, ) +from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.utils import find_in_index from transformers.testing_utils import require_torch diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index d1fc8c6165..aeebfef9c2 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -2,18 +2,17 @@ import torch -from transformers import ( +from transformers import ( # Trainer, AutoModelForSequenceClassification, AutoTokenizer, BertConfig, BertForSequenceClassification, GlueDataset, GlueDataTrainingArguments, - #Trainer, TrainingArguments, ) -from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.composition import Fuse +from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import slow diff --git a/tests/test_adapter_training.py b/tests/test_adapter_training.py index 36f249347b..60af35c29c 100644 --- a/tests/test_adapter_training.py +++ b/tests/test_adapter_training.py @@ -2,17 +2,16 @@ import torch -from transformers import ( +from transformers import ( # Trainer, AutoModelForSequenceClassification, AutoModelWithHeads, AutoTokenizer, GlueDataset, GlueDataTrainingArguments, - #Trainer, TrainingArguments, ) -from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.composition import Fuse +from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import require_torch From dc7bbab7d4292ebf4ad6489be768648230c8b01c Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 22 Aug 2021 10:25:02 +0200 Subject: [PATCH 04/29] Adapted run_translation for extended tests --- examples/translation/run_translation.py | 2 +- src/transformers/adapters/trainer.py | 225 ++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 889ade19fc..568602f33a 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -43,11 +43,11 @@ MBartTokenizer, MBartTokenizerFast, MultiLingAdapterArguments, - Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator, set_seed, ) +from transformers.adapters.trainer import AdapterSeq2SeqTrainer as Seq2SeqTrainer from transformers.trainer_utils import get_last_checkpoint from 
transformers.utils import check_min_version from transformers.utils.versions import require_version diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index db7a52e7af..f5e477a714 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -1,25 +1,34 @@ import os -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn +from torch.cuda.amp import autocast from torch.utils.data.dataset import Dataset -from transformers import PreTrainedModel, Trainer +from transformers import PreTrainedModel, Trainer, __version__ +from transformers.deepspeed import is_deepspeed_zero3_enabled +from transformers.dependency_versions_check import dep_version_check +from transformers.integrations import is_fairscale_available from ..configuration_utils import PretrainedConfig from ..data.data_collator import DataCollator -from ..file_utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled +from ..file_utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled, logger from ..modeling_utils import PreTrainedModel from ..optimization import Adafactor, AdamW from ..tokenization_utils_base import PreTrainedTokenizerBase from ..trainer_callback import TrainerCallback, TrainerControl, TrainerState from ..trainer_pt_utils import get_parameter_names -from ..trainer_utils import EvalPrediction, ShardedDDPOption +from ..trainer_utils import EvalPrediction, PredictionOutput, ShardedDDPOption from ..training_args import TrainingArguments -# Integrations must be imported before ML frameworks: +if is_fairscale_available(): + dep_version_check("fairscale") + from fairscale.optim import OSS + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp class AdapterTrainer(Trainer): @@ -182,13 +191,13 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): if args.resume_from_checkpoint is not None: - if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - logger.info(f"Loading model from {resume_from_checkpoint}).") + if os.path.isfile(os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME)): + logger.info(f"Loading model from {args.resume_from_checkpoint}).") elif self.do_save_full_model: - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + raise ValueError(f"Can't find a valid checkpoint at {args.resume_from_checkpoint}") - if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) + if os.path.isfile(os.path.join(args.resume_from_checkpoint, CONFIG_NAME)): + config = PretrainedConfig.from_json_file(os.path.join(args.resume_from_checkpoint, CONFIG_NAME)) checkpoint_version = config.transformers_version if checkpoint_version is not None and checkpoint_version != __version__: logger.warn( @@ -208,17 +217,201 @@ def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: self._load_state_dict_in_model(state_dict) if self.do_save_adapters: adapter_loaded = False - if os.path.isdir(resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if os.path.isdir(args.resume_from_checkpoint): + for file_name in 
os.listdir(args.resume_from_checkpoint): + if os.path.isdir(os.path.join(args.resume_from_checkpoint, file_name)): if "," in file_name: - self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) + self.model.load_adapter_fusion( + os.path.join(args.resume_from_checkpoint, file_name) + ) adapter_loaded = True else: self.model.load_adapter( - os.path.join(os.path.join(resume_from_checkpoint, file_name)) + os.path.join(os.path.join(args.resume_from_checkpoint, file_name)) ) adapter_loaded = True if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) + raise Exception("Can't find a valid checkpoint at {}".format(args.resume_from_checkpoint)) + + +class AdapterSeq2SeqTrainer(AdapterTrainer): + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init :obj:`compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (:obj:`Dataset`, `optional`): + Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, + columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the + :obj:`__len__` method. + ignore_keys (:obj:`List[str]`, `optional`): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is ``"eval"`` (default) + max_length (:obj:`int`, `optional`): + The maximum target length to use when predicting with the generate method. + num_beams (:obj:`int`, `optional`): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def predict( + self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in :obj:`evaluate()`. + + Args: + test_dataset (:obj:`Dataset`): + Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the + ``model.forward()`` method are automatically removed. Has to implement the method :obj:`__len__` + ignore_keys (:obj:`List[str]`, `optional`): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. 
+ metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is ``"eval"`` (default) + max_length (:obj:`int`, `optional`): + The maximum target length to use when predicting with the generate method. + num_beams (:obj:`int`, `optional`): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + + .. note:: + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. + + Returns: `NamedTuple` A namedtuple with the following keys: + + - predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`. + - label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some). + - metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset + contained labels). + """ + self._max_length = max_length + self._num_beams = num_beams + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # XXX: adapt synced_gpus for fairscale as well + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + "synced_gpus": True if is_deepspeed_zero3_enabled() else False, + } + + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is None: + raise ValueError( + f"Tensor need to be padded to `max_length={max_length}` but no tokenizer was passed when creating " + "this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer." 
+ ) + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor From e3a2c52e8ee1333522f620af2c0c5c9876910c1a Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 22 Aug 2021 10:43:45 +0200 Subject: [PATCH 05/29] Allowed additional callbacks and logging --- src/transformers/adapters/trainer.py | 226 +++---------------------- src/transformers/data/datasets/glue.py | 62 +++---- 2 files changed, 54 insertions(+), 234 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index f5e477a714..6ed41e167d 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -6,7 +6,7 @@ from torch.cuda.amp import autocast from torch.utils.data.dataset import Dataset -from transformers import PreTrainedModel, Trainer, __version__ +from transformers import PreTrainedModel, Trainer, __version__, Seq2SeqTrainer from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available @@ -22,7 +22,6 @@ from ..trainer_utils import EvalPrediction, PredictionOutput, ShardedDDPOption from ..training_args import TrainingArguments - if is_fairscale_available(): dep_version_check("fairscale") from fairscale.optim import OSS @@ -33,23 +32,22 @@ class AdapterTrainer(Trainer): def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): - # TODO allow additional callbacks super().__init__( model, args, @@ -59,7 +57,7 @@ def __init__( tokenizer=tokenizer, model_init=model_init, compute_metrics=compute_metrics, - callbacks=[AdapterTrainerCallback(self)], + callbacks=[AdapterTrainerCallback(self)] + callbacks if callbacks else [AdapterTrainerCallback(self)], optimizers=optimizers, ) @@ -163,9 +161,9 
@@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr ) if self.trainer.do_save_adapters: # ToDo enable logger - # logger.info( - # f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." - # ) + logger.info( + f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + ) # attempt to re-load all adapters from checkpoint for adapter in model.config.adapters.adapters: adapter_dir = os.path.join(state.best_model_checkpoint, adapter) @@ -212,7 +210,7 @@ def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: else: if self.do_save_full_model: # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + state_dict = torch.load(os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") # If the model is on the GPU, it still works! self._load_state_dict_in_model(state_dict) if self.do_save_adapters: @@ -235,183 +233,5 @@ def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: raise Exception("Can't find a valid checkpoint at {}".format(args.resume_from_checkpoint)) -class AdapterSeq2SeqTrainer(AdapterTrainer): - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - max_length: Optional[int] = None, - num_beams: Optional[int] = None, - ) -> Dict[str, float]: - """ - Run evaluation and returns metrics. - - The calling script will be responsible for providing a method to compute metrics, as they are task-dependent - (pass it to the init :obj:`compute_metrics` argument). - - You can also subclass and override this method to inject custom behavior. - - Args: - eval_dataset (:obj:`Dataset`, `optional`): - Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, - columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the - :obj:`__len__` method. - ignore_keys (:obj:`List[str]`, `optional`): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is ``"eval"`` (default) - max_length (:obj:`int`, `optional`): - The maximum target length to use when predicting with the generate method. - num_beams (:obj:`int`, `optional`): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - self._max_length = max_length - self._num_beams = num_beams - return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - max_length: Optional[int] = None, - num_beams: Optional[int] = None, - ) -> PredictionOutput: - """ - Run prediction and returns predictions and potential metrics. - - Depending on the dataset and your use case, your test dataset may contain labels. 
In that case, this method - will also return metrics, like in :obj:`evaluate()`. - - Args: - test_dataset (:obj:`Dataset`): - Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the - ``model.forward()`` method are automatically removed. Has to implement the method :obj:`__len__` - ignore_keys (:obj:`List[str]`, `optional`): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is ``"eval"`` (default) - max_length (:obj:`int`, `optional`): - The maximum target length to use when predicting with the generate method. - num_beams (:obj:`int`, `optional`): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - - .. note:: - - If your predictions or labels have different sequence lengths (for instance because you're doing dynamic - padding in a token classification task) the predictions will be padded (on the right) to allow for - concatenation into one array. The padding index is -100. - - Returns: `NamedTuple` A namedtuple with the following keys: - - - predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`. - - label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some). - - metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset - contained labels). - """ - self._max_length = max_length - self._num_beams = num_beams - return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on :obj:`model` using obj:`inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (:obj:`nn.Module`): - The model to evaluate. - inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument :obj:`labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (:obj:`bool`): - Whether or not to return the loss only. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). 
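
The `callbacks` argument is now forwarded to the base `Trainer` again (see the constructor change earlier in this patch), so user-supplied callbacks can be combined with the built-in `AdapterTrainerCallback`, which is always prepended. A minimal usage sketch; `model`, `training_args`, `train_dataset` and `eval_dataset` are assumed to be defined elsewhere, and the early-stopping settings are illustrative:

```python
from transformers import EarlyStoppingCallback
from transformers.adapters.trainer import AdapterTrainer

# Extra callbacks are appended after the internally added AdapterTrainerCallback.
trainer = AdapterTrainer(
    model=model,                # a model with an active adapter setup
    args=training_args,         # assumed to configure evaluation and metric_for_best_model
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
```
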
- """ - - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = { - "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, - "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, - "synced_gpus": True if is_deepspeed_zero3_enabled() else False, - } - - generated_tokens = self.model.generate( - inputs["input_ids"], - attention_mask=inputs["attention_mask"], - **gen_kwargs, - ) - # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) - - with torch.no_grad(): - if self.use_amp: - with autocast(): - outputs = model(**inputs) - else: - outputs = model(**inputs) - if has_labels: - if self.label_smoother is not None: - loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() - else: - loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() - else: - loss = None - - if self.args.prediction_loss_only: - return (loss, None, None) - - labels = inputs["labels"] - if labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - - return (loss, generated_tokens, labels) - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is None: - raise ValueError( - f"Tensor need to be padded to `max_length={max_length}` but no tokenizer was passed when creating " - "this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer." - ) - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor +class AdapterSeq2SeqTrainer(AdapterTrainer, Seq2SeqTrainer): + pass diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 1ba786c384..cb5d0037cd 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,38 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. 
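
With the duplicated generation logic removed, `AdapterSeq2SeqTrainer` reduces to a mixin-style class combining `AdapterTrainer` and `Seq2SeqTrainer`. A small sketch of how the method resolution order plays out; nothing is assumed beyond the two classes from this patch:

```python
from transformers.adapters.trainer import AdapterSeq2SeqTrainer

# AdapterTrainer comes first in the MRO, so adapter-aware optimizer creation and the
# adapter checkpoint hooks win, while the generate()-based evaluate()/predict()/
# prediction_step() are inherited unchanged from Seq2SeqTrainer.
print([cls.__name__ for cls in AdapterSeq2SeqTrainer.__mro__])
# ['AdapterSeq2SeqTrainer', 'AdapterTrainer', 'Seq2SeqTrainer', 'Trainer', 'object']
```
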
lock_path = cached_features_file + ".lock" - with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) + #with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) - else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) From 4ae0c66fd4a8c931498132d46616426a625b9736 Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 22 Aug 2021 10:54:05 +0200 Subject: [PATCH 06/29] style --- src/transformers/adapters/trainer.py | 37 ++++++++++++++------------ src/transformers/data/datasets/glue.py | 10 +++---- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 6ed41e167d..9b64a3f072 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -6,7 +6,7 @@ from torch.cuda.amp import autocast from torch.utils.data.dataset import Dataset -from transformers import PreTrainedModel, Trainer, __version__, Seq2SeqTrainer +from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__ from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available @@ -22,6 +22,7 @@ from ..trainer_utils import EvalPrediction, PredictionOutput, ShardedDDPOption from ..training_args import TrainingArguments + if is_fairscale_available(): dep_version_check("fairscale") from fairscale.optim import OSS @@ -32,21 +33,21 @@ class AdapterTrainer(Trainer): def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): super().__init__( model, @@ -210,7 +211,9 @@ def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: else: if self.do_save_full_model: # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + state_dict = torch.load( + os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu" + ) # If the model is on the GPU, it still works! 
self._load_state_dict_in_model(state_dict) if self.do_save_adapters: diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index cb5d0037cd..43b63e5cf4 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,14 +118,12 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" - #with FileLock(lock_path): + # with FileLock(lock_path): if os.path.exists(cached_features_file) and not args.overwrite_cache: start = time.time() self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) + logger.info(f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start) else: logger.info(f"Creating features from dataset file at {args.data_dir}") @@ -147,9 +145,7 @@ def __init__( start = time.time() torch.save(self.features, cached_features_file) # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + logger.info(f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]") def __len__(self): return len(self.features) From bce7daa0e2ed210ab9c59c803c68d55aacb7bd02 Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 22 Aug 2021 11:03:10 +0200 Subject: [PATCH 07/29] Add changes on develop --- src/transformers/adapters/hub_mixin.py | 39 +++++++++-------- src/transformers/data/datasets/glue.py | 58 ++++++++++++++------------ tests/test_adapter_hub.py | 2 +- tests/test_adapter_trainer.py | 2 +- tests/test_adapter_training.py | 2 +- 5 files changed, 55 insertions(+), 48 deletions(-) diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index 46985393dd..2325b4dcfa 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -10,29 +10,35 @@ DEFAULT_TEXT = "" ADAPTER_CARD_TEMPLATE = """ ---- tags: {tags} --- +--- +tags: +{tags} +--- # Adapter `{adapter_repo_name}` for {model_name} -An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} -dataset{head_info}. +An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} dataset{head_info}. -This adapter was created for usage with the -**[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. +This adapter was created for usage with the **[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. ## Usage First, install `adapter-transformers`: -``` pip install -U adapter-transformers ``` _Note: adapter-transformers is a fork of transformers that acts as a -drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ +``` +pip install -U adapter-transformers +``` +_Note: adapter-transformers is a fork of transformers that acts as a drop-in replacement with adapter support. 
[More](https://docs.adapterhub.ml/installation.html)_ Now, the adapter can be loaded and activated like this: -```python from transformers import AutoModelWithHeads +```python +from transformers import AutoModelWithHeads -model = AutoModelWithHeads.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}") -model.active_adapters = adapter_name ``` +model = AutoModelWithHeads.from_pretrained("{model_name}") +adapter_name = model.load_adapter("{adapter_repo_name}") +model.active_adapters = adapter_name +``` ## Architecture & Training @@ -118,8 +124,7 @@ def push_adapter_to_hub( use_auth_token: Union[bool, str] = True, overwrite_adapter_card: bool = False, ): - """ - Upload an adapter to HuggingFace's Model Hub. + """Upload an adapter to HuggingFace's Model Hub. Args: repo_name (str): The name of the repository on the model hub to upload to. @@ -127,11 +132,10 @@ def push_adapter_to_hub( organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. adapterhub_tag (str, optional): Tag of the format `/` for categorization on https://adapterhub.ml/explore/. - See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, - `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. + See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. + If not specified, `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. - If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to - None. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. commit_message (:obj:`str`, `optional`): @@ -141,8 +145,7 @@ def push_adapter_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (:obj:`bool` or :obj:`str`, `optional`): The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token - generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to - True. + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to True. overwrite_adapter_card (bool, optional): Overwrite an existing adapter card with a newly generated one. If set to `False`, will only generate an adapter card, if none exists. Defaults to False. diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 43b63e5cf4..1ba786c384 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,34 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. 
lock_path = cached_features_file + ".lock" - # with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info(f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start) - else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) + with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info(f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]") + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) + else: + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) diff --git a/tests/test_adapter_hub.py b/tests/test_adapter_hub.py index 0299591b2c..dafba1ecf1 100644 --- a/tests/test_adapter_hub.py +++ b/tests/test_adapter_hub.py @@ -3,7 +3,7 @@ import numpy as np -from transformers import ( # get_adapter_config_hash,; Trainer, +from transformers import ( # get_adapter_config_hash, ADAPTER_CONFIG_MAP, AdapterConfig, AutoModel, diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index aeebfef9c2..f62f1f9af6 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -2,7 +2,7 @@ import torch -from transformers import ( # Trainer, +from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, BertConfig, diff --git a/tests/test_adapter_training.py b/tests/test_adapter_training.py index 60af35c29c..b68ac47411 100644 --- a/tests/test_adapter_training.py +++ b/tests/test_adapter_training.py @@ -2,7 +2,7 @@ import torch -from transformers import ( # Trainer, +from transformers import ( AutoModelForSequenceClassification, AutoModelWithHeads, AutoTokenizer, From 528bd51edbaa59dd7637793b019ff9a3a3f89757 Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 22 Aug 2021 11:08:13 +0200 Subject: [PATCH 08/29] Quality --- src/transformers/adapters/trainer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 9b64a3f072..be4b33a311 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -1,13 +1,11 @@ import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn -from torch.cuda.amp import autocast from torch.utils.data.dataset import Dataset from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__ -from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available @@ -19,7 +17,7 @@ from ..tokenization_utils_base import PreTrainedTokenizerBase from ..trainer_callback import TrainerCallback, TrainerControl, TrainerState from ..trainer_pt_utils import get_parameter_names -from ..trainer_utils import EvalPrediction, PredictionOutput, ShardedDDPOption +from ..trainer_utils import EvalPrediction, ShardedDDPOption from ..training_args import TrainingArguments @@ -108,7 +106,6 @@ def create_optimizer(self): "weight_decay": 0.0, }, ] - optimizer_cls = Adafactor if self.args.adafactor else AdamW if self.args.adafactor: optimizer_cls = Adafactor optimizer_kwargs = {"scale_parameter": False, "relative_step": False} From fd0166a82f95c4a8ab38127d96e18a6a10473207 Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 23 Aug 2021 09:01:56 +0200 Subject: [PATCH 09/29] Quality --- src/transformers/adapters/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index be4b33a311..dccde1a1e8 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -12,7 +12,6 @@ from ..configuration_utils import PretrainedConfig from ..data.data_collator import DataCollator from ..file_utils import CONFIG_NAME, WEIGHTS_NAME, is_sagemaker_mp_enabled, logger -from ..modeling_utils 
import PreTrainedModel from ..optimization import Adafactor, AdamW from ..tokenization_utils_base import PreTrainedTokenizerBase from ..trainer_callback import TrainerCallback, TrainerControl, TrainerState From 21b648c277ec80bd07255fb6e29d52271ce789b1 Mon Sep 17 00:00:00 2001 From: hSterz Date: Tue, 24 Aug 2021 10:37:54 +0200 Subject: [PATCH 10/29] Quality --- src/transformers/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7d7455604c..b4dba1724f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -55,7 +55,6 @@ from torch.utils.data.sampler import RandomSampler, SequentialSampler from . import __version__ -from .adapters.composition import AdapterCompositionBlock, Fuse from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow From 47935b49a1efb457dd6f6f1367836a7b84c0e836 Mon Sep 17 00:00:00 2001 From: hSterz Date: Sun, 5 Sep 2021 11:18:25 +0200 Subject: [PATCH 11/29] Overwrite save method --- src/transformers/adapters/trainer.py | 40 +++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 229a83eb23..1b7e404083 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -1,7 +1,10 @@ import os +import random import re +import warnings from typing import Callable, Dict, List, Optional, Tuple, Union +import numpy as np import torch from torch import nn from torch.utils.data.dataset import Dataset @@ -10,6 +13,7 @@ from transformers.adapters.composition import AdapterCompositionBlock, Fuse from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available +from transformers.modeling_utils import unwrap_model from ..configuration_utils import PretrainedConfig from ..data.data_collator import DataCollator @@ -137,18 +141,42 @@ def create_optimizer(self): if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) + def _save(self, output_dir: Optional[str] = None, state_dict=None): + # If we are executing this function, we are the process zero, so we don't check for that. + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + # Save a trained model and configuration using `save_pretrained()`. 
+ # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, PreTrainedModel): + if isinstance(unwrap_model(self.model), PreTrainedModel): + if state_dict is None: + state_dict = self.model.state_dict() + unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + if state_dict is None: + state_dict = self.model.state_dict() + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + if self.do_save_adapters: + self.model.save_all_adapters(output_dir) + if self.do_save_adapter_fusion: + self.model.save_all_adapter_fusions(output_dir) + if self.do_save_full_model: + self.model.save_pretrained(output_dir, state_dict=state_dict) + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + class AdapterTrainerCallback(TrainerCallback): def __init__(self, trainer): super().__init__() self.trainer = trainer - def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - if self.trainer.do_save_adapters: - self.trainer.model.save_all_adapters(args.output_dir) - if self.trainer.do_save_adapter_fusion: - self.trainer.model.save_all_adapter_fusions(args.output_dir) - def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): model = kwargs.pop("model") if args.load_best_model_at_end and state.best_model_checkpoint is not None: From fbde693477cdef8b1a75d2a8c82dd5b7d2005230 Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 00:36:14 +0200 Subject: [PATCH 12/29] Overwriting _save() and creating and overwriting _load() --- src/transformers/adapters/trainer.py | 43 ++++++++++++++++++++++++ src/transformers/trainer.py | 50 +++++++++++++++------------- 2 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 1b7e404083..cafa899908 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -171,6 +171,49 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + def _load(self, resume_from_checkpoint): + args = self.args + if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + logger.info(f"Loading model from {resume_from_checkpoint}).") + elif self.do_save_full_model: + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): + config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) + checkpoint_version = config.transformers_version + if checkpoint_version is not None and checkpoint_version != __version__: + logger.warn( + f"You are resuming training from a checkpoint trained with {checkpoint_version} of " + f"Transformers but your current version is {__version__}. This is not recommended and could " + "yield to errors or unwanted behaviors." + ) + + if args.deepspeed: + # will be resumed in deepspeed_init + pass + else: + if self.do_save_full_model: + # We load the model state dict on the CPU to avoid an OOM error. 
+ state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + if self.do_save_adapters: + adapter_loaded = False + if os.path.isdir(resume_from_checkpoint): + for file_name in os.listdir(resume_from_checkpoint): + if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if "," in file_name: + self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) + adapter_loaded = True + else: + self.model.load_adapter( + os.path.join(os.path.join(resume_from_checkpoint, file_name)) + ) + adapter_loaded = True + + if not adapter_loaded: + raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) + class AdapterTrainerCallback(TrainerCallback): def __init__(self, trainer): diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index b4dba1724f..2e9ce61350 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1047,29 +1047,7 @@ def train( raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") if resume_from_checkpoint is not None: - if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") - - logger.info(f"Loading model from {resume_from_checkpoint}).") - - if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) - checkpoint_version = config.transformers_version - if checkpoint_version is not None and checkpoint_version != __version__: - logger.warn( - f"You are resuming training from a checkpoint trained with {checkpoint_version} of " - f"Transformers but your current version is {__version__}. This is not recommended and could " - "yield to errors or unwanted behaviors." - ) - - if args.deepspeed: - # will be resumed in deepspeed_init - pass - else: - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) + self._load(resume_from_checkpoint) # If model was re-initialized, put it on the right device and update self.model_wrapped if model_reloaded: @@ -1901,6 +1879,32 @@ def _save_tpu(self, output_dir: Optional[str] = None): if self.tokenizer is not None and self.args.should_save: self.tokenizer.save_pretrained(output_dir) + def _load(self, resume_from_checkpoint): + args = self.args + if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + logger.info(f"Loading model from {resume_from_checkpoint}).") + + if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): + config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) + checkpoint_version = config.transformers_version + if checkpoint_version is not None and checkpoint_version != __version__: + logger.warn( + f"You are resuming training from a checkpoint trained with {checkpoint_version} of " + f"Transformers but your current version is {__version__}. This is not recommended and could " + "yield to errors or unwanted behaviors." 
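
Taken together, the overridden `_save()` and `_load()` give adapter-only checkpoints a complete round trip. A usage sketch of the new behaviour; `trainer` is assumed to be an `AdapterTrainer` built on a frozen base model with active adapters, and the output path is illustrative:

```python
# With a frozen base model, do_save_adapters defaults to True, so save_model() and each
# training checkpoint write one sub-directory per adapter, plus one per fusion layer
# (its folder name contains the fused adapter names separated by commas), instead of a
# full pytorch_model.bin.
trainer.save_model("./mrpc_adapter_run")

# On resume, the overridden _load() walks the checkpoint directory: every sub-folder is
# passed to load_adapter(), or to load_adapter_fusion() if its name contains a comma.
trainer.train(resume_from_checkpoint=True)  # True -> last checkpoint in args.output_dir
```
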
+ ) + + if args.deepspeed: + # will be resumed in deepspeed_init + pass + else: + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + def _save(self, output_dir: Optional[str] = None, state_dict=None): # If we are executing this function, we are the process zero, so we don't check for that. output_dir = output_dir if output_dir is not None else self.args.output_dir From 0f9e68b0a9f61cf3bb92cf711e0fe32cb3abaaf1 Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 00:58:13 +0200 Subject: [PATCH 13/29] Style --- src/transformers/adapters/trainer.py | 7 +------ src/transformers/trainer.py | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index cafa899908..06ed69a874 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -1,10 +1,7 @@ import os -import random import re -import warnings from typing import Callable, Dict, List, Optional, Tuple, Union -import numpy as np import torch from torch import nn from torch.utils.data.dataset import Dataset @@ -206,9 +203,7 @@ def _load(self, resume_from_checkpoint): self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) adapter_loaded = True else: - self.model.load_adapter( - os.path.join(os.path.join(resume_from_checkpoint, file_name)) - ) + self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) adapter_loaded = True if not adapter_loaded: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 2e9ce61350..b00b214aae 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -177,7 +177,6 @@ if is_training_run_on_sagemaker(): logging.add_handler(StreamHandler(sys.stdout)) - if TYPE_CHECKING: import optuna @@ -2133,7 +2132,6 @@ def evaluation_loop( # if eval is called w/o train init deepspeed here if self.args.deepspeed and not self.deepspeed: - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) @@ -2543,7 +2541,6 @@ def prediction_loop( # if eval is called w/o train init deepspeed here if self.args.deepspeed and not self.deepspeed: - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) From bf31ebc2176486f2bb31cf5786f86f18d93f3d4a Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 11:19:14 +0200 Subject: [PATCH 14/29] Added automatic saving an dloading of heads to trainer --- src/transformers/adapters/model_mixin.py | 5 + src/transformers/adapters/trainer.py | 81 ++++++++++------ src/transformers/data/datasets/glue.py | 62 ++++++------ tests/test_adapter_phm.py | 116 +++++++++++++++++++++++ tests/test_adapter_trainer.py | 74 ++++++++++++++- 5 files changed, 276 insertions(+), 62 deletions(-) create mode 100644 tests/test_adapter_phm.py diff --git a/src/transformers/adapters/model_mixin.py b/src/transformers/adapters/model_mixin.py index 6179b901ae..1d9d667d0f 100644 --- a/src/transformers/adapters/model_mixin.py +++ b/src/transformers/adapters/model_mixin.py @@ -653,6 +653,11 @@ def 
save_all_adapters( custom_weights_loaders=custom_weights_loaders, ) + def save_all_heads(self, save_directory): + for head_name in self.heads: + save_path = join(save_directory, head_name) + self.save_head(save_path, head_name) + def get_labels(self): return list(self.config.id2label.values()) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 06ed69a874..b32fa1be64 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -6,7 +6,7 @@ from torch import nn from torch.utils.data.dataset import Dataset -from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__ +from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__, ModelWithHeadsAdaptersMixin from transformers.adapters.composition import AdapterCompositionBlock, Fuse from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available @@ -22,7 +22,6 @@ from ..trainer_utils import EvalPrediction, ShardedDDPOption from ..training_args import TrainingArguments - if is_fairscale_available(): dep_version_check("fairscale") from fairscale.optim import OSS @@ -33,21 +32,21 @@ class AdapterTrainer(Trainer): def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): super().__init__( model, @@ -62,6 +61,9 @@ def __init__( optimizers=optimizers, ) + # Setting this to True can lead to unexpected behaviour with adapters + self.args.remove_unused_columns = False + if adapter_names is not None: self.model.set_active_adapters(adapter_names) # Set the defaults for loading/ saving model & adapters @@ -72,9 +74,9 @@ def __init__( if model_freezed and self.model.active_adapters: # Check if training AdapterFusion self.train_adapter_fusion = ( - isinstance(self.model.active_adapters, Fuse) - or isinstance(self.model.active_adapters, AdapterCompositionBlock) - and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) + isinstance(self.model.active_adapters, Fuse) + or isinstance(self.model.active_adapters, AdapterCompositionBlock) + and 
any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) ) # Configure model saving self.do_save_full_model = False @@ -162,6 +164,8 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): self.model.save_all_adapter_fusions(output_dir) if self.do_save_full_model: self.model.save_pretrained(output_dir, state_dict=state_dict) + if hasattr(self.model, "heads"): + self.model.save_all_heads(output_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) @@ -195,20 +199,39 @@ def _load(self, resume_from_checkpoint): # If the model is on the GPU, it still works! self._load_state_dict_in_model(state_dict) if self.do_save_adapters: - adapter_loaded = False if os.path.isdir(resume_from_checkpoint): - for file_name in os.listdir(resume_from_checkpoint): - if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): - if "," in file_name: - self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) - adapter_loaded = True - else: - self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) - adapter_loaded = True + adapter_loaded = self._load_adapters(resume_from_checkpoint) + self._load_adapter_fusions(resume_from_checkpoint) + # Save all heads for a model with heads + if hasattr(self.model, "heads"): + self._load_heads(resume_from_checkpoint) if not adapter_loaded: raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) + def _load_adapters(self, resume_from_checkpoint): + adapter_loaded = False + for file_name in os.listdir(resume_from_checkpoint): + if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if "," not in file_name and "adapter_config.json" in os.listdir( + os.path.join(resume_from_checkpoint, file_name)): + self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) + adapter_loaded = True + return adapter_loaded + + def _load_adapter_fusions(self, resume_from_checkpoint): + for file_name in os.listdir(resume_from_checkpoint): + if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if "," in file_name: + self.model.load_adapter_fusion(os.path.join(resume_from_checkpoint, file_name)) + + def _load_heads(self, resume_from_checkpoint): + for file_name in os.listdir(resume_from_checkpoint): + if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): + if "," not in file_name and "head_config.json" in os.listdir( + os.path.join(resume_from_checkpoint, file_name)): + self.model.load_head(os.path.join(resume_from_checkpoint, file_name)) + class AdapterTrainerCallback(TrainerCallback): def __init__(self, trainer): diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 1ba786c384..9909807d17 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,38 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. 
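
A minimal sketch of the prediction-head handling added in this patch, assuming a model with heads; the checkpoint layout described in the comments follows `save_all_heads()`/`_load_heads()` above, and `train_dataset` as well as the output paths are illustrative:

```python
from transformers import AutoModelWithHeads, TrainingArguments
from transformers.adapters.trainer import AdapterTrainer

model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
model.add_adapter("mrpc")
model.add_classification_head("mrpc", num_labels=2)
model.train_adapter("mrpc")

training_args = TrainingArguments(output_dir="./mrpc_run", do_train=True)
trainer = AdapterTrainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

# save_all_heads() writes one folder per prediction head next to the adapter folders;
# on resume, _load_heads() reloads every checkpoint sub-folder that contains a
# head_config.json via model.load_head().
trainer.save_model("./mrpc_run")
```
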
lock_path = cached_features_file + ".lock" - with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) + # with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) - else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. + logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) diff --git a/tests/test_adapter_phm.py b/tests/test_adapter_phm.py new file mode 100644 index 0000000000..dc15fc53a3 --- /dev/null +++ b/tests/test_adapter_phm.py @@ -0,0 +1,116 @@ +import copy +import random +import unittest + +import torch +from torch import Tensor + +from tests.test_adapter_training import filter_parameters +from transformers import ( + AutoModel, + AutoModelWithHeads, + AutoTokenizer, + GlueDataset, + GlueDataTrainingArguments, + Trainer, + TrainingArguments, +) +from transformers.adapters.configuration import CompactorConfig, PfeifferCompactConfig + + +def shared_parameters(param: Tensor): + def getter(): + return param + + return getter + + +class TestSaveLabel(unittest.TestCase): + def get_input_samples(self, shape, vocab_size=5000, config=None): + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(random.randint(0, vocab_size - 1)) + input_ids = torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() + # this is needed e.g. 
for BART + if config and config.eos_token_id is not None: + input_ids[input_ids == config.eos_token_id] = random.randint(0, config.eos_token_id - 1) + input_ids[:, -1] = config.eos_token_id + + return input_ids + + def test_model(self): + name = "compactor" + compactor_config = CompactorConfig(phm_dim=16) + adapter_config = PfeifferCompactConfig(compactor=compactor_config) + model = AutoModel.from_pretrained("bert-base-uncased") + model.add_adapter(name, config=adapter_config) + self.assertTrue(name in model.config.adapters) + self.assertEqual(adapter_config, model.config.adapters.get(name)) + + model.train_adapter(name) + self.assertEqual(set([name]), model.active_adapters.flatten()) + for k, v in dict(model.named_parameters()).items(): + if name in k: + self.assertTrue(v.requires_grad) + else: + self.assertFalse(v.requires_grad) + + input_data = self.get_input_samples((1, 128), config=model.config) + output_data = model(input_data) + self.assertTrue((1, 128, 768), output_data["last_hidden_state"].shape) + + def test_training(self): + model_name = "bert-base-cased" + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelWithHeads.from_pretrained(model_name) + + # add two adapters: one will be trained and the other should be frozen + + compactor_config = CompactorConfig(phm_dim=16) + adapter_config = PfeifferCompactConfig(compactor=compactor_config) + model.add_adapter("mrpc", config=adapter_config) + model.add_classification_head("mrpc") + + self.assertIn("mrpc", model.config.adapters.adapters) + + # train the mrpc adapter -> should be activated & unfreezed + model.train_adapter("mrpc") + self.assertEqual(set(["mrpc"]), model.active_adapters.flatten()) + + # all weights of the adapter should be activated + for k, v in filter_parameters(model, "mrpc.").items(): + self.assertTrue(v.requires_grad, k) + # weights of the model should be freezed (check on some examples) + for k, v in filter_parameters(model, "encoder.layer.0.attention").items(): + self.assertFalse(v.requires_grad, k) + + state_dict_pre = copy.deepcopy(model.state_dict()) + + # setup dataset + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") + training_args = TrainingArguments( + output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7, no_cuda=True + ) + + # evaluate + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + ) + trainer.train() + + for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()): + if "mrpc" in k1: + self.assertFalse(torch.equal(v1, v2)) + else: + self.assertTrue(torch.equal(v1, v2)) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index c32458bca2..793ad69997 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -1,4 +1,5 @@ import unittest +from tempfile import TemporaryDirectory import torch @@ -10,7 +11,7 @@ GlueDataset, GlueDataTrainingArguments, TrainingArguments, -) + AutoModelWithHeads) from transformers.adapters.composition import Fuse from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import slow @@ -148,7 +149,7 @@ def test_auto_set_save_adapters(self): self.assertTrue(trainer.do_save_adapters) self.assertTrue(trainer.do_save_adapter_fusion) - @slow + 
# @slow def test_training_load_best_model_at_end_full_model(self): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") data_args = GlueDataTrainingArguments( @@ -184,6 +185,75 @@ def test_training_load_best_model_at_end_full_model(self): trainer.train() self.assertIsNotNone(trainer.model.active_adapters) + def test_reloading_prediction_head(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") + + model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model.add_classification_head("dummy", num_labels=2) + + # add the adapters to be fused + model.add_adapter("adapter") + model.add_adapter("additional_adapter") + + # setup fusion + adapter_setup = Fuse("adapter", "additional_adapter") + model.add_adapter_fusion(adapter_setup) + model.train_adapter_fusion(adapter_setup) + model.set_active_adapters(adapter_setup) + self.assertEqual(adapter_setup, model.active_adapters) + self.assertEqual("dummy", model.active_head) + with TemporaryDirectory() as tempdir: + training_args = TrainingArguments( + output_dir=tempdir, + do_train=True, + learning_rate=0.1, + logging_steps=1, + max_steps=1, + save_steps=1, + remove_unused_columns=False, + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + do_save_adapters=True, + do_save_full_model=False, + ) + + trainer.train() + # create second model that should resume the training of the first + model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased") + model_resume.add_classification_head("dummy", num_labels=2) + model_resume.add_adapter("adapter") + model_resume.add_adapter("additional_adapter") + # setup fusion + adapter_setup = Fuse("adapter", "additional_adapter") + model_resume.add_adapter_fusion(adapter_setup) + model_resume.train_adapter_fusion(adapter_setup) + model_resume.set_active_adapters(adapter_setup) + trainer_resume = Trainer( + model=model_resume, + args=TrainingArguments(do_train=True, max_steps=1, output_dir=tempdir), + train_dataset=train_dataset, + do_save_adapters=True, + do_save_full_model=False, + ) + trainer_resume.train(resume_from_checkpoint=True) + + self.assertEqual("dummy", model.active_head) + self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters) + + for ((k1, v1), (k2, v2)) in zip(trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()): + self.assertEqual(k1, k2) + if "adapter" in k1 or "dummy" in k1: + self.assertTrue(torch.equal(v1, v2), k1) + + + if __name__ == "__main__": unittest.main() From 5334c20e3ac953af201c9f9ccdc4e32d489b6595 Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 11:38:54 +0200 Subject: [PATCH 15/29] Style --- src/transformers/adapters/hub_mixin.py | 39 +++++---- src/transformers/adapters/trainer.py | 45 +++++----- tests/test_adapter_phm.py | 116 ------------------------- tests/test_adapter_trainer.py | 11 +-- 4 files changed, 51 insertions(+), 160 deletions(-) delete mode 100644 tests/test_adapter_phm.py diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index 46985393dd..2325b4dcfa 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -10,29 +10,35 @@ DEFAULT_TEXT = "" ADAPTER_CARD_TEMPLATE = """ ---- tags: {tags} --- +--- +tags: +{tags} +--- # Adapter 
`{adapter_repo_name}` for {model_name} -An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} -dataset{head_info}. +An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} dataset{head_info}. -This adapter was created for usage with the -**[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. +This adapter was created for usage with the **[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. ## Usage First, install `adapter-transformers`: -``` pip install -U adapter-transformers ``` _Note: adapter-transformers is a fork of transformers that acts as a -drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ +``` +pip install -U adapter-transformers +``` +_Note: adapter-transformers is a fork of transformers that acts as a drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ Now, the adapter can be loaded and activated like this: -```python from transformers import AutoModelWithHeads +```python +from transformers import AutoModelWithHeads -model = AutoModelWithHeads.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}") -model.active_adapters = adapter_name ``` +model = AutoModelWithHeads.from_pretrained("{model_name}") +adapter_name = model.load_adapter("{adapter_repo_name}") +model.active_adapters = adapter_name +``` ## Architecture & Training @@ -118,8 +124,7 @@ def push_adapter_to_hub( use_auth_token: Union[bool, str] = True, overwrite_adapter_card: bool = False, ): - """ - Upload an adapter to HuggingFace's Model Hub. + """Upload an adapter to HuggingFace's Model Hub. Args: repo_name (str): The name of the repository on the model hub to upload to. @@ -127,11 +132,10 @@ def push_adapter_to_hub( organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. adapterhub_tag (str, optional): Tag of the format `/` for categorization on https://adapterhub.ml/explore/. - See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, - `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. + See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. + If not specified, `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. - If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to - None. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. commit_message (:obj:`str`, `optional`): @@ -141,8 +145,7 @@ def push_adapter_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (:obj:`bool` or :obj:`str`, `optional`): The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token - generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to - True. + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to True. 
overwrite_adapter_card (bool, optional): Overwrite an existing adapter card with a newly generated one. If set to `False`, will only generate an adapter card, if none exists. Defaults to False. diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index b32fa1be64..1a3baff31e 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -6,7 +6,7 @@ from torch import nn from torch.utils.data.dataset import Dataset -from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__, ModelWithHeadsAdaptersMixin +from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__ from transformers.adapters.composition import AdapterCompositionBlock, Fuse from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available @@ -22,6 +22,7 @@ from ..trainer_utils import EvalPrediction, ShardedDDPOption from ..training_args import TrainingArguments + if is_fairscale_available(): dep_version_check("fairscale") from fairscale.optim import OSS @@ -32,21 +33,21 @@ class AdapterTrainer(Trainer): def __init__( - self, - model: Union[PreTrainedModel, nn.Module] = None, - args: TrainingArguments = None, - data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Dataset] = None, - tokenizer: Optional[PreTrainedTokenizerBase] = None, - model_init: Callable[[], PreTrainedModel] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, - adapter_names: Optional[List[List[str]]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Dataset] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Callable[[], PreTrainedModel] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + do_save_full_model: Optional[bool] = None, + do_save_adapters: Optional[bool] = None, + do_save_adapter_fusion: Optional[bool] = None, + adapter_names: Optional[List[List[str]]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): super().__init__( model, @@ -74,9 +75,9 @@ def __init__( if model_freezed and self.model.active_adapters: # Check if training AdapterFusion self.train_adapter_fusion = ( - isinstance(self.model.active_adapters, Fuse) - or isinstance(self.model.active_adapters, AdapterCompositionBlock) - and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) + isinstance(self.model.active_adapters, Fuse) + or isinstance(self.model.active_adapters, AdapterCompositionBlock) + and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) ) # Configure model saving self.do_save_full_model = False @@ -214,7 +215,8 @@ def _load_adapters(self, resume_from_checkpoint): for file_name in os.listdir(resume_from_checkpoint): if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): if "," not in file_name and "adapter_config.json" in os.listdir( - 
os.path.join(resume_from_checkpoint, file_name)): + os.path.join(resume_from_checkpoint, file_name) + ): self.model.load_adapter(os.path.join(os.path.join(resume_from_checkpoint, file_name))) adapter_loaded = True return adapter_loaded @@ -229,7 +231,8 @@ def _load_heads(self, resume_from_checkpoint): for file_name in os.listdir(resume_from_checkpoint): if os.path.isdir(os.path.join(resume_from_checkpoint, file_name)): if "," not in file_name and "head_config.json" in os.listdir( - os.path.join(resume_from_checkpoint, file_name)): + os.path.join(resume_from_checkpoint, file_name) + ): self.model.load_head(os.path.join(resume_from_checkpoint, file_name)) diff --git a/tests/test_adapter_phm.py b/tests/test_adapter_phm.py deleted file mode 100644 index dc15fc53a3..0000000000 --- a/tests/test_adapter_phm.py +++ /dev/null @@ -1,116 +0,0 @@ -import copy -import random -import unittest - -import torch -from torch import Tensor - -from tests.test_adapter_training import filter_parameters -from transformers import ( - AutoModel, - AutoModelWithHeads, - AutoTokenizer, - GlueDataset, - GlueDataTrainingArguments, - Trainer, - TrainingArguments, -) -from transformers.adapters.configuration import CompactorConfig, PfeifferCompactConfig - - -def shared_parameters(param: Tensor): - def getter(): - return param - - return getter - - -class TestSaveLabel(unittest.TestCase): - def get_input_samples(self, shape, vocab_size=5000, config=None): - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(random.randint(0, vocab_size - 1)) - input_ids = torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() - # this is needed e.g. for BART - if config and config.eos_token_id is not None: - input_ids[input_ids == config.eos_token_id] = random.randint(0, config.eos_token_id - 1) - input_ids[:, -1] = config.eos_token_id - - return input_ids - - def test_model(self): - name = "compactor" - compactor_config = CompactorConfig(phm_dim=16) - adapter_config = PfeifferCompactConfig(compactor=compactor_config) - model = AutoModel.from_pretrained("bert-base-uncased") - model.add_adapter(name, config=adapter_config) - self.assertTrue(name in model.config.adapters) - self.assertEqual(adapter_config, model.config.adapters.get(name)) - - model.train_adapter(name) - self.assertEqual(set([name]), model.active_adapters.flatten()) - for k, v in dict(model.named_parameters()).items(): - if name in k: - self.assertTrue(v.requires_grad) - else: - self.assertFalse(v.requires_grad) - - input_data = self.get_input_samples((1, 128), config=model.config) - output_data = model(input_data) - self.assertTrue((1, 128, 768), output_data["last_hidden_state"].shape) - - def test_training(self): - model_name = "bert-base-cased" - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelWithHeads.from_pretrained(model_name) - - # add two adapters: one will be trained and the other should be frozen - - compactor_config = CompactorConfig(phm_dim=16) - adapter_config = PfeifferCompactConfig(compactor=compactor_config) - model.add_adapter("mrpc", config=adapter_config) - model.add_classification_head("mrpc") - - self.assertIn("mrpc", model.config.adapters.adapters) - - # train the mrpc adapter -> should be activated & unfreezed - model.train_adapter("mrpc") - self.assertEqual(set(["mrpc"]), model.active_adapters.flatten()) - - # all weights of the adapter should be 
activated - for k, v in filter_parameters(model, "mrpc.").items(): - self.assertTrue(v.requires_grad, k) - # weights of the model should be freezed (check on some examples) - for k, v in filter_parameters(model, "encoder.layer.0.attention").items(): - self.assertFalse(v.requires_grad, k) - - state_dict_pre = copy.deepcopy(model.state_dict()) - - # setup dataset - data_args = GlueDataTrainingArguments( - task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True - ) - train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") - training_args = TrainingArguments( - output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7, no_cuda=True - ) - - # evaluate - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset, - ) - trainer.train() - - for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()): - if "mrpc" in k1: - self.assertFalse(torch.equal(v1, v2)) - else: - self.assertTrue(torch.equal(v1, v2)) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 793ad69997..1ecf3b59d9 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -5,13 +5,14 @@ from transformers import ( AutoModelForSequenceClassification, + AutoModelWithHeads, AutoTokenizer, BertConfig, BertForSequenceClassification, GlueDataset, GlueDataTrainingArguments, TrainingArguments, - AutoModelWithHeads) +) from transformers.adapters.composition import Fuse from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import slow @@ -149,7 +150,7 @@ def test_auto_set_save_adapters(self): self.assertTrue(trainer.do_save_adapters) self.assertTrue(trainer.do_save_adapter_fusion) - # @slow + @slow def test_training_load_best_model_at_end_full_model(self): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") data_args = GlueDataTrainingArguments( @@ -247,13 +248,13 @@ def test_reloading_prediction_head(self): self.assertEqual("dummy", model.active_head) self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters) - for ((k1, v1), (k2, v2)) in zip(trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()): + for ((k1, v1), (k2, v2)) in zip( + trainer.model.state_dict().items(), trainer_resume.model.state_dict().items() + ): self.assertEqual(k1, k2) if "adapter" in k1 or "dummy" in k1: self.assertTrue(torch.equal(v1, v2), k1) - - if __name__ == "__main__": unittest.main() From 43123305d0bcece22b272caef546e24e804a425b Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 11:41:17 +0200 Subject: [PATCH 16/29] Style --- src/transformers/adapters/models/bert.py | 7 ++- src/transformers/data/datasets/glue.py | 62 ++++++++++++------------ 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/transformers/adapters/models/bert.py b/src/transformers/adapters/models/bert.py index 0ac40a7860..ecafc0e81b 100644 --- a/src/transformers/adapters/models/bert.py +++ b/src/transformers/adapters/models/bert.py @@ -264,10 +264,9 @@ def add_qa_head( self.add_prediction_head(head, overwrite_ok) def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): - """ - Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described - in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš - & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). 
+ """Adds a biaffine dependency parsing head on top of the model. + The parsing head uses the architecture described in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? + An Empirical Investigation" (Glavaš & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). Args: head_name (str): The name of the head. diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 9909807d17..1ba786c384 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -118,38 +118,38 @@ def __init__( # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. lock_path = cached_features_file + ".lock" - # with FileLock(lock_path): - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - start = time.time() - self.features = torch.load(cached_features_file) - logger.info( - f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start - ) - else: - logger.info(f"Creating features from dataset file at {args.data_dir}") - - if mode == Split.dev: - examples = self.processor.get_dev_examples(args.data_dir) - elif mode == Split.test: - examples = self.processor.get_test_examples(args.data_dir) + with FileLock(lock_path): + + if os.path.exists(cached_features_file) and not args.overwrite_cache: + start = time.time() + self.features = torch.load(cached_features_file) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) else: - examples = self.processor.get_train_examples(args.data_dir) - if limit_length is not None: - examples = examples[:limit_length] - self.features = glue_convert_examples_to_features( - examples, - tokenizer, - max_length=args.max_seq_length, - label_list=label_list, - output_mode=self.output_mode, - ) - start = time.time() - torch.save(self.features, cached_features_file) - # ^ This seems to take a lot of time so I want to investigate why and how we can improve. - logger.info( - f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" - ) + logger.info(f"Creating features from dataset file at {args.data_dir}") + + if mode == Split.dev: + examples = self.processor.get_dev_examples(args.data_dir) + elif mode == Split.test: + examples = self.processor.get_test_examples(args.data_dir) + else: + examples = self.processor.get_train_examples(args.data_dir) + if limit_length is not None: + examples = examples[:limit_length] + self.features = glue_convert_examples_to_features( + examples, + tokenizer, + max_length=args.max_seq_length, + label_list=label_list, + output_mode=self.output_mode, + ) + start = time.time() + torch.save(self.features, cached_features_file) + # ^ This seems to take a lot of time so I want to investigate why and how we can improve. 
+ logger.info( + f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]" + ) def __len__(self): return len(self.features) From 968da305949bbbffc76340fc8531e6ad95653400 Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 12:08:21 +0200 Subject: [PATCH 17/29] Fix loading --- src/transformers/adapters/trainer.py | 47 ---------------------------- tests/test_adapter_trainer.py | 7 +++++ 2 files changed, 7 insertions(+), 47 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 1a3baff31e..e43fbd58fc 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -285,53 +285,6 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra fusion_reg_loss = model.base_model.get_fusion_regularization_loss() fusion_reg_loss.backward() - def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - if args.resume_from_checkpoint is not None: - if os.path.isfile(os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME)): - logger.info(f"Loading model from {args.resume_from_checkpoint}).") - elif self.do_save_full_model: - raise ValueError(f"Can't find a valid checkpoint at {args.resume_from_checkpoint}") - - if os.path.isfile(os.path.join(args.resume_from_checkpoint, CONFIG_NAME)): - config = PretrainedConfig.from_json_file(os.path.join(args.resume_from_checkpoint, CONFIG_NAME)) - checkpoint_version = config.transformers_version - if checkpoint_version is not None and checkpoint_version != __version__: - logger.warn( - f"You are resuming training from a checkpoint trained with {checkpoint_version} of " - f"Transformers but your current version is {__version__}. This is not recommended and could " - "yield to errors or unwanted behaviors." - ) - - if args.deepspeed: - # will be resumed in deepspeed_init - pass - else: - if self.do_save_full_model: - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load( - os.path.join(args.resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu" - ) - # If the model is on the GPU, it still works! 
- self._load_state_dict_in_model(state_dict) - if self.do_save_adapters: - adapter_loaded = False - if os.path.isdir(args.resume_from_checkpoint): - for file_name in os.listdir(args.resume_from_checkpoint): - if os.path.isdir(os.path.join(args.resume_from_checkpoint, file_name)): - if "," in file_name: - self.model.load_adapter_fusion( - os.path.join(args.resume_from_checkpoint, file_name) - ) - adapter_loaded = True - else: - self.model.load_adapter( - os.path.join(os.path.join(args.resume_from_checkpoint, file_name)) - ) - adapter_loaded = True - - if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(args.resume_from_checkpoint)) - class AdapterSeq2SeqTrainer(AdapterTrainer, Seq2SeqTrainer): pass diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 1ecf3b59d9..5740c2130e 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -194,6 +194,8 @@ def test_reloading_prediction_head(self): train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + + model.add_classification_head("adapter", num_labels=3) model.add_classification_head("dummy", num_labels=2) # add the adapters to be fused @@ -228,6 +230,8 @@ def test_reloading_prediction_head(self): trainer.train() # create second model that should resume the training of the first model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased") + + model_resume.add_classification_head("adapter", num_labels=3) model_resume.add_classification_head("dummy", num_labels=2) model_resume.add_adapter("adapter") model_resume.add_adapter("additional_adapter") @@ -255,6 +259,9 @@ def test_reloading_prediction_head(self): if "adapter" in k1 or "dummy" in k1: self.assertTrue(torch.equal(v1, v2), k1) + def test_XY(self): + pass + if __name__ == "__main__": unittest.main() From 7f31ddebcf451359acb068cdc15701a8ef76a65e Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 6 Sep 2021 12:58:51 +0200 Subject: [PATCH 18/29] Added tset --- tests/test_adapter_trainer.py | 58 +++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 5740c2130e..253594b318 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -1,3 +1,4 @@ +import os import unittest from tempfile import TemporaryDirectory @@ -13,7 +14,7 @@ GlueDataTrainingArguments, TrainingArguments, ) -from transformers.adapters.composition import Fuse +from transformers.adapters.composition import Fuse, Stack from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import slow @@ -259,8 +260,59 @@ def test_reloading_prediction_head(self): if "adapter" in k1 or "dummy" in k1: self.assertTrue(torch.equal(v1, v2), k1) - def test_XY(self): - pass + def test_general(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") + + model = AutoModelWithHeads.from_pretrained("bert-base-uncased") + + model.add_classification_head("task", num_labels=3) + + # add the adapters to be fused + model.add_adapter("task") + model.add_adapter("additional_adapter") + + model.train_adapter("task") + self.assertEqual("task", model.active_head) + self.assertEqual(Stack("task"), 
model.active_adapters) + with TemporaryDirectory() as tempdir: + training_args = TrainingArguments( + output_dir=tempdir, + do_train=True, + learning_rate=0.1, + logging_steps=1, + max_steps=1, + save_steps=1, + remove_unused_columns=False, + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + do_save_adapters=True, + do_save_full_model=False, + ) + + trainer.train() + + self.assertFalse(trainer.do_save_full_model) + self.assertTrue(trainer.do_save_adapters) + self.assertFalse(trainer.do_save_adapter_fusion) + + # Check that adapters are actually saved but the full model is not + files_dir_checkpoint = [file_or_dir for file_or_dir in os.listdir(os.path.join(tempdir, "checkpoint-1"))] + self.assertTrue("task" in files_dir_checkpoint) + self.assertTrue("additional_adapter" in files_dir_checkpoint) + # Check that full model weights are not stored + self.assertFalse("pytorch_model.bin" in files_dir_checkpoint) + + # this should always be false in the adapter trainer + self.assertFalse(trainer.args.remove_unused_columns) + self.assertEqual("task", model.active_head) + self.assertEqual(Stack("task"), model.active_adapters) if __name__ == "__main__": From f3bbb52e2229da04894e13c7d73c3e00cd63d6cb Mon Sep 17 00:00:00 2001 From: hSterz Date: Fri, 10 Sep 2021 12:21:47 +0200 Subject: [PATCH 19/29] Additional Testcase --- src/transformers/adapters/trainer.py | 6 ++--- tests/test_adapter_trainer.py | 35 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index e43fbd58fc..baf1543562 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -260,7 +260,7 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr ) if self.trainer.do_save_adapters: logger.info( - f"Loading best adapter(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + f"Loading best adapter(s) from {state.best_model_checkpoint} (score: {state.best_metric})." ) # attempt to re-load all adapters from checkpoint for adapter in model.config.adapters.adapters: @@ -269,12 +269,12 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr model.load_adapter(adapter_dir) if self.trainer.do_save_adapter_fusion: logger.info( - f"Loading best adapter fusion(s) from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + f"Loading best adapter fusion(s) from {state.best_model_checkpoint} (score: {state.best_metric})." 
) # attempt to re-load all adapter fusions from checkpoint fusion_models = getattr(self.model.config, "adapter_fusion_models", []) for fusion in fusion_models: - fusion_dir = os.path.join(self.state.best_model_checkpoint, fusion) + fusion_dir = os.path.join(state.best_model_checkpoint, fusion) if os.path.exists(fusion_dir): self.model.load_adapter_fusion(fusion_dir) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 253594b318..b676e6dd2f 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -16,6 +16,7 @@ ) from transformers.adapters.composition import Fuse, Stack from transformers.adapters.trainer import AdapterTrainer as Trainer +from transformers.file_utils import logger from transformers.testing_utils import slow @@ -187,6 +188,40 @@ def test_training_load_best_model_at_end_full_model(self): trainer.train() self.assertIsNotNone(trainer.model.active_adapters) + def test_training_load_best_model_at_end_adapter(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train") + eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") + + model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") + model.add_adapter("adapter") + model.train_adapter("adapter") + + training_args = TrainingArguments( + output_dir="./examples", + do_train=True, + learning_rate=0.001, + max_steps=1, + save_steps=1, + remove_unused_columns=False, + load_best_model_at_end=True, + evaluation_strategy="epoch", + num_train_epochs=2, + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset + ) + with self.assertLogs(logger) as cm: + trainer.train() + self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output)) + self.assertEqual(Stack("adapter"), trainer.model.active_adapters) + def test_reloading_prediction_head(self): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") data_args = GlueDataTrainingArguments( From 9a0587dfb1b9817245a9d82fc65386bd67492927 Mon Sep 17 00:00:00 2001 From: hSterz Date: Fri, 10 Sep 2021 12:57:46 +0200 Subject: [PATCH 20/29] Change Adaptertrainer to only train adapters --- src/transformers/adapters/trainer.py | 79 +++++++--------------------- tests/test_adapter_trainer.py | 49 +++++------------ 2 files changed, 34 insertions(+), 94 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index baf1543562..7b81d594d1 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -79,22 +79,8 @@ def __init__( or isinstance(self.model.active_adapters, AdapterCompositionBlock) and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) ) - # Configure model saving - self.do_save_full_model = False - self.do_save_adapters = True - self.do_save_adapter_fusion = self.train_adapter_fusion else: - self.train_adapter_fusion = False - self.do_save_full_model = True - self.do_save_adapters = False - self.do_save_adapter_fusion = False - # override with explicit setting - if do_save_full_model is not None: - self.do_save_full_model = do_save_full_model - if do_save_adapters is not None: - self.do_save_adapters = do_save_adapters - if do_save_adapter_fusion is not None: - self.do_save_adapter_fusion = 
do_save_adapter_fusion + raise ValueError("Expected a freezed model with adapters to train. If you want tu fully finetune the model use the Trainer class") def create_optimizer(self): """ @@ -159,12 +145,9 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): state_dict = self.model.state_dict() torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - if self.do_save_adapters: - self.model.save_all_adapters(output_dir) - if self.do_save_adapter_fusion: + self.model.save_all_adapters(output_dir) + if self.train_adapter_fusion: self.model.save_all_adapter_fusions(output_dir) - if self.do_save_full_model: - self.model.save_pretrained(output_dir, state_dict=state_dict) if hasattr(self.model, "heads"): self.model.save_all_heads(output_dir) if self.tokenizer is not None: @@ -177,8 +160,6 @@ def _load(self, resume_from_checkpoint): args = self.args if os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): logger.info(f"Loading model from {resume_from_checkpoint}).") - elif self.do_save_full_model: - raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") if os.path.isfile(os.path.join(resume_from_checkpoint, CONFIG_NAME)): config = PretrainedConfig.from_json_file(os.path.join(resume_from_checkpoint, CONFIG_NAME)) @@ -194,21 +175,15 @@ def _load(self, resume_from_checkpoint): # will be resumed in deepspeed_init pass else: - if self.do_save_full_model: - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(os.path.join(resume_from_checkpoint, WEIGHTS_NAME), map_location="cpu") - # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) - if self.do_save_adapters: - if os.path.isdir(resume_from_checkpoint): - adapter_loaded = self._load_adapters(resume_from_checkpoint) - self._load_adapter_fusions(resume_from_checkpoint) - # Save all heads for a model with heads - if hasattr(self.model, "heads"): - self._load_heads(resume_from_checkpoint) + if os.path.isdir(resume_from_checkpoint): + adapter_loaded = self._load_adapters(resume_from_checkpoint) + self._load_adapter_fusions(resume_from_checkpoint) + # Save all heads for a model with heads + if hasattr(self.model, "heads"): + self._load_heads(resume_from_checkpoint) - if not adapter_loaded: - raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) + if not adapter_loaded: + raise Exception("Can't find a valid checkpoint at {}".format(resume_from_checkpoint)) def _load_adapters(self, resume_from_checkpoint): adapter_loaded = False @@ -244,30 +219,16 @@ def __init__(self, trainer): def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): model = kwargs.pop("model") if args.load_best_model_at_end and state.best_model_checkpoint is not None: - if self.trainer.do_save_full_model: - logger.info(f"Loading best model from {state.best_model_checkpoint} (score: {state.best_metric}).") - best_model_path = os.path.join(state.best_model_checkpoint, WEIGHTS_NAME) - if os.path.exists(best_model_path): - # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(best_model_path, map_location="cpu") - # If the model is on the GPU, it still works! - self.trainer._load_state_dict_in_model(state_dict) - else: - logger.warn( - f"Could not locate the best model at {best_model_path}, if you are running a distributed training " - "on multiple nodes, you should activate `--save_on_each_node`." 
- ) - if self.trainer.do_save_adapters: - logger.info( - f"Loading best adapter(s) from {state.best_model_checkpoint} (score: {state.best_metric})." - ) - # attempt to re-load all adapters from checkpoint - for adapter in model.config.adapters.adapters: - adapter_dir = os.path.join(state.best_model_checkpoint, adapter) - if os.path.exists(adapter_dir): - model.load_adapter(adapter_dir) - if self.trainer.do_save_adapter_fusion: + logger.info( + f"Loading best adapter(s) from {state.best_model_checkpoint} (score: {state.best_metric})." + ) + # attempt to re-load all adapters from checkpoint + for adapter in model.config.adapters.adapters: + adapter_dir = os.path.join(state.best_model_checkpoint, adapter) + if os.path.exists(adapter_dir): + model.load_adapter(adapter_dir) + if self.trainer.train_adapter_fusion: logger.info( f"Loading best adapter fusion(s) from {state.best_model_checkpoint} (score: {state.best_metric})." ) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index b676e6dd2f..0d567489fc 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -13,9 +13,10 @@ GlueDataset, GlueDataTrainingArguments, TrainingArguments, + Trainer, ) from transformers.adapters.composition import Fuse, Stack -from transformers.adapters.trainer import AdapterTrainer as Trainer +from transformers.adapters.trainer import AdapterTrainer from transformers.file_utils import logger from transformers.testing_utils import slow @@ -33,6 +34,7 @@ def test_resume_training(self): model.add_adapter("adapter") model.add_adapter("additional_adapter") model.set_active_adapters("adapter") + model.train_adapter("adapter") training_args = TrainingArguments( output_dir="./examples", @@ -43,12 +45,10 @@ def test_resume_training(self): save_steps=1, remove_unused_columns=False, ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, ) trainer.train() @@ -57,12 +57,11 @@ def test_resume_training(self): model_resume.add_adapter("adapter") model_resume.add_adapter("additional_adapter") model_resume.set_active_adapters("adapter") - trainer_resume = Trainer( + model_resume.train_adapter("adapter") + trainer_resume = AdapterTrainer( model=model_resume, args=TrainingArguments(do_train=True, max_steps=1, output_dir="./examples"), train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, ) trainer_resume.train(resume_from_checkpoint=True) @@ -85,6 +84,7 @@ def test_resume_training_with_fusion(self): model.add_adapter("additional_adapter") model.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model.set_active_adapters(Fuse("adapter", "additional_adapter")) + model.train_adapter(Fuse("adapter", "additional_adapter")) training_args = TrainingArguments( output_dir="./examples", @@ -95,13 +95,10 @@ def test_resume_training_with_fusion(self): save_steps=1, remove_unused_columns=False, ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, - do_save_adapter_fusion=True, ) trainer.train() @@ -110,12 +107,10 @@ def test_resume_training_with_fusion(self): model_resume.add_adapter("additional_adapter") model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model_resume.set_active_adapters(Fuse("adapter", "additional_adapter")) - trainer_resume = Trainer( + trainer_resume = AdapterTrainer( model=model_resume, 
args=TrainingArguments(do_train=True, max_steps=1, output_dir="./examples"), train_dataset=train_dataset, - do_save_full_model=False, - do_save_adapters=True, ) trainer_resume.train(resume_from_checkpoint=True) @@ -143,15 +138,11 @@ def test_auto_set_save_adapters(self): training_args = TrainingArguments( output_dir="./examples", ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, ) - self.assertFalse(trainer.do_save_full_model) - self.assertTrue(trainer.do_save_adapters) - self.assertTrue(trainer.do_save_adapter_fusion) - @slow def test_training_load_best_model_at_end_full_model(self): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") @@ -181,8 +172,6 @@ def test_training_load_best_model_at_end_full_model(self): args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, - do_save_adapters=False, - do_save_full_model=True, ) trainer.train() @@ -211,7 +200,7 @@ def test_training_load_best_model_at_end_adapter(self): evaluation_strategy="epoch", num_train_epochs=2, ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, @@ -255,12 +244,10 @@ def test_reloading_prediction_head(self): save_steps=1, remove_unused_columns=False, ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, ) trainer.train() @@ -276,12 +263,10 @@ def test_reloading_prediction_head(self): model_resume.add_adapter_fusion(adapter_setup) model_resume.train_adapter_fusion(adapter_setup) model_resume.set_active_adapters(adapter_setup) - trainer_resume = Trainer( + trainer_resume = AdapterTrainer( model=model_resume, args=TrainingArguments(do_train=True, max_steps=1, output_dir=tempdir), train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, ) trainer_resume.train(resume_from_checkpoint=True) @@ -323,20 +308,14 @@ def test_general(self): save_steps=1, remove_unused_columns=False, ) - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, - do_save_adapters=True, - do_save_full_model=False, ) trainer.train() - self.assertFalse(trainer.do_save_full_model) - self.assertTrue(trainer.do_save_adapters) - self.assertFalse(trainer.do_save_adapter_fusion) - # Check that adapters are actually saved but the full model is not files_dir_checkpoint = [file_or_dir for file_or_dir in os.listdir(os.path.join(tempdir, "checkpoint-1"))] self.assertTrue("task" in files_dir_checkpoint) From cb930fa276427c46580bfb64563abc6821a8f991 Mon Sep 17 00:00:00 2001 From: hSterz Date: Fri, 10 Sep 2021 14:49:46 +0200 Subject: [PATCH 21/29] Fix test --- tests/test_adapter_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 0d567489fc..22a493b1ff 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -84,7 +84,7 @@ def test_resume_training_with_fusion(self): model.add_adapter("additional_adapter") model.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model.set_active_adapters(Fuse("adapter", "additional_adapter")) - model.train_adapter(Fuse("adapter", "additional_adapter")) + model.train_fusion(Fuse("adapter", "additional_adapter")) training_args = TrainingArguments( output_dir="./examples", @@ -107,6 +107,7 @@ def test_resume_training_with_fusion(self): model_resume.add_adapter("additional_adapter") 
model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter")) model_resume.set_active_adapters(Fuse("adapter", "additional_adapter")) + model_resume.train_fusion(Fuse("adapter", "additional_adapter")) trainer_resume = AdapterTrainer( model=model_resume, args=TrainingArguments(do_train=True, max_steps=1, output_dir="./examples"), @@ -142,6 +143,7 @@ def test_auto_set_save_adapters(self): model=model, args=training_args, ) + self.assertTrue(trainer.train_adapter_fusion) @slow def test_training_load_best_model_at_end_full_model(self): From 345225deb09dc57d88996bf442ff36070e534193 Mon Sep 17 00:00:00 2001 From: hSterz Date: Fri, 10 Sep 2021 15:04:15 +0200 Subject: [PATCH 22/29] Quality --- src/transformers/adapters/trainer.py | 8 ++++---- tests/test_adapter_trainer.py | 7 ++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 7b81d594d1..2d0f699fbc 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -80,7 +80,9 @@ def __init__( and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) ) else: - raise ValueError("Expected a freezed model with adapters to train. If you want tu fully finetune the model use the Trainer class") + raise ValueError( + "Expected a freezed model with adapters to train. If you want tu fully finetune the model use the Trainer class" + ) def create_optimizer(self): """ @@ -220,9 +222,7 @@ def on_train_end(self, args: TrainingArguments, state: TrainerState, control: Tr model = kwargs.pop("model") if args.load_best_model_at_end and state.best_model_checkpoint is not None: - logger.info( - f"Loading best adapter(s) from {state.best_model_checkpoint} (score: {state.best_metric})." 
- ) + logger.info(f"Loading best adapter(s) from {state.best_model_checkpoint} (score: {state.best_metric}).") # attempt to re-load all adapters from checkpoint for adapter in model.config.adapters.adapters: adapter_dir = os.path.join(state.best_model_checkpoint, adapter) diff --git a/tests/test_adapter_trainer.py b/tests/test_adapter_trainer.py index 22a493b1ff..aa1df55501 100644 --- a/tests/test_adapter_trainer.py +++ b/tests/test_adapter_trainer.py @@ -12,8 +12,8 @@ BertForSequenceClassification, GlueDataset, GlueDataTrainingArguments, - TrainingArguments, Trainer, + TrainingArguments, ) from transformers.adapters.composition import Fuse, Stack from transformers.adapters.trainer import AdapterTrainer @@ -203,10 +203,7 @@ def test_training_load_best_model_at_end_adapter(self): num_train_epochs=2, ) trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset + model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset ) with self.assertLogs(logger) as cm: trainer.train() From c177e2fa0ab9611228802ccd5e8a1a801fb2cc8a Mon Sep 17 00:00:00 2001 From: hSterz Date: Mon, 13 Sep 2021 09:58:58 +0200 Subject: [PATCH 23/29] Fix test --- tests/extended/test_trainer_ext.py | 1 + tests/test_adapter_hub.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index e7e7a53fe5..99a93ac126 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -253,6 +253,7 @@ def run_trainer( --validation_file {data_dir}/val.json --test_file {data_dir}/test.json --output_dir {output_dir} + --train_adapter --overwrite_output_dir --max_train_samples 8 --max_source_length {max_len} diff --git a/tests/test_adapter_hub.py b/tests/test_adapter_hub.py index 69da7a285d..a43e6e640f 100644 --- a/tests/test_adapter_hub.py +++ b/tests/test_adapter_hub.py @@ -58,6 +58,7 @@ def test_load_task_adapter_from_hub(self): adapter_name = model.load_adapter( "sts/mrpc@ukp", config=config, version="1", loading_info=loading_info ) + model.train_adapter(adapter_name) self.assertEqual(0, len(loading_info["missing_keys"])) self.assertEqual(0, len(loading_info["unexpected_keys"])) From cf10f1fbbd195821e9807eafa83b05981091e3c8 Mon Sep 17 00:00:00 2001 From: hSterz Date: Tue, 14 Sep 2021 14:16:43 +0200 Subject: [PATCH 24/29] Adapt examples to new AdapterTrainer --- examples/adapterfusion/run_fusion_glue.py | 7 +--- examples/dependency-parsing/run_udp.py | 7 ++-- examples/dependency-parsing/utils_udp.py | 13 +++---- examples/language-modeling/run_clm.py | 6 +-- examples/language-modeling/run_mlm.py | 6 +-- examples/multiple-choice/run_swag.py | 6 +-- examples/question-answering/run_qa.py | 7 ++-- examples/question-answering/trainer_qa.py | 6 ++- examples/summarization/run_summarization.py | 6 +-- examples/text-classification/run_glue.py | 6 +-- examples/text-classification/run_glue_alt.py | 6 +-- examples/token-classification/run_ner.py | 6 +-- examples/translation/run_translation.py | 8 ++-- src/transformers/__init__.py | 5 +++ src/transformers/adapters/hub_mixin.py | 39 +++++++++---------- src/transformers/adapters/models/bert.py | 13 ++++--- src/transformers/adapters/trainer.py | 7 +--- src/transformers/models/gpt2/modeling_gpt2.py | 8 ++-- 18 files changed, 80 insertions(+), 82 deletions(-) diff --git a/examples/adapterfusion/run_fusion_glue.py b/examples/adapterfusion/run_fusion_glue.py index a6a330a8cf..f32342010e 100644 --- 
a/examples/adapterfusion/run_fusion_glue.py +++ b/examples/adapterfusion/run_fusion_glue.py @@ -28,6 +28,7 @@ from transformers import ( AdapterArguments, + AdapterTrainer, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, @@ -37,7 +38,6 @@ from transformers import GlueDataTrainingArguments as DataTrainingArguments from transformers import ( HfArgumentParser, - Trainer, TrainingArguments, glue_compute_metrics, glue_output_modes, @@ -203,15 +203,12 @@ def compute_metrics(p: EvalPrediction) -> Dict: preds = np.squeeze(p.predictions) return glue_compute_metrics(data_args.task_name, preds, p.label_ids) - # Initialize our Trainer - trainer = Trainer( + trainer = AdapterTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, - do_save_full_model=False, - do_save_adapter_fusion=True, ) # Training diff --git a/examples/dependency-parsing/run_udp.py b/examples/dependency-parsing/run_udp.py index 1112ab5d1f..85a11d9079 100644 --- a/examples/dependency-parsing/run_udp.py +++ b/examples/dependency-parsing/run_udp.py @@ -22,7 +22,7 @@ MultiLingAdapterArguments, set_seed, ) -from utils_udp import UD_HEAD_LABELS, DependencyParsingTrainer, UDTrainingArguments +from utils_udp import UD_HEAD_LABELS, DependencyParsingAdapterTrainer, DependencyParsingTrainer, UDTrainingArguments logger = logging.getLogger(__name__) @@ -245,13 +245,12 @@ def main(): # Initialize our Trainer # HACK: Set this attribute to False to prevent label columns from being deleted training_args.remove_unused_columns = False - trainer = DependencyParsingTrainer( + trainer_class = DependencyParsingAdapterTrainer if adapter_args.train_adapter else DependencyParsingTrainer + trainer = trainer_class( model=model, args=training_args, train_dataset=dataset["train"], eval_dataset=dataset["validation"], - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/dependency-parsing/utils_udp.py b/examples/dependency-parsing/utils_udp.py index f333cfcf65..65161a6b06 100644 --- a/examples/dependency-parsing/utils_udp.py +++ b/examples/dependency-parsing/utils_udp.py @@ -16,6 +16,7 @@ from tqdm import tqdm from transformers import ( + AdapterTrainer, DataCollator, EvalPrediction, PreTrainedModel, @@ -186,10 +187,6 @@ def __init__( model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: bool = True, - do_save_adapters: bool = False, - do_save_adapter_fusion: bool = False, - adapter_names: Optional[List[List[str]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), **kwargs, ): @@ -203,10 +200,6 @@ def __init__( model_init, compute_metrics, callbacks, - do_save_full_model, - do_save_adapters, - do_save_adapter_fusion, - adapter_names, optimizers, **kwargs, ) @@ -362,3 +355,7 @@ def _prediction_loop( # Add predictions_rels to output, even though we are only interested in the metrics return PredictionOutput(predictions=predictions_rels, label_ids=None, metrics=results) + + +class DependencyParsingAdapterTrainer(AdapterTrainer, DependencyParsingTrainer): + pass diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index e61f90d2b0..607934cc8e 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -35,6 +35,7 @@ from transformers import ( 
CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, AutoConfig, AutoModelForCausalLM, AutoTokenizer, @@ -480,7 +481,8 @@ def group_texts(examples): eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -488,8 +490,6 @@ def group_texts(examples): tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. data_collator=default_data_collator, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 5e6739ca82..b10e2564d8 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -35,6 +35,7 @@ from transformers import ( CONFIG_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, + AdapterTrainer, AutoConfig, AutoModelForMaskedLM, AutoTokenizer, @@ -512,15 +513,14 @@ def group_texts(examples): ) # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index b028c16188..239b8324f5 100644 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -32,6 +32,7 @@ import transformers.adapters.composition as ac from transformers import ( AdapterConfig, + AdapterTrainer, AutoConfig, AutoModelForMultipleChoice, AutoTokenizer, @@ -437,7 +438,8 @@ def compute_metrics(eval_predictions): return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()} # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -445,8 +447,6 @@ def compute_metrics(eval_predictions): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 78de516229..2c5f8f4ec4 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -27,7 +27,7 @@ from datasets import load_dataset, load_metric import transformers -from trainer_qa import QuestionAnsweringTrainer +from trainer_qa import QuestionAnsweringAdapterTrainer, QuestionAnsweringTrainer from transformers import ( AdapterConfig, AutoConfig, @@ -599,7 +599,8 @@ def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer - trainer = QuestionAnsweringTrainer( + trainer_class = QuestionAnsweringAdapterTrainer if adapter_args.train_adapter else QuestionAnsweringTrainer + trainer = trainer_class( model=model, args=training_args, 
train_dataset=train_dataset if training_args.do_train else None, @@ -609,8 +610,6 @@ def compute_metrics(p: EvalPrediction): data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/question-answering/trainer_qa.py b/examples/question-answering/trainer_qa.py index 7f98eba236..31b208c8dd 100644 --- a/examples/question-answering/trainer_qa.py +++ b/examples/question-answering/trainer_qa.py @@ -16,7 +16,7 @@ A subclass of `Trainer` specific to Question-Answering tasks """ -from transformers import Trainer, is_torch_tpu_available +from transformers import AdapterTrainer, Trainer, is_torch_tpu_available from transformers.trainer_utils import PredictionOutput @@ -103,3 +103,7 @@ def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_ke metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) + + +class QuestionAnsweringAdapterTrainer(QuestionAnsweringTrainer, AdapterTrainer): + pass diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 19bd3ba2dc..e878fc13e9 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -40,6 +40,7 @@ EarlyStoppingCallback, HfArgumentParser, MultiLingAdapterArguments, + Seq2SeqAdapterTrainer, Seq2SeqTrainer, Seq2SeqTrainingArguments, set_seed, @@ -585,7 +586,8 @@ def compute_metrics(eval_preds): training_args.load_best_model_at_end = True # Initialize our Trainer - trainer = Seq2SeqTrainer( + trainer_class = Seq2SeqAdapterTrainer if adapter_args.train_adapter else Seq2SeqTrainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -593,8 +595,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) if data_args.patience and data_args.patience > 0: callback = EarlyStoppingCallback(early_stopping_patience=data_args.patience) diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 960ba4ec82..c46c5f98a5 100644 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -30,6 +30,7 @@ import transformers.adapters.composition as ac from transformers import ( AdapterConfig, + AdapterTrainer, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, @@ -515,7 +516,8 @@ def compute_metrics(p: EvalPrediction): data_collator = None # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -523,8 +525,6 @@ def compute_metrics(p: EvalPrediction): compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/text-classification/run_glue_alt.py b/examples/text-classification/run_glue_alt.py index e46e6a252c..06f93157a0 100644 --- 
a/examples/text-classification/run_glue_alt.py +++ b/examples/text-classification/run_glue_alt.py @@ -33,6 +33,7 @@ import transformers.adapters.composition as ac from transformers import ( AdapterConfig, + AdapterTrainer, AutoConfig, AutoModelWithHeads, AutoTokenizer, @@ -402,7 +403,8 @@ def compute_metrics(p: EvalPrediction): return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset, @@ -411,8 +413,6 @@ def compute_metrics(p: EvalPrediction): tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. data_collator=default_data_collator if data_args.pad_to_max_length else None, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py index 33bdb2b0e3..adea11dc75 100644 --- a/examples/token-classification/run_ner.py +++ b/examples/token-classification/run_ner.py @@ -32,6 +32,7 @@ import transformers.adapters.composition as ac from transformers import ( AdapterConfig, + AdapterTrainer, AutoConfig, AutoModelForTokenClassification, AutoTokenizer, @@ -518,7 +519,8 @@ def compute_metrics(p): } # Initialize our Trainer - trainer = Trainer( + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -526,8 +528,6 @@ def compute_metrics(p): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) # Training diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 181fa8bed1..f059cdbae5 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -44,11 +44,12 @@ MBartTokenizer, MBartTokenizerFast, MultiLingAdapterArguments, + Seq2SeqAdapterTrainer, Seq2SeqTrainingArguments, default_data_collator, set_seed, ) -from transformers.adapters.trainer import AdapterSeq2SeqTrainer as Seq2SeqTrainer +from transformers.adapters.trainer import Seq2SeqAdapterTrainer as Seq2SeqTrainer from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -581,7 +582,8 @@ def compute_metrics(eval_preds): training_args.load_best_model_at_end = True # Initialize our Trainer - trainer = Seq2SeqTrainer( + trainer_class = Seq2SeqAdapterTrainer if +adapter_args.train_adapter else Seq2SeqTrainer + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -589,8 +591,6 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, - do_save_full_model=not adapter_args.train_adapter, - do_save_adapters=adapter_args.train_adapter, ) if data_args.patience and data_args.patience > 0: callback = EarlyStoppingCallback(early_stopping_patience=data_args.patience) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a2fd8348b3..0fdf623c78 100755 --- 
a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1188,6 +1188,10 @@ "ModelConfigAdaptersMixin", "ModelWithHeadsAdaptersMixin", ] + _import_structure["adapters.trainer"] = [ + "AdapterTrainer", + "Seq2SeqAdapterTrainer", + ] _import_structure["adapters.training"] = [ "AdapterArguments", "MultiLingAdapterArguments", @@ -2688,6 +2692,7 @@ ModelConfigAdaptersMixin, ModelWithHeadsAdaptersMixin, ) + from .adapters.trainer import AdapterTrainer, Seq2SeqAdapterTrainer from .adapters.training import AdapterArguments, MultiLingAdapterArguments from .adapters.utils import ( ADAPTER_CACHE, diff --git a/src/transformers/adapters/hub_mixin.py b/src/transformers/adapters/hub_mixin.py index 2325b4dcfa..46985393dd 100644 --- a/src/transformers/adapters/hub_mixin.py +++ b/src/transformers/adapters/hub_mixin.py @@ -10,35 +10,29 @@ DEFAULT_TEXT = "" ADAPTER_CARD_TEMPLATE = """ ---- -tags: -{tags} ---- +--- tags: {tags} --- # Adapter `{adapter_repo_name}` for {model_name} -An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} dataset{head_info}. +An [adapter](https://adapterhub.ml) for the {model_name} model that was trained on the {dataset_name} +dataset{head_info}. -This adapter was created for usage with the **[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. +This adapter was created for usage with the +**[adapter-transformers](https://github.com/Adapter-Hub/adapter-transformers)** library. ## Usage First, install `adapter-transformers`: -``` -pip install -U adapter-transformers -``` -_Note: adapter-transformers is a fork of transformers that acts as a drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ +``` pip install -U adapter-transformers ``` _Note: adapter-transformers is a fork of transformers that acts as a +drop-in replacement with adapter support. [More](https://docs.adapterhub.ml/installation.html)_ Now, the adapter can be loaded and activated like this: -```python -from transformers import AutoModelWithHeads +```python from transformers import AutoModelWithHeads -model = AutoModelWithHeads.from_pretrained("{model_name}") -adapter_name = model.load_adapter("{adapter_repo_name}") -model.active_adapters = adapter_name -``` +model = AutoModelWithHeads.from_pretrained("{model_name}") adapter_name = model.load_adapter("{adapter_repo_name}") +model.active_adapters = adapter_name ``` ## Architecture & Training @@ -124,7 +118,8 @@ def push_adapter_to_hub( use_auth_token: Union[bool, str] = True, overwrite_adapter_card: bool = False, ): - """Upload an adapter to HuggingFace's Model Hub. + """ + Upload an adapter to HuggingFace's Model Hub. Args: repo_name (str): The name of the repository on the model hub to upload to. @@ -132,10 +127,11 @@ def push_adapter_to_hub( organization (str, optional): Organization in which to push the adapter (you must be a member of this organization). Defaults to None. adapterhub_tag (str, optional): Tag of the format `/` for categorization on https://adapterhub.ml/explore/. - See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. - If not specified, `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. + See https://docs.adapterhub.ml/contributing.html#add-a-new-task-or-subtask for more. If not specified, + `datasets_tag` must be given in case a new adapter card is generated. Defaults to None. 
datasets_tag (str, optional): Dataset identifier from https://huggingface.co/datasets. - If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to None. + If not specified, `adapterhub_tag` must be given in case a new adapter card is generated. Defaults to + None. local_path (str, optional): Local path used as clone directory of the adapter repository. If not specified, will create a temporary directory. Defaults to None. commit_message (:obj:`str`, `optional`): @@ -145,7 +141,8 @@ def push_adapter_to_hub( Whether or not the repository created should be private (requires a paying subscription). use_auth_token (:obj:`bool` or :obj:`str`, `optional`): The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token - generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to True. + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). Defaults to + True. overwrite_adapter_card (bool, optional): Overwrite an existing adapter card with a newly generated one. If set to `False`, will only generate an adapter card, if none exists. Defaults to False. diff --git a/src/transformers/adapters/models/bert.py b/src/transformers/adapters/models/bert.py index ecafc0e81b..6dee47b5ce 100644 --- a/src/transformers/adapters/models/bert.py +++ b/src/transformers/adapters/models/bert.py @@ -264,9 +264,10 @@ def add_qa_head( self.add_prediction_head(head, overwrite_ok) def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=False, id2label=None): - """Adds a biaffine dependency parsing head on top of the model. - The parsing head uses the architecture described in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? - An Empirical Investigation" (Glavaš & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). + """ + Adds a biaffine dependency parsing head on top of the model. The parsing head uses the architecture described + in "Is Supervised Syntactic Parsing Beneficial for Language Understanding? An Empirical Investigation" (Glavaš + & Vulić, 2021) (https://arxiv.org/pdf/2008.06788.pdf). Args: head_name (str): The name of the head. @@ -278,7 +279,8 @@ def add_dependency_parsing_head(self, head_name, num_labels=2, overwrite_ok=Fals self.add_prediction_head(head, overwrite_ok) def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): - """Adds a masked language modeling head on top of the model. + """ + Adds a masked language modeling head on top of the model. Args: head_name (str): The name of the head. @@ -289,7 +291,8 @@ def add_masked_lm_head(self, head_name, activation_function="gelu", overwrite_ok self.add_prediction_head(head, overwrite_ok=overwrite_ok) def add_causal_lm_head(self, head_name, activation_function="gelu", overwrite_ok=False): - """Adds a causal language modeling head on top of the model. + """ + Adds a causal language modeling head on top of the model. Args: head_name (str): The name of the head. 
diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 2d0f699fbc..8a7dadeeff 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -43,9 +43,6 @@ def __init__( model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - do_save_full_model: Optional[bool] = None, - do_save_adapters: Optional[bool] = None, - do_save_adapter_fusion: Optional[bool] = None, adapter_names: Optional[List[List[str]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), ): @@ -63,7 +60,7 @@ def __init__( ) # Setting this to True can lead to unexpected behaviour with adapters - self.args.remove_unused_columns = False + # self.args.remove_unused_columns = False if adapter_names is not None: self.model.set_active_adapters(adapter_names) @@ -247,5 +244,5 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra fusion_reg_loss.backward() -class AdapterSeq2SeqTrainer(AdapterTrainer, Seq2SeqTrainer): +class Seq2SeqAdapterTrainer(AdapterTrainer, Seq2SeqTrainer): pass diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 73662a0a6e..446dc8403a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1357,10 +1357,10 @@ def forward( models and adpters. Since this class does classification on the last token, it requires to know the position of the last token. If a -:obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each -row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot -guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take -the last value in each row of the batch). +:obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. +If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess +the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take the last +value in each row of the batch). 
""", GPT2_START_DOCSTRING, ) From 9b5d85913b6e7800c5d1dd2c3d1d5a84c8110732 Mon Sep 17 00:00:00 2001 From: hSterz Date: Tue, 14 Sep 2021 14:43:06 +0200 Subject: [PATCH 25/29] Style --- src/transformers/adapters/trainer.py | 12 ++++++++++-- tests/test_adapter_training.py | 3 +-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 4bd428ef49..274f672fae 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -7,6 +7,7 @@ from torch.utils.data.dataset import Dataset from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, __version__ +from transformers.adapters.composition import AdapterCompositionBlock, Fuse from transformers.dependency_versions_check import dep_version_check from transformers.integrations import is_fairscale_available from transformers.modeling_utils import unwrap_model @@ -75,9 +76,10 @@ def __init__( or isinstance(self.model.active_adapters, AdapterCompositionBlock) and any([isinstance(child, Fuse) for child in self.model.active_adapters.children]) ) - else: + if model.active_adapters is None: raise ValueError( - "Expected a freezed model with adapters to train. If you want tu fully finetune the model use the Trainer class" + "Expected a model with an active adapter setup. " + "If you want tu fully finetune the model use the Trainer class" ) def create_optimizer(self): @@ -214,6 +216,12 @@ def __init__(self, trainer): super().__init__() self.trainer = trainer + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + model = kwargs.pop("model") + model_freezed = getattr(model.base_model, "model_freezed", False) + if not model_freezed: + raise ValueError("The model is not freezed. 
For training adapters please call the train_adapters() method") + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): model = kwargs.pop("model") if args.load_best_model_at_end and state.best_model_checkpoint is not None: diff --git a/tests/test_adapter_training.py b/tests/test_adapter_training.py index 3a1f6f94ac..7efc008a9d 100644 --- a/tests/test_adapter_training.py +++ b/tests/test_adapter_training.py @@ -10,9 +10,8 @@ GlueDataTrainingArguments, TrainingArguments, ) -from transformers.adapters.composition import Fuse -from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.adapters.composition import BatchSplit, Fuse +from transformers.adapters.trainer import AdapterTrainer as Trainer from transformers.testing_utils import require_torch From eca72830cdde0d2232be770d1818b214557a3972 Mon Sep 17 00:00:00 2001 From: hSterz Date: Tue, 14 Sep 2021 17:21:57 +0200 Subject: [PATCH 26/29] Overwrite remove unused columns method --- src/transformers/adapters/trainer.py | 33 +++++++++++++++++++++++++--- tests/extended/test_trainer_ext.py | 1 - 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py index 274f672fae..c32991ba17 100644 --- a/src/transformers/adapters/trainer.py +++ b/src/transformers/adapters/trainer.py @@ -1,8 +1,11 @@ +import inspect import os import re from typing import Callable, Dict, List, Optional, Tuple, Union +import datasets import torch +from packaging import version from torch import nn from torch.utils.data.dataset import Dataset @@ -59,9 +62,6 @@ def __init__( optimizers=optimizers, ) - # Setting this to True can lead to unexpected behaviour with adapters - # self.args.remove_unused_columns = False - if adapter_names is not None: self.model.set_active_adapters(adapter_names) # Set the defaults for loading/ saving model & adapters @@ -210,6 +210,33 @@ def _load_heads(self, resume_from_checkpoint): ): self.model.load_head(os.path.join(resume_from_checkpoint, file_name)) + def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None): + if not self.args.remove_unused_columns: + return dataset + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + signature = inspect.signature(self.model.forward) + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += ["label", "label_ids"] + self._signature_columns += self.label_names + columns = [k for k in self._signature_columns if k in dataset.column_names] + ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if description is None else f"in the {description} set " + logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}." 
+ ) + + if version.parse(datasets.__version__) < version.parse("1.4.0"): + dataset.set_format( + type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"] + ) + return dataset + else: + return dataset.remove_columns(ignored_columns) + class AdapterTrainerCallback(TrainerCallback): def __init__(self, trainer): diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 99a93ac126..e7e7a53fe5 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -253,7 +253,6 @@ def run_trainer( --validation_file {data_dir}/val.json --test_file {data_dir}/test.json --output_dir {output_dir} - --train_adapter --overwrite_output_dir --max_train_samples 8 --max_source_length {max_len} From b67595e7dbd56084083ea26857def34eed4dafa3 Mon Sep 17 00:00:00 2001 From: hSterz Date: Wed, 15 Sep 2021 09:44:27 +0200 Subject: [PATCH 27/29] Add extended adapter trainer test --- src/transformers/adapters/modeling.py | 4 + tests/extended/test_adapter_trainer_ext.py | 323 +++++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 tests/extended/test_adapter_trainer_ext.py diff --git a/src/transformers/adapters/modeling.py b/src/transformers/adapters/modeling.py index 9c826cd1b7..01630b45ea 100644 --- a/src/transformers/adapters/modeling.py +++ b/src/transformers/adapters/modeling.py @@ -82,6 +82,10 @@ def __init__( if down_sample is None: self.down_sample = self.input_size // 2 + # ensure that the down sample size is at least 1 + if self.down_sample < 1: + self.down_sample = 1 + # Linear down projection of the input seq_list.append(nn.Linear(self.input_size, self.down_sample)) diff --git a/tests/extended/test_adapter_trainer_ext.py b/tests/extended/test_adapter_trainer_ext.py new file mode 100644 index 0000000000..542434533e --- /dev/null +++ b/tests/extended/test_adapter_trainer_ext.py @@ -0,0 +1,323 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import os +import re +import sys +import unittest +from unittest.mock import patch + +from transformers.file_utils import is_apex_available +from transformers.integrations import is_fairscale_available +from transformers.testing_utils import ( + CaptureStderr, + ExtendSysPath, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + get_torch_dist_unique_port, + require_torch_gpu, + require_torch_multi_gpu, + require_torch_non_multi_gpu, + slow, +) +from transformers.trainer_callback import TrainerState +from transformers.trainer_utils import set_seed + + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/../../examples/translation"): + from run_translation import main # noqa + + +set_seed(42) +MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1" +MBART_TINY = "sshleifer/tiny-mbart" + + +# a candidate for testing_utils +def require_fairscale(test_case): + """ + Decorator marking a test that requires fairscale + """ + if not is_fairscale_available(): + return unittest.skip("test requires fairscale")(test_case) + else: + return test_case + + +# a candidate for testing_utils +def require_apex(test_case): + """ + Decorator marking a test that requires apex + """ + if not is_apex_available(): + return unittest.skip("test requires apex")(test_case) + else: + return test_case + + +class TestTrainerExt(TestCasePlus): + def run_seq2seq_quick( + self, + distributed=False, + extra_args_str=None, + predict_with_generate=True, + do_train=True, + do_eval=True, + do_predict=True, + ): + output_dir = self.run_trainer( + eval_steps=1, + max_len=12, + model_name=MBART_TINY, + num_train_epochs=1, + distributed=distributed, + extra_args_str=extra_args_str, + predict_with_generate=predict_with_generate, + do_train=do_train, + do_eval=do_eval, + do_predict=do_predict, + ) + logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history + + if not do_eval: + return + + eval_metrics = [log for log in logs if "eval_loss" in log.keys()] + + first_step_stats = eval_metrics[0] + if predict_with_generate: + assert "eval_bleu" in first_step_stats + + last_step_stats = eval_metrics[-1] + assert isinstance(last_step_stats["eval_bleu"], float) + assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`" + + @require_torch_non_multi_gpu + def test_run_seq2seq_no_dist(self): + self.run_seq2seq_quick() + + # verify that the trainer can handle non-distributed with n_gpu > 1 + @require_torch_multi_gpu + def test_run_seq2seq_dp(self): + self.run_seq2seq_quick(distributed=False) + + # verify that the trainer can handle distributed with n_gpu > 1 + @require_torch_multi_gpu + def test_run_seq2seq_ddp(self): + self.run_seq2seq_quick(distributed=True) + + # test --sharded_ddp w/o --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_sharded_ddp(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple") + + # test --sharded_ddp w/ --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_sharded_ddp_fp16(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16") + + # test --sharded_ddp zero_dp_2 w/o --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_fully_sharded_ddp(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False) + + # test --sharded_ddp zero_dp_2 w/ --fp16 + @require_torch_multi_gpu + @require_fairscale + def 
test_run_seq2seq_fully_sharded_ddp_fp16(self): + self.run_seq2seq_quick( + distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False + ) + + @require_apex + @require_torch_gpu + def test_run_seq2seq_apex(self): + # XXX: apex breaks the trainer if it's run twice e.g. run_seq2seq.main() from the same + # program and it breaks other tests that run from the same pytest worker, therefore until this is + # sorted out it must be run only in an external program, that is distributed=True in this + # test and only under one or more gpus - if we want cpu will need to make a special test + # + # specifically to the problem traced it to self.optimizer.step() - if it's run 2nd time via + # 2nd main() call it botches the future eval. + # + self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex") + # test 2nd time - was getting eval_loss': nan' + # to reproduce the problem set distributed=False + self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex") + + @require_torch_multi_gpu + def test_trainer_log_level_replica(self): + log_info_string = "Running training" + kwargs = dict(distributed=True, predict_with_generate=False, do_eval=False, do_predict=False) + + # test with the default log_level - should be info and thus log info once + with CaptureStderr() as cl: + self.run_seq2seq_quick( + **kwargs, + extra_args_str="", + ) + n_matches = len(re.findall(log_info_string, cl.err)) + self.assertEqual(n_matches, 1) + + # test with low log_level and log_level_replica - should be noisy on all processes + # now the info string should appear twice on 2 processes + with CaptureStderr() as cl: + self.run_seq2seq_quick( + **kwargs, + extra_args_str="--log_level debug --log_level_replica debug", + ) + n_matches = len(re.findall(log_info_string, cl.err)) + self.assertEqual(n_matches, 2) + + # test with high log_level and low log_level_replica + # now the info string should appear once only on the replica + with CaptureStderr() as cl: + self.run_seq2seq_quick( + **kwargs, + extra_args_str="--log_level error --log_level_replica debug", + ) + n_matches = len(re.findall(log_info_string, cl.err)) + self.assertEqual(n_matches, 1) + + # test with high log_level and log_level_replica - should be quiet on all processes + with CaptureStderr() as cl: + self.run_seq2seq_quick( + **kwargs, + extra_args_str="--log_level error --log_level_replica error", + ) + n_matches = len(re.findall(log_info_string, cl.err)) + self.assertEqual(n_matches, 0) + + @slow + def test_run_seq2seq_slow(self): + output_dir = self.run_trainer( + eval_steps=2, + max_len=128, + model_name=MARIAN_MODEL, + learning_rate=3e-4, + num_train_epochs=10, + distributed=False, + ) + + # Check metrics + logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history + eval_metrics = [log for log in logs if "eval_loss" in log.keys()] + first_step_stats = eval_metrics[0] + last_step_stats = eval_metrics[-1] + + assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing" + assert isinstance(last_step_stats["eval_bleu"], float) + + # test if do_predict saves generations and metrics + contents = os.listdir(output_dir) + contents = {os.path.basename(p) for p in contents} + assert "generated_predictions.txt" in contents + assert "predict_results.json" in contents + + def run_trainer( + self, + eval_steps: int, + max_len: int, + model_name: str, + num_train_epochs: int, + learning_rate: float = 3e-3, + distributed: bool 
= False, + extra_args_str: str = None, + predict_with_generate: bool = True, + do_train: bool = True, + do_eval: bool = True, + do_predict: bool = True, + ): + data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" + output_dir = self.get_auto_remove_tmp_dir() + args_train = f""" + --model_name_or_path {model_name} + --train_file {data_dir}/train.json + --validation_file {data_dir}/val.json + --test_file {data_dir}/test.json + --output_dir {output_dir} + --overwrite_output_dir + --max_train_samples 8 + --max_source_length {max_len} + --max_target_length {max_len} + --do_train + --train_adapter + --num_train_epochs {str(num_train_epochs)} + --per_device_train_batch_size 4 + --learning_rate {learning_rate} + --warmup_steps 8 + --logging_steps 0 + --save_steps {str(eval_steps)} + --group_by_length + --label_smoothing_factor 0.1 + --adafactor + --target_lang ro_RO + --source_lang en_XX + """ + + args_eval = f""" + --do_eval + --train_adapter + --per_device_eval_batch_size 4 + --max_eval_samples 8 + --val_max_target_length {max_len} + --evaluation_strategy steps + --eval_steps {str(eval_steps)} + """ + + args_predict = """ + --do_predict + """ + + args = "" + if do_train: + args += args_train + + if do_eval: + args += args_eval + + if do_predict: + args += args_predict + + if predict_with_generate: + args += "--predict_with_generate" + + args = args.split() + + if extra_args_str is not None: + args.extend(extra_args_str.split()) + + if distributed: + n_gpu = get_gpu_count() + master_port = get_torch_dist_unique_port() + distributed_args = f""" + -m torch.distributed.launch + --nproc_per_node={n_gpu} + --master_port={master_port} + {self.examples_dir_str}/pytorch/translation/run_translation.py + """.split() + cmd = [sys.executable] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + else: + testargs = ["run_translation.py"] + args + with patch.object(sys, "argv", testargs): + main() + + return output_dir From b9e41ec9343fa9d48a1c3f7e4a190c6a06a09ea9 Mon Sep 17 00:00:00 2001 From: hSterz Date: Wed, 15 Sep 2021 09:58:55 +0200 Subject: [PATCH 28/29] Fix --- examples/translation/run_translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index f059cdbae5..e32d881950 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -582,7 +582,7 @@ def compute_metrics(eval_preds): training_args.load_best_model_at_end = True # Initialize our Trainer - trainer_class = Seq2SeqAdapterTrainer if +adapter_args.train_adapter else Seq2SeqTrainer + trainer_class = Seq2SeqAdapterTrainer if adapter_args.train_adapter else Seq2SeqTrainer trainer = trainer_class( model=model, args=training_args, From 05ef6ba5b6d59ef0144e327be892fbb4fb4bfc61 Mon Sep 17 00:00:00 2001 From: hSterz Date: Wed, 15 Sep 2021 10:55:20 +0200 Subject: [PATCH 29/29] Fix --- examples/translation/run_translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index e32d881950..76e750b6b9 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -45,11 +45,11 @@ MBartTokenizerFast, MultiLingAdapterArguments, Seq2SeqAdapterTrainer, + Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator, set_seed, ) -from transformers.adapters.trainer import Seq2SeqAdapterTrainer as Seq2SeqTrainer from 
transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version
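
The example-script changes above (run_glue_alt.py, run_ner.py, run_translation.py) all converge on the same pattern: select `AdapterTrainer` (or `Seq2SeqAdapterTrainer`) when `--train_adapter` is set and drop the old `do_save_full_model`/`do_save_adapters` flags, since the trainer now derives the saving behaviour from the model itself. The sketch below shows that pattern outside the example scripts; the checkpoint name, adapter name, and head configuration are placeholders and not part of the patch.

```python
from transformers import AdapterTrainer, AutoModelWithHeads, Trainer, TrainingArguments

# Placeholder setup: any model with heads and a task adapter follows the same flow.
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
model.add_adapter("task")
model.add_classification_head("task", num_labels=2)
# train_adapter() freezes the base model and activates the adapter; without it the
# new on_train_begin guard raises a ValueError as soon as training starts.
model.train_adapter("task")

training_args = TrainingArguments(output_dir="./out", num_train_epochs=1)

train_adapter = True  # would come from the parsed adapter arguments in a real script
trainer_class = AdapterTrainer if train_adapter else Trainer
trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=None,  # plug in a tokenized dataset before calling trainer.train()
)
# With a frozen base model and active adapters, AdapterTrainer saves adapters
# (and fusions) instead of the full model, so no explicit saving flags are needed.
```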
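
Patch 26 replaces the earlier blanket `remove_unused_columns = False` workaround with an `_remove_unused_columns` override that mirrors the base `Trainer` rule: only columns accepted by the model's `forward()`, plus the label columns, survive. A toy, self-contained illustration of that filtering with hypothetical column names:

```python
import inspect

def signature_columns(forward_fn, label_names=("labels",)):
    # Keep forward() arguments plus the usual label column aliases.
    cols = set(inspect.signature(forward_fn).parameters.keys())
    return cols | {"label", "label_ids", *label_names}

def toy_forward(input_ids=None, attention_mask=None, labels=None):
    pass

dataset_columns = ["input_ids", "attention_mask", "labels", "idx", "sentence"]
keep = [c for c in dataset_columns if c in signature_columns(toy_forward)]
dropped = [c for c in dataset_columns if c not in signature_columns(toy_forward)]
print(keep)     # ['input_ids', 'attention_mask', 'labels']
print(dropped)  # ['idx', 'sentence']
```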
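
Patch 27 also clamps the adapter bottleneck size to at least 1. The numbers below are hypothetical, but assuming the usual hidden-size-divided-by-reduction-factor computation, they show how a tiny test model combined with a large reduction factor would otherwise collapse the bottleneck to zero width:

```python
import torch
from torch import nn

hidden_size = 32        # hypothetical tiny test model
reduction_factor = 64   # hypothetical large reduction factor
down_sample = hidden_size // reduction_factor   # -> 0 without the clamp
down_sample = max(down_sample, 1)               # lower bound added by the patch

# A zero-width bottleneck would discard everything passing through the adapter;
# with the clamp the down- and up-projections stay well defined.
bottleneck = nn.Sequential(
    nn.Linear(hidden_size, down_sample),
    nn.ReLU(),
    nn.Linear(down_sample, hidden_size),
)
print(bottleneck(torch.randn(2, hidden_size)).shape)  # torch.Size([2, 32])
```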