From a77c0ade0a9e9458b231078abe724da0cdd7b31a Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Tue, 24 May 2022 15:08:49 +0800 Subject: [PATCH 01/16] update transformer_decoder_layer.py for hugging face --- .../ops/pytorch/transformer_decoder_layer.py | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 6e722f0d..d4df7289 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -1,13 +1,17 @@ import math +from dataclasses import dataclass import torch from torch import nn from torch.autograd import Function from lightseq.training.ops.pytorch import transformer_cuda_module +from lightseq.training.ops.pytorch.builder import TransformerBuilder from lightseq.training.ops.pytorch.util import ( copy_para, state_dict, + MODEL_ARCH, + check_config, calc_offset, ) from lightseq.training.ops.pytorch.layer_base import TransformerDecoderLayerBase @@ -111,6 +115,7 @@ class LSTransformerDecoderLayer(TransformerDecoderLayerBase): """ layer_id = 0 + encdec_kv_sta = None def __init__(self, config, initial_weights=None, initial_biases=None): super(LSTransformerDecoderLayer, self).__init__() @@ -156,7 +161,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): self.para_offset = self.para_offset[:-2] self.para = nn.Parameter(torch.Tensor(self.para_offset[-1])) - if initial_weights is None or initial_biases is None: + if initial_weights is None and initial_biases is None: # enc-dec kv weights and bias self.init_transformer_weights() return @@ -184,28 +189,31 @@ def __init__(self, config, initial_weights=None, initial_biases=None): @staticmethod def gen_offset(hidden_size, intermediate_size, nlayer): + """Returns the offset of each module's parameters among all + parameters of a layer + """ hs, ims = hidden_size, intermediate_size sizes = [ - hs * hs * 3, # attn_qkvw - hs * 3, # attn_qkvb - hs * hs, # attn_ow - hs, # attn_ob - hs, # attn_nw - hs, # attn_nb - hs * hs, # encdec_attn_qw - hs, # encdec_attn_qb - hs * hs, # encdec_attn_ow - hs, # encdec_attn_ob - hs, # encdec_attn_nw - hs, # encdec_attn_nb - hs * ims, # inter_w - ims, # inter_b - hs * ims, # output_w - hs, # output_b - hs, # ffn_nw - hs, # ffn_nb - hs * hs * 2 * nlayer, # encdec_attn_kvw - hs * 2 * nlayer, # encdec_attn_kvb + hs * hs * 3, # attn_qkv weight + hs * 3, # attn_qkv bias + hs * hs, # attn_out weight + hs, # attn_out bias + hs, # attn_layernorm weight + hs, # attn_layernorm bias + hs * hs, # encdec_attn_q weight + hs, # encdec_attn_q bias + hs * hs, # encdec_attn_out weight + hs, # encdec_attn_out bias + hs, # encdec_attn_layernorm weight + hs, # encdec_attn_layernorm bias + hs * ims, # inter weight + ims, # inter bias + hs * ims, # output weight + hs, # output bias + hs, # ffn norm weight + hs, # ffn norm bias + hs * hs * 2 * nlayer, # encdec_attn_kv w + hs * 2 * nlayer, # encdec_attn_kv b ] offsets = calc_offset(sizes) return offsets @@ -216,6 +224,7 @@ def _get_weights(self, i): ) def calc_bound(self, w): + """Used to initialize parameters""" fan_in, _ = nn.init._calculate_fan_in_and_fan_out(w) bound = 1.0 / math.sqrt(fan_in) return bound @@ -268,6 +277,7 @@ def init_transformer_weights(self): nn.init.uniform_(self._get_weights(19), -bound, bound) def __assign_layer_weight_grad(self): + """fp16 or fp32""" param = ( self.para_16 if self.config.fp16 and self.para.dtype != 
torch.half @@ -316,7 +326,7 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): return destination def forward( - self, decoder_states, encoder_out, encoder_padding_mask, cache, **kwargs + self, decoder_states, encoder_out, encoder_padding_mask, cache=None, **kwargs ): """ decoder_states, [batch_size, trg_len, hidden_size] or [batch_size * beam_size, 1, hidden_size] @@ -375,7 +385,9 @@ def forward( shape, dtype=decoder_states.dtype, device=decoder_states.device ).contiguous() cache["encdec_kv"] = encdec_kv - cache_list.append(cache["encdec_kv"]) + self.encdec_kv = encdec_kv + # cache_list.append(cache["encdec_kv"]) + cache_list.append(self.encdec_kv) head_dim = int(self.config.hidden_size / self.config.nhead) shape = (batch_beams, self.config.nhead, step + 1, head_dim) new_k = torch.zeros( @@ -418,4 +430,5 @@ def forward( self.config, cache_list, ) - return output.to(self.para) + past_key_value = (cache_list[0], cache_list[1]) + return output.to(self.para), past_key_value From a61e7620968882d4c75866c7d31907d48f817224 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Mon, 30 May 2022 16:58:54 +0800 Subject: [PATCH 02/16] huggingface bert training --- .../huggingface/bert/task_glue/run_glue.py | 73 +++++++++++++++++-- .../huggingface/bert/task_glue/run_glue.sh | 2 +- .../ops/pytorch/transformer_encoder_layer.py | 42 +++++++++++ 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py index 0b3b62ca..29075954 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -41,11 +41,14 @@ TrainingArguments, default_data_collator, set_seed, + BertForSequenceClassification, + BertLayer ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version from ls_hf_transformer_layer import inject_ls_layer +import torch.nn as nn # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
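A usage sketch (not part of the patch): the LSBertForSequenceClassification wrapper added in the hunk below is meant to behave like a regular Hugging Face model, with LightSeq fused layers injected on load and converted back to plain BertLayer weights on save. The model name and the training_args / model_args objects here are placeholders for what run_glue.py builds elsewhere.

    from transformers import AutoConfig
    # LSBertForSequenceClassification, training_args and model_args are assumed
    # to come from run_glue.py as modified in the following hunk.
    config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=2)
    model = LSBertForSequenceClassification.from_pretrained(
        "bert-base-uncased",          # or a previously saved checkpoint directory
        training_args=training_args,  # HF TrainingArguments (fp16, local_rank, ...)
        model_args=model_args,        # carries module_type (1 or 2 selects the LightSeq variant)
        config=config,
    )
    # Training runs on the fused LightSeq encoder layers; the overridden
    # save_pretrained() copies the fused parameters back into standard
    # BertLayer modules so the saved checkpoint stays HF-compatible.
    model.save_pretrained(training_args.output_dir)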
@@ -70,6 +73,56 @@ logger = logging.getLogger(__name__) +class LSBertForSequenceClassification(BertForSequenceClassification): + @classmethod + def from_pretrained(self, *args, training_args, model_args, **kwargs): + self.config = kwargs["config"] + model = super().from_pretrained(*args, **kwargs) + if model_args.module_type == 1 or model_args.module_type == 2: + inject_ls_layer(model, training_args, model_args, self.config) + return model + + def inject_origin_layer(self, hf_layer, ls_layer): + for layer_id in range(self.config.num_hidden_layers): + weight, bias = ls_layer[layer_id].params_dict() + layer = hf_layer[layer_id] + + layer.attention.self.query.weight.data.copy_(weight["self_attn_q_proj"]) + layer.attention.self.query.bias.data.copy_(bias["self_attn_q_proj"]) + layer.attention.self.key.weight.data.copy_(weight["self_attn_k_proj"]) + layer.attention.self.key.bias.data.copy_(bias["self_attn_k_proj"]) + layer.attention.self.value.weight.data.copy_(weight["self_attn_v_proj"]) + layer.attention.self.value.bias.data.copy_(bias["self_attn_v_proj"]) + layer.attention.output.dense.weight.data.copy_(weight["self_attn_out_proj"]) + layer.attention.output.dense.bias.data.copy_(bias["self_attn_out_proj"]) + layer.attention.output.LayerNorm.weight.data.copy_(weight["self_attn_layer_norm"]) + layer.attention.output.LayerNorm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.intermediate.dense.weight.data.copy_(weight["fc1"]) + layer.intermediate.dense.bias.data.copy_(bias["fc1"]) + layer.output.dense.weight.data.copy_(weight["fc2"]) + layer.output.dense.bias.data.copy_(bias["fc2"]) + layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) + layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) + + def save_pretrained(self, *args, **kwargs): + def unwrap_model(model): + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + + model_to_save = unwrap_model(self) + + ls_encoder_layer = model_to_save.bert.encoder.layer + model_to_save.bert.encoder.layer = nn.ModuleList([BertLayer(self.config) for _ in range(self.config.num_hidden_layers)]) + self.inject_origin_layer(model_to_save.bert.encoder.layer, ls_encoder_layer) + state_dict = model_to_save.state_dict() + kwargs["state_dict"] = state_dict + + super().save_pretrained(*args, **kwargs) + model_to_save.bert.encoder.layer = ls_encoder_layer + @dataclass class DataTrainingArguments: @@ -406,18 +459,28 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForSequenceClassification.from_pretrained( - model_args.model_name_or_path, + # model = AutoModelForSequenceClassification.from_pretrained( + # model_args.model_name_or_path, + # from_tf=bool(".ckpt" in model_args.model_name_or_path), + # config=config, + # cache_dir=model_args.cache_dir, + # revision=model_args.model_revision, + # use_auth_token=True if model_args.use_auth_token else None, + # ) + model = LSBertForSequenceClassification.from_pretrained( + '/tmp/sst2/checkpoint-500', + training_args=training_args, + model_args=model_args, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - + print(type(model)) # Replace with LightSeq encoder layers. 
- if model_args.module_type == 1 or model_args.module_type == 2: - inject_ls_layer(model, training_args, model_args, config) + # if model_args.module_type == 1 or model_args.module_type == 2: + # inject_ls_layer(model, training_args, model_args, config) # Preprocessing the datasets if data_args.task_name is not None: diff --git a/examples/training/huggingface/bert/task_glue/run_glue.sh b/examples/training/huggingface/bert/task_glue/run_glue.sh index a7756ab2..7a183813 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.sh +++ b/examples/training/huggingface/bert/task_glue/run_glue.sh @@ -33,5 +33,5 @@ python3 -m torch.distributed.launch \ --fp16 \ --seed 1234 \ --logging_steps 10 \ - --module_type 2 \ + --module_type 1 \ --enable_quant false diff --git a/lightseq/training/ops/pytorch/transformer_encoder_layer.py b/lightseq/training/ops/pytorch/transformer_encoder_layer.py index b249d12c..dddcf33b 100644 --- a/lightseq/training/ops/pytorch/transformer_encoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_encoder_layer.py @@ -102,6 +102,8 @@ def __init__(self, config, initial_weights=None, initial_biases=None): hs = self.config.hidden_size ims = self.config.intermediate_size + self.hs = hs + self.ims = ims self.para_offset = LSTransformerEncoderLayer.gen_offset(hs, ims) self.para = nn.Parameter(torch.Tensor(self.para_offset[-1])) @@ -212,6 +214,46 @@ def init_transformer_weights(self): nn.init.ones_(self._get_weights(10)) nn.init.zeros_(self._get_weights(11)) + + def params_dict(self): + ''' + Returns: + weight: dict + bias: dict + ''' + def copy_and_view(m, shape=None): + if shape is None: + shape = (-1,) + return m.data.clone().view(*shape) + + self_attn_qkvw = self._get_weights(0) + self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split(self.hs * self.hs, 0) + self_attn_qkvb = self._get_weights(1) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) + + weight = { + "self_attn_q_proj": copy_and_view(self_attn_qw, (self.hs, self.hs)), + "self_attn_k_proj": copy_and_view(self_attn_kw, (self.hs, self.hs)), + "self_attn_v_proj": copy_and_view(self_attn_vw, (self.hs, self.hs)), + "self_attn_out_proj": copy_and_view(self._get_weights(2), (self.hs, self.hs)), + "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), + "fc1": copy_and_view(self._get_weights(6), (self.ims, self.hs)), + "fc2": copy_and_view(self._get_weights(8), (self.hs, self.ims)), + "final_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)) + } + bias = { + "self_attn_q_proj": copy_and_view(self_attn_qb), + "self_attn_k_proj": copy_and_view(self_attn_kb), + "self_attn_v_proj": copy_and_view(self_attn_vb), + "self_attn_out_proj": copy_and_view(self._get_weights(3)), + "self_attn_layer_norm": copy_and_view(self._get_weights(5)), + "fc1": copy_and_view(self._get_weights(7)), + "fc2": copy_and_view(self._get_weights(9)), + "final_layer_norm": copy_and_view(self._get_weights(11)) + } + return weight, bias + + def __assign_layer_weight_grad(self): param = ( self.para_16 From 89dc03e861590a5c47590ac9d03a9e7e3ca5bd3b Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Tue, 31 May 2022 11:35:28 +0800 Subject: [PATCH 03/16] hugging bart and bert training --- .../huggingface/bert/task_glue/run_glue.py | 18 ++--- .../huggingface/bert/task_glue/run_glue.sh | 2 +- .../ops/pytorch/transformer_decoder_layer.py | 70 ++++++++++++++++++- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py 
b/examples/training/huggingface/bert/task_glue/run_glue.py index 29075954..9d99ae42 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -104,7 +104,7 @@ def inject_origin_layer(self, hf_layer, ls_layer): layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) - def save_pretrained(self, *args, **kwargs): + def hf_state_dict(self): def unwrap_model(model): # since there could be multiple levels of wrapping, unwrap recursively if hasattr(model, "module"): @@ -113,15 +113,17 @@ def unwrap_model(model): return model model_to_save = unwrap_model(self) - ls_encoder_layer = model_to_save.bert.encoder.layer model_to_save.bert.encoder.layer = nn.ModuleList([BertLayer(self.config) for _ in range(self.config.num_hidden_layers)]) self.inject_origin_layer(model_to_save.bert.encoder.layer, ls_encoder_layer) state_dict = model_to_save.state_dict() - kwargs["state_dict"] = state_dict + model_to_save.bert.encoder.layer = ls_encoder_layer + return state_dict + + def save_pretrained(self, *args, **kwargs): + kwargs["state_dict"] = self.hf_state_dict() super().save_pretrained(*args, **kwargs) - model_to_save.bert.encoder.layer = ls_encoder_layer @dataclass @@ -467,17 +469,17 @@ def main(): # revision=model_args.model_revision, # use_auth_token=True if model_args.use_auth_token else None, # ) - model = LSBertForSequenceClassification.from_pretrained( + model = BertForSequenceClassification.from_pretrained( '/tmp/sst2/checkpoint-500', - training_args=training_args, - model_args=model_args, + # training_args=training_args, + # model_args=model_args, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - print(type(model)) + # print(type(model)) # Replace with LightSeq encoder layers. 
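Loading the saved checkpoint back into a plain BertForSequenceClassification, as the edited block above does with /tmp/sst2/checkpoint-500, works because hf_state_dict() rebuilds a standard Hugging Face state dict: the LightSeq encoder layers are temporarily swapped for fresh BertLayer modules, the fused weights are copied back through params_dict(), and the original layers are restored afterwards. A minimal sketch of that export step, assuming model is an LSBertForSequenceClassification instance and the output path is a placeholder:

    import torch
    # Export a vanilla-BERT-compatible state dict from the LightSeq-injected model.
    state_dict = model.hf_state_dict()
    torch.save(state_dict, "/tmp/sst2/ls_export/pytorch_model.bin")  # placeholder path
    # save_pretrained() does this automatically via kwargs["state_dict"] = self.hf_state_dict().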
# if model_args.module_type == 1 or model_args.module_type == 2: # inject_ls_layer(model, training_args, model_args, config) diff --git a/examples/training/huggingface/bert/task_glue/run_glue.sh b/examples/training/huggingface/bert/task_glue/run_glue.sh index 7a183813..9979979b 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.sh +++ b/examples/training/huggingface/bert/task_glue/run_glue.sh @@ -25,12 +25,12 @@ python3 -m torch.distributed.launch \ --do_train \ --do_eval \ --max_seq_length 128 \ - --per_device_train_batch_size 32 \ --learning_rate 2e-5 \ --num_train_epochs 10 \ --output_dir /tmp/$TASK_NAME/ \ --overwrite_output_dir \ --fp16 \ + --per_device_train_batch_size 32 \ --seed 1234 \ --logging_steps 10 \ --module_type 1 \ diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index d4df7289..dd43b64e 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -153,6 +153,8 @@ def __init__(self, config, initial_weights=None, initial_biases=None): hs = self.config.hidden_size ims = self.config.intermediate_size + self.hs = hs + self.ims = ims self.para_offset = LSTransformerDecoderLayer.gen_offset( hs, ims, self.config.nlayer @@ -218,6 +220,69 @@ def gen_offset(hidden_size, intermediate_size, nlayer): offsets = calc_offset(sizes) return offsets + def params_dict(self): + ''' + Returns: + weight: dict + bias: dict + ''' + def copy_and_view(m, shape=None): + if shape is None: + shape = (-1,) + return m.data.clone().view(*shape) + def _copy(m): + return copy_and_view(m, (self.hs, self.hs)) + + self_attn_qkvw = self._get_weights(0) + self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split(self.hs * self.hs, 0) + self_attn_qkvb = self._get_weights(1) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) + + all_enc_attn_kw, all_enc_attn_vw = None, None + all_enc_attn_kb, all_enc_attn_vb = None, None + if self.config.layer_id == 0: + all_enc_attn_kvw = self._get_weights(18) + all_enc_attn_kvw = all_enc_attn_kvw.split(self.hs * self.hs, 0) + all_enc_attn_kw = list(map(_copy, all_enc_attn_kvw[::2])) + all_enc_attn_vw = list(map(_copy, all_enc_attn_kvw[1::2])) + + all_enc_attn_kvb = self._get_weights(19) + all_enc_attn_kvb = all_enc_attn_kvb.split(self.hs, 0) + all_enc_attn_kb = list(map(copy_and_view, all_enc_attn_kvb[::2])) + all_enc_attn_vb = list(map(copy_and_view, all_enc_attn_kvb[1::2])) + + weight = { + "self_attn_q_proj": copy_and_view(self_attn_qw, (self.hs, self.hs)), + "self_attn_k_proj": copy_and_view(self_attn_kw, (self.hs, self.hs)), + "self_attn_v_proj": copy_and_view(self_attn_vw, (self.hs, self.hs)), + "self_attn_out_proj": copy_and_view(self._get_weights(2), (self.hs, self.hs)), + "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), + "encoder_attn_q_proj":copy_and_view(self._get_weights(6), (self.hs, self.hs)), + "encoder_attn_out_proj":copy_and_view(self._get_weights(8), (self.hs, self.hs)), + "encoder_attn_layer_norm":copy_and_view(self._get_weights(10), (self.hs,)), + "fc1": copy_and_view(self._get_weights(12), (self.ims, self.hs)), + "fc2": copy_and_view(self._get_weights(14), (self.hs, self.ims)), + "final_layer_norm": copy_and_view(self._get_weights(16), (self.hs,)), + "encoder_attn_k_proj": all_enc_attn_kw, + "encoder_attn_v_proj": all_enc_attn_vw, + } + bias = { + "self_attn_q_proj": copy_and_view(self_attn_qb), + "self_attn_k_proj": 
copy_and_view(self_attn_kb), + "self_attn_v_proj": copy_and_view(self_attn_vb), + "self_attn_out_proj": copy_and_view(self._get_weights(3)), + "self_attn_layer_norm": copy_and_view(self._get_weights(5)), + "encoder_attn_q_proj":copy_and_view(self._get_weights(7), (self.hs,)), + "encoder_attn_out_proj":copy_and_view(self._get_weights(9), (self.hs,)), + "encoder_attn_layer_norm":copy_and_view(self._get_weights(11), (self.hs,)), + "fc1": copy_and_view(self._get_weights(13)), + "fc2": copy_and_view(self._get_weights(15)), + "final_layer_norm": copy_and_view(self._get_weights(17)), + "encoder_attn_k_proj": all_enc_attn_kb, + "encoder_attn_v_proj": all_enc_attn_vb + } + return weight, bias + def _get_weights(self, i): return self.para.data.narrow( 0, self.para_offset[i], self.para_offset[i + 1] - self.para_offset[i] @@ -430,5 +495,8 @@ def forward( self.config, cache_list, ) - past_key_value = (cache_list[0], cache_list[1]) + if cache_list == []: + past_key_value = None + else: + past_key_value = (cache_list[0], cache_list[1]) return output.to(self.para), past_key_value From 845a1aa6581f94e491aac0a371e6480fc6cc0abc Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Tue, 31 May 2022 11:37:51 +0800 Subject: [PATCH 04/16] huggingface bart and bert training --- .../huggingface/bart/summarization/.lock | 0 .../bart/summarization/ls_bart_model.py | 326 ++++++++ .../bart/summarization/run_summarization.py | 743 ++++++++++++++++++ .../huggingface/bart/summarization/sum.sh | 38 + 4 files changed, 1107 insertions(+) create mode 100755 examples/training/huggingface/bart/summarization/.lock create mode 100644 examples/training/huggingface/bart/summarization/ls_bart_model.py create mode 100644 examples/training/huggingface/bart/summarization/run_summarization.py create mode 100644 examples/training/huggingface/bart/summarization/sum.sh diff --git a/examples/training/huggingface/bart/summarization/.lock b/examples/training/huggingface/bart/summarization/.lock new file mode 100755 index 00000000..e69de29b diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py new file mode 100644 index 00000000..2b987957 --- /dev/null +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -0,0 +1,326 @@ +import torch +from torch import nn +from dataclasses import dataclass +from lightseq.training.ops.pytorch.transformer_encoder_layer import ( + LSTransformerEncoderLayer as TransformerEncoderLayer, +) +from lightseq.training.ops.pytorch.transformer_decoder_layer import ( + LSTransformerDecoderLayer as TransformerDecoderLayer, +) +from transformers import BartForConditionalGeneration +from transformers.models.bart.modeling_bart import BartEncoderLayer, BartDecoderLayer + +def get_weight_and_bias(m): + weight = m.weight.detach().clone() + bias = m.bias.detach().clone() + return weight, bias + + +def get_hf_bart_dec_enc_atten_kv(layers, params_list, nlayer): + init_ws, init_bs = [], [] + for i in range(nlayer): + layer = layers[i] + modules_list = [] + exec(f"modules_list.append(layer.{params_list.encoder_attn_k_proj})") + exec(f"modules_list.append(layer.{params_list.encoder_attn_v_proj})") + for module in modules_list: + w, b = get_weight_and_bias(module) + init_ws.append(w) + init_bs.append(b) + enc_attn_kvw = torch.cat([ele for ele in init_ws], dim=0) + enc_attn_kvb = torch.cat([ele for ele in init_bs], dim=0) + return enc_attn_kvw, enc_attn_kvb + + +class LSHFTransformerEncoderLayer(TransformerEncoderLayer): + def 
__init__(self, *args, **kwargs): + self.params_list = None + super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs) + + def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs): + ls_encoder_padding_mask = encoder_padding_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) + output = super().forward(hidden_states, ls_encoder_padding_mask) + return (output, None, None, None) + + @staticmethod + def get_params_list(**kwargs): + """Configuration of model hyperparameters for encoder and decoder""" + @dataclass + class ParamsList: + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None + fc1: None + fc2: None + final_layer_norm: None + + params_list = ParamsList(**kwargs) + # check_config(config) + return params_list + + @classmethod + def build_model(cls, config, params_list, layer_list, layer_id): + layer = layer_list[layer_id] + modules_list = [] + ## only python >= 3.6 (orderedDict) + for module_name in params_list.__dict__.values(): + print(module_name) + exec(f"modules_list.append(layer.{module_name})") + init_ws = [] + init_bs = [] + for module in modules_list: + w, b = get_weight_and_bias(module) + init_ws.append(w) + init_bs.append(b) + return cls(config, init_ws, init_bs) + + +class LSHFTransformerDecoderLayer(TransformerDecoderLayer): + def __init__(self, *args, **kwargs): + super(LSHFTransformerDecoderLayer, self).__init__(*args, **kwargs) + + def forward(self, hidden_states, + attention_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_value=None, + use_cache=False, + *args, **kwargs): + encoder_hidden_states = encoder_hidden_states.transpose(0, 1).contiguous() + ls_encoder_padding_mask = encoder_attention_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) + cache = None + if use_cache: + import pdb; pdb.set_trace() + cache = {} if past_key_value is None else { + "dec_self_k": past_key_value[0], + "dec_self_v": past_key_value[1] + } + output = super().forward(hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache) + return output + + @staticmethod + def get_params_list(**kwargs): + """Configuration of model hyperparameters for encoder and decoder""" + @dataclass + class ParamsList: + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None + encoder_attn_q_proj: None + encoder_attn_out_proj: None + encoder_attn_layer_norm: None + fc1: None + fc2: None + final_layer_norm: None + encoder_attn_k_proj: None + encoder_attn_v_proj: None + + params_list = ParamsList(**kwargs) + # check_config(config) + return params_list + + @classmethod + def build_model(cls, config, params_list, layer_list, layer_id): + layer = layer_list[layer_id] + modules_list = [] + for param_name in list(params_list.__dict__.values())[:-2]: + exec(f"modules_list.append(layer.{param_name})") + + init_ws = [] + init_bs = [] + for module in modules_list: + w, b = get_weight_and_bias(module) + init_ws.append(w) + init_bs.append(b) + if layer_id == 0: + enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv(layer_list, params_list, config.nlayer) + init_ws.append(enc_kvw) + init_bs.append(enc_kvb) + return cls(config, init_ws, init_bs) + + +def get_enc_layer_config(training_args, config): + enc_config = LSHFTransformerEncoderLayer.get_config( + max_seq_len=config.max_position_embeddings, + hidden_size=config.d_model, + intermediate_size=config.encoder_ffn_dim, + 
nhead=config.encoder_attention_heads, + attn_prob_dropout_ratio=config.attention_dropout, + activation_dropout_ratio=config.activation_dropout, + hidden_dropout_ratio=config.dropout, + activation_fn=config.activation_function, + max_batch_tokens=4096, + pre_layer_norm=False, + fp16=training_args.fp16, + local_rank=training_args.local_rank, + ) + enc_params_list = LSHFTransformerEncoderLayer.get_params_list( + self_attn_q_proj="self_attn.q_proj", + self_attn_k_proj="self_attn.k_proj", + self_attn_v_proj="self_attn.v_proj", + self_attn_out_proj="self_attn.out_proj", + self_attn_layer_norm="self_attn_layer_norm", + fc1="fc1", + fc2="fc2", + final_layer_norm="final_layer_norm" + ) + return enc_config, enc_params_list + + +def get_dec_layer_config(training_args, config): + dec_config = LSHFTransformerDecoderLayer.get_config( + max_seq_len=config.max_position_embeddings, + hidden_size=config.d_model, + intermediate_size=config.decoder_ffn_dim, + nhead=config.decoder_attention_heads, + attn_prob_dropout_ratio=config.attention_dropout, + activation_dropout_ratio=config.activation_dropout, + hidden_dropout_ratio=config.dropout, + activation_fn=config.activation_function, + nlayer=config.decoder_layers, + pre_layer_norm=False, + max_batch_tokens=4096, + fp16=training_args.fp16, + local_rank=training_args.local_rank, + ) + dec_params_list = LSHFTransformerDecoderLayer.get_params_list( + self_attn_q_proj="self_attn.q_proj", + self_attn_k_proj="self_attn.k_proj", + self_attn_v_proj="self_attn.v_proj", + self_attn_out_proj="self_attn.out_proj", + self_attn_layer_norm="self_attn_layer_norm", + encoder_attn_q_proj="encoder_attn.q_proj", + encoder_attn_out_proj="encoder_attn.out_proj", + encoder_attn_layer_norm="encoder_attn_layer_norm", + fc1="fc1", + fc2="fc2", + final_layer_norm="final_layer_norm", + encoder_attn_k_proj="encoder_attn.k_proj", + encoder_attn_v_proj="encoder_attn.v_proj", + ) + return dec_config, dec_params_list + + +def inject_lightseq_layer(model, training_args, config): + # encoder op replace + model = model.model + for layer_id in range(config.encoder_layers): + enc_config, enc_params_list = get_enc_layer_config(training_args, config) + model.encoder.layers[layer_id] = LSHFTransformerEncoderLayer.build_model( + enc_config, enc_params_list, model.encoder.layers, layer_id + ).cuda() + # decoder op replace + for layer_id in range(config.decoder_layers): + dec_config, dec_params_list = get_dec_layer_config(training_args, config) + model.decoder.layers[layer_id] = LSHFTransformerDecoderLayer.build_model( + dec_config, dec_params_list, model.decoder.layers, layer_id + ).cuda() + + + # model.apply(disable_quant) + + + +class LSBartForConditionalGeneration(BartForConditionalGeneration): + @classmethod + def from_pretrained(self, *args, training_args, **kwargs): + self.config = kwargs["config"] + model = super().from_pretrained(*args, **kwargs) + inject_lightseq_layer(model, training_args, self.config) + return model + + def inject_origin_layer(self, hf_layer, ls_layer, is_decoder=False): + if not is_decoder: + for layer_id in range(self.config.encoder_layers): + weight, bias = ls_layer[layer_id].params_dict() + layer = hf_layer[layer_id] + + layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) + 
layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) + layer.self_attn_layer_norm.weight.data.copy_(weight["self_attn_layer_norm"]) + layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.fc1.weight.data.copy_(weight["fc1"]) + layer.fc1.bias.data.copy_(bias["fc1"]) + layer.fc2.weight.data.copy_(weight["fc2"]) + layer.fc2.bias.data.copy_(bias["fc2"]) + layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) + else: + encoder_attn_k_proj_w = None + encoder_attn_k_proj_b = None + encoder_attn_v_proj_w = None + encoder_attn_v_proj_b = None + for layer_id in range(self.config.decoder_layers): + weight, bias = ls_layer[layer_id].params_dict() + layer = hf_layer[layer_id] + + layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) + layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) + layer.self_attn_layer_norm.weight.data.copy_(weight["self_attn_layer_norm"]) + layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.fc1.weight.data.copy_(weight["fc1"]) + layer.fc1.bias.data.copy_(bias["fc1"]) + layer.fc2.weight.data.copy_(weight["fc2"]) + layer.fc2.bias.data.copy_(bias["fc2"]) + layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) + + layer.encoder_attn.q_proj.weight.data.copy_(weight["encoder_attn_q_proj"]) + layer.encoder_attn.q_proj.bias.data.copy_(bias["encoder_attn_q_proj"]) + layer.encoder_attn.out_proj.weight.data.copy_(weight["encoder_attn_out_proj"]) + layer.encoder_attn.out_proj.bias.data.copy_(bias["encoder_attn_out_proj"]) + layer.encoder_attn_layer_norm.weight.data.copy_(weight["encoder_attn_layer_norm"]) + layer.encoder_attn_layer_norm.bias.data.copy_(bias["encoder_attn_layer_norm"]) + if layer_id == 0: + encoder_attn_k_proj_w = weight["encoder_attn_k_proj"] + encoder_attn_k_proj_b = bias["encoder_attn_k_proj"] + encoder_attn_v_proj_w = weight["encoder_attn_v_proj"] + encoder_attn_v_proj_b = bias["encoder_attn_v_proj"] + layer.encoder_attn.k_proj.weight.data.copy_(encoder_attn_k_proj_w[layer_id]) + layer.encoder_attn.k_proj.bias.data.copy_(encoder_attn_k_proj_b[layer_id]) + layer.encoder_attn.v_proj.weight.data.copy_(encoder_attn_v_proj_w[layer_id]) + layer.encoder_attn.v_proj.bias.data.copy_(encoder_attn_v_proj_b[layer_id]) + + + + def hf_state_dict(self): + def unwrap_model(model): + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + + model_to_save = unwrap_model(self) + ls_encoder_layer = model_to_save.model.encoder.layers + ls_decoder_layer = model_to_save.model.decoder.layers + model_to_save.model.encoder.layers = nn.ModuleList([BartEncoderLayer(self.config) for _ in range(self.config.encoder_layers)]) + model_to_save.model.decoder.layers = 
nn.ModuleList([BartDecoderLayer(self.config) for _ in range(self.config.decoder_layers)]) + + self.inject_origin_layer(model_to_save.model.encoder.layers, ls_encoder_layer) + self.inject_origin_layer(model_to_save.model.decoder.layers, ls_decoder_layer, is_decoder=True) + state_dict = model_to_save.state_dict() + model_to_save.model.encoder.layers = ls_encoder_layer + model_to_save.model.decoder.layers = ls_decoder_layer + return state_dict + + + def save_pretrained(self, *args, **kwargs): + kwargs["state_dict"] = self.hf_state_dict() + super().save_pretrained(*args, **kwargs) diff --git a/examples/training/huggingface/bart/summarization/run_summarization.py b/examples/training/huggingface/bart/summarization/run_summarization.py new file mode 100644 index 00000000..183959c7 --- /dev/null +++ b/examples/training/huggingface/bart/summarization/run_summarization.py @@ -0,0 +1,743 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for sequence to sequence. +""" +# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import nltk # Here to have a nice missing dependency error message early on +import numpy as np +from datasets import load_dataset, load_metric + +import transformers +from filelock import FileLock +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + HfArgumentParser, + MBart50Tokenizer, + MBart50TokenizerFast, + MBartTokenizer, + MBartTokenizerFast, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + set_seed, + BartForConditionalGeneration, + BartConfig, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, is_offline_mode +from transformers.utils.versions import require_version +from ls_bart_model import inject_lightseq_layer, LSBartForConditionalGeneration + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.20.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") + +logger = logging.getLogger(__name__) + +try: + nltk.data.find("tokenizers/punkt") +except (LookupError, OSError): + if is_offline_mode(): + raise LookupError( + "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files" + ) + with FileLock(".lock") as lock: + nltk.download("punkt", quiet=True) + +# A list of all multilingual tokenizer which require lang attribute. +MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast] + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": ( + "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + ) + }, + ) + resize_position_embeddings: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "Whether to automatically resize the position embeddings if `max_source_length` exceeds " + "the model's position embeddings." + ) + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + lang: str = field(default=None, metadata={"help": "Language id for summarization."}) + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + text_column: Optional[str] = field( + default=None, + metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."}, + ) + summary_column: Optional[str] = field( + default=None, + metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."}, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": ( + "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)." + ) + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=128, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + ) + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": ( + "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
+ ) + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": ( + "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + ) + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": ( + "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + ) + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + ) + }, + ) + num_beams: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " + "which is used during ``evaluate`` and ``predict``." + ) + }, + ) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." + }, + ) + source_prefix: Optional[str] = field( + default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + ) + + forced_bos_token: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The token to force as the first generated token after the decoder_start_token_id." + "Useful for multilingual models like mBART where the first generated token" + "needs to be the target language token (Usually it is the target language token)" + ) + }, + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 
+ if self.val_max_target_length is None: + self.val_max_target_length = self.max_target_length + + +summarization_name_mapping = { + "amazon_reviews_multi": ("review_body", "review_title"), + "big_patent": ("description", "abstract"), + "cnn_dailymail": ("article", "highlights"), + "orange_sum": ("text", "summary"), + "pn_summary": ("article", "summary"), + "psc": ("extract_text", "summary_text"), + "samsum": ("dialogue", "summary"), + "thaisum": ("body", "summary"), + "xglue": ("news_body", "news_title"), + "xsum": ("document", "summary"), + "wiki_summary": ("article", "highlights"), +} + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + if data_args.source_prefix is None and model_args.model_name_or_path in [ + "t5-small", + "t5-base", + "t5-large", + "t5-3b", + "t5-11b", + ]: + logger.warning( + "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " + "`--source_prefix 'summarize: ' `" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). 
+ # + # For CSV/JSON files this script will use the first column for the full texts and the second column for the + # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = LSBartForConditionalGeneration.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + training_args=training_args, + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # model = BartForConditionalGeneration.from_pretrained( + # model_args.model_name_or_path, + # from_tf=bool(".ckpt" in model_args.model_name_or_path), + # config=config, + # cache_dir=model_args.cache_dir, + # revision=model_args.model_revision, + # use_auth_token=True if model_args.use_auth_token else None, + # ) + # inject_lightseq_layer(model=model, training_args=training_args, config=config) + # inject_lightseq_layer(model, training_args, config) + + model.resize_token_embeddings(len(tokenizer)) + + if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): + if isinstance(tokenizer, MBartTokenizer): + model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] + else: + model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.lang) + + if model.config.decoder_start_token_id is None: + raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + + if ( + 
hasattr(model.config, "max_position_embeddings") + and model.config.max_position_embeddings < data_args.max_source_length + ): + if model_args.resize_position_embeddings is None: + logger.warning( + "Increasing the model's number of position embedding vectors from" + f" {model.config.max_position_embeddings} to {data_args.max_source_length}." + ) + model.resize_position_embeddings(data_args.max_source_length) + elif model_args.resize_position_embeddings: + model.resize_position_embeddings(data_args.max_source_length) + else: + raise ValueError( + f"`--max_source_length` is set to {data_args.max_source_length}, but the model only has" + f" {model.config.max_position_embeddings} position encodings. Consider either reducing" + f" `--max_source_length` to {model.config.max_position_embeddings} or to automatically resize the" + " model's position encodings by passing `--resize_position_embeddings`." + ) + + prefix = data_args.source_prefix if data_args.source_prefix is not None else "" + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names + elif training_args.do_predict: + column_names = raw_datasets["test"].column_names + else: + logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + return + + if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): + assert ( + data_args.lang is not None + ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + + tokenizer.src_lang = data_args.lang + tokenizer.tgt_lang = data_args.lang + + # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token + # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. + forced_bos_token_id = ( + tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None + ) + model.config.forced_bos_token_id = forced_bos_token_id + + # Get the column names for input/target. + dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) + if data_args.text_column is None: + text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + text_column = data_args.text_column + if text_column not in column_names: + raise ValueError( + f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" + ) + if data_args.summary_column is None: + summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + summary_column = data_args.summary_column + if summary_column not in column_names: + raise ValueError( + f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Temporarily set max_target_length for training. + max_target_length = data_args.max_target_length + padding = "max_length" if data_args.pad_to_max_length else False + + if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): + logger.warning( + "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" + f"`{model.__class__.__name__}`. 
This will lead to loss being calculated twice and will take up more memory" + ) + + def preprocess_function(examples): + # remove pairs where at least one record is None + + inputs, targets = [], [] + for i in range(len(examples[text_column])): + if examples[text_column][i] is not None and examples[summary_column][i] is not None: + inputs.append(examples[text_column][i]) + targets.append(examples[summary_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and data_args.ignore_pad_token_for_loss: + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) + + if training_args.do_eval: + max_target_length = data_args.val_max_target_length + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) + + if training_args.do_predict: + max_target_length = data_args.val_max_target_length + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = raw_datasets["test"] + if data_args.max_predict_samples is not None: + max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) + predict_dataset = predict_dataset.select(range(max_predict_samples)) + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_dataset.map( + preprocess_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) + + # Data collator + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, 
+ label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + # Metric + metric = load_metric("rouge") + + def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] + + return preds, labels + + def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + if data_args.ignore_pad_token_for_loss: + # Replace -100 in the labels as we can't decode them. + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + # Extract a few results from ROUGE + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + result["gen_len"] = np.mean(prediction_lens) + result = {k: round(v, 4) for k, v in result.items()} + return result + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + results = {} + max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else data_args.val_max_target_length + ) + num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.do_predict: + logger.info("*** Predict ***") + + predict_results = trainer.predict( + predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams + ) + metrics = predict_results.metrics + max_predict_samples = ( + data_args.max_predict_samples if 
data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + if trainer.is_world_process_zero(): + if training_args.predict_with_generate: + predictions = tokenizer.batch_decode( + predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + predictions = [pred.strip() for pred in predictions] + output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") + with open(output_prediction_file, "w") as writer: + writer.write("\n".join(predictions)) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if data_args.lang is not None: + kwargs["language"] = data_args.lang + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + return results + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/training/huggingface/bart/summarization/sum.sh b/examples/training/huggingface/bart/summarization/sum.sh new file mode 100644 index 00000000..dc41f74c --- /dev/null +++ b/examples/training/huggingface/bart/summarization/sum.sh @@ -0,0 +1,38 @@ +# Copyright 2021 The LightSeq Team +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +THIS_DIR=$(dirname $(readlink -f $0)) + +export TASK_NAME=summarization + + # --model_name_or_path facebook/bart-base \ +python3 -m torch.distributed.launch \ + --nproc_per_node=1 \ + $THIS_DIR/run_summarization.py \ + --model_name_or_path facebook/bart-base \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --max_source_length 128 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 32 \ + --overwrite_output_dir \ + --seed 1234 \ + --logging_steps 10 \ + --save_steps 100 \ + --fp16 True \ + --predict_with_generate From 75cfe733e00eeec94f4c53cfcb89c380e4d36350 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 12:34:30 +0800 Subject: [PATCH 05/16] update bart,bert example and fix bugs --- .../bart/summarization/ls_bart_model.py | 315 ++++++++++-------- .../bart/summarization/run_summarization.py | 4 +- .../huggingface/bart/summarization/sum.sh | 38 --- .../bert/ls_hf_transformer_layer.py | 105 +++++- .../huggingface/bert/task_glue/run_glue.py | 73 +--- .../ls_fs_transformer_decoder_layer.py | 2 +- lightseq/training/ops/pytorch/transformer.py | 2 +- .../ops/pytorch/transformer_decoder_layer.py | 3 +- .../ops/pytorch/transformer_encoder_layer.py | 2 +- 9 files changed, 291 insertions(+), 253 deletions(-) delete mode 100644 examples/training/huggingface/bart/summarization/sum.sh diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index 2b987957..70381c5c 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -7,9 +7,13 @@ from lightseq.training.ops.pytorch.transformer_decoder_layer import ( LSTransformerDecoderLayer as TransformerDecoderLayer, ) -from transformers import BartForConditionalGeneration +from transformers import ( + BartForConditionalGeneration, + BartPretrainedModel, +) from transformers.models.bart.modeling_bart import BartEncoderLayer, BartDecoderLayer + def get_weight_and_bias(m): weight = m.weight.detach().clone() bias = m.bias.detach().clone() @@ -32,117 +36,6 @@ def get_hf_bart_dec_enc_atten_kv(layers, params_list, nlayer): return enc_attn_kvw, enc_attn_kvb -class LSHFTransformerEncoderLayer(TransformerEncoderLayer): - def __init__(self, *args, **kwargs): - self.params_list = None - super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs) - - def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs): - ls_encoder_padding_mask = encoder_padding_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) - output = super().forward(hidden_states, ls_encoder_padding_mask) - return (output, None, None, None) - - @staticmethod - def get_params_list(**kwargs): - """Configuration of model hyperparameters for encoder and decoder""" - @dataclass - class ParamsList: - self_attn_q_proj: None - self_attn_k_proj: None - self_attn_v_proj: None - self_attn_out_proj: None - self_attn_layer_norm: None - fc1: None - fc2: None - final_layer_norm: None - - params_list = ParamsList(**kwargs) - # check_config(config) - return params_list - - @classmethod - def build_model(cls, config, params_list, layer_list, layer_id): - layer = layer_list[layer_id] - modules_list = [] - ## only python >= 3.6 (orderedDict) - for module_name in params_list.__dict__.values(): - print(module_name) - exec(f"modules_list.append(layer.{module_name})") - init_ws = [] - init_bs = 
[] - for module in modules_list: - w, b = get_weight_and_bias(module) - init_ws.append(w) - init_bs.append(b) - return cls(config, init_ws, init_bs) - - -class LSHFTransformerDecoderLayer(TransformerDecoderLayer): - def __init__(self, *args, **kwargs): - super(LSHFTransformerDecoderLayer, self).__init__(*args, **kwargs) - - def forward(self, hidden_states, - attention_mask = None, - encoder_hidden_states = None, - encoder_attention_mask = None, - past_key_value=None, - use_cache=False, - *args, **kwargs): - encoder_hidden_states = encoder_hidden_states.transpose(0, 1).contiguous() - ls_encoder_padding_mask = encoder_attention_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) - cache = None - if use_cache: - import pdb; pdb.set_trace() - cache = {} if past_key_value is None else { - "dec_self_k": past_key_value[0], - "dec_self_v": past_key_value[1] - } - output = super().forward(hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache) - return output - - @staticmethod - def get_params_list(**kwargs): - """Configuration of model hyperparameters for encoder and decoder""" - @dataclass - class ParamsList: - self_attn_q_proj: None - self_attn_k_proj: None - self_attn_v_proj: None - self_attn_out_proj: None - self_attn_layer_norm: None - encoder_attn_q_proj: None - encoder_attn_out_proj: None - encoder_attn_layer_norm: None - fc1: None - fc2: None - final_layer_norm: None - encoder_attn_k_proj: None - encoder_attn_v_proj: None - - params_list = ParamsList(**kwargs) - # check_config(config) - return params_list - - @classmethod - def build_model(cls, config, params_list, layer_list, layer_id): - layer = layer_list[layer_id] - modules_list = [] - for param_name in list(params_list.__dict__.values())[:-2]: - exec(f"modules_list.append(layer.{param_name})") - - init_ws = [] - init_bs = [] - for module in modules_list: - w, b = get_weight_and_bias(module) - init_ws.append(w) - init_bs.append(b) - if layer_id == 0: - enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv(layer_list, params_list, config.nlayer) - init_ws.append(enc_kvw) - init_bs.append(enc_kvb) - return cls(config, init_ws, init_bs) - - def get_enc_layer_config(training_args, config): enc_config = LSHFTransformerEncoderLayer.get_config( max_seq_len=config.max_position_embeddings, @@ -221,24 +114,25 @@ def inject_lightseq_layer(model, training_args, config): ).cuda() - # model.apply(disable_quant) - - - -class LSBartForConditionalGeneration(BartForConditionalGeneration): - @classmethod - def from_pretrained(self, *args, training_args, **kwargs): - self.config = kwargs["config"] - model = super().from_pretrained(*args, **kwargs) - inject_lightseq_layer(model, training_args, self.config) - return model +def hf_state_dict(model): + """ + Args: + model: huggingface model replaced with lightseq layer + Returns: + Dict: The huggingface state dict + """ + def unwrap_model(model): + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model - def inject_origin_layer(self, hf_layer, ls_layer, is_decoder=False): + def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): if not is_decoder: - for layer_id in range(self.config.encoder_layers): + for layer_id in range(config.encoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) 
layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) @@ -260,10 +154,9 @@ def inject_origin_layer(self, hf_layer, ls_layer, is_decoder=False): encoder_attn_k_proj_b = None encoder_attn_v_proj_w = None encoder_attn_v_proj_b = None - for layer_id in range(self.config.decoder_layers): + for layer_id in range(config.decoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) @@ -297,30 +190,156 @@ def inject_origin_layer(self, hf_layer, ls_layer, is_decoder=False): layer.encoder_attn.v_proj.weight.data.copy_(encoder_attn_v_proj_w[layer_id]) layer.encoder_attn.v_proj.bias.data.copy_(encoder_attn_v_proj_b[layer_id]) + model_to_save = unwrap_model(model) + if not isinstance(model_to_save, LSBartPretrainedModel): + raise ValueError( + "Must be ligtseq replaced model" + ) + # reload original modules + ls_encoder_layer = model_to_save.model.encoder.layers + ls_decoder_layer = model_to_save.model.decoder.layers + model_to_save.model.encoder.layers = nn.ModuleList([BartEncoderLayer(model.config) for _ in range(model.config.encoder_layers)]) + model_to_save.model.decoder.layers = nn.ModuleList([BartDecoderLayer(model.config) for _ in range(model.config.decoder_layers)]) + + inject_hf_layer(model.config, model_to_save.model.encoder.layers, ls_encoder_layer) + inject_hf_layer(model.config, model_to_save.model.decoder.layers, ls_decoder_layer, is_decoder=True) + state_dict = model_to_save.state_dict() + # replace with lightseq modules + model_to_save.model.encoder.layers = ls_encoder_layer + model_to_save.model.decoder.layers = ls_decoder_layer + return state_dict + + +class LSHFTransformerEncoderLayer(TransformerEncoderLayer): + def __init__(self, *args, **kwargs): + self.params_list = None + super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs) + + def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs): + ls_encoder_padding_mask = encoder_padding_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) + output = super().forward(hidden_states, ls_encoder_padding_mask) + return (output, None, None, None) + + @staticmethod + def get_params_list(**kwargs): + """Configuration of model hyperparameters for encoder and decoder""" + @dataclass + class ParamsList: + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None + fc1: None + fc2: None + final_layer_norm: None + + params_list = ParamsList(**kwargs) + # check_config(config) + return params_list + + @classmethod + def build_model(cls, config, params_list, layer_list, layer_id): + layer = layer_list[layer_id] + modules_list = [] + ## only python >= 3.6 (orderedDict) + for module_name in params_list.__dict__.values(): + exec(f"modules_list.append(layer.{module_name})") + init_ws = [] + init_bs = [] + for module in modules_list: + w, b = get_weight_and_bias(module) + init_ws.append(w) + init_bs.append(b) + return cls(config, init_ws, init_bs) + + +class LSHFTransformerDecoderLayer(TransformerDecoderLayer): + def __init__(self, *args, **kwargs): + super(LSHFTransformerDecoderLayer, self).__init__(*args, **kwargs) + + def forward(self, hidden_states, + attention_mask = None, + encoder_hidden_states = None, + encoder_attention_mask = None, + past_key_value=None, + use_cache=False, + *args, **kwargs): + 
encoder_hidden_states = encoder_hidden_states.transpose(0, 1).contiguous() + ls_encoder_padding_mask = encoder_attention_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) + cache = None + if use_cache: + import pdb; pdb.set_trace() + cache = {} if past_key_value is None else { + "dec_self_k": past_key_value[0], + "dec_self_v": past_key_value[1] + } + output = super().forward(hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache) + return output + + @staticmethod + def get_params_list(**kwargs): + """Configuration of model hyperparameters for encoder and decoder""" + @dataclass + class ParamsList: + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None + encoder_attn_q_proj: None + encoder_attn_out_proj: None + encoder_attn_layer_norm: None + fc1: None + fc2: None + final_layer_norm: None + encoder_attn_k_proj: None + encoder_attn_v_proj: None + params_list = ParamsList(**kwargs) + # check_config(config) + return params_list + + @classmethod + def build_model(cls, config, params_list, layer_list, layer_id): + layer = layer_list[layer_id] + modules_list = [] + for param_name in list(params_list.__dict__.values())[:-2]: + exec(f"modules_list.append(layer.{param_name})") - def hf_state_dict(self): - def unwrap_model(model): - # since there could be multiple levels of wrapping, unwrap recursively - if hasattr(model, "module"): - return unwrap_model(model.module) - else: - return model + init_ws = [] + init_bs = [] + for module in modules_list: + w, b = get_weight_and_bias(module) + init_ws.append(w) + init_bs.append(b) + if layer_id == 0: + enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv(layer_list, params_list, config.nlayer) + init_ws.append(enc_kvw) + init_bs.append(enc_kvb) + return cls(config, init_ws, init_bs) - model_to_save = unwrap_model(self) - ls_encoder_layer = model_to_save.model.encoder.layers - ls_decoder_layer = model_to_save.model.decoder.layers - model_to_save.model.encoder.layers = nn.ModuleList([BartEncoderLayer(self.config) for _ in range(self.config.encoder_layers)]) - model_to_save.model.decoder.layers = nn.ModuleList([BartDecoderLayer(self.config) for _ in range(self.config.decoder_layers)]) - - self.inject_origin_layer(model_to_save.model.encoder.layers, ls_encoder_layer) - self.inject_origin_layer(model_to_save.model.decoder.layers, ls_decoder_layer, is_decoder=True) - state_dict = model_to_save.state_dict() - model_to_save.model.encoder.layers = ls_encoder_layer - model_to_save.model.decoder.layers = ls_decoder_layer - return state_dict +class LSBartPretrainedModel(BartPretrainedModel): + @classmethod + def from_pretrained(self, *args, training_args, **kwargs): + self.config = kwargs["config"] + model = super().from_pretrained(*args, **kwargs) + inject_lightseq_layer(model, training_args, self.config) + return model def save_pretrained(self, *args, **kwargs): - kwargs["state_dict"] = self.hf_state_dict() + kwargs["state_dict"] = hf_state_dict(self) super().save_pretrained(*args, **kwargs) + + +class LSBartForConditionalGeneration(LSBartPretrainedModel, BartForConditionalGeneration): + """from BartForConditionalGeneration""" + + +class LSBartForSequenceClassification(LSBartPretrainedModel, BartForSequenceClassification): + """from BartForSequenceClassification""" + + +class LSBartForQuestionAnswering(LSBartPretrainedModel, BartForQuestionAnswering): + """from BartForQuestionAnswering""" diff --git 
a/examples/training/huggingface/bart/summarization/run_summarization.py b/examples/training/huggingface/bart/summarization/run_summarization.py index 183959c7..a0e7c4b0 100644 --- a/examples/training/huggingface/bart/summarization/run_summarization.py +++ b/examples/training/huggingface/bart/summarization/run_summarization.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # coding=utf-8 +# Copyright 2022 The LightSeq Team # Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -409,6 +410,7 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + # Replace with lightseq encoder layers and save the huggingface model model = LSBartForConditionalGeneration.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), @@ -418,6 +420,7 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + # # Replace with lightseq encoder layers and save the lightseq model # model = BartForConditionalGeneration.from_pretrained( # model_args.model_name_or_path, # from_tf=bool(".ckpt" in model_args.model_name_or_path), @@ -426,7 +429,6 @@ def main(): # revision=model_args.model_revision, # use_auth_token=True if model_args.use_auth_token else None, # ) - # inject_lightseq_layer(model=model, training_args=training_args, config=config) # inject_lightseq_layer(model, training_args, config) model.resize_token_embeddings(len(tokenizer)) diff --git a/examples/training/huggingface/bart/summarization/sum.sh b/examples/training/huggingface/bart/summarization/sum.sh deleted file mode 100644 index dc41f74c..00000000 --- a/examples/training/huggingface/bart/summarization/sum.sh +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2021 The LightSeq Team -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
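The run_summarization.py hunk above switches the script to the new LSBartForConditionalGeneration wrapper. A minimal usage sketch, not taken from the patch: it assumes a CUDA device with the LightSeq extension built, and the paths and argument values are illustrative.

from transformers import AutoConfig, TrainingArguments
from ls_bart_model import LSBartForConditionalGeneration

training_args = TrainingArguments(output_dir="/tmp/ls-bart-sum", fp16=True)
config = AutoConfig.from_pretrained("facebook/bart-base")

# from_pretrained() loads the HuggingFace checkpoint first, then
# inject_lightseq_layer() swaps every encoder/decoder layer for the LightSeq
# CUDA implementation while reusing the pretrained weights.
model = LSBartForConditionalGeneration.from_pretrained(
    "facebook/bart-base",
    training_args=training_args,  # supplies fp16 / local_rank for the LightSeq layer configs
    config=config,
)

# save_pretrained() routes through hf_state_dict(), which rebuilds stock
# BartEncoderLayer / BartDecoderLayer modules and copies the LightSeq weights
# back, so the saved checkpoint should stay loadable by a plain
# BartForConditionalGeneration.
model.save_pretrained(training_args.output_dir)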
- -THIS_DIR=$(dirname $(readlink -f $0)) - -export TASK_NAME=summarization - - # --model_name_or_path facebook/bart-base \ -python3 -m torch.distributed.launch \ - --nproc_per_node=1 \ - $THIS_DIR/run_summarization.py \ - --model_name_or_path facebook/bart-base \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --max_source_length 128 \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 32 \ - --overwrite_output_dir \ - --seed 1234 \ - --logging_steps 10 \ - --save_steps 100 \ - --fp16 True \ - --predict_with_generate diff --git a/examples/training/huggingface/bert/ls_hf_transformer_layer.py b/examples/training/huggingface/bert/ls_hf_transformer_layer.py index 6ad9b8d8..3d572c21 100644 --- a/examples/training/huggingface/bert/ls_hf_transformer_layer.py +++ b/examples/training/huggingface/bert/ls_hf_transformer_layer.py @@ -1,6 +1,17 @@ +import torch.nn as nn from lightseq.training.ops.pytorch.quantization import qat_mode, disable_quant from lightseq.training.ops.pytorch.torch_transformer_layers import BertEmbeddingLayer - +from transformers import ( + BertForSequenceClassification, + BertPreTrainedModel, + BertLayer, + BertLMHeadModel, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForMultipleChoice, + BertForTokenClassification, + BertForQuestionAnswering, +) def get_hf_bert_enc_layer_params(layer): init_ws = [] @@ -114,3 +125,95 @@ def gen_bert_enc_config(training_args, config): model.bert.encoder.layer[i].apply(qat_mode) else: model.bert.encoder.layer[i].apply(disable_quant) + + +def hf_state_dict(model): + """ + Args: + model: huggingface model replaced with lightseq layer + Returns: + Dict: The huggingface state dict + """ + def unwrap_model(model): + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + + def inject_hf_layer(config, hf_layer, ls_layer): + for layer_id in range(config.num_hidden_layers): + weight, bias = ls_layer[layer_id].params_dict() + layer = hf_layer[layer_id] + layer.attention.self.query.weight.data.copy_(weight["self_attn_q_proj"]) + layer.attention.self.query.bias.data.copy_(bias["self_attn_q_proj"]) + layer.attention.self.key.weight.data.copy_(weight["self_attn_k_proj"]) + layer.attention.self.key.bias.data.copy_(bias["self_attn_k_proj"]) + layer.attention.self.value.weight.data.copy_(weight["self_attn_v_proj"]) + layer.attention.self.value.bias.data.copy_(bias["self_attn_v_proj"]) + layer.attention.output.dense.weight.data.copy_(weight["self_attn_out_proj"]) + layer.attention.output.dense.bias.data.copy_(bias["self_attn_out_proj"]) + layer.attention.output.LayerNorm.weight.data.copy_(weight["self_attn_layer_norm"]) + layer.attention.output.LayerNorm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.intermediate.dense.weight.data.copy_(weight["fc1"]) + layer.intermediate.dense.bias.data.copy_(bias["fc1"]) + layer.output.dense.weight.data.copy_(weight["fc2"]) + layer.output.dense.bias.data.copy_(bias["fc2"]) + layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) + layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) + + model_to_save = unwrap_model(model) + if not isinstance(model_to_save, LSBertPreTrainedModel): + raise ValueError( + "Must be ligtseq replaced model" + ) + # reload original modules + ls_encoder_layer = model_to_save.bert.encoder.layer + model_to_save.bert.encoder.layer = 
nn.ModuleList([BertLayer(model.config) for _ in range(model.config.num_hidden_layers)]) + inject_hf_layer(model_to_save.config, model_to_save.bert.encoder.layer, ls_encoder_layer) + state_dict = model_to_save.state_dict() + # replace with lightseq modules + model_to_save.bert.encoder.layer = ls_encoder_layer + return state_dict + + +class LSBertPreTrainedModel(BertPreTrainedModel): + @classmethod + def from_pretrained(self, *args, training_args, model_args, **kwargs): + self.config = kwargs["config"] + model = super().from_pretrained(*args, **kwargs) + if model_args.module_type == 1 or model_args.module_type == 2: + inject_ls_layer(model, training_args, model_args, self.config) + return model + + def save_pretrained(self, *args, **kwargs): + kwargs["state_dict"] = hf_state_dict(self) + super().save_pretrained(*args, **kwargs) + + +class LSBertForSequenceClassification(LSBertPreTrainedModel, BertForSequenceClassification): + """from BertForSequenceClassification""" + + +class LSBertLMHeadModel(LSBertPreTrainedModel, BertLMHeadModel): + """from BertLMHeadModel""" + + +class LSBertForMaskedLM(LSBertPreTrainedModel, BertForMaskedLM): + """from BertForMaskedLM""" + + +class LSBertForNextSentencePrediction(LSBertPreTrainedModel, BertForNextSentencePrediction): + """from BertForNextSentencePrediction""" + + +class LSBertForMultipleChoice(LSBertPreTrainedModel, BertForMultipleChoice): + """from BertForMultipleChoice""" + + +class LSBertForTokenClassification(LSBertPreTrainedModel, BertForTokenClassification): + """from BertForTokenClassification""" + + +class LSBertForQuestionAnswering(LSBertPreTrainedModel, BertForQuestionAnswering): + """from BertForQuestionAnswering""" \ No newline at end of file diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py index 9d99ae42..22247859 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -47,7 +47,7 @@ from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version -from ls_hf_transformer_layer import inject_ls_layer +from ls_hf_transformer_layer import inject_ls_layer, LSBertForSequenceClassification import torch.nn as nn @@ -73,58 +73,6 @@ logger = logging.getLogger(__name__) -class LSBertForSequenceClassification(BertForSequenceClassification): - @classmethod - def from_pretrained(self, *args, training_args, model_args, **kwargs): - self.config = kwargs["config"] - model = super().from_pretrained(*args, **kwargs) - if model_args.module_type == 1 or model_args.module_type == 2: - inject_ls_layer(model, training_args, model_args, self.config) - return model - - def inject_origin_layer(self, hf_layer, ls_layer): - for layer_id in range(self.config.num_hidden_layers): - weight, bias = ls_layer[layer_id].params_dict() - layer = hf_layer[layer_id] - - layer.attention.self.query.weight.data.copy_(weight["self_attn_q_proj"]) - layer.attention.self.query.bias.data.copy_(bias["self_attn_q_proj"]) - layer.attention.self.key.weight.data.copy_(weight["self_attn_k_proj"]) - layer.attention.self.key.bias.data.copy_(bias["self_attn_k_proj"]) - layer.attention.self.value.weight.data.copy_(weight["self_attn_v_proj"]) - layer.attention.self.value.bias.data.copy_(bias["self_attn_v_proj"]) - layer.attention.output.dense.weight.data.copy_(weight["self_attn_out_proj"]) - 
layer.attention.output.dense.bias.data.copy_(bias["self_attn_out_proj"]) - layer.attention.output.LayerNorm.weight.data.copy_(weight["self_attn_layer_norm"]) - layer.attention.output.LayerNorm.bias.data.copy_(bias["self_attn_layer_norm"]) - layer.intermediate.dense.weight.data.copy_(weight["fc1"]) - layer.intermediate.dense.bias.data.copy_(bias["fc1"]) - layer.output.dense.weight.data.copy_(weight["fc2"]) - layer.output.dense.bias.data.copy_(bias["fc2"]) - layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) - layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) - - def hf_state_dict(self): - def unwrap_model(model): - # since there could be multiple levels of wrapping, unwrap recursively - if hasattr(model, "module"): - return unwrap_model(model.module) - else: - return model - - model_to_save = unwrap_model(self) - ls_encoder_layer = model_to_save.bert.encoder.layer - model_to_save.bert.encoder.layer = nn.ModuleList([BertLayer(self.config) for _ in range(self.config.num_hidden_layers)]) - self.inject_origin_layer(model_to_save.bert.encoder.layer, ls_encoder_layer) - state_dict = model_to_save.state_dict() - model_to_save.bert.encoder.layer = ls_encoder_layer - return state_dict - - - def save_pretrained(self, *args, **kwargs): - kwargs["state_dict"] = self.hf_state_dict() - super().save_pretrained(*args, **kwargs) - @dataclass class DataTrainingArguments: @@ -461,6 +409,8 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + + # # Replace with lightseq encoder layers and save the lightseq model # model = AutoModelForSequenceClassification.from_pretrained( # model_args.model_name_or_path, # from_tf=bool(".ckpt" in model_args.model_name_or_path), @@ -469,20 +419,21 @@ def main(): # revision=model_args.model_revision, # use_auth_token=True if model_args.use_auth_token else None, # ) - model = BertForSequenceClassification.from_pretrained( - '/tmp/sst2/checkpoint-500', - # training_args=training_args, - # model_args=model_args, + # # Replace with LightSeq encoder layers. + # if model_args.module_type == 1 or model_args.module_type == 2: + # inject_ls_layer(model, training_args, model_args, config) + + # Replace with lightseq encoder layers and save the huggingface model + model = LSBertForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + training_args=training_args, + model_args=model_args, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - # print(type(model)) - # Replace with LightSeq encoder layers. 
- # if model_args.module_type == 1 or model_args.module_type == 2: - # inject_ls_layer(model, training_args, model_args, config) # Preprocessing the datasets if data_args.task_name is not None: diff --git a/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py b/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py index 9b98d725..e3dba1b2 100644 --- a/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py +++ b/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py @@ -49,7 +49,7 @@ def forward( else: cache = self.get_self_attn_cache(incremental_state) self.beam_size = int(x.shape[0] / encoder_padding_mask.shape[0]) - res = super().forward(x, encoder_out, encoder_padding_mask, cache) + res, _ = super().forward(x, encoder_out, encoder_padding_mask, cache) if cache: self.set_self_attn_cache(incremental_state, cache) return res, None, None diff --git a/lightseq/training/ops/pytorch/transformer.py b/lightseq/training/ops/pytorch/transformer.py index 51a29dae..5c4307dd 100644 --- a/lightseq/training/ops/pytorch/transformer.py +++ b/lightseq/training/ops/pytorch/transformer.py @@ -194,7 +194,7 @@ def forward(self, trg_tokens, encoder_out, encoder_padding_mask, cache=None): for i, layer in enumerate(self.layers): layer_cache = cache[i] if cache else None - x = layer( + x, _ = layer( x, encoder_out, encoder_padding_mask, diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index dd43b64e..dd89dbc6 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -126,7 +126,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): print("Lightseq Transformer config is ", self.config.__dict__) - if self.config.local_rank >= 0: + if self.config.local_rank is not None and self.config.local_rank >= 0: torch.cuda.set_device(self.config.local_rank) # create the layer in cuda kernels. 
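The caller-side changes above (res, _ = super().forward(...) in the fairseq wrapper and x, _ = layer(...) in the transformer decoder) follow from the LightSeq decoder layer now returning an (output, past_key_value) tuple. A minimal calling sketch with illustrative shapes and config values, assuming a CUDA device with the LightSeq extension built:

import torch
from lightseq.training.ops.pytorch.transformer_decoder_layer import (
    LSTransformerDecoderLayer,
)

config = LSTransformerDecoderLayer.get_config(
    max_seq_len=256,
    hidden_size=512,
    intermediate_size=2048,
    nhead=8,
    attn_prob_dropout_ratio=0.1,
    activation_dropout_ratio=0.1,
    hidden_dropout_ratio=0.1,
    activation_fn="relu",
    nlayer=6,
    pre_layer_norm=False,
    max_batch_tokens=4096,
    fp16=False,
    local_rank=-1,  # with the guard added above, None is also tolerated
)
layer = LSTransformerDecoderLayer(config).cuda()

batch, src_len, hidden = 2, 16, 512
decoder_states = torch.randn(batch, 1, hidden).cuda()      # one target step per sequence
encoder_out = torch.randn(src_len, batch, hidden).cuda()   # [src_len, batch, hidden]
encoder_padding_mask = torch.zeros(batch, src_len).cuda()  # nonzero marks padded source tokens
cache = {}                                                 # empty dict on the first decoding step

# The layer now returns a tuple; training-time callers that do not need the
# cache simply discard the second element, as in the hunks above.
output, past_key_value = layer(decoder_states, encoder_out, encoder_padding_mask, cache)

# past_key_value carries the decoder self-attention key/value tensors, which a
# HuggingFace-style caller can hand back as the cache on the next step.
cache = {"dec_self_k": past_key_value[0], "dec_self_v": past_key_value[1]}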
@@ -498,5 +498,6 @@ def forward( if cache_list == []: past_key_value = None else: + # cache decoder self attention k v past_key_value = (cache_list[0], cache_list[1]) return output.to(self.para), past_key_value diff --git a/lightseq/training/ops/pytorch/transformer_encoder_layer.py b/lightseq/training/ops/pytorch/transformer_encoder_layer.py index dddcf33b..9dd18585 100644 --- a/lightseq/training/ops/pytorch/transformer_encoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_encoder_layer.py @@ -95,7 +95,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): print("Lightseq Transformer config is ", self.config.__dict__) - if self.config.local_rank >= 0: + if self.config.local_rank is not None and self.config.local_rank >= 0: torch.cuda.set_device(self.config.local_rank) self.create_cpp_layer() From 96996fc88c8f30c6c5e2f97b153af5f2cdf93c68 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 12:35:34 +0800 Subject: [PATCH 06/16] update example --- .../bart/summarization/run_summarization.sh | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/training/huggingface/bart/summarization/run_summarization.sh diff --git a/examples/training/huggingface/bart/summarization/run_summarization.sh b/examples/training/huggingface/bart/summarization/run_summarization.sh new file mode 100644 index 00000000..8ea316f9 --- /dev/null +++ b/examples/training/huggingface/bart/summarization/run_summarization.sh @@ -0,0 +1,36 @@ +# Copyright 2021 The LightSeq Team +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +THIS_DIR=$(dirname $(readlink -f $0)) + +export TASK_NAME=summarization + +python3 -m torch.distributed.launch \ + --nproc_per_node=1 \ + $THIS_DIR/run_summarization.py \ + --model_name_or_path facebook/bart-base \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/$TASK_NAME \ + --max_source_length 128 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 32 \ + --overwrite_output_dir \ + --seed 1234 \ + --logging_steps 10 \ + --fp16 \ + --predict_with_generate --save_step 100 From 6fe960f9090659567cf274536259ebb50e2d97ed Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 12:40:43 +0800 Subject: [PATCH 07/16] fix bugs --- lightseq/training/ops/pytorch/transformer_decoder_layer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index dd89dbc6..f41d4fbc 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -115,7 +115,6 @@ class LSTransformerDecoderLayer(TransformerDecoderLayerBase): """ layer_id = 0 - encdec_kv_sta = None def __init__(self, config, initial_weights=None, initial_biases=None): super(LSTransformerDecoderLayer, self).__init__() From 497536bf4c44efbecdae7ad75c7867b2cfd0e701 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 12:48:32 +0800 Subject: [PATCH 08/16] fix bugs --- .../huggingface/bart/summarization/run_summarization.sh | 2 +- examples/training/huggingface/bert/task_glue/run_glue.py | 1 - examples/training/huggingface/bert/task_glue/run_glue.sh | 2 +- lightseq/training/ops/pytorch/transformer_decoder_layer.py | 3 +-- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/run_summarization.sh b/examples/training/huggingface/bart/summarization/run_summarization.sh index 8ea316f9..1247cb03 100644 --- a/examples/training/huggingface/bart/summarization/run_summarization.sh +++ b/examples/training/huggingface/bart/summarization/run_summarization.sh @@ -33,4 +33,4 @@ python3 -m torch.distributed.launch \ --seed 1234 \ --logging_steps 10 \ --fp16 \ - --predict_with_generate --save_step 100 + --predict_with_generate diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py index 22247859..70167ba1 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -48,7 +48,6 @@ from transformers.utils import check_min_version from transformers.utils.versions import require_version from ls_hf_transformer_layer import inject_ls_layer, LSBertForSequenceClassification -import torch.nn as nn # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
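With the GLUE script now importing LSBertForSequenceClassification from ls_hf_transformer_layer instead of defining it inline, its intended use looks roughly as follows. A hedged sketch, assuming a CUDA device with the LightSeq extension built; model_args is only a stand-in for the script's ModelArguments dataclass, which carries more fields than shown here (e.g. quantization settings).

from types import SimpleNamespace
from transformers import AutoConfig, TrainingArguments
from ls_hf_transformer_layer import LSBertForSequenceClassification

training_args = TrainingArguments(output_dir="/tmp/ls-bert-sst2", fp16=True)
# Stand-in for the script's ModelArguments parsed by HfArgumentParser.
model_args = SimpleNamespace(module_type=1)  # module_type 1 or 2 triggers inject_ls_layer()
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=2)

model = LSBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    training_args=training_args,
    model_args=model_args,
    config=config,
)

# save_pretrained() routes through hf_state_dict(), which temporarily swaps the
# LightSeq encoder layers back to stock BertLayer modules, so the checkpoint on
# disk should remain loadable by a vanilla BertForSequenceClassification.
model.save_pretrained(training_args.output_dir)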
diff --git a/examples/training/huggingface/bert/task_glue/run_glue.sh b/examples/training/huggingface/bert/task_glue/run_glue.sh index 9979979b..7a183813 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.sh +++ b/examples/training/huggingface/bert/task_glue/run_glue.sh @@ -25,12 +25,12 @@ python3 -m torch.distributed.launch \ --do_train \ --do_eval \ --max_seq_length 128 \ + --per_device_train_batch_size 32 \ --learning_rate 2e-5 \ --num_train_epochs 10 \ --output_dir /tmp/$TASK_NAME/ \ --overwrite_output_dir \ --fp16 \ - --per_device_train_batch_size 32 \ --seed 1234 \ --logging_steps 10 \ --module_type 1 \ diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index f41d4fbc..48a9a0d7 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -450,8 +450,7 @@ def forward( ).contiguous() cache["encdec_kv"] = encdec_kv self.encdec_kv = encdec_kv - # cache_list.append(cache["encdec_kv"]) - cache_list.append(self.encdec_kv) + cache_list.append(cache["encdec_kv"]) head_dim = int(self.config.hidden_size / self.config.nhead) shape = (batch_beams, self.config.nhead, step + 1, head_dim) new_k = torch.zeros( From 4203c5eba934c8cd41b452a0ad95361f298a5018 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 13:24:12 +0800 Subject: [PATCH 09/16] format update --- .../bart/summarization/ls_bart_model.py | 280 ++++++++++-------- .../bart/summarization/run_summarization.py | 113 ++++--- .../bert/ls_hf_transformer_layer.py | 44 ++- .../huggingface/bert/task_glue/run_glue.py | 33 ++- .../ops/pytorch/transformer_decoder_layer.py | 22 +- .../ops/pytorch/transformer_encoder_layer.py | 12 +- 6 files changed, 308 insertions(+), 196 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index 70381c5c..1786133d 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -2,10 +2,10 @@ from torch import nn from dataclasses import dataclass from lightseq.training.ops.pytorch.transformer_encoder_layer import ( - LSTransformerEncoderLayer as TransformerEncoderLayer, + LSTransformerEncoderLayer as TransformerEncoderLayer, ) from lightseq.training.ops.pytorch.transformer_decoder_layer import ( - LSTransformerDecoderLayer as TransformerDecoderLayer, + LSTransformerDecoderLayer as TransformerDecoderLayer, ) from transformers import ( BartForConditionalGeneration, @@ -38,77 +38,79 @@ def get_hf_bart_dec_enc_atten_kv(layers, params_list, nlayer): def get_enc_layer_config(training_args, config): enc_config = LSHFTransformerEncoderLayer.get_config( - max_seq_len=config.max_position_embeddings, - hidden_size=config.d_model, - intermediate_size=config.encoder_ffn_dim, - nhead=config.encoder_attention_heads, - attn_prob_dropout_ratio=config.attention_dropout, - activation_dropout_ratio=config.activation_dropout, - hidden_dropout_ratio=config.dropout, - activation_fn=config.activation_function, - max_batch_tokens=4096, - pre_layer_norm=False, - fp16=training_args.fp16, - local_rank=training_args.local_rank, - ) + max_seq_len=config.max_position_embeddings, + hidden_size=config.d_model, + intermediate_size=config.encoder_ffn_dim, + nhead=config.encoder_attention_heads, + attn_prob_dropout_ratio=config.attention_dropout, + 
activation_dropout_ratio=config.activation_dropout, + hidden_dropout_ratio=config.dropout, + activation_fn=config.activation_function, + max_batch_tokens=4096, + pre_layer_norm=False, + fp16=training_args.fp16, + local_rank=training_args.local_rank, + ) enc_params_list = LSHFTransformerEncoderLayer.get_params_list( - self_attn_q_proj="self_attn.q_proj", - self_attn_k_proj="self_attn.k_proj", - self_attn_v_proj="self_attn.v_proj", - self_attn_out_proj="self_attn.out_proj", - self_attn_layer_norm="self_attn_layer_norm", - fc1="fc1", - fc2="fc2", - final_layer_norm="final_layer_norm" - ) + self_attn_q_proj="self_attn.q_proj", + self_attn_k_proj="self_attn.k_proj", + self_attn_v_proj="self_attn.v_proj", + self_attn_out_proj="self_attn.out_proj", + self_attn_layer_norm="self_attn_layer_norm", + fc1="fc1", + fc2="fc2", + final_layer_norm="final_layer_norm" + ) return enc_config, enc_params_list def get_dec_layer_config(training_args, config): dec_config = LSHFTransformerDecoderLayer.get_config( - max_seq_len=config.max_position_embeddings, - hidden_size=config.d_model, - intermediate_size=config.decoder_ffn_dim, - nhead=config.decoder_attention_heads, - attn_prob_dropout_ratio=config.attention_dropout, - activation_dropout_ratio=config.activation_dropout, - hidden_dropout_ratio=config.dropout, - activation_fn=config.activation_function, - nlayer=config.decoder_layers, - pre_layer_norm=False, - max_batch_tokens=4096, - fp16=training_args.fp16, - local_rank=training_args.local_rank, - ) + max_seq_len=config.max_position_embeddings, + hidden_size=config.d_model, + intermediate_size=config.decoder_ffn_dim, + nhead=config.decoder_attention_heads, + attn_prob_dropout_ratio=config.attention_dropout, + activation_dropout_ratio=config.activation_dropout, + hidden_dropout_ratio=config.dropout, + activation_fn=config.activation_function, + nlayer=config.decoder_layers, + pre_layer_norm=False, + max_batch_tokens=4096, + fp16=training_args.fp16, + local_rank=training_args.local_rank, + ) dec_params_list = LSHFTransformerDecoderLayer.get_params_list( - self_attn_q_proj="self_attn.q_proj", - self_attn_k_proj="self_attn.k_proj", - self_attn_v_proj="self_attn.v_proj", - self_attn_out_proj="self_attn.out_proj", - self_attn_layer_norm="self_attn_layer_norm", - encoder_attn_q_proj="encoder_attn.q_proj", - encoder_attn_out_proj="encoder_attn.out_proj", - encoder_attn_layer_norm="encoder_attn_layer_norm", - fc1="fc1", - fc2="fc2", - final_layer_norm="final_layer_norm", - encoder_attn_k_proj="encoder_attn.k_proj", - encoder_attn_v_proj="encoder_attn.v_proj", - ) + self_attn_q_proj="self_attn.q_proj", + self_attn_k_proj="self_attn.k_proj", + self_attn_v_proj="self_attn.v_proj", + self_attn_out_proj="self_attn.out_proj", + self_attn_layer_norm="self_attn_layer_norm", + encoder_attn_q_proj="encoder_attn.q_proj", + encoder_attn_out_proj="encoder_attn.out_proj", + encoder_attn_layer_norm="encoder_attn_layer_norm", + fc1="fc1", + fc2="fc2", + final_layer_norm="final_layer_norm", + encoder_attn_k_proj="encoder_attn.k_proj", + encoder_attn_v_proj="encoder_attn.v_proj", + ) return dec_config, dec_params_list def inject_lightseq_layer(model, training_args, config): - # encoder op replace + # encoder op replace model = model.model for layer_id in range(config.encoder_layers): - enc_config, enc_params_list = get_enc_layer_config(training_args, config) + enc_config, enc_params_list = get_enc_layer_config( + training_args, config) model.encoder.layers[layer_id] = LSHFTransformerEncoderLayer.build_model( enc_config, enc_params_list, 
model.encoder.layers, layer_id ).cuda() # decoder op replace for layer_id in range(config.decoder_layers): - dec_config, dec_params_list = get_dec_layer_config(training_args, config) + dec_config, dec_params_list = get_dec_layer_config( + training_args, config) model.decoder.layers[layer_id] = LSHFTransformerDecoderLayer.build_model( dec_config, dec_params_list, model.decoder.layers, layer_id ).cuda() @@ -133,22 +135,34 @@ def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): for layer_id in range(config.encoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) - layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) - layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) - layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) - layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) - layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) - layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) - layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) - layer.self_attn_layer_norm.weight.data.copy_(weight["self_attn_layer_norm"]) - layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.self_attn.q_proj.weight.data.copy_( + weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_( + bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_( + weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_( + bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_( + weight["self_attn_v_proj"]) + layer.self_attn.v_proj.bias.data.copy_( + bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_( + weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_( + bias["self_attn_out_proj"]) + layer.self_attn_layer_norm.weight.data.copy_( + weight["self_attn_layer_norm"]) + layer.self_attn_layer_norm.bias.data.copy_( + bias["self_attn_layer_norm"]) layer.fc1.weight.data.copy_(weight["fc1"]) layer.fc1.bias.data.copy_(bias["fc1"]) layer.fc2.weight.data.copy_(weight["fc2"]) layer.fc2.bias.data.copy_(bias["fc2"]) - layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) - layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) + layer.final_layer_norm.weight.data.copy_( + weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_( + bias["final_layer_norm"]) else: encoder_attn_k_proj_w = None encoder_attn_k_proj_b = None @@ -157,38 +171,60 @@ def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): for layer_id in range(config.decoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) - layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) - layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) - layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) - layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) - layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) - layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) - layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) - layer.self_attn_layer_norm.weight.data.copy_(weight["self_attn_layer_norm"]) - layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.self_attn.q_proj.weight.data.copy_( + 
weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_( + bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_( + weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_( + bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_( + weight["self_attn_v_proj"]) + layer.self_attn.v_proj.bias.data.copy_( + bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_( + weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_( + bias["self_attn_out_proj"]) + layer.self_attn_layer_norm.weight.data.copy_( + weight["self_attn_layer_norm"]) + layer.self_attn_layer_norm.bias.data.copy_( + bias["self_attn_layer_norm"]) layer.fc1.weight.data.copy_(weight["fc1"]) layer.fc1.bias.data.copy_(bias["fc1"]) layer.fc2.weight.data.copy_(weight["fc2"]) layer.fc2.bias.data.copy_(bias["fc2"]) - layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) - layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) - - layer.encoder_attn.q_proj.weight.data.copy_(weight["encoder_attn_q_proj"]) - layer.encoder_attn.q_proj.bias.data.copy_(bias["encoder_attn_q_proj"]) - layer.encoder_attn.out_proj.weight.data.copy_(weight["encoder_attn_out_proj"]) - layer.encoder_attn.out_proj.bias.data.copy_(bias["encoder_attn_out_proj"]) - layer.encoder_attn_layer_norm.weight.data.copy_(weight["encoder_attn_layer_norm"]) - layer.encoder_attn_layer_norm.bias.data.copy_(bias["encoder_attn_layer_norm"]) + layer.final_layer_norm.weight.data.copy_( + weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_( + bias["final_layer_norm"]) + + layer.encoder_attn.q_proj.weight.data.copy_( + weight["encoder_attn_q_proj"]) + layer.encoder_attn.q_proj.bias.data.copy_( + bias["encoder_attn_q_proj"]) + layer.encoder_attn.out_proj.weight.data.copy_( + weight["encoder_attn_out_proj"]) + layer.encoder_attn.out_proj.bias.data.copy_( + bias["encoder_attn_out_proj"]) + layer.encoder_attn_layer_norm.weight.data.copy_( + weight["encoder_attn_layer_norm"]) + layer.encoder_attn_layer_norm.bias.data.copy_( + bias["encoder_attn_layer_norm"]) if layer_id == 0: encoder_attn_k_proj_w = weight["encoder_attn_k_proj"] encoder_attn_k_proj_b = bias["encoder_attn_k_proj"] encoder_attn_v_proj_w = weight["encoder_attn_v_proj"] encoder_attn_v_proj_b = bias["encoder_attn_v_proj"] - layer.encoder_attn.k_proj.weight.data.copy_(encoder_attn_k_proj_w[layer_id]) - layer.encoder_attn.k_proj.bias.data.copy_(encoder_attn_k_proj_b[layer_id]) - layer.encoder_attn.v_proj.weight.data.copy_(encoder_attn_v_proj_w[layer_id]) - layer.encoder_attn.v_proj.bias.data.copy_(encoder_attn_v_proj_b[layer_id]) + layer.encoder_attn.k_proj.weight.data.copy_( + encoder_attn_k_proj_w[layer_id]) + layer.encoder_attn.k_proj.bias.data.copy_( + encoder_attn_k_proj_b[layer_id]) + layer.encoder_attn.v_proj.weight.data.copy_( + encoder_attn_v_proj_w[layer_id]) + layer.encoder_attn.v_proj.bias.data.copy_( + encoder_attn_v_proj_b[layer_id]) model_to_save = unwrap_model(model) if not isinstance(model_to_save, LSBartPretrainedModel): @@ -198,11 +234,15 @@ def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): # reload original modules ls_encoder_layer = model_to_save.model.encoder.layers ls_decoder_layer = model_to_save.model.decoder.layers - model_to_save.model.encoder.layers = nn.ModuleList([BartEncoderLayer(model.config) for _ in range(model.config.encoder_layers)]) - model_to_save.model.decoder.layers = nn.ModuleList([BartDecoderLayer(model.config) for _ in 
range(model.config.decoder_layers)]) - - inject_hf_layer(model.config, model_to_save.model.encoder.layers, ls_encoder_layer) - inject_hf_layer(model.config, model_to_save.model.decoder.layers, ls_decoder_layer, is_decoder=True) + model_to_save.model.encoder.layers = nn.ModuleList( + [BartEncoderLayer(model.config) for _ in range(model.config.encoder_layers)]) + model_to_save.model.decoder.layers = nn.ModuleList( + [BartDecoderLayer(model.config) for _ in range(model.config.decoder_layers)]) + + inject_hf_layer( + model.config, model_to_save.model.encoder.layers, ls_encoder_layer) + inject_hf_layer(model.config, model_to_save.model.decoder.layers, + ls_decoder_layer, is_decoder=True) state_dict = model_to_save.state_dict() # replace with lightseq modules model_to_save.model.encoder.layers = ls_encoder_layer @@ -216,7 +256,8 @@ def __init__(self, *args, **kwargs): super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs) def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs): - ls_encoder_padding_mask = encoder_padding_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) + ls_encoder_padding_mask = encoder_padding_mask.narrow( + 2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) output = super().forward(hidden_states, ls_encoder_padding_mask) return (output, None, None, None) @@ -225,14 +266,14 @@ def get_params_list(**kwargs): """Configuration of model hyperparameters for encoder and decoder""" @dataclass class ParamsList: - self_attn_q_proj: None - self_attn_k_proj: None - self_attn_v_proj: None - self_attn_out_proj: None - self_attn_layer_norm: None + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None fc1: None fc2: None - final_layer_norm: None + final_layer_norm: None params_list = ParamsList(**kwargs) # check_config(config) @@ -242,7 +283,7 @@ class ParamsList: def build_model(cls, config, params_list, layer_list, layer_id): layer = layer_list[layer_id] modules_list = [] - ## only python >= 3.6 (orderedDict) + # only python >= 3.6 (orderedDict) for module_name in params_list.__dict__.values(): exec(f"modules_list.append(layer.{module_name})") init_ws = [] @@ -259,22 +300,26 @@ def __init__(self, *args, **kwargs): super(LSHFTransformerDecoderLayer, self).__init__(*args, **kwargs) def forward(self, hidden_states, - attention_mask = None, - encoder_hidden_states = None, - encoder_attention_mask = None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, past_key_value=None, use_cache=False, *args, **kwargs): - encoder_hidden_states = encoder_hidden_states.transpose(0, 1).contiguous() - ls_encoder_padding_mask = encoder_attention_mask.narrow(2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) + encoder_hidden_states = encoder_hidden_states.transpose( + 0, 1).contiguous() + ls_encoder_padding_mask = encoder_attention_mask.narrow( + 2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) cache = None if use_cache: - import pdb; pdb.set_trace() + import pdb + pdb.set_trace() cache = {} if past_key_value is None else { "dec_self_k": past_key_value[0], "dec_self_v": past_key_value[1] } - output = super().forward(hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache) + output = super().forward(hidden_states, encoder_hidden_states, + ls_encoder_padding_mask, cache) return output @staticmethod @@ -282,11 +327,11 @@ def get_params_list(**kwargs): """Configuration of model hyperparameters for encoder and decoder""" @dataclass 
class ParamsList: - self_attn_q_proj: None - self_attn_k_proj: None - self_attn_v_proj: None - self_attn_out_proj: None - self_attn_layer_norm: None + self_attn_q_proj: None + self_attn_k_proj: None + self_attn_v_proj: None + self_attn_out_proj: None + self_attn_layer_norm: None encoder_attn_q_proj: None encoder_attn_out_proj: None encoder_attn_layer_norm: None @@ -314,7 +359,8 @@ def build_model(cls, config, params_list, layer_list, layer_id): init_ws.append(w) init_bs.append(b) if layer_id == 0: - enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv(layer_list, params_list, config.nlayer) + enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv( + layer_list, params_list, config.nlayer) init_ws.append(enc_kvw) init_bs.append(enc_kvb) return cls(config, init_ws, init_bs) @@ -340,6 +386,6 @@ class LSBartForConditionalGeneration(LSBartPretrainedModel, BartForConditionalGe class LSBartForSequenceClassification(LSBartPretrainedModel, BartForSequenceClassification): """from BartForSequenceClassification""" - + class LSBartForQuestionAnswering(LSBartPretrainedModel, BartForQuestionAnswering): """from BartForQuestionAnswering""" diff --git a/examples/training/huggingface/bart/summarization/run_summarization.py b/examples/training/huggingface/bart/summarization/run_summarization.py index a0e7c4b0..d477ab14 100644 --- a/examples/training/huggingface/bart/summarization/run_summarization.py +++ b/examples/training/huggingface/bart/summarization/run_summarization.py @@ -56,7 +56,8 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.20.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") +require_version("datasets>=1.8.0", + "To fix: pip install -r examples/pytorch/summarization/requirements.txt") logger = logging.getLogger(__name__) @@ -71,7 +72,8 @@ nltk.download("punkt", quiet=True) # A list of all multilingual tokenizer which require lang attribute. 
-MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast] +MULTILINGUAL_TOKENIZERS = [ + MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast] @dataclass @@ -81,7 +83,8 @@ class ModelArguments: """ model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models"} ) config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} @@ -91,15 +94,18 @@ class ModelArguments: ) cache_dir: Optional[str] = field( default=None, - metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + metadata={ + "help": "Where to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + metadata={ + "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, ) model_revision: str = field( default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) use_auth_token: bool = field( default=False, @@ -127,7 +133,8 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. """ - lang: str = field(default=None, metadata={"help": "Language id for summarization."}) + lang: str = field(default=None, metadata={ + "help": "Language id for summarization."}) dataset_name: Optional[str] = field( default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} @@ -137,11 +144,13 @@ class DataTrainingArguments: ) text_column: Optional[str] = field( default=None, - metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."}, + metadata={ + "help": "The name of the column in the datasets containing the full texts (for summarization)."}, ) summary_column: Optional[str] = field( default=None, - metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."}, + metadata={ + "help": "The name of the column in the datasets containing the summaries (for summarization)."}, ) train_file: Optional[str] = field( default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} @@ -265,14 +274,17 @@ class DataTrainingArguments: def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") + raise ValueError( + "Need either a dataset name or a training/validation file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + assert extension in [ + "csv", "json"], "`train_file` should be a csv or a json file." if self.validation_file is not None: extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + assert extension in [ + "csv", "json"], "`validation_file` should be a csv or a json file." 
if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length @@ -297,11 +309,13 @@ def main(): # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + parser = HfArgumentParser( + (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + model_args, data_args, training_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() @@ -437,10 +451,12 @@ def main(): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] else: - model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.lang) + model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( + data_args.lang) if model.config.decoder_start_token_id is None: - raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + raise ValueError( + "Make sure that `config.decoder_start_token_id` is correctly defined") if ( hasattr(model.config, "max_position_embeddings") @@ -473,7 +489,8 @@ def main(): elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: - logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + logger.info( + "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): @@ -492,7 +509,8 @@ def main(): model.config.forced_bos_token_id = forced_bos_token_id # Get the column names for input/target. - dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) + dataset_columns = summarization_name_mapping.get( + data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: @@ -530,11 +548,13 @@ def preprocess_function(examples): targets.append(examples[summary_column][i]) inputs = [prefix + inp for inp in inputs] - model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) + model_inputs = tokenizer( + inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): - labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + labels = tokenizer( + targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
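The -100 convention referenced in the two comment lines above works because torch's cross-entropy loss skips any target equal to its default ignore_index of -100, so padded label positions never contribute to the summarization loss. A minimal sketch, assuming only torch and made-up tensor shapes (not part of the patch itself):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(1, 3, 8)          # (batch, seq_len, vocab_size)
    labels = torch.tensor([[5, 2, -100]])  # last target position is padding
    # positions labelled -100 are skipped (F.cross_entropy default ignore_index=-100)
    loss = F.cross_entropy(logits.view(-1, 8), labels.view(-1))
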
@@ -551,7 +571,8 @@ def preprocess_function(examples): raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) + max_train_samples = min( + len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( @@ -569,7 +590,8 @@ def preprocess_function(examples): raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + max_eval_samples = min( + len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( @@ -587,8 +609,10 @@ def preprocess_function(examples): raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: - max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select(range(max_predict_samples)) + max_predict_samples = min( + len(predict_dataset), data_args.max_predict_samples) + predict_dataset = predict_dataset.select( + range(max_predict_samples)) with training_args.main_process_first(desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, @@ -600,7 +624,8 @@ def preprocess_function(examples): ) # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + label_pad_token_id = - \ + 100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, @@ -629,16 +654,21 @@ def compute_metrics(eval_preds): if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. 
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_labels = tokenizer.batch_decode( + labels, skip_special_tokens=True) # Some simple post-processing - decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + decoded_preds, decoded_labels = postprocess_text( + decoded_preds, decoded_labels) - result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + result = metric.compute(predictions=decoded_preds, + references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE - result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + result = {key: value.mid.fmeasure * + 100 for key, value in result.items()} - prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + prediction_lens = [np.count_nonzero( + pred != tokenizer.pad_token_id) for pred in preds] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result @@ -666,7 +696,8 @@ def compute_metrics(eval_preds): metrics = train_result.metrics max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + data_args.max_train_samples if data_args.max_train_samples is not None else len( + train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) @@ -684,8 +715,10 @@ def compute_metrics(eval_preds): num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") - metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics = trainer.evaluate( + max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( + eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) @@ -699,9 +732,11 @@ def compute_metrics(eval_preds): ) metrics = predict_results.metrics max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + data_args.max_predict_samples if data_args.max_predict_samples is not None else len( + predict_dataset) ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + metrics["predict_samples"] = min( + max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) @@ -712,11 +747,13 @@ def compute_metrics(eval_preds): predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) predictions = [pred.strip() for pred in predictions] - output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt") + output_prediction_file = os.path.join( + training_args.output_dir, "generated_predictions.txt") with open(output_prediction_file, "w") as writer: writer.write("\n".join(predictions)) - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} + kwargs = {"finetuned_from": model_args.model_name_or_path, + "tasks": "summarization"} if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if 
data_args.dataset_config_name is not None: @@ -742,4 +779,4 @@ def _mp_fn(index): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/training/huggingface/bert/ls_hf_transformer_layer.py b/examples/training/huggingface/bert/ls_hf_transformer_layer.py index 3d572c21..f0015613 100644 --- a/examples/training/huggingface/bert/ls_hf_transformer_layer.py +++ b/examples/training/huggingface/bert/ls_hf_transformer_layer.py @@ -13,6 +13,7 @@ BertForQuestionAnswering, ) + def get_hf_bert_enc_layer_params(layer): init_ws = [] init_bs = [] @@ -116,7 +117,8 @@ def gen_bert_enc_config(training_args, config): for i in range(config.num_hidden_layers): bert_enc_config = gen_bert_enc_config(training_args, config) - init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i]) + init_ws, init_bs = get_hf_bert_enc_layer_params( + model.bert.encoder.layer[i]) model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer( bert_enc_config, init_ws, init_bs ).cuda() @@ -139,27 +141,37 @@ def unwrap_model(model): if hasattr(model, "module"): return unwrap_model(model.module) else: - return model + return model def inject_hf_layer(config, hf_layer, ls_layer): for layer_id in range(config.num_hidden_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.attention.self.query.weight.data.copy_(weight["self_attn_q_proj"]) - layer.attention.self.query.bias.data.copy_(bias["self_attn_q_proj"]) - layer.attention.self.key.weight.data.copy_(weight["self_attn_k_proj"]) + layer.attention.self.query.weight.data.copy_( + weight["self_attn_q_proj"]) + layer.attention.self.query.bias.data.copy_( + bias["self_attn_q_proj"]) + layer.attention.self.key.weight.data.copy_( + weight["self_attn_k_proj"]) layer.attention.self.key.bias.data.copy_(bias["self_attn_k_proj"]) - layer.attention.self.value.weight.data.copy_(weight["self_attn_v_proj"]) - layer.attention.self.value.bias.data.copy_(bias["self_attn_v_proj"]) - layer.attention.output.dense.weight.data.copy_(weight["self_attn_out_proj"]) - layer.attention.output.dense.bias.data.copy_(bias["self_attn_out_proj"]) - layer.attention.output.LayerNorm.weight.data.copy_(weight["self_attn_layer_norm"]) - layer.attention.output.LayerNorm.bias.data.copy_(bias["self_attn_layer_norm"]) + layer.attention.self.value.weight.data.copy_( + weight["self_attn_v_proj"]) + layer.attention.self.value.bias.data.copy_( + bias["self_attn_v_proj"]) + layer.attention.output.dense.weight.data.copy_( + weight["self_attn_out_proj"]) + layer.attention.output.dense.bias.data.copy_( + bias["self_attn_out_proj"]) + layer.attention.output.LayerNorm.weight.data.copy_( + weight["self_attn_layer_norm"]) + layer.attention.output.LayerNorm.bias.data.copy_( + bias["self_attn_layer_norm"]) layer.intermediate.dense.weight.data.copy_(weight["fc1"]) layer.intermediate.dense.bias.data.copy_(bias["fc1"]) layer.output.dense.weight.data.copy_(weight["fc2"]) layer.output.dense.bias.data.copy_(bias["fc2"]) - layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) + layer.output.LayerNorm.weight.data.copy_( + weight["final_layer_norm"]) layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) model_to_save = unwrap_model(model) @@ -169,8 +181,10 @@ def inject_hf_layer(config, hf_layer, ls_layer): ) # reload original modules ls_encoder_layer = model_to_save.bert.encoder.layer - model_to_save.bert.encoder.layer = nn.ModuleList([BertLayer(model.config) for _ in range(model.config.num_hidden_layers)]) - 
inject_hf_layer(model_to_save.config, model_to_save.bert.encoder.layer, ls_encoder_layer) + model_to_save.bert.encoder.layer = nn.ModuleList( + [BertLayer(model.config) for _ in range(model.config.num_hidden_layers)]) + inject_hf_layer(model_to_save.config, + model_to_save.bert.encoder.layer, ls_encoder_layer) state_dict = model_to_save.state_dict() # replace with lightseq modules model_to_save.bert.encoder.layer = ls_encoder_layer @@ -216,4 +230,4 @@ class LSBertForTokenClassification(LSBertPreTrainedModel, BertForTokenClassifica class LSBertForQuestionAnswering(LSBertPreTrainedModel, BertForQuestionAnswering): - """from BertForQuestionAnswering""" \ No newline at end of file + """from BertForQuestionAnswering""" diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py index 70167ba1..6fce8ac6 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -92,7 +92,8 @@ class DataTrainingArguments: ) dataset_name: Optional[str] = field( default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, + metadata={ + "help": "The name of the dataset to use (via the datasets library)."}, ) dataset_config_name: Optional[str] = field( default=None, @@ -463,12 +464,14 @@ def main(): # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( - model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id + model.config.label2id != PretrainedConfig( + num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + label_name_to_id = { + k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) @@ -484,7 +487,8 @@ def main(): if label_to_id is not None: model.config.label2id = label_to_id - model.config.id2label = {id: label for label, id in config.label2id.items()} + model.config.id2label = { + id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( @@ -522,7 +526,8 @@ def preprocess_function(examples): raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) + train_dataset = train_dataset.select( + range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in datasets and "validation_matched" not in datasets: @@ -531,7 +536,8 @@ def preprocess_function(examples): "validation_matched" if data_args.task_name == "mnli" else "validation" ] if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + eval_dataset = eval_dataset.select( + range(data_args.max_eval_samples)) if ( training_args.do_predict @@ -551,7 +557,8 @@ def preprocess_function(examples): # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + logger.info( + f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric 
function if data_args.task_name is not None: @@ -562,12 +569,15 @@ def preprocess_function(examples): # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): - preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) + preds = p.predictions[0] if isinstance( + p.predictions, tuple) else p.predictions + preds = np.squeeze( + preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() + result["combined_score"] = np.mean( + list(result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids) ** 2).mean().item()} @@ -578,7 +588,8 @@ def compute_metrics(p: EvalPrediction): if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + data_collator = DataCollatorWithPadding( + tokenizer, pad_to_multiple_of=8) else: data_collator = None diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 48a9a0d7..23a803b1 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -229,13 +229,16 @@ def copy_and_view(m, shape=None): if shape is None: shape = (-1,) return m.data.clone().view(*shape) + def _copy(m): return copy_and_view(m, (self.hs, self.hs)) self_attn_qkvw = self._get_weights(0) - self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split(self.hs * self.hs, 0) + self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split( + self.hs * self.hs, 0) self_attn_qkvb = self._get_weights(1) - self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split( + self.hs, 0) all_enc_attn_kw, all_enc_attn_vw = None, None all_enc_attn_kb, all_enc_attn_vb = None, None @@ -256,9 +259,9 @@ def _copy(m): "self_attn_v_proj": copy_and_view(self_attn_vw, (self.hs, self.hs)), "self_attn_out_proj": copy_and_view(self._get_weights(2), (self.hs, self.hs)), "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), - "encoder_attn_q_proj":copy_and_view(self._get_weights(6), (self.hs, self.hs)), - "encoder_attn_out_proj":copy_and_view(self._get_weights(8), (self.hs, self.hs)), - "encoder_attn_layer_norm":copy_and_view(self._get_weights(10), (self.hs,)), + "encoder_attn_q_proj": copy_and_view(self._get_weights(6), (self.hs, self.hs)), + "encoder_attn_out_proj": copy_and_view(self._get_weights(8), (self.hs, self.hs)), + "encoder_attn_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)), "fc1": copy_and_view(self._get_weights(12), (self.ims, self.hs)), "fc2": copy_and_view(self._get_weights(14), (self.hs, self.ims)), "final_layer_norm": copy_and_view(self._get_weights(16), (self.hs,)), @@ -271,9 +274,9 @@ def _copy(m): "self_attn_v_proj": copy_and_view(self_attn_vb), "self_attn_out_proj": copy_and_view(self._get_weights(3)), "self_attn_layer_norm": copy_and_view(self._get_weights(5)), - "encoder_attn_q_proj":copy_and_view(self._get_weights(7), (self.hs,)), - 
"encoder_attn_out_proj":copy_and_view(self._get_weights(9), (self.hs,)), - "encoder_attn_layer_norm":copy_and_view(self._get_weights(11), (self.hs,)), + "encoder_attn_q_proj": copy_and_view(self._get_weights(7), (self.hs,)), + "encoder_attn_out_proj": copy_and_view(self._get_weights(9), (self.hs,)), + "encoder_attn_layer_norm": copy_and_view(self._get_weights(11), (self.hs,)), "fc1": copy_and_view(self._get_weights(13)), "fc2": copy_and_view(self._get_weights(15)), "final_layer_norm": copy_and_view(self._get_weights(17)), @@ -414,7 +417,8 @@ def forward( if hasattr(self, "para_16"): self.para_16.copy_(self.para.to(torch.half)) else: - self.register_buffer("para_16", self.para.clone().detach().half()) + self.register_buffer( + "para_16", self.para.clone().detach().half()) if self.config.fp16: decoder_states = decoder_states.to(torch.half) diff --git a/lightseq/training/ops/pytorch/transformer_encoder_layer.py b/lightseq/training/ops/pytorch/transformer_encoder_layer.py index 9dd18585..c2e47803 100644 --- a/lightseq/training/ops/pytorch/transformer_encoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_encoder_layer.py @@ -214,7 +214,6 @@ def init_transformer_weights(self): nn.init.ones_(self._get_weights(10)) nn.init.zeros_(self._get_weights(11)) - def params_dict(self): ''' Returns: @@ -227,9 +226,11 @@ def copy_and_view(m, shape=None): return m.data.clone().view(*shape) self_attn_qkvw = self._get_weights(0) - self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split(self.hs * self.hs, 0) + self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split( + self.hs * self.hs, 0) self_attn_qkvb = self._get_weights(1) - self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split( + self.hs, 0) weight = { "self_attn_q_proj": copy_and_view(self_attn_qw, (self.hs, self.hs)), @@ -239,7 +240,7 @@ def copy_and_view(m, shape=None): "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), "fc1": copy_and_view(self._get_weights(6), (self.ims, self.hs)), "fc2": copy_and_view(self._get_weights(8), (self.hs, self.ims)), - "final_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)) + "final_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)) } bias = { "self_attn_q_proj": copy_and_view(self_attn_qb), @@ -249,11 +250,10 @@ def copy_and_view(m, shape=None): "self_attn_layer_norm": copy_and_view(self._get_weights(5)), "fc1": copy_and_view(self._get_weights(7)), "fc2": copy_and_view(self._get_weights(9)), - "final_layer_norm": copy_and_view(self._get_weights(11)) + "final_layer_norm": copy_and_view(self._get_weights(11)) } return weight, bias - def __assign_layer_weight_grad(self): param = ( self.para_16 From 8af2a180d0215dfa6a210d7f7d65ad72997db905 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 13:34:11 +0800 Subject: [PATCH 10/16] format update --- .../bart/summarization/ls_bart_model.py | 195 +++++++------- .../bart/summarization/run_summarization.py | 252 ++++++++++++------ .../bert/ls_hf_transformer_layer.py | 54 ++-- .../huggingface/bert/task_glue/run_glue.py | 35 +-- .../ops/pytorch/transformer_decoder_layer.py | 30 ++- .../ops/pytorch/transformer_encoder_layer.py | 19 +- 6 files changed, 341 insertions(+), 244 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index 1786133d..97182dd1 100644 --- 
a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -59,7 +59,7 @@ def get_enc_layer_config(training_args, config): self_attn_layer_norm="self_attn_layer_norm", fc1="fc1", fc2="fc2", - final_layer_norm="final_layer_norm" + final_layer_norm="final_layer_norm", ) return enc_config, enc_params_list @@ -102,15 +102,13 @@ def inject_lightseq_layer(model, training_args, config): # encoder op replace model = model.model for layer_id in range(config.encoder_layers): - enc_config, enc_params_list = get_enc_layer_config( - training_args, config) + enc_config, enc_params_list = get_enc_layer_config(training_args, config) model.encoder.layers[layer_id] = LSHFTransformerEncoderLayer.build_model( enc_config, enc_params_list, model.encoder.layers, layer_id ).cuda() # decoder op replace for layer_id in range(config.decoder_layers): - dec_config, dec_params_list = get_dec_layer_config( - training_args, config) + dec_config, dec_params_list = get_dec_layer_config(training_args, config) model.decoder.layers[layer_id] = LSHFTransformerDecoderLayer.build_model( dec_config, dec_params_list, model.decoder.layers, layer_id ).cuda() @@ -123,6 +121,7 @@ def hf_state_dict(model): Returns: Dict: The huggingface state dict """ + def unwrap_model(model): # since there could be multiple levels of wrapping, unwrap recursively if hasattr(model, "module"): @@ -135,34 +134,24 @@ def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): for layer_id in range(config.encoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_( - weight["self_attn_q_proj"]) - layer.self_attn.q_proj.bias.data.copy_( - bias["self_attn_q_proj"]) - layer.self_attn.k_proj.weight.data.copy_( - weight["self_attn_k_proj"]) - layer.self_attn.k_proj.bias.data.copy_( - bias["self_attn_k_proj"]) - layer.self_attn.v_proj.weight.data.copy_( - weight["self_attn_v_proj"]) - layer.self_attn.v_proj.bias.data.copy_( - bias["self_attn_v_proj"]) - layer.self_attn.out_proj.weight.data.copy_( - weight["self_attn_out_proj"]) - layer.self_attn.out_proj.bias.data.copy_( - bias["self_attn_out_proj"]) + layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) + layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) layer.self_attn_layer_norm.weight.data.copy_( - weight["self_attn_layer_norm"]) - layer.self_attn_layer_norm.bias.data.copy_( - bias["self_attn_layer_norm"]) + weight["self_attn_layer_norm"] + ) + layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) layer.fc1.weight.data.copy_(weight["fc1"]) layer.fc1.bias.data.copy_(bias["fc1"]) layer.fc2.weight.data.copy_(weight["fc2"]) layer.fc2.bias.data.copy_(bias["fc2"]) - layer.final_layer_norm.weight.data.copy_( - weight["final_layer_norm"]) - layer.final_layer_norm.bias.data.copy_( - bias["final_layer_norm"]) + layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) else: encoder_attn_k_proj_w = None 
encoder_attn_k_proj_b = None @@ -171,78 +160,79 @@ def inject_hf_layer(config, hf_layer, ls_layer, is_decoder=False): for layer_id in range(config.decoder_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.self_attn.q_proj.weight.data.copy_( - weight["self_attn_q_proj"]) - layer.self_attn.q_proj.bias.data.copy_( - bias["self_attn_q_proj"]) - layer.self_attn.k_proj.weight.data.copy_( - weight["self_attn_k_proj"]) - layer.self_attn.k_proj.bias.data.copy_( - bias["self_attn_k_proj"]) - layer.self_attn.v_proj.weight.data.copy_( - weight["self_attn_v_proj"]) - layer.self_attn.v_proj.bias.data.copy_( - bias["self_attn_v_proj"]) - layer.self_attn.out_proj.weight.data.copy_( - weight["self_attn_out_proj"]) - layer.self_attn.out_proj.bias.data.copy_( - bias["self_attn_out_proj"]) + layer.self_attn.q_proj.weight.data.copy_(weight["self_attn_q_proj"]) + layer.self_attn.q_proj.bias.data.copy_(bias["self_attn_q_proj"]) + layer.self_attn.k_proj.weight.data.copy_(weight["self_attn_k_proj"]) + layer.self_attn.k_proj.bias.data.copy_(bias["self_attn_k_proj"]) + layer.self_attn.v_proj.weight.data.copy_(weight["self_attn_v_proj"]) + layer.self_attn.v_proj.bias.data.copy_(bias["self_attn_v_proj"]) + layer.self_attn.out_proj.weight.data.copy_(weight["self_attn_out_proj"]) + layer.self_attn.out_proj.bias.data.copy_(bias["self_attn_out_proj"]) layer.self_attn_layer_norm.weight.data.copy_( - weight["self_attn_layer_norm"]) - layer.self_attn_layer_norm.bias.data.copy_( - bias["self_attn_layer_norm"]) + weight["self_attn_layer_norm"] + ) + layer.self_attn_layer_norm.bias.data.copy_(bias["self_attn_layer_norm"]) layer.fc1.weight.data.copy_(weight["fc1"]) layer.fc1.bias.data.copy_(bias["fc1"]) layer.fc2.weight.data.copy_(weight["fc2"]) layer.fc2.bias.data.copy_(bias["fc2"]) - layer.final_layer_norm.weight.data.copy_( - weight["final_layer_norm"]) - layer.final_layer_norm.bias.data.copy_( - bias["final_layer_norm"]) + layer.final_layer_norm.weight.data.copy_(weight["final_layer_norm"]) + layer.final_layer_norm.bias.data.copy_(bias["final_layer_norm"]) layer.encoder_attn.q_proj.weight.data.copy_( - weight["encoder_attn_q_proj"]) - layer.encoder_attn.q_proj.bias.data.copy_( - bias["encoder_attn_q_proj"]) + weight["encoder_attn_q_proj"] + ) + layer.encoder_attn.q_proj.bias.data.copy_(bias["encoder_attn_q_proj"]) layer.encoder_attn.out_proj.weight.data.copy_( - weight["encoder_attn_out_proj"]) + weight["encoder_attn_out_proj"] + ) layer.encoder_attn.out_proj.bias.data.copy_( - bias["encoder_attn_out_proj"]) + bias["encoder_attn_out_proj"] + ) layer.encoder_attn_layer_norm.weight.data.copy_( - weight["encoder_attn_layer_norm"]) + weight["encoder_attn_layer_norm"] + ) layer.encoder_attn_layer_norm.bias.data.copy_( - bias["encoder_attn_layer_norm"]) + bias["encoder_attn_layer_norm"] + ) if layer_id == 0: encoder_attn_k_proj_w = weight["encoder_attn_k_proj"] encoder_attn_k_proj_b = bias["encoder_attn_k_proj"] encoder_attn_v_proj_w = weight["encoder_attn_v_proj"] encoder_attn_v_proj_b = bias["encoder_attn_v_proj"] layer.encoder_attn.k_proj.weight.data.copy_( - encoder_attn_k_proj_w[layer_id]) + encoder_attn_k_proj_w[layer_id] + ) layer.encoder_attn.k_proj.bias.data.copy_( - encoder_attn_k_proj_b[layer_id]) + encoder_attn_k_proj_b[layer_id] + ) layer.encoder_attn.v_proj.weight.data.copy_( - encoder_attn_v_proj_w[layer_id]) + encoder_attn_v_proj_w[layer_id] + ) layer.encoder_attn.v_proj.bias.data.copy_( - encoder_attn_v_proj_b[layer_id]) + encoder_attn_v_proj_b[layer_id] + ) model_to_save 
= unwrap_model(model) if not isinstance(model_to_save, LSBartPretrainedModel): - raise ValueError( - "Must be ligtseq replaced model" - ) + raise ValueError("Must be ligtseq replaced model") # reload original modules ls_encoder_layer = model_to_save.model.encoder.layers ls_decoder_layer = model_to_save.model.decoder.layers model_to_save.model.encoder.layers = nn.ModuleList( - [BartEncoderLayer(model.config) for _ in range(model.config.encoder_layers)]) + [BartEncoderLayer(model.config) for _ in range(model.config.encoder_layers)] + ) model_to_save.model.decoder.layers = nn.ModuleList( - [BartDecoderLayer(model.config) for _ in range(model.config.decoder_layers)]) + [BartDecoderLayer(model.config) for _ in range(model.config.decoder_layers)] + ) + inject_hf_layer(model.config, model_to_save.model.encoder.layers, ls_encoder_layer) inject_hf_layer( - model.config, model_to_save.model.encoder.layers, ls_encoder_layer) - inject_hf_layer(model.config, model_to_save.model.decoder.layers, - ls_decoder_layer, is_decoder=True) + model.config, + model_to_save.model.decoder.layers, + ls_decoder_layer, + is_decoder=True, + ) state_dict = model_to_save.state_dict() # replace with lightseq modules model_to_save.model.encoder.layers = ls_encoder_layer @@ -256,14 +246,19 @@ def __init__(self, *args, **kwargs): super(LSHFTransformerEncoderLayer, self).__init__(*args, **kwargs) def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs): - ls_encoder_padding_mask = encoder_padding_mask.narrow( - 2, 0, 1).squeeze().ne(0).type_as(encoder_padding_mask) + ls_encoder_padding_mask = ( + encoder_padding_mask.narrow(2, 0, 1) + .squeeze() + .ne(0) + .type_as(encoder_padding_mask) + ) output = super().forward(hidden_states, ls_encoder_padding_mask) return (output, None, None, None) @staticmethod def get_params_list(**kwargs): """Configuration of model hyperparameters for encoder and decoder""" + @dataclass class ParamsList: self_attn_q_proj: None @@ -299,32 +294,43 @@ class LSHFTransformerDecoderLayer(TransformerDecoderLayer): def __init__(self, *args, **kwargs): super(LSHFTransformerDecoderLayer, self).__init__(*args, **kwargs) - def forward(self, hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - use_cache=False, - *args, **kwargs): - encoder_hidden_states = encoder_hidden_states.transpose( - 0, 1).contiguous() - ls_encoder_padding_mask = encoder_attention_mask.narrow( - 2, 0, 1).squeeze().ne(0).type_as(encoder_attention_mask) + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + use_cache=False, + *args, + **kwargs, + ): + encoder_hidden_states = encoder_hidden_states.transpose(0, 1).contiguous() + ls_encoder_padding_mask = ( + encoder_attention_mask.narrow(2, 0, 1) + .squeeze() + .ne(0) + .type_as(encoder_attention_mask) + ) cache = None if use_cache: import pdb + pdb.set_trace() - cache = {} if past_key_value is None else { - "dec_self_k": past_key_value[0], - "dec_self_v": past_key_value[1] - } - output = super().forward(hidden_states, encoder_hidden_states, - ls_encoder_padding_mask, cache) + cache = ( + {} + if past_key_value is None + else {"dec_self_k": past_key_value[0], "dec_self_v": past_key_value[1]} + ) + output = super().forward( + hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache + ) return output @staticmethod def get_params_list(**kwargs): """Configuration of model hyperparameters for encoder and 
decoder""" + @dataclass class ParamsList: self_attn_q_proj: None @@ -360,7 +366,8 @@ def build_model(cls, config, params_list, layer_list, layer_id): init_bs.append(b) if layer_id == 0: enc_kvw, enc_kvb = get_hf_bart_dec_enc_atten_kv( - layer_list, params_list, config.nlayer) + layer_list, params_list, config.nlayer + ) init_ws.append(enc_kvw) init_bs.append(enc_kvb) return cls(config, init_ws, init_bs) @@ -379,11 +386,15 @@ def save_pretrained(self, *args, **kwargs): super().save_pretrained(*args, **kwargs) -class LSBartForConditionalGeneration(LSBartPretrainedModel, BartForConditionalGeneration): +class LSBartForConditionalGeneration( + LSBartPretrainedModel, BartForConditionalGeneration +): """from BartForConditionalGeneration""" -class LSBartForSequenceClassification(LSBartPretrainedModel, BartForSequenceClassification): +class LSBartForSequenceClassification( + LSBartPretrainedModel, BartForSequenceClassification +): """from BartForSequenceClassification""" diff --git a/examples/training/huggingface/bart/summarization/run_summarization.py b/examples/training/huggingface/bart/summarization/run_summarization.py index d477ab14..3397ac5e 100644 --- a/examples/training/huggingface/bart/summarization/run_summarization.py +++ b/examples/training/huggingface/bart/summarization/run_summarization.py @@ -56,8 +56,10 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.20.0.dev0") -require_version("datasets>=1.8.0", - "To fix: pip install -r examples/pytorch/summarization/requirements.txt") +require_version( + "datasets>=1.8.0", + "To fix: pip install -r examples/pytorch/summarization/requirements.txt", +) logger = logging.getLogger(__name__) @@ -73,7 +75,11 @@ # A list of all multilingual tokenizer which require lang attribute. MULTILINGUAL_TOKENIZERS = [ - MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast] + MBartTokenizer, + MBartTokenizerFast, + MBart50Tokenizer, + MBart50TokenizerFast, +] @dataclass @@ -84,28 +90,38 @@ class ModelArguments: model_name_or_path: str = field( metadata={ - "help": "Path to pretrained model or model identifier from huggingface.co/models"} + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } ) config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, ) tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, ) cache_dir: Optional[str] = field( default=None, metadata={ - "help": "Where to store the pretrained models downloaded from huggingface.co"}, + "help": "Where to store the pretrained models downloaded from huggingface.co" + }, ) use_fast_tokenizer: bool = field( default=True, metadata={ - "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not." + }, ) model_revision: str = field( default="main", metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + "help": "The specific model version to use (can be a branch name, tag name or commit id)." 
+ }, ) use_auth_token: bool = field( default=False, @@ -133,27 +149,33 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. """ - lang: str = field(default=None, metadata={ - "help": "Language id for summarization."}) + lang: str = field(default=None, metadata={"help": "Language id for summarization."}) dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, ) dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, ) text_column: Optional[str] = field( default=None, metadata={ - "help": "The name of the column in the datasets containing the full texts (for summarization)."}, + "help": "The name of the column in the datasets containing the full texts (for summarization)." + }, ) summary_column: Optional[str] = field( default=None, metadata={ - "help": "The name of the column in the datasets containing the summaries (for summarization)."}, + "help": "The name of the column in the datasets containing the summaries (for summarization)." + }, ) train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."} + default=None, + metadata={"help": "The input training data file (a jsonlines or csv file)."}, ) validation_file: Optional[str] = field( default=None, @@ -170,7 +192,8 @@ class DataTrainingArguments: }, ) overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, ) preprocessing_num_workers: Optional[int] = field( default=None, @@ -258,7 +281,10 @@ class DataTrainingArguments: }, ) source_prefix: Optional[str] = field( - default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."} + default="", + metadata={ + "help": "A prefix to add before every source text (useful for T5 models)." + }, ) forced_bos_token: Optional[str] = field( @@ -273,18 +299,27 @@ class DataTrainingArguments: ) def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + ): raise ValueError( - "Need either a dataset name or a training/validation file.") + "Need either a dataset name or a training/validation file." + ) else: if self.train_file is not None: extension = self.train_file.split(".")[-1] assert extension in [ - "csv", "json"], "`train_file` should be a csv or a json file." + "csv", + "json", + ], "`train_file` should be a csv or a json file." if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in [ - "csv", "json"], "`validation_file` should be a csv or a json file." + "csv", + "json", + ], "`validation_file` should be a csv or a json file." if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length @@ -310,12 +345,14 @@ def main(): # We now keep distinct sets of args, for a cleaner separation of concerns. 
parser = HfArgumentParser( - (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments) + ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1])) + json_file=os.path.abspath(sys.argv[1]) + ) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() @@ -353,14 +390,20 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + elif ( + last_checkpoint is not None and training_args.resume_from_checkpoint is None + ): logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -412,13 +455,17 @@ def main(): # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + model_args.tokenizer_name + if model_args.tokenizer_name + else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, @@ -447,16 +494,22 @@ def main(): model.resize_token_embeddings(len(tokenizer)) - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): + if model.config.decoder_start_token_id is None and isinstance( + tokenizer, (MBartTokenizer, MBartTokenizerFast) + ): if isinstance(tokenizer, MBartTokenizer): - model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang] + model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ + data_args.lang + ] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( - data_args.lang) + data_args.lang + ) if model.config.decoder_start_token_id is None: raise ValueError( - "Make sure that `config.decoder_start_token_id` is correctly defined") + "Make sure that `config.decoder_start_token_id` is correctly defined" + ) if ( hasattr(model.config, "max_position_embeddings") @@ -490,7 +543,8 @@ def main(): column_names = raw_datasets["test"].column_names else: logger.info( - "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + "There is nothing to do. 
Please pass `do_train`, `do_eval` and/or `do_predict`." + ) return if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): @@ -504,15 +558,18 @@ def main(): # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. forced_bos_token_id = ( - tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None + tokenizer.lang_code_to_id[data_args.forced_bos_token] + if data_args.forced_bos_token is not None + else None ) model.config.forced_bos_token_id = forced_bos_token_id # Get the column names for input/target. - dataset_columns = summarization_name_mapping.get( - data_args.dataset_name, None) + dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: - text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + text_column = ( + dataset_columns[0] if dataset_columns is not None else column_names[0] + ) else: text_column = data_args.text_column if text_column not in column_names: @@ -520,7 +577,9 @@ def main(): f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: - summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + summary_column = ( + dataset_columns[1] if dataset_columns is not None else column_names[1] + ) else: summary_column = data_args.summary_column if summary_column not in column_names: @@ -532,7 +591,9 @@ def main(): max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False - if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): + if training_args.label_smoothing_factor > 0 and not hasattr( + model, "prepare_decoder_input_ids_from_labels" + ): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" @@ -543,24 +604,33 @@ def preprocess_function(examples): inputs, targets = [], [] for i in range(len(examples[text_column])): - if examples[text_column][i] is not None and examples[summary_column][i] is not None: + if ( + examples[text_column][i] is not None + and examples[summary_column][i] is not None + ): inputs.append(examples[text_column][i]) targets.append(examples[summary_column][i]) inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer( - inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) + inputs, + max_length=data_args.max_source_length, + padding=padding, + truncation=True, + ) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer( - targets, max_length=max_target_length, padding=padding, truncation=True) + targets, max_length=max_target_length, padding=padding, truncation=True + ) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + [(l if l != tokenizer.pad_token_id else -100) for l in label] + for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] @@ -571,8 +641,7 @@ def preprocess_function(examples): raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: - max_train_samples = min( - len(train_dataset), data_args.max_train_samples) + max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( @@ -590,10 +659,11 @@ def preprocess_function(examples): raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: - max_eval_samples = min( - len(eval_dataset), data_args.max_eval_samples) + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) - with training_args.main_process_first(desc="validation dataset map pre-processing"): + with training_args.main_process_first( + desc="validation dataset map pre-processing" + ): eval_dataset = eval_dataset.map( preprocess_function, batched=True, @@ -610,10 +680,12 @@ def preprocess_function(examples): predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min( - len(predict_dataset), data_args.max_predict_samples) - predict_dataset = predict_dataset.select( - range(max_predict_samples)) - with training_args.main_process_first(desc="prediction dataset map pre-processing"): + len(predict_dataset), data_args.max_predict_samples + ) + predict_dataset = predict_dataset.select(range(max_predict_samples)) + with training_args.main_process_first( + desc="prediction dataset map pre-processing" + ): predict_dataset = predict_dataset.map( preprocess_function, batched=True, @@ -624,8 +696,9 @@ def preprocess_function(examples): ) # Data collator - label_pad_token_id = - \ - 100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + label_pad_token_id = ( + -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + ) data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, @@ -654,21 +727,20 @@ def compute_metrics(eval_preds): if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. 
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode( - labels, skip_special_tokens=True) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing - decoded_preds, decoded_labels = postprocess_text( - decoded_preds, decoded_labels) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - result = metric.compute(predictions=decoded_preds, - references=decoded_labels, use_stemmer=True) + result = metric.compute( + predictions=decoded_preds, references=decoded_labels, use_stemmer=True + ) # Extract a few results from ROUGE - result = {key: value.mid.fmeasure * - 100 for key, value in result.items()} + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} - prediction_lens = [np.count_nonzero( - pred != tokenizer.pad_token_id) for pred in preds] + prediction_lens = [ + np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds + ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result @@ -681,7 +753,9 @@ def compute_metrics(eval_preds): eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, + compute_metrics=compute_metrics + if training_args.predict_with_generate + else None, ) # Training @@ -696,8 +770,9 @@ def compute_metrics(eval_preds): metrics = train_result.metrics max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len( - train_dataset) + data_args.max_train_samples + if data_args.max_train_samples is not None + else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) @@ -712,13 +787,21 @@ def compute_metrics(eval_preds): if training_args.generation_max_length is not None else data_args.val_max_target_length ) - num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams + num_beams = ( + data_args.num_beams + if data_args.num_beams is not None + else training_args.generation_num_beams + ) if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( - max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( - eval_dataset) + max_length=max_length, num_beams=num_beams, metric_key_prefix="eval" + ) + max_eval_samples = ( + data_args.max_eval_samples + if data_args.max_eval_samples is not None + else len(eval_dataset) + ) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) @@ -728,15 +811,18 @@ def compute_metrics(eval_preds): logger.info("*** Predict ***") predict_results = trainer.predict( - predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams + predict_dataset, + metric_key_prefix="predict", + max_length=max_length, + num_beams=num_beams, ) metrics = predict_results.metrics max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len( - predict_dataset) + data_args.max_predict_samples + if data_args.max_predict_samples is not None + else len(predict_dataset) ) - metrics["predict_samples"] = min( - max_predict_samples, len(predict_dataset)) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 
trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) @@ -744,21 +830,25 @@ def compute_metrics(eval_preds): if trainer.is_world_process_zero(): if training_args.predict_with_generate: predictions = tokenizer.batch_decode( - predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True + predict_results.predictions, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, ) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join( - training_args.output_dir, "generated_predictions.txt") + training_args.output_dir, "generated_predictions.txt" + ) with open(output_prediction_file, "w") as writer: writer.write("\n".join(predictions)) - kwargs = {"finetuned_from": model_args.model_name_or_path, - "tasks": "summarization"} + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + kwargs[ + "dataset" + ] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name diff --git a/examples/training/huggingface/bert/ls_hf_transformer_layer.py b/examples/training/huggingface/bert/ls_hf_transformer_layer.py index f0015613..807a0c19 100644 --- a/examples/training/huggingface/bert/ls_hf_transformer_layer.py +++ b/examples/training/huggingface/bert/ls_hf_transformer_layer.py @@ -117,8 +117,7 @@ def gen_bert_enc_config(training_args, config): for i in range(config.num_hidden_layers): bert_enc_config = gen_bert_enc_config(training_args, config) - init_ws, init_bs = get_hf_bert_enc_layer_params( - model.bert.encoder.layer[i]) + init_ws, init_bs = get_hf_bert_enc_layer_params(model.bert.encoder.layer[i]) model.bert.encoder.layer[i] = LSHFTransformerEncoderLayer( bert_enc_config, init_ws, init_bs ).cuda() @@ -136,6 +135,7 @@ def hf_state_dict(model): Returns: Dict: The huggingface state dict """ + def unwrap_model(model): # since there could be multiple levels of wrapping, unwrap recursively if hasattr(model, "module"): @@ -147,44 +147,38 @@ def inject_hf_layer(config, hf_layer, ls_layer): for layer_id in range(config.num_hidden_layers): weight, bias = ls_layer[layer_id].params_dict() layer = hf_layer[layer_id] - layer.attention.self.query.weight.data.copy_( - weight["self_attn_q_proj"]) - layer.attention.self.query.bias.data.copy_( - bias["self_attn_q_proj"]) - layer.attention.self.key.weight.data.copy_( - weight["self_attn_k_proj"]) + layer.attention.self.query.weight.data.copy_(weight["self_attn_q_proj"]) + layer.attention.self.query.bias.data.copy_(bias["self_attn_q_proj"]) + layer.attention.self.key.weight.data.copy_(weight["self_attn_k_proj"]) layer.attention.self.key.bias.data.copy_(bias["self_attn_k_proj"]) - layer.attention.self.value.weight.data.copy_( - weight["self_attn_v_proj"]) - layer.attention.self.value.bias.data.copy_( - bias["self_attn_v_proj"]) - layer.attention.output.dense.weight.data.copy_( - weight["self_attn_out_proj"]) - layer.attention.output.dense.bias.data.copy_( - bias["self_attn_out_proj"]) + layer.attention.self.value.weight.data.copy_(weight["self_attn_v_proj"]) + layer.attention.self.value.bias.data.copy_(bias["self_attn_v_proj"]) + layer.attention.output.dense.weight.data.copy_(weight["self_attn_out_proj"]) + 
layer.attention.output.dense.bias.data.copy_(bias["self_attn_out_proj"]) layer.attention.output.LayerNorm.weight.data.copy_( - weight["self_attn_layer_norm"]) + weight["self_attn_layer_norm"] + ) layer.attention.output.LayerNorm.bias.data.copy_( - bias["self_attn_layer_norm"]) + bias["self_attn_layer_norm"] + ) layer.intermediate.dense.weight.data.copy_(weight["fc1"]) layer.intermediate.dense.bias.data.copy_(bias["fc1"]) layer.output.dense.weight.data.copy_(weight["fc2"]) layer.output.dense.bias.data.copy_(bias["fc2"]) - layer.output.LayerNorm.weight.data.copy_( - weight["final_layer_norm"]) + layer.output.LayerNorm.weight.data.copy_(weight["final_layer_norm"]) layer.output.LayerNorm.bias.data.copy_(bias["final_layer_norm"]) model_to_save = unwrap_model(model) if not isinstance(model_to_save, LSBertPreTrainedModel): - raise ValueError( - "Must be ligtseq replaced model" - ) + raise ValueError("Must be ligtseq replaced model") # reload original modules ls_encoder_layer = model_to_save.bert.encoder.layer model_to_save.bert.encoder.layer = nn.ModuleList( - [BertLayer(model.config) for _ in range(model.config.num_hidden_layers)]) - inject_hf_layer(model_to_save.config, - model_to_save.bert.encoder.layer, ls_encoder_layer) + [BertLayer(model.config) for _ in range(model.config.num_hidden_layers)] + ) + inject_hf_layer( + model_to_save.config, model_to_save.bert.encoder.layer, ls_encoder_layer + ) state_dict = model_to_save.state_dict() # replace with lightseq modules model_to_save.bert.encoder.layer = ls_encoder_layer @@ -205,7 +199,9 @@ def save_pretrained(self, *args, **kwargs): super().save_pretrained(*args, **kwargs) -class LSBertForSequenceClassification(LSBertPreTrainedModel, BertForSequenceClassification): +class LSBertForSequenceClassification( + LSBertPreTrainedModel, BertForSequenceClassification +): """from BertForSequenceClassification""" @@ -217,7 +213,9 @@ class LSBertForMaskedLM(LSBertPreTrainedModel, BertForMaskedLM): """from BertForMaskedLM""" -class LSBertForNextSentencePrediction(LSBertPreTrainedModel, BertForNextSentencePrediction): +class LSBertForNextSentencePrediction( + LSBertPreTrainedModel, BertForNextSentencePrediction +): """from BertForNextSentencePrediction""" diff --git a/examples/training/huggingface/bert/task_glue/run_glue.py b/examples/training/huggingface/bert/task_glue/run_glue.py index 6fce8ac6..c0277bc8 100644 --- a/examples/training/huggingface/bert/task_glue/run_glue.py +++ b/examples/training/huggingface/bert/task_glue/run_glue.py @@ -42,7 +42,7 @@ default_data_collator, set_seed, BertForSequenceClassification, - BertLayer + BertLayer, ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version @@ -92,8 +92,7 @@ class DataTrainingArguments: ) dataset_name: Optional[str] = field( default=None, - metadata={ - "help": "The name of the dataset to use (via the datasets library)."}, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, ) dataset_config_name: Optional[str] = field( default=None, @@ -464,14 +463,12 @@ def main(): # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( - model.config.label2id != PretrainedConfig( - num_labels=num_labels).label2id + model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. 
- label_name_to_id = { - k.lower(): v for k, v in model.config.label2id.items()} + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) @@ -487,8 +484,7 @@ def main(): if label_to_id is not None: model.config.label2id = label_to_id - model.config.id2label = { - id: label for label, id in config.label2id.items()} + model.config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( @@ -526,8 +522,7 @@ def preprocess_function(examples): raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: - train_dataset = train_dataset.select( - range(data_args.max_train_samples)) + train_dataset = train_dataset.select(range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in datasets and "validation_matched" not in datasets: @@ -536,8 +531,7 @@ def preprocess_function(examples): "validation_matched" if data_args.task_name == "mnli" else "validation" ] if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select( - range(data_args.max_eval_samples)) + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) if ( training_args.do_predict @@ -557,8 +551,7 @@ def preprocess_function(examples): # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): - logger.info( - f"Sample {index} of the training set: {train_dataset[index]}.") + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: @@ -569,15 +562,12 @@ def preprocess_function(examples): # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. 
def compute_metrics(p: EvalPrediction): - preds = p.predictions[0] if isinstance( - p.predictions, tuple) else p.predictions - preds = np.squeeze( - preds) if is_regression else np.argmax(preds, axis=1) + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: - result["combined_score"] = np.mean( - list(result.values())).item() + result["combined_score"] = np.mean(list(result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids) ** 2).mean().item()} @@ -588,8 +578,7 @@ def compute_metrics(p: EvalPrediction): if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: - data_collator = DataCollatorWithPadding( - tokenizer, pad_to_multiple_of=8) + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 23a803b1..5a9aa007 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -190,7 +190,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): @staticmethod def gen_offset(hidden_size, intermediate_size, nlayer): - """Returns the offset of each module's parameters among all + """Returns the offset of each module's parameters among all parameters of a layer """ hs, ims = hidden_size, intermediate_size @@ -220,11 +220,12 @@ def gen_offset(hidden_size, intermediate_size, nlayer): return offsets def params_dict(self): - ''' + """ Returns: weight: dict bias: dict - ''' + """ + def copy_and_view(m, shape=None): if shape is None: shape = (-1,) @@ -235,10 +236,10 @@ def _copy(m): self_attn_qkvw = self._get_weights(0) self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split( - self.hs * self.hs, 0) + self.hs * self.hs, 0 + ) self_attn_qkvb = self._get_weights(1) - self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split( - self.hs, 0) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) all_enc_attn_kw, all_enc_attn_vw = None, None all_enc_attn_kb, all_enc_attn_vb = None, None @@ -257,10 +258,16 @@ def _copy(m): "self_attn_q_proj": copy_and_view(self_attn_qw, (self.hs, self.hs)), "self_attn_k_proj": copy_and_view(self_attn_kw, (self.hs, self.hs)), "self_attn_v_proj": copy_and_view(self_attn_vw, (self.hs, self.hs)), - "self_attn_out_proj": copy_and_view(self._get_weights(2), (self.hs, self.hs)), + "self_attn_out_proj": copy_and_view( + self._get_weights(2), (self.hs, self.hs) + ), "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), - "encoder_attn_q_proj": copy_and_view(self._get_weights(6), (self.hs, self.hs)), - "encoder_attn_out_proj": copy_and_view(self._get_weights(8), (self.hs, self.hs)), + "encoder_attn_q_proj": copy_and_view( + self._get_weights(6), (self.hs, self.hs) + ), + "encoder_attn_out_proj": copy_and_view( + self._get_weights(8), (self.hs, self.hs) + ), "encoder_attn_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)), "fc1": copy_and_view(self._get_weights(12), (self.ims, self.hs)), "fc2": copy_and_view(self._get_weights(14), (self.hs, self.ims)), @@ -281,7 +288,7 @@ def _copy(m): "fc2": copy_and_view(self._get_weights(15)), "final_layer_norm": 
copy_and_view(self._get_weights(17)), "encoder_attn_k_proj": all_enc_attn_kb, - "encoder_attn_v_proj": all_enc_attn_vb + "encoder_attn_v_proj": all_enc_attn_vb, } return weight, bias @@ -417,8 +424,7 @@ def forward( if hasattr(self, "para_16"): self.para_16.copy_(self.para.to(torch.half)) else: - self.register_buffer( - "para_16", self.para.clone().detach().half()) + self.register_buffer("para_16", self.para.clone().detach().half()) if self.config.fp16: decoder_states = decoder_states.to(torch.half) diff --git a/lightseq/training/ops/pytorch/transformer_encoder_layer.py b/lightseq/training/ops/pytorch/transformer_encoder_layer.py index c2e47803..df51a63a 100644 --- a/lightseq/training/ops/pytorch/transformer_encoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_encoder_layer.py @@ -215,11 +215,12 @@ def init_transformer_weights(self): nn.init.zeros_(self._get_weights(11)) def params_dict(self): - ''' + """ Returns: weight: dict bias: dict - ''' + """ + def copy_and_view(m, shape=None): if shape is None: shape = (-1,) @@ -227,20 +228,22 @@ def copy_and_view(m, shape=None): self_attn_qkvw = self._get_weights(0) self_attn_qw, self_attn_kw, self_attn_vw = self_attn_qkvw.split( - self.hs * self.hs, 0) + self.hs * self.hs, 0 + ) self_attn_qkvb = self._get_weights(1) - self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split( - self.hs, 0) + self_attn_qb, self_attn_kb, self_attn_vb = self_attn_qkvb.split(self.hs, 0) weight = { "self_attn_q_proj": copy_and_view(self_attn_qw, (self.hs, self.hs)), "self_attn_k_proj": copy_and_view(self_attn_kw, (self.hs, self.hs)), "self_attn_v_proj": copy_and_view(self_attn_vw, (self.hs, self.hs)), - "self_attn_out_proj": copy_and_view(self._get_weights(2), (self.hs, self.hs)), + "self_attn_out_proj": copy_and_view( + self._get_weights(2), (self.hs, self.hs) + ), "self_attn_layer_norm": copy_and_view(self._get_weights(4), (self.hs,)), "fc1": copy_and_view(self._get_weights(6), (self.ims, self.hs)), "fc2": copy_and_view(self._get_weights(8), (self.hs, self.ims)), - "final_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)) + "final_layer_norm": copy_and_view(self._get_weights(10), (self.hs,)), } bias = { "self_attn_q_proj": copy_and_view(self_attn_qb), @@ -250,7 +253,7 @@ def copy_and_view(m, shape=None): "self_attn_layer_norm": copy_and_view(self._get_weights(5)), "fc1": copy_and_view(self._get_weights(7)), "fc2": copy_and_view(self._get_weights(9)), - "final_layer_norm": copy_and_view(self._get_weights(11)) + "final_layer_norm": copy_and_view(self._get_weights(11)), } return weight, bias From a9fbb9ad77b2be8db0a0e26ce8500b7929fff6c6 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 18:47:38 +0800 Subject: [PATCH 11/16] remove return decoder cache --- .gitignore | 2 ++ .../training/huggingface/bart/summarization/.lock | 0 .../huggingface/bart/summarization/ls_bart_model.py | 5 +---- .../fs_modules/ls_fs_transformer_decoder_layer.py | 2 +- lightseq/training/ops/pytorch/transformer.py | 2 +- .../ops/pytorch/transformer_decoder_layer.py | 12 +++++------- 6 files changed, 10 insertions(+), 13 deletions(-) delete mode 100755 examples/training/huggingface/bart/summarization/.lock diff --git a/.gitignore b/.gitignore index 3cb4f4d8..d3476646 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,5 @@ build/ *.bin *.idx *.hdf5 + +.lock \ No newline at end of file diff --git a/examples/training/huggingface/bart/summarization/.lock b/examples/training/huggingface/bart/summarization/.lock deleted file mode 100755 index 
e69de29b..00000000 diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index 97182dd1..cc12f4ca 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -314,9 +314,6 @@ def forward( ) cache = None if use_cache: - import pdb - - pdb.set_trace() cache = ( {} if past_key_value is None @@ -325,7 +322,7 @@ def forward( output = super().forward( hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache ) - return output + return output, (cache["dec_self_k"], cache["dec_self_v"]) @staticmethod def get_params_list(**kwargs): diff --git a/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py b/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py index e3dba1b2..9b98d725 100644 --- a/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py +++ b/lightseq/training/cli/fs_modules/ls_fs_transformer_decoder_layer.py @@ -49,7 +49,7 @@ def forward( else: cache = self.get_self_attn_cache(incremental_state) self.beam_size = int(x.shape[0] / encoder_padding_mask.shape[0]) - res, _ = super().forward(x, encoder_out, encoder_padding_mask, cache) + res = super().forward(x, encoder_out, encoder_padding_mask, cache) if cache: self.set_self_attn_cache(incremental_state, cache) return res, None, None diff --git a/lightseq/training/ops/pytorch/transformer.py b/lightseq/training/ops/pytorch/transformer.py index 5c4307dd..51a29dae 100644 --- a/lightseq/training/ops/pytorch/transformer.py +++ b/lightseq/training/ops/pytorch/transformer.py @@ -194,7 +194,7 @@ def forward(self, trg_tokens, encoder_out, encoder_padding_mask, cache=None): for i, layer in enumerate(self.layers): layer_cache = cache[i] if cache else None - x, _ = layer( + x = layer( x, encoder_out, encoder_padding_mask, diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 5a9aa007..0d251c23 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -460,7 +460,10 @@ def forward( ).contiguous() cache["encdec_kv"] = encdec_kv self.encdec_kv = encdec_kv - cache_list.append(cache["encdec_kv"]) + if "encdec_kv" in cache: + cache_list.append(cache["encdec_kv"]) + else: + cache_list.append(self.encdec_kv) head_dim = int(self.config.hidden_size / self.config.nhead) shape = (batch_beams, self.config.nhead, step + 1, head_dim) new_k = torch.zeros( @@ -503,9 +506,4 @@ def forward( self.config, cache_list, ) - if cache_list == []: - past_key_value = None - else: - # cache decoder self attention k v - past_key_value = (cache_list[0], cache_list[1]) - return output.to(self.para), past_key_value + return output.to(self.para) From f145480c1f26826e84918288210861067e30edb3 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 19:05:37 +0800 Subject: [PATCH 12/16] format update --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index d3476646..3cb4f4d8 100644 --- a/.gitignore +++ b/.gitignore @@ -145,5 +145,3 @@ build/ *.bin *.idx *.hdf5 - -.lock \ No newline at end of file From dd2fa1b792c36767261855e79531c1e6a1952132 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 19:34:54 +0800 Subject: [PATCH 13/16] update ner and qa --- .../huggingface/bert/task_ner/run_ner.py | 25 ++++++++++++++----- 
.../huggingface/bert/task_qa/run_qa.py | 24 +++++++++++++----- .../ops/pytorch/transformer_decoder_layer.py | 2 +- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/examples/training/huggingface/bert/task_ner/run_ner.py b/examples/training/huggingface/bert/task_ner/run_ner.py index 41db6c1d..174796db 100644 --- a/examples/training/huggingface/bert/task_ner/run_ner.py +++ b/examples/training/huggingface/bert/task_ner/run_ner.py @@ -44,7 +44,7 @@ ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version -from ls_hf_transformer_layer import inject_ls_layer +from ls_hf_transformer_layer import inject_ls_layer, LSBertForTokenClassification # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -366,8 +366,25 @@ def get_label_list(labels): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForTokenClassification.from_pretrained( + # # Replace with lightseq encoder layers and save the lightseq model + # model = AutoModelForTokenClassification.from_pretrained( + # model_args.model_name_or_path, + # from_tf=bool(".ckpt" in model_args.model_name_or_path), + # config=config, + # cache_dir=model_args.cache_dir, + # revision=model_args.model_revision, + # use_auth_token=True if model_args.use_auth_token else None, + # ) + + # # Replace with LightSeq encoder layers. + # if model_args.module_type == 1 or model_args.module_type == 2: + # inject_ls_layer(model, training_args, model_args, config) + + # Replace with lightseq encoder layers and save the huggingface model + model = LSBertForTokenClassification.from_pretrained( model_args.model_name_or_path, + training_args=training_args, + model_args=model_args, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, @@ -375,10 +392,6 @@ def get_label_list(labels): use_auth_token=True if model_args.use_auth_token else None, ) - # Replace with LightSeq encoder layers. - if model_args.module_type == 1 or model_args.module_type == 2: - inject_ls_layer(model, training_args, model_args, config) - # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( diff --git a/examples/training/huggingface/bert/task_qa/run_qa.py b/examples/training/huggingface/bert/task_qa/run_qa.py index 83c4fe02..920c1327 100644 --- a/examples/training/huggingface/bert/task_qa/run_qa.py +++ b/examples/training/huggingface/bert/task_qa/run_qa.py @@ -46,7 +46,7 @@ from transformers.utils import check_min_version from transformers.utils.versions import require_version from utils_qa import postprocess_qa_predictions -from ls_hf_transformer_layer import inject_ls_layer +from ls_hf_transformer_layer import inject_ls_layer, LSBertForQuestionAnswering # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
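The QA script in the next hunk switches to LSBertForQuestionAnswering in exactly the same way run_ner.py above switches to LSBertForTokenClassification. The from_pretrained override that absorbs the extra training_args/model_args keywords is not shown in this patch; a rough sketch of one way such a wrapper could work (class body and behaviour are assumptions, not the actual ls_hf_transformer_layer implementation):

from transformers import BertForTokenClassification
from ls_hf_transformer_layer import inject_ls_layer


class LSBertForTokenClassification(BertForTokenClassification):
    # Hypothetical sketch: pop the LightSeq-specific kwargs, load the plain
    # HuggingFace model, then swap its encoder layers for LightSeq ones.
    @classmethod
    def from_pretrained(cls, *args, training_args=None, model_args=None, **kwargs):
        model = super().from_pretrained(*args, **kwargs)
        if model_args is not None and model_args.module_type in (1, 2):
            inject_ls_layer(model, training_args, model_args, model.config)
        return model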
@@ -373,8 +373,24 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - model = AutoModelForQuestionAnswering.from_pretrained( + # # Replace with lightseq encoder layers and save the lightseq model + # model = AutoModelForQuestionAnswering.from_pretrained( + # model_args.model_name_or_path, + # from_tf=bool(".ckpt" in model_args.model_name_or_path), + # config=config, + # cache_dir=model_args.cache_dir, + # revision=model_args.model_revision, + # use_auth_token=True if model_args.use_auth_token else None, + # ) + # # Replace with LightSeq encoder layers. + # if model_args.module_type == 1 or model_args.module_type == 2: + # inject_ls_layer(model, training_args, model_args, config) + + # Replace with lightseq encoder layers and save the huggingface model + model = LSBertForQuestionAnswering.from_pretrained( model_args.model_name_or_path, + training_args=training_args, + model_args=model_args, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, @@ -382,10 +398,6 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) - # Replace with LightSeq encoder layers. - if model_args.module_type == 1 or model_args.module_type == 2: - inject_ls_layer(model, training_args, model_args, config) - # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 0d251c23..82907cc1 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -162,7 +162,7 @@ def __init__(self, config, initial_weights=None, initial_biases=None): self.para_offset = self.para_offset[:-2] self.para = nn.Parameter(torch.Tensor(self.para_offset[-1])) - if initial_weights is None and initial_biases is None: + if initial_weights is None or initial_biases is None: # enc-dec kv weights and bias self.init_transformer_weights() return From 6e575ba7cf14ab7bfdb88b5d072a14acb7c9f0bc Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 19:44:56 +0800 Subject: [PATCH 14/16] fix bugs --- .../huggingface/bart/summarization/ls_bart_model.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index cc12f4ca..a2a8ca85 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -8,8 +8,10 @@ LSTransformerDecoderLayer as TransformerDecoderLayer, ) from transformers import ( - BartForConditionalGeneration, BartPretrainedModel, + BartForConditionalGeneration, + BartForSequenceClassification, + BartForQuestionAnswering, ) from transformers.models.bart.modeling_bart import BartEncoderLayer, BartDecoderLayer @@ -313,6 +315,7 @@ def forward( .type_as(encoder_attention_mask) ) cache = None + self_attn_kv_cache = None if use_cache: cache = ( {} @@ -322,7 +325,9 @@ def forward( output = super().forward( hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache ) - return output, (cache["dec_self_k"], cache["dec_self_v"]) + if use_cache: + self_attn_kv_cache = (cache["dec_self_k"], cache["dec_self_v"]) + return output, self_attn_kv_cache @staticmethod def 
get_params_list(**kwargs): From 2ba905ead783018c775a11197030e48c0be04a04 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 19:48:18 +0800 Subject: [PATCH 15/16] formate update --- .../training/huggingface/bart/summarization/ls_bart_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index a2a8ca85..d3aba5fa 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -273,7 +273,6 @@ class ParamsList: final_layer_norm: None params_list = ParamsList(**kwargs) - # check_config(config) return params_list @classmethod @@ -350,7 +349,6 @@ class ParamsList: encoder_attn_v_proj: None params_list = ParamsList(**kwargs) - # check_config(config) return params_list @classmethod From ff7f336c30fd1f815421b4a989e8c70afbb2d587 Mon Sep 17 00:00:00 2001 From: duanrenchong Date: Wed, 1 Jun 2022 23:40:04 +0800 Subject: [PATCH 16/16] Optimising the cache of the decoder --- .../huggingface/bart/summarization/ls_bart_model.py | 12 ++++++++++-- .../ops/pytorch/transformer_decoder_layer.py | 6 +----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/training/huggingface/bart/summarization/ls_bart_model.py b/examples/training/huggingface/bart/summarization/ls_bart_model.py index d3aba5fa..7d4819ee 100644 --- a/examples/training/huggingface/bart/summarization/ls_bart_model.py +++ b/examples/training/huggingface/bart/summarization/ls_bart_model.py @@ -319,13 +319,21 @@ def forward( cache = ( {} if past_key_value is None - else {"dec_self_k": past_key_value[0], "dec_self_v": past_key_value[1]} + else { + "dec_self_k": past_key_value[0], + "dec_self_v": past_key_value[1], + "encdec_kv": past_key_value[2], + } ) output = super().forward( hidden_states, encoder_hidden_states, ls_encoder_padding_mask, cache ) if use_cache: - self_attn_kv_cache = (cache["dec_self_k"], cache["dec_self_v"]) + self_attn_kv_cache = ( + cache["dec_self_k"], + cache["dec_self_v"], + cache["encdec_kv"] if "encdec_kv" in cache else None, + ) return output, self_attn_kv_cache @staticmethod diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 82907cc1..b6e84183 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -459,11 +459,7 @@ def forward( shape, dtype=decoder_states.dtype, device=decoder_states.device ).contiguous() cache["encdec_kv"] = encdec_kv - self.encdec_kv = encdec_kv - if "encdec_kv" in cache: - cache_list.append(cache["encdec_kv"]) - else: - cache_list.append(self.encdec_kv) + cache_list.append(cache["encdec_kv"]) head_dim = int(self.config.hidden_size / self.config.nhead) shape = (batch_beams, self.config.nhead, step + 1, head_dim) new_k = torch.zeros(
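With these final changes the LightSeq decoder layer's forward again returns only the output tensor, while the LSBartDecoderLayer wrapper packages (dec_self_k, dec_self_v, encdec_kv) as the HuggingFace-style past_key_value. A minimal sketch of how a generation loop could thread that tuple between steps (the keyword names follow the usual BartDecoderLayer convention and, like the tensor shapes, are assumptions rather than the signature defined in this patch):

def decode_with_cache(decoder_layer, token_embeds, encoder_hidden_states,
                      encoder_attention_mask, max_new_tokens):
    # Illustrative only: feed one target token per step and carry the cache
    # tuple forward, so the enc-dec KV projection is computed once and the
    # self-attention KV grows incrementally.
    past_key_value = None
    hidden_states = None
    for step in range(max_new_tokens):
        decoder_states = token_embeds[:, step : step + 1, :]  # [batch * beam, 1, hidden]
        hidden_states, past_key_value = decoder_layer(
            decoder_states,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            use_cache=True,
        )
    return hidden_states, past_key_value

Keeping encdec_kv inside the per-call cache, rather than as a layer attribute as an earlier patch in this series did, leaves the layer itself stateless between generation requests.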