diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 414285c9a..991c74fb8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: hooks: - id: codespell args: [ - '--ignore-words-list=reord', # Word used in error messages that need rewording + '--ignore-words-list=reord,dout', # Words to ignore: reord (used in error messages that need rewording) and dout (variable name in flash_attention.py) --check-filenames, --check-hidden, ] diff --git a/README.md b/README.md index 43f4665ce..9b6880bf2 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,11 @@ from the repository root. +### Flash Attention + +To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details. + + ### Containerized Setup We also provide a Dockerfile if you prefer to run NeoX in a container. To use this option, first build an image named `gpt-neox` from the repository root directory with `docker build -t gpt-neox -f Dockerfile .`. We also host pre-built images on Docker Hub at `leogao2/gpt-neox`. diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 3a15ffae6..a089a5596 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 5bba068 + Default = 12f6f76 current git hash of repository @@ -797,6 +797,14 @@ Misc. Arguments +- **save_iters**: list + + Default = None + + Set during training + + + - **global_num_gpus**: int Default = None @@ -1132,11 +1140,37 @@ Training Arguments -- **save_interval**: int +- **checkpoint_scale**: typing.Literal['linear', 'log'] + + Default = linear + + How the step at which checkpoints are saved should scale. 
"linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, + while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step `checkpoint-factor`. + + + +- **checkpoint_factor**: int + + Default = None + + Acts as a multiplier on either the "log" or "linear" checkpoint spacing. + + With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at + steps [20, 40, 60, 80, 100]. + + With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at + steps [2, 4, 8, 16, 32, 64, 100]. + + Note that the last checkpoint step is always saved. + + + +- **extra_save_iters**: list Default = None - Number of iterations between checkpoint saves. + Additional iterations when a checkpoint should be saved. + Must be a list of ints or `None`. diff --git a/megatron/model/flash_attention.py b/megatron/model/flash_attention.py new file mode 100644 index 000000000..5c40a322c --- /dev/null +++ b/megatron/model/flash_attention.py @@ -0,0 +1,185 @@ +# Based on: https://github.com/HazyResearch/flash-attention/blob/4a6eaa9f27df6fff7ffb2c24e894938a687dd870/flash_attn/flash_attn_interface.py + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import flash_attn_cuda + + +def _flash_attn_forward( + q, + k, + v, + out, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + num_splits=0, + generator=None, +): + """ + num_splits: how much to parallelize over the seqlen_q dimension. num_splits=0 means + it will be set by an internal heuristic. We're exposing num_splits mostly for benchmarking. + Don't change it unless you know what you're doing. 
+ """ + softmax_lse, *rest = flash_attn_cuda.fwd( + q, + k, + v, + out, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + return_softmax, + num_splits, + generator, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + S_dmask = rest[0] if return_softmax else None + return out, softmax_lse, S_dmask + + +def _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + num_splits=0, + generator=None, +): + """ + num_splits: whether to parallelize over the seqlen_k dimension (num_splits > 1) or + not (num_splits = 1). num_splits=0 means it will be set by an internal heuristic. + Any value above 1 will call the same kernel (i.e. num_splits=2 would call the same kernel + as num_splits=3), so effectively the choices are 0, 1, and 2. + This hyperparameter can be tuned for performance, but default value (heuristic) should work fine. 
+ """ + _, _, _, softmax_d = flash_attn_cuda.bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + False, + causal, + num_splits, + generator, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward( + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + torch.empty_like(qkv[:, 0]), + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward(qkv, out, softmax_lse, cu_seqlens, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen = max_seqlen + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + qkv, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dqkv = torch.empty_like(qkv) + _flash_attn_backward( + dout, + qkv[:, 0], + qkv[:, 1], + qkv[:, 2], + out, + softmax_lse, + dqkv[:, 0], + dqkv[:, 1], + dqkv[:, 2], + cu_seqlens, + cu_seqlens, + ctx.max_seqlen, + ctx.max_seqlen, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None + + +def 
flash_attn_unpadded_qkvpacked_func( + qkv, + cu_seqlens, + max_seqlen, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + return FlashAttnQKVPackedFunc.apply( + qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs + ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 16433f876..cd816e5fd 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -259,7 +259,8 @@ def __init__( self.rotary_emb = None self.attention_type = neox_args.attention_config[layer_number] - self.sparse = self.attention_type != "global" + self.use_flash_attention = self.attention_type == "flash" + self.sparse = self.attention_type != "global" and not self.use_flash_attention if self.sparse: self.sparse_attn = configure_sparse_attention( neox_args, @@ -268,19 +269,31 @@ def __init__( mpu=mpu, ) else: - self.scale_mask_softmax = FusedScaleMaskSoftmax( - input_in_fp16=self.fp16, - input_in_bf16=self.bf16, - fusion_type=get_fusion_type(neox_args), - mask_func=self.attention_mask_func, - softmax_in_fp32=self.attention_softmax_in_fp32, - scale=coeff, - ) + if self.use_flash_attention: + from megatron.model.flash_attention import ( + flash_attn_unpadded_qkvpacked_func, + ) + + self.flash_attention_function = flash_attn_unpadded_qkvpacked_func + if self.pos_emb == "alibi": + raise ValueError( + "Flash attention is currently not compatible with AliBi positional embeddings. Use sinuisoidal, learned, or rotary embeddings instead." + ) + else: + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.fp16, + input_in_bf16=self.bf16, + fusion_type=get_fusion_type(neox_args), + mask_func=self.attention_mask_func, + softmax_in_fp32=self.attention_softmax_in_fp32, + scale=coeff, + ) # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. 
- self.attention_dropout = nn.Dropout(neox_args.attention_dropout) + self.dropout_p = neox_args.attention_dropout + self.attention_dropout = nn.Dropout(self.dropout_p) # Output. self.dense = mpu.RowParallelLinear( @@ -396,6 +409,55 @@ def attention( context_layer = context_layer.view(*output_size) return context_layer + def flash_attention(self, query_layer, key_layer, value_layer): + # [b, np, sq, sk] + output_size = ( + query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0), + ) + # [s, b, np, hn] -> [b, s, np, hn] -> [b * s, 1, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], 1, output_size[1], -1 + ) + key_layer = key_layer.transpose(0, 1).reshape( + output_size[0] * output_size[3], 1, output_size[1], -1 + ) + value_layer = value_layer.transpose(0, 1).reshape( + output_size[0] * output_size[3], 1, output_size[1], -1 + ) + + # Combined q/k/v into [b * s, 3, np, hn]. + qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) + + batch_size = output_size[0] + seqlen = output_size[2] + max_s = seqlen + cu_seqlens = torch.arange( + 0, + (batch_size + 1) * seqlen, + step=seqlen, + dtype=torch.int32, + device=qkv.device, + ) + output = self.flash_attention_function( + qkv, + cu_seqlens, + max_s, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + # [b * sq, np, hn] -> [b, sq, np, hn] + matmul_result = output.view( + output_size[0], output_size[2], output.shape[1], output.shape[2] + ) + # [b, sq, np, hn] -> [b, np, sq, hn] + matmul_result = matmul_result.transpose(1, 2) + + return matmul_result + def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): # TODO: sparse attn dropout? 
# TODO: pad to block size @@ -483,7 +545,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if self.use_cache: present = torch.stack((key_layer, value_layer)) - if not self.sparse: + if self.use_flash_attention: + context_layer = self.flash_attention(query_layer, key_layer, value_layer) + elif not self.sparse: context_layer = self.attention( query_layer, key_layer, value_layer, layer_past, attention_mask ) diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 264d598a3..e114772e2 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -735,6 +735,30 @@ def calculate_derived(self): } ) + # derive steps where checkpoint should be saved + if self.checkpoint_factor or self.extra_save_iters: + if self.extra_save_iters: + save_iters = set(self.extra_save_iters) + else: + save_iters = set() + + step = self.checkpoint_factor # don't save step 0 or 1 + while step < self.train_iters: + save_iters.add(step) + if self.checkpoint_scale == "log": + step *= self.checkpoint_factor + elif self.checkpoint_scale == "linear": + step += self.checkpoint_factor + + save_iters = list(save_iters) + save_iters.sort() + + self.update_values( + { + "save_iters": save_iters, + } + ) + # derive precision if (self.fp16 or {}).get("type", self.precision) == "bfloat16": self.update_value("precision", "bfloat16") @@ -824,7 +848,7 @@ def calculate_derived(self): if self.sparsity_config is None: # Can't have a default value as an empty dict so need to set it here self.update_value("sparsity_config", {}) - + # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) @@ -923,10 +947,10 @@ def validate_values(self): raise ValueError(error_message) return False - if self.save is not None and self.save_interval is None: + if self.save is not None and self.checkpoint_factor is None and 
self.extra_save_iters is None: error_message = ( self.__class__.__name__ - + ".validate_values() save_interval must be defined if save is defined" + + ".validate_values() checkpoint_factor or extra_save_iters must be defined if save is defined" ) logging.error(error_message) raise ValueError(error_message) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index a02486a62..0d56a7781 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -34,6 +34,7 @@ "bslongformer", "gmlp", "amlp", + "flash", ] @@ -634,6 +635,11 @@ class NeoXArgsOther(NeoXArgsTemplate): Set during training """ + save_iters: list = None + """ + Set during training + """ + global_num_gpus: int = None """ Set during launching @@ -769,9 +775,29 @@ class NeoXArgsTraining(NeoXArgsTemplate): save input and output of a forward pass with the checkpoint and validate after load """ - save_interval: int = None + checkpoint_scale: Literal["linear", "log"] = "linear" + """ + How the step at which checkpoints are saved should scale. "linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, + while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step `checkpoint-factor`. + """ + + checkpoint_factor: int = None + """ + Acts as a multiplier on either the "log" or "linear" checkpoint spacing. + + With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at + steps [20, 40, 60, 80, 100]. + + With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at + steps [2, 4, 8, 16, 32, 64, 100]. + + Note that the last checkpoint step is always saved. + """ + + extra_save_iters: list = None """ - Number of iterations between checkpoint saves. + Additional iterations when a checkpoint should be saved. + Must be a list of ints or `None`. 
""" no_save_optim: bool = False diff --git a/megatron/training.py b/megatron/training.py index 3daa4c3c7..1aec65cef 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -103,6 +103,16 @@ def pretrain(neox_args): iteration = 0 if neox_args.do_train and neox_args.train_iters > 0: + # edge case: save step 0 checkpoint if requested + if neox_args.save and 0 in neox_args.save_iters: + save_checkpoint( + neox_args=neox_args, + iteration=iteration, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + iteration = train( neox_args=neox_args, timers=timers, @@ -607,8 +617,7 @@ def train( # Checkpointing if ( neox_args.save - and neox_args.save_interval - and iteration % neox_args.save_interval == 0 + and iteration in neox_args.save_iters ): save_checkpoint( neox_args=neox_args, diff --git a/requirements/requirements-flashattention.txt b/requirements/requirements-flashattention.txt new file mode 100644 index 000000000..0c7d41e59 --- /dev/null +++ b/requirements/requirements-flashattention.txt @@ -0,0 +1 @@ +flash-attn==0.2.2 diff --git a/tools/convert_to_hf.py b/tools/convert_to_hf.py index f10236539..c5c6b306d 100644 --- a/tools/convert_to_hf.py +++ b/tools/convert_to_hf.py @@ -105,11 +105,11 @@ def __init__(self, neox_config): # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default use_tied_lns = get_key(neox_config, 'gpt-j-tied', False) - if not use_tied_lns: + if use_tied_lns: raise NotImplementedError( """ERROR: Huggingface Transformers does not yet support a single shared layernorm per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. - See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" + See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""") # set all config values. hf_config = GPTNeoXConfig(