Commit
[Trainer] update sequence parallel (#9757)
* update emb doc

* update register_sequence_parallel_allreduce_hooks

* update fuse_sequence_parallel_allreduce
DesmonDay authored Jan 10, 2025
1 parent 331131b commit b4325b9
Showing 13 changed files with 34 additions and 71 deletions.
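In short: the per-script sequence_parallel and fuse_sequence_parallel_allreduce fields are removed from the various ModelArguments dataclasses and become TrainingArguments fields, and the call to register_sequence_parallel_allreduce_hooks moves out of the entry scripts into the Trainer, which now imports the helper from paddle.distributed.fleet.utils.sequence_parallel_utils. A condensed before/after of the entry-script pattern, pieced together from the hunks below (not a standalone script):

# Before: each entry script (run_finetune.py, run_pretrain.py, run_dpo.py, ...) registered the hooks itself.
if training_args.sequence_parallel:
    register_sequence_parallel_allreduce_hooks(
        model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
    )

# After: the entry scripts do nothing here; the Trainer performs the same registration
# when args.sequence_parallel is True (see the paddlenlp/trainer/trainer.py hunk below).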
2 changes: 1 addition & 1 deletion llm/README.md
@@ -367,7 +367,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./predict/flask_ser

#### 7.2 大模型服务化部署工具

- 该部署工具是基于英伟达Triton框架专为服务器场景的大模型服务化部署而设计。它提供了支持gRPC、HTTP协议的服务接口,以及流式Token输出能力。底层推理引擎支持连续批处理、weight only int8、后训练量化(PTQ)等加速优化策略,为用户带来易用且高性能的部署体验。
+ 该部署工具是基于英伟达 Triton 框架专为服务器场景的大模型服务化部署而设计。它提供了支持 gRPC、HTTP 协议的服务接口,以及流式 Token 输出能力。底层推理引擎支持连续批处理、weight only int8、后训练量化(PTQ)等加速优化策略,为用户带来易用且高性能的部署体验。

基于预编译镜像部署,本节以 Meta-Llama-3-8B-Instruct-A8W8C8 为例,更多模型请参考[LLaMA](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/llama.md)[Qwen](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/qwen.md)[Mixtral](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/mixtral.md), 更细致的模型推理、量化教程可以参考[大模型推理教程](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/inference.md)

5 changes: 0 additions & 5 deletions llm/alignment/dpo/run_dpo.py
@@ -43,7 +43,6 @@
LlamaForCausalLMPipe,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
- register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import (
@@ -154,10 +153,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

- if training_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
5 changes: 0 additions & 5 deletions llm/alignment/kto/run_kto.py
@@ -38,7 +38,6 @@
LlamaForCausalLM,
LlamaForCausalLMPipe,
Qwen2ForCausalLM,
- register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import (
@@ -140,10 +139,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

- if training_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
4 changes: 0 additions & 4 deletions llm/alignment/rm/flashmask/reward_argument.py
@@ -86,7 +86,3 @@ class ModelArgument:
default=1,
metadata={"help": "virtual_pp_degree"},
)
- sequence_parallel: bool = field(
-     default=False,
-     metadata={"help": "whether to use sequence parallel"},
- )
10 changes: 1 addition & 9 deletions llm/alignment/rm/flashmask/run_reward.py
@@ -35,11 +35,7 @@
get_last_checkpoint,
set_seed,
)
- from paddlenlp.transformers import (
-     AutoConfig,
-     AutoTokenizer,
-     register_sequence_parallel_allreduce_hooks,
- )
+ from paddlenlp.transformers import AutoConfig, AutoTokenizer
from paddlenlp.utils.log import logger


@@ -126,10 +122,6 @@ def main():
logger.warning("`flash_mask` must use with zero padding and flash attention.")
model.config.use_flash_attention = True

- if model_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
12 changes: 2 additions & 10 deletions llm/auto_parallel/gpt-3/run_pretrain_auto.py
@@ -225,14 +225,6 @@ class ModelArguments:
hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."})
attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."})

- sequence_parallel: bool = field(
-     default=False,
-     metadata={"help": "whether to use sequence parallel"},
- )
- fuse_sequence_parallel_allreduce: bool = field(
-     default=False,
-     metadata={"help": "whether to use fuse sequence parallel allreduce"},
- )
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -502,8 +494,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
- config.sequence_parallel = model_args.sequence_parallel
- config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
+ config.sequence_parallel = training_args.sequence_parallel
+ config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
12 changes: 2 additions & 10 deletions llm/auto_parallel/llama/run_pretrain_auto.py
@@ -221,14 +221,6 @@ class ModelArguments:
"help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
},
)
- sequence_parallel: bool = field(
-     default=False,
-     metadata={"help": "whether to use sequence parallel"},
- )
- fuse_sequence_parallel_allreduce: bool = field(
-     default=False,
-     metadata={"help": "whether to use fuse sequence parallel allreduce"},
- )
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -534,8 +526,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
- config.sequence_parallel = model_args.sequence_parallel
- config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
+ config.sequence_parallel = training_args.sequence_parallel
+ config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
12 changes: 2 additions & 10 deletions llm/auto_parallel/qwen/run_pretrain_3D_auto.py
@@ -225,14 +225,6 @@ class ModelArguments:
"help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
},
)
- sequence_parallel: bool = field(
-     default=False,
-     metadata={"help": "whether to use sequence parallel"},
- )
- fuse_sequence_parallel_allreduce: bool = field(
-     default=False,
-     metadata={"help": "whether to use fuse sequence parallel allreduce"},
- )
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -513,8 +505,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
- config.sequence_parallel = model_args.sequence_parallel
- config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
+ config.sequence_parallel = training_args.sequence_parallel
+ config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
5 changes: 0 additions & 5 deletions llm/run_finetune.py
@@ -58,7 +58,6 @@
LlamaTokenizer,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
- register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -231,10 +230,6 @@ def neft_post_hook(module, input, output):
else:
raise NotImplementedError("Only support neftune for model with get_input_embeddings")

- if training_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )
# Load tokenizer & dataset
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio)
reft_layers = None
6 changes: 0 additions & 6 deletions llm/run_pretrain.py
@@ -41,7 +41,6 @@
AutoTokenizer,
CosineAnnealingWithWarmupDecay,
LinearAnnealingWithWarmupDecay,
- register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass
from paddlenlp.utils.batch_sampler import DistributedBatchSampler
@@ -492,11 +491,6 @@ def main():
else:
model = model_class.from_config(config, dtype=dtype)

- if training_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )

if training_args.recompute:
model.recompute_enable()

5 changes: 0 additions & 5 deletions llm/run_quantization.py
@@ -41,7 +41,6 @@
LlamaTokenizer,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
- register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import DataConfig, ModelConfig, QuantConfig, SFTConfig, SFTTrainer
@@ -162,10 +161,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

- if training_args.sequence_parallel:
-     register_sequence_parallel_allreduce_hooks(
-         model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-     )
# Load tokenizer & dataset
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio)
# init chat_template for tokenizer
11 changes: 11 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -87,6 +87,12 @@
from ..quantization.quantization_linear import QuantizationLinear
except:
QuantizationLinear = None
+ try:
+     from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+         register_sequence_parallel_allreduce_hooks,
+     )
+ except:
+     pass
from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance
from ..transformers.model_utils import (
PretrainedModel,
@@ -428,6 +434,11 @@ def _save_ckpt_func(state_dict, path, signal_path=None):
"We do not support skip_save_model_weight in peft model when using unified checkpoint, remove this config."
)

+ if args.sequence_parallel:
+     register_sequence_parallel_allreduce_hooks(
+         self.model, args.gradient_accumulation_steps, args.fuse_sequence_parallel_allreduce
+     )

self.do_grad_scaling = False
self.enable_autocast_context_manager = False
if args.fp16 or args.bf16:
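For code that does not go through the Trainer (custom training loops), the same registration can still be done manually. A minimal sketch mirroring the logic added above; the helper name maybe_register_sp_hooks is ours, not part of the PaddleNLP API, and the import is guarded the same way trainer.py guards it, since older Paddle builds may not ship the utility:

try:
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        register_sequence_parallel_allreduce_hooks,
    )
except ImportError:
    register_sequence_parallel_allreduce_hooks = None


def maybe_register_sp_hooks(model, args):
    # Mirrors the Trainer's new behaviour: only register when sequence parallel is
    # enabled and the Paddle utility is actually available.
    if args.sequence_parallel and register_sequence_parallel_allreduce_hooks is not None:
        register_sequence_parallel_allreduce_hooks(
            model, args.gradient_accumulation_steps, args.fuse_sequence_parallel_allreduce
        )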
16 changes: 15 additions & 1 deletion paddlenlp/trainer/training_args.py
@@ -669,6 +669,13 @@ class TrainingArguments:
)
},
)
+ sequence_parallel: bool = field(
+     default=False,
+     metadata={"help": "Whether to enable sequence parallel."},
+ )
+ fuse_sequence_parallel_allreduce: bool = field(
+     default=False, metadata={"help": "Whether to use fuse sequence parallel allreduce."}
+ )
sequence_parallel_config: str = field(
default="",
metadata={
@@ -1209,10 +1216,17 @@ def __post_init__(self):
f"Found unknown pipeline mode config {x}, accpet config is disable_p2p_cache_shape, disable_partial_send_recv."
)

+ enable_partial_send_recv = "disable_partial_send_recv" not in pipeline_parallel_config
+ if self.sequence_parallel and enable_partial_send_recv:
+     logger.warning(
+         "When use pipeline parallel and sequence parallel simultaneously, we should turn off partial send recv."
+     )
+     enable_partial_send_recv = False

strategy.pipeline_configs = {
"accumulate_steps": self.gradient_accumulation_steps,
"micro_batch_size": self.per_device_train_batch_size,
"enable_partial_send_recv": "disable_partial_send_recv" not in pipeline_parallel_config,
"enable_partial_send_recv": enable_partial_send_recv,
"p2p_cache_shape": False if "disable_p2p_cache_shape" in pipeline_parallel_config else True,
# "delay_scale_loss": True, Fix ME
}
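Taken together, sequence parallelism is now driven entirely by TrainingArguments: the two new fields above switch it on, __post_init__ turns off partial send/recv when pipeline parallelism is also active, and the Trainer registers the allreduce hooks. A rough end-to-end sketch under assumptions: the model name is a placeholder, the config wiring follows the run_pretrain_auto.py hunks above, and an actual run still needs a hybrid-parallel paddle.distributed.launch invocation and a dataset:

from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

args = TrainingArguments(
    output_dir="./checkpoints",
    tensor_parallel_degree=2,                # sequence parallel splits activations across the TP group
    sequence_parallel=True,                  # new TrainingArguments field
    fuse_sequence_parallel_allreduce=True,   # new TrainingArguments field
    gradient_accumulation_steps=4,
)

config = AutoConfig.from_pretrained("facebook/llama-7b")  # placeholder model name
# The model config is still filled from the training args, as in the pretrain scripts above.
config.sequence_parallel = args.sequence_parallel
config.fuse_sequence_parallel_allreduce = args.fuse_sequence_parallel_allreduce

model = AutoModelForCausalLM.from_config(config)
tokenizer = AutoTokenizer.from_pretrained("facebook/llama-7b")

# No explicit register_sequence_parallel_allreduce_hooks call: the Trainer now does this
# itself when args.sequence_parallel is True.
trainer = Trainer(model=model, args=args, tokenizer=tokenizer)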
