diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
index 65a640368..a1cdc0566 100644
--- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
+++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
@@ -5,7 +5,8 @@ set -xe
 
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl
-pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
-pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
 pip install tabulate==0.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+
 python -m xacc.install
diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh
index b527e7873..2ec052c25 100755
--- a/training/kunlunxin/glm-pytorch/config/environment_variables.sh
+++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh
@@ -1,21 +1,9 @@
-# =================================================
-# Export variables
-# =================================================
-
 export BKCL_PCIE_RING=1
 export BKCL_TIMEOUT=1800
 
 # when using tree allreduce, the number of nodes must be a multiple of 2
 export BKCL_SOCKET_FORCE_TREE=1
-export XMLIR_D_XPU_L3_SIZE=32505856
-
-export BKCL_CCIX_RING=1
-export BKCL_FORCE_SYNC=1
-
-export ALLREDUCE_ASYNC=false
-export ALLREDUCE_FUSION=0
-
-export XMLIR_F_XPU_FC_GEMM_MODE=float
-export XMLIR_F_FAST_INDEX_PUT=true
+export XMLIR_D_XPU_L3_SIZE=66060288
 
-export XACC_ENABLE=1
+export XACC=1
+export XACC_ARGS="-L O0"
diff --git a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
index 719532ede..861c34850 100644
--- a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
+++ b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
@@ -2,82 +2,60 @@
 from torch import nn
 import torch.distributed as dist
 
+import torch_xmlir
+
 import config
 from optimizers import get_optimizer_param_groups
 from optimizers.loss_scaler import DynamicLossScaler
 from driver.dist_pytorch import main_proc_print
 
-import torch_xmlir
-import torch_xmlir.core.xpu_model as xm
-from torch_xmlir.optimizer import AdamW as Adam
-from torch_xmlir.nn.clip_grad import clip_grad_norm
-from torch_xmlir.distributed import DistributedDataParallel as XPUDDP
-
-from .converter import convert_model as _convert_model
-
-
-class XPUTorchDDP(XPUDDP):
-
-    def named_parameters(self, prefix: str = '', recurse: bool = True):
-        return self.module.named_parameters(prefix=prefix, recurse=recurse)
+from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP
 
-    def state_dict(self, destination=None, prefix='', keep_vars=False):
-        sd = self.module.state_dict(destination, prefix, keep_vars)
-        return sd
+clip_grad_norm = torch.nn.utils.clip_grad_norm_
 
-    def load_state_dict(self, state_dict, strict=True):
-        return self.module.load_state_dict(state_dict, strict=strict)
+from .converter import convert_model as _convert_model
 
 
 def convert_model(model: torch.nn.Module) -> torch.nn.Module:
     return _convert_model(model, config)
 
 
-def create_optimizer(model, args):
-    param_groups = get_optimizer_param_groups(model)
-    optimizer = Adam(param_groups,
-                     lr=args.lr,
-                     weight_decay=args.weight_decay,
-                     betas=(args.adam_beta1, args.adam_beta2),
-                     eps=args.adam_eps)
-    main_proc_print(f'Optimizer = {optimizer.__class__.__name__}')
-    return optimizer
-
-
 def model_to_fp16(model):
     return model
 
 
 def model_to_ddp(model: nn.Module) -> nn.Module:
     if dist.is_available() and dist.is_initialized():
-        model = XPUTorchDDP(model)
+        model = TorchDDP(model)
     return model
 
 
-def create_grad_scaler():
-    return None
-
-
 def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model):
     args = config
 
-    def _clip_grad():
-        if args.clip_grad > 0:
-            clip_grad_norm(model.parameters(), args.clip_grad)
-
-    lm_loss.backward()
-    if step % args.gradient_accumulation_steps == 0:
-        allreduce_grads = reversed(
-            [p.grad.data for p in model.parameters() if p.grad is not None])
-        xm.optimizer_step(optimizer,
-                          barrier=True,
-                          post_allreduce_hook=_clip_grad,
-                          allreduce_average=True,
-                          allreduce_grads=allreduce_grads)
-        lr_scheduler.step()
-
-    if DynamicLossScaler._has_inf_or_nan(reduced_loss):
+    if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
+        backward_step(optimizer, model, lm_loss, args)
+        if step % args.gradient_accumulation_steps == 0:
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad(set_to_none=True)
+    else:
         main_proc_print("Found NaN loss, skip backward")
         torch_xmlir.xpu.empty_cache()
     return reduced_loss
+
+
+def backward_step(optimizer, model, lm_loss, args):
+    """Backward step."""
+
+    # Total loss.
+    loss = lm_loss
+
+    loss.backward()
+
+    # Clipping gradients helps prevent the exploding gradient.
+    if args.clip_grad > 0:
+        clip_grad_norm(model.parameters(), args.clip_grad)
+
+    return lm_loss
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index cc175466d..39372e969 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -106,5 +106,6 @@
     # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2"
     # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "transformer_xl:pytorch:R300:1:8:1": "/raid/dataset/transformer_xl/",
+    # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
 }