update kunlunxin glm config #312

Merged
merged 1 commit on Nov 3, 2023
5 changes: 3 additions & 2 deletions training/kunlunxin/docker_image/pytorch/pytorch_install.sh
@@ -5,7 +5,8 @@ set -xe
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl

-pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
-pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install tabulate==0.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

 python -m xacc.install
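
Note: the newly pinned tabulate package is a plain table-formatting helper. A minimal sketch of the kind of summary table it can print is below; the row contents are illustrative only and are not taken from this PR.

# Minimal sketch: printing a result summary with tabulate 0.9.0.
# The rows below are illustrative; FlagPerf's reporting code is not shown in this diff.
from tabulate import tabulate

rows = [["glm", "pytorch", "R300", 1, 8]]
headers = ["model", "framework", "hardware", "nnodes", "nproc"]
print(tabulate(rows, headers=headers, tablefmt="grid"))
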
18 changes: 3 additions & 15 deletions training/kunlunxin/glm-pytorch/config/environment_variables.sh
@@ -1,21 +1,9 @@
# =================================================
# Export variables
# =================================================

export BKCL_PCIE_RING=1
export BKCL_TIMEOUT=1800
# when using tree allreduce, the number of nodes must be a multiple of 2
export BKCL_SOCKET_FORCE_TREE=1

export XMLIR_D_XPU_L3_SIZE=32505856

export BKCL_CCIX_RING=1
export BKCL_FORCE_SYNC=1

export ALLREDUCE_ASYNC=false
export ALLREDUCE_FUSION=0

export XMLIR_F_XPU_FC_GEMM_MODE=float
export XMLIR_F_FAST_INDEX_PUT=true
export XMLIR_D_XPU_L3_SIZE=66060288

export XACC_ENABLE=1
export XACC=1
export XACC_ARGS="-L O0"
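
Note: the reworked config drops the BKCL/ALLREDUCE tuning knobs and keeps the XMLIR/XACC settings. As a hypothetical sanity check (not part of this PR), a launcher could verify that the script has been sourced before training starts; the variable names and values below mirror the exports shown above.

# Hypothetical pre-flight check (not in this PR): confirm the kunlunxin
# environment_variables.sh exports are visible to the training process.
import os

expected = {
    "XMLIR_F_XPU_FC_GEMM_MODE": "float",
    "XMLIR_F_FAST_INDEX_PUT": "true",
    "XMLIR_D_XPU_L3_SIZE": "66060288",
    "XACC_ARGS": "-L O0",
}
missing = {k: v for k, v in expected.items() if os.environ.get(k) != v}
if missing:
    raise RuntimeError(f"environment_variables.sh not sourced correctly: {missing}")
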
78 changes: 28 additions & 50 deletions training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
@@ -2,82 +2,60 @@
from torch import nn
import torch.distributed as dist

import torch_xmlir

import config
from optimizers import get_optimizer_param_groups
from optimizers.loss_scaler import DynamicLossScaler
from driver.dist_pytorch import main_proc_print

import torch_xmlir
import torch_xmlir.core.xpu_model as xm
from torch_xmlir.optimizer import AdamW as Adam
from torch_xmlir.nn.clip_grad import clip_grad_norm
from torch_xmlir.distributed import DistributedDataParallel as XPUDDP

from .converter import convert_model as _convert_model


class XPUTorchDDP(XPUDDP):

def named_parameters(self, prefix: str = '', recurse: bool = True):
return self.module.named_parameters(prefix=prefix, recurse=recurse)
from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP

def state_dict(self, destination=None, prefix='', keep_vars=False):
sd = self.module.state_dict(destination, prefix, keep_vars)
return sd
clip_grad_norm = torch.nn.utils.clip_grad_norm_

def load_state_dict(self, state_dict, strict=True):
return self.module.load_state_dict(state_dict, strict=strict)
from .converter import convert_model as _convert_model


def convert_model(model: torch.nn.Module) -> torch.nn.Module:
return _convert_model(model, config)


def create_optimizer(model, args):
param_groups = get_optimizer_param_groups(model)
optimizer = Adam(param_groups,
lr=args.lr,
weight_decay=args.weight_decay,
betas=(args.adam_beta1, args.adam_beta2),
eps=args.adam_eps)
main_proc_print(f'Optimizer = {optimizer.__class__.__name__}')
return optimizer


def model_to_fp16(model):
return model


def model_to_ddp(model: nn.Module) -> nn.Module:
if dist.is_available() and dist.is_initialized():
model = XPUTorchDDP(model)
model = TorchDDP(model)
return model


def create_grad_scaler():
return None


def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model):
args = config

def _clip_grad():
if args.clip_grad > 0:
clip_grad_norm(model.parameters(), args.clip_grad)

lm_loss.backward()
if step % args.gradient_accumulation_steps == 0:
allreduce_grads = reversed(
[p.grad.data for p in model.parameters() if p.grad is not None])
xm.optimizer_step(optimizer,
barrier=True,
post_allreduce_hook=_clip_grad,
allreduce_average=True,
allreduce_grads=allreduce_grads)
lr_scheduler.step()

if DynamicLossScaler._has_inf_or_nan(reduced_loss):
if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
backward_step(optimizer, model, lm_loss, args)
if step % args.gradient_accumulation_steps == 0:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad(set_to_none=True)
else:
main_proc_print("Found NaN loss, skip backward")

torch_xmlir.xpu.empty_cache()
return reduced_loss


def backward_step(optimizer, model, lm_loss, args):
"""Backward step."""

# Total loss.
loss = lm_loss

loss.backward()

# Clipping gradients helps prevent the exploding gradient.
if args.clip_grad > 0:
clip_grad_norm(model.parameters(), args.clip_grad)

return lm_loss
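
Note: with this change the adapter relies on the stock TorchDDP wrapper and a plain optimizer.step() instead of xm.optimizer_step with allreduce hooks. A rough sketch of how a training driver might call these hooks follows; the import path, data loop, and forward pass are placeholders, not FlagPerf code.

# Rough sketch of a driver loop exercising the adapter hooks above.
# Only the adapter function names and signatures come from this file;
# the import path, batch handling, and forward pass are placeholders.
import config
from extern import trainer_adapter as adapter

def train(model, dataloader, lr_scheduler):
    model = adapter.convert_model(model)
    model = adapter.model_to_fp16(model)   # currently a no-op, kept for interface parity
    model = adapter.model_to_ddp(model)
    optimizer = adapter.create_optimizer(model, config)
    for step, batch in enumerate(dataloader, start=1):
        lm_loss = model(**batch)           # placeholder forward pass returning a scalar loss
        reduced_loss = lm_loss.detach()
        adapter.backward(step, lm_loss, reduced_loss,
                         optimizer, lr_scheduler, model)
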
1 change: 1 addition & 0 deletions training/run_benchmarks/config/test_conf.py
@@ -106,5 +106,6 @@
# "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2"
# "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
# "transformer_xl:pytorch:R300:1:8:1": "/raid/dataset/transformer_xl/",
# "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
}
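
Note: the new commented-out entry follows the same key format as its neighbours, which appears to encode model, framework, hardware, and node/process counts, mapped to a dataset/checkpoint path. Enabling the case would mean uncommenting it inside the dictionary in test_conf.py; the dictionary name below is an assumption, not shown in this excerpt.

# Hypothetical excerpt of test_conf.py with the kunlunxin GLM case enabled.
# The dict name CASES is an assumption; the key and path come from the diff above.
CASES = {
    "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
}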