diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
index 850a304b4..2a96fe267 100644
--- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
+++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh
@@ -4,3 +4,5 @@ set -xe
 
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl
+
+python -m xacc.install
diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py
index 8a4c96915..cd9afdd40 100644
--- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py
+++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py
@@ -14,4 +14,4 @@ lr_decay_iters = 4338
 log_freq = 1
 seed = 4096
 
-max_samples_termination = 925510
+max_samples_termination = 4000
diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py
index 840259660..fa79a403e 100644
--- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py
+++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py
@@ -14,4 +14,4 @@ lr_decay_iters = 4338
 log_freq = 1
 seed = 4096
 
-max_samples_termination = 2776540
+max_samples_termination = 20000
diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh
index 9c9f20b8e..b527e7873 100755
--- a/training/kunlunxin/glm-pytorch/config/environment_variables.sh
+++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh
@@ -17,3 +17,5 @@ export ALLREDUCE_FUSION=0
 
 export XMLIR_F_XPU_FC_GEMM_MODE=float
 export XMLIR_F_FAST_INDEX_PUT=true
+
+export XACC_ENABLE=1
diff --git a/training/kunlunxin/glm-pytorch/config/requirements.txt b/training/kunlunxin/glm-pytorch/config/requirements.txt
index 8bac0066c..46109702b 100644
--- a/training/kunlunxin/glm-pytorch/config/requirements.txt
+++ b/training/kunlunxin/glm-pytorch/config/requirements.txt
@@ -2,3 +2,4 @@ h5sparse
 boto3
 h5py
 numpy>=1.15.4
+psutil
diff --git a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
index 3ac97e41a..719532ede 100644
--- a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
+++ b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
@@ -1,13 +1,13 @@
 import torch
-import config
-
 from torch import nn
 import torch.distributed as dist
 
+import config
 from optimizers import get_optimizer_param_groups
 from optimizers.loss_scaler import DynamicLossScaler
 from driver.dist_pytorch import main_proc_print
 
+import torch_xmlir
 import torch_xmlir.core.xpu_model as xm
 from torch_xmlir.optimizer import AdamW as Adam
 from torch_xmlir.nn.clip_grad import clip_grad_norm
@@ -79,4 +79,5 @@ def _clip_grad():
 
     if DynamicLossScaler._has_inf_or_nan(reduced_loss):
         main_proc_print("Found NaN loss, skip backward")
+        torch_xmlir.xpu.empty_cache()
     return reduced_loss