From 4c57e4a9a9b302512d1bc41010cdbd8026207845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?= Date: Sun, 26 May 2024 21:02:00 -0400 Subject: [PATCH] Add a reminder for the illegal memory error (#3822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using the GPU version of the neighbor stat code, one may encounter the following issue and the training will stop: ``` [2024-05-24 23:00:42,027] DEEPMD INFO Adjust batch size from 1024 to 2048 [2024-05-24 23:00:42,139] DEEPMD INFO Adjust batch size from 2048 to 4096 [2024-05-24 23:00:42,285] DEEPMD INFO Adjust batch size from 4096 to 8192 [2024-05-24 23:00:42,628] DEEPMD INFO Adjust batch size from 8192 to 16384 [2024-05-24 23:00:43,180] DEEPMD INFO Adjust batch size from 16384 to 32768 [2024-05-24 23:00:44,341] DEEPMD INFO Adjust batch size from 32768 to 65536 [2024-05-24 23:00:46,713] DEEPMD INFO Adjust batch size from 65536 to 131072 2024-05-24 23:00:52.071120: E tensorflow/compiler/xla/stream_executor/cuda/cuda_event.cc:29] Error polling for event status: failed to query event: CUDA_ERROR_ILLEGAL_ADDRESS: an illegal memory access was encountered 2024-05-24 23:00:52.075435: F tensorflow/core/common_runtime/device/device_event_mgr.cc:223] Unexpected Event status: 1 /bin/sh: line 1: 1397100 Aborted ``` This is likely due to an issue in TensorFlow. One may use the environment variable `DP_INFER_BATCH_SIZE` to avoid this issue. This PR reminds the user to set a small `DP_INFER_BATCH_SIZE` to avoid this issue. - **Bug Fixes** - Added a log message to guide users on setting the `DP_INFER_BATCH_SIZE` environment variable to avoid TensorFlow illegal memory access issues. 
--------- Signed-off-by: Yifan Li李一帆 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> (cherry picked from commit d754672955aa683310a761b8e1292e6192ea116f) Signed-off-by: Jinzhe Zeng --- deepmd_utils/utils/batch_size.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deepmd_utils/utils/batch_size.py b/deepmd_utils/utils/batch_size.py index 1b93a51242..8a60cc69fb 100644 --- a/deepmd_utils/utils/batch_size.py +++ b/deepmd_utils/utils/batch_size.py @@ -62,6 +62,11 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: self.maximum_working_batch_size = initial_batch_size if self.is_gpu_available(): self.minimal_not_working_batch_size = 2**31 + log.info( + "If you encounter the error 'an illegal memory access was encountered', this may be due to a TensorFlow issue. " + "To avoid this, set the environment variable DP_INFER_BATCH_SIZE to a smaller value than the last adjusted batch size. " + "The environment variable DP_INFER_BATCH_SIZE controls the inference batch size (nframes * natoms). " + ) else: self.minimal_not_working_batch_size = ( self.maximum_working_batch_size + 1