diff --git a/docs/env_var.md b/docs/env_var.md index c0abc5b70..81ac526fc 100644 --- a/docs/env_var.md +++ b/docs/env_var.md @@ -83,7 +83,9 @@ Contains the path to the directory where metrics will be recorded for consumptio This is relevant only in SageMaker environment, where this variable points to a pre-defined location. -#### `TRAINING_END_DELAY_REFRESH`: +**Note**: The environment variables below are applicable for versions > 0.4.14 + +#### `SMDEBUG_TRAINING_END_DELAY_REFRESH`: During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This directory contains collections, events, and index files. This environment variable @@ -91,10 +93,18 @@ specifies how many seconds to wait before refreshing the index files to check if and the tensor is available. By default value, this value is set to 1. -#### `INCOMPLETE_STEP_WAIT_WINDOW`: +#### `SMDEBUG_INCOMPLETE_STEP_WAIT_WINDOW`: During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This directory contains collections, events, and index files. A trial checks to see if a step specified in the smdebug hook has been completed. This environment variable specifies the maximum number of incomplete steps that the trial will wait for before marking half of them as complete. Default: 1000 + + +#### `SMDEBUG_MISSING_EVENT_FILE_RETRY_LIMIT`: + +During analysis, a [trial](analysis.md) is created to query for tensors from a specified directory. This +directory contains collections, events, and index files. All the tensor data is stored in the event files. +When tensor data contained in an event file that is not available has been requested, this variable specifies +the number of times we retry the request.
diff --git a/smdebug/core/config_constants.py b/smdebug/core/config_constants.py index d47c127a8..13de9df87 100644 --- a/smdebug/core/config_constants.py +++ b/smdebug/core/config_constants.py @@ -29,11 +29,13 @@ LATEST_MODE_STEP = "latest-mode-step" TRAINING_RUN = "training-run" -INCOMPLETE_STEP_WAIT_WINDOW_KEY = "INCOMPLETE_STEP_WAIT_WINDOW" +INCOMPLETE_STEP_WAIT_WINDOW_KEY = "SMDEBUG_INCOMPLETE_STEP_WAIT_WINDOW" INCOMPLETE_STEP_WAIT_WINDOW_DEFAULT = 1000 -DEFAULT_EVENT_FILE_RETRY_LIMIT = 100 -TRAINING_END_DELAY_REFRESH_KEY = "TRAINING_END_DELAY_REFRESH" +MISSING_EVENT_FILE_RETRY_LIMIT_KEY = "SMDEBUG_MISSING_EVENT_FILE_RETRY_LIMIT" +MISSING_EVENT_FILE_RETRY_LIMIT = 100 + +TRAINING_END_DELAY_REFRESH_KEY = "SMDEBUG_TRAINING_END_DELAY_REFRESH" TRAINING_END_DELAY_REFRESH_DEFAULT = 1 CALLABLE_CACHE_ENV_VAR = "SMDEBUG_KERAS_CALLABLE_CACHE_TYPE" diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 8123da353..f591080e4 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -12,7 +12,10 @@ # First Party from smdebug.core.access_layer.s3handler import ReadObjectRequest, S3Handler from smdebug.core.access_layer.utils import has_training_ended -from smdebug.core.config_constants import DEFAULT_EVENT_FILE_RETRY_LIMIT +from smdebug.core.config_constants import ( + MISSING_EVENT_FILE_RETRY_LIMIT, + MISSING_EVENT_FILE_RETRY_LIMIT_KEY, +) from smdebug.core.locations import IndexFileLocationUtils, TensorLocation from smdebug.core.logger import get_logger from smdebug.core.modes import ModeKeys @@ -100,7 +103,7 @@ class IndexReader(ABC): def __init__(self, path): self.event_file_retry_limit = os.getenv( - "TORNASOLE_EVENT_FILE_RETRY_LIMIT", DEFAULT_EVENT_FILE_RETRY_LIMIT + MISSING_EVENT_FILE_RETRY_LIMIT_KEY, MISSING_EVENT_FILE_RETRY_LIMIT ) self.path = path self.logger = get_logger() diff --git a/tests/analysis/trials/test_has_passed_step_scenarios.py b/tests/analysis/trials/test_has_passed_step_scenarios.py index 
a7c911002..559a1fb7b 100644 --- a/tests/analysis/trials/test_has_passed_step_scenarios.py +++ b/tests/analysis/trials/test_has_passed_step_scenarios.py @@ -1,10 +1,10 @@ # Standard Library -import os # Third Party import pytest # First Party +from smdebug.core.config_constants import INCOMPLETE_STEP_WAIT_WINDOW_KEY from smdebug.core.tensor import StepState from smdebug.exceptions import NoMoreData, StepUnavailable from smdebug.trials import create_trial @@ -419,7 +419,7 @@ def test_three_writers_not_all_steps_written_but_later_step_written_complete_job @pytest.mark.slow -def test_override_if_too_many_steps_skipped(): +def test_override_if_too_many_steps_skipped(monkeypatch): """Test Scenario Description" workers : [a,b,c] steps :{ @@ -449,7 +449,7 @@ def test_override_if_too_many_steps_skipped(): window is smaller than the set threshold """ - os.environ["INCOMPLETE_STEP_WAIT_WINDOW"] = "10" + monkeypatch.setenv(INCOMPLETE_STEP_WAIT_WINDOW_KEY, "10") path = "s3://smdebug-testing/resources/has_step_scenarios/too-many-steps-skipped" trial = create_trial(path) @@ -487,8 +487,6 @@ def test_override_if_too_many_steps_skipped(): == "resources/has_step_scenarios/too-many-steps-skipped/index/000000000/000000000009_worker_2.json" ) - del os.environ["INCOMPLETE_STEP_WAIT_WINDOW"] - @pytest.mark.slow def test_partially_written_tensors():