Commit d354cef
Add ProfilerSystemMetricFileParser and basic tests (#16)
* Add ProfilerSystemMetricFileParser and basic tests
* Refactor MetricsReaderBase class
* Fix timestamp to event files mapping for both MetricsReader and SystemMetricsReader
* Rename MetricsReader to AlgorithmMetricsReader
1 parent 0c9f790 commit d354cef

17 files changed: +854 −124 lines

examples/profiler/README.md

Lines changed: 22 additions & 0 deletions
````diff
@@ -0,0 +1,22 @@
+# Overview
+SageMaker Debugger Profiler provides better insights for training jobs. Customers can use SystemMetricsReader to monitor
+system metrics (CPU, GPU, etc.) and find issues during training.
+
+# Examples
+
+The code snippets below show how to use SystemMetricsReader in local mode and S3 mode.
+
+## Example for reading system profiler metrics locally
+```
+from smdebug.profiler.SystemMetricsReader import SystemLocalMetricsReader
+lt = SystemLocalMetricsReader('/localpath/profiler-output')
+events = lt.get_events(1591100000, 1692300000, unit=TimeUnits.SECONDS)
+```
+
+## Example for reading system profiler metrics from S3
+```
+from smdebug.profiler.SystemMetricsReader import SystemS3MetricsReader
+s3Path = "s3://bucket/prefix/trainingjob_name/profiler-output"
+tt = SystemS3MetricsReader(s3Path)
+events = tt.get_events(1591100000, 1692300000, unit=TimeUnits.SECONDS)
+```
````
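As written, both snippets assume `TimeUnits` has already been imported; elsewhere in this commit it comes from `smdebug.profiler.utils`. A minimal sketch of the same query expressed in microseconds, using the `lt` reader from above (`TimeUnits.MICROSECONDS` appears in the reader code later in this commit):

```python
from smdebug.profiler.utils import TimeUnits

# Same time range as above, expressed in microseconds since the epoch.
start_us = 1591100000 * 1000 * 1000
end_us = 1692300000 * 1000 * 1000
events = lt.get_events(start_us, end_us, unit=TimeUnits.MICROSECONDS)
```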

smdebug/core/utils.py

Lines changed: 37 additions & 0 deletions
```diff
@@ -416,6 +416,20 @@ def get_node_id_from_tracefilename(filename: str) -> str:
     return filename.split("_")[1] if is_valid_tracefilename(filename) else ""
 
 
+def get_node_id_from_system_profiler_filename(filename: str) -> str:
+    """
+    The system metric file has a file name format:
+    /profiler-output/system/incremental/{$TIMESTAMP}.{$NODE_ID}.json
+    Example: /profiler-output/system/incremental/2020060500/1591160699.algo-1.json
+
+    The function extracts and returns the {$NODE_ID} from the filename.
+    """
+    if validate_system_profiler_file(filename):
+        filename = filename.split("/")[-1]
+        return filename.split(".")[1]
+    return None
+
+
 def get_timestamp_from_tracefilename(filename) -> int:
     """
     The tracefile has a file name format:
@@ -428,3 +442,26 @@ def get_timestamp_from_tracefilename(filename) -> int:
     """
     filename = filename.split("/")[-1]
     return int(filename.split("_")[0] if is_valid_tracefilename(filename) else "0")
+
+
+def get_utctimestamp_us_since_epoch_from_system_profiler_file(filename) -> int:
+    """
+    The system metric file has a file name format:
+    <training job name>/profiler-output/system/incremental/<timestamp of full minute>.<algo-n>.json
+    Example: /profiler-output/system/incremental/2020060500/1591160699.algo-1.json
+
+    The function extracts and returns the <timestamp of full minute> in microseconds from the filename.
+    """
+    if validate_system_profiler_file(filename):
+        filename = filename.split("/")[-1]
+        return int(filename.split(".")[0]) * 1000 * 1000
+    return None
+
+
+def validate_system_profiler_file(filename) -> bool:
+    filename_regex = re.compile(".+/system/.+/(\d{10}).algo-\d+.json")
+    stamp = re.match(filename_regex, filename)
+    if stamp is None:
+        logger.debug(f"Invalid System Profiler File Found: {filename}, not able to get timestamp.")
+        return False
+    return True
```
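A quick sanity check of the three new helpers, using the example path from their docstrings (this assumes the helpers are importable from `smdebug.core.utils` as defined above):

```python
from smdebug.core.utils import (
    get_node_id_from_system_profiler_filename,
    get_utctimestamp_us_since_epoch_from_system_profiler_file,
    validate_system_profiler_file,
)

path = "/profiler-output/system/incremental/2020060500/1591160699.algo-1.json"

assert validate_system_profiler_file(path) is True
assert get_node_id_from_system_profiler_filename(path) == "algo-1"
# 1591160699 seconds -> microseconds since the epoch
assert get_utctimestamp_us_since_epoch_from_system_profiler_file(path) == 1591160699 * 1000 * 1000
```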

smdebug/profiler/MetricsReader.py renamed to smdebug/profiler/AlgorithmMetricsReader.py

Lines changed: 23 additions & 118 deletions
```diff
@@ -6,60 +6,34 @@
 
 # First Party
 from smdebug.core.access_layer.s3handler import ListRequest, ReadObjectRequest, S3Handler, is_s3
-from smdebug.core.logger import get_logger
-from smdebug.core.utils import (
-    get_node_id_from_tracefilename,
-    get_timestamp_from_tracefilename,
-    list_files_in_directory,
-)
+from smdebug.core.utils import get_node_id_from_tracefilename, get_timestamp_from_tracefilename
+from smdebug.profiler.MetricsReaderBase import MetricsReaderBase
 from smdebug.profiler.profiler_constants import (
     DEFAULT_PREFIX,
     ENV_TIME_BUFFER,
-    ENV_TRAIILING_DURATION,
     HOROVODTIMELINE_PREFIX,
     MODELTIMELINE_SUFFIX,
     PYTHONTIMELINE_SUFFIX,
     TENSORBOARDTIMELINE_SUFFIX,
     TIME_BUFFER_DEFAULT,
-    TRAILING_DURATION_DEFAULT,
 )
 from smdebug.profiler.tf_profiler_parser import (
     HorovodProfilerEvents,
     SMProfilerEvents,
     TensorboardProfilerEvents,
 )
-from smdebug.profiler.utils import TimeUnits, convert_utc_timestamp_to_microseconds
 
 
-class MetricsReader:
+class AlgorithmMetricsReader(MetricsReaderBase):
     def __init__(self):
+        super().__init__()
         self.prefix = DEFAULT_PREFIX
-        self.logger = get_logger("smdebug-profiler")
         self._SMEventsParser = SMProfilerEvents()
         self._TBEventsParser = TensorboardProfilerEvents()
         self._HorovordEventsParser = HorovodProfilerEvents()
-        # This is a set of parsed event files. The entry is made into this file only if the complete file is read.
-        self._parsed_files = set()
-        self._timestamp_to_filename = dict()
-
-        # The startAfter_prefix is used in ListPrefix call to poll for available tracefiles in the S3 bucket. The
-        # prefix lags behind the last polled tracefile by tunable trailing duration. This is to ensure that we do not
-        # miss a
-        # tracefile corresponding to timestamp earlier than last polled timestamp but arrived after we had polled.
 
-        self._startAfter_prefix = ""
-
-    """
-    The function returns the timestamp of last available file.
-    This timestamp indicates users can query the events up to this timestamp to gauge
-    """
-
-    def get_timestamp_of_latest_available_file(self):
-        return (
-            sorted(self._timestamp_to_filename.keys())[-1]
-            if len(self._timestamp_to_filename) > 0
-            else 0
-        )
+    def _get_all_event_parsers(self):
+        return [self._SMEventsParser, self._TBEventsParser, self._HorovordEventsParser]
 
     """
     The following function returns the time range for which the tracefiles are currently available in S3 or local
@@ -81,7 +55,7 @@ def get_current_time_range_for_event_query(self):
     Those files might contain 'B' type events that had started prior to 'start'
     """
 
-    def _get_trace_files_in_the_range(
+    def _get_event_files_in_the_range(
         self, start_time_microseconds, end_time_microseconds, use_buffer=True
     ):
         # increase the time range using TIME_BUFFER_DEFAULT
@@ -117,15 +91,15 @@ def _get_trace_files_in_the_range(
 
         # Find the timestamp that is greater than or equal start_time_microseconds. The tracefile corresponding to
         # that timestamp will contain events that are active during start_time_microseconds
-        lower_bound_timestamp = bisect.bisect_left(timestamps, start_time_microseconds)
+        lower_bound_timestamp_index = bisect.bisect_left(timestamps, start_time_microseconds)
 
         # Find the timestamp that is immediate right to the end_time_microseconds. The tracefile corresponding to
         # that timestamp will contain events that are active during end_time_microseconds.
-        upper_bound_timestamp = bisect.bisect_left(timestamps, end_time_microseconds)
+        upper_bound_timestamp_index = bisect.bisect_left(timestamps, end_time_microseconds)
 
         event_files = list()
-        for index in timestamps[lower_bound_timestamp : upper_bound_timestamp + 1]:
-            event_files.append(self._timestamp_to_filename[index])
+        for index in timestamps[lower_bound_timestamp_index : upper_bound_timestamp_index + 1]:
+            event_files.extend(self._timestamp_to_filename[index])
         return event_files
 
     """
```
```diff
@@ -143,52 +117,21 @@ def _get_event_parser(self, filename):
             return self._SMEventsParser
         if TENSORBOARDTIMELINE_SUFFIX in filename:
             return self._TBEventsParser
-        if HorovodProfilerEvents in filename:
+        if HOROVODTIMELINE_PREFIX in filename:
             return self._HorovordEventsParser
 
-    """
-    This function queries the files that are currently available in the directory (for local mode) or in S3 for download.
-    It rebuilds the map of timestamp to filename.
-    """
+    def _get_timestamp_from_filename(self, event_file):
+        return get_timestamp_from_tracefilename(event_file)
+
+    def _get_event_file_regex(self):
+        return r"(.+)\.(json|csv)$"
 
-    def refresh_event_file_list(self):
-        pass
 
+class LocalAlgorithmMetricsReader(AlgorithmMetricsReader):
     """
-    The function returns the events that have recorded within the given time range.
-    The function will download (or parse) the tracefiles that are available
-    for the given time range. It is possible that events are recorded during training but are not available for
-    download.
-    TODO: Implement blocking call to wait for files to be available for download.
+    The metrics reader is created with root folder in which the tracefiles are stored.
     """
 
-    def get_events(self, start_time, end_time, unit=TimeUnits.MICROSECONDS):
-        start_time = convert_utc_timestamp_to_microseconds(start_time, unit)
-        end_time = convert_utc_timestamp_to_microseconds(end_time, unit)
-
-        event_files = self._get_trace_files_in_the_range(start_time, end_time)
-
-        # Download files and parse the events
-        self.parse_event_files(event_files)
-
-        """
-        We might have recorded events from different sources within this timerange.
-        we will get the events from the relevant event parsers and merge them before returning.
-        """
-        result = []
-        for eventParser in [self._SMEventsParser, self._TBEventsParser, self._HorovordEventsParser]:
-            range_events = eventParser.get_events_within_time_range(
-                start_time, end_time, unit=TimeUnits.MICROSECONDS
-            )
-            result.extend(range_events)
-
-        return result
-
-    def parse_event_files(self, event_files):
-        pass
-
-
-class LocalMetricsReader(MetricsReader):
     def __init__(self, trace_root_folder):
         self.trace_root_folder = trace_root_folder
         super().__init__()
```
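`MetricsReaderBase` itself is introduced in another file of this commit and is not shown in this diff. The hooks visible above (`_get_all_event_parsers`, `_get_timestamp_from_filename`, `_get_event_file_regex`) suggest a template-method layout roughly along these lines (a sketch of the assumed pattern, not the actual base class):

```python
class MetricsReaderBaseSketch:
    """Sketch only: shared bookkeeping lives in the base class."""

    def __init__(self):
        # timestamp -> list of event files, shared by local and S3 readers
        self._timestamp_to_filename = {}
        self._parsed_files = set()

    # Hooks each concrete reader overrides:
    def _get_all_event_parsers(self):
        raise NotImplementedError

    def _get_timestamp_from_filename(self, event_file):
        raise NotImplementedError

    def _get_event_file_regex(self):
        raise NotImplementedError
```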
```diff
@@ -200,27 +143,13 @@ def __init__(self, trace_root_folder):
     """
 
     def refresh_event_file_list(self):
-        path = os.path.expanduser(self.trace_root_folder)
-        event_dir = os.path.join(path, DEFAULT_PREFIX, "")
-        event_regex = r"(.+)\.(json|csv)$"
-        event_files = list_files_in_directory(event_dir, file_regex=event_regex)
-        for event_file in event_files:
-            timestamp = get_timestamp_from_tracefilename(event_file)
-            self._timestamp_to_filename[timestamp] = event_file
-
-    """
-    The function opens and reads the event files if they are not already parsed.
-    For local metrics reader, we are currently assuming that the downloaded event file is a complete file.
-    """
+        self._refresh_event_file_list_local_mode(self.trace_root_folder)
 
     def parse_event_files(self, event_files):
-        for event_file in event_files:
-            if event_file not in self._parsed_files:
-                self._get_event_parser(event_file).read_events_from_file(event_file)
-                self._parsed_files.add(event_file)
+        self._parse_event_files_local_mode(event_files)
 
 
-class S3MetricsReader(MetricsReader):
+class S3AlgorithmMetricsReader(AlgorithmMetricsReader):
     """
     The s3_trial_path points to a s3 folder in which the tracefiles are stored. e.g.
     s3://my_bucket/experiment_base_folder
@@ -273,28 +202,4 @@ def refresh_event_file_list(self):
             Prefix=self.prefix,
             StartAfter=self._startAfter_prefix if self._startAfter_prefix else self.prefix,
         )
-        event_files = [x for x in S3Handler.list_prefix(list_dir) if "json" in x]
-        for event_file in event_files:
-            timestamp = get_timestamp_from_tracefilename(event_file)
-            self._timestamp_to_filename[timestamp] = f"s3://{self.bucket_name}/{event_file}"
-        self.update_start_after_prefix()
-
-    """
-    It is possible that tracefiles from different nodes to arrive in S3 in different order. For example, Even if t1
-    > t2, a tracefile with timestamp "t1" can arrive in S3 before the tracefile with timestamp "t2". If we list the
-    prefix only on the basis of last arrived file (i.e. t1) we will miss the file for t2. Therefore, we will set the
-    start prefix to a timestamp that is trailing behind the last timestamp by 'trailing duration'. This will ensure
-    that we will attempt to get tracefiles with older timestamp even if they arrive late.
-    """
-
-    def update_start_after_prefix(self):
-        trailiing_duration = os.getenv(ENV_TRAIILING_DURATION, TRAILING_DURATION_DEFAULT)
-        sorted_timestamps = sorted(self._timestamp_to_filename.keys())
-        last_timestamp_available = sorted_timestamps[-1]
-        trailing_timestamp = last_timestamp_available - trailiing_duration
-        # Get the timestamp that is closely matching the trailing_timestamp
-        trailing_timestamp = sorted_timestamps[
-            bisect.bisect_left(sorted_timestamps, trailing_timestamp)
-        ]
-        self._startAfter_prefix = self._timestamp_to_filename[trailing_timestamp]
-        s3, bucket_name, self._startAfter_prefix = is_s3(self._startAfter_prefix)
+        self._refresh_event_file_list_s3_mode(list_dir)
```
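After the rename, callers switch to the new class names. A minimal usage sketch (assuming `get_events` now lives on `MetricsReaderBase`, as the deleted implementation above suggests, and using a hypothetical local path):

```python
from smdebug.profiler.AlgorithmMetricsReader import (
    LocalAlgorithmMetricsReader,
    S3AlgorithmMetricsReader,
)
from smdebug.profiler.utils import TimeUnits

# Hypothetical trace root; the constructor signature matches the diff above.
reader = LocalAlgorithmMetricsReader("/localpath/trace-output")
events = reader.get_events(1591100000, 1591200000, unit=TimeUnits.SECONDS)
```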
