
Commit f5c3cae

Rextilne authored and committed
Slurm plugin: add logic to prevent scale-up and to self-terminate POWER_SAVE instances when clustermgtd is down
* Computemgtd: Add logic to consider the self node down if it appears to be in power saving. When a node appears to be in power saving but the backing instance is up, the node is not correctly attached to the scheduler; computemgtd should consider the self node down and self-terminate the instance in this case.
* ResumeProgram: Add logic to prevent launching new instances when unable to retrieve a valid clustermgtd heartbeat.
* SuspendProgram: Print an additional warning message when unable to retrieve a valid clustermgtd heartbeat.
* Fix unit tests for the above changes.

Signed-off-by: Rex <shuningc@amazon.com>
1 parent d338e38 commit f5c3cae

14 files changed: +208 -99 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ This file is used to list changes made in each version of the aws-parallelcluste
 
 **CHANGES**
 - Use inclusive language in internal naming convention.
+- Improve error handling in slurm plugin processes when clustermgtd is down.
 
 2.10.0
 -----

src/slurm_plugin/clustermgtd.py

Lines changed: 2 additions & 1 deletion
@@ -432,7 +432,8 @@ def manage_cluster(self):
 
     def _write_timestamp_to_file(self):
         """Write timestamp into shared file so compute nodes can determine if head node is online."""
-        with open(self._config.heartbeat_file_path, "w") as timestamp_file:
+        # Make clustermgtd heartbeat readable to all users
+        with open(os.open(self._config.heartbeat_file_path, os.O_WRONLY | os.O_CREAT, 0o644), "w") as timestamp_file:
             # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
             # datetime.strptime will not work with str(datetime)
             timestamp_file.write(datetime.now(tz=timezone.utc).strftime(TIMESTAMP_FORMAT))
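
The permission change above makes the heartbeat file readable by the other plugin processes, which do not necessarily run as the same user as clustermgtd. A minimal standalone sketch of the same write pattern; the path is a placeholder and the format string only stands in for the module's TIMESTAMP_FORMAT constant (both are assumptions for illustration):

import os
from datetime import datetime, timezone

HEARTBEAT_PATH = "/tmp/clustermgtd_heartbeat"  # placeholder path, not the plugin's real location
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"    # assumed stand-in for the module constant

# os.open accepts an explicit creation mode (0o644), which plain open() does not;
# the mode is applied only when the file is first created and is still masked by umask.
fd = os.open(HEARTBEAT_PATH, os.O_WRONLY | os.O_CREAT, 0o644)
with open(fd, "w") as timestamp_file:
    timestamp_file.write(datetime.now(tz=timezone.utc).strftime(TIMESTAMP_FORMAT))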

src/slurm_plugin/common.py

Lines changed: 33 additions & 1 deletion
@@ -15,7 +15,7 @@
 import json
 import logging
 import subprocess
-from datetime import timezone
+from datetime import datetime, timezone
 
 import boto3
 from botocore.exceptions import ClientError
@@ -422,3 +422,35 @@ def retrieve_instance_type_mapping(file_path):
             e,
         )
         raise
+
+
+def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
+    """Get clustermgtd's last heartbeat."""
+    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
+        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
+        # datetime.strptime will not work with str(datetime)
+        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
+        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
+
+
+def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
+    """Test if clustermgtd heartbeat is expired."""
+    if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
+        logger.error(
+            "Clustermgtd has been offline since %s. Current time is %s. Timeout of %s seconds has expired!",
+            last_heartbeat,
+            current_time,
+            clustermgtd_timeout,
+        )
+        return True
+    return False
+
+
+def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path):
+    try:
+        last_heartbeat = _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path)
+        logger.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
+        return not _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout)
+    except Exception as e:
+        logger.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
+        return False
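
The new is_clustermgtd_heartbeat_valid helper is the piece shared by the three callers in the files below. A short usage sketch, assuming the function is imported exactly as in those diffs; the 300-second timeout mirrors the resume/suspend default added below, while the heartbeat path here is a placeholder:

from datetime import datetime, timezone

from slurm_plugin.common import is_clustermgtd_heartbeat_valid

current_time = datetime.now(tz=timezone.utc)
# The helper returns False both when the heartbeat is older than the timeout and when the
# heartbeat file cannot be read or parsed, so callers fail safe in either case.
if not is_clustermgtd_heartbeat_valid(current_time, 300, "/tmp/clustermgtd_heartbeat"):
    # e.g. skip launching instances, or start the self-termination checks
    print("clustermgtd heartbeat missing or stale")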

src/slurm_plugin/computemgtd.py

Lines changed: 21 additions & 46 deletions
@@ -12,6 +12,7 @@
 
 import logging
 import os
+import time
 from datetime import datetime, timezone
 from logging.config import fileConfig
 from subprocess import CalledProcessError
@@ -23,7 +24,7 @@
 from common.schedulers.slurm_commands import get_nodes_info
 from common.time_utils import seconds
 from common.utils import check_command_output, sleep_remaining_loop_time
-from slurm_plugin.common import CONFIG_FILE_DIR, TIMESTAMP_FORMAT, InstanceManager, log_exception, time_is_up
+from slurm_plugin.common import CONFIG_FILE_DIR, InstanceManager, is_clustermgtd_heartbeat_valid, log_exception
 
 LOOP_TIME = 60
 RELOAD_CONFIG_ITERATIONS = 10
@@ -38,8 +39,8 @@ class ComputemgtdConfig:
         "max_retry": 1,
         "loop_time": LOOP_TIME,
         "proxy": "NONE",
-        "clustermgtd_timeout": 600,
         "disable_computemgtd_actions": False,
+        "clustermgtd_timeout": 600,
         "slurm_nodename_file": os.path.join(CONFIG_FILE_DIR, "slurm_nodename"),
         "logging_config": os.path.join(
             os.path.dirname(__file__), "logging", "parallelcluster_computemgtd_logging.conf"
@@ -61,7 +62,7 @@ def _get_config(self, config_file_path):
         try:
             config.read_file(open(config_file_path, "r"))
         except IOError:
-            log.error(f"Cannot read cluster manager configuration file: {config_file_path}")
+            log.error(f"Cannot read computemgtd configuration file: {config_file_path}")
             raise
 
         # Get config settings
@@ -115,51 +116,34 @@ def _self_terminate(computemgtd_config):
         computemgtd_config.region, computemgtd_config.cluster_name, computemgtd_config.boto3_config
     )
     self_instance_id = check_command_output("curl -s http://169.254.169.254/latest/meta-data/instance-id", shell=True)
+    # Sleep for 10 seconds so termination log entries are uploaded to CW logs
+    log.info("Preparing to self terminate the instance %s in 10 seconds!", self_instance_id)
+    time.sleep(10)
     log.info("Self terminating instance %s now!", self_instance_id)
     instance_manager.delete_instances([self_instance_id], terminate_batch_size=1)
 
 
-def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
-    """Get clustermgtd's last heartbeat."""
-    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
-        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
-        # datetime.strptime will not work with str(datetime)
-        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
-        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
-
-
-def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
-    """Test if clustermgtd heartbeat is expired."""
-    if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
-        log.error(
-            "Clustermgtd has been offline since %s. Current time is %s. Timeout of %s seconds has expired!",
-            last_heartbeat,
-            current_time,
-            clustermgtd_timeout,
-        )
-        return True
-    return False
-
-
 @retry(stop_max_attempt_number=3, wait_fixed=1500)
 def _get_nodes_info_with_retry(nodes):
     return get_nodes_info(nodes)
 
 
 def _is_self_node_down(self_nodename):
     """
-    Check if self node is down in slurm.
+    Check if self node is healthy according to the scheduler.
 
-    This check prevents termination of a node that is still well-attached to the scheduler.
-    Note: node that is not attached to the scheduler will be in DOWN* after SlurmdTimeout.
+    Node is considered healthy if:
+    1. Node is not in DOWN
+    2. Node is not in POWER_SAVE
+    Note: node that is incorrectly attached to the scheduler will be in DOWN* after SlurmdTimeout.
     """
     try:
         self_node = _get_nodes_info_with_retry(self_nodename)[0]
         log.info("Current self node state %s", self_node.__repr__())
-        if self_node.is_down():
-            log.warning("Node is in DOWN state, preparing for self termination...")
+        if self_node.is_down() or self_node.is_power():
+            log.warning("Node is incorrectly attached to scheduler, preparing for self termination...")
            return True
-        log.info("Node is not in a DOWN state and correctly attached to scheduler, not terminating...")
+        log.info("Node is correctly attached to scheduler, not terminating...")
         return False
     except Exception as e:
         # This could happen if slurmctld is down completely
@@ -168,13 +152,6 @@ def _is_self_node_down(self_nodename):
         return True
 
 
-def _fail_self_check(last_heartbeat, current_time, computemgtd_config):
-    """Determine if self checks are failing and if the node should self-terminate."""
-    return _expired_clustermgtd_heartbeat(
-        last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout
-    ) and _is_self_node_down(computemgtd_config.nodename)
-
-
 def _load_daemon_config():
     # Get program config
     computemgtd_config = ComputemgtdConfig(os.path.join(COMPUTEMGTD_CONFIG_PATH))
@@ -208,17 +185,15 @@ def _run_computemgtd():
             reload_config_counter -= 1
 
         # Check heartbeat
-        try:
-            last_heartbeat = _get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path)
-            log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
-        except Exception as e:
-            log.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
-        finally:
+        if not is_clustermgtd_heartbeat_valid(
+            current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path
+        ):
            if computemgtd_config.disable_computemgtd_actions:
                 log.info("All computemgtd actions currently disabled")
-            elif _fail_self_check(last_heartbeat, current_time, computemgtd_config):
+            elif _is_self_node_down(computemgtd_config.nodename):
                 _self_terminate(computemgtd_config)
-            sleep_remaining_loop_time(computemgtd_config.loop_time, current_time)
+
+        sleep_remaining_loop_time(computemgtd_config.loop_time, current_time)
 
 
 @retry(wait_fixed=seconds(LOOP_TIME))
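
The net effect of the computemgtd changes is that self-termination now requires two independent failures: a stale (or unreadable) clustermgtd heartbeat and a node that looks detached from Slurm. A condensed sketch of that decision, reusing is_clustermgtd_heartbeat_valid from slurm_plugin.common and the _is_self_node_down helper defined in the diff above; _should_self_terminate is a hypothetical name introduced only for this illustration:

from datetime import datetime, timezone

from slurm_plugin.common import is_clustermgtd_heartbeat_valid


def _should_self_terminate(computemgtd_config):
    """Hypothetical summary of the loop body above: terminate only when BOTH checks fail."""
    current_time = datetime.now(tz=timezone.utc)
    if is_clustermgtd_heartbeat_valid(
        current_time,
        computemgtd_config.clustermgtd_timeout,
        computemgtd_config.clustermgtd_heartbeat_file_path,
    ):
        # clustermgtd is alive, so it stays responsible for handling unhealthy nodes
        return False
    if computemgtd_config.disable_computemgtd_actions:
        return False
    # Heartbeat is stale or unreadable: terminate only if the node also looks detached
    # from Slurm, i.e. it is reported as DOWN or POWER_SAVE (see _is_self_node_down above).
    return _is_self_node_down(computemgtd_config.nodename)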

src/slurm_plugin/resume.py

Lines changed: 35 additions & 7 deletions
@@ -12,14 +12,21 @@
 
 import logging
 import os
+from datetime import datetime, timezone
 from logging.config import fileConfig
 
 import argparse
 from botocore.config import Config
 from configparser import ConfigParser
 
 from common.schedulers.slurm_commands import get_nodes_info, set_nodes_down
-from slurm_plugin.common import CONFIG_FILE_DIR, InstanceManager, print_with_count, retrieve_instance_type_mapping
+from slurm_plugin.common import (
+    CONFIG_FILE_DIR,
+    InstanceManager,
+    is_clustermgtd_heartbeat_valid,
+    print_with_count,
+    retrieve_instance_type_mapping,
+)
 
 log = logging.getLogger(__name__)
 
@@ -29,6 +36,7 @@ class SlurmResumeConfig:
         "max_retry": 1,
         "max_batch_size": 500,
         "update_node_address": True,
+        "clustermgtd_timeout": 300,
         "proxy": "NONE",
         "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_resume_logging.conf"),
         "hosted_zone": None,
@@ -79,6 +87,12 @@ def _get_config(self, config_file_path):
             "slurm_resume", "instance_type_mapping", fallback=self.DEFAULTS.get("instance_type_mapping")
         )
         self.instance_name_type_mapping = retrieve_instance_type_mapping(instance_name_type_mapping_file)
+        self.clustermgtd_timeout = config.getint(
+            "slurm_resume",
+            "clustermgtd_timeout",
+            fallback=self.DEFAULTS.get("clustermgtd_timeout"),
+        )
+        self.clustermgtd_heartbeat_file_path = config.get("slurm_resume", "clustermgtd_heartbeat_file_path")
 
         # Configure boto3 to retry 1 times by default
         self._boto3_retry = config.getint("slurm_resume", "boto3_retry", fallback=self.DEFAULTS.get("max_retry"))
@@ -115,6 +129,19 @@ def _handle_failed_nodes(node_list)
 
 def _resume(arg_nodes, resume_config):
     """Launch new EC2 nodes according to nodes requested by slurm."""
+    # Check heartbeat
+    current_time = datetime.now(tz=timezone.utc)
+    if not is_clustermgtd_heartbeat_valid(
+        current_time, resume_config.clustermgtd_timeout, resume_config.clustermgtd_heartbeat_file_path
+    ):
+        log.error(
+            "No valid clustermgtd heartbeat detected, clustermgtd is down!\n"
+            "Please check clustermgtd log for error.\n"
+            "Not launching nodes %s",
+            arg_nodes,
+        )
+        _handle_failed_nodes(arg_nodes)
+        return
     log.info("Launching EC2 instances for the following Slurm nodes: %s", arg_nodes)
     node_list = [node.name for node in get_nodes_info(arg_nodes)]
     log.debug("Retrieved nodelist: %s", node_list)
@@ -148,6 +175,13 @@
 
 
 def main():
+    default_log_file = "/var/log/parallelcluster/slurm_resume.log"
+    logging.basicConfig(
+        filename=default_log_file,
+        level=logging.INFO,
+        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
+    )
+    log.info("ResumeProgram startup.")
     parser = argparse.ArgumentParser()
     parser.add_argument("nodes", help="Nodes to burst")
     args = parser.parse_args()
@@ -157,12 +191,6 @@ def main():
         # Configure root logger
         fileConfig(resume_config.logging_config, disable_existing_loggers=False)
     except Exception as e:
-        default_log_file = "/var/log/parallelcluster/slurm_resume.log"
-        logging.basicConfig(
-            filename=default_log_file,
-            level=logging.INFO,
-            format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
-        )
         log.warning(
             "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
             resume_config.logging_config,
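
Moving logging.basicConfig to the top of main() means startup, argument parsing and config-loading errors are captured even before the packaged logging config is read; fileConfig then replaces the fallback configuration when it is available. A minimal sketch of that ordering, with a hypothetical _setup_logging helper and the default file path taken from the diff:

import logging
from logging.config import fileConfig

log = logging.getLogger(__name__)


def _setup_logging(logging_config_path, default_log_file="/var/log/parallelcluster/slurm_resume.log"):
    # Fallback configuration first, so anything logged from here on lands somewhere useful
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    try:
        # Replace the fallback with the packaged logging config when it can be read
        fileConfig(logging_config_path, disable_existing_loggers=False)
    except Exception as e:
        log.warning("Unable to configure logging from %s, keeping default settings: %s", logging_config_path, e)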

src/slurm_plugin/suspend.py

Lines changed: 30 additions & 8 deletions
@@ -12,18 +12,20 @@
 
 import logging
 import os
+from datetime import datetime, timezone
 from logging.config import fileConfig
 
 import argparse
 from configparser import ConfigParser
 
-from slurm_plugin.common import CONFIG_FILE_DIR
+from slurm_plugin.common import CONFIG_FILE_DIR, is_clustermgtd_heartbeat_valid
 
 log = logging.getLogger(__name__)
 
 
 class SlurmSuspendConfig:
     DEFAULTS = {
+        "clustermgtd_timeout": 300,
         "logging_config": os.path.join(os.path.dirname(__file__), "logging", "parallelcluster_suspend_logging.conf"),
     }
 
@@ -35,13 +37,26 @@ def __init__(self, config_file_path):
             log.error(f"Cannot read slurm cloud bursting scripts configuration file: {config_file_path}")
             raise
 
+        self.clustermgtd_timeout = config.getint(
+            "slurm_suspend",
+            "clustermgtd_timeout",
+            fallback=self.DEFAULTS.get("clustermgtd_timeout"),
+        )
+        self.clustermgtd_heartbeat_file_path = config.get("slurm_suspend", "clustermgtd_heartbeat_file_path")
         self.logging_config = config.get(
             "slurm_suspend", "logging_config", fallback=self.DEFAULTS.get("logging_config")
         )
         log.info(self.__repr__())
 
 
 def main():
+    default_log_file = "/var/log/parallelcluster/slurm_suspend.log"
+    logging.basicConfig(
+        filename=default_log_file,
+        level=logging.INFO,
+        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
+    )
+    log.info("SuspendProgram startup.")
     parser = argparse.ArgumentParser()
     parser.add_argument("nodes", help="Nodes to release")
     args = parser.parse_args()
@@ -50,20 +65,27 @@ def main():
         # Configure root logger
         fileConfig(suspend_config.logging_config, disable_existing_loggers=False)
     except Exception as e:
-        default_log_file = "/var/log/parallelcluster/slurm_suspend.log"
-        logging.basicConfig(
-            filename=default_log_file,
-            level=logging.INFO,
-            format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
-        )
         log.warning(
             "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
             suspend_config.logging_config,
            default_log_file,
             e,
         )
+
     log.info("Suspending following nodes. Clustermgtd will cleanup orphaned instances: %s", args.nodes)
-    log.info("SuspendProgram finished. Nodes will be available after SuspendTimeout")
+    current_time = datetime.now(tz=timezone.utc)
+    if not is_clustermgtd_heartbeat_valid(
+        current_time, suspend_config.clustermgtd_timeout, suspend_config.clustermgtd_heartbeat_file_path
+    ):
+        log.error(
+            "No valid clustermgtd heartbeat detected, clustermgtd is down! "
+            "Please check clustermgtd log for error.\n"
+            "Nodes will be reset to POWER_SAVE state after SuspendTimeout. "
+            "The backing EC2 instances may not be correctly terminated.\n"
+            "Please check and terminate any orphaned instances in EC2!"
+        )
+    else:
+        log.info("SuspendProgram finished. Nodes will be available after SuspendTimeout")
 
 
 if __name__ == "__main__":
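
The two new options are read with ConfigParser like the existing ones: clustermgtd_timeout falls back to a 300-second default, while clustermgtd_heartbeat_file_path has no fallback and must be present in the config file. A small self-contained sketch of that lookup; the inline config content and path are made up for illustration:

from configparser import ConfigParser

config = ConfigParser()
config.read_string(
    "[slurm_suspend]\n"
    "clustermgtd_heartbeat_file_path = /tmp/clustermgtd_heartbeat\n"  # placeholder path
)
# clustermgtd_timeout is absent above, so getint falls back to the 300-second default
timeout = config.getint("slurm_suspend", "clustermgtd_timeout", fallback=300)
heartbeat_path = config.get("slurm_suspend", "clustermgtd_heartbeat_file_path")
print(timeout, heartbeat_path)  # -> 300 /tmp/clustermgtd_heartbeat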
