
import logging
import os
+import time
from datetime import datetime, timezone
from logging.config import fileConfig
from subprocess import CalledProcessError
from common.schedulers.slurm_commands import get_nodes_info
from common.time_utils import seconds
from common.utils import check_command_output, sleep_remaining_loop_time
-from slurm_plugin.common import CONFIG_FILE_DIR, TIMESTAMP_FORMAT, InstanceManager, log_exception, time_is_up
+from slurm_plugin.common import CONFIG_FILE_DIR, InstanceManager, is_clustermgtd_heartbeat_valid, log_exception

LOOP_TIME = 60
RELOAD_CONFIG_ITERATIONS = 10
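The heartbeat check now lives behind the shared is_clustermgtd_heartbeat_valid helper imported above. Its implementation is not part of this diff; as a rough sketch, assuming it simply folds together the heartbeat read and timeout test removed from this module further down, it could look like the following (the TIMESTAMP_FORMAT value is an assumption inferred from the example timestamp in the removed comment):

# Hypothetical reconstruction of slurm_plugin.common.is_clustermgtd_heartbeat_valid;
# the real shared implementation may differ in details.
import logging
from datetime import datetime

log = logging.getLogger(__name__)

# Assumed format; matches timestamps like "2020-07-30 19:34:02.613338+00:00"
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f%z"


def is_clustermgtd_heartbeat_valid(current_time, clustermgtd_timeout, clustermgtd_heartbeat_file_path):
    try:
        with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
            # Heartbeat is written with datetime.strftime, so read it back with strptime
            last_heartbeat = datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
    except Exception as e:
        log.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
        return False
    log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
    if (current_time - last_heartbeat).total_seconds() > clustermgtd_timeout:
        log.error(
            "Clustermgtd has been offline since %s. Current time is %s. Timeout of %s seconds has expired!",
            last_heartbeat,
            current_time,
            clustermgtd_timeout,
        )
        return False
    return True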
@@ -38,8 +39,8 @@ class ComputemgtdConfig:
3839 "max_retry" : 1 ,
3940 "loop_time" : LOOP_TIME ,
4041 "proxy" : "NONE" ,
41- "clustermgtd_timeout" : 600 ,
4242 "disable_computemgtd_actions" : False ,
43+ "clustermgtd_timeout" : 600 ,
4344 "slurm_nodename_file" : os .path .join (CONFIG_FILE_DIR , "slurm_nodename" ),
4445 "logging_config" : os .path .join (
4546 os .path .dirname (__file__ ), "logging" , "parallelcluster_computemgtd_logging.conf"
@@ -61,7 +62,7 @@ def _get_config(self, config_file_path):
        try:
            config.read_file(open(config_file_path, "r"))
        except IOError:
-            log.error(f"Cannot read cluster manager configuration file: {config_file_path}")
+            log.error(f"Cannot read computemgtd configuration file: {config_file_path}")
            raise

        # Get config settings
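For orientation, a small sketch of how the settings above might be pulled out of the parsed file; the section name, the config path, and the use of fallback defaults are illustrative assumptions, not taken from the actual _get_config body:

# Hypothetical illustration only; the real _get_config body is not shown in this hunk.
from configparser import ConfigParser

config = ConfigParser()
with open("/path/to/parallelcluster_computemgtd.conf", "r") as config_file:  # placeholder path
    config.read_file(config_file)

# Fall back to the DEFAULTS shown above when a key is missing from the file
loop_time = config.getint("computemgtd", "loop_time", fallback=60)
clustermgtd_timeout = config.getint("computemgtd", "clustermgtd_timeout", fallback=600)
disable_computemgtd_actions = config.getboolean("computemgtd", "disable_computemgtd_actions", fallback=False)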
@@ -115,51 +116,34 @@ def _self_terminate(computemgtd_config):
        computemgtd_config.region, computemgtd_config.cluster_name, computemgtd_config.boto3_config
    )
    self_instance_id = check_command_output("curl -s http://169.254.169.254/latest/meta-data/instance-id", shell=True)
+    # Sleep for 10 seconds so termination log entries are uploaded to CW logs
+    log.info("Preparing to self terminate the instance %s in 10 seconds!", self_instance_id)
+    time.sleep(10)
    log.info("Self terminating instance %s now!", self_instance_id)
    instance_manager.delete_instances([self_instance_id], terminate_batch_size=1)


-def _get_clustermgtd_heartbeat(clustermgtd_heartbeat_file_path):
-    """Get clustermgtd's last heartbeat."""
-    with open(clustermgtd_heartbeat_file_path, "r") as timestamp_file:
-        # Note: heartbeat must be written with datetime.strftime to convert localized datetime into str
-        # datetime.strptime will not work with str(datetime)
-        # Example timestamp written to heartbeat file: 2020-07-30 19:34:02.613338+00:00
-        return datetime.strptime(timestamp_file.read().strip(), TIMESTAMP_FORMAT)
-
-
-def _expired_clustermgtd_heartbeat(last_heartbeat, current_time, clustermgtd_timeout):
-    """Test if clustermgtd heartbeat is expired."""
-    if time_is_up(last_heartbeat, current_time, clustermgtd_timeout):
-        log.error(
-            "Clustermgtd has been offline since %s. Current time is %s. Timeout of %s seconds has expired!",
-            last_heartbeat,
-            current_time,
-            clustermgtd_timeout,
-        )
-        return True
-    return False
-
-
@retry(stop_max_attempt_number=3, wait_fixed=1500)
def _get_nodes_info_with_retry(nodes):
    return get_nodes_info(nodes)


def _is_self_node_down(self_nodename):
    """
-    Check if self node is down in slurm.
+    Check if self node is healthy according to the scheduler.

-    This check prevents termination of a node that is still well-attached to the scheduler.
-    Note: node that is not attached to the scheduler will be in DOWN* after SlurmdTimeout.
+    Node is considered healthy if:
+    1. Node is not in DOWN
+    2. Node is not in POWER_SAVE
+    Note: node that is incorrectly attached to the scheduler will be in DOWN* after SlurmdTimeout.
    """
    try:
        self_node = _get_nodes_info_with_retry(self_nodename)[0]
        log.info("Current self node state %s", self_node.__repr__())
-        if self_node.is_down():
-            log.warning("Node is in DOWN state, preparing for self termination...")
+        if self_node.is_down() or self_node.is_power():
+            log.warning("Node is incorrectly attached to scheduler, preparing for self termination...")
            return True
-        log.info("Node is not in a DOWN state and correctly attached to scheduler, not terminating...")
+        log.info("Node is correctly attached to scheduler, not terminating...")
        return False
    except Exception as e:
        # This could happen if slurmctld is down completely
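The new is_power() branch above treats a node parked in POWER_SAVE as unhealthy, in addition to DOWN nodes. The SlurmNode type returned by get_nodes_info is defined in common.schedulers.slurm_commands and is not shown here; purely as an illustration of the idea, such predicates could be built on the raw Slurm state string along these lines:

class SlurmNode:
    # Hypothetical, simplified stand-in for the node objects returned by get_nodes_info
    def __init__(self, name, state):
        self.name = name
        self.state = state  # raw Slurm state, e.g. "IDLE", "DOWN*", "IDLE~"

    def is_down(self):
        # DOWN* is the state an unresponsive node reaches after SlurmdTimeout
        return "DOWN" in self.state.upper()

    def is_power(self):
        # In Slurm, a trailing "~" marks a node that is in power saving mode
        return self.state.endswith("~") or "POWER" in self.state.upper()

    def __repr__(self):
        return f"SlurmNode(name={self.name!r}, state={self.state!r})"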
@@ -168,13 +152,6 @@ def _is_self_node_down(self_nodename):
        return True


-def _fail_self_check(last_heartbeat, current_time, computemgtd_config):
-    """Determine if self checks are failing and if the node should self-terminate."""
-    return _expired_clustermgtd_heartbeat(
-        last_heartbeat, current_time, computemgtd_config.clustermgtd_timeout
-    ) and _is_self_node_down(computemgtd_config.nodename)
-
-
def _load_daemon_config():
    # Get program config
    computemgtd_config = ComputemgtdConfig(os.path.join(COMPUTEMGTD_CONFIG_PATH))
@@ -208,17 +185,15 @@ def _run_computemgtd():
            reload_config_counter -= 1

        # Check heartbeat
-        try:
-            last_heartbeat = _get_clustermgtd_heartbeat(computemgtd_config.clustermgtd_heartbeat_file_path)
-            log.info("Latest heartbeat from clustermgtd: %s", last_heartbeat)
-        except Exception as e:
-            log.error("Unable to retrieve clustermgtd heartbeat with exception: %s", e)
-        finally:
+        if not is_clustermgtd_heartbeat_valid(
+            current_time, computemgtd_config.clustermgtd_timeout, computemgtd_config.clustermgtd_heartbeat_file_path
+        ):
            if computemgtd_config.disable_computemgtd_actions:
                log.info("All computemgtd actions currently disabled")
-            elif _fail_self_check(last_heartbeat, current_time, computemgtd_config):
+            elif _is_self_node_down(computemgtd_config.nodename):
                _self_terminate(computemgtd_config)
-            sleep_remaining_loop_time(computemgtd_config.loop_time, current_time)
+
+        sleep_remaining_loop_time(computemgtd_config.loop_time, current_time)


@retry(wait_fixed=seconds(LOOP_TIME))
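sleep_remaining_loop_time keeps each iteration on a fixed cadence: it sleeps only for whatever part of loop_time has not already been spent since current_time was captured at the top of the loop. A minimal sketch of that behavior, assuming the helper in common.utils works roughly like this:

import time
from datetime import datetime, timezone


def sleep_remaining_loop_time(total_loop_time, loop_start_time=None):
    # Sleep only for the part of the loop period not already consumed by work
    end_time = datetime.now(tz=timezone.utc)
    if not loop_start_time:
        loop_start_time = end_time
    time_delta = (end_time - loop_start_time).total_seconds()
    if 0 <= time_delta < total_loop_time:
        time.sleep(total_loop_time - time_delta)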