Fix slurmd startup for dynamic GPU nodes
Dynamic nodes need to start with their gres config given to slurmd on the command line; otherwise the registration is invalid.
aditigaur4 committed Feb 4, 2025
1 parent aed4c78 commit e2bb92d
Showing 4 changed files with 65 additions and 44 deletions.
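For illustration only (not part of the commit itself): with this fix, slurmd on a dynamic GPU node is launched with its gres and feature passed via -Z/--conf on the command line, roughly as follows, where the node name, GPU count, and feature value are hypothetical:

    slurmd -Z --conf "gres=gpu:8 Feature=dyn,Standard_ND96asr_v4" -N htc-dyn-1 -b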
62 changes: 43 additions & 19 deletions slurm/install/install.py
@@ -91,10 +91,10 @@ def __init__(self, config: Dict, platform_family: str, mode: str) -> None:
        self.platform_family = platform_family
        self.mode = mode

        self.dynamic_config = config["slurm"].get("dynamic_config")
        if self.dynamic_config:
            self.dynamic_config = _inject_vm_size(self.dynamic_config, self.vm_size)
            self.dynamic_config
        self.dynamic_config = config["slurm"].get("dynamic_config", None)
        self.dynamic_feature = config["slurm"].get("dynamic_feature", None)
        if self.dynamic_feature:
            self.dynamic_feature = _inject_vm_size(self.dynamic_feature, self.vm_size)

        self.max_node_count = int(config["slurm"].get("max_node_count", 10000))
@@ -109,19 +109,11 @@ def __init__(self, config: Dict, platform_family: str, mode: str) -> None:
self.ubuntu22_waagent_fix = config["slurm"].get("ubuntu22_waagent_fix", True)


def _inject_vm_size(dynamic_config: str, vm_size: str) -> str:
    lc = dynamic_config.lower()
    if "feature=" not in lc:
        logging.warning("Dynamic config is specified but no 'Feature={some_flag}' is set under slurm.dynamic_config.")
        return dynamic_config
    else:
        ret = []
        for tok in dynamic_config.split():
            if tok.lower().startswith("feature="):
                ret.append(f"Feature={vm_size},{tok[len('Feature='):]}")
            else:
                ret.append(tok)
        return " ".join(ret)
def _inject_vm_size(dynamic_feature: str, vm_size: str) -> str:

    if vm_size in dynamic_feature:
        return dynamic_feature
    return f"{dynamic_feature},{vm_size}"

def setup_config_dir(s: InstallSettings) -> None:

@@ -508,11 +500,43 @@ def _complete_install_all(s: InstallSettings) -> None:

ilib.create_service("munged", user=s.munge_user, exec_start="/sbin/munged")

def get_gres_count(hostname):
    count = 0
    try:
        with open("/etc/slurm/gres.conf", 'r') as file:
            for line in file:
                nodename_match = re.search(r'Nodename=([^\s]+)', line, re.IGNORECASE)
                count_match = re.search(r'count=(\d+)', line, re.IGNORECASE)
                if nodename_match and count_match:
                    nodename = nodename_match.group(1)
                    # This command is local to the node and does not send an RPC to the controller.
                    if hostname in subprocess.run(['scontrol', 'show', 'hostnames', nodename], stdout=subprocess.PIPE, universal_newlines=True).stdout:
                        count = int(count_match.group(1))

    except Exception as e:
        logging.error(f"An error occurred: {e}")

    return count
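# Illustrative only, not part of the commit: a gres.conf line the parser above would
# match, assuming a hypothetical hostlist "htc-dyn-[1-10]" and local node "htc-dyn-1":
#   NodeName=htc-dyn-[1-10] Name=gpu Count=8 File=/dev/nvidia[0-7]
# get_gres_count("htc-dyn-1") would then return 8, because
# `scontrol show hostnames htc-dyn-[1-10]` expands the hostlist locally and the
# node's own hostname appears in the output.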


def setup_slurmd(s: InstallSettings) -> None:
    slurmd_config = f"SLURMD_OPTIONS=-b -N {s.node_name}"
    if s.dynamic_config:
        slurmd_config = f"SLURMD_OPTIONS={s.dynamic_config} -N {s.node_name}"
    if s.dynamic_feature:
        if not s.dynamic_config:
            override_conf = ""
            # Dynamic GPU nodes have to have their gres manually defined by the user before they can be started.
            # Check if gres is defined for this node and then add that to configuration options.
            gpu_count = get_gres_count(s.node_name)
            if gpu_count > 0:
                gres_str = f"gres=gpu:{gpu_count}"
                override_conf += f" {gres_str}"
            override_conf += f" Feature={s.dynamic_feature}"
            dynamic_config = f"-Z --conf \"{override_conf}\""
        else:
            # If user has supplied us dynamic config in the template, use this and add dynamic feature.
            dynamic_config = f"-Z --conf \"{s.dynamic_config} Feature={s.dynamic_feature}\""
        logging.debug("Dynamic config: %s" % dynamic_config)
        slurmd_config = f"SLURMD_OPTIONS={dynamic_config} -N {s.node_name}"
    if "-b" not in slurmd_config.split():
        slurmd_config = slurmd_config + " -b"
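# Illustrative only, not part of the commit: option strings the two branches above
# would produce (node name, GPU count, and feature value are hypothetical):
#   gres.conf defines 8 GPUs and no slurm.dynamic_config is set:
#       SLURMD_OPTIONS=-Z --conf " gres=gpu:8 Feature=dyn,Standard_ND96asr_v4" -N htc-dyn-1 -b
#   slurm.dynamic_config = "RealMemory=3700" is set in the template:
#       SLURMD_OPTIONS=-Z --conf "RealMemory=3700 Feature=dyn,Standard_ND96asr_v4" -N htc-dyn-1 -b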

10 changes: 5 additions & 5 deletions slurm/src/slurmcc/cli.py
@@ -736,15 +736,15 @@ def sync_future_states(self, config: Dict, node_list: Optional[List[str]] = None


def _dynamic_partition(partition: partitionlib.Partition, writer: TextIO) -> None:
assert partition.dynamic_config
assert partition.dynamic_feature

writer.write(
"# Creating dynamic nodeset and partition using slurm.dynamic_config=%s\n"
% partition.dynamic_config
"# Creating dynamic nodeset and partition using slurm.dynamic_feature=%s\n"
% partition.dynamic_feature
)
if not partition.features:
logging.error(
f"slurm.dynamic_config was set for {partition.name}"
f"slurm.dynamic_feature was set for {partition.name}"
+ "but it did not include a feature declaration. Slurm requires this! Skipping for now.ß"
)
return
@@ -779,7 +779,7 @@ def _partitions(
)

for partition in partitions:
if partition.dynamic_config:
if partition.dynamic_feature:
if partition.name in written_dynamic_partitions:
logging.warning("Duplicate partition found mapped to the same name." +
" Using first Feature= declaration and ignoring the rest!")
32 changes: 13 additions & 19 deletions slurm/src/slurmcc/partition.py
@@ -28,7 +28,7 @@ def __init__(
buckets: List[NodeBucket],
max_vm_count: int,
use_pcpu: bool = False,
dynamic_config: Optional[str] = None,
dynamic_feature: Optional[str] = None,
over_allocation_thresholds: Dict = {},
nodearray_machine_types: Optional[List[str]] = None,
dampen_memory: Optional[float] = None,
@@ -64,7 +64,7 @@ def __init__(
self.max_vm_count = sum([b.max_count for b in buckets])
self.buckets = buckets
self.use_pcpu = use_pcpu
self.dynamic_config = dynamic_config
self.dynamic_feature = dynamic_feature
# cache node_list property for dynamic partitions
self.__dynamic_node_list_cache = None
self.node_list_by_pg: Dict[
@@ -75,14 +75,8 @@ def __init__(
self.gpu_device_config = gpu_device_config

self.features = []
if self.dynamic_config:
toks = self.dynamic_config.replace('"', "").replace("'", "").split()

for tok in toks:
if "=" in tok:
key, value = tok.split("=", 1)
if key.lower() == "feature":
self.features = value.strip().split(",")
if self.dynamic_feature:
self.features = self.dynamic_feature.split(',')


def bucket_for_node(self, node_name: str) -> NodeBucket:
@@ -103,7 +97,7 @@ def _slurm_nodes(cls) -> List[Dict[str, str]]:

@property
def node_list(self) -> str:
if not self.dynamic_config:
if not self.dynamic_feature:
static_nodes = self._static_all_nodes()
if not static_nodes:
return ""
@@ -166,7 +160,7 @@ def _static_all_nodes(self) -> List[str]:
return ret

def all_nodes(self) -> List[str]:
if not self.dynamic_config:
if not self.dynamic_feature:
return self._static_all_nodes()
return slutil.from_hostlist(self.node_list)

@@ -184,7 +178,7 @@ def gpu_count(self) -> int:
def _construct_node_list(
partition: Partition,
) -> Dict[Optional[PlacementGroup], List[str]]:
if partition.dynamic_config:
if partition.dynamic_feature:
return _construct_dynamic_node_list(partition)
else:
return _construct_static_node_list(partition)
@@ -284,7 +278,7 @@ def fetch_partitions(
for buckets in split_buckets.values():
nodearray_name = buckets[0].nodearray
slurm_config = buckets[0].software_configuration.get("slurm", {})
dynamic_config = slurm_config.get("dynamic_config")
dynamic_feature = slurm_config.get("dynamic_feature")
is_hpc = str(slurm_config.get("hpc", True)).lower() == "true"
is_autoscale = slurm_config.get("autoscale", True) # TODO
if is_autoscale is None:
@@ -333,9 +327,9 @@ def fetch_partitions(
nodename_prefix,
)

if len(buckets) > 1 and not dynamic_config:
if len(buckets) > 1 and not dynamic_feature:
logging.warning(
"Multiple buckets defined for nodearray %s, but no dynamic_config. Using first bucket (vm_size or placement group) only.",
"Multiple buckets defined for nodearray %s, but no dynamic_feature. Using first bucket (vm_size or placement group) only.",
nodearray_name,
)
buckets = [buckets[0]]
Expand Down Expand Up @@ -391,7 +385,7 @@ def fetch_partitions(
buckets,
limits.max_count,
use_pcpu=use_pcpu,
dynamic_config=dynamic_config,
dynamic_feature=dynamic_feature,
over_allocation_thresholds=over_allocation_thresholds,
nodearray_machine_types=nodearray_vm_size.get(nodearray_name),
dampen_memory=dampen_memory,
@@ -402,7 +396,7 @@ def fetch_partitions(
filtered_partitions = []
by_name = hpcutil.partition(all_partitions, lambda p: p.name)
for pname, parts in by_name.items():
all_dyn = set([bool(p.dynamic_config) for p in parts])
all_dyn = set([bool(p.dynamic_feature) for p in parts])

if len(all_dyn) > 1:
logging.error(
@@ -413,7 +407,7 @@ def fetch_partitions(
else:
if len(parts) > 1 and False in all_dyn:
logging.error(
"Only partitions with slurm.dynamic_config may point to more than one nodearray."
"Only partitions with slurm.dynamic_feature may point to more than one nodearray."
)
disabled_parts_message = [
"/".join([p.name, p.nodearray]) for p in parts
5 changes: 4 additions & 1 deletion templates/slurm.txt
@@ -257,7 +257,10 @@ Autoscale = $Autoscale
AdditionalClusterInitSpecs = $DynamicClusterInitSpecs
[[[configuration]]]
slurm.hpc = false
slurm.dynamic_config := "-Z --conf \"Feature=dyn\""
# Slurm only allows a single feature to be defined in a Nodeset.
slurm.dynamic_feature := "dyn"
# Additional dynamic node configuration options can be added here; they are passed to slurmd alongside the --conf flag.
#slurm.dynamic_config := "RealMemory=3700"
# set pcpu = false for all hyperthreaded VMs
slurm.use_pcpu = false
slurm.autoscale = $EnableDynamicPartition
