Skip to content

Commit 6ce409b

Browse files
committed
Avoid duplication of nodes in _find_active_nodes()
This is relevant only in cases when the Slurm partition configuration is edited and PC-managed nodes are added to multiple custom partitions. Add unit test to verify that overlapping partitions do not lead to duplication of nodes in the node list returned by ClusterManager._find_active_nodes(). Adapt existing unit tests to the changes. Signed-off-by: Jacopo De Amicis <jdamicis@amazon.it>
1 parent dc340b4 commit 6ce409b

File tree

4 files changed

+98
-9
lines changed

4 files changed

+98
-9
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ This file is used to list changes made in each version of the aws-parallelcluste
66
3.6.1
77
------
88

9+
**CHANGES**
10+
- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions.
11+
912
**BUG FIXES**
1013
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned
1114

src/slurm_plugin/clustermgtd.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,11 +1123,12 @@ def _find_bootstrap_failure_nodes(slurm_nodes):
11231123

11241124
@staticmethod
11251125
def _find_active_nodes(partitions_name_map):
1126-
active_nodes = []
1126+
active_nodes = set()
11271127
for partition in partitions_name_map.values():
11281128
if partition.state != "INACTIVE":
1129-
active_nodes += partition.slurm_nodes
1130-
return active_nodes
1129+
active_nodes |= set(partition.slurm_nodes)
1130+
# TODO: Returning the set breaks some unit tests (I believe the unit test should be revised though).
1131+
return sorted(list(active_nodes), key=str)
11311132

11321133
def _is_node_in_replacement_valid(self, node, check_node_is_valid):
11331134
"""

src/slurm_plugin/slurm_resources.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from dataclasses import dataclass
1616
from datetime import datetime
1717
from enum import Enum
18+
from typing import List
1819

1920
from common.utils import time_is_up
2021

@@ -49,12 +50,12 @@ def __str__(self):
4950

5051

5152
class SlurmPartition:
52-
def __init__(self, name, nodenames, state):
53+
def __init__(self, name, nodenames, state, slurm_nodes: List = None):
5354
"""Initialize slurm partition with attributes."""
5455
self.name = name
5556
self.nodenames = nodenames
5657
self.state = state
57-
self.slurm_nodes = []
58+
self.slurm_nodes = slurm_nodes if slurm_nodes else []
5859

5960
def is_inactive(self):
6061
return self.state == "INACTIVE"

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,13 +1314,14 @@ def test_maintain_nodes(
13141314
# Run test
13151315
cluster_manager._maintain_nodes(partitions, {})
13161316
# Check function calls
1317-
mock_update_replacement.assert_called_with(active_nodes)
1317+
active_nodes_sorted = sorted(active_nodes, key=str)
1318+
mock_update_replacement.assert_called_with(active_nodes_sorted)
13181319
mock_handle_dynamic.assert_called_with(expected_unhealthy_dynamic_nodes)
13191320
mock_handle_static.assert_called_with(expected_unhealthy_static_nodes)
1320-
mock_handle_powering_down_nodes.assert_called_with(active_nodes)
1321-
mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes)
1321+
mock_handle_powering_down_nodes.assert_called_with(active_nodes_sorted)
1322+
mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes_sorted)
13221323
if _is_protected_mode_enabled:
1323-
mock_handle_protected_mode_process.assert_called_with(active_nodes, partitions)
1324+
mock_handle_protected_mode_process.assert_called_with(active_nodes_sorted, partitions)
13241325
else:
13251326
mock_handle_protected_mode_process.assert_not_called()
13261327

@@ -3729,3 +3730,86 @@ def test_find_unhealthy_slurm_nodes(
37293730
assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes)
37303731
assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes)
37313732
assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map)
3733+
3734+
3735+
@pytest.mark.parametrize(
3736+
"partitions_name_map, expected_nodelist",
3737+
[
3738+
pytest.param(
3739+
{
3740+
"queue1": SlurmPartition(
3741+
name="queue1",
3742+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3743+
state="UP",
3744+
slurm_nodes=[
3745+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3746+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3747+
],
3748+
),
3749+
"queue2": SlurmPartition(
3750+
name="queue2",
3751+
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
3752+
state="UP",
3753+
slurm_nodes=[
3754+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3755+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3756+
],
3757+
),
3758+
},
3759+
[
3760+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3761+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3762+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3763+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3764+
],
3765+
id="Two non-overlapping partitions",
3766+
),
3767+
pytest.param(
3768+
{
3769+
"queue1": SlurmPartition(
3770+
name="queue1",
3771+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3772+
state="UP",
3773+
slurm_nodes=[
3774+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3775+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3776+
],
3777+
),
3778+
"custom_partition": SlurmPartition(
3779+
name="custom_partition",
3780+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3781+
state="UP",
3782+
slurm_nodes=[
3783+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3784+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3785+
],
3786+
),
3787+
},
3788+
[
3789+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3790+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3791+
],
3792+
id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a "
3793+
"PC-managed queue",
3794+
),
3795+
],
3796+
)
3797+
def test_find_active_nodes(partitions_name_map, expected_nodelist):
3798+
"""
3799+
Unit test for the `ClusterManager._find_active_nodes()` method.
3800+
3801+
Some context about the way this test is implemented:
3802+
- `ClusterManager._find_active_nodes()` may be implemented to return different types of iterables.
3803+
This test was implemented together with a fix that changed the return type from a list to a set,
3804+
and it was desirable to have the test compatible with both return types.
3805+
- The implementation that returned a list caused a duplication of node entities in the returned iterable
3806+
in case the same node belonged to multiple Slurm partitions (via a customization of the Slurm configuration).
3807+
- Due to the way we implement the `__hash__()` dunder method for the SlurmNode class, two different
3808+
SlurmNode objects with the same node name are squashed into the same entity in a set. Therefore
3809+
we cannot use `set(expected_nodelist)` when trying to test the duplication of the node entities
3810+
in the iterable returned by `ClusterManager._find_active_nodes()`.
3811+
- Sets are unordered, so when transforming them into lists we have to sort them to make them comparable with
3812+
the `expected_nodelist`.
3813+
"""
3814+
result_nodelist = sorted(list(ClusterManager._find_active_nodes(partitions_name_map)), key=str)
3815+
assert_that(result_nodelist).is_equal_to(sorted(expected_nodelist, key=str))

0 commit comments

Comments (0)