Skip to content

Commit be72632

Browse files
committed
Avoid duplication of nodes in _find_active_nodes()
This is relevant only in cases when the Slurm partition configuration is edited and PC-managed nodes are added to multiple custom partitions. Add unit test to verify that overlapping partitions do not lead to duplication of nodes in the group returned by ClusterManager._find_active_nodes(). Adapt existing unit tests to the changes. Signed-off-by: Jacopo De Amicis <jdamicis@amazon.it>
1 parent e200281 commit be72632

File tree

4 files changed

+96
-9
lines changed

4 files changed

+96
-9
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ This file is used to list changes made in each version of the aws-parallelcluste
1616
3.6.1
1717
------
1818

19+
**CHANGES**
20+
- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions.
21+
1922
**BUG FIXES**
2023
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned
2124

src/slurm_plugin/clustermgtd.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,11 +1123,12 @@ def _find_bootstrap_failure_nodes(slurm_nodes):
11231123

11241124
@staticmethod
11251125
def _find_active_nodes(partitions_name_map):
1126-
active_nodes = []
1126+
active_nodes = set()
11271127
for partition in partitions_name_map.values():
11281128
if partition.state != "INACTIVE":
1129-
active_nodes += partition.slurm_nodes
1130-
return active_nodes
1129+
active_nodes |= set(partition.slurm_nodes)
1130+
# TODO: Returning the set breaks some unit tests (I believe the unit test should be revised though).
1131+
return sorted(list(active_nodes), key=str)
11311132

11321133
def _is_node_in_replacement_valid(self, node, check_node_is_valid):
11331134
"""

src/slurm_plugin/slurm_resources.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,12 @@ def __str__(self):
5050

5151

5252
class SlurmPartition:
53-
def __init__(self, name, nodenames, state):
53+
def __init__(self, name, nodenames, state, slurm_nodes: List = None):
5454
"""Initialize slurm partition with attributes."""
5555
self.name = name
5656
self.nodenames = nodenames
5757
self.state = state
58-
self.slurm_nodes = []
58+
self.slurm_nodes = slurm_nodes if slurm_nodes else []
5959

6060
def is_inactive(self):
6161
return self.state == "INACTIVE"

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,13 +1316,13 @@ def test_maintain_nodes(
13161316
# Run test
13171317
cluster_manager._maintain_nodes(partitions, {})
13181318
# Check function calls
1319-
mock_update_replacement.assert_called_with(active_nodes)
1319+
mock_update_replacement.assert_called_with(sorted(active_nodes, key=str))
13201320
mock_handle_dynamic.assert_called_with(expected_unhealthy_dynamic_nodes)
13211321
mock_handle_static.assert_called_with(expected_unhealthy_static_nodes)
1322-
mock_handle_powering_down_nodes.assert_called_with(active_nodes)
1323-
mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes)
1322+
mock_handle_powering_down_nodes.assert_called_with(sorted(active_nodes, key=str))
1323+
mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(sorted(active_nodes, key=str))
13241324
if _is_protected_mode_enabled:
1325-
mock_handle_protected_mode_process.assert_called_with(active_nodes, partitions)
1325+
mock_handle_protected_mode_process.assert_called_with(sorted(active_nodes, key=str), partitions)
13261326
else:
13271327
mock_handle_protected_mode_process.assert_not_called()
13281328

@@ -3731,3 +3731,86 @@ def test_find_unhealthy_slurm_nodes(
37313731
assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes)
37323732
assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes)
37333733
assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map)
3734+
3735+
3736+
@pytest.mark.parametrize(
3737+
"partitions_name_map, expected_nodelist",
3738+
[
3739+
pytest.param(
3740+
{
3741+
"queue1": SlurmPartition(
3742+
name="queue1",
3743+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3744+
state="UP",
3745+
slurm_nodes=[
3746+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3747+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3748+
],
3749+
),
3750+
"queue2": SlurmPartition(
3751+
name="queue2",
3752+
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
3753+
state="UP",
3754+
slurm_nodes=[
3755+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3756+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3757+
],
3758+
),
3759+
},
3760+
[
3761+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3762+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3763+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3764+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3765+
],
3766+
id="Two non-overlapping partitions",
3767+
),
3768+
pytest.param(
3769+
{
3770+
"queue1": SlurmPartition(
3771+
name="queue1",
3772+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3773+
state="UP",
3774+
slurm_nodes=[
3775+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3776+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3777+
],
3778+
),
3779+
"custom_partition": SlurmPartition(
3780+
name="custom_partition",
3781+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3782+
state="UP",
3783+
slurm_nodes=[
3784+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3785+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3786+
],
3787+
),
3788+
},
3789+
[
3790+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3791+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3792+
],
3793+
id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a "
3794+
"PC-managed queue",
3795+
),
3796+
],
3797+
)
3798+
def test_find_active_nodes(partitions_name_map, expected_nodelist):
3799+
"""
3800+
Unit test for the `ClusterManager._find_active_nodes()` method.
3801+
3802+
Some context about the way this test is implemented:
3803+
- `ClusterManager._find_active_nodes()` may be implemented to return different types of iterables.
3804+
This test was implemented together with a fix that changed the return type from a list to a set,
3805+
and it was desirable to have the test compatible with both return types.
3806+
- The implementation that returned a list caused a duplication of node entities in the returned iterable
3807+
in case the same node belonged to multiple Slurm partitions (via a customization of the Slurm configuration).
3808+
- Due to the way we implement the `__hash__()` dunder method for the SlurmNode class, two different
3809+
SlurmNode objects with the same node name are squashed into the same entity in a set. Therefore
3810+
we cannot use `set(expected_nodelist)` when trying to test the duplication of the node entities
3811+
in the iterable returned by `ClusterManager._find_active_nodes()`.
3812+
- Sets are unordered, so when transforming them into lists we have to sort them to make them comparable with
3813+
the `expected_nodelist`.
3814+
"""
3815+
result_nodelist = sorted(list(ClusterManager._find_active_nodes(partitions_name_map)), key=str)
3816+
assert_that(result_nodelist).is_equal_to(sorted(expected_nodelist, key=str))

0 commit comments

Comments
 (0)