Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ This file is used to list changes made in each version of the aws-parallelcluste
3.6.1
------

**CHANGES**
- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions.

**BUG FIXES**
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned

Expand Down
2 changes: 1 addition & 1 deletion src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,7 @@ def _find_active_nodes(partitions_name_map):
for partition in partitions_name_map.values():
if partition.state != "INACTIVE":
active_nodes += partition.slurm_nodes
return active_nodes
return list(dict.fromkeys(active_nodes))

def _is_node_in_replacement_valid(self, node, check_node_is_valid):
"""
Expand Down
124 changes: 124 additions & 0 deletions tests/slurm_plugin/test_clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3729,3 +3729,127 @@ def test_find_unhealthy_slurm_nodes(
assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes)
assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes)
assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map)


@pytest.mark.parametrize(
"partitions_name_map, expected_nodelist",
[
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two non-overlapping partitions",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"custom_partition": SimpleNamespace(
name="custom_partition",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a "
"PC-managed queue",
),
pytest.param(
{},
[],
id="Empty cluster with no partitions",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="UP",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
id="Two non-overlapping partitions, one active and one inactive",
),
pytest.param(
{
"queue1": SimpleNamespace(
name="queue1",
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
"queue2": SimpleNamespace(
name="queue2",
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
state="INACTIVE",
slurm_nodes=[
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
],
),
},
[],
id="Two inactive non-overlapping partitions",
),
],
)
def test_find_active_nodes(partitions_name_map, expected_nodelist):
"""Unit test for the `ClusterManager._find_active_nodes()` method."""
result_nodelist = ClusterManager._find_active_nodes(partitions_name_map)
assert_that(result_nodelist).is_equal_to(expected_nodelist)