From 6ce409bc6bbca42a6b29a146aa42ec5f4b6a3fe4 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Fri, 16 Jun 2023 23:52:09 +0200 Subject: [PATCH 1/3] Avoid duplication of nodes in _find_active_nodes() This is relevant only in cases when the Slurm partition configuration is edited and PC-managed nodes are added to multiple custom partitions. Add unit test to verify that overlapping partitions do not lead to duplication of nodes in the group returned by ClusterManager._find_active_nodes(). Adapt existing unit tests to the changes. Signed-off-by: Jacopo De Amicis --- CHANGELOG.md | 3 + src/slurm_plugin/clustermgtd.py | 7 +- src/slurm_plugin/slurm_resources.py | 5 +- tests/slurm_plugin/test_clustermgtd.py | 92 ++++++++++++++++++++++++-- 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1c946dbc..c89d9500a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ This file is used to list changes made in each version of the aws-parallelcluste 3.6.1 ------ +**CHANGES** +- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions. + **BUG FIXES** - Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index b54d33a75..c00690a9f 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -1123,11 +1123,12 @@ def _find_bootstrap_failure_nodes(slurm_nodes): @staticmethod def _find_active_nodes(partitions_name_map): - active_nodes = [] + active_nodes = set() for partition in partitions_name_map.values(): if partition.state != "INACTIVE": - active_nodes += partition.slurm_nodes - return active_nodes + active_nodes |= set(partition.slurm_nodes) + # TODO: Returning the set breaks some unit tests (I believe the unit test should be revised though). + return sorted(list(active_nodes), key=str) def _is_node_in_replacement_valid(self, node, check_node_is_valid): """ diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index 8f3a5f61f..d2a9cbddf 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -15,6 +15,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum +from typing import List from common.utils import time_is_up @@ -49,12 +50,12 @@ def __str__(self): class SlurmPartition: - def __init__(self, name, nodenames, state): + def __init__(self, name, nodenames, state, slurm_nodes: List = None): """Initialize slurm partition with attributes.""" self.name = name self.nodenames = nodenames self.state = state - self.slurm_nodes = [] + self.slurm_nodes = slurm_nodes if slurm_nodes else [] def is_inactive(self): return self.state == "INACTIVE" diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 3e903cbb4..8412ad40f 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -1314,13 +1314,14 @@ def test_maintain_nodes( # Run test cluster_manager._maintain_nodes(partitions, {}) # Check function calls - mock_update_replacement.assert_called_with(active_nodes) + active_nodes_sorted = sorted(active_nodes, key=str) + mock_update_replacement.assert_called_with(active_nodes_sorted) mock_handle_dynamic.assert_called_with(expected_unhealthy_dynamic_nodes) mock_handle_static.assert_called_with(expected_unhealthy_static_nodes) - mock_handle_powering_down_nodes.assert_called_with(active_nodes) - mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes) + mock_handle_powering_down_nodes.assert_called_with(active_nodes_sorted) + mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes_sorted) if _is_protected_mode_enabled: - mock_handle_protected_mode_process.assert_called_with(active_nodes, partitions) + mock_handle_protected_mode_process.assert_called_with(active_nodes_sorted, partitions) else: mock_handle_protected_mode_process.assert_not_called() @@ -3729,3 +3730,86 @@ def test_find_unhealthy_slurm_nodes( assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes) assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes) assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map) + + +@pytest.mark.parametrize( + "partitions_name_map, expected_nodelist", + [ + pytest.param( + { + "queue1": SlurmPartition( + name="queue1", + nodenames="queue1-st-cr1-1,queue1-st-cr1-2", + state="UP", + slurm_nodes=[ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + "queue2": SlurmPartition( + name="queue2", + nodenames="queue2-st-cr1-1,queue2-st-cr1-2", + state="UP", + slurm_nodes=[ + StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + }, + [ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + id="Two non-overlapping partitions", + ), + pytest.param( + { + "queue1": SlurmPartition( + name="queue1", + nodenames="queue1-st-cr1-1,queue1-st-cr1-2", + state="UP", + slurm_nodes=[ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + "custom_partition": SlurmPartition( + name="custom_partition", + nodenames="queue1-st-cr1-1,queue1-st-cr1-2", + state="UP", + slurm_nodes=[ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + }, + [ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a " + "PC-managed queue", + ), + ], +) +def test_find_active_nodes(partitions_name_map, expected_nodelist): + """ + Unit test for the `ClusterManager._find_active_nodes()` method. + + Some context about the way this test is implemented: + - `ClusterManager._find_active_nodes()` may be implemented to return different types of iterables. + This test was implemented together with a fix that changed the return type from a list to a set, + and it was desirable to have the test compatible with both return types. + - The implementation that returned a list caused a duplication of node entities in the returned iterable + in case the same node belonged to multiple Slurm partitions (via a customization of the Slurm configuration). + - Due to the way we implement the `__hash__()` dunder method for the SlurmNode class, two different + SlurmNode objects with the same node name are squashed into the same entity in a set. Therefore + we cannot use `set(expected_nodelist)` when trying to test the duplication of the node entities + in the iterable returned by `ClusterManager._find_active_nodes()`. + - Sets are unordered, so when transforming them into lists we have to sort them to make them comparable with + the `expected_nodelist`. + """ + result_nodelist = sorted(list(ClusterManager._find_active_nodes(partitions_name_map)), key=str) + assert_that(result_nodelist).is_equal_to(sorted(expected_nodelist, key=str)) From ec7cab88f576430b878e9bb4194d934cb4d59f41 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Mon, 19 Jun 2023 16:21:01 +0200 Subject: [PATCH 2/3] Use list of dict to remove duplicate nodes in _find_active_nodes() This preserves the insertion order of nodes in the list of active nodes and removed the need to sort nodelists manually to compare them in unit tests. Signed-off-by: Jacopo De Amicis --- src/slurm_plugin/clustermgtd.py | 7 +++--- tests/slurm_plugin/test_clustermgtd.py | 30 ++++++-------------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index c00690a9f..bf9ab0741 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -1123,12 +1123,11 @@ def _find_bootstrap_failure_nodes(slurm_nodes): @staticmethod def _find_active_nodes(partitions_name_map): - active_nodes = set() + active_nodes = [] for partition in partitions_name_map.values(): if partition.state != "INACTIVE": - active_nodes |= set(partition.slurm_nodes) - # TODO: Returning the set breaks some unit tests (I believe the unit test should be revised though). - return sorted(list(active_nodes), key=str) + active_nodes += partition.slurm_nodes + return list(dict.fromkeys(active_nodes)) def _is_node_in_replacement_valid(self, node, check_node_is_valid): """ diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 8412ad40f..07c00e2d2 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -1314,14 +1314,13 @@ def test_maintain_nodes( # Run test cluster_manager._maintain_nodes(partitions, {}) # Check function calls - active_nodes_sorted = sorted(active_nodes, key=str) - mock_update_replacement.assert_called_with(active_nodes_sorted) + mock_update_replacement.assert_called_with(active_nodes) mock_handle_dynamic.assert_called_with(expected_unhealthy_dynamic_nodes) mock_handle_static.assert_called_with(expected_unhealthy_static_nodes) - mock_handle_powering_down_nodes.assert_called_with(active_nodes_sorted) - mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes_sorted) + mock_handle_powering_down_nodes.assert_called_with(active_nodes) + mock_handle_failed_health_check_nodes_in_replacement.assert_called_with(active_nodes) if _is_protected_mode_enabled: - mock_handle_protected_mode_process.assert_called_with(active_nodes_sorted, partitions) + mock_handle_protected_mode_process.assert_called_with(active_nodes, partitions) else: mock_handle_protected_mode_process.assert_not_called() @@ -3795,21 +3794,6 @@ def test_find_unhealthy_slurm_nodes( ], ) def test_find_active_nodes(partitions_name_map, expected_nodelist): - """ - Unit test for the `ClusterManager._find_active_nodes()` method. - - Some context about the way this test is implemented: - - `ClusterManager._find_active_nodes()` may be implemented to return different types of iterables. - This test was implemented together with a fix that changed the return type from a list to a set, - and it was desirable to have the test compatible with both return types. - - The implementation that returned a list caused a duplication of node entities in the returned iterable - in case the same node belonged to multiple Slurm partitions (via a customization of the Slurm configuration). - - Due to the way we implement the `__hash__()` dunder method for the SlurmNode class, two different - SlurmNode objects with the same node name are squashed into the same entity in a set. Therefore - we cannot use `set(expected_nodelist)` when trying to test the duplication of the node entities - in the iterable returned by `ClusterManager._find_active_nodes()`. - - Sets are unordered, so when transforming them into lists we have to sort them to make them comparable with - the `expected_nodelist`. - """ - result_nodelist = sorted(list(ClusterManager._find_active_nodes(partitions_name_map)), key=str) - assert_that(result_nodelist).is_equal_to(sorted(expected_nodelist, key=str)) + """Unit test for the `ClusterManager._find_active_nodes()` method.""" + result_nodelist = ClusterManager._find_active_nodes(partitions_name_map) + assert_that(result_nodelist).is_equal_to(expected_nodelist) From 6eac5c5bbb3663e452b54c76a23fc6deba9e5500 Mon Sep 17 00:00:00 2001 From: Jacopo De Amicis Date: Mon, 19 Jun 2023 17:13:21 +0200 Subject: [PATCH 3/3] Use SimpleNamespace instead of SlurmPartition in unit test This removed the need to extend the constructor of SlurmPartition to pass the list of SlurmNode objects `slurm_nodes`, removing the risk of wrong implementations in the future. Include additional cases to test_find_active_nodes(). Signed-off-by: Jacopo De Amicis --- src/slurm_plugin/slurm_resources.py | 5 +- tests/slurm_plugin/test_clustermgtd.py | 64 ++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index d2a9cbddf..8f3a5f61f 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -15,7 +15,6 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum -from typing import List from common.utils import time_is_up @@ -50,12 +49,12 @@ def __str__(self): class SlurmPartition: - def __init__(self, name, nodenames, state, slurm_nodes: List = None): + def __init__(self, name, nodenames, state): """Initialize slurm partition with attributes.""" self.name = name self.nodenames = nodenames self.state = state - self.slurm_nodes = slurm_nodes if slurm_nodes else [] + self.slurm_nodes = [] def is_inactive(self): return self.state == "INACTIVE" diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 07c00e2d2..e34313f1f 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -3736,7 +3736,7 @@ def test_find_unhealthy_slurm_nodes( [ pytest.param( { - "queue1": SlurmPartition( + "queue1": SimpleNamespace( name="queue1", nodenames="queue1-st-cr1-1,queue1-st-cr1-2", state="UP", @@ -3745,7 +3745,7 @@ def test_find_unhealthy_slurm_nodes( StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), ], ), - "queue2": SlurmPartition( + "queue2": SimpleNamespace( name="queue2", nodenames="queue2-st-cr1-1,queue2-st-cr1-2", state="UP", @@ -3765,7 +3765,7 @@ def test_find_unhealthy_slurm_nodes( ), pytest.param( { - "queue1": SlurmPartition( + "queue1": SimpleNamespace( name="queue1", nodenames="queue1-st-cr1-1,queue1-st-cr1-2", state="UP", @@ -3774,7 +3774,7 @@ def test_find_unhealthy_slurm_nodes( StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), ], ), - "custom_partition": SlurmPartition( + "custom_partition": SimpleNamespace( name="custom_partition", nodenames="queue1-st-cr1-1,queue1-st-cr1-2", state="UP", @@ -3791,6 +3791,62 @@ def test_find_unhealthy_slurm_nodes( id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a " "PC-managed queue", ), + pytest.param( + {}, + [], + id="Empty cluster with no partitions", + ), + pytest.param( + { + "queue1": SimpleNamespace( + name="queue1", + nodenames="queue1-st-cr1-1,queue1-st-cr1-2", + state="UP", + slurm_nodes=[ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + "queue2": SimpleNamespace( + name="queue2", + nodenames="queue2-st-cr1-1,queue2-st-cr1-2", + state="INACTIVE", + slurm_nodes=[ + StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + }, + [ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + id="Two non-overlapping partitions, one active and one inactive", + ), + pytest.param( + { + "queue1": SimpleNamespace( + name="queue1", + nodenames="queue1-st-cr1-1,queue1-st-cr1-2", + state="INACTIVE", + slurm_nodes=[ + StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + "queue2": SimpleNamespace( + name="queue2", + nodenames="queue2-st-cr1-1,queue2-st-cr1-2", + state="INACTIVE", + slurm_nodes=[ + StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""), + StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""), + ], + ), + }, + [], + id="Two inactive non-overlapping partitions", + ), ], ) def test_find_active_nodes(partitions_name_map, expected_nodelist):