From 550806e929e7afd8858750d475b59bd25cfefe3e Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 22 Dec 2022 10:45:59 +0100 Subject: [PATCH] [Test] Include testing for detached retained managed storage to the integration tests covering the dynamic file system mounting. Signed-off-by: Giacomo Marciani --- tests/integration-tests/conftest.py | 34 ++++++++ .../common/networking/security_groups.py | 39 +++++++++ .../tests/common/storage/assertions.py | 40 +++++++++ .../tests/common/storage/constants.py | 9 ++ .../tests/common/storage/ebs_utils.py | 36 ++++++++ .../tests/common/storage/efs_utils.py | 86 +++++++++++++++++++ .../tests/common/storage/fsx_utils.py | 73 ++++++++++++++++ .../tests/update/test_update.py | 77 ++++++++++++++++- .../pcluster.config.update_drain.yaml | 5 +- 9 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 tests/integration-tests/tests/common/networking/security_groups.py create mode 100644 tests/integration-tests/tests/common/storage/assertions.py create mode 100644 tests/integration-tests/tests/common/storage/constants.py create mode 100644 tests/integration-tests/tests/common/storage/ebs_utils.py create mode 100644 tests/integration-tests/tests/common/storage/efs_utils.py create mode 100644 tests/integration-tests/tests/common/storage/fsx_utils.py diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index d642d76ee4..9b001f57b7 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -84,6 +84,10 @@ from tests.common.osu_common import run_osu_benchmarks from tests.common.schedulers_common import get_scheduler_commands +from tests.common.storage.constants import StorageType +from tests.common.storage.ebs_utils import delete_ebs_volume +from tests.common.storage.efs_utils import delete_efs_filesystem +from tests.common.storage.fsx_utils import delete_fsx_filesystem from tests.common.utils import ( fetch_instance_slots, get_installed_parallelcluster_version, @@ -2012,3 +2016,33 @@ def _add_mount_targets(subnet_ids, efs_ids, security_group, template): ) ) availability_zones_with_mount_target.add(subnet["AvailabilityZone"]) + + +@pytest.fixture(scope="class") +def delete_storage_on_teardown(request, region): + supported_storage_types = [StorageType.STORAGE_EBS, StorageType.STORAGE_EFS, StorageType.STORAGE_FSX] + delete_storage_function = { + StorageType.STORAGE_EBS: delete_ebs_volume, + StorageType.STORAGE_EFS: delete_efs_filesystem, + StorageType.STORAGE_FSX: delete_fsx_filesystem, + } + storage_resources = {storage_type: set() for storage_type in supported_storage_types} + + def _add_storage(storage_type: StorageType, storage_id: str): + logging.info( + f"Adding storage for deletion on teardown: storage of type {storage_type.name} with id {storage_id}" + ) + storage_resources[storage_type].add(storage_id) + + def _delete_storage_resources(): + logging.info("Deleting storage resource on teardown") + for storage_type, storage_ids in storage_resources.items(): + for storage_id in storage_ids: + delete_storage_function[storage_type](region, storage_id) + + yield _add_storage + + if request.config.getoption("no_delete"): + logging.info("Not deleting storage resources marked for removal because --no-delete option was specified") + else: + _delete_storage_resources() diff --git a/tests/integration-tests/tests/common/networking/security_groups.py b/tests/integration-tests/tests/common/networking/security_groups.py new file mode 100644 index 0000000000..a223269004 --- /dev/null +++ b/tests/integration-tests/tests/common/networking/security_groups.py @@ -0,0 +1,39 @@ +import logging + +import boto3 +from botocore.exceptions import ClientError +from retrying import retry +from time_utils import seconds + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def delete_security_group(region: str, security_group_id: str): + logging.info(f"Deleting Security Group {security_group_id}") + try: + _ec2(region).delete_security_group(GroupId=security_group_id) + except Exception as e: + if isinstance(e, ClientError) and "InvalidGroup.NotFound" in str(e): + logging.warning(f"Cannot delete Security Group {security_group_id} because it does not exist") + else: + logging.error(f"Cannot delete Security Group {security_group_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_security_groups_for_network_interface(region: str, network_interface_id: str): + logging.info(f"Describing Security Groups for Network Interface {network_interface_id}") + try: + network_inyterface_description = _ec2(region).describe_network_interfaces( + NetworkInterfaceIds=[network_interface_id] + ) + return [ + security_group["GroupId"] + for security_group in network_inyterface_description["NetworkInterfaces"][0]["Groups"] + ] + except Exception as e: + logging.error(f"Cannot describe Security Groups for Network Interface {network_interface_id}: {e}") + raise e + + +def _ec2(region): + return boto3.client("ec2", region) diff --git a/tests/integration-tests/tests/common/storage/assertions.py b/tests/integration-tests/tests/common/storage/assertions.py new file mode 100644 index 0000000000..a9b577c603 --- /dev/null +++ b/tests/integration-tests/tests/common/storage/assertions.py @@ -0,0 +1,40 @@ +import logging +from typing import List + +from assertpy import assert_that + +from tests.common.storage.constants import StorageType +from tests.common.storage.ebs_utils import describe_ebs_volume +from tests.common.storage.efs_utils import describe_efs_filesystem +from tests.common.storage.fsx_utils import describe_fsx_filesystem + + +def assert_storage_existence( + region: str, storage_type: StorageType, storage_id: str, should_exist: bool, expected_states: List[str] = None +): + logging.info( + f"Checking existence for {storage_type.name} resource {storage_id}: " + f"expected to{' not ' if not should_exist else ' '}exist" + ) + if storage_type == StorageType.STORAGE_EBS: + description = describe_ebs_volume(region, storage_id) + state = description.get("State") if description else None + elif storage_type == StorageType.STORAGE_EFS: + description = describe_efs_filesystem(region, storage_id) + state = description.get("LifeCycleState") if description else None + elif storage_type == StorageType.STORAGE_FSX: + description = describe_fsx_filesystem(region, storage_id) + state = description.get("Lifecycle") if description else None + else: + raise Exception(f"Cannot check existence for storage type {storage_type.name}.") + exists = description is not None + assert_that( + exists, f"The {storage_type.name} resource {storage_id} does{' not ' if not exists else ' '}exist" + ).is_equal_to(should_exist) + + if should_exist and expected_states: + assert_that( + expected_states, + f"The {storage_type.name} resource {storage_id} is not in the expected state: " + f"expected states are {expected_states}, but actual is {state}", + ).contains(state) diff --git a/tests/integration-tests/tests/common/storage/constants.py b/tests/integration-tests/tests/common/storage/constants.py new file mode 100644 index 0000000000..46af82468c --- /dev/null +++ b/tests/integration-tests/tests/common/storage/constants.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class StorageType(Enum): + """Types of storage resources.""" + + STORAGE_EBS = "EBS" + STORAGE_EFS = "EFS" + STORAGE_FSX = "FSX" diff --git a/tests/integration-tests/tests/common/storage/ebs_utils.py b/tests/integration-tests/tests/common/storage/ebs_utils.py new file mode 100644 index 0000000000..2295e90bf3 --- /dev/null +++ b/tests/integration-tests/tests/common/storage/ebs_utils.py @@ -0,0 +1,36 @@ +import logging + +import boto3 +from botocore.exceptions import ClientError +from retrying import retry +from time_utils import seconds + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_ebs_volume(region: str, volume_id: str): + logging.info(f"Describing EBS Volume {volume_id}") + try: + return _ec2(region).describe_volumes(VolumeIds=[volume_id])["Volumes"][0] + except Exception as e: + if isinstance(e, ClientError) and "InvalidVolume.NotFound" in str(e): + return None + else: + logging.error(f"Cannot describe EBS Volume {volume_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def delete_ebs_volume(region: str, volume_id: str): + logging.info(f"Deleting EBS Volume {volume_id}") + try: + _ec2(region).delete_volume(VolumeId=volume_id) + except Exception as e: + if isinstance(e, ClientError) and "InvalidVolume.NotFound" in str(e): + logging.warning(f"Cannot delete EBS Volume {volume_id} because it does not exist") + else: + logging.error(f"Cannot delete EBS Volume {volume_id}: {e}") + raise e + + +def _ec2(region): + return boto3.client("ec2", region) diff --git a/tests/integration-tests/tests/common/storage/efs_utils.py b/tests/integration-tests/tests/common/storage/efs_utils.py new file mode 100644 index 0000000000..f93530d03f --- /dev/null +++ b/tests/integration-tests/tests/common/storage/efs_utils.py @@ -0,0 +1,86 @@ +import logging + +import boto3 +from botocore.exceptions import ClientError +from retrying import retry +from time_utils import seconds + +from tests.common.networking.security_groups import delete_security_group + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_efs_filesystem(region: str, file_system_id: str): + logging.info(f"Describing EFS File System {file_system_id}") + try: + return _efs(region).describe_file_systems(FileSystemId=file_system_id)["FileSystems"][0] + except Exception as e: + if isinstance(e, ClientError) and "FileSystemNotFound" in str(e): + return None + else: + logging.error(f"Cannot describe EFS File System {file_system_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=10, wait_fixed=seconds(30)) +def delete_efs_filesystem(region: str, file_system_id: str, delete_dependent_resources: bool = True): + logging.info(f"Deleting EFS File System {file_system_id}") + try: + if delete_dependent_resources: + mount_targets = describe_efs_mount_targets(region, file_system_id) + security_group_ids = set() + for mount_target in mount_targets: + mount_target_id = mount_target["MountTargetId"] + security_groups = describe_mount_target_security_groups(region, mount_target_id) + for security_group_id in security_groups: + security_group_ids.add(security_group_id) + delete_efs_mount_target(region, mount_target_id) + logging.info( + "The following Security Groups will be deleted as part of " + f"the deletion for the EFS File System {file_system_id}: {security_group_ids}" + ) + for mount_target in mount_targets: + mount_target_id = mount_target["MountTargetId"] + wait_for_efs_mount_target_deletion(region, file_system_id, mount_target_id) + for security_group_id in security_group_ids: + delete_security_group(region, security_group_id) + _efs(region).delete_file_system(FileSystemId=file_system_id) + except Exception as e: + if isinstance(e, ClientError) and "FileSystemNotFound" in str(e): + logging.warning(f"Cannot delete EFS File System {file_system_id} because it does not exist") + else: + logging.error(f"Cannot delete EFS File System {file_system_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_efs_mount_targets(region: str, file_system_id: str): + logging.info(f"Describing Mount Targets for EFS File System {file_system_id}") + return _efs(region).describe_mount_targets(FileSystemId=file_system_id).get("MountTargets", []) + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_mount_target_security_groups(region: str, mount_target_id: str): + logging.info(f"Describing Security Groups for EFS Mount Target {mount_target_id}") + return _efs(region).describe_mount_target_security_groups(MountTargetId=mount_target_id).get("SecurityGroups", []) + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def delete_efs_mount_target(region: str, mount_target_id: str): + logging.info(f"Deleting EFS Mount Target {mount_target_id}") + _efs(region).delete_mount_target(MountTargetId=mount_target_id) + + +@retry(stop_max_attempt_number=10, wait_fixed=seconds(60)) +def wait_for_efs_mount_target_deletion(region: str, file_system_id: str, mount_target_id: str): + logging.info(f"Waiting for deletion of EFS Mount Target {mount_target_id} in EFS File System {file_system_id}") + mount_targets = describe_efs_mount_targets(region, file_system_id) + mount_target_ids = [mt["MountTargetId"] for mt in mount_targets] + if mount_target_id in mount_target_ids: + raise Exception( + f"EFs Mount Target {mount_target_id} in EFS File System {file_system_id} not deleted, yet. " + "Sleeping 60 seconds ..." + ) + + +def _efs(region): + return boto3.client("efs", region) diff --git a/tests/integration-tests/tests/common/storage/fsx_utils.py b/tests/integration-tests/tests/common/storage/fsx_utils.py new file mode 100644 index 0000000000..9ef675ead5 --- /dev/null +++ b/tests/integration-tests/tests/common/storage/fsx_utils.py @@ -0,0 +1,73 @@ +import logging + +import boto3 +from botocore.exceptions import ClientError +from retrying import retry +from time_utils import seconds + +from tests.common.networking.security_groups import ( + delete_security_group, + describe_security_groups_for_network_interface, +) + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_fsx_filesystem(region: str, file_system_id: str): + logging.info(f"Describing FSx File System {file_system_id}") + try: + return _fsx(region).describe_file_systems(FileSystemIds=[file_system_id])["FileSystems"][0] + except Exception as e: + if isinstance(e, ClientError) and "FileSystemNotFound" in str(e): + return None + else: + logging.error(f"Cannot describe FSx File System {file_system_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=10, wait_fixed=seconds(30)) +def delete_fsx_filesystem(region: str, file_system_id: str, delete_dependent_resources: bool = True): + logging.info(f"Deleting FSx File System {file_system_id}") + try: + security_group_ids = set() + if delete_dependent_resources: + security_group_ids |= describe_fsx_filesystem_security_groups(region, file_system_id) + _fsx(region).delete_file_system(FileSystemId=file_system_id) + if delete_dependent_resources: + logging.info( + "The following Security Groups will be deleted as part of " + f"the deletion for the FSx File System {file_system_id}: {security_group_ids}" + ) + wait_for_fsx_filesystem_deletion(region, file_system_id) + for security_group_id in security_group_ids: + delete_security_group(region, security_group_id) + except Exception as e: + if isinstance(e, ClientError) and "FileSystemNotFound" in str(e): + logging.warning(f"Cannot delete FSx File System {file_system_id} because it does not exist") + else: + logging.error(f"Cannot delete FSx File System {file_system_id}: {e}") + raise e + + +@retry(stop_max_attempt_number=3, wait_fixed=seconds(5)) +def describe_fsx_filesystem_security_groups(region: str, file_system_id: str): + logging.info(f"Describing Security Groups for FSx File System {file_system_id}") + fs_description = _fsx(region).describe_file_systems(FileSystemIds=[file_system_id]) + network_interface_ids = fs_description["FileSystems"][0]["NetworkInterfaceIds"] + security_group_ids = set() + for network_interface_id in network_interface_ids: + for security_group_id in describe_security_groups_for_network_interface(region, network_interface_id): + security_group_ids.add(security_group_id) + return security_group_ids + + +@retry(stop_max_attempt_number=10, wait_fixed=seconds(60)) +def wait_for_fsx_filesystem_deletion(region: str, file_system_id: str): + logging.info(f"Waiting for deletion of FSx File System {file_system_id}") + fs_description = describe_fsx_filesystem(region, file_system_id) + if fs_description is not None: + state = fs_description.get("Lifecycle") + raise Exception(f"FSx File System {file_system_id} in state {state} not deleted, yet. Sleeping 60 seconds ...") + + +def _fsx(region): + return boto3.client("fsx", region) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 3f258726aa..31c257e064 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -31,6 +31,8 @@ from tests.common.hit_common import assert_compute_node_states, assert_initial_conditions, wait_for_compute_nodes_states from tests.common.scaling_common import get_batch_ce, get_batch_ce_max_size, get_batch_ce_min_size from tests.common.schedulers_common import SlurmCommands +from tests.common.storage.assertions import assert_storage_existence +from tests.common.storage.constants import StorageType from tests.common.utils import generate_random_string, retrieve_latest_ami from tests.storage.storage_common import ( check_fsx, @@ -854,6 +856,7 @@ def test_dynamic_file_systems_update( fsx_factory, svm_factory, open_zfs_volume_factory, + delete_storage_on_teardown, ): """Test update shared storages.""" existing_ebs_mount_dir = "/existing_ebs_mount_dir" @@ -893,7 +896,9 @@ def test_dynamic_file_systems_update( ) # Create cluster with initial configuration - init_config_file = pcluster_config_reader() + init_config_file = pcluster_config_reader( + config_file="pcluster.config.yaml", output_file="pcluster.config.init.yaml" + ) cluster = clusters_factory(init_config_file) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = scheduler_commands_factory(remote_command_executor) @@ -913,8 +918,10 @@ def test_dynamic_file_systems_update( scheduler_commands.wait_job_running(queue1_job_id) # update cluster to add ebs, efs, fsx with drain strategy + logging.info("Updating the cluster to mount managed storage with DeletionPolicy set to Delete") update_cluster_config = pcluster_config_reader( config_file="pcluster.config.update_drain.yaml", + output_file="pcluster.config.update_drain_1.yaml", volume_id=existing_ebs_volume_id, existing_ebs_mount_dir=existing_ebs_mount_dir, existing_efs_mount_dir=existing_efs_mount_dir, @@ -927,12 +934,16 @@ def test_dynamic_file_systems_update( fsx_open_zfs_volume_id=existing_fsx_open_zfs_volume_id, bucket_name=bucket_name, new_ebs_mount_dir=new_ebs_mount_dir, + new_ebs_deletion_policy="Delete", new_raid_mount_dir=new_raid_mount_dir, + new_raid_deletion_policy="Delete", new_lustre_mount_dir=new_lustre_mount_dir, + new_lustre_deletion_policy="Delete", new_efs_mount_dir=new_efs_mount_dir, + new_efs_deletion_policy="Delete", ) - cluster.update(str(update_cluster_config)) + cluster.update(update_cluster_config) # check chef client log contains expected log assert_lines_in_logs( @@ -971,7 +982,49 @@ def test_dynamic_file_systems_update( for mount_dir in all_mount_dirs: verify_directory_correctly_shared(remote_command_executor, mount_dir, scheduler_commands, partitions=["queue1"]) + logging.info("Updating the cluster to set DeletionPolicy to Retain for every managed storage") + update_cluster_config = pcluster_config_reader( + config_file="pcluster.config.update_drain.yaml", + output_file="pcluster.config.update_drain_2.yaml", + volume_id=existing_ebs_volume_id, + existing_ebs_mount_dir=existing_ebs_mount_dir, + existing_efs_mount_dir=existing_efs_mount_dir, + fsx_lustre_mount_dir=fsx_lustre_mount_dir, + fsx_ontap_mount_dir=fsx_ontap_mount_dir, + fsx_open_zfs_mount_dir=fsx_open_zfs_mount_dir, + existing_efs_id=existing_efs_id, + existing_fsx_lustre_fs_id=existing_fsx_lustre_fs_id, + fsx_ontap_volume_id=existing_fsx_ontap_volume_id, + fsx_open_zfs_volume_id=existing_fsx_open_zfs_volume_id, + bucket_name=bucket_name, + new_ebs_mount_dir=new_ebs_mount_dir, + new_ebs_deletion_policy="Retain", + new_raid_mount_dir=new_raid_mount_dir, + new_raid_deletion_policy="Retain", + new_lustre_mount_dir=new_lustre_mount_dir, + new_lustre_deletion_policy="Retain", + new_efs_mount_dir=new_efs_mount_dir, + new_efs_deletion_policy="Retain", + ) + + cluster.update(update_cluster_config) + + existing_ebs_ids = [existing_ebs_volume_id] + existing_efs_ids = [existing_efs_id] + existing_fsx_ids = [existing_fsx_lustre_fs_id, existing_fsx_ontap_volume_id, existing_fsx_open_zfs_volume_id] + + retained_ebs_noraid_volume_ids = [ + id for id in cluster.cfn_outputs["EBSIds"].split(",") if id not in existing_ebs_ids + ] + retained_ebs_raid_volume_ids = [ + id for id in cluster.cfn_outputs["RAIDIds"].split(",") if id not in existing_ebs_ids + ] + retained_ebs_volume_ids = retained_ebs_noraid_volume_ids + retained_ebs_raid_volume_ids + retained_efs_filesystem_ids = [id for id in cluster.cfn_outputs["EFSIds"].split(",") if id not in existing_efs_ids] + retained_fsx_filesystem_ids = [id for id in cluster.cfn_outputs["FSXIds"].split(",") if id not in existing_fsx_ids] + # update cluster to remove ebs, raid, efs and fsx with compute fleet stop + logging.info("Updating the cluster to remove all the shared storage (managed storage will be retained)") cluster.stop() wait_for_computefleet_changed(cluster, "STOPPED") cluster.update(str(init_config_file)) @@ -997,6 +1050,26 @@ def test_dynamic_file_systems_update( cluster_nodes, ) + # Verify that detached managed storage have been retained and delete them. + logging.info( + "Checking that retained managed storage resources have been retained and mark them for deletion on teardown" + ) + retained_storage = { + StorageType.STORAGE_EBS: dict(ids=retained_ebs_volume_ids, expected_states=["available"]), + StorageType.STORAGE_EFS: dict(ids=retained_efs_filesystem_ids, expected_states=["available"]), + StorageType.STORAGE_FSX: dict(ids=retained_fsx_filesystem_ids, expected_states=["AVAILABLE"]), + } + for storage_type in retained_storage: + for storage_id in retained_storage[storage_type]["ids"]: + assert_storage_existence( + region, + storage_type, + storage_id, + should_exist=True, + expected_states=retained_storage[storage_type]["expected_states"], + ) + delete_storage_on_teardown(storage_type, storage_id) + _test_shared_storage_rollback( cluster, existing_ebs_volume_id, diff --git a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml index 637d3ea302..78305b9df0 100644 --- a/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml +++ b/tests/integration-tests/tests/update/test_update/test_dynamic_file_systems_update/pcluster.config.update_drain.yaml @@ -52,12 +52,13 @@ SharedStorage: Raid: Type: 0 NumberOfVolumes: 5 + DeletionPolicy: {{ new_raid_deletion_policy }} - MountDir: {{ new_ebs_mount_dir }} Name: /manage-ebs StorageType: Ebs EbsSettings: VolumeType: gp3 - DeletionPolicy: Delete + DeletionPolicy: {{ new_ebs_deletion_policy }} - MountDir: {{ existing_ebs_mount_dir }} Name: existing_ebs StorageType: Ebs @@ -72,6 +73,7 @@ SharedStorage: PerformanceMode: maxIO ThroughputMode: provisioned ProvisionedThroughput: 200 + DeletionPolicy: {{ new_efs_deletion_policy }} - MountDir: {{ existing_efs_mount_dir }} Name: existing_efs StorageType: Efs @@ -86,6 +88,7 @@ SharedStorage: ExportPath: s3://{{ bucket_name }}/export_dir DeploymentType: PERSISTENT_1 PerUnitStorageThroughput: 200 + DeletionPolicy: {{ new_lustre_deletion_policy }} - MountDir: {{ fsx_lustre_mount_dir }} Name: existingfsx StorageType: FsxLustre