Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore instance type sanity check using cluster ami #2487

Merged
merged 1 commit into from
Feb 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ CHANGELOG

**BUG FIXES**

- Fix sanity checks with ARM instance types by using alinux2 AMI with correct architecture during dryrun
- Fix sanity checks with ARM instance types by using cluster AMI when performing validation
- Fix `enable_efa` parameter validation when using Centos8 and Slurm or ARM instances.
- Use non interactive `apt update` command when building custom Ubuntu AMIs.
- Fix `encrypted_ephemeral = true` when using Alinux2 or CentOS8
Expand Down
75 changes: 63 additions & 12 deletions cli/src/pcluster/cluster_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
from botocore.exceptions import ClientError

from pcluster.utils import (
error,
Cache,
get_availability_zone_of_subnet,
get_installed_version,
get_supported_az_for_one_instance_type,
is_hit_enabled_cluster,
)
Expand Down Expand Up @@ -115,20 +116,70 @@ def _ec2_run_instance(self, pcluster_config, **kwargs): # noqa: C901 FIXME!!!
"Please double check your cluster configuration.\n{1}".format(kwargs["InstanceType"], message)
)

def _get_latest_alinux_ami_id(self, architecture):
"""Get latest alinux ami id."""
@Cache.cached
def _get_official_image_id(self, os, architecture):
"""Return the id of the current official image, for the provided os-architecture combination."""
ami_id = None
try:
alinux_ami_id = (
boto3.client("ssm")
.get_parameter(Name="/aws/service/ami-amazon-linux-latest/amzn2-ami-minimal-hvm-%s-ebs" % architecture)
.get("Parameter")
.get("Value")
)
image_prefix = self._get_official_image_name_prefix(os, architecture)
# Look for public ParallelCluster AMI in released version
ami_id = self._get_image_id(image_prefix, "amazon", True)
if not ami_id:
# Look for dev account owner during development
ami_id = self._get_image_id(image_prefix, "self", False)
except ClientError as e:
error("Unable to retrieve Amazon Linux 2 AMI id.\n{0}".format(e.response.get("Error").get("Message")))
raise
raise Exception(
"Unable to retrieve official image id for base_os='{0}' and architecture='{1}': {2}".format(
os, architecture, e.response.get("Error").get("Message")
)
)
if not ami_id:
raise Exception(
"No official image id found for base_os='{0}' and architecture='{1}'".format(os, architecture)
)

return ami_id

def _get_image_id(self, image_prefix, owner, public=False):
"""Search for the ami with the specified prefix from the given account and returns the id if found."""
filters = [
{
"Name": "name",
"Values": ["{0}*".format(image_prefix)],
}
]
if public:
filters.append({"Name": "is-public", "Values": ["true"]})

images = boto3.client("ec2").describe_images(Filters=filters, Owners=[owner]).get("Images")
return images[0].get("ImageId") if images else None

def _get_official_image_name_prefix(self, os, architecture):
"""Return the prefix of the current official image, for the provided os-architecture combination."""
suffixes = {
"alinux": "amzn-hvm",
"alinux2": "amzn2-hvm",
"centos7": "centos7-hvm",
"centos8": "centos8-hvm",
"ubuntu1604": "ubuntu-1604-lts-hvm",
"ubuntu1804": "ubuntu-1804-lts-hvm",
}
return "aws-parallelcluster-{version}-{suffix}-{arch}".format(
version=get_installed_version(), suffix=suffixes[os], arch=architecture
)

def _get_cluster_ami_id(self, pcluster_config):
"""Get the image id of the cluster."""
cluster_ami = None
os = pcluster_config.get_section("cluster").get_param_value("base_os")
architecture = pcluster_config.get_section("cluster").get_param_value("architecture")
custom_ami = pcluster_config.get_section("cluster").get_param_value("custom_ami")
try:
cluster_ami = custom_ami if custom_ami else self._get_official_image_id(os, architecture)
except Exception as e:
pcluster_config.error("Error when resolving cluster ami id: {0}".format(e))

return alinux_ami_id
return cluster_ami

def public_ips_in_compute_subnet(self, pcluster_config, network_interfaces_count):
"""Tell if public IPs will be used in compute subnet."""
Expand Down
6 changes: 3 additions & 3 deletions cli/src/pcluster/models/hit/hit_cluster_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_configuration(self, pcluster_config):
)
try:

latest_alinux_ami_id = self._get_latest_alinux_ami_id(cluster_section.get_param_value("architecture"))
cluster_ami_id = self._get_cluster_ami_id(pcluster_config)

head_node_network_interfaces = self.build_launch_network_interfaces(
network_interfaces_count=int(cluster_section.get_param_value("network_interfaces_count")[0]),
Expand All @@ -89,7 +89,7 @@ def test_configuration(self, pcluster_config):
InstanceType=head_node_instance_type,
MinCount=1,
MaxCount=1,
ImageId=latest_alinux_ami_id,
ImageId=cluster_ami_id,
CpuOptions=head_node_cpu_options,
NetworkInterfaces=head_node_network_interfaces,
DryRun=True,
Expand All @@ -112,7 +112,7 @@ def test_configuration(self, pcluster_config):
pcluster_config,
compute_resource_section,
disable_hyperthreading=disable_hyperthreading,
ami_id=latest_alinux_ami_id,
ami_id=cluster_ami_id,
subnet=compute_subnet,
security_groups_ids=security_groups_ids,
placement_group=queue_placement_group,
Expand Down
6 changes: 3 additions & 3 deletions cli/src/pcluster/models/sit/sit_cluster_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_configuration(self, pcluster_config):
)

try:
latest_alinux_ami_id = self._get_latest_alinux_ami_id(cluster_section.get_param_value("architecture"))
cluster_ami_id = self._get_cluster_ami_id(pcluster_config)

head_node_network_interfaces = self.build_launch_network_interfaces(
network_interfaces_count=int(cluster_section.get_param_value("network_interfaces_count")[0]),
Expand All @@ -127,7 +127,7 @@ def test_configuration(self, pcluster_config):
InstanceType=head_node_instance_type,
MinCount=1,
MaxCount=1,
ImageId=latest_alinux_ami_id,
ImageId=cluster_ami_id,
CpuOptions=head_node_cpu_options,
NetworkInterfaces=head_node_network_interfaces,
Placement=head_node_placement_group,
Expand All @@ -153,7 +153,7 @@ def test_configuration(self, pcluster_config):
InstanceType=compute_instance_type,
MinCount=1,
MaxCount=1,
ImageId=latest_alinux_ami_id,
ImageId=cluster_ami_id,
CpuOptions=compute_cpu_options,
Placement=compute_placement_group,
NetworkInterfaces=network_interfaces,
Expand Down
188 changes: 131 additions & 57 deletions cli/tests/pcluster/config/test_pcluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,70 +13,21 @@
from assertpy import assert_that
from pytest import fail

from pcluster.utils import get_installed_version
from tests.common import MockedBoto3Request
from tests.pcluster.config.utils import get_mocked_pcluster_config, init_pcluster_config_from_configparser


@pytest.fixture()
def boto3_stubber_path():
return "pcluster.cluster_model.boto3"
@pytest.fixture(autouse=True)
def clear_cache():
from pcluster.utils import Cache

Cache.clear_all()

@pytest.mark.parametrize(
"architecture, boto3_response, expected_message",
[
(
"arm64",
{
"Parameter": {
"Name": "/aws/service/ami-amazon-linux-latest/amzn2-ami-minimal-hvm-arm64-ebs",
"Type": "String",
"Value": "ami-0aaf2d8fefcde5893",
"Version": 27,
"LastModifiedDate": 1614231667.121,
"DataType": "text",
}
},
None,
),
(
"x86_64",
{
"Parameter": {
"Name": "/aws/service/ami-amazon-linux-latest/amzn2-ami-minimal-hvm-x86_64-ebs",
"Type": "String",
"Value": "ami-0962afb8e2794cd6e",
"Version": 40,
"LastModifiedDate": 1614231667.235,
"DataType": "text",
}
},
None,
),
("x86_64", "Generic Error", "Unable to retrieve Amazon Linux 2 AMI id"),
],
)
def test_get_latest_alinux_ami_id(mocker, boto3_stubber, architecture, boto3_response, expected_message):
mocked_requests = [
MockedBoto3Request(
method="get_parameter",
response=boto3_response,
expected_params={
"Name": "/aws/service/ami-amazon-linux-latest/amzn2-ami-minimal-hvm-%s-ebs" % architecture
},
generate_error=True if expected_message else False,
)
]

boto3_stubber("ssm", mocked_requests)
pcluster_config = get_mocked_pcluster_config(mocker)

if expected_message:
with pytest.raises(SystemExit, match=expected_message):
_ = pcluster_config.cluster_model._get_latest_alinux_ami_id(architecture)
else:
latest_linux_ami_id = pcluster_config.cluster_model._get_latest_alinux_ami_id(architecture)
assert_that(latest_linux_ami_id).is_equal_to(boto3_response.get("Parameter").get("Value"))
@pytest.fixture()
def boto3_stubber_path():
return "pcluster.cluster_model.boto3"


@pytest.mark.parametrize(
Expand Down Expand Up @@ -157,3 +108,126 @@ def test_load_from_file_errors(capsys, config_parser_dict, expected_message):
assert_that(e.args[0]).matches(expected_message)
else:
fail("Unexpected failure when loading file")


@pytest.mark.parametrize(
"custom_ami_id, os, architecture, expected_ami_suffix, expected_public_ami_id, expected_self_ami_id, "
"expected_error_message, raise_boto3_error",
[
# Official ami
(None, "alinux", "x86_64", "amzn-hvm-x86_64", "ami-official", None, None, False),
(None, "alinux2", "x86_64", "amzn2-hvm-x86_64", "ami-official", None, None, False),
(None, "centos7", "x86_64", "centos7-hvm-x86_64", "ami-official", None, None, False),
(None, "centos8", "x86_64", "centos8-hvm-x86_64", "ami-official", None, None, False),
(None, "ubuntu1604", "x86_64", "ubuntu-1604-lts-hvm-x86_64", "ami-official", None, None, False),
(None, "ubuntu1804", "x86_64", "ubuntu-1804-lts-hvm-x86_64", "ami-official", None, None, False),
(None, "alinux", "arm_64", "amzn-hvm-arm_64", "ami-official", None, None, False),
(None, "alinux2", "arm_64", "amzn2-hvm-arm_64", "ami-official", None, None, False),
(None, "centos7", "arm_64", "centos7-hvm-arm_64", "ami-official", None, None, False),
(None, "centos8", "arm_64", "centos8-hvm-arm_64", "ami-official", None, None, False),
(None, "ubuntu1604", "arm_64", "ubuntu-1604-lts-hvm-arm_64", "ami-official", None, None, False),
(None, "ubuntu1804", "arm_64", "ubuntu-1804-lts-hvm-arm_64", "ami-official", None, None, False),
# Custom ami
("ami-custom", "alinux2", "x86_64", None, "ami-custom", None, None, False),
("ami-custom", "alinux2", "arm_64", None, "ami-custom", None, None, False),
# Self ami
(None, "ubuntu1804", "arm_64", "ubuntu-1804-lts-hvm-arm_64", None, "ami-self", None, False),
# No ami found
(None, "alinux2", "x86_64", "amzn2-hvm-x86_64", None, None, "No official image id found", False),
# Boto3 error
(None, "alinux2", "x86_64", "amzn2-hvm-x86_64", None, None, "Unable to retrieve official image id", True),
],
)
def test_get_cluster_ami_id(
mocker,
boto3_stubber,
custom_ami_id,
os,
architecture,
expected_ami_suffix,
expected_public_ami_id,
expected_self_ami_id,
expected_error_message,
raise_boto3_error,
):
if not custom_ami_id:
# Expected request for public ami
mocked_requests = [
MockedBoto3Request(
method="describe_images",
response={
"Images": [
{
"Architecture": architecture,
"CreationDate": "2020-12-22T13:30:33.000Z",
"ImageId": expected_public_ami_id,
}
]
if expected_public_ami_id
else []
},
expected_params={
"Filters": [
{
"Name": "name",
"Values": [
"aws-parallelcluster-{version}-{suffix}*".format(
version=get_installed_version(), suffix=expected_ami_suffix
)
],
},
{"Name": "is-public", "Values": ["true"]},
],
"Owners": ["amazon"],
},
generate_error=raise_boto3_error,
)
]

if not expected_public_ami_id and not raise_boto3_error:
# Expected request for self ami
mocked_requests.append(
MockedBoto3Request(
method="describe_images",
response={
"Images": [
{
"Architecture": architecture,
"CreationDate": "2020-12-22T13:30:33.000Z",
"ImageId": expected_self_ami_id,
}
]
if expected_self_ami_id
else []
},
expected_params={
"Filters": [
{
"Name": "name",
"Values": [
"aws-parallelcluster-{version}-{suffix}*".format(
version=get_installed_version(), suffix=expected_ami_suffix
)
],
},
],
"Owners": ["self"],
},
generate_error=raise_boto3_error,
)
)

boto3_stubber("ec2", mocked_requests)

pcluster_config = get_mocked_pcluster_config(mocker)
pcluster_config.get_section("cluster").get_param("custom_ami").value = custom_ami_id
pcluster_config.get_section("cluster").get_param("base_os").value = os
pcluster_config.get_section("cluster").get_param("architecture").value = architecture

if expected_error_message:
with pytest.raises(SystemExit, match=expected_error_message):
_ = pcluster_config.cluster_model._get_cluster_ami_id(pcluster_config)
else:
expected_ami_id = expected_public_ami_id if expected_public_ami_id else expected_self_ami_id
cluster_ami_id = pcluster_config.cluster_model._get_cluster_ami_id(pcluster_config)
assert_that(cluster_ami_id).is_equal_to(expected_ami_id)