Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AKS] az aks nodepool delete-machines: Add support to delete specific machines in an agent pool #29921

Merged
merged 7 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -1791,6 +1791,18 @@
text: az aks operation-abort -g myResourceGroup -n myAKSCluster
"""

# Help entry for `az aks nodepool delete-machines`: command summary, the
# --machine-names parameter description, and one usage example.
helps['aks nodepool delete-machines'] = """
type: command
short-summary: Delete specific machines in an agentpool for a managed cluster.
parameters:
- name: --machine-names
type: string array
short-summary: Space-separated list of machine names from the agent pool to be deleted.
examples:
- name: Delete specific machines in an agent pool
text: az aks nodepool delete-machines -g myResourceGroup --nodepool-name nodepool1 --cluster-name myAKSCluster --machine-names machine1
"""

helps['aks remove-dev-spaces'] = """
type: command
short-summary: Remove Azure Dev Spaces from a managed Kubernetes cluster.
Expand Down
8 changes: 8 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,14 @@ def load_arguments(self, _):
c.argument('dns_zone_resource_ids', options_list=['--ids'], required=True)
c.argument('attach_zones')

with self.argument_context("aks nodepool delete-machines") as c:
c.argument(
"machine_names",
nargs="+",
required=True,
help="Space-separated machine names to delete.",
)


def _get_default_install_location(exe_name):
system = platform.system()
Expand Down
1 change: 1 addition & 0 deletions src/azure-cli/azure/cli/command_modules/acs/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ def load_command_table(self, _):
g.custom_command('start', 'aks_agentpool_start', supports_no_wait=True)
g.wait_command('wait')
g.custom_command('operation-abort', 'aks_agentpool_operation_abort', supports_no_wait=True)
g.custom_command('delete-machines', 'aks_agentpool_delete_machines', supports_no_wait=True)

with self.command_group('aks command', managed_clusters_sdk, client_factory=cf_managed_clusters) as g:
g.custom_command('invoke', 'aks_runcommand', supports_no_wait=True,
Expand Down
44 changes: 44 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
ResourceNotFoundError,
UnknownError,
ValidationError,
RequiredArgumentMissingError,
)
from azure.cli.core.commands import LongRunningOperation
from azure.cli.core.commands.client_factory import get_subscription_id
Expand Down Expand Up @@ -2648,6 +2649,49 @@ def aks_operation_abort(cmd, # pylint: disable=unused-argument
return sdk_no_wait(no_wait, client.begin_abort_latest_operation, resource_group_name, name)


def aks_agentpool_delete_machines(cmd,
                                  client,
                                  resource_group_name,
                                  cluster_name,
                                  nodepool_name,
                                  machine_names,
                                  no_wait=False):
    """Delete specific machines from an agent pool of a managed cluster.

    :param cmd: CLI command context; used to resolve the
        ``AgentPoolDeleteMachinesParameter`` SDK model.
    :param client: agent pool operations client; ``list`` and
        ``begin_delete_machines`` are called on it.
    :param resource_group_name: resource group of the managed cluster.
    :param cluster_name: name of the managed cluster.
    :param nodepool_name: name of the agent pool; matched case-insensitively
        against the pools returned by ``client.list``.
    :param machine_names: names of the machines to delete.
    :param no_wait: forwarded to ``sdk_no_wait`` together with
        ``client.begin_delete_machines``.
    :return: whatever ``sdk_no_wait`` returns for ``begin_delete_machines``.
    :raises RequiredArgumentMissingError: if ``machine_names`` is empty or None.
    :raises ResourceNotFoundError: if the cluster has no agent pool named
        ``nodepool_name``.
    """
    # Validate local input first so a bad invocation fails before any service
    # round-trip. Truthiness also covers None, which len() would crash on.
    if not machine_names:
        raise RequiredArgumentMissingError(
            "--machine-names is not provided, "
            "use 'az aks machine list' to get current machine list"
        )

    wanted_pool = nodepool_name.lower()
    agentpool_exists = any(
        agentpool_profile.name.lower() == wanted_pool
        for agentpool_profile in client.list(resource_group_name, cluster_name)
    )
    if not agentpool_exists:
        raise ResourceNotFoundError(
            f"Node pool {nodepool_name} doesn't exist, "
            "use 'az aks nodepool list' to get current node pool list"
        )

    AgentPoolDeleteMachinesParameter = cmd.get_models(
        "AgentPoolDeleteMachinesParameter",
        resource_type=ResourceType.MGMT_CONTAINERSERVICE,
        operation_group="agent_pools",
    )

    machines = AgentPoolDeleteMachinesParameter(machine_names=machine_names)
    return sdk_no_wait(
        no_wait,
        client.begin_delete_machines,
        resource_group_name,
        cluster_name,
        nodepool_name,
        machines,
    )


def aks_agentpool_show(cmd, client, resource_group_name, cluster_name, nodepool_name):
    """Return the result of ``client.get`` for the given node pool.

    ``cmd`` is accepted but not used by this function.
    """
    return client.get(resource_group_name, cluster_name, nodepool_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7923,6 +7923,91 @@ def test_aks_nodepool_snapshot(self, resource_group, resource_group_location):
self.is_empty()
])

# live only: the test runs data plane operations (kubectl) that cannot be replayed; otherwise the current
# recording mechanism would also record the kubectl and kubelogin binaries, bloating the cassette file
@live_only()
@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(random_name_length=17, name_prefix="clitest", location="westus2")
def test_aks_nodepool_delete_machines(self, resource_group, resource_group_location):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Queued live test for this, you'll need to commit the recording file (would be generated by running the test case in live mode, find it from pipeline artifact) to pass CI.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://dev.azure.com/msazure/CloudNativeCompute/_build/results?buildId=103633419&view=results

This is the passing test run. Which recording file do I need? By the way, I have already uploaded test_aks_nodepool_delete_machines.yaml.

image

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took another look: it seems that you perform some data plane operations in the test case, which cannot be properly replayed. Please mark the case as @live_only() (leave a comment explaining why, and remove the corresponding recording file) to bypass the check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

aks_name = self.create_random_name("cliakstest", 16)
nodepool_name = self.create_random_name("c", 6)
self.kwargs.update(
    {
        "resource_group": resource_group,
        "location": resource_group_location,
        "name": aks_name,
        "nodepool_name": nodepool_name,
        "ssh_key_value": self.generate_ssh_keys(),
    }
)

# create aks cluster
create_cmd = "aks create --resource-group={resource_group} --name={name} --ssh-key-value={ssh_key_value}"
self.cmd(
    create_cmd,
    checks=[
        self.check("provisioningState", "Succeeded"),
    ],
)
# add nodepool
self.cmd(
    "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={nodepool_name} --node-count=4",
    checks=[self.check("provisioningState", "Succeeded")],
)

try:
    # install kubectl
    # check_call (unlike call) raises CalledProcessError on a non-zero exit
    # status, so a failed install actually reaches the handler below.
    try:
        subprocess.check_call(["az", "aks", "install-cli"])
    except subprocess.CalledProcessError as err:
        raise CLIInternalError("Failed to install kubectl with error: '{}'!".format(err)) from err

    # get credential
    fd, browse_path = tempfile.mkstemp()
    # Only the path is needed; release the descriptor before the file is written.
    os.close(fd)
    self.kwargs.update(
        {
            "browse_path": browse_path,
        }
    )
    try:
        get_credential_cmd = "aks get-credentials -n {name} -g {resource_group} -f {browse_path}"
        self.cmd(get_credential_cmd)

        # get machine name
        label = "kubernetes.azure.com/agentpool=" + nodepool_name
        k_get_node_cmd = ["kubectl", "get", "node", "-l", label, "-o", "name", "--kubeconfig", browse_path]
        k_get_node_output = subprocess.check_output(
            k_get_node_cmd,
            universal_newlines=True,
            stderr=subprocess.STDOUT,
        )
        # "-o name" prints one "node/<name>" entry per line. Remove the prefix
        # by slicing: str.strip("node/") strips a *set of characters* from both
        # ends and could eat trailing n/o/d/e characters of the real name.
        # Lines without the prefix (e.g. stderr noise merged into stdout) are skipped.
        node_prefix = "node/"
        machine_names = [
            line.strip()[len(node_prefix):]
            for line in k_get_node_output.splitlines()
            if line.strip().startswith(node_prefix)
        ]
        if not machine_names:
            raise CLIInternalError(
                "No nodes found for node pool '{}', kubectl output: '{}'".format(nodepool_name, k_get_node_output)
            )

        machine_name = machine_names[0]
        print(f"machine_name: {machine_name}")
        self.kwargs.update(
            {
                "machine_name": machine_name,
            }
        )

        # delete machines
        self.cmd(
            "aks nodepool delete-machines --resource-group={resource_group} --cluster-name={name} --nodepool-name={nodepool_name} --machine-names={machine_name}"
        )

        # check count
        # NOTE(review): index 1 assumes the added pool is the second profile
        # returned by `aks show` — confirm the ordering is stable.
        self.cmd('aks show -g {resource_group} -n {name}', checks=[
            self.check('agentPoolProfiles[1].count', 3)
        ])
    finally:
        # Remove the temporary kubeconfig written by get-credentials.
        os.remove(browse_path)

finally:
    # delete cluster
    self.cmd(
        'aks delete -g {resource_group} -n {name} --yes --no-wait', checks=[self.is_empty()])

@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(random_name_length=17, name_prefix='clitest', location='centraluseuap')
def test_aks_create_with_windows_gmsa(self, resource_group, resource_group_location):
Expand Down