Skip to content
This repository has been archived by the owner on Nov 1, 2023. It is now read-only.

Commit

Permalink
Delete nodes when they're done (#1763)
Browse files Browse the repository at this point in the history
* Delete nodes when they're done

* Missed a file

* Load node disposal strategy from env var

* Lint

* Fix subtle bug

* Deleting doesn't work; will decommission nodes (the 'decomission' strategy) once they complete work

* Missed a file

* Remove logging line
  • Loading branch information
tevoinea authored and AdamL-Microsoft committed Apr 18, 2022
1 parent d9fbf9a commit 3372105
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 28 deletions.
12 changes: 7 additions & 5 deletions src/api-service/__app__/onefuzzlib/azure/auto_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def create_auto_scale_profile(
# When there's at least 1 message in the pool queue
operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
threshold=1,
divide_per_instance=False,
),
scale_action=ScaleAction(
direction=ScaleDirection.INCREASE,
Expand All @@ -170,16 +171,17 @@ def create_auto_scale_profile(
metric_trigger=MetricTrigger(
metric_name="ApproximateMessageCount",
metric_resource_uri=queue_uri,
# Check every 20 minutes
time_grain=timedelta(minutes=20),
# Check every 10 minutes
time_grain=timedelta(minutes=10),
# The average amount of messages there are in the pool queue
time_aggregation=TimeAggregationType.AVERAGE,
statistic=MetricStatisticType.SUM,
# Over the past 20 minutes
time_window=timedelta(minutes=20),
# Over the past 10 minutes
time_window=timedelta(minutes=10),
# When there are no messages in the pool queue
operator=ComparisonOperationType.EQUALS,
threshold=0,
divide_per_instance=False,
),
scale_action=ScaleAction(
direction=ScaleDirection.DECREASE,
Expand All @@ -194,7 +196,7 @@ def create_auto_scale_profile(

def default_auto_scale_profile(queue_uri: str, scaleset_size: int) -> AutoscaleProfile:
return create_auto_scale_profile(
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 15
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 5
)


Expand Down
64 changes: 41 additions & 23 deletions src/api-service/__app__/onefuzzlib/workers/scalesets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import datetime
import logging
import os
from typing import Any, Dict, List, Optional, Tuple, Union
from uuid import UUID

Expand Down Expand Up @@ -437,8 +438,13 @@ def cleanup_nodes(self) -> bool:

# Perform operations until they fail due to scaleset getting locked
try:
self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
if strategy_str == "decomission":
strategy = NodeDisaposalStrategy.decomission
else:
strategy = NodeDisaposalStrategy.scale_in
self.reimage_nodes(to_reimage, strategy)
self.delete_nodes(to_delete, strategy)
except UnableToUpdate:
logging.info(
SCALESET_LOG_PREFIX
Expand Down Expand Up @@ -598,17 +604,23 @@ def delete_nodes(
else:
machine_ids.add(node.machine_id)

logging.info(
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
self.scaleset_id,
machine_ids,
)
delete_vmss_nodes(self.scaleset_id, machine_ids)
for node in nodes:
if node.machine_id in machine_ids:
node.delete()
if disposal_strategy == NodeDisaposalStrategy.scale_in:
if disposal_strategy == NodeDisaposalStrategy.decomission:
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
for node in nodes:
if node.machine_id in machine_ids:
node.release_scale_in_protection()
else:
logging.info(
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
self.scaleset_id,
machine_ids,
)
delete_vmss_nodes(self.scaleset_id, machine_ids)
for node in nodes:
if node.machine_id in machine_ids:
node.delete()
if disposal_strategy == NodeDisaposalStrategy.scale_in:
node.release_scale_in_protection()

def reimage_nodes(
self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
Expand Down Expand Up @@ -659,18 +671,24 @@ def reimage_nodes(
)
return

result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
if isinstance(result, Error):
raise Exception(
"unable to reimage nodes: %s:%s - %s"
% (self.scaleset_id, machine_ids, result)
)

for node in nodes:
if node.machine_id in machine_ids:
node.delete()
if disposal_strategy == NodeDisaposalStrategy.scale_in:
if disposal_strategy == NodeDisaposalStrategy.decomission:
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
for node in nodes:
if node.machine_id in machine_ids:
node.release_scale_in_protection()
else:
result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
if isinstance(result, Error):
raise Exception(
"unable to reimage nodes: %s:%s - %s"
% (self.scaleset_id, machine_ids, result)
)

for node in nodes:
if node.machine_id in machine_ids:
node.delete()
if disposal_strategy == NodeDisaposalStrategy.scale_in:
node.release_scale_in_protection()

def set_shutdown(self, now: bool) -> None:
if now:
Expand Down
1 change: 1 addition & 0 deletions src/pytypes/onefuzztypes/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,3 +417,4 @@ class UserFieldType(Enum):

class NodeDisaposalStrategy(Enum):
    """Strategy a scaleset uses to dispose of nodes that have finished work.

    The strategy is passed to the scaleset's ``delete_nodes`` and
    ``reimage_nodes`` methods. The member names and values (including their
    spelling) are part of the runtime interface and must not be "corrected":
    the scaleset cleanup code selects the strategy by comparing the
    ``ONEFUZZ_NODE_DISPOSAL_STRATEGY`` environment variable to these strings.
    """

    # Delete/reimage the node through the scaleset API, then release its
    # scale-in protection. Used when the environment variable is unset or
    # holds any unrecognized value.
    scale_in = "scale_in"
    # Skip the delete/reimage call and only release the node's scale-in
    # protection. NOTE(review): presumably autoscale then removes the node
    # on its next scale-in -- confirm against the autoscale profile.
    decomission = "decomission"

0 comments on commit 3372105

Please sign in to comment.