diff --git a/src/api-service/__app__/onefuzzlib/azure/auto_scale.py b/src/api-service/__app__/onefuzzlib/azure/auto_scale.py
index e69ec496b6e..a7c049162d3 100644
--- a/src/api-service/__app__/onefuzzlib/azure/auto_scale.py
+++ b/src/api-service/__app__/onefuzzlib/azure/auto_scale.py
@@ -156,6 +156,7 @@ def create_auto_scale_profile(
                     # When there's more than 1 message in the pool queue
                     operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
                     threshold=1,
+                    divide_per_instance=False,
                 ),
                 scale_action=ScaleAction(
                     direction=ScaleDirection.INCREASE,
@@ -170,16 +171,17 @@ def create_auto_scale_profile(
                 metric_trigger=MetricTrigger(
                     metric_name="ApproximateMessageCount",
                     metric_resource_uri=queue_uri,
-                    # Check every 20 minutes
-                    time_grain=timedelta(minutes=20),
+                    # Check every 10 minutes
+                    time_grain=timedelta(minutes=10),
                     # The average amount of messages there are in the pool queue
                     time_aggregation=TimeAggregationType.AVERAGE,
                     statistic=MetricStatisticType.SUM,
-                    # Over the past 20 minutes
-                    time_window=timedelta(minutes=20),
+                    # Over the past 10 minutes
+                    time_window=timedelta(minutes=10),
                     # When there's no messages in the pool queue
                     operator=ComparisonOperationType.EQUALS,
                     threshold=0,
+                    divide_per_instance=False,
                 ),
                 scale_action=ScaleAction(
                     direction=ScaleDirection.DECREASE,
@@ -194,7 +196,7 @@ def create_auto_scale_profile(
 
 def default_auto_scale_profile(queue_uri: str, scaleset_size: int) -> AutoscaleProfile:
     return create_auto_scale_profile(
-        queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 15
+        queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 5
     )
 
 
diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py
index f634d390cca..04181a962fe 100644
--- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py
+++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py
@@ -5,6 +5,7 @@
 
 import datetime
 import logging
+import os
 from typing import Any, Dict, List, Optional, Tuple, Union
 from uuid import UUID
 
@@ -437,8 +438,13 @@ def cleanup_nodes(self) -> bool:
 
         # Perform operations until they fail due to scaleset getting locked
         try:
-            self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
-            self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
+            strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
+            if strategy_str == "decomission":
+                strategy = NodeDisaposalStrategy.decomission
+            else:
+                strategy = NodeDisaposalStrategy.scale_in
+            self.reimage_nodes(to_reimage, strategy)
+            self.delete_nodes(to_delete, strategy)
         except UnableToUpdate:
             logging.info(
                 SCALESET_LOG_PREFIX
@@ -598,17 +604,23 @@ def delete_nodes(
             else:
                 machine_ids.add(node.machine_id)
 
-        logging.info(
-            SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
-            self.scaleset_id,
-            machine_ids,
-        )
-        delete_vmss_nodes(self.scaleset_id, machine_ids)
-        for node in nodes:
-            if node.machine_id in machine_ids:
-                node.delete()
-                if disposal_strategy == NodeDisaposalStrategy.scale_in:
+        if disposal_strategy == NodeDisaposalStrategy.decomission:
+            logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
+            for node in nodes:
+                if node.machine_id in machine_ids:
                     node.release_scale_in_protection()
+        else:
+            logging.info(
+                SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
+                self.scaleset_id,
+                machine_ids,
+            )
+            delete_vmss_nodes(self.scaleset_id, machine_ids)
+            for node in nodes:
+                if node.machine_id in machine_ids:
+                    node.delete()
+                    if disposal_strategy == NodeDisaposalStrategy.scale_in:
+                        node.release_scale_in_protection()
 
     def reimage_nodes(
         self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
@@ -659,18 +671,24 @@ def reimage_nodes(
             )
             return
 
-        result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
-        if isinstance(result, Error):
-            raise Exception(
-                "unable to reimage nodes: %s:%s - %s"
-                % (self.scaleset_id, machine_ids, result)
-            )
-
-        for node in nodes:
-            if node.machine_id in machine_ids:
-                node.delete()
-                if disposal_strategy == NodeDisaposalStrategy.scale_in:
+        if disposal_strategy == NodeDisaposalStrategy.decomission:
+            logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
+            for node in nodes:
+                if node.machine_id in machine_ids:
                     node.release_scale_in_protection()
+        else:
+            result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
+            if isinstance(result, Error):
+                raise Exception(
+                    "unable to reimage nodes: %s:%s - %s"
+                    % (self.scaleset_id, machine_ids, result)
+                )
+
+            for node in nodes:
+                if node.machine_id in machine_ids:
+                    node.delete()
+                    if disposal_strategy == NodeDisaposalStrategy.scale_in:
+                        node.release_scale_in_protection()
 
     def set_shutdown(self, now: bool) -> None:
         if now:
diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py
index a11a22f6471..016bd14ff24 100644
--- a/src/pytypes/onefuzztypes/enums.py
+++ b/src/pytypes/onefuzztypes/enums.py
@@ -417,3 +417,4 @@ class UserFieldType(Enum):
 
 class NodeDisaposalStrategy(Enum):
     scale_in = "scale_in"
+    decomission = "decomission"
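
Note on the change above: the new decomission path only releases VMSS scale-in protection and leaves instance removal to the autoscale rules (now evaluating a 10-minute window), while the default scale_in path keeps the existing delete/reimage-then-release behaviour. The standalone sketch below illustrates that dispatch under stated assumptions: FakeNode, choose_strategy, and dispose are hypothetical stand-ins written for this note, not part of onefuzzlib, and the print calls are placeholders for the real VMSS operations.

import os
from enum import Enum
from typing import List, Set
from uuid import UUID, uuid4


class NodeDisaposalStrategy(Enum):
    # Spelling kept consistent with the existing onefuzztypes enum.
    scale_in = "scale_in"
    decomission = "decomission"


class FakeNode:
    # Hypothetical stand-in for onefuzzlib's Node; only the two methods the
    # disposal paths rely on are modelled.
    def __init__(self) -> None:
        self.machine_id: UUID = uuid4()

    def delete(self) -> None:
        print(f"deleting node record {self.machine_id}")

    def release_scale_in_protection(self) -> None:
        print(f"releasing scale-in protection for {self.machine_id}")


def choose_strategy() -> NodeDisaposalStrategy:
    # Mirrors the selection in cleanup_nodes: anything other than
    # "decomission" falls back to the existing scale_in behaviour.
    if os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in") == "decomission":
        return NodeDisaposalStrategy.decomission
    return NodeDisaposalStrategy.scale_in


def dispose(
    nodes: List[FakeNode], machine_ids: Set[UUID], strategy: NodeDisaposalStrategy
) -> None:
    if strategy == NodeDisaposalStrategy.decomission:
        # Decommission: drop scale-in protection only, letting the Azure
        # autoscale profile shrink the scaleset on its own schedule.
        for node in nodes:
            if node.machine_id in machine_ids:
                node.release_scale_in_protection()
    else:
        # scale_in: remove the VMSS instances directly (placeholder below),
        # then clean up the node records and release protection.
        print("delete_vmss_nodes(scaleset_id, machine_ids)  # placeholder")
        for node in nodes:
            if node.machine_id in machine_ids:
                node.delete()
                node.release_scale_in_protection()


if __name__ == "__main__":
    nodes = [FakeNode(), FakeNode()]
    dispose(nodes, {n.machine_id for n in nodes}, choose_strategy())

Running the sketch with ONEFUZZ_NODE_DISPOSAL_STRATEGY=decomission exercises the protection-release branch; leaving the variable unset falls through to the delete path, matching the default in the diff.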