From 4f1cc9ecdf72f9de5c098a1e30f2966447a50f8f Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 09:46:03 -0500 Subject: [PATCH 01/66] refactor pool autoscaling --- .../__app__/agent_registration/__init__.py | 3 +- .../__app__/onefuzzlib/autoscale.py | 175 ----------- .../__app__/onefuzzlib/azure/creds.py | 6 +- .../__app__/onefuzzlib/workers/autoscale.py | 157 ++++++++++ .../__app__/onefuzzlib/workers/nodes.py | 22 +- .../__app__/onefuzzlib/workers/pools.py | 3 + .../__app__/onefuzzlib/workers/scalesets.py | 65 ++-- .../onefuzzlib/workers/shrink_queue.py | 44 +++ .../__app__/timer_workers/__init__.py | 5 +- src/api-service/tests/__init__.py | 8 +- src/api-service/tests/test_autoscale.py | 130 ++++---- src/api-service/tests/test_scheduler.py | 294 +++++++++--------- src/api-service/tests/test_webhook_hmac.py | 102 +++--- src/pytypes/onefuzztypes/enums.py | 7 + src/pytypes/onefuzztypes/models.py | 8 +- src/pytypes/onefuzztypes/requests.py | 2 +- 16 files changed, 528 insertions(+), 503 deletions(-) delete mode 100644 src/api-service/__app__/onefuzzlib/autoscale.py create mode 100644 src/api-service/__app__/onefuzzlib/workers/autoscale.py create mode 100644 src/api-service/__app__/onefuzzlib/workers/shrink_queue.py diff --git a/src/api-service/__app__/agent_registration/__init__.py b/src/api-service/__app__/agent_registration/__init__.py index ed4803c994..8ee9fa8c4b 100644 --- a/src/api-service/__app__/agent_registration/__init__.py +++ b/src/api-service/__app__/agent_registration/__init__.py @@ -100,7 +100,8 @@ def post(req: func.HttpRequest) -> func.HttpResponse: node.delete() node = Node.create( - pool_name=registration_request.pool_name, + pool_id=pool.pool_id, + pool_name=pool.name, machine_id=registration_request.machine_id, scaleset_id=registration_request.scaleset_id, version=registration_request.version, diff --git a/src/api-service/__app__/onefuzzlib/autoscale.py b/src/api-service/__app__/onefuzzlib/autoscale.py deleted file mode 100644 index 253878d61e..0000000000 --- a/src/api-service/__app__/onefuzzlib/autoscale.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
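The agent registration hunk above is the enabling change for the rest of this patch: a node now records the owning pool's pool_id in addition to its name. A minimal sketch of the resulting flow, assuming Pool.get_by_name returns either a Pool or an Error as it does elsewhere in this series (the error-handling shape here is illustrative):

    # hypothetical registration flow; only the Node.create call is verbatim
    pool = Pool.get_by_name(registration_request.pool_name)
    if isinstance(pool, Error):
        return not_ok(pool, context="agent registration")  # reject unknown pools
    node = Node.create(
        pool_id=pool.pool_id,  # new field: enables pool-scoped node queries
        pool_name=pool.name,
        machine_id=registration_request.machine_id,
        scaleset_id=registration_request.scaleset_id,
        version=registration_request.version,
    )

Storing pool_id on the node is what later lets Node.search_states(pool_id=...) and the pool-level shrink queue operate without name-to-ID lookups.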
- -import logging -import math -from typing import List - -from onefuzztypes.enums import NodeState, ScalesetState -from onefuzztypes.models import AutoScaleConfig, TaskPool - -from .tasks.main import Task -from .workers.nodes import Node -from .workers.pools import Pool -from .workers.scalesets import Scaleset - - -def scale_up(pool: Pool, scalesets: List[Scaleset], nodes_needed: int) -> None: - logging.info("Scaling up") - autoscale_config = pool.autoscale - if not isinstance(autoscale_config, AutoScaleConfig): - return - - for scaleset in scalesets: - if scaleset.state in [ScalesetState.running, ScalesetState.resize]: - - max_size = min(scaleset.max_size(), autoscale_config.scaleset_size) - logging.info( - "scaleset:%s size:%d max_size:%d", - scaleset.scaleset_id, - scaleset.size, - max_size, - ) - if scaleset.size < max_size: - current_size = scaleset.size - if nodes_needed <= max_size - current_size: - scaleset.size = current_size + nodes_needed - nodes_needed = 0 - else: - scaleset.size = max_size - nodes_needed = nodes_needed - (max_size - current_size) - scaleset.state = ScalesetState.resize - scaleset.save() - - else: - continue - - if nodes_needed == 0: - return - - for _ in range( - math.ceil( - nodes_needed - / min( - Scaleset.scaleset_max_size(autoscale_config.image), - autoscale_config.scaleset_size, - ) - ) - ): - logging.info("Creating Scaleset for Pool %s", pool.name) - max_nodes_scaleset = min( - Scaleset.scaleset_max_size(autoscale_config.image), - autoscale_config.scaleset_size, - nodes_needed, - ) - - if not autoscale_config.region: - raise Exception("Region is missing") - - Scaleset.create( - pool_name=pool.name, - vm_sku=autoscale_config.vm_sku, - image=autoscale_config.image, - region=autoscale_config.region, - size=max_nodes_scaleset, - spot_instances=autoscale_config.spot_instances, - tags={"pool": pool.name}, - ) - nodes_needed -= max_nodes_scaleset - - -def scale_down(scalesets: List[Scaleset], nodes_to_remove: int) -> None: - logging.info("Scaling down") - for scaleset in scalesets: - num_of_nodes = len(Node.search_states(scaleset_id=scaleset.scaleset_id)) - if scaleset.size != num_of_nodes and scaleset.state not in [ - ScalesetState.resize, - ScalesetState.shutdown, - ScalesetState.halt, - ]: - scaleset.state = ScalesetState.resize - scaleset.save() - - free_nodes = Node.search_states( - scaleset_id=scaleset.scaleset_id, - states=[NodeState.free], - ) - nodes = [] - for node in free_nodes: - if not node.delete_requested: - nodes.append(node) - logging.info("Scaleset: %s, #Free Nodes: %s", scaleset.scaleset_id, len(nodes)) - - if nodes and nodes_to_remove > 0: - max_nodes_remove = min(len(nodes), nodes_to_remove) - # All nodes in scaleset are free. 
Can shutdown VMSS - if max_nodes_remove >= scaleset.size and len(nodes) >= scaleset.size: - scaleset.state = ScalesetState.shutdown - nodes_to_remove = nodes_to_remove - scaleset.size - scaleset.save() - for node in nodes: - node.set_shutdown() - continue - - # Resize of VMSS needed - scaleset.size = scaleset.size - max_nodes_remove - nodes_to_remove = nodes_to_remove - max_nodes_remove - scaleset.state = ScalesetState.resize - scaleset.save() - - -def get_vm_count(tasks: List[Task]) -> int: - count = 0 - for task in tasks: - task_pool = task.get_pool() - if ( - not task_pool - or not isinstance(task_pool, Pool) - or not isinstance(task.config.pool, TaskPool) - ): - continue - count += task.config.pool.count - return count - - -def autoscale_pool(pool: Pool) -> None: - logging.info("autoscale: %s", pool.autoscale) - if not pool.autoscale: - return - - # get all the tasks (count not stopped) for the pool - tasks = Task.get_tasks_by_pool_name(pool.name) - logging.info("Pool: %s, #Tasks %d", pool.name, len(tasks)) - - num_of_tasks = get_vm_count(tasks) - nodes_needed = max(num_of_tasks, pool.autoscale.min_size) - if pool.autoscale.max_size: - nodes_needed = min(nodes_needed, pool.autoscale.max_size) - - # do scaleset logic match with pool - # get all the scalesets for the pool - scalesets = Scaleset.search_by_pool(pool.name) - pool_resize = False - for scaleset in scalesets: - if scaleset.state in ScalesetState.modifying(): - pool_resize = True - break - nodes_needed = nodes_needed - scaleset.size - - if pool_resize: - return - - logging.info("Pool: %s, #Nodes Needed: %d", pool.name, nodes_needed) - if nodes_needed > 0: - # resizing scaleset or creating new scaleset. - scale_up(pool, scalesets, nodes_needed) - elif nodes_needed < 0: - for scaleset in scalesets: - nodes = Node.search_states(scaleset_id=scaleset.scaleset_id) - for node in nodes: - if node.delete_requested: - nodes_needed += 1 - if nodes_needed < 0: - scale_down(scalesets, abs(nodes_needed)) diff --git a/src/api-service/__app__/onefuzzlib/azure/creds.py b/src/api-service/__app__/onefuzzlib/azure/creds.py index 4bf60a0bb9..3dc6e4ab29 100644 --- a/src/api-service/__app__/onefuzzlib/azure/creds.py +++ b/src/api-service/__app__/onefuzzlib/azure/creds.py @@ -16,7 +16,7 @@ from memoization import cached from msrestazure.azure_active_directory import MSIAuthentication from msrestazure.tools import parse_resource_id -from onefuzztypes.primitives import Container +from onefuzztypes.primitives import Container, Region from .monkeypatch import allow_more_workers, reduce_logging @@ -41,12 +41,12 @@ def get_base_resource_group() -> Any: # should be str @cached -def get_base_region() -> Any: # should be str +def get_base_region() -> Region: # should be str client = ResourceManagementClient( credential=get_identity(), subscription_id=get_subscription() ) group = client.resource_groups.get(get_base_resource_group()) - return group.location + return Region(group.location) @cached diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py new file mode 100644 index 0000000000..0084dff7b8 --- /dev/null +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
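This module replaces the deleted autoscale.py above with queue-driven shrinking: scaling down no longer mutates scaleset sizes directly, it posts shrink entries that free nodes consume. A rough sketch restating the branches of autoscale_pool defined below:

    # illustrative driver only; autoscale_pool below is the real entry point
    def reconcile(pool, scalesets, current_size, new_size):
        if new_size > current_size:
            scale_up(pool, scalesets, new_size - current_size)    # resize or create
        elif current_size > new_size:
            scale_down(pool, scalesets, current_size - new_size)  # enqueue shrink entries
            shutdown_empty_scalesets(pool, scalesets)             # reap drained scalesets
        else:
            shutdown_empty_scalesets(pool, scalesets)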
+ +import logging +from typing import List + +from onefuzztypes.enums import ScalesetState + +from ..azure.creds import get_base_region +from ..tasks.main import Task +from .nodes import Node +from .pools import Pool +from .scalesets import Scaleset +from .shrink_queue import ShrinkQueue + + +def set_shrink_queues(pool: Pool, scalesets: List[Scaleset], size: int) -> None: + for scaleset in scalesets: + ShrinkQueue(scaleset.scaleset_id).clear() + + ShrinkQueue(pool.pool_id).set_size(size) + + +def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: + logging.info( + "autoscale up - pool:%s to_add:%d scalesets:%s", + pool, + to_add, + [x.scaleset_id for x in scalesets], + ) + + config = pool.autoscale + if not config: + raise Exception(f"scaling up a non-autoscaling pool: {pool.name}") + + set_shrink_queues(pool, scalesets, 0) + + for scaleset in sorted(scalesets, key=lambda x: x.scaleset_id): + if to_add <= 0: + break + + if scaleset.state in ScalesetState.can_resize(): + scaleset_max_size = scaleset.max_size() + if scaleset.size < scaleset_max_size: + scaleset_to_add = min(to_add, scaleset_max_size - scaleset.size) + scaleset.size += scaleset_to_add + scaleset.state = ScalesetState.resize + scaleset.save() + to_add -= scaleset_to_add + + base_size = Scaleset.scaleset_max_size(config.image) + region = config.region or get_base_region() + while to_add > 0: + scaleset_size = min(base_size, to_add) + logging.info( + "autoscale adding scaleset. pool:%s size:%s", pool.name, scaleset_size + ) + scaleset = Scaleset.create( + pool_name=pool.name, + vm_sku=config.vm_sku, + image=config.image, + region=region, + size=scaleset_size, + spot_instances=config.spot_instances, + tags={"pool": pool.name}, + ) + logging.info("autoscale added scaleset:%s", scaleset.scaleset_id) + to_add -= scaleset_size + + +def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None: + for scaleset in scalesets: + nodes = Node.search_states(scaleset_id=scaleset.scaleset_id) + + if ( + not nodes + and scaleset.size == 0 + and scaleset.state not in ScalesetState.needs_work() + ): + logging.info( + "autoscale halting empty scaleset. 
pool:%s scaleset:%s", + pool.name, + scaleset.scaleset_id, + ) + scaleset.halt() + + +def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: + logging.info( + "autoscale down - pool:%s to_remove:%d scalesets:%s", + pool, + to_remove, + [x.scaleset_id for x in scalesets], + ) + + set_shrink_queues(pool, scalesets, to_remove) + + +def get_tasks_vm_count(tasks: List[Task]) -> int: + count = 0 + for task in tasks: + if task.config.pool: + count += task.config.pool.count + + if task.config.vm: + count += task.config.vm.count + + return count + + +def autoscale_pool(pool: Pool) -> None: + logging.info("autoscale: %s", pool.autoscale) + if not pool.autoscale: + return + + # get all the tasks (count not stopped) for the pool + tasks = Task.get_tasks_by_pool_name(pool.name) + logging.info("Pool: %s, #Tasks %d", pool.name, len(tasks)) + + num_of_tasks = get_tasks_vm_count(tasks) + new_size = max(num_of_tasks, pool.autoscale.min_size) + if pool.autoscale.max_size: + new_size = min(new_size, pool.autoscale.max_size) + + # do scaleset logic match with pool + # get all the scalesets for the pool + scalesets = Scaleset.search_by_pool(pool.name) + current_size = 0 + for scaleset in scalesets: + modifying = [ + x.scaleset_id for x in scalesets if x.state in ScalesetState.modifying() + ] + if modifying: + logging.info( + "pool has modifying scalesets, unable to autoscale: %s - %s", + pool.name, + modifying, + ) + return + current_size += scaleset.size + + logging.info( + "autoscale pool %s - current_size: %d new_size: %d", + pool.name, + current_size, + new_size, + ) + + if new_size > current_size: + scale_up(pool, scalesets, new_size - current_size) + elif current_size > new_size: + scale_down(pool, scalesets, current_size - new_size) + shutdown_empty_scalesets(pool, scalesets) + else: + shutdown_empty_scalesets(pool, scalesets) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index a188686ede..31647da82f 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -26,6 +26,7 @@ from ..azure.vmss import get_instance_id from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter +from .shrink_queue import ShrinkQueue NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1) NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7) @@ -46,12 +47,14 @@ def create( cls, *, pool_name: PoolName, + pool_id: UUID, machine_id: UUID, scaleset_id: Optional[UUID], version: str, ) -> "Node": node = cls( pool_name=pool_name, + pool_id=pool_id, machine_id=machine_id, scaleset_id=scaleset_id, version=version, @@ -70,13 +73,16 @@ def create( def search_states( cls, *, + pool_id: Optional[UUID] = None, scaleset_id: Optional[UUID] = None, states: Optional[List[NodeState]] = None, - pool_name: Optional[str] = None, + pool_name: Optional[PoolName] = None, ) -> List["Node"]: query: QueryFilter = {} if scaleset_id: query["scaleset_id"] = [scaleset_id] + if pool_id: + query["pool_id"] = [pool_id] if states: query["state"] = states if pool_name: @@ -87,13 +93,16 @@ def search_states( def search_outdated( cls, *, + pool_id: Optional[UUID] = None, scaleset_id: Optional[UUID] = None, states: Optional[List[NodeState]] = None, - pool_name: Optional[str] = None, + pool_name: Optional[PoolName] = None, exclude_update_scheduled: bool = False, num_results: Optional[int] = None, ) -> List["Node"]: query: QueryFilter = {} + if 
pool_id: + query["pool_id"] = [pool_id] if scaleset_id: query["scaleset_id"] = [scaleset_id] if states: @@ -195,10 +204,13 @@ def mark_tasks_stopped_early(self) -> None: ) def could_shrink_scaleset(self) -> bool: - from .scalesets import ScalesetShrinkQueue + if self.scaleset_id: + if ShrinkQueue(self.scaleset_id).should_shrink(): + return True + if self.pool_id: + if ShrinkQueue(self.pool_id).should_shrink(): + return True - if self.scaleset_id and ScalesetShrinkQueue(self.scaleset_id).should_shrink(): - return True return False def can_process_new_work(self) -> bool: diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index f8ae1d2bc3..b2e8f91f5e 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -24,6 +24,7 @@ from ..azure.storage import StorageType from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter +from .shrink_queue import ShrinkQueue NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1) NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7) @@ -126,6 +127,7 @@ def get_pool_queue(self) -> str: def init(self) -> None: create_queue(self.get_pool_queue(), StorageType.corpus) + ShrinkQueue(self.pool_id).create() self.state = PoolState.running self.save() @@ -236,4 +238,5 @@ def key_fields(cls) -> Tuple[str, str]: def delete(self) -> None: super().delete() + ShrinkQueue(self.pool_id).delete() send_event(EventPoolDeleted(pool_name=self.name)) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index eeb9b3161d..c8674528a5 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -6,7 +6,7 @@ import datetime import logging from typing import Any, Dict, List, Optional, Tuple, Union -from uuid import UUID, uuid4 +from uuid import UUID from onefuzztypes.enums import ErrorCode, NodeState, PoolState, ScalesetState from onefuzztypes.events import ( @@ -18,19 +18,10 @@ from onefuzztypes.models import Scaleset as BASE_SCALESET from onefuzztypes.models import ScalesetNodeState from onefuzztypes.primitives import PoolName, Region -from pydantic import BaseModel, Field from ..azure.auth import build_auth from ..azure.image import get_os from ..azure.network import Network -from ..azure.queue import ( - clear_queue, - create_queue, - delete_queue, - queue_object, - remove_first_message, -) -from ..azure.storage import StorageType from ..azure.vmss import ( UnableToUpdate, create_vmss, @@ -47,6 +38,7 @@ from ..extension import fuzz_extensions from ..orm import MappingIntStrAny, ORMMixin, QueryFilter from .nodes import Node +from .shrink_queue import ShrinkQueue NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1) NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7) @@ -151,7 +143,7 @@ def init(self) -> None: logging.info("scaleset init: %s", self.scaleset_id) - ScalesetShrinkQueue(self.scaleset_id).create() + ShrinkQueue(self.scaleset_id).create() # Handle the race condition between a pool being deleted and a # scaleset being added to the pool. 
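Because ShrinkQueue (split out into workers/shrink_queue.py in this patch) is keyed by an arbitrary UUID rather than specifically a scaleset ID, one implementation now backs both a pool-wide queue named after pool_id and per-scaleset queues named after scaleset_id. A small sketch of the two-level check, mirroring Node.could_shrink_scaleset above; it assumes ShrinkQueue is in scope:

    # a node shuts down if either its scaleset's or its pool's queue says so
    def should_node_shrink(node):
        if node.scaleset_id and ShrinkQueue(node.scaleset_id).should_shrink():
            return True  # an explicit scaleset resize requested fewer nodes
        if node.pool_id and ShrinkQueue(node.pool_id).should_shrink():
            return True  # pool-level autoscale requested fewer nodes
        return False

Note that should_shrink pops a queue message as a side effect, so each enqueued entry releases exactly one node.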
@@ -277,11 +269,23 @@ def get_error() -> Error: # result = 'did I modify the scaleset in azure' def cleanup_nodes(self) -> bool: + from .pools import Pool + if self.state == ScalesetState.halt: logging.info("halting scaleset: %s", self.scaleset_id) self.halt() return True + pool = Pool.get_by_name(self.pool_name) + if isinstance(pool, Error): + logging.error( + "unable to find pool during cleanup: %s - %s", + self.scaleset_id, + pool, + ) + self.set_failed(pool) + return True + Node.reimage_long_lived_nodes(self.scaleset_id) to_reimage = [] @@ -309,7 +313,10 @@ def cleanup_nodes(self) -> bool: if node.delete_requested: to_delete.append(node) else: - if ScalesetShrinkQueue(self.scaleset_id).should_shrink(): + if ShrinkQueue(pool.pool_id).should_shrink(): + node.set_halt() + to_delete.append(node) + elif ShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) elif not node.reimage_queued: @@ -365,7 +372,7 @@ def _resize_grow(self) -> None: return def _resize_shrink(self, to_remove: int) -> None: - queue = ScalesetShrinkQueue(self.scaleset_id) + queue = ShrinkQueue(self.scaleset_id) for _ in range(to_remove): queue.add_entry() @@ -377,7 +384,7 @@ def resize(self) -> None: logging.info("scaleset resize: %s - %s", self.scaleset_id, self.size) # reset the node delete queue - ScalesetShrinkQueue(self.scaleset_id).clear() + ShrinkQueue(self.scaleset_id).clear() # just in case, always ensure size is within max capacity self.size = min(self.size, self.max_size()) @@ -473,7 +480,7 @@ def shutdown(self) -> None: self.halt() def halt(self) -> None: - ScalesetShrinkQueue(self.scaleset_id).delete() + ShrinkQueue(self.scaleset_id).delete() for node in Node.search_states(scaleset_id=self.scaleset_id): logging.info("deleting node %s:%s", self.scaleset_id, node.machine_id) @@ -569,33 +576,7 @@ def key_fields(cls) -> Tuple[str, str]: def delete(self) -> None: super().delete() + ShrinkQueue(self.scaleset_id).delete() send_event( EventScalesetDeleted(scaleset_id=self.scaleset_id, pool_name=self.pool_name) ) - - -class ShrinkEntry(BaseModel): - shrink_id: UUID = Field(default_factory=uuid4) - - -class ScalesetShrinkQueue: - def __init__(self, scaleset_id: UUID): - self.scaleset_id = scaleset_id - - def queue_name(self) -> str: - return "to-shrink-%s" % self.scaleset_id.hex - - def clear(self) -> None: - clear_queue(self.queue_name(), StorageType.config) - - def create(self) -> None: - create_queue(self.queue_name(), StorageType.config) - - def delete(self) -> None: - delete_queue(self.queue_name(), StorageType.config) - - def add_entry(self) -> None: - queue_object(self.queue_name(), ShrinkEntry(), StorageType.config) - - def should_shrink(self) -> bool: - return remove_first_message(self.queue_name(), StorageType.config) diff --git a/src/api-service/__app__/onefuzzlib/workers/shrink_queue.py b/src/api-service/__app__/onefuzzlib/workers/shrink_queue.py new file mode 100644 index 0000000000..8f5e944401 --- /dev/null +++ b/src/api-service/__app__/onefuzzlib/workers/shrink_queue.py @@ -0,0 +1,44 @@ +from uuid import UUID, uuid4 + +from pydantic import BaseModel, Field + +from ..azure.queue import ( + clear_queue, + create_queue, + delete_queue, + queue_object, + remove_first_message, +) +from ..azure.storage import StorageType + + +class ShrinkEntry(BaseModel): + shrink_id: UUID = Field(default_factory=uuid4) + + +class ShrinkQueue: + def __init__(self, base_id: UUID): + self.base_id = base_id + + def queue_name(self) -> str: + return "to-shrink-%s" % self.base_id.hex + + def 
clear(self) -> None: + clear_queue(self.queue_name(), StorageType.config) + + def create(self) -> None: + create_queue(self.queue_name(), StorageType.config) + + def delete(self) -> None: + delete_queue(self.queue_name(), StorageType.config) + + def add_entry(self) -> None: + queue_object(self.queue_name(), ShrinkEntry(), StorageType.config) + + def set_size(self, size: int) -> None: + self.clear() + for _ in range(size): + self.add_entry() + + def should_shrink(self) -> bool: + return remove_first_message(self.queue_name(), StorageType.config) diff --git a/src/api-service/__app__/timer_workers/__init__.py b/src/api-service/__app__/timer_workers/__init__.py index 09bac76c07..a42d3e5174 100644 --- a/src/api-service/__app__/timer_workers/__init__.py +++ b/src/api-service/__app__/timer_workers/__init__.py @@ -8,9 +8,9 @@ import azure.functions as func from onefuzztypes.enums import NodeState, PoolState -from ..onefuzzlib.autoscale import autoscale_pool from ..onefuzzlib.events import get_events from ..onefuzzlib.orm import process_state_updates +from ..onefuzzlib.workers.autoscale import autoscale_pool from ..onefuzzlib.workers.nodes import Node from ..onefuzzlib.workers.pools import Pool from ..onefuzzlib.workers.scalesets import Scaleset @@ -38,7 +38,8 @@ def main(mytimer: func.TimerRequest, dashboard: func.Out[str]) -> None: # noqa: if pool.state in PoolState.needs_work(): logging.info("update pool: %s (%s)", pool.pool_id, pool.name) process_state_updates(pool) - elif pool.state in PoolState.available() and pool.autoscale: + + if pool.state in PoolState.available() and pool.autoscale: autoscale_pool(pool) Node.mark_outdated_nodes() diff --git a/src/api-service/tests/__init__.py b/src/api-service/tests/__init__.py index 5c0f53d94b..d2effe12b0 100644 --- a/src/api-service/tests/__init__.py +++ b/src/api-service/tests/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. +#!/usr/bin/env python +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/src/api-service/tests/test_autoscale.py b/src/api-service/tests/test_autoscale.py index b56fc6164c..656eba3d42 100644 --- a/src/api-service/tests/test_autoscale.py +++ b/src/api-service/tests/test_autoscale.py @@ -1,65 +1,65 @@ -#!/usr/bin/env python -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
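ShrinkQueue.set_size above is the entire scale-down protocol: clear stale entries, then enqueue one ShrinkEntry per node to remove. As a worked example, set_shrink_queues(pool, scalesets, 3) from the new autoscale module reduces to the following for a pool with two scalesets (names invented):

    # per-scaleset queues are cleared so explicit resizes don't double-count
    ShrinkQueue(scaleset_a.scaleset_id).clear()
    ShrinkQueue(scaleset_b.scaleset_id).clear()
    # the pool queue ends up holding exactly three entries; the next three
    # nodes that poll should_shrink() each pop one entry and halt
    ShrinkQueue(pool.pool_id).set_size(3)

The timer_workers change from elif to if is related: a pool that just processed a state update can now also be autoscaled in the same timer pass instead of waiting for the next one.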
- -import unittest -from unittest.mock import MagicMock, patch -from uuid import UUID - -from onefuzztypes.enums import OS, Architecture, ContainerType, TaskType -from onefuzztypes.models import TaskConfig, TaskContainers, TaskDetails, TaskPool -from onefuzztypes.primitives import Container, PoolName - -from __app__.onefuzzlib.autoscale import autoscale_pool, get_vm_count -from __app__.onefuzzlib.tasks.main import Task -from __app__.onefuzzlib.workers.pools import Pool - - -class TestAutoscale(unittest.TestCase): - @patch("__app__.onefuzzlib.tasks.main.Task.get_tasks_by_pool_name") - def test_autoscale_pool(self, mock_get_tasks_by_pool_name: MagicMock) -> None: - pool = Pool( - name=PoolName("test-pool"), - pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - os=OS.linux, - managed=False, - arch=Architecture.x86_64, - ) - autoscale_pool(pool=pool) - mock_get_tasks_by_pool_name.assert_not_called() - - @patch("__app__.onefuzzlib.tasks.main.Task.get_pool") - def test_get_vm_count(self, mock_get_pool: MagicMock) -> None: - self.assertEqual(get_vm_count([]), 0) - - task_config = TaskConfig( - job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - containers=[ - TaskContainers( - type=ContainerType.inputs, name=Container("test-container") - ) - ], - tags={}, - task=TaskDetails( - type=TaskType.libfuzzer_fuzz, - duration=12, - target_exe="fuzz.exe", - target_env={}, - target_options=[], - ), - pool=TaskPool(count=2, pool_name=PoolName("test-pool")), - ) - task = Task( - job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - os=OS.linux, - config=task_config, - ) - mock_get_pool.return_value = Pool( - name=PoolName("test-pool"), - pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - os=OS.linux, - managed=False, - arch=Architecture.x86_64, - ) - self.assertEqual(get_vm_count([task]), 2) +#!/usr/bin/env python +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
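The rewritten test below exercises get_tasks_vm_count, which sums the requested node counts across task configs. A self-contained illustration with stand-in objects, since the real Task and TaskConfig models require many more fields (as the test itself shows):

    from __app__.onefuzzlib.workers.autoscale import get_tasks_vm_count

    class _Counted:
        def __init__(self, count):
            self.count = count

    class _Config:
        def __init__(self, pool=None, vm=None):
            self.pool, self.vm = pool, vm

    class _Task:
        def __init__(self, config):
            self.config = config

    tasks = [_Task(_Config(pool=_Counted(3))), _Task(_Config(vm=_Counted(2)))]
    assert get_tasks_vm_count(tasks) == 5  # 3 pool nodes + 2 vm nodes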
+ +import unittest +from unittest.mock import MagicMock, patch +from uuid import UUID + +from onefuzztypes.enums import OS, Architecture, ContainerType, TaskType +from onefuzztypes.models import TaskConfig, TaskContainers, TaskDetails, TaskPool +from onefuzztypes.primitives import Container, PoolName + +from __app__.onefuzzlib.tasks.main import Task +from __app__.onefuzzlib.workers.autoscale import autoscale_pool, get_tasks_vm_count +from __app__.onefuzzlib.workers.pools import Pool + + +class TestAutoscale(unittest.TestCase): + @patch("__app__.onefuzzlib.tasks.main.Task.get_tasks_by_pool_name") + def test_autoscale_pool(self, mock_get_tasks_by_pool_name: MagicMock) -> None: + pool = Pool( + name=PoolName("test-pool"), + pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), + os=OS.linux, + managed=False, + arch=Architecture.x86_64, + ) + autoscale_pool(pool=pool) + mock_get_tasks_by_pool_name.assert_not_called() + + @patch("__app__.onefuzzlib.tasks.main.Task.get_pool") + def test_get_vm_count(self, mock_get_pool: MagicMock) -> None: + self.assertEqual(get_tasks_vm_count([]), 0) + + task_config = TaskConfig( + job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), + containers=[ + TaskContainers( + type=ContainerType.inputs, name=Container("test-container") + ) + ], + tags={}, + task=TaskDetails( + type=TaskType.libfuzzer_fuzz, + duration=12, + target_exe="fuzz.exe", + target_env={}, + target_options=[], + ), + pool=TaskPool(count=2, pool_name=PoolName("test-pool")), + ) + task = Task( + job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), + os=OS.linux, + config=task_config, + ) + mock_get_pool.return_value = Pool( + name=PoolName("test-pool"), + pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), + os=OS.linux, + managed=False, + arch=Architecture.x86_64, + ) + self.assertEqual(get_tasks_vm_count([task]), 2) diff --git a/src/api-service/tests/test_scheduler.py b/src/api-service/tests/test_scheduler.py index 8f959c90bd..b156b8715b 100644 --- a/src/api-service/tests/test_scheduler.py +++ b/src/api-service/tests/test_scheduler.py @@ -1,147 +1,147 @@ -#!/usr/bin/env python -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
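The scheduler tests below pin down the bucketing dimensions without showing bucket_tasks itself: tasks separate whenever job, OS, setup container, pool, or the colocate flag differ. A hypothetical key consistent with those tests; the real implementation may differ:

    from uuid import uuid4

    from onefuzztypes.enums import ContainerType

    # sketch only: a fresh uuid4() isolates tasks that opt out of colocation
    def bucket_key(task):
        job = task.job_id if task.config.colocate else uuid4()
        pool = task.config.pool.pool_name if task.config.pool else None
        setup = tuple(
            c.name for c in task.config.containers
            if c.type == ContainerType.setup
        )
        return (job, task.os, pool, setup)

Under such a key, test_many_buckets splitting on job parity, OS, setup container, and pool yields its expected 12 buckets: 16 combinations minus the 4 that cannot occur, since setup2 only appears at even indices that already share a job.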
- -import unittest -from typing import Dict, Generator, List, TypeVar -from uuid import UUID, uuid4 - -from onefuzztypes.enums import OS, ContainerType, TaskType -from onefuzztypes.models import TaskConfig, TaskContainers, TaskDetails, TaskPool -from onefuzztypes.primitives import Container, PoolName - -from __app__.onefuzzlib.tasks.main import Task -from __app__.onefuzzlib.tasks.scheduler import bucket_tasks - -A = TypeVar("A") - - -def chunks(items: List[A], size: int) -> Generator[List[A], None, None]: - return (items[x : x + size] for x in range(0, len(items), size)) - - -class TestTaskBuckets(unittest.TestCase): - def build_tasks(self, size: int) -> List[Task]: - tasks = [] - for _ in range(size): - task = Task( - job_id=UUID(int=0), - config=TaskConfig( - job_id=UUID(int=0), - task=TaskDetails( - type=TaskType.libfuzzer_fuzz, - duration=1, - target_exe="fuzz.exe", - target_env={}, - target_options=[], - ), - pool=TaskPool(pool_name=PoolName("pool"), count=1), - containers=[ - TaskContainers( - type=ContainerType.setup, name=Container("setup") - ) - ], - tags={}, - colocate=True, - ), - os=OS.linux, - ) - tasks.append(task) - return tasks - - def test_all_colocate(self) -> None: - # all tasks should land in one bucket - tasks = self.build_tasks(10) - for task in tasks: - task.config.colocate = True - - buckets = bucket_tasks(tasks) - - for bucket in buckets.values(): - self.assertEqual(len(bucket), 10) - - self.check_buckets(buckets, tasks, bucket_count=1) - - def test_partial_colocate(self) -> None: - # 2 tasks should land on their own, the rest should be colocated into a - # single bucket. - - tasks = self.build_tasks(10) - - # a the task came before colocation was defined - tasks[0].config.colocate = None - - # a the task shouldn't be colocated - tasks[1].config.colocate = False - - buckets = bucket_tasks(tasks) - - lengths = [] - for bucket in buckets.values(): - lengths.append(len(bucket)) - self.assertEqual([1, 1, 8], sorted(lengths)) - self.check_buckets(buckets, tasks, bucket_count=3) - - def test_all_unique_job(self) -> None: - # everything has a unique job_id - tasks = self.build_tasks(10) - for task in tasks: - job_id = uuid4() - task.job_id = job_id - task.config.job_id = job_id - - buckets = bucket_tasks(tasks) - - for bucket in buckets.values(): - self.assertEqual(len(bucket), 1) - - self.check_buckets(buckets, tasks, bucket_count=10) - - def test_multiple_job_buckets(self) -> None: - # at most 3 tasks per bucket, by job_id - tasks = self.build_tasks(10) - for task_chunks in chunks(tasks, 3): - job_id = uuid4() - for task in task_chunks: - task.job_id = job_id - task.config.job_id = job_id - - buckets = bucket_tasks(tasks) - - for bucket in buckets.values(): - self.assertLessEqual(len(bucket), 3) - - self.check_buckets(buckets, tasks, bucket_count=4) - - def test_many_buckets(self) -> None: - tasks = self.build_tasks(100) - job_id = UUID(int=1) - for i, task in enumerate(tasks): - if i % 2 == 0: - task.job_id = job_id - task.config.job_id = job_id - - if i % 3 == 0: - task.os = OS.windows - - if i % 4 == 0: - task.config.containers[0].name = Container("setup2") - - if i % 5 == 0: - if task.config.pool: - task.config.pool.pool_name = PoolName("alternate-pool") - - buckets = bucket_tasks(tasks) - self.check_buckets(buckets, tasks, bucket_count=12) - - def check_buckets(self, buckets: Dict, tasks: List, *, bucket_count: int) -> None: - self.assertEqual(len(buckets), bucket_count, "bucket count") - - for task in tasks: - seen = False - for bucket in buckets.values(): - if 
task in bucket: - self.assertEqual(seen, False, "task seen in multiple buckets") - seen = True - self.assertEqual(seen, True, "task not seein in any buckets") +#!/usr/bin/env python +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import unittest +from typing import Dict, Generator, List, TypeVar +from uuid import UUID, uuid4 + +from onefuzztypes.enums import OS, ContainerType, TaskType +from onefuzztypes.models import TaskConfig, TaskContainers, TaskDetails, TaskPool +from onefuzztypes.primitives import Container, PoolName + +from __app__.onefuzzlib.tasks.main import Task +from __app__.onefuzzlib.tasks.scheduler import bucket_tasks + +A = TypeVar("A") + + +def chunks(items: List[A], size: int) -> Generator[List[A], None, None]: + return (items[x : x + size] for x in range(0, len(items), size)) + + +class TestTaskBuckets(unittest.TestCase): + def build_tasks(self, size: int) -> List[Task]: + tasks = [] + for _ in range(size): + task = Task( + job_id=UUID(int=0), + config=TaskConfig( + job_id=UUID(int=0), + task=TaskDetails( + type=TaskType.libfuzzer_fuzz, + duration=1, + target_exe="fuzz.exe", + target_env={}, + target_options=[], + ), + pool=TaskPool(pool_name=PoolName("pool"), count=1), + containers=[ + TaskContainers( + type=ContainerType.setup, name=Container("setup") + ) + ], + tags={}, + colocate=True, + ), + os=OS.linux, + ) + tasks.append(task) + return tasks + + def test_all_colocate(self) -> None: + # all tasks should land in one bucket + tasks = self.build_tasks(10) + for task in tasks: + task.config.colocate = True + + buckets = bucket_tasks(tasks) + + for bucket in buckets.values(): + self.assertEqual(len(bucket), 10) + + self.check_buckets(buckets, tasks, bucket_count=1) + + def test_partial_colocate(self) -> None: + # 2 tasks should land on their own, the rest should be colocated into a + # single bucket. 
+ + tasks = self.build_tasks(10) + + # a the task came before colocation was defined + tasks[0].config.colocate = None + + # a the task shouldn't be colocated + tasks[1].config.colocate = False + + buckets = bucket_tasks(tasks) + + lengths = [] + for bucket in buckets.values(): + lengths.append(len(bucket)) + self.assertEqual([1, 1, 8], sorted(lengths)) + self.check_buckets(buckets, tasks, bucket_count=3) + + def test_all_unique_job(self) -> None: + # everything has a unique job_id + tasks = self.build_tasks(10) + for task in tasks: + job_id = uuid4() + task.job_id = job_id + task.config.job_id = job_id + + buckets = bucket_tasks(tasks) + + for bucket in buckets.values(): + self.assertEqual(len(bucket), 1) + + self.check_buckets(buckets, tasks, bucket_count=10) + + def test_multiple_job_buckets(self) -> None: + # at most 3 tasks per bucket, by job_id + tasks = self.build_tasks(10) + for task_chunks in chunks(tasks, 3): + job_id = uuid4() + for task in task_chunks: + task.job_id = job_id + task.config.job_id = job_id + + buckets = bucket_tasks(tasks) + + for bucket in buckets.values(): + self.assertLessEqual(len(bucket), 3) + + self.check_buckets(buckets, tasks, bucket_count=4) + + def test_many_buckets(self) -> None: + tasks = self.build_tasks(100) + job_id = UUID(int=1) + for i, task in enumerate(tasks): + if i % 2 == 0: + task.job_id = job_id + task.config.job_id = job_id + + if i % 3 == 0: + task.os = OS.windows + + if i % 4 == 0: + task.config.containers[0].name = Container("setup2") + + if i % 5 == 0: + if task.config.pool: + task.config.pool.pool_name = PoolName("alternate-pool") + + buckets = bucket_tasks(tasks) + self.check_buckets(buckets, tasks, bucket_count=12) + + def check_buckets(self, buckets: Dict, tasks: List, *, bucket_count: int) -> None: + self.assertEqual(len(buckets), bucket_count, "bucket count") + + for task in tasks: + seen = False + for bucket in buckets.values(): + if task in bucket: + self.assertEqual(seen, False, "task seen in multiple buckets") + seen = True + self.assertEqual(seen, True, "task not seein in any buckets") diff --git a/src/api-service/tests/test_webhook_hmac.py b/src/api-service/tests/test_webhook_hmac.py index ec976f39bb..8298148672 100644 --- a/src/api-service/tests/test_webhook_hmac.py +++ b/src/api-service/tests/test_webhook_hmac.py @@ -1,51 +1,51 @@ -#!/usr/bin/env python -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
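The expected digest in this test is 128 hex characters, consistent with HMAC-SHA512 over the serialized event body; the hash choice is inferred from the digest length, not stated by this diff. A receiver-side verification sketch under that assumption:

    import hashlib
    import hmac

    def verify(body: bytes, signature: str, secret: str) -> bool:
        expected = hmac.new(secret.encode(), body, hashlib.sha512).hexdigest()
        # compare_digest avoids leaking timing information to the sender
        return hmac.compare_digest(expected, signature)

The digest-is-None assertion covers webhooks configured without a secret_token, in which case the payload is sent unsigned.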
- -import unittest -from uuid import UUID - -from onefuzztypes.events import EventPing, EventType - -from __app__.onefuzzlib.webhooks import build_message - - -class TestWebhookHmac(unittest.TestCase): - def test_webhook_hmac(self) -> None: - webhook_id = UUID(int=0) - event_id = UUID(int=1) - event_type = EventType.ping - event = EventPing(ping_id=UUID(int=2)) - - data, digest = build_message( - webhook_id=webhook_id, event_id=event_id, event_type=event_type, event=event - ) - - expected = ( - b"{" - b'"event": {"ping_id": "00000000-0000-0000-0000-000000000002"}, ' - b'"event_id": "00000000-0000-0000-0000-000000000001", ' - b'"event_type": "ping", ' - b'"webhook_id": "00000000-0000-0000-0000-000000000000"' - b"}" - ) - - expected_digest = ( - "3502f83237ce006b7f6cfa40b89c0295009e3ccb0a1e62ce1d689700c2c6e698" - "61c0de81e011495c2ca89fbf99485b841cee257bcfba326a3edc66f39dc1feec" - ) - - print(repr(expected)) - self.assertEqual(data, expected) - self.assertEqual(digest, None) - - data, digest = build_message( - webhook_id=webhook_id, - event_id=event_id, - event_type=event_type, - event=event, - secret_token="hello there", - ) - self.assertEqual(data, expected) - self.assertEqual(digest, expected_digest) +#!/usr/bin/env python +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import unittest +from uuid import UUID + +from onefuzztypes.events import EventPing, EventType + +from __app__.onefuzzlib.webhooks import build_message + + +class TestWebhookHmac(unittest.TestCase): + def test_webhook_hmac(self) -> None: + webhook_id = UUID(int=0) + event_id = UUID(int=1) + event_type = EventType.ping + event = EventPing(ping_id=UUID(int=2)) + + data, digest = build_message( + webhook_id=webhook_id, event_id=event_id, event_type=event_type, event=event + ) + + expected = ( + b"{" + b'"event": {"ping_id": "00000000-0000-0000-0000-000000000002"}, ' + b'"event_id": "00000000-0000-0000-0000-000000000001", ' + b'"event_type": "ping", ' + b'"webhook_id": "00000000-0000-0000-0000-000000000000"' + b"}" + ) + + expected_digest = ( + "3502f83237ce006b7f6cfa40b89c0295009e3ccb0a1e62ce1d689700c2c6e698" + "61c0de81e011495c2ca89fbf99485b841cee257bcfba326a3edc66f39dc1feec" + ) + + print(repr(expected)) + self.assertEqual(data, expected) + self.assertEqual(digest, None) + + data, digest = build_message( + webhook_id=webhook_id, + event_id=event_id, + event_type=event_type, + event=event, + secret_token="hello there", + ) + self.assertEqual(data, expected) + self.assertEqual(digest, expected_digest) diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index 96d3765b73..be0599a9b7 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -314,6 +314,13 @@ def modifying(cls) -> List["ScalesetState"]: cls.setup, ] + @classmethod + def can_resize(cls) -> List["ScalesetState"]: + """ + set of states that indicate the scaleset can be resized + """ + return [cls.running, cls.resize] + class Architecture(Enum): x86_64 = "x86_64" diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index 2163aae6db..6d8773dfae 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -527,6 +527,7 @@ class NodeHeartbeatEntry(BaseModel): class Node(BaseModel): pool_name: PoolName + pool_id: Optional[UUID] machine_id: UUID state: NodeState = Field(default=NodeState.init) scaleset_id: Optional[UUID] = None @@ -554,16 +555,9 @@ class AutoScaleConfig(BaseModel): max_size: Optional[int] # max size 
of pool min_size: int = Field(default=0) # min size of pool region: Optional[Region] - scaleset_size: int # Individual scaleset size spot_instances: bool = Field(default=False) vm_sku: str - @validator("scaleset_size", allow_reuse=True) - def check_scaleset_size(cls, value: int) -> int: - if value < 1 or value > 1000: - raise ValueError("invalid scaleset size") - return value - @root_validator() def check_data(cls, values: Any) -> Any: if ( diff --git a/src/pytypes/onefuzztypes/requests.py b/src/pytypes/onefuzztypes/requests.py index 603367a46e..9511cf9c3a 100644 --- a/src/pytypes/onefuzztypes/requests.py +++ b/src/pytypes/onefuzztypes/requests.py @@ -135,7 +135,7 @@ class NodeSearch(BaseRequest): machine_id: Optional[UUID] state: Optional[List[NodeState]] scaleset_id: Optional[UUID] - pool_name: Optional[str] + pool_name: Optional[PoolName] class NodeGet(BaseRequest): From 2d82caff818c69b6d35b806f952482285b6cb1c2 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 10:59:31 -0500 Subject: [PATCH 02/66] regen --- docs/webhook_events.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 4e8ec086ea..13fdeb6157 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -647,10 +647,6 @@ Each event will be submitted via HTTP POST to the user provided URL. "title": "Region", "type": "string" }, - "scaleset_size": { - "title": "Scaleset Size", - "type": "integer" - }, "spot_instances": { "default": false, "title": "Spot Instances", @@ -663,7 +659,6 @@ Each event will be submitted via HTTP POST to the user provided URL. }, "required": [ "image", - "scaleset_size", "vm_sku" ], "title": "AutoScaleConfig", @@ -1798,10 +1793,6 @@ Each event will be submitted via HTTP POST to the user provided URL. "title": "Region", "type": "string" }, - "scaleset_size": { - "title": "Scaleset Size", - "type": "integer" - }, "spot_instances": { "default": false, "title": "Spot Instances", @@ -1814,7 +1805,6 @@ Each event will be submitted via HTTP POST to the user provided URL. }, "required": [ "image", - "scaleset_size", "vm_sku" ], "title": "AutoScaleConfig", From 66d684fe89b231044a2dd26d9305c4b8db01b842 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 11:14:08 -0500 Subject: [PATCH 03/66] continued dev --- .../__app__/onefuzzlib/workers/autoscale.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 0084dff7b8..11b3135fde 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -3,6 +3,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +# NOTE: Set ONEFUZZ_SCALESET_MAX_SIZE environment variable to artificially set +# the maximum size of a scaleset for testing. 
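The override described in this comment is read further down in the hunk via os.environ.get("ONEFUZZ_SCALESET_MAX_SIZE") and can only lower the computed cap, never raise it beyond the image's real limit. A small illustration with invented values:

    import os

    # pretend scalesets top out at five instances for a local test run
    os.environ["ONEFUZZ_SCALESET_MAX_SIZE"] = "5"
    # inside scale_up: base_size = min(base_size, 5), so a request for 12
    # new nodes now produces scalesets of 5, 5, and 2 instances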
+ +import os import logging from typing import List @@ -45,13 +49,25 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: scaleset_max_size = scaleset.max_size() if scaleset.size < scaleset_max_size: scaleset_to_add = min(to_add, scaleset_max_size - scaleset.size) + logging.info( + "autoscale adding to scaleset: pool:%s scaleset:%s existing_size:%d adding:%d", + pool.name, + scaleset.scaleset_id, + scaleset.size, + scaleset_to_add, + ) scaleset.size += scaleset_to_add scaleset.state = ScalesetState.resize scaleset.save() to_add -= scaleset_to_add - base_size = Scaleset.scaleset_max_size(config.image) region = config.region or get_base_region() + base_size = Scaleset.scaleset_max_size(config.image) + + alternate_max_size = os.environ.get("ONEFUZZ_SCALESET_MAX_SIZE") + if alternate_max_size is not None: + base_size = min(base_size, int(alternate_max_size)) + while to_add > 0: scaleset_size = min(base_size, to_add) logging.info( From ba2fbf72b2c4f4f456ae7b87753bc988644f2460 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 11:28:54 -0500 Subject: [PATCH 04/66] add autoscaling to CLI --- src/cli/onefuzz/api.py | 35 ++++++++++++++++++++++++++++++++++- src/cli/onefuzz/cli.py | 3 ++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index 44e7a4958b..384c0c4b45 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -1032,6 +1032,7 @@ def create( *, unmanaged: bool = False, arch: enums.Architecture = enums.Architecture.x86_64, + autoscale_config: Optional[models.AutoScaleConfig] = None, ) -> models.Pool: """ Create a worker pool @@ -1045,10 +1046,42 @@ def create( "POST", models.Pool, data=requests.PoolCreate( - name=name, os=os, arch=arch, managed=managed, client_id=client_id + name=name, os=os, arch=arch, managed=managed, client_id=client_id, autoscale_config=autoscale_config ), ) + def create_autoscale( + self, + name: str, + os: enums.OS, + client_id: Optional[UUID] = None, + *, + arch: enums.Architecture = enums.Architecture.x86_64, + min_size: Optional[int] = None, + max_size: Optional[int] = None, + image: Optional[str] = None, + vm_sku: Optional[str] = "Standard_D2s_v3", + region: Optional[primitives.Region] = None, + spot_instances: bool = False, + ) -> models.Pool: + if image is None: + if os == enums.OS.linux: + image = DEFAULT_LINUX_IMAGE + elif os == enums.OS.windows: + image = DEFAULT_WINDOWS_IMAGE + else: + raise NotImplementedError + + autoscale_config = models.AutoScaleConfig( + image=image, + max_size=max_size, + min_size=min_size, + region=region, + spot_instances=spot_instances, + vm_sku=vm_sku, + ) + return self.create(name, os, client_id, unmanaged=False, arch=arch, autoscale_config=autoscale_config) + def get_config(self, pool_name: str) -> models.AgentConfig: """ Get the agent configuration for the pool """ diff --git a/src/cli/onefuzz/cli.py b/src/cli/onefuzz/cli.py index 2f94c1783a..3305474afb 100644 --- a/src/cli/onefuzz/cli.py +++ b/src/cli/onefuzz/cli.py @@ -32,7 +32,7 @@ import jmespath from docstring_parser import parse as parse_docstring from msrest.serialization import Model -from onefuzztypes.primitives import Container, Directory, File +from onefuzztypes.primitives import Container, Directory, File, Region from pydantic import BaseModel, ValidationError LOGGER = logging.getLogger("cli") @@ -158,6 +158,7 @@ def __init__(self, api_types: List[Any]): int: {"type": int}, UUID: {"type": UUID}, Container: {"type": str}, + Region: {"type": str}, File: 
{"type": arg_file}, Directory: {"type": arg_dir}, } From bcbddca3190c95bfd07ba6712eade6da09429734 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 11:31:05 -0500 Subject: [PATCH 05/66] DRY for default image --- src/cli/onefuzz/api.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index 384c0c4b45..f7e6d2fdb2 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -74,6 +74,15 @@ def _wsl_path(path: str) -> str: return path +def _get_default_image(os: enums.OS) -> str: + if os == enums.OS.linux: + return DEFAULT_LINUX_IMAGE + elif os == enums.OS.windows: + return DEFAULT_WINDOWS_IMAGE + else: + raise NotImplementedError + + def user_confirmation(message: str) -> bool: answer: Optional[str] = None while answer not in ["y", "n"]: @@ -1046,7 +1055,12 @@ def create( "POST", models.Pool, data=requests.PoolCreate( - name=name, os=os, arch=arch, managed=managed, client_id=client_id, autoscale_config=autoscale_config + name=name, + os=os, + arch=arch, + managed=managed, + client_id=client_id, + autoscale_config=autoscale_config, ), ) @@ -1065,12 +1079,7 @@ def create_autoscale( spot_instances: bool = False, ) -> models.Pool: if image is None: - if os == enums.OS.linux: - image = DEFAULT_LINUX_IMAGE - elif os == enums.OS.windows: - image = DEFAULT_WINDOWS_IMAGE - else: - raise NotImplementedError + image = _get_default_image(os) autoscale_config = models.AutoScaleConfig( image=image, @@ -1080,7 +1089,14 @@ def create_autoscale( spot_instances=spot_instances, vm_sku=vm_sku, ) - return self.create(name, os, client_id, unmanaged=False, arch=arch, autoscale_config=autoscale_config) + return self.create( + name, + os, + client_id, + unmanaged=False, + arch=arch, + autoscale_config=autoscale_config, + ) def get_config(self, pool_name: str) -> models.AgentConfig: """ Get the agent configuration for the pool """ @@ -1291,12 +1307,7 @@ def create( if image is None: pool = self.onefuzz.pools.get(pool_name) - if pool.os == enums.OS.linux: - image = DEFAULT_LINUX_IMAGE - elif pool.os == enums.OS.windows: - image = DEFAULT_WINDOWS_IMAGE - else: - raise NotImplementedError + image = _get_default_image(pool.os) return self._req_model( "POST", From 1314aea94fec8786a3f8a899fdf1d84109c518e1 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 11:35:46 -0500 Subject: [PATCH 06/66] lint --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 11b3135fde..cc9a8026d8 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -6,8 +6,8 @@ # NOTE: Set ONEFUZZ_SCALESET_MAX_SIZE environment variable to artificially set # the maximum size of a scaleset for testing. 
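With the CLI plumbing added in the two patches above (create_autoscale and _get_default_image), an autoscaling pool can be requested in a single call. A hedged example against the create_autoscale signature added above; the pool name and sizes are invented, and an authenticated client is assumed:

    from onefuzz.api import Onefuzz
    from onefuzztypes.enums import OS

    o = Onefuzz()
    pool = o.pools.create_autoscale(
        "fuzzing-pool",       # illustrative name
        OS.linux,
        min_size=0,
        max_size=20,
        spot_instances=True,  # defaults to False
    )

Leaving image unset selects DEFAULT_LINUX_IMAGE or DEFAULT_WINDOWS_IMAGE according to the OS, which is exactly the branch _get_default_image centralizes.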
-import os import logging +import os from typing import List from onefuzztypes.enums import ScalesetState @@ -50,7 +50,8 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: if scaleset.size < scaleset_max_size: scaleset_to_add = min(to_add, scaleset_max_size - scaleset.size) logging.info( - "autoscale adding to scaleset: pool:%s scaleset:%s existing_size:%d adding:%d", + "autoscale adding to scaleset: " + "pool:%s scaleset:%s existing_size:%d adding:%d", pool.name, scaleset.scaleset_id, scaleset.size, From 3b2f7d668b77f19f9b7e74cb389014375adc11b7 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 12:53:03 -0500 Subject: [PATCH 07/66] use proper arg name --- src/cli/onefuzz/api.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index f7e6d2fdb2..e5ef5b1d55 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -1041,28 +1041,27 @@ def create( *, unmanaged: bool = False, arch: enums.Architecture = enums.Architecture.x86_64, - autoscale_config: Optional[models.AutoScaleConfig] = None, + autoscale: Optional[models.AutoScaleConfig] = None, ) -> models.Pool: """ Create a worker pool :param str name: Name of the worker-pool """ - self.logger.debug("create worker pool") + self.logger.debug("create worker pool: %s", autoscale) managed = not unmanaged - return self._req_model( - "POST", - models.Pool, - data=requests.PoolCreate( - name=name, - os=os, - arch=arch, - managed=managed, - client_id=client_id, - autoscale_config=autoscale_config, - ), + data = requests.PoolCreate( + name=name, + os=os, + arch=arch, + managed=managed, + client_id=client_id, + autoscale=autoscale, ) + self.logger.debug("pool request: %s", data.json()) + + return self._req_model("POST", models.Pool, data=data) def create_autoscale( self, @@ -1081,7 +1080,7 @@ def create_autoscale( if image is None: image = _get_default_image(os) - autoscale_config = models.AutoScaleConfig( + autoscale = models.AutoScaleConfig( image=image, max_size=max_size, min_size=min_size, @@ -1095,7 +1094,7 @@ def create_autoscale( client_id, unmanaged=False, arch=arch, - autoscale_config=autoscale_config, + autoscale=autoscale, ) def get_config(self, pool_name: str) -> models.AgentConfig: From ac00f223ecb2f1ec0bc8742e0b2ba5a8efd1ace3 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 13:07:38 -0500 Subject: [PATCH 08/66] use worksets & busy nodes --- .../__app__/onefuzzlib/tasks/main.py | 17 ------- .../__app__/onefuzzlib/workers/autoscale.py | 31 ++++++------ .../__app__/onefuzzlib/workers/pools.py | 9 ++-- src/api-service/tests/test_autoscale.py | 49 +++---------------- src/cli/onefuzz/api.py | 38 +++++++------- src/pytypes/onefuzztypes/enums.py | 4 ++ 6 files changed, 51 insertions(+), 97 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/tasks/main.py b/src/api-service/__app__/onefuzzlib/tasks/main.py index 230c9c2c59..7d0da9d92c 100644 --- a/src/api-service/__app__/onefuzzlib/tasks/main.py +++ b/src/api-service/__app__/onefuzzlib/tasks/main.py @@ -164,23 +164,6 @@ def get_by_task_id(cls, task_id: UUID) -> Union[Error, "Task"]: task = tasks[0] return task - @classmethod - def get_tasks_by_pool_name(cls, pool_name: str) -> List["Task"]: - tasks = cls.search_states(states=TaskState.available()) - if not tasks: - return [] - - pool_tasks = [] - - for task in tasks: - task_pool = task.get_pool() - if not task_pool: - continue - if pool_name == task_pool.name: - 
pool_tasks.append(task) - - return pool_tasks - def mark_stopping(self) -> None: if self.state in [TaskState.stopped, TaskState.stopping]: logging.debug( diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index cc9a8026d8..9770b516c6 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -10,10 +10,9 @@ import os from typing import List -from onefuzztypes.enums import ScalesetState +from onefuzztypes.enums import NodeState, ScalesetState from ..azure.creds import get_base_region -from ..tasks.main import Task from .nodes import Node from .pools import Pool from .scalesets import Scaleset @@ -115,29 +114,30 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: set_shrink_queues(pool, scalesets, to_remove) -def get_tasks_vm_count(tasks: List[Task]) -> int: +def needed_nodes(pool: Pool) -> int: count = 0 - for task in tasks: - if task.config.pool: - count += task.config.pool.count - if task.config.vm: - count += task.config.vm.count + # NOTE: queue peek only returns the first 30 objects. + workset_queue = pool.peek_work_queue() + count += len(workset_queue) + + nodes = Node.search_states(pool_name=pool.name, states=NodeState.in_use()) + count += len(nodes) return count def autoscale_pool(pool: Pool) -> None: - logging.info("autoscale: %s", pool.autoscale) if not pool.autoscale: return + logging.info("autoscale pool. pool:%s config:%s", pool.name, pool.autoscale.json()) - # get all the tasks (count not stopped) for the pool - tasks = Task.get_tasks_by_pool_name(pool.name) - logging.info("Pool: %s, #Tasks %d", pool.name, len(tasks)) + node_need_estimate = needed_nodes(pool) + logging.info( + "autoscale pool estimate. 
pool:%s estimate:%d", pool.name, node_need_estimate + ) - num_of_tasks = get_tasks_vm_count(tasks) - new_size = max(num_of_tasks, pool.autoscale.min_size) + new_size = max(node_need_estimate, pool.autoscale.min_size) if pool.autoscale.max_size: new_size = min(new_size, pool.autoscale.max_size) @@ -151,7 +151,8 @@ def autoscale_pool(pool: Pool) -> None: ] if modifying: logging.info( - "pool has modifying scalesets, unable to autoscale: %s - %s", + "autoscale - pool has modifying scalesets, " + "unable to autoscale: %s - %s", pool.name, modifying, ) diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index b2e8f91f5e..012de8a5fd 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -99,6 +99,11 @@ def populate_scaleset_summary(self) -> None: for x in Scaleset.search_by_pool(self.name) ] + def peek_work_queue(self) -> List[WorkSet]: + return peek_queue( + self.get_pool_queue(), StorageType.corpus, object_type=WorkSet + ) + def populate_work_queue(self) -> None: self.work_queue = [] @@ -107,9 +112,7 @@ def populate_work_queue(self) -> None: if self.state == PoolState.init: return - worksets = peek_queue( - self.get_pool_queue(), StorageType.corpus, object_type=WorkSet - ) + worksets = self.peek_work_queue() for workset in worksets: work_units = [ diff --git a/src/api-service/tests/test_autoscale.py b/src/api-service/tests/test_autoscale.py index 656eba3d42..59cc5cce66 100644 --- a/src/api-service/tests/test_autoscale.py +++ b/src/api-service/tests/test_autoscale.py @@ -7,18 +7,16 @@ from unittest.mock import MagicMock, patch from uuid import UUID -from onefuzztypes.enums import OS, Architecture, ContainerType, TaskType -from onefuzztypes.models import TaskConfig, TaskContainers, TaskDetails, TaskPool -from onefuzztypes.primitives import Container, PoolName +from onefuzztypes.enums import OS, Architecture +from onefuzztypes.primitives import PoolName -from __app__.onefuzzlib.tasks.main import Task -from __app__.onefuzzlib.workers.autoscale import autoscale_pool, get_tasks_vm_count +from __app__.onefuzzlib.workers.autoscale import autoscale_pool from __app__.onefuzzlib.workers.pools import Pool class TestAutoscale(unittest.TestCase): - @patch("__app__.onefuzzlib.tasks.main.Task.get_tasks_by_pool_name") - def test_autoscale_pool(self, mock_get_tasks_by_pool_name: MagicMock) -> None: + @patch("__app__.onefuzzlib.workers.autoscale.needed_nodes") + def test_autoscale_pool(self, needed_nodes: MagicMock) -> None: pool = Pool( name=PoolName("test-pool"), pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), @@ -27,39 +25,4 @@ def test_autoscale_pool(self, mock_get_tasks_by_pool_name: MagicMock) -> None: arch=Architecture.x86_64, ) autoscale_pool(pool=pool) - mock_get_tasks_by_pool_name.assert_not_called() - - @patch("__app__.onefuzzlib.tasks.main.Task.get_pool") - def test_get_vm_count(self, mock_get_pool: MagicMock) -> None: - self.assertEqual(get_tasks_vm_count([]), 0) - - task_config = TaskConfig( - job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - containers=[ - TaskContainers( - type=ContainerType.inputs, name=Container("test-container") - ) - ], - tags={}, - task=TaskDetails( - type=TaskType.libfuzzer_fuzz, - duration=12, - target_exe="fuzz.exe", - target_env={}, - target_options=[], - ), - pool=TaskPool(count=2, pool_name=PoolName("test-pool")), - ) - task = Task( - job_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - os=OS.linux, - config=task_config, 
- ) - mock_get_pool.return_value = Pool( - name=PoolName("test-pool"), - pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"), - os=OS.linux, - managed=False, - arch=Architecture.x86_64, - ) - self.assertEqual(get_tasks_vm_count([task]), 2) + needed_nodes.assert_not_called() diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index e5ef5b1d55..07e889284c 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -1051,17 +1051,18 @@ def create( self.logger.debug("create worker pool: %s", autoscale) managed = not unmanaged - data = requests.PoolCreate( - name=name, - os=os, - arch=arch, - managed=managed, - client_id=client_id, - autoscale=autoscale, + return self._req_model( + "POST", + models.Pool, + data=requests.PoolCreate( + name=name, + os=os, + arch=arch, + managed=managed, + client_id=client_id, + autoscale=autoscale, + ), ) - self.logger.debug("pool request: %s", data.json()) - - return self._req_model("POST", models.Pool, data=data) def create_autoscale( self, @@ -1080,21 +1081,20 @@ def create_autoscale( if image is None: image = _get_default_image(os) - autoscale = models.AutoScaleConfig( - image=image, - max_size=max_size, - min_size=min_size, - region=region, - spot_instances=spot_instances, - vm_sku=vm_sku, - ) return self.create( name, os, client_id, unmanaged=False, arch=arch, - autoscale=autoscale, + autoscale=models.AutoScaleConfig( + image=image, + max_size=max_size, + min_size=min_size, + region=region, + spot_instances=spot_instances, + vm_sku=vm_sku, + ), ) def get_config(self, pool_name: str) -> models.AgentConfig: diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index be0599a9b7..5fe572e819 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -359,6 +359,10 @@ def ready_for_reset(cls) -> List["NodeState"]: # from the agent. 
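        # (editor's note: contrast these reset-eligible states with the
        # in_use() helper added just below -- the autoscaler in this series
        # counts nodes in the in_use states as active demand when sizing
        # pools.)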
return [cls.done, cls.shutdown, cls.halt] + @classmethod + def in_use(cls) -> List["NodeState"]: + return [cls.setting_up, cls.rebooting, cls.ready, cls.busy, cls.done] + class GithubIssueState(Enum): open = "open" From e5ef6f7a8c7cd1abc3d2812e434b3f8bf5d7927f Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 13:30:02 -0500 Subject: [PATCH 09/66] lint --- src/api-service/__app__/onefuzzlib/tasks/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/tasks/main.py b/src/api-service/__app__/onefuzzlib/tasks/main.py index 039353e4a8..7d0da9d92c 100644 --- a/src/api-service/__app__/onefuzzlib/tasks/main.py +++ b/src/api-service/__app__/onefuzzlib/tasks/main.py @@ -18,7 +18,6 @@ from onefuzztypes.models import Error from onefuzztypes.models import Task as BASE_TASK from onefuzztypes.models import TaskConfig, TaskVm, UserInfo -from onefuzztypes.primitives import PoolName from ..azure.image import get_os from ..azure.queue import create_queue, delete_queue From 3972dd8d69952a4822dd30c9bcb43156797e49de Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 14:09:13 -0500 Subject: [PATCH 10/66] clarify states used for autoscale counting --- .../__app__/onefuzzlib/workers/autoscale.py | 12 ++++++------ src/pytypes/onefuzztypes/enums.py | 11 ++++------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 9770b516c6..0c739df782 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -141,20 +141,20 @@ def autoscale_pool(pool: Pool) -> None: if pool.autoscale.max_size: new_size = min(new_size, pool.autoscale.max_size) - # do scaleset logic match with pool - # get all the scalesets for the pool scalesets = Scaleset.search_by_pool(pool.name) current_size = 0 for scaleset in scalesets: - modifying = [ - x.scaleset_id for x in scalesets if x.state in ScalesetState.modifying() + unable_to_autoscale = [ + x.scaleset_id + for x in scalesets + if x.state not in ScalesetState.include_autoscale_count() ] - if modifying: + if unable_to_autoscale: logging.info( "autoscale - pool has modifying scalesets, " "unable to autoscale: %s - %s", pool.name, - modifying, + unable_to_autoscale, ) return current_size += scaleset.size diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index 5fe572e819..8a1dcb1ae8 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -306,13 +306,10 @@ def available(cls) -> List["ScalesetState"]: return [x for x in cls if x not in unavailable] @classmethod - def modifying(cls) -> List["ScalesetState"]: - """ set of states that indicate scaleset is resizing """ - return [ - cls.halt, - cls.init, - cls.setup, - ] + def include_autoscale_count(cls) -> List["ScalesetState"]: + """ set of states that indicate inclusion in autoscale counts """ + unavailable = [cls.halt, cls.init, cls.setup] + return [x for x in cls if x not in unavailable] @classmethod def can_resize(cls) -> List["ScalesetState"]: From 81021df6e4e2551332cb1054f54fd4cf7eb1cba3 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 14:40:38 -0500 Subject: [PATCH 11/66] add an event for updating scaleset size --- docs/webhook_events.md | 72 ++++++++++++++++++- .../__app__/onefuzzlib/workers/autoscale.py | 10 +-- .../__app__/onefuzzlib/workers/scalesets.py | 12 ++++ 
src/pytypes/extra/generate-docs.py | 6 +- src/pytypes/onefuzztypes/events.py | 9 +++ src/pytypes/onefuzztypes/models.py | 10 +-- 6 files changed, 106 insertions(+), 13 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 13fdeb6157..9377a9b505 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -37,6 +37,7 @@ Each event will be submitted via HTTP POST to the user provided URL. * [scaleset_created](#scaleset_created) * [scaleset_deleted](#scaleset_deleted) * [scaleset_failed](#scaleset_failed) +* [scaleset_size_changed](#scaleset_size_changed) * [task_created](#task_created) * [task_failed](#task_failed) * [task_state_updated](#task_state_updated) @@ -639,7 +640,6 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "integer" }, "min_size": { - "default": 0, "title": "Min Size", "type": "integer" }, @@ -1071,6 +1071,47 @@ Each event will be submitted via HTTP POST to the user provided URL. } ``` +### scaleset_size_changed + +#### Example + +```json +{ + "pool_name": "example", + "scaleset_id": "00000000-0000-0000-0000-000000000000", + "size": 0 +} +``` + +#### Schema + +```json +{ + "properties": { + "pool_name": { + "title": "Pool Name", + "type": "string" + }, + "scaleset_id": { + "format": "uuid", + "title": "Scaleset Id", + "type": "string" + }, + "size": { + "title": "Size", + "type": "integer" + } + }, + "required": [ + "scaleset_id", + "pool_name", + "size" + ], + "title": "EventScalesetSizeChanged", + "type": "object" +} +``` + ### task_created #### Example @@ -1785,7 +1826,6 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "integer" }, "min_size": { - "default": 0, "title": "Min Size", "type": "integer" }, @@ -2251,6 +2291,30 @@ Each event will be submitted via HTTP POST to the user provided URL. "title": "EventScalesetFailed", "type": "object" }, + "EventScalesetSizeChanged": { + "properties": { + "pool_name": { + "title": "Pool Name", + "type": "string" + }, + "scaleset_id": { + "format": "uuid", + "title": "Scaleset Id", + "type": "string" + }, + "size": { + "title": "Size", + "type": "integer" + } + }, + "required": [ + "scaleset_id", + "pool_name", + "size" + ], + "title": "EventScalesetSizeChanged", + "type": "object" + }, "EventTaskCreated": { "additionalProperties": false, "properties": { @@ -2378,6 +2442,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "scaleset_created", "scaleset_deleted", "scaleset_failed", + "scaleset_size_changed", "task_created", "task_failed", "task_state_updated", @@ -2906,6 +2971,9 @@ Each event will be submitted via HTTP POST to the user provided URL. { "$ref": "#/definitions/EventScalesetDeleted" }, + { + "$ref": "#/definitions/EventScalesetSizeChanged" + }, { "$ref": "#/definitions/EventTaskFailed" }, diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 0c739df782..c33e54f447 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -56,9 +56,7 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: scaleset.size, scaleset_to_add, ) - scaleset.size += scaleset_to_add - scaleset.state = ScalesetState.resize - scaleset.save() + scaleset.set_new_size(scaleset.size + scaleset_to_add) to_add -= scaleset_to_add region = config.region or get_base_region() @@ -137,8 +135,10 @@ def autoscale_pool(pool: Pool) -> None: "autoscale pool estimate. 
pool:%s estimate:%d", pool.name, node_need_estimate ) - new_size = max(node_need_estimate, pool.autoscale.min_size) - if pool.autoscale.max_size: + new_size = node_need_estimate + if pool.autoscale.min_size is not None: + new_size = max(node_need_estimate, pool.autoscale.min_size) + if pool.autoscale.max_size is not None: new_size = min(new_size, pool.autoscale.max_size) scalesets = Scaleset.search_by_pool(pool.name) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 411481e320..7d68e7377e 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -13,6 +13,7 @@ EventScalesetCreated, EventScalesetDeleted, EventScalesetFailed, + EventScalesetSizeChanged, ) from onefuzztypes.models import Error from onefuzztypes.models import Scaleset as BASE_SCALESET @@ -578,6 +579,17 @@ def update_configs(self) -> None: self.scaleset_id, ) + def set_new_size(self, size: int) -> None: + self.size = size + self.state = ScalesetState.resize + self.save() + + send_event( + EventScalesetSizeChanged( + scaleset_id=self.scaleset_id, pool_name=self.pool_name, size=size + ) + ) + @classmethod def key_fields(cls) -> Tuple[str, str]: return ("pool_name", "scaleset_id") diff --git a/src/pytypes/extra/generate-docs.py b/src/pytypes/extra/generate-docs.py index c92b2f1401..06ef4486a7 100755 --- a/src/pytypes/extra/generate-docs.py +++ b/src/pytypes/extra/generate-docs.py @@ -5,7 +5,7 @@ from typing import Optional from uuid import UUID -from onefuzztypes.primitives import Region, Container +from onefuzztypes.primitives import Region, Container, PoolName from onefuzztypes.enums import ( TaskType, ContainerType, @@ -41,6 +41,7 @@ EventScalesetCreated, EventScalesetFailed, EventScalesetDeleted, + EventScalesetSizeChanged, EventJobCreated, EventJobStopped, EventTaskStateUpdated, @@ -188,6 +189,9 @@ def main(): ), ), EventFileAdded(container=Container("container-name"), filename="example.txt"), + EventScalesetSizeChanged( + scaleset_id=UUID(int=0), pool_name=PoolName("example"), size=0 + ), ] for event in Event.__args__: diff --git a/src/pytypes/onefuzztypes/events.py b/src/pytypes/onefuzztypes/events.py index fb6207fb67..a61776ecd7 100644 --- a/src/pytypes/onefuzztypes/events.py +++ b/src/pytypes/onefuzztypes/events.py @@ -84,6 +84,12 @@ class EventScalesetDeleted(BaseEvent): pool_name: PoolName +class EventScalesetSizeChanged(BaseModel): + scaleset_id: UUID + pool_name: PoolName + size: int + + class EventPoolDeleted(BaseEvent): pool_name: PoolName @@ -154,6 +160,7 @@ class EventFileAdded(BaseEvent): EventScalesetFailed, EventScalesetCreated, EventScalesetDeleted, + EventScalesetSizeChanged, EventTaskFailed, EventTaskStateUpdated, EventTaskCreated, @@ -178,6 +185,7 @@ class EventType(Enum): scaleset_created = "scaleset_created" scaleset_deleted = "scaleset_deleted" scaleset_failed = "scaleset_failed" + scaleset_size_changed = "scaleset_size_changed" task_created = "task_created" task_failed = "task_failed" task_state_updated = "task_state_updated" @@ -201,6 +209,7 @@ class EventType(Enum): EventType.scaleset_created: EventScalesetCreated, EventType.scaleset_deleted: EventScalesetDeleted, EventType.scaleset_failed: EventScalesetFailed, + EventType.scaleset_size_changed: EventScalesetSizeChanged, EventType.task_created: EventTaskCreated, EventType.task_failed: EventTaskFailed, EventType.task_state_updated: EventTaskStateUpdated, diff --git 
a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index f54a8a2a5a..9d5a987ef9 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -552,8 +552,8 @@ class NodeTasks(BaseModel): class AutoScaleConfig(BaseModel): image: str - max_size: Optional[int] # max size of pool - min_size: int = Field(default=0) # min size of pool + max_size: Optional[int] + min_size: Optional[int] region: Optional[Region] spot_instances: bool = Field(default=False) vm_sku: str @@ -570,13 +570,13 @@ def check_data(cls, values: Any) -> Any: @validator("max_size", allow_reuse=True) def check_max_size(cls, value: Optional[int]) -> Optional[int]: - if value and value < 1: + if value is not None and (value < 1 or value > 1000): raise ValueError("Autoscale sizes are not defined properly") return value @validator("min_size", allow_reuse=True) - def check_min_size(cls, value: int) -> int: - if value < 0 or value > 1000: + def check_min_size(cls, value: Optional[int]) -> Optional[int]: + if value is not None and (value < 0 or value > 1000): raise ValueError("Invalid pool min_size") return value From e0fbe869a0dbc80015a12d01377a7ad83ba06fdb Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 14:55:50 -0500 Subject: [PATCH 12/66] use set_new_size helper, which triggers event callbacks --- src/api-service/__app__/scaleset/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/api-service/__app__/scaleset/__init__.py b/src/api-service/__app__/scaleset/__init__.py index b62175ed9a..ee7d2abf51 100644 --- a/src/api-service/__app__/scaleset/__init__.py +++ b/src/api-service/__app__/scaleset/__init__.py @@ -137,10 +137,8 @@ def patch(req: func.HttpRequest) -> func.HttpResponse: ) if request.size is not None: - scaleset.size = request.size - scaleset.state = ScalesetState.resize + scaleset.set_new_size(request.size) - scaleset.save() scaleset.auth = None return ok(scaleset) From e3b081e325cb08aaaee73f2bf2c1fc5f00a550b3 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 14:58:21 -0500 Subject: [PATCH 13/66] ensure scaleset size is within max size boundaries --- src/api-service/__app__/onefuzzlib/workers/scalesets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 7d68e7377e..86c923424e 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -580,13 +580,14 @@ def update_configs(self) -> None: ) def set_new_size(self, size: int) -> None: - self.size = size + # ensure we always stay within max_size boundaries + self.size = min(size, self.max_size()) self.state = ScalesetState.resize self.save() send_event( EventScalesetSizeChanged( - scaleset_id=self.scaleset_id, pool_name=self.pool_name, size=size + scaleset_id=self.scaleset_id, pool_name=self.pool_name, size=self.size ) ) From 8f49be1d6c8b8feaa82c81476d793ce355a41a11 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 15:31:05 -0500 Subject: [PATCH 14/66] log pool.name rather than full pool info --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index c33e54f447..8fbbb3ee3a 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ 
b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -29,7 +29,7 @@ def set_shrink_queues(pool: Pool, scalesets: List[Scaleset], size: int) -> None: def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: logging.info( "autoscale up - pool:%s to_add:%d scalesets:%s", - pool, + pool.name, to_add, [x.scaleset_id for x in scalesets], ) @@ -104,7 +104,7 @@ def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None: def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: logging.info( "autoscale down - pool:%s to_remove:%d scalesets:%s", - pool, + pool.name, to_remove, [x.scaleset_id for x in scalesets], ) From e850b27a285a81f3fea0be6d89098d1227ece00f Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 15:31:18 -0500 Subject: [PATCH 15/66] reset size as we go based on ground truth for autoscaling scalesets --- src/api-service/__app__/onefuzzlib/workers/scalesets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 86c923424e..ff73532333 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -344,6 +344,11 @@ def cleanup_nodes(self) -> bool: except UnableToUpdate: logging.info("scaleset update already in progress: %s", self.scaleset_id) + if pool.autoscale and self.state == ScalesetState.running: + ground_truth_size = get_vmss_size(self.scaleset_id) + if ground_truth_size is not None and ground_truth_size != self.size: + self.set_new_size(ground_truth_size) + return bool(to_reimage) or bool(to_delete) From f3b66a89f6b2364d794376d45da5a1d5544d833e Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 16:50:29 -0500 Subject: [PATCH 16/66] use synthetic worksets to trigger nodes to recycle --- .../__app__/onefuzzlib/workers/autoscale.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 8fbbb3ee3a..d109019907 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -11,8 +11,12 @@ from typing import List from onefuzztypes.enums import NodeState, ScalesetState +from onefuzztypes.models import WorkSet +from onefuzztypes.primitives import Container +from ..azure.containers import get_container_sas_url from ..azure.creds import get_base_region +from ..azure.storage import StorageType from .nodes import Node from .pools import Pool from .scalesets import Scaleset @@ -111,6 +115,25 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: set_shrink_queues(pool, scalesets, to_remove) + # TODO: this injects synthetic WorkSet entries into the pool queue to + # trigger the nodes to reset faster + # + # This synthetic WorkSet uses the `tools` container as the workset setup + # container. + # + # This should be revisited.
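+    # (editor's note, an assumption based on this series rather than part
+    # of the patch itself: a node that dequeues a WorkSet with an empty
+    # work_units list performs setup, finds nothing to run, and cycles back
+    # through the free/done states, where the shrink queues set above can
+    # claim and decommission it.)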
+ if to_remove: + container_sas = get_container_sas_url( + Container("tools"), StorageType.config, read=True, list=True + ) + + workset = WorkSet( + reboot=False, script=False, work_units=[], setup_url=container_sas + ) + + for _ in range(to_remove): + pool.schedule_workset(workset) + def needed_nodes(pool: Pool) -> int: count = 0 From 7596cee6c2355a0c42accbc1c89b4f6a97985aa1 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 17:48:05 -0500 Subject: [PATCH 17/66] simplify logging --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index d109019907..7878f8b720 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -151,12 +151,8 @@ def needed_nodes(pool: Pool) -> int: def autoscale_pool(pool: Pool) -> None: if not pool.autoscale: return - logging.info("autoscale pool. pool:%s config:%s", pool.name, pool.autoscale.json()) node_need_estimate = needed_nodes(pool) - logging.info( - "autoscale pool estimate. pool:%s estimate:%d", pool.name, node_need_estimate - ) new_size = node_need_estimate if pool.autoscale.min_size is not None: @@ -183,7 +179,7 @@ def autoscale_pool(pool: Pool) -> None: current_size += scaleset.size logging.info( - "autoscale pool %s - current_size: %d new_size: %d", + "autoscale pool:%s current_size: %d new_size: %d", pool.name, current_size, new_size, From 669110eeaf4ca7198d26ef59d29b1658c3445aad Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 17:48:34 -0500 Subject: [PATCH 18/66] fix min/max checks --- src/pytypes/onefuzztypes/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index 9d5a987ef9..5dee08b71f 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -561,8 +561,8 @@ class AutoScaleConfig(BaseModel): @root_validator() def check_data(cls, values: Any) -> Any: if ( - "max_size" in values - and values.get("max_size") + values.get("max_size") is not None + and values.get("min_size") is not None and values.get("min_size") > values.get("max_size") ): raise ValueError("The pool min_size is greater than max_size") From ab523c0331080a360e0e8ee4d6e6c7fc0d275861 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Fri, 5 Feb 2021 18:10:59 -0500 Subject: [PATCH 19/66] rename based on reality --- docs/webhook_events.md | 14 +++++++------- .../__app__/onefuzzlib/workers/scalesets.py | 4 ++-- src/pytypes/extra/generate-docs.py | 4 ++-- src/pytypes/onefuzztypes/events.py | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 9377a9b505..0ac0760f64 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -37,7 +37,7 @@ Each event will be submitted via HTTP POST to the user provided URL. * [scaleset_created](#scaleset_created) * [scaleset_deleted](#scaleset_deleted) * [scaleset_failed](#scaleset_failed) -* [scaleset_size_changed](#scaleset_size_changed) +* [scaleset_resize_scheduled](#scaleset_resize_scheduled) * [task_created](#task_created) * [task_failed](#task_failed) * [task_state_updated](#task_state_updated) @@ -1071,7 +1071,7 @@ Each event will be submitted via HTTP POST to the user provided URL. 
} ``` -### scaleset_size_changed +### scaleset_resize_scheduled #### Example ```json { "pool_name": "example", "scaleset_id": "00000000-0000-0000-0000-000000000000", "size": 0 } ``` @@ -1107,7 +1107,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "pool_name", "size" ], - "title": "EventScalesetSizeChanged", + "title": "EventScalesetResizeScheduled", "type": "object" } ``` @@ -2291,7 +2291,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "title": "EventScalesetFailed", "type": "object" }, - "EventScalesetSizeChanged": { + "EventScalesetResizeScheduled": { "properties": { "pool_name": { "title": "Pool Name", "type": "string" }, "scaleset_id": { "format": "uuid", "title": "Scaleset Id", "type": "string" }, "size": { "title": "Size", "type": "integer" } }, "required": [ "scaleset_id", "pool_name", "size" ], - "title": "EventScalesetSizeChanged", + "title": "EventScalesetResizeScheduled", "type": "object" }, "EventTaskCreated": { @@ -2442,7 +2442,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "scaleset_created", "scaleset_deleted", "scaleset_failed", - "scaleset_size_changed", + "scaleset_resize_scheduled", "task_created", "task_failed", "task_state_updated", @@ -2972,7 +2972,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "$ref": "#/definitions/EventScalesetDeleted" }, { - "$ref": "#/definitions/EventScalesetSizeChanged" + "$ref": "#/definitions/EventScalesetResizeScheduled" }, { "$ref": "#/definitions/EventTaskFailed" diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index ff73532333..c53ce78636 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -13,7 +13,7 @@ EventScalesetCreated, EventScalesetDeleted, EventScalesetFailed, - EventScalesetSizeChanged, + EventScalesetResizeScheduled, ) from onefuzztypes.models import Error from onefuzztypes.models import Scaleset as BASE_SCALESET @@ -591,7 +591,7 @@ def set_new_size(self, size: int) -> None: self.save() send_event( - EventScalesetSizeChanged( + EventScalesetResizeScheduled( scaleset_id=self.scaleset_id, pool_name=self.pool_name, size=self.size ) ) diff --git a/src/pytypes/extra/generate-docs.py b/src/pytypes/extra/generate-docs.py index 06ef4486a7..affaa2e2d5 100755 --- a/src/pytypes/extra/generate-docs.py +++ b/src/pytypes/extra/generate-docs.py @@ -41,7 +41,7 @@ EventScalesetCreated, EventScalesetFailed, EventScalesetDeleted, - EventScalesetSizeChanged, + EventScalesetResizeScheduled, EventJobCreated, EventJobStopped, EventTaskStateUpdated, @@ -189,7 +189,7 @@ def main(): ), ), EventFileAdded(container=Container("container-name"), filename="example.txt"), - EventScalesetSizeChanged( + EventScalesetResizeScheduled( scaleset_id=UUID(int=0), pool_name=PoolName("example"), size=0 ), ] diff --git a/src/pytypes/onefuzztypes/events.py b/src/pytypes/onefuzztypes/events.py index a61776ecd7..63eeb75015 100644 --- a/src/pytypes/onefuzztypes/events.py +++ b/src/pytypes/onefuzztypes/events.py @@ -84,7 +84,7 @@ class EventScalesetDeleted(BaseEvent): pool_name: PoolName -class EventScalesetSizeChanged(BaseEvent): +class EventScalesetResizeScheduled(BaseEvent): scaleset_id: UUID pool_name: PoolName size: int @@ -160,7 +160,7 @@ class EventFileAdded(BaseEvent): EventScalesetFailed, EventScalesetCreated, EventScalesetDeleted, - EventScalesetSizeChanged, + EventScalesetResizeScheduled, EventTaskFailed, EventTaskStateUpdated, EventTaskCreated, @@ -185,7 +185,7 @@ class EventType(Enum): scaleset_created = "scaleset_created" scaleset_deleted = "scaleset_deleted" scaleset_failed =
"scaleset_failed" - scaleset_size_changed = "scaleset_size_changed" + scaleset_resize_scheduled = "scaleset_resize_scheduled" task_created = "task_created" task_failed = "task_failed" task_state_updated = "task_state_updated" @@ -209,7 +209,7 @@ class EventType(Enum): EventType.scaleset_created: EventScalesetCreated, EventType.scaleset_deleted: EventScalesetDeleted, EventType.scaleset_failed: EventScalesetFailed, - EventType.scaleset_size_changed: EventScalesetSizeChanged, + EventType.scaleset_resize_scheduled: EventScalesetResizeScheduled, EventType.task_created: EventTaskCreated, EventType.task_failed: EventTaskFailed, EventType.task_state_updated: EventTaskStateUpdated, From 6f5c1e8b3250b90eba23aa14e6d1ccd6ab28cdaf Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 10:13:05 -0500 Subject: [PATCH 20/66] continued simplifying autoscale --- .../__app__/onefuzzlib/workers/scalesets.py | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index c53ce78636..8e8541722d 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -351,37 +351,6 @@ def cleanup_nodes(self) -> bool: return bool(to_reimage) or bool(to_delete) - def _resize_equal(self) -> None: - # NOTE: this is the only place we reset to the 'running' state. - # This ensures that our idea of scaleset size agrees with Azure - node_count = len(Node.search_states(scaleset_id=self.scaleset_id)) - if node_count == self.size: - logging.info("resize finished: %s", self.scaleset_id) - self.state = ScalesetState.running - self.save() - return - else: - logging.info( - "resize is finished, waiting for nodes to check in: " - "%s (%d of %d nodes checked in)", - self.scaleset_id, - node_count, - self.size, - ) - return - - def _resize_grow(self) -> None: - try: - resize_vmss(self.scaleset_id, self.size) - except UnableToUpdate: - logging.info("scaleset is mid-operation already") - return - - def _resize_shrink(self, to_remove: int) -> None: - queue = ShrinkQueue(self.scaleset_id) - for _ in range(to_remove): - queue.add_entry() - def resize(self) -> None: # no longer needing to resize if self.state != ScalesetState.resize: @@ -396,17 +365,38 @@ def resize(self) -> None: self.size = min(self.size, self.max_size()) # Treat Azure knowledge of the size of the scaleset as "ground truth" - size = get_vmss_size(self.scaleset_id) - if size is None: - logging.info("scaleset is unavailable: %s", self.scaleset_id) + ground_truth_size = get_vmss_size(self.scaleset_id) + if ground_truth_size is None: + logging.info("scaleset is unavailable during resize: %s", self.scaleset_id) return - if size == self.size: - self._resize_equal() - elif self.size > size: - self._resize_grow() + if ground_truth_size < self.size: + logging.info( + "scaleset resize - growing. scaleset:%s new_size:%d azure_size:%d", + self.scaleset_id, + self.size, + ground_truth_size, + ) + try: + resize_vmss(self.scaleset_id, self.size) + except UnableToUpdate: + logging.info( + "scaleset resize - unable to update, mid-operation already" + ) + return + elif ground_truth_size > self.size: + to_remove = ground_truth_size - self.size + logging.info( + "scaleset resize - shrinking. 
scaleset:%s removing:%d", + self.scaleset_id, + to_remove, + ) + ShrinkQueue(self.scaleset_id).set_size(to_remove) else: - self._resize_shrink(size - self.size) + logging.info("scaleset resize - no change. scaleset:%s", self.scaleset_id) + + self.state = ScalesetState.running + self.save() def delete_nodes(self, nodes: List[Node]) -> None: if not nodes: From 12377bd80624859785fbb997f7de575c953e03ec Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 17:46:20 -0500 Subject: [PATCH 21/66] continued dev --- .../__app__/onefuzzlib/workers/autoscale.py | 20 ++++++++++--------- .../__app__/onefuzzlib/workers/scalesets.py | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 7878f8b720..bf70bf40dd 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -8,7 +8,7 @@ import logging import os -from typing import List +from typing import List, Tuple from onefuzztypes.enums import NodeState, ScalesetState from onefuzztypes.models import WorkSet @@ -135,24 +135,23 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: pool.schedule_workset(workset) -def needed_nodes(pool: Pool) -> int: - count = 0 - +def needed_nodes(pool: Pool) -> Tuple[int, int]: # NOTE: queue peek only returns the first 30 objects. workset_queue = pool.peek_work_queue() - count += len(workset_queue) + scheduled_worksets = len(workset_queue) nodes = Node.search_states(pool_name=pool.name, states=NodeState.in_use()) - count += len(nodes) + from_nodes = len(nodes) - return count + return (scheduled_worksets, from_nodes) def autoscale_pool(pool: Pool) -> None: if not pool.autoscale: return - node_need_estimate = needed_nodes(pool) + scheduled_worksets, in_use_nodes = needed_nodes(pool) + node_need_estimate = scheduled_worksets + in_use_nodes new_size = node_need_estimate if pool.autoscale.min_size is not None: @@ -179,10 +178,13 @@ def autoscale_pool(pool: Pool) -> None: current_size += scaleset.size logging.info( - "autoscale pool:%s current_size: %d new_size: %d", + "autoscale pool:%s current_size: %d new_size: %d " + "(in-use nodes: %d, scheduled worksets: %d)", pool.name, current_size, new_size, + in_use_nodes, + scheduled_worksets, ) if new_size > current_size: diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 50e0700ef9..60741938c7 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -328,6 +328,7 @@ def cleanup_nodes(self) -> bool: # Note, using `new=True` makes it such that if a node already has # checked in, this won't overwrite it. 
Node.create( + pool_id=pool.pool_id, pool_name=self.pool_name, machine_id=machine_id, scaleset_id=self.scaleset_id, From 3d8edb1fb6fd0f50abaab5a6e2b082c379a8de84 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 17:54:48 -0500 Subject: [PATCH 22/66] unify log messages --- .../__app__/onefuzzlib/workers/autoscale.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index bf70bf40dd..a8c97ea0c0 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -22,6 +22,8 @@ from .scalesets import Scaleset from .shrink_queue import ShrinkQueue +AUTOSCALE_LOG_PREFIX = "autoscale: " + def set_shrink_queues(pool: Pool, scalesets: List[Scaleset], size: int) -> None: for scaleset in scalesets: @@ -32,7 +34,7 @@ def set_shrink_queues(pool: Pool, scalesets: List[Scaleset], size: int) -> None: def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: logging.info( - "autoscale up - pool:%s to_add:%d scalesets:%s", + AUTOSCALE_LOG_PREFIX + "scale up - pool:%s to_add:%d scalesets:%s", pool.name, to_add, [x.scaleset_id for x in scalesets], @@ -53,7 +55,7 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: if scaleset.size < scaleset_max_size: scaleset_to_add = min(to_add, scaleset_max_size - scaleset.size) logging.info( - "autoscale adding to scaleset: " + AUTOSCALE_LOG_PREFIX + "adding to scaleset: " "pool:%s scaleset:%s existing_size:%d adding:%d", pool.name, scaleset.scaleset_id, @@ -73,7 +75,9 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: while to_add > 0: scaleset_size = min(base_size, to_add) logging.info( - "autoscale adding scaleset. pool:%s size:%s", pool.name, scaleset_size + AUTOSCALE_LOG_PREFIX + "adding scaleset. pool:%s size:%s", + pool.name, + scaleset_size, ) scaleset = Scaleset.create( pool_name=pool.name, @@ -84,7 +88,11 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: spot_instances=config.spot_instances, tags={"pool": pool.name}, ) - logging.info("autoscale added scaleset:%s", scaleset.scaleset_id) + logging.info( + AUTOSCALE_LOG_PREFIX + "added pool:%s scaleset:%s", + pool.name, + scaleset.scaleset_id, + ) to_add -= scaleset_size @@ -98,7 +106,7 @@ def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None: and scaleset.state not in ScalesetState.needs_work() ): logging.info( - "autoscale halting empty scaleset. pool:%s scaleset:%s", + AUTOSCALE_LOG_PREFIX + "halting empty scaleset. pool:%s scaleset:%s", pool.name, scaleset.scaleset_id, ) @@ -107,7 +115,7 @@ def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None: def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: logging.info( - "autoscale down - pool:%s to_remove:%d scalesets:%s", + AUTOSCALE_LOG_PREFIX + "scaling down - pool:%s to_remove:%d scalesets:%s", pool.name, to_remove, [x.scaleset_id for x in scalesets], @@ -169,8 +177,9 @@ def autoscale_pool(pool: Pool) -> None: ] if unable_to_autoscale: logging.info( - "autoscale - pool has modifying scalesets, " - "unable to autoscale: %s - %s", + AUTOSCALE_LOG_PREFIX + + "unable to autoscale pool due to modifying scalesets. 
" + "pool:%s scalesets:%s", pool.name, unable_to_autoscale, ) @@ -178,7 +187,7 @@ def autoscale_pool(pool: Pool) -> None: current_size += scaleset.size logging.info( - "autoscale pool:%s current_size: %d new_size: %d " + AUTOSCALE_LOG_PREFIX + "status - pool:%s current_size: %d new_size: %d " "(in-use nodes: %d, scheduled worksets: %d)", pool.name, current_size, From a84ab7863e5d4c0208507eeb8a19a8a0e557db5c Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 20:35:44 -0500 Subject: [PATCH 23/66] only count worksets with work --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index a8c97ea0c0..320ca5713b 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -146,7 +146,8 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: def needed_nodes(pool: Pool) -> Tuple[int, int]: # NOTE: queue peek only returns the first 30 objects. workset_queue = pool.peek_work_queue() - scheduled_worksets = len(workset_queue) + # only count worksets with work + scheduled_worksets = len([x for x in workset_queue if x.work_units]) nodes = Node.search_states(pool_name=pool.name, states=NodeState.in_use()) from_nodes = len(nodes) From 097bc57c882cce2c14204016b9458168350c53c8 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 20:36:59 -0500 Subject: [PATCH 24/66] skip worksets without work units in work queue population as well --- src/api-service/__app__/onefuzzlib/workers/pools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index 012de8a5fd..a5bacc9cbd 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -115,6 +115,10 @@ def populate_work_queue(self) -> None: worksets = self.peek_work_queue() for workset in worksets: + # only include work units with real work + if not workset.work_units: + continue + work_units = [ WorkUnitSummary( job_id=work_unit.job_id, From c53ecccf147bfa0fad77b7027d6aaa8c7e8f92a8 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 21:03:39 -0500 Subject: [PATCH 25/66] clear synthetic worksets from the queue --- .../__app__/onefuzzlib/azure/queue.py | 45 ++++++++++++++++++- .../__app__/onefuzzlib/workers/autoscale.py | 20 +++++++++ .../__app__/onefuzzlib/workers/pools.py | 13 +++++- 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py index 2c117c2949..7e699e8b7c 100644 --- a/src/api-service/__app__/onefuzzlib/azure/queue.py +++ b/src/api-service/__app__/onefuzzlib/azure/queue.py @@ -7,7 +7,7 @@ import datetime import json import logging -from typing import List, Optional, Type, TypeVar, Union +from typing import List, Optional, Tuple, Type, TypeVar, Union from uuid import UUID from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError @@ -124,6 +124,23 @@ def send_message( pass +def delete_messages( + name: QueueNameType, storage_type: StorageType, messages: List[str] +) -> None: + queue = get_queue(name, storage_type) + if not queue: + return + + done = [] + for message in messages: + try: + queue.delete_message(message) + 
done.append(message) + except ResourceNotFoundError: + logging.debug("queue message already deleted: %s - %s", name, message) + logging.debug("queue messages deleted: %s", done) + + def remove_first_message(name: QueueNameType, storage_type: StorageType) -> bool: queue = get_queue(name, storage_type) if queue: @@ -169,6 +186,32 @@ def peek_queue( return result +# Peek at a max of 32 messages +# https://docs.microsoft.com/en-us/python/api/azure-storage-queue/azure.storage.queue.queueclient +def peek_queue_with_id( + name: QueueNameType, + storage_type: StorageType, + *, + object_type: Type[A], + max_messages: int = MAX_PEEK_SIZE, +) -> List[Tuple[str, A]]: + result: List[Tuple[str, A]] = [] + + # message count + if max_messages < MIN_PEEK_SIZE or max_messages > MAX_PEEK_SIZE: + raise ValueError("invalid max messages: %s" % max_messages) + + queue = get_queue(name, storage_type) + if not queue: + return result + + for message in queue.peek_messages(max_messages=max_messages): + decoded = base64.b64decode(message.content) + raw = json.loads(decoded) + result.append((message.id, object_type.parse_obj(raw))) + return result + + def queue_object( name: QueueNameType, message: BaseModel, diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 320ca5713b..af2018f681 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -16,6 +16,7 @@ from ..azure.containers import get_container_sas_url from ..azure.creds import get_base_region +from ..azure.queue import delete_messages from ..azure.storage import StorageType from .nodes import Node from .pools import Pool @@ -143,6 +144,23 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: pool.schedule_workset(workset) +def clear_synthetic_worksets(pool: Pool) -> None: + while True: + to_remove = [] + + for (message_id, workset) in pool.peek_work_queue_with_id(): + if not workset.work_units: + to_remove.append(message_id) + + if not to_remove: + break + + logging.info( + AUTOSCALE_LOG_PREFIX + "removing %d synthetic worksets", len(to_remove) + ) + delete_messages(pool.get_pool_queue(), StorageType.corpus, to_remove) + + def needed_nodes(pool: Pool) -> Tuple[int, int]: # NOTE: queue peek only returns the first 30 objects. 
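    # (editor's sketch of the implication, assuming the 32-message cap on
    # Azure Storage peek_messages noted above in queue.py: a backlog larger
    # than the peek window saturates the scheduled_worksets count below, so
    # scale-up converges over several timer invocations rather than in one
    # pass.)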
workset_queue = pool.peek_work_queue() @@ -198,8 +216,10 @@ def autoscale_pool(pool: Pool) -> None: ) if new_size > current_size: + clear_synthetic_worksets(pool) scale_up(pool, scalesets, new_size - current_size) elif current_size > new_size: + clear_synthetic_worksets(pool) scale_down(pool, scalesets, current_size - new_size) shutdown_empty_scalesets(pool, scalesets) else: diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index a5bacc9cbd..134fe7584f 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -20,7 +20,13 @@ ) from onefuzztypes.primitives import PoolName -from ..azure.queue import create_queue, delete_queue, peek_queue, queue_object +from ..azure.queue import ( + create_queue, + delete_queue, + peek_queue, + peek_queue_with_id, + queue_object, +) from ..azure.storage import StorageType from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter @@ -104,6 +110,11 @@ def peek_work_queue(self) -> List[WorkSet]: self.get_pool_queue(), StorageType.corpus, object_type=WorkSet ) + def peek_work_queue_with_id(self) -> List[Tuple[str, WorkSet]]: + return peek_queue_with_id( + self.get_pool_queue(), StorageType.corpus, object_type=WorkSet + ) + def populate_work_queue(self) -> None: self.work_queue = [] From 591a162639b9889cc994d5ec58dba5dcb660563d Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 22:04:03 -0500 Subject: [PATCH 26/66] cleanup synthetic worksets directly --- .../__app__/onefuzzlib/azure/queue.py | 37 +++++-------------- .../__app__/onefuzzlib/workers/autoscale.py | 29 +++++++++------ .../__app__/onefuzzlib/workers/pools.py | 13 +------ 3 files changed, 27 insertions(+), 52 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py index 7e699e8b7c..e8e0a5ba3d 100644 --- a/src/api-service/__app__/onefuzzlib/azure/queue.py +++ b/src/api-service/__app__/onefuzzlib/azure/queue.py @@ -7,11 +7,12 @@ import datetime import json import logging -from typing import List, Optional, Tuple, Type, TypeVar, Union +from typing import List, Optional, Type, TypeVar, Union from uuid import UUID from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError from azure.storage.queue import ( + QueueMessage, QueueSasPermissions, QueueServiceClient, generate_queue_sas, @@ -180,36 +181,16 @@ def peek_queue( return result for message in queue.peek_messages(max_messages=max_messages): - decoded = base64.b64decode(message.content) - raw = json.loads(decoded) - result.append(object_type.parse_obj(raw)) + decoded = decode_message(message, object_type=object_type) + if decoded: + result.append(decoded) return result -# Peek at a max of 32 messages -# https://docs.microsoft.com/en-us/python/api/azure-storage-queue/azure.storage.queue.queueclient -def peek_queue_with_id( - name: QueueNameType, - storage_type: StorageType, - *, - object_type: Type[A], - max_messages: int = MAX_PEEK_SIZE, -) -> List[Tuple[str, A]]: - result: List[Tuple[str, A]] = [] - - # message count - if max_messages < MIN_PEEK_SIZE or max_messages > MAX_PEEK_SIZE: - raise ValueError("invalid max messages: %s" % max_messages) - - queue = get_queue(name, storage_type) - if not queue: - return result - - for message in queue.peek_messages(max_messages=max_messages): - decoded = base64.b64decode(message.content) - raw = json.loads(decoded) - result.append((message.id, 
object_type.parse_obj(raw))) - return result +def decode_message(message: QueueMessage, object_type: Type[A]) -> Optional[A]: + decoded = base64.b64decode(message.content) + raw = json.loads(decoded) + return object_type.parse_obj(raw) def queue_object( diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index af2018f681..785751c2ab 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -16,7 +16,7 @@ from ..azure.containers import get_container_sas_url from ..azure.creds import get_base_region -from ..azure.queue import delete_messages +from ..azure.queue import decode_message, get_queue from ..azure.storage import StorageType from .nodes import Node from .pools import Pool @@ -145,20 +145,25 @@ def scale_down(pool: Pool, scalesets: List[Scaleset], to_remove: int) -> None: def clear_synthetic_worksets(pool: Pool) -> None: - while True: - to_remove = [] + client = get_queue(pool.get_pool_queue(), StorageType.corpus) + if client is None: + return - for (message_id, workset) in pool.peek_work_queue_with_id(): - if not workset.work_units: - to_remove.append(message_id) + deleted = 0 - if not to_remove: - break + for message in client.receive_messages(): + decoded = decode_message(message, WorkSet) + if not decoded: + logging.warning(AUTOSCALE_LOG_PREFIX + "decode workset failed: %s", message) + continue - logging.info( - AUTOSCALE_LOG_PREFIX + "removing %d synthetic worksets", len(to_remove) - ) - delete_messages(pool.get_pool_queue(), StorageType.corpus, to_remove) + if decoded.work_units: + client.update_message(message, visibility_timeout=0) + else: + client.delete_message(message) + deleted += 1 + + logging.info(AUTOSCALE_LOG_PREFIX + "removed %d synthetic worksets", deleted) def needed_nodes(pool: Pool) -> Tuple[int, int]: diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index 134fe7584f..a5bacc9cbd 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -20,13 +20,7 @@ ) from onefuzztypes.primitives import PoolName -from ..azure.queue import ( - create_queue, - delete_queue, - peek_queue, - peek_queue_with_id, - queue_object, -) +from ..azure.queue import create_queue, delete_queue, peek_queue, queue_object from ..azure.storage import StorageType from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter @@ -110,11 +104,6 @@ def peek_work_queue(self) -> List[WorkSet]: self.get_pool_queue(), StorageType.corpus, object_type=WorkSet ) - def peek_work_queue_with_id(self) -> List[Tuple[str, WorkSet]]: - return peek_queue_with_id( - self.get_pool_queue(), StorageType.corpus, object_type=WorkSet - ) - def populate_work_queue(self) -> None: self.work_queue = [] From 7284420bd3381835561a317e934a0d4c75e8c536 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Mon, 8 Feb 2021 22:05:48 -0500 Subject: [PATCH 27/66] added log count to workset cleanup --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 785751c2ab..0be84eb6e0 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -150,6 +150,7 @@ def 
clear_synthetic_worksets(pool: Pool) -> None: return deleted = 0 + ignored = 0 for message in client.receive_messages(): decoded = decode_message(message, WorkSet) @@ -159,11 +160,16 @@ def clear_synthetic_worksets(pool: Pool) -> None: if decoded.work_units: client.update_message(message, visibility_timeout=0) + ignored += 1 else: client.delete_message(message) deleted += 1 - logging.info(AUTOSCALE_LOG_PREFIX + "removed %d synthetic worksets", deleted) + logging.info( + AUTOSCALE_LOG_PREFIX + "cleanup synthetic worksets. ignored:%d deleted:%d", + ignored, + deleted, + ) def needed_nodes(pool: Pool) -> Tuple[int, int]: From bf017df370ff8622e2e6f598f10db17d35ccc2d7 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 09:24:53 -0500 Subject: [PATCH 28/66] drastically reduce logging for cleanup --- .../__app__/onefuzzlib/workers/autoscale.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 0be84eb6e0..8b4856287b 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -165,11 +165,11 @@ def clear_synthetic_worksets(pool: Pool) -> None: client.delete_message(message) deleted += 1 - logging.info( - AUTOSCALE_LOG_PREFIX + "cleanup synthetic worksets. ignored:%d deleted:%d", - ignored, - deleted, - ) + logging.info( + AUTOSCALE_LOG_PREFIX + "cleanup synthetic worksets. ignored:%d deleted:%d", + ignored, + deleted, + ) def needed_nodes(pool: Pool) -> Tuple[int, int]: From a66097960e535d5b10336c82d8b605f9e7e4b58e Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 09:57:59 -0500 Subject: [PATCH 29/66] set SignalR transport --- src/deployment/azuredeploy.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/deployment/azuredeploy.json b/src/deployment/azuredeploy.json index 7a034539b7..ad6f49734d 100644 --- a/src/deployment/azuredeploy.json +++ b/src/deployment/azuredeploy.json @@ -210,6 +210,10 @@ "name": "AzureSignalRConnectionString", "value": "[listkeys(resourceId('Microsoft.SignalRService/SignalR', variables('signalr-name')), '2018-10-01').primaryConnectionString]" }, + { + "name": "AzureSignalRServiceTransportType", + "value": "Transient" + }, { "name": "ONEFUZZ_INSTANCE_NAME", "value": "[parameters('name')]" From 6f7c8cb2ca7de743625dbe0b5f188827a1565d5b Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 11:07:44 -0500 Subject: [PATCH 30/66] use the cached Azure Identity instance --- src/api-service/__app__/onefuzzlib/azure/storage.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/storage.py b/src/api-service/__app__/onefuzzlib/azure/storage.py index f03e1eb6c2..385c74895a 100644 --- a/src/api-service/__app__/onefuzzlib/azure/storage.py +++ b/src/api-service/__app__/onefuzzlib/azure/storage.py @@ -9,12 +9,11 @@ from enum import Enum from typing import List, Tuple, cast -from azure.identity import DefaultAzureCredential from azure.mgmt.storage import StorageManagementClient from memoization import cached from msrestazure.tools import parse_resource_id -from .creds import get_base_resource_group, get_subscription +from .creds import get_base_resource_group, get_subscription, get_identity class StorageType(Enum): @@ -25,7 +24,7 @@ class StorageType(Enum): @cached def get_mgmt_client() -> StorageManagementClient: return StorageManagementClient( - 
credential=DefaultAzureCredential(), subscription_id=get_subscription() + credential=get_identity(), subscription_id=get_subscription() ) From 348fe2d0f9d9d2bc66df06621c7925a3591ab728 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 11:22:39 -0500 Subject: [PATCH 31/66] lint --- src/api-service/__app__/onefuzzlib/azure/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/storage.py b/src/api-service/__app__/onefuzzlib/azure/storage.py index 385c74895a..b2d4149571 100644 --- a/src/api-service/__app__/onefuzzlib/azure/storage.py +++ b/src/api-service/__app__/onefuzzlib/azure/storage.py @@ -13,7 +13,7 @@ from memoization import cached from msrestazure.tools import parse_resource_id -from .creds import get_base_resource_group, get_subscription, get_identity +from .creds import get_base_resource_group, get_identity, get_subscription class StorageType(Enum): From 4e26c88f4be9ea4329a088fa3f69005614fd5198 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 16:34:57 -0500 Subject: [PATCH 32/66] don't readd within receive --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 8b4856287b..1c248c0faf 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -152,6 +152,7 @@ def clear_synthetic_worksets(pool: Pool) -> None: deleted = 0 ignored = 0 + keeping = [] for message in client.receive_messages(): decoded = decode_message(message, WorkSet) if not decoded: @@ -159,12 +160,15 @@ def clear_synthetic_worksets(pool: Pool) -> None: continue if decoded.work_units: - client.update_message(message, visibility_timeout=0) + keeping.append(message) ignored += 1 else: client.delete_message(message) deleted += 1 + for message in keeping: + client.update_message(message, visibility_timeout=0) + logging.info( AUTOSCALE_LOG_PREFIX + "cleanup synthetic worksets. 
ignored:%d deleted:%d", ignored, From 71e8371cbcd382a03fabcc6451cfa6251fa8211a Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Tue, 9 Feb 2021 18:29:02 -0500 Subject: [PATCH 33/66] Fix wait_for_running for templates --- src/cli/onefuzz/templates/__init__.py | 34 +++++++++++++++++---------- src/pytypes/onefuzztypes/enums.py | 4 ++++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/cli/onefuzz/templates/__init__.py b/src/cli/onefuzz/templates/__init__.py index 552187aa5d..3d41f1cbf0 100644 --- a/src/cli/onefuzz/templates/__init__.py +++ b/src/cli/onefuzz/templates/__init__.py @@ -9,8 +9,8 @@ import zipfile from typing import Any, Dict, List, Optional, Tuple -from onefuzztypes.enums import OS, ContainerType, TaskState -from onefuzztypes.models import Job, NotificationConfig +from onefuzztypes.enums import OS, ContainerType, JobState, TaskState +from onefuzztypes.models import Job, NotificationConfig, Task from onefuzztypes.primitives import Container, Directory, File from onefuzz.backend import wait @@ -233,30 +233,38 @@ def wait_on( } self.wait_for_running = wait_for_running - def check_current_job(self) -> Job: - job = self.onefuzz.jobs.get(self.job.job_id) - if job.state in ["stopped", "stopping"]: + def check_current_job(self) -> List[Task]: + self.job = self.onefuzz.jobs.get(self.job.job_id) + if self.job.state in JobState.shutting_down(): raise StoppedEarly("job unexpectedly stopped early") errors = [] + tasks = [] for task in self.onefuzz.tasks.list(job_id=self.job.job_id): - if task.state in ["stopped", "stopping"]: + if task.state in TaskState.shutting_down(): if task.error: errors.append("%s: %s" % (task.config.task.type, task.error)) else: errors.append("%s" % task.config.task.type) + tasks.append(task) if errors: raise StoppedEarly("tasks stopped unexpectedly.\n%s" % "\n".join(errors)) - return job + return tasks def get_waiting(self) -> List[str]: - tasks = self.onefuzz.tasks.list(job_id=self.job.job_id) - waiting = [ - "%s:%s" % (x.config.task.type.name, x.state.name) - for x in tasks - if x.state not in TaskState.has_started() - ] + tasks = self.check_current_job() + + waiting = [] + for task in tasks: + state_msg = task.state.name + if task.state in TaskState.has_started(): + task = self.onefuzz.tasks.get(task.task_id) + if task.events: + continue + state_msg = "waiting-for-heartbeat" + + waiting.append(f"{task.config.task.type.name}:{state_msg}") return waiting def is_running(self) -> Tuple[bool, str, Any]: diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index 96d3765b73..869e0dd1ca 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -108,6 +108,10 @@ def needs_work(cls) -> List["JobState"]: """ return [cls.init, cls.stopping] + @classmethod + def shutting_down(cls) -> List["JobState"]: + return [cls.stopping, cls.stopped] + class TaskState(Enum): init = "init" From a0f184544606be79db6c7ab243ab7ea6b46bfd48 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 24 Mar 2021 12:28:13 -0400 Subject: [PATCH 34/66] lint --- src/pytypes/extra/generate-docs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pytypes/extra/generate-docs.py b/src/pytypes/extra/generate-docs.py index 9af3d8429d..30a98eda45 100755 --- a/src/pytypes/extra/generate-docs.py +++ b/src/pytypes/extra/generate-docs.py @@ -5,7 +5,7 @@ from typing import List, Optional from uuid import UUID -from onefuzztypes.primitives import Region, Container, PoolName + from onefuzztypes.enums import ( OS, 
Architecture, @@ -34,10 +34,8 @@ EventRegressionReported, EventScalesetCreated, EventScalesetDeleted, - EventScalesetResizeScheduled, - EventJobCreated, - EventJobStopped, EventScalesetFailed, + EventScalesetResizeScheduled, EventTaskCreated, EventTaskFailed, EventTaskHeartbeat, From 5ab569693c6be00e4dee385e4af3b1ea4a041ee2 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 26 Apr 2021 11:43:45 -0700 Subject: [PATCH 35/66] small cleanup --- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 5 ++--- src/api-service/__app__/onefuzzlib/workers/nodes.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 49742b4848..549b2f4fa4 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -205,10 +205,9 @@ def autoscale_pool(pool: Pool) -> None: scalesets = Scaleset.search_by_pool(pool.name) current_size = 0 for scaleset in scalesets: + valid_auto_scale_states = ScalesetState.include_autoscale_count() unable_to_autoscale = [ - x.scaleset_id - for x in scalesets - if x.state not in ScalesetState.include_autoscale_count() + x.scaleset_id for x in scalesets if x.state not in valid_auto_scale_states ] if unable_to_autoscale: logging.info( diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 1f7ed920fa..10eec1f318 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -245,11 +245,9 @@ def mark_tasks_stopped_early(self, error: Optional[Error] = None) -> None: def could_shrink_scaleset(self) -> bool: if self.scaleset_id: - if ShrinkQueue(self.scaleset_id).should_shrink(): - return True + return ShrinkQueue(self.scaleset_id).should_shrink() if self.pool_id: - if ShrinkQueue(self.pool_id).should_shrink(): - return True + return ShrinkQueue(self.pool_id).should_shrink() return False From bfabcce0a52d637be1b546ea6bc065f7e12a43b9 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 26 Apr 2021 12:05:04 -0700 Subject: [PATCH 36/66] formatting --- src/pytypes/onefuzztypes/enums.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index fd297afe08..1c633d7938 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -317,7 +317,7 @@ def available(cls) -> List["ScalesetState"]: @classmethod def include_autoscale_count(cls) -> List["ScalesetState"]: - """ set of states that indicate inclusion in autoscale counts """ + """set of states that indicate inclusion in autoscale counts""" unavailable = [cls.halt, cls.init, cls.setup] return [x for x in cls if x not in unavailable] From b15143df67e0d69adf78b21d63d94ebc60afc8bb Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Tue, 27 Apr 2021 07:52:24 -0700 Subject: [PATCH 37/66] undo could_shrink_scaleset changes --- src/api-service/__app__/onefuzzlib/workers/nodes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 3e434fad13..3288d7d416 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -245,9 +245,11 @@ def mark_tasks_stopped_early(self, error: Optional[Error] = None) -> None: def 
could_shrink_scaleset(self) -> bool: if self.scaleset_id: - return ShrinkQueue(self.scaleset_id).should_shrink() + if ShrinkQueue(self.scaleset_id).should_shrink(): + return True if self.pool_id: - return ShrinkQueue(self.pool_id).should_shrink() + if ShrinkQueue(self.pool_id).should_shrink(): + return True return False From ee30880122c02b991377cb39eea2e0ab1cdabad9 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Tue, 27 Apr 2021 09:31:35 -0700 Subject: [PATCH 38/66] formatting --- src/api-service/__app__/onefuzzlib/azure/queue.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py index 5b5c48e6da..9c3d765dac 100644 --- a/src/api-service/__app__/onefuzzlib/azure/queue.py +++ b/src/api-service/__app__/onefuzzlib/azure/queue.py @@ -187,9 +187,9 @@ def peek_queue( result.append(object_type.parse_obj(raw)) except ResourceNotFoundError: return result - + return result - + def decode_message(message: QueueMessage, object_type: Type[A]) -> Optional[A]: decoded = base64.b64decode(message.content) From 51c82a6edb84e5472b74d7df9cb8bfe09103fa35 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 10 May 2021 13:26:01 -0700 Subject: [PATCH 39/66] fix build, remove unused reference to yaque --- src/agent/Cargo.lock | 97 ++---------------------------- src/agent/storage-queue/Cargo.toml | 3 +- 2 files changed, 7 insertions(+), 93 deletions(-) diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index cae8944a16..43fbca4426 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -39,12 +39,6 @@ version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" -[[package]] -name = "anymap" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33954243bd79057c2de7338850b85983a44588021f8a5fee574a8888c6de4344" - [[package]] name = "appinsights" version = "0.1.5" @@ -796,17 +790,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ab7d1bd1bd33cc98b0889831b72da23c0aa4df9cec7e0702f46ecea04b35db6" dependencies = [ "bitflags", - "fsevent-sys 2.0.1", -] - -[[package]] -name = "fsevent" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97f347202c95c98805c216f9e1df210e8ebaec9fdb2365700a43c10797a35e63" -dependencies = [ - "bitflags", - "fsevent-sys 3.0.2", + "fsevent-sys", ] [[package]] @@ -818,15 +802,6 @@ dependencies = [ "libc", ] -[[package]] -name = "fsevent-sys" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a29c77f1ca394c3e73a9a5d24cfcabb734682d9634fc398f2204a63c994120" -dependencies = [ - "libc", -] - [[package]] name = "fuchsia-zircon" version = "0.3.3" @@ -1272,17 +1247,6 @@ dependencies = [ "libc", ] -[[package]] -name = "inotify" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d19f57db1baad9d09e43a3cd76dcf82ebdafd37d75c9498b87762dba77c93f15" -dependencies = [ - "bitflags", - "inotify-sys", - "libc", -] - [[package]] name = "inotify-sys" version = "0.1.5" @@ -1645,9 +1609,9 @@ checksum = "2599080e87c9bd051ddb11b10074f4da7b1223298df65d4c2ec5bcf309af1533" dependencies = [ "bitflags", "filetime", - "fsevent 0.4.0", - "fsevent-sys 2.0.1", - "inotify 0.7.1", + "fsevent", + "fsevent-sys", + "inotify", "libc", "mio 0.6.23", "mio-extras", @@ -1655,25 +1619,6 @@ 
dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "notify" -version = "5.0.0-pre.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ebe7699a0f8c5759450716ee03d231685c22b4fe8f406c42c22e0ad94d40ce7" -dependencies = [ - "anymap", - "bitflags", - "crossbeam-channel 0.5.1", - "filetime", - "fsevent 2.0.2", - "fsevent-sys 3.0.2", - "inotify 0.9.2", - "libc", - "mio 0.7.11", - "walkdir", - "winapi 0.3.9", -] - [[package]] name = "ntapi" version = "0.3.6" @@ -1738,7 +1683,7 @@ dependencies = [ "lazy_static", "log", "nix 0.19.1", - "notify 4.0.16", + "notify", "onefuzz-telemetry", "pete", "proc-maps", @@ -1757,7 +1702,7 @@ dependencies = [ "storage-queue", "strum", "strum_macros", - "sysinfo 0.16.5", + "sysinfo", "tempfile", "tokio 1.5.0", "tokio-stream", @@ -2753,7 +2698,6 @@ dependencies = [ "serde_json", "tokio 1.5.0", "uuid", - "yaque", ] [[package]] @@ -2826,21 +2770,6 @@ dependencies = [ "unicode-xid 0.2.1", ] -[[package]] -name = "sysinfo" -version = "0.14.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2983daff11a197c7c406b130579bc362177aa54cf2cc1f34d6ac88fccaa6a5e1" -dependencies = [ - "cfg-if 0.1.10", - "doc-comment", - "libc", - "ntapi", - "once_cell", - "rayon", - "winapi 0.3.9", -] - [[package]] name = "sysinfo" version = "0.16.5" @@ -3473,20 +3402,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" -[[package]] -name = "yaque" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "543707de19373df21757dc231c46407701d0b05a8067542584ea5c6fa8602725" -dependencies = [ - "futures", - "lazy_static", - "log", - "notify 5.0.0-pre.7", - "rand 0.7.3", - "sysinfo 0.14.15", -] - [[package]] name = "z3-sys" version = "0.6.3" diff --git a/src/agent/storage-queue/Cargo.toml b/src/agent/storage-queue/Cargo.toml index 3a68670ee8..f67850529d 100644 --- a/src/agent/storage-queue/Cargo.toml +++ b/src/agent/storage-queue/Cargo.toml @@ -23,5 +23,4 @@ serde_json = "1.0" serde-xml-rs = "0.4" tokio = { version = "1.5.0" , features=["full"] } queue-file = "1.1" -uuid = { version = "0.8", features = ["serde", "v4"] } -yaque = "0.5.1" +uuid = { version = "0.8", features = ["serde", "v4"] } \ No newline at end of file From 2f7fbc02f09984398e31b2d646c326aac9480324 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 10 May 2021 14:24:19 -0700 Subject: [PATCH 40/66] formatting --- src/api-service/__app__/onefuzzlib/workers/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index e5c63ef923..e135596336 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -31,8 +31,8 @@ from ..azure.vmss import get_instance_id from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter -from .shrink_queue import ShrinkQueue from ..versions import is_minimum_version +from .shrink_queue import ShrinkQueue NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1) NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7) From 37e714fb03c98393f2bd17d38e41bec1b7ba7b07 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 10 May 2021 15:34:03 -0700 Subject: [PATCH 41/66] build fix --- src/agent/onefuzz-agent/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/agent/onefuzz-agent/Cargo.toml b/src/agent/onefuzz-agent/Cargo.toml index 1ce01ea960..766f8ca363 100644 --- a/src/agent/onefuzz-agent/Cargo.toml +++ b/src/agent/onefuzz-agent/Cargo.toml @@ -30,7 +30,7 @@ serde = "1.0" serde_json = "1.0" onefuzz = { path = "../onefuzz" } onefuzz-telemetry = { path = "../onefuzz-telemetry" } -path-absolutize = "3.0.6" +path-absolutize = "3.0.10" reqwest-retry = { path = "../reqwest-retry" } stacktrace-parser = { path = "../stacktrace-parser" } storage-queue = { path = "../storage-queue" } From 9056252d19134dd0eb782c10bde174da57d7c9cc Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 17 May 2021 13:53:07 -0700 Subject: [PATCH 42/66] update integration tests autoscale --- src/integration-tests/integration-test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/integration-tests/integration-test.py b/src/integration-tests/integration-test.py index 1728b7bd8d..ec93857073 100755 --- a/src/integration-tests/integration-test.py +++ b/src/integration-tests/integration-test.py @@ -216,16 +216,15 @@ def setup( self, *, region: Optional[Region] = None, - pool_size: int, os_list: List[OS], ) -> None: self.inject_log(self.start_log_marker) for entry in os_list: name = PoolName(f"testpool-{entry.name}-{self.test_id}") self.logger.info("creating pool: %s:%s", entry.name, name) - self.pools[entry] = self.of.pools.create(name, entry) - self.logger.info("creating scaleset for pool: %s", name) - self.of.scalesets.create(name, pool_size, region=region) + self.pools[entry] = self.of.pools.create_autoscale( + name, entry, region=region + ) def launch( self, path: Directory, *, os_list: List[OS], targets: List[str], duration=int @@ -833,7 +832,6 @@ def launch( samples: Directory, *, endpoint: Optional[str] = None, - pool_size: int = 10, region: Optional[Region] = None, os_list: List[OS] = [OS.linux, OS.windows], targets: List[str] = list(TARGETS.keys()), @@ -846,7 +844,7 @@ def launch( self.onefuzz.__setup__(endpoint=endpoint) tester = TestOnefuzz(self.onefuzz, self.logger, test_id) - tester.setup(region=region, pool_size=pool_size, os_list=os_list) + tester.setup(region=region, os_list=os_list) tester.launch(samples, os_list=os_list, targets=targets, duration=duration) return test_id @@ -865,7 +863,6 @@ def test( samples: Directory, *, endpoint: Optional[str] = None, - pool_size: int = 15, region: Optional[Region] = None, os_list: List[OS] = [OS.linux, OS.windows], targets: List[str] = list(TARGETS.keys()), @@ -880,7 +877,6 @@ def test( self.launch( samples, endpoint=endpoint, - pool_size=pool_size, region=region, os_list=os_list, targets=targets, From 4306753f9449f441d600e15362ebc59779e4d37b Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 17 May 2021 14:09:53 -0700 Subject: [PATCH 43/66] Tests fix --- src/api-service/__app__/onefuzzlib/workers/scalesets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index dbcb6511a4..5371e9eb75 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -6,7 +6,7 @@ import datetime import logging from typing import Any, Dict, List, Optional, Tuple, Union -from uuid import UUID +from uuid import UUID, uuid4 from onefuzztypes.enums import ErrorCode, NodeState, PoolState, ScalesetState from onefuzztypes.events import ( @@ -20,6 +20,7 @@ from onefuzztypes.models 
import Scaleset as BASE_SCALESET from onefuzztypes.models import ScalesetNodeState from onefuzztypes.primitives import PoolName, Region +from pydantic import BaseModel, Field from ..__version__ import __version__ from ..azure.auth import build_auth From 4669e5111813d731f4fa6a18df859d1ec7781204 Mon Sep 17 00:00:00 2001 From: Cheick Keita Date: Mon, 17 May 2021 14:39:30 -0700 Subject: [PATCH 44/66] fix import --- src/api-service/__app__/onefuzzlib/workers/scalesets.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 5371e9eb75..555106009f 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -26,6 +26,14 @@ from ..azure.auth import build_auth from ..azure.image import get_os from ..azure.network import Network +from ..azure.queue import ( + clear_queue, + create_queue, + delete_queue, + queue_object, + remove_first_message, +) +from ..azure.storage import StorageType from ..azure.vmss import ( UnableToUpdate, create_vmss, From 5206ee8c6d5d2810662365e1dc18de76db6e7962 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 15:44:14 -0400 Subject: [PATCH 45/66] merge changes --- src/api-service/__app__/onefuzzlib/workers/nodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index f8e484bc89..d6507f5aca 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -253,10 +253,10 @@ def mark_tasks_stopped_early(self, error: Optional[Error] = None) -> None: def could_shrink_scaleset(self) -> bool: if self.scaleset_id and ShrinkQueue(self.scaleset_id).should_shrink(): return True - + if self.pool_id and ShrinkQueue(self.pool_id).should_shrink(): return True - + return False def can_process_new_work(self) -> bool: From 4a39638e7e75ec429752574f851d20dec76fcb1d Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 15:44:39 -0400 Subject: [PATCH 46/66] regen --- docs/webhook_events.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 1dfc7b1c1b..ac9f9abdbf 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -5509,7 +5509,6 @@ Each event will be submitted via HTTP POST to the user provided URL. "scaleset_created", "scaleset_deleted", "scaleset_failed", - "scaleset_resize_scheduled", "scaleset_state_updated", "scaleset_resize_scheduled", "task_created", @@ -6220,9 +6219,6 @@ Each event will be submitted via HTTP POST to the user provided URL. 
{ "$ref": "#/definitions/EventScalesetDeleted" }, - { - "$ref": "#/definitions/EventScalesetResizeScheduled" - }, { "$ref": "#/definitions/EventScalesetStateUpdated" }, From 9d6b897fd9afb2e0f856f05b5a45d17e25c64b3a Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 15:48:26 -0400 Subject: [PATCH 47/66] move to can_update --- .../__app__/onefuzzlib/workers/autoscale.py | 2 +- src/pytypes/onefuzztypes/enums.py | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 549b2f4fa4..2d7ffa927d 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -51,7 +51,7 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: if to_add <= 0: break - if scaleset.state in ScalesetState.can_resize(): + if scaleset.state in ScalesetState.can_update(): scaleset_max_size = scaleset.max_size() if scaleset.size < scaleset_max_size: scaleset_to_add = min(to_add, scaleset_max_size - scaleset.size) diff --git a/src/pytypes/onefuzztypes/enums.py b/src/pytypes/onefuzztypes/enums.py index 5909c22444..5de79ccf41 100644 --- a/src/pytypes/onefuzztypes/enums.py +++ b/src/pytypes/onefuzztypes/enums.py @@ -330,22 +330,6 @@ def include_autoscale_count(cls) -> List["ScalesetState"]: unavailable = [cls.halt, cls.init, cls.setup] return [x for x in cls if x not in unavailable] - @classmethod - def can_resize(cls) -> List["ScalesetState"]: - """ - set of states that indicate the scaleset can be resized - """ - return [cls.running, cls.resize] - - @classmethod - def modifying(cls) -> List["ScalesetState"]: - """set of states that indicate scaleset is resizing""" - return [ - cls.halt, - cls.init, - cls.setup, - ] - class Architecture(Enum): x86_64 = "x86_64" From aa1600c3db42f7b18d45c8d9cd223fd89d53f96a Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 15:50:36 -0400 Subject: [PATCH 48/66] remove redundent entry --- src/pytypes/extra/generate-docs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/pytypes/extra/generate-docs.py b/src/pytypes/extra/generate-docs.py index c4b75ce4a1..260a686f37 100755 --- a/src/pytypes/extra/generate-docs.py +++ b/src/pytypes/extra/generate-docs.py @@ -256,9 +256,6 @@ def main() -> None: report=report, ), EventFileAdded(container=Container("container-name"), filename="example.txt"), - EventScalesetResizeScheduled( - scaleset_id=UUID(int=0), pool_name=PoolName("example"), size=0 - ), EventNodeHeartbeat(machine_id=UUID(int=0), pool_name=PoolName("example")), EventTaskHeartbeat(task_id=UUID(int=0), job_id=UUID(int=0), config=task_config), EventInstanceConfigUpdated(config=InstanceConfig(admins=[UUID(int=0)])), From b61c471f17109fa726be261ede2d2b8942d1a80e Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 16:12:25 -0400 Subject: [PATCH 49/66] use the built-in size validators Using the built-in size validators means the documentation includes the min/max values --- docs/webhook_events.md | 56 ++++++++++++++++++ src/pytypes/onefuzztypes/models.py | 88 ++++------------------------ src/pytypes/onefuzztypes/requests.py | 34 ++--------- 3 files changed, 72 insertions(+), 106 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 0f10a122ce..9dc79c7599 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -359,6 +359,7 @@ Each event will be submitted via HTTP POST to the user 
provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -367,6 +368,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -471,6 +474,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -532,6 +536,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -707,6 +712,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -863,6 +870,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -1221,11 +1230,16 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "max_size": { + "default": 1000, + "maximum": 1000, + "minimum": 0, "title": "Max Size", "type": "integer" }, "min_size": { "default": 0, + "maximum": 1000, + "minimum": 0, "title": "Min Size", "type": "integer" }, @@ -1944,6 +1958,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -1952,6 +1967,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -2056,6 +2073,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -2117,6 +2135,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -2651,6 +2670,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -2659,6 +2679,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -2763,6 +2785,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -2824,6 +2847,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -3149,6 +3173,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -3157,6 +3182,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -3261,6 +3288,7 @@ Each event will be submitted via HTTP POST to the user provided URL. 
"type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -3322,6 +3350,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -3592,6 +3621,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -3600,6 +3630,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -3704,6 +3736,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -3765,6 +3798,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -4009,6 +4043,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -4017,6 +4052,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -4121,6 +4158,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -4196,6 +4234,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -4453,6 +4492,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -4461,6 +4501,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -4565,6 +4607,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -4626,6 +4669,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, @@ -4732,11 +4776,16 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "max_size": { + "default": 1000, + "maximum": 1000, + "minimum": 0, "title": "Max Size", "type": "integer" }, "min_size": { "default": 0, + "maximum": 1000, + "minimum": 0, "title": "Min Size", "type": "integer" }, @@ -5562,6 +5611,8 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -5926,6 +5977,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "check_retry_count": { + "minimum": 0, "title": "Check Retry Count", "type": "integer" }, @@ -5934,6 +5986,8 @@ Each event will be submitted via HTTP POST to the user provided URL. 
"type": "string" }, "duration": { + "maximum": 168, + "minimum": 1, "title": "Duration", "type": "integer" }, @@ -6038,6 +6092,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "type": "boolean" }, "target_timeout": { + "minimum": 1, "title": "Target Timeout", "type": "integer" }, @@ -6113,6 +6168,7 @@ Each event will be submitted via HTTP POST to the user provided URL. "properties": { "count": { "default": 1, + "minimum": 0, "title": "Count", "type": "integer" }, diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index a16b8fd0bf..bc805e4bf2 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -119,30 +119,18 @@ class JobConfig(BaseModel): project: str name: str build: str - duration: int - - @validator("duration", allow_reuse=True) - def check_duration(cls, value: int) -> int: - if value < ONE_HOUR or value > SEVEN_DAYS: - raise ValueError("invalid duration") - return value + duration: int = Field(ge=ONE_HOUR, le=SEVEN_DAYS) class ReproConfig(BaseModel): container: Container path: str - duration: int - - @validator("duration", allow_reuse=True) - def check_duration(cls, value: int) -> int: - if value < ONE_HOUR or value > SEVEN_DAYS: - raise ValueError("invalid duration") - return value + duration: int = Field(ge=ONE_HOUR, le=SEVEN_DAYS) class TaskDetails(BaseModel): type: TaskType - duration: int + duration: int = Field(ge=ONE_HOUR, le=SEVEN_DAYS) target_exe: Optional[str] target_env: Optional[Dict[str, str]] target_options: Optional[List[str]] @@ -150,7 +138,7 @@ class TaskDetails(BaseModel): target_options_merge: Optional[bool] check_asan_log: Optional[bool] check_debugger: Optional[bool] = Field(default=True) - check_retry_count: Optional[int] + check_retry_count: Optional[int] = Field(ge=0) check_fuzzer_help: Optional[bool] expect_crash_on_failure: Optional[bool] rename_output: Optional[bool] @@ -168,33 +156,13 @@ class TaskDetails(BaseModel): stats_file: Optional[str] stats_format: Optional[StatsFormat] reboot_after_setup: Optional[bool] - target_timeout: Optional[int] + target_timeout: Optional[int] = Field(ge=1) ensemble_sync_delay: Optional[int] preserve_existing_outputs: Optional[bool] report_list: Optional[List[str]] minimized_stack_depth: Optional[int] coverage_filter: Optional[str] - @validator("check_retry_count", allow_reuse=True) - def validate_check_retry_count(cls, value: int) -> int: - if value is not None: - if value < 0: - raise ValueError("invalid check_retry_count") - return value - - @validator("target_timeout", allow_reuse=True) - def check_target_timeout(cls, value: Optional[int]) -> Optional[int]: - if value is not None: - if value < 1: - raise ValueError("invalid target_timeout") - return value - - @validator("duration", allow_reuse=True) - def check_duration(cls, value: int) -> int: - if value < ONE_HOUR or value > SEVEN_DAYS: - raise ValueError("invalid duration") - return value - class TaskPool(BaseModel): count: int @@ -205,16 +173,10 @@ class TaskVm(BaseModel): region: Region sku: str image: str - count: int = Field(default=1) + count: int = Field(default=1, ge=0) spot_instances: bool = Field(default=False) reboot_after_setup: Optional[bool] - @validator("count", allow_reuse=True) - def check_count(cls, value: int) -> int: - if value <= 0: - raise ValueError("invalid count") - return value - class TaskContainers(BaseModel): type: ContainerType @@ -607,6 +569,7 @@ class NodeCommandEnvelope(BaseModel): class Node(BaseModel): timestamp: Optional[datetime] = 
Field(alias="Timestamp") pool_name: PoolName + pool_id: Optional[UUID] machine_id: UUID state: NodeState = Field(default=NodeState.init) scaleset_id: Optional[UUID] = None @@ -632,43 +595,20 @@ class NodeTasks(BaseModel): class AutoScaleConfig(BaseModel): image: str - max_size: Optional[int] # max size of pool - min_size: int = Field(default=0) # min size of pool + max_size: int = Field(default=1000, le=1000, ge=0) # max size of pool + min_size: int = Field(default=0, le=1000, ge=0) # min size of pool region: Optional[Region] scaleset_size: int # Individual scaleset size spot_instances: bool = Field(default=False) ephemeral_os_disks: bool = Field(default=False) vm_sku: str - @validator("scaleset_size", allow_reuse=True) - def check_scaleset_size(cls, value: int) -> int: - if value < 1 or value > 1000: - raise ValueError("invalid scaleset size") - return value - @root_validator() def check_data(cls, values: Any) -> Any: - if ( - "max_size" in values - and values.get("max_size") - and values.get("min_size") > values.get("max_size") - ): + if values['min_size'] <= values['max_size']: raise ValueError("The pool min_size is greater than max_size") return values - @validator("max_size", allow_reuse=True) - def check_max_size(cls, value: Optional[int]) -> Optional[int]: - if value and value < 1: - raise ValueError("Autoscale sizes are not defined properly") - return value - - @validator("min_size", allow_reuse=True) - def check_min_size(cls, value: int) -> int: - if value < 0 or value > 1000: - raise ValueError("Invalid pool min_size") - return value - - class Pool(BaseModel): timestamp: Optional[datetime] = Field(alias="Timestamp") name: PoolName @@ -706,7 +646,7 @@ class Scaleset(BaseModel): vm_sku: str image: str region: Region - size: int + size: int = Field(ge=0) spot_instances: bool ephemeral_os_disks: bool = Field(default=False) needs_config_update: bool = Field(default=False) @@ -716,12 +656,6 @@ class Scaleset(BaseModel): client_object_id: Optional[UUID] tags: Dict[str, str] = Field(default_factory=lambda: {}) - @validator("size", allow_reuse=True) - def check_size(cls, value: int) -> int: - if value < 0: - raise ValueError("Invalid scaleset size") - return value - class NotificationConfig(BaseModel): config: NotificationTemplate diff --git a/src/pytypes/onefuzztypes/requests.py b/src/pytypes/onefuzztypes/requests.py index 8180b1b24e..c226dd10e8 100644 --- a/src/pytypes/onefuzztypes/requests.py +++ b/src/pytypes/onefuzztypes/requests.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional from uuid import UUID -from pydantic import AnyHttpUrl, BaseModel, Field, root_validator, validator +from pydantic import AnyHttpUrl, BaseModel, Field, root_validator from ._monkeypatch import _check_hotfix from .consts import ONE_HOUR, SEVEN_DAYS @@ -58,13 +58,7 @@ class TaskSearch(BaseRequest): class TaskResize(TaskGet): - count: int - - @validator("count", allow_reuse=True) - def check_count(cls, value: int) -> int: - if value <= 0: - raise ValueError("invalid count") - return value + count: int = Field(ge=1) class NodeCommandGet(BaseRequest): @@ -128,13 +122,7 @@ class ProxyCreate(BaseRequest): scaleset_id: UUID machine_id: UUID dst_port: int - duration: int - - @validator("duration", allow_reuse=True) - def check_duration(cls, value: int) -> int: - if value < ONE_HOUR or value > SEVEN_DAYS: - raise ValueError("invalid duration") - return value + duration: int = Field(ge=ONE_HOUR, le=SEVEN_DAYS) class ProxyDelete(BaseRequest): @@ -172,13 +160,7 @@ class ScalesetStop(BaseRequest): class 
ScalesetUpdate(BaseRequest): scaleset_id: UUID - size: Optional[int] - - @validator("size", allow_reuse=True) - def check_optional_size(cls, value: Optional[int]) -> Optional[int]: - if value is not None and value < 0: - raise ValueError("invalid size") - return value + size: Optional[int] = Field(ge=1) class ScalesetCreate(BaseRequest): @@ -186,17 +168,11 @@ class ScalesetCreate(BaseRequest): vm_sku: str image: str region: Optional[Region] - size: int + size: int = Field(ge=1) spot_instances: bool ephemeral_os_disks: bool = Field(default=False) tags: Dict[str, str] - @validator("size", allow_reuse=True) - def check_size(cls, value: int) -> int: - if value <= 0: - raise ValueError("invalid size") - return value - class ContainerGet(BaseRequest): name: Container From fa40b76908574e9374e8652228b6e066360679f4 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 16:15:53 -0400 Subject: [PATCH 50/66] lint --- src/pytypes/onefuzztypes/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index bc805e4bf2..06d4a603f8 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -605,10 +605,11 @@ class AutoScaleConfig(BaseModel): @root_validator() def check_data(cls, values: Any) -> Any: - if values['min_size'] <= values['max_size']: + if values["min_size"] <= values["max_size"]: raise ValueError("The pool min_size is greater than max_size") return values + class Pool(BaseModel): timestamp: Optional[datetime] = Field(alias="Timestamp") name: PoolName From 144475e997eff64c931b0eff5edd0fe1f58f2b71 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 16:27:03 -0400 Subject: [PATCH 51/66] remove extra docs --- docs/webhook_events.md | 8 ++++---- src/pytypes/onefuzztypes/models.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/webhook_events.md b/docs/webhook_events.md index 28068f922a..145316ac71 100644 --- a/docs/webhook_events.md +++ b/docs/webhook_events.md @@ -1233,14 +1233,14 @@ Each event will be submitted via HTTP POST to the user provided URL. "default": 1000, "maximum": 1000, "minimum": 0, - "title": "maximum size of the pool", + "title": "Max Size", "type": "integer" }, "min_size": { "default": 0, "maximum": 1000, "minimum": 0, - "title": "minimum size of the pool", + "title": "Min Size", "type": "integer" }, "region": { @@ -4780,14 +4780,14 @@ Each event will be submitted via HTTP POST to the user provided URL. 
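Field(ge=..., le=...) handles per-field bounds, but the relationship between min_size and max_size spans two fields, which is why AutoScaleConfig keeps the check_data root_validator shown above. As written, its comparison is inverted relative to its own error message; a later commit in this series flips it to use >. A minimal sketch of the corrected cross-field check (pydantic v1 API, illustrative model name):

    from typing import Any

    from pydantic import BaseModel, Field, root_validator

    class SizeBounds(BaseModel):
        min_size: int = Field(default=0, ge=0, le=1000)
        max_size: int = Field(default=1000, ge=0, le=1000)

        @root_validator()
        def check_data(cls, values: Any) -> Any:
            # a cross-field rule that per-field ge/le cannot express
            if values["min_size"] > values["max_size"]:
                raise ValueError("min_size is greater than max_size")
            return values

    SizeBounds(min_size=10, max_size=100)  # passes
    # SizeBounds(min_size=200, max_size=100) raises a ValidationError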
"default": 1000, "maximum": 1000, "minimum": 0, - "title": "maximum size of the pool", + "title": "Max Size", "type": "integer" }, "min_size": { "default": 0, "maximum": 1000, "minimum": 0, - "title": "minimum size of the pool", + "title": "Min Size", "type": "integer" }, "region": { diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index bb8b6f67dc..7168854be1 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -595,8 +595,8 @@ class NodeTasks(BaseModel): class AutoScaleConfig(BaseModel): image: str - max_size: int = Field(default=1000, le=1000, ge=0, title="maximum size of the pool") - min_size: int = Field(default=0, le=1000, ge=0, title="minimum size of the pool") + max_size: int = Field(default=1000, le=1000, ge=0) + min_size: int = Field(default=0, le=1000, ge=0) region: Optional[Region] scaleset_size: int = Field(default=0, description="unused") spot_instances: bool = Field(default=False) From aecd130cdc3e0e9fd1f95552fb6da219172047b5 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 16:42:41 -0400 Subject: [PATCH 52/66] make backwards compatable from the API perspective --- src/cli/onefuzz/api.py | 4 ++-- src/pytypes/onefuzztypes/models.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index fe548127ef..d878ae0d68 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -1184,8 +1184,8 @@ def create_autoscale( client_id: Optional[UUID] = None, *, arch: enums.Architecture = enums.Architecture.x86_64, - min_size: Optional[int] = None, - max_size: Optional[int] = None, + min_size: int = models.AutoScaleConfig.__fields__['min_size'].default, + max_size: int = models.AutoScaleConfig.__fields__['max_size'].default, image: Optional[str] = None, vm_sku: Optional[str] = "Standard_D2s_v3", region: Optional[primitives.Region] = None, diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index 7168854be1..ab6018211c 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -598,14 +598,14 @@ class AutoScaleConfig(BaseModel): max_size: int = Field(default=1000, le=1000, ge=0) min_size: int = Field(default=0, le=1000, ge=0) region: Optional[Region] - scaleset_size: int = Field(default=0, description="unused") + scaleset_size: int = Field(default=1, description="unused") spot_instances: bool = Field(default=False) ephemeral_os_disks: bool = Field(default=False) vm_sku: str @root_validator() def check_data(cls, values: Any) -> Any: - if values["min_size"] <= values["max_size"]: + if values["min_size"] > values["max_size"]: raise ValueError("The pool min_size is greater than max_size") return values From 9ad178172b78d552f8607b23fa195a27efaa9812 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 16:47:35 -0400 Subject: [PATCH 53/66] use set_size --- .../__app__/onefuzzlib/workers/autoscale.py | 2 +- .../__app__/onefuzzlib/workers/scalesets.py | 14 +------------- src/cli/onefuzz/api.py | 4 ++-- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index 2d7ffa927d..a763857e96 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -63,7 +63,7 @@ def scale_up(pool: Pool, scalesets: List[Scaleset], to_add: int) -> None: scaleset.size, scaleset_to_add, ) - 
scaleset.set_new_size(scaleset.size + scaleset_to_add) + scaleset.set_size(scaleset.size + scaleset_to_add) to_add -= scaleset_to_add region = config.region or get_base_region() diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index e9645cf5bb..8e7adb1b63 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -421,7 +421,7 @@ def cleanup_nodes(self) -> bool: if pool.autoscale and self.state == ScalesetState.running: ground_truth_size = get_vmss_size(self.scaleset_id) if ground_truth_size is not None and ground_truth_size != self.size: - self.set_new_size(ground_truth_size) + self.set_size(ground_truth_size) return bool(to_reimage) or bool(to_delete) @@ -815,18 +815,6 @@ def update_configs(self) -> None: self.scaleset_id, ) - def set_new_size(self, size: int) -> None: - # ensure we always stay within max_size boundaries - self.size = min(size, self.max_size()) - self.state = ScalesetState.resize - self.save() - - send_event( - EventScalesetResizeScheduled( - scaleset_id=self.scaleset_id, pool_name=self.pool_name, size=self.size - ) - ) - @classmethod def key_fields(cls) -> Tuple[str, str]: return ("pool_name", "scaleset_id") diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py index d878ae0d68..3df47f41ed 100644 --- a/src/cli/onefuzz/api.py +++ b/src/cli/onefuzz/api.py @@ -1184,8 +1184,8 @@ def create_autoscale( client_id: Optional[UUID] = None, *, arch: enums.Architecture = enums.Architecture.x86_64, - min_size: int = models.AutoScaleConfig.__fields__['min_size'].default, - max_size: int = models.AutoScaleConfig.__fields__['max_size'].default, + min_size: int = models.AutoScaleConfig.__fields__["min_size"].default, + max_size: int = models.AutoScaleConfig.__fields__["max_size"].default, image: Optional[str] = None, vm_sku: Optional[str] = "Standard_D2s_v3", region: Optional[primitives.Region] = None, From ac1d3bc756dbeba670bbf41e095cdbbc6db52ab1 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:07:22 -0400 Subject: [PATCH 54/66] use current resize functionality --- .../__app__/onefuzzlib/workers/scalesets.py | 40 ++++--------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 8e7adb1b63..b68c02bdc4 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -418,11 +418,6 @@ def cleanup_nodes(self) -> bool: self.scaleset_id, ) - if pool.autoscale and self.state == ScalesetState.running: - ground_truth_size = get_vmss_size(self.scaleset_id) - if ground_truth_size is not None and ground_truth_size != self.size: - self.set_size(ground_truth_size) - return bool(to_reimage) or bool(to_delete) def _resize_equal(self) -> None: @@ -520,8 +515,8 @@ def resize(self) -> None: self.size = min(self.size, self.max_size()) # Treat Azure knowledge of the size of the scaleset as "ground truth" - ground_truth_size = get_vmss_size(self.scaleset_id) - if ground_truth_size is None: + size = get_vmss_size(self.scaleset_id) + if size is None: logging.info( SCALESET_LOG_PREFIX + "scaleset is unavailable. scaleset_id:%s", self.scaleset_id, @@ -532,33 +527,12 @@ def resize(self) -> None: self.set_shutdown(now=True) return - if ground_truth_size < self.size: - logging.info( - "scaleset resize - growing. 
scaleset:%s new_size:%d azure_size:%d", - self.scaleset_id, - self.size, - ground_truth_size, - ) - try: - resize_vmss(self.scaleset_id, self.size) - except UnableToUpdate: - logging.info( - "scaleset resize - unable to update, mid-operation already" - ) - return - elif ground_truth_size > self.size: - to_remove = ground_truth_size - self.size - logging.info( - "scaleset resize - shrinking. scaleset:%s removing:%d", - self.scaleset_id, - to_remove, - ) - ShrinkQueue(self.scaleset_id).set_size(to_remove) + if size == self.size: + self._resize_equal() + elif self.size > size: + self._resize_grow() else: - logging.info("scaleset resize - no change. scaleset:%s", self.scaleset_id) - - self.state = ScalesetState.running - self.save() + self._resize_shrink(size - self.size) def delete_nodes(self, nodes: List[Node]) -> None: if not nodes: From 37e943be10cbc4f4a7a307c5d3b3b45cd5fa6b90 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:11:17 -0400 Subject: [PATCH 55/66] set pool_id on node creation --- .../__app__/agent_registration/__init__.py | 3 ++- src/api-service/__app__/onefuzzlib/workers/nodes.py | 8 ++++++++ .../__app__/onefuzzlib/workers/scalesets.py | 13 +++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/api-service/__app__/agent_registration/__init__.py b/src/api-service/__app__/agent_registration/__init__.py index d5cffd6bb5..98ce1c3024 100644 --- a/src/api-service/__app__/agent_registration/__init__.py +++ b/src/api-service/__app__/agent_registration/__init__.py @@ -102,7 +102,8 @@ def post(req: func.HttpRequest) -> func.HttpResponse: node.delete() node = Node.create( - pool_name=registration_request.pool_name, + pool_id=pool.pool_id, + pool_name=pool.name, machine_id=registration_request.machine_id, scaleset_id=registration_request.scaleset_id, version=registration_request.version, diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 0a254000e5..5d4a3980a0 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -48,6 +48,7 @@ class Node(BASE_NODE, ORMMixin): def create( cls, *, + pool_id: UUID, pool_name: PoolName, machine_id: UUID, scaleset_id: Optional[UUID], @@ -55,6 +56,7 @@ def create( new: bool = False, ) -> "Node": node = cls( + pool_id=pool_id, pool_name=pool_name, machine_id=machine_id, scaleset_id=scaleset_id, @@ -78,11 +80,14 @@ def create( def search_states( cls, *, + pool_id: Optional[UUID] = None, scaleset_id: Optional[UUID] = None, states: Optional[List[NodeState]] = None, pool_name: Optional[PoolName] = None, ) -> List["Node"]: query: QueryFilter = {} + if pool_id: + query["pool_id"] = [pool_id] if scaleset_id: query["scaleset_id"] = [scaleset_id] if states: @@ -95,6 +100,7 @@ def search_states( def search_outdated( cls, *, + pool_id: Optional[UUID] = None, scaleset_id: Optional[UUID] = None, states: Optional[List[NodeState]] = None, pool_name: Optional[PoolName] = None, @@ -102,6 +108,8 @@ def search_outdated( num_results: Optional[int] = None, ) -> List["Node"]: query: QueryFilter = {} + if pool_id: + query["pool_id"] = [pool_id] if scaleset_id: query["scaleset_id"] = [scaleset_id] if states: diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 2e9d9d09fc..08255bd6ff 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ 
-291,6 +291,8 @@ def get_error() -> Error: # result = 'did I modify the scaleset in azure' def cleanup_nodes(self) -> bool: + from .pools import Pool + logging.info( SCALESET_LOG_PREFIX + "cleaning up nodes. scaleset_id:%s", self.scaleset_id ) @@ -302,6 +304,16 @@ def cleanup_nodes(self) -> bool: self.halt() return True + pool = Pool.get_by_name(self.pool_name) + if isinstance(pool, Error): + logging.error( + "unable to find pool during cleanup: %s - %s", + self.scaleset_id, + pool, + ) + self.set_failed(pool) + return True + Node.reimage_long_lived_nodes(self.scaleset_id) to_reimage = [] @@ -346,6 +358,7 @@ def cleanup_nodes(self) -> bool: # Note, using `new=True` makes it such that if a node already has # checked in, this won't overwrite it. Node.create( + pool_id=pool.pool_id, pool_name=self.pool_name, machine_id=machine_id, scaleset_id=self.scaleset_id, From a6ed5fd5466d5d51fb0c389cbdbf69fcfda8d4cb Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:16:22 -0400 Subject: [PATCH 56/66] remove unused change --- src/cli/onefuzz/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli/onefuzz/cli.py b/src/cli/onefuzz/cli.py index b88dc45aac..cff5aa307e 100644 --- a/src/cli/onefuzz/cli.py +++ b/src/cli/onefuzz/cli.py @@ -158,8 +158,8 @@ def __init__(self, api_types: List[Any]): int: {"type": int}, UUID: {"type": UUID}, Container: {"type": str}, - PoolName: {"type": str}, Region: {"type": str}, + PoolName: {"type": str}, File: {"type": arg_file}, Directory: {"type": arg_dir}, } From 83435d79c217fd904d1db3df3d2169b2aee3c441 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:18:12 -0400 Subject: [PATCH 57/66] remove changes from upstream --- src/api-service/__app__/onefuzzlib/workers/nodes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 26283d3d6b..950f8e739e 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -50,7 +50,6 @@ def create( *, pool_id: UUID, pool_name: PoolName, - pool_id: UUID, machine_id: UUID, scaleset_id: Optional[UUID], version: str, @@ -59,7 +58,6 @@ def create( node = cls( pool_id=pool_id, pool_name=pool_name, - pool_id=pool_id, machine_id=machine_id, scaleset_id=scaleset_id, version=version, @@ -92,8 +90,6 @@ def search_states( query["pool_id"] = [pool_id] if scaleset_id: query["scaleset_id"] = [scaleset_id] - if pool_id: - query["pool_id"] = [pool_id] if states: query["state"] = states if pool_name: From dd11a4c26c87c305222c8c520c36a2f300cbe092 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:28:22 -0400 Subject: [PATCH 58/66] add a pool-based shrink-queue This isn't used at the moment, but pulled forwards from the autoscale PR. The functionality replicates the scaleset based shrink-queue, but at the context of pools. 
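A shrink queue is best thought of as a supply of "remove one node" tokens: set_size(n) fills the queue with n entries, and a node that is about to be reimaged first tries to claim a token via should_shrink(); if it succeeds, the node is halted and deleted instead of reimaged, shrinking capacity by one. This commit adds a second such queue keyed by pool_id, so a single shrink request can be satisfied by whichever scaleset in the pool frees a node first. Below is a minimal in-memory sketch of the token semantics only; the real shrink_queue.py is backed by an Azure Storage queue so that every node in the pool shares one token supply:

    import queue
    from uuid import UUID, uuid4

    class ShrinkQueueSketch:
        def __init__(self, base_id: UUID) -> None:
            # keyed by a scaleset_id or, with this change, a pool_id
            self.base_id = base_id
            self._tokens: "queue.Queue[None]" = queue.Queue()

        def set_size(self, size: int) -> None:
            # reset, then enqueue `size` "remove one node" tokens
            while not self._tokens.empty():
                self._tokens.get_nowait()
            for _ in range(size):
                self._tokens.put(None)

        def should_shrink(self) -> bool:
            # claiming a token means: delete this node instead of reimaging it
            try:
                self._tokens.get_nowait()
                return True
            except queue.Empty:
                return False

    shrink = ShrinkQueueSketch(uuid4())
    shrink.set_size(2)
    print([shrink.should_shrink() for _ in range(3)])  # [True, True, False]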
--- .../__app__/onefuzzlib/workers/nodes.py | 4 ++++ .../__app__/onefuzzlib/workers/pools.py | 16 +++++++++++++--- .../__app__/onefuzzlib/workers/scalesets.py | 3 +++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 5d4a3980a0..950f8e739e 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -253,6 +253,10 @@ def mark_tasks_stopped_early(self, error: Optional[Error] = None) -> None: def could_shrink_scaleset(self) -> bool: if self.scaleset_id and ShrinkQueue(self.scaleset_id).should_shrink(): return True + + if self.pool_id and ShrinkQueue(self.pool_id).should_shrink(): + return True + return False def can_process_new_work(self) -> bool: diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index 91c7de5f32..409b9f8331 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -24,6 +24,7 @@ from ..azure.storage import StorageType from ..events import send_event from ..orm import MappingIntStrAny, ORMMixin, QueryFilter +from .shrink_queue import ShrinkQueue NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1) NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7) @@ -98,6 +99,11 @@ def populate_scaleset_summary(self) -> None: for x in Scaleset.search_by_pool(self.name) ] + def peek_work_queue(self) -> List[WorkSet]: + return peek_queue( + self.get_pool_queue(), StorageType.corpus, object_type=WorkSet + ) + def populate_work_queue(self) -> None: self.work_queue = [] @@ -106,11 +112,13 @@ def populate_work_queue(self) -> None: if self.state == PoolState.init: return - worksets = peek_queue( - self.get_pool_queue(), StorageType.corpus, object_type=WorkSet - ) + worksets = self.peek_work_queue() for workset in worksets: + # only include work units with work + if not workset.work_units: + continue + work_units = [ WorkUnitSummary( job_id=work_unit.job_id, @@ -126,6 +134,7 @@ def get_pool_queue(self) -> str: def init(self) -> None: create_queue(self.get_pool_queue(), StorageType.corpus) + ShrinkQueue(self.pool_id).create() self.state = PoolState.running self.save() @@ -216,6 +225,7 @@ def halt(self) -> None: nodes = Node.search(query={"pool_name": [self.name]}) if not scalesets and not nodes: delete_queue(self.get_pool_queue(), StorageType.corpus) + ShrinkQueue(self.pool_id).delete() logging.info("pool stopped, deleting: %s", self.name) self.state = PoolState.halt self.delete() diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 08255bd6ff..dfb02ab582 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -378,6 +378,9 @@ def cleanup_nodes(self) -> bool: if ShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) + elif ShrinkQueue(pool.pool_id).should_shrink(): + node.set_halt() + to_delete.append(node) else: to_reimage.append(node) From 6b61a699520a91c5296c72894262e309e63cba54 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:42:53 -0400 Subject: [PATCH 59/66] more removing changes from upstream --- src/api-service/__app__/onefuzzlib/workers/pools.py | 1 - src/api-service/__app__/onefuzzlib/workers/scalesets.py | 5 +---- 2 files
changed, 1 insertion(+), 5 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/pools.py b/src/api-service/__app__/onefuzzlib/workers/pools.py index f63121a4f5..409b9f8331 100644 --- a/src/api-service/__app__/onefuzzlib/workers/pools.py +++ b/src/api-service/__app__/onefuzzlib/workers/pools.py @@ -245,5 +245,4 @@ def key_fields(cls) -> Tuple[str, str]: def delete(self) -> None: super().delete() - ShrinkQueue(self.pool_id).delete() send_event(EventPoolDeleted(pool_name=self.name)) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index f09366a15d..280c2a5dcb 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -375,10 +375,7 @@ def cleanup_nodes(self) -> bool: if node.delete_requested: to_delete.append(node) else: - if ShrinkQueue(pool.pool_id).should_shrink(): - node.set_halt() - to_delete.append(node) - elif ShrinkQueue(self.scaleset_id).should_shrink(): + if ShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) elif ShrinkQueue(pool.pool_id).should_shrink(): From f40f6306603f8cbb24f67c3d2f2fd949fc2e7446 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 7 Jul 2021 17:44:29 -0400 Subject: [PATCH 60/66] remove from upstream --- src/api-service/__app__/onefuzzlib/workers/scalesets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index 280c2a5dcb..dfb02ab582 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -795,7 +795,6 @@ def key_fields(cls) -> Tuple[str, str]: def delete(self) -> None: super().delete() - ShrinkQueue(self.scaleset_id).delete() send_event( EventScalesetDeleted(scaleset_id=self.scaleset_id, pool_name=self.pool_name) ) From 27307f7c17dca6d77f232f939bde64a69b72eed0 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Thu, 8 Jul 2021 11:30:09 -0400 Subject: [PATCH 61/66] decode doesn't return an optional --- src/api-service/__app__/onefuzzlib/azure/queue.py | 2 +- src/api-service/__app__/onefuzzlib/workers/autoscale.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py index 9c3d765dac..e155c56313 100644 --- a/src/api-service/__app__/onefuzzlib/azure/queue.py +++ b/src/api-service/__app__/onefuzzlib/azure/queue.py @@ -191,7 +191,7 @@ def peek_queue( return result -def decode_message(message: QueueMessage, object_type: Type[A]) -> Optional[A]: +def decode_message(message: QueueMessage, object_type: Type[A]) -> A: decoded = base64.b64decode(message.content) raw = json.loads(decoded) return object_type.parse_obj(raw) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index a763857e96..5b2a9d792f 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -156,10 +156,6 @@ def clear_synthetic_worksets(pool: Pool) -> None: keeping = [] for message in client.receive_messages(): decoded = decode_message(message, WorkSet) - if not decoded: - logging.warning(AUTOSCALE_LOG_PREFIX + "decode workset failed: %s", message) - continue - if decoded.work_units: keeping.append(message) ignored += 1 From 
 src/api-service/__app__/onefuzzlib/azure/queue.py       | 2 +-
 src/api-service/__app__/onefuzzlib/workers/autoscale.py | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py
index 9c3d765dac..e155c56313 100644
--- a/src/api-service/__app__/onefuzzlib/azure/queue.py
+++ b/src/api-service/__app__/onefuzzlib/azure/queue.py
@@ -191,7 +191,7 @@ def peek_queue(
     return result
 
 
-def decode_message(message: QueueMessage, object_type: Type[A]) -> Optional[A]:
+def decode_message(message: QueueMessage, object_type: Type[A]) -> A:
     decoded = base64.b64decode(message.content)
     raw = json.loads(decoded)
     return object_type.parse_obj(raw)
diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py
index a763857e96..5b2a9d792f 100644
--- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py
+++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py
@@ -156,10 +156,6 @@ def clear_synthetic_worksets(pool: Pool) -> None:
     keeping = []
     for message in client.receive_messages():
         decoded = decode_message(message, WorkSet)
-        if not decoded:
-            logging.warning(AUTOSCALE_LOG_PREFIX + "decode workset failed: %s", message)
-            continue
-
         if decoded.work_units:
             keeping.append(message)
             ignored += 1

From 5010047f378ed66aaf3091e6bf3a5a1fa10ed944 Mon Sep 17 00:00:00 2001
From: Brian Caswell
Date: Thu, 8 Jul 2021 13:53:18 -0400
Subject: [PATCH 62/66] remove unused code, and only use set_state for
 changing scaleset state

---
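Note: replacing the direct halt() call with set_state funnels every scaleset
transition through one code path. A hypothetical sketch of why that pattern
helps (this is not the actual ORM implementation):

# Hypothetical sketch: a single guarded setter means persistence (and any
# associated notification) cannot be skipped by a caller mutating .state
# directly or invoking a state-specific method such as halt().
def set_state(self, state: ScalesetState) -> None:
    if self.state == state:
        # ignore no-op transitions so we don't re-save or re-notify
        return
    self.state = state
    self.save()
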
 .../__app__/onefuzzlib/azure/queue.py       | 17 -----------------
 .../__app__/onefuzzlib/workers/autoscale.py |  2 +-
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/src/api-service/__app__/onefuzzlib/azure/queue.py b/src/api-service/__app__/onefuzzlib/azure/queue.py
index e155c56313..d5b12eb394 100644
--- a/src/api-service/__app__/onefuzzlib/azure/queue.py
+++ b/src/api-service/__app__/onefuzzlib/azure/queue.py
@@ -125,23 +125,6 @@ def send_message(
         pass
 
 
-def delete_messages(
-    name: QueueNameType, storage_type: StorageType, messages: List[str]
-) -> None:
-    queue = get_queue(name, storage_type)
-    if not queue:
-        return
-
-    done = []
-    for message in messages:
-        try:
-            queue.delete_message(message)
-            done.append(message)
-        except ResourceNotFoundError:
-            logging.debug("queue message already deleted: %s - %s", name, message)
-    logging.debug("queue messages deleted: %s", done)
-
-
 def remove_first_message(name: QueueNameType, storage_type: StorageType) -> bool:
     queue = get_queue(name, storage_type)
     if queue:
diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py
index 5b2a9d792f..4f677dadef 100644
--- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py
+++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py
@@ -112,7 +112,7 @@ def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None:
                 pool.name,
                 scaleset.scaleset_id,
             )
-            scaleset.halt()
+            scaleset.set_state(ScalesetState.shutdown)

From bf6aa936f5afae6594a27ff607203d0f8694a81e Mon Sep 17 00:00:00 2001
From: Brian Caswell
Date: Fri, 9 Jul 2021 12:36:14 -0400
Subject: [PATCH 63/66] add unit tests for calculating change size

---
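Note: this patch splits the "decide how to grow" logic (calc_scaleset_growth)
from the side-effecting "do it" logic (scale_up) so the former can be unit
tested without Azure. A standalone sketch of the fill-existing-then-create
algorithm, using plain ints instead of pool/scaleset objects:

from typing import List, Tuple

def plan_growth(
    existing: List[Tuple[str, int, int]],  # (scaleset_id, size, max_size)
    to_add: int,
    base_size: int,  # cap on the size of any newly created scaleset
) -> Tuple[List[Tuple[str, int]], List[int]]:
    grow: List[Tuple[str, int]] = []  # (scaleset_id, new_size)
    new: List[int] = []  # sizes of scalesets to create

    # first, top up existing scalesets in a stable (sorted) order
    for scaleset_id, size, max_size in sorted(existing):
        if to_add <= 0:
            break
        room = max_size - size
        if room <= 0:
            continue
        added = min(to_add, room)
        grow.append((scaleset_id, size + added))
        to_add -= added

    # then create new scalesets, each capped at base_size
    while to_add > 0:
        chunk = min(base_size, to_add)
        new.append(chunk)
        to_add -= chunk

    return grow, new

# e.g. plan_growth([("a", 1, 1000)], 3010, 1000)
# -> ([("a", 1000)], [1000, 1000, 11])
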
pool:%s size:%s", + AUTOSCALE_LOG_PREFIX + + "scale up scaleset - pool:%s scaleset:%s " + + "from:%d to:%d", pool.name, - scaleset_size, + scaleset.scaleset_id, + scaleset.size, + new_size, ) + scaleset.set_size(new_size) + + for size in changes.new_scalesets: scaleset = Scaleset.create( pool_name=pool.name, vm_sku=config.vm_sku, image=config.image, region=region, - size=scaleset_size, + size=size, spot_instances=config.spot_instances, ephemeral_os_disks=config.ephemeral_os_disks, tags={"pool": pool.name}, ) logging.info( - AUTOSCALE_LOG_PREFIX + "added pool:%s scaleset:%s", + AUTOSCALE_LOG_PREFIX + "added pool:%s scaleset:%s size:%d", pool.name, scaleset.scaleset_id, + scaleset.size, ) - to_add -= scaleset_size def shutdown_empty_scalesets(pool: Pool, scalesets: List[Scaleset]) -> None: @@ -185,11 +213,12 @@ def needed_nodes(pool: Pool) -> Tuple[int, int]: return (scheduled_worksets, from_nodes) -def autoscale_pool(pool: Pool) -> None: +def calculate_change( + pool: Pool, scalesets: List[Scaleset], scheduled_worksets: int, in_use_nodes: int +) -> Change: if not pool.autoscale: - return + raise Exception(f"scaling up a non-autoscaling pool: {pool.name}") - scheduled_worksets, in_use_nodes = needed_nodes(pool) node_need_estimate = scheduled_worksets + in_use_nodes new_size = node_need_estimate @@ -198,7 +227,6 @@ def autoscale_pool(pool: Pool) -> None: if pool.autoscale.max_size is not None: new_size = min(new_size, pool.autoscale.max_size) - scalesets = Scaleset.search_by_pool(pool.name) current_size = 0 for scaleset in scalesets: valid_auto_scale_states = ScalesetState.include_autoscale_count() @@ -213,7 +241,7 @@ def autoscale_pool(pool: Pool) -> None: pool.name, unable_to_autoscale, ) - return + return Change(scalesets=[], change_size=0, current_size=0) current_size += scaleset.size logging.info( @@ -226,12 +254,29 @@ def autoscale_pool(pool: Pool) -> None: scheduled_worksets, ) - if new_size > current_size: + return Change( + scalesets=scalesets, + current_size=current_size, + change_size=new_size - current_size, + ) + + +def autoscale_pool(pool: Pool) -> None: + if not pool.autoscale: + return + + scalesets = Scaleset.search_by_pool(pool.name) + + scheduled_worksets, in_use_nodes = needed_nodes(pool) + + change = calculate_change(pool, scalesets, scheduled_worksets, in_use_nodes) + + if change.change_size > 0: clear_synthetic_worksets(pool) - scale_up(pool, scalesets, new_size - current_size) - elif current_size > new_size: + scale_up(pool, change.scalesets, change.change_size) + elif change.change_size < 0: clear_synthetic_worksets(pool) - scale_down(pool, scalesets, current_size - new_size) - shutdown_empty_scalesets(pool, scalesets) + scale_down(pool, change.scalesets, abs(change.change_size)) + shutdown_empty_scalesets(pool, change.scalesets) else: - shutdown_empty_scalesets(pool, scalesets) + shutdown_empty_scalesets(pool, change.scalesets) diff --git a/src/api-service/tests/test_autoscale.py b/src/api-service/tests/test_autoscale.py index 59cc5cce66..b7d4745a1f 100644 --- a/src/api-service/tests/test_autoscale.py +++ b/src/api-service/tests/test_autoscale.py @@ -7,11 +7,21 @@ from unittest.mock import MagicMock, patch from uuid import UUID -from onefuzztypes.enums import OS, Architecture -from onefuzztypes.primitives import PoolName +from onefuzztypes.enums import OS, Architecture, ScalesetState +from onefuzztypes.models import AutoScaleConfig +from onefuzztypes.primitives import PoolName, Region -from __app__.onefuzzlib.workers.autoscale import autoscale_pool +from 
@@ -26,3 +36,219 @@ def test_autoscale_pool(self, needed_nodes: MagicMock) -> None:
         )
         autoscale_pool(pool=pool)
         needed_nodes.assert_not_called()
+
+    def test_scale_up(self) -> None:
+        autoscale = AutoScaleConfig(image=IMAGE, vm_sku=VM_SKU)
+        pool = Pool(
+            name=PoolName("test-pool"),
+            pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"),
+            os=OS.linux,
+            managed=False,
+            arch=Architecture.x86_64,
+            autoscale=autoscale,
+        )
+
+        changes = calc_scaleset_growth(pool, [], 0)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [])
+
+        changes = calc_scaleset_growth(pool, [], 10)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [10])
+
+        changes = calc_scaleset_growth(pool, [], 1000)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [1000])
+
+        changes = calc_scaleset_growth(pool, [], 1010)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [1000, 10])
+
+        changes = calc_scaleset_growth(pool, [], 3010)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [1000, 1000, 1000, 10])
+
+        # scaleset_1 state is init, so it should never have changes
+        scaleset_1 = Scaleset(
+            pool_name=pool.name,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=1,
+            spot_instances=False,
+        )
+
+        # scaleset_2 state is running, but already at 1000 instances,
+        # so should never have changes
+        scaleset_2 = Scaleset(
+            pool_name=pool.name,
+            state=ScalesetState.running,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=1000,
+            spot_instances=False,
+        )
+
+        # these should always have modifications to scaleset_3 first,
+        # then scaleset_4 regardless of order in the list
+        scaleset_3 = Scaleset(
+            scaleset_id=UUID(int=0),
+            pool_name=pool.name,
+            state=ScalesetState.running,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=1,
+            spot_instances=False,
+        )
+        scaleset_4 = Scaleset(
+            scaleset_id=UUID(int=1),
+            pool_name=pool.name,
+            state=ScalesetState.running,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=1,
+            spot_instances=False,
+        )
+
+        changes = calc_scaleset_growth(pool, [scaleset_1, scaleset_2], 0)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [])
+
+        changes = calc_scaleset_growth(pool, [scaleset_1, scaleset_2], 10)
+        self.assertEqual(changes.existing, [])
+        self.assertEqual(changes.new_scalesets, [10])
+
+        # verify we can grow existing scalesets
+        changes = calc_scaleset_growth(
+            pool, [scaleset_1, scaleset_2, scaleset_3, scaleset_4], 10
+        )
+        self.assertEqual(changes.existing, [(scaleset_3, 11)])
+        self.assertEqual(changes.new_scalesets, [])
+
+        # verify order doesn't matter
+        changes = calc_scaleset_growth(
+            pool, [scaleset_1, scaleset_2, scaleset_4, scaleset_3], 10
+        )
+        self.assertEqual(changes.existing, [(scaleset_3, 11)])
+        self.assertEqual(changes.new_scalesets, [])
+
+        # verify we can grow multiple scalesets and deal with left over correctly
+        changes = calc_scaleset_growth(
+            pool, [scaleset_1, scaleset_2, scaleset_4, scaleset_3], 3010
+        )
+        self.assertEqual(changes.existing, [(scaleset_3, 1000), (scaleset_4, 1000)])
+        self.assertEqual(changes.new_scalesets, [1000, 12])
+
+    def test_calculate_change(self) -> None:
+        # no min size, no max size
+        autoscale_1 = AutoScaleConfig(image=IMAGE, vm_sku=VM_SKU)
+        pool = Pool(
+            name=PoolName("test-pool"),
+            pool_id=UUID("6b049d51-23e9-4f5c-a5af-ff1f73d0d9e9"),
+            os=OS.linux,
+            managed=False,
+            arch=Architecture.x86_64,
+            autoscale=autoscale_1,
+        )
+
+        # needed to make mypy happy later on
+        assert pool.autoscale is not None
+
+        scaleset_1 = Scaleset(
+            pool_name=pool.name,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=1,
+            spot_instances=False,
+            state=ScalesetState.running,
+        )
+        scaleset_2 = Scaleset(
+            pool_name=pool.name,
+            vm_sku=VM_SKU,
+            image=IMAGE,
+            region=REGION,
+            size=2,
+            spot_instances=False,
+            state=ScalesetState.running,
+        )
+
+        # no scalesets, but need work
+        change = calculate_change(
+            pool, scalesets=[], scheduled_worksets=0, in_use_nodes=0
+        )
+        self.assertEqual(change.change_size, 0)
+
+        change = calculate_change(
+            pool, scalesets=[], scheduled_worksets=10, in_use_nodes=0
+        )
+        self.assertEqual(change.change_size, 10)
+
+        # 3 unused node, 0 used nodes, 10 upcoming jobs
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=10,
+            in_use_nodes=0,
+        )
+        self.assertEqual(change.change_size, 7)
+
+        # 2 unused node, 1 used nodes, 10 upcoming jobs
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=10,
+            in_use_nodes=1,
+        )
+        self.assertEqual(change.change_size, 8)
+
+        # 2 unused node, 1 used nodes, 0 upcoming jobs
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=0,
+            in_use_nodes=1,
+        )
+        self.assertEqual(change.change_size, -2)
+
+        # 2 unused node, 1 used nodes, 0 upcoming jobs
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=0,
+            in_use_nodes=1,
+        )
+        self.assertEqual(change.change_size, -2)
+
+        # now set the minimum size
+        pool.autoscale.min_size = 5
+
+        # min size needs 3 more nodes
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=0,
+            in_use_nodes=0,
+        )
+        self.assertEqual(change.change_size, 2)
+
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=10,
+            in_use_nodes=0,
+        )
+        self.assertEqual(change.change_size, 7)
+
+        # now set the minimum size
+        pool.autoscale.max_size = 6
+        change = calculate_change(
+            pool,
+            scalesets=[scaleset_1, scaleset_2],
+            scheduled_worksets=100,
+            in_use_nodes=0,
+        )
+        self.assertEqual(change.change_size, 3)

From e44a6810263f95f341c3a6535d13f4fdd607582f Mon Sep 17 00:00:00 2001
From: Brian Caswell
Date: Fri, 9 Jul 2021 13:06:55 -0400
Subject: [PATCH 64/66] regen docs

---
 docs/webhook_events.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/webhook_events.md b/docs/webhook_events.md
index 145316ac71..36985a67a1 100644
--- a/docs/webhook_events.md
+++ b/docs/webhook_events.md
@@ -1248,7 +1248,7 @@ Each event will be submitted via HTTP POST to the user provided URL.
                     "type": "string"
                 },
                 "scaleset_size": {
-                    "default": 0,
+                    "default": 1,
                     "description": "unused",
                     "title": "Scaleset Size",
                     "type": "integer"
@@ -4795,7 +4795,7 @@ Each event will be submitted via HTTP POST to the user provided URL.
                     "type": "string"
                 },
                 "scaleset_size": {
-                    "default": 0,
+                    "default": 1,
                     "description": "unused",
                     "title": "Scaleset Size",
                     "type": "integer"

From 96b92349590327d010e812449b64870fedfed299 Mon Sep 17 00:00:00 2001
From: Brian Caswell
Date: Wed, 14 Jul 2021 15:04:38 -0400
Subject: [PATCH 65/66] add extra available node size

---
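Note: after this patch, needed_nodes returns a single target node count, the
scheduled worksets plus the in-use nodes plus the optional extra_available_size
buffer, and calculate_change only clamps that target to min_size/max_size. A
small worked example of the arithmetic (plain functions, no Azure; names here
are illustrative, not the module's):

from typing import Optional

def target_size(
    scheduled_worksets: int,
    in_use_nodes: int,
    extra_available_size: Optional[int],
) -> int:
    # mirrors needed_nodes: demand plus an optional idle buffer
    total = scheduled_worksets + in_use_nodes
    if extra_available_size:
        total += extra_available_size
    return total

def clamp(new_size: int, min_size: Optional[int], max_size: Optional[int]) -> int:
    # mirrors calculate_change's min/max handling
    if min_size is not None:
        new_size = max(new_size, min_size)
    if max_size is not None:
        new_size = min(new_size, max_size)
    return new_size

# 4 queued worksets, 2 busy nodes, a buffer of 5 -> want 11 nodes,
# clamped by a max_size of 8 -> 8
assert clamp(target_size(4, 2, 5), 0, 8) == 8
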
"type": "string" }, "scaleset_size": { - "default": 0, + "default": 1, "description": "unused", "title": "Scaleset Size", "type": "integer" From 96b92349590327d010e812449b64870fedfed299 Mon Sep 17 00:00:00 2001 From: Brian Caswell Date: Wed, 14 Jul 2021 15:04:38 -0400 Subject: [PATCH 65/66] add extra available node size --- .../__app__/onefuzzlib/workers/autoscale.py | 30 ++++---- src/api-service/tests/test_autoscale.py | 70 +++---------------- src/pytypes/onefuzztypes/models.py | 8 ++- 3 files changed, 32 insertions(+), 76 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/autoscale.py b/src/api-service/__app__/onefuzzlib/workers/autoscale.py index da637f3d0f..79c2c67057 100644 --- a/src/api-service/__app__/onefuzzlib/workers/autoscale.py +++ b/src/api-service/__app__/onefuzzlib/workers/autoscale.py @@ -201,7 +201,10 @@ def clear_synthetic_worksets(pool: Pool) -> None: ) -def needed_nodes(pool: Pool) -> Tuple[int, int]: +def needed_nodes(pool: Pool) -> int: + if not pool.autoscale: + raise Exception(f"scaling up a non-autoscaling pool: {pool.name}") + # NOTE: queue peek only returns the first 30 objects. workset_queue = pool.peek_work_queue() # only count worksets with work @@ -210,20 +213,20 @@ def needed_nodes(pool: Pool) -> Tuple[int, int]: nodes = Node.search_states(pool_name=pool.name, states=NodeState.in_use()) from_nodes = len(nodes) - return (scheduled_worksets, from_nodes) + node_count = scheduled_worksets + from_nodes + if pool.autoscale.extra_available_size: + node_count += pool.autoscale.extra_available_size + return node_count -def calculate_change( - pool: Pool, scalesets: List[Scaleset], scheduled_worksets: int, in_use_nodes: int -) -> Change: + +def calculate_change(pool: Pool, scalesets: List[Scaleset], new_size: int) -> Change: if not pool.autoscale: raise Exception(f"scaling up a non-autoscaling pool: {pool.name}") - node_need_estimate = scheduled_worksets + in_use_nodes - - new_size = node_need_estimate if pool.autoscale.min_size is not None: - new_size = max(node_need_estimate, pool.autoscale.min_size) + new_size = max(new_size, pool.autoscale.min_size) + if pool.autoscale.max_size is not None: new_size = min(new_size, pool.autoscale.max_size) @@ -245,13 +248,10 @@ def calculate_change( current_size += scaleset.size logging.info( - AUTOSCALE_LOG_PREFIX + "status - pool:%s current_size: %d new_size: %d " - "(in-use nodes: %d, scheduled worksets: %d)", + AUTOSCALE_LOG_PREFIX + "status - pool:%s current_size: %d new_size: %d", pool.name, current_size, new_size, - in_use_nodes, - scheduled_worksets, ) return Change( @@ -267,9 +267,9 @@ def autoscale_pool(pool: Pool) -> None: scalesets = Scaleset.search_by_pool(pool.name) - scheduled_worksets, in_use_nodes = needed_nodes(pool) + node_count = needed_nodes(pool) - change = calculate_change(pool, scalesets, scheduled_worksets, in_use_nodes) + change = calculate_change(pool, scalesets, node_count) if change.change_size > 0: clear_synthetic_worksets(pool) diff --git a/src/api-service/tests/test_autoscale.py b/src/api-service/tests/test_autoscale.py index b7d4745a1f..25e80ce23a 100644 --- a/src/api-service/tests/test_autoscale.py +++ b/src/api-service/tests/test_autoscale.py @@ -177,78 +177,30 @@ def test_calculate_change(self) -> None: ) # no scalesets, but need work - change = calculate_change( - pool, scalesets=[], scheduled_worksets=0, in_use_nodes=0 - ) + change = calculate_change(pool, [], 0) self.assertEqual(change.change_size, 0) - change = calculate_change( - pool, scalesets=[], scheduled_worksets=10, 
diff --git a/src/api-service/tests/test_autoscale.py b/src/api-service/tests/test_autoscale.py
index b7d4745a1f..25e80ce23a 100644
--- a/src/api-service/tests/test_autoscale.py
+++ b/src/api-service/tests/test_autoscale.py
@@ -177,78 +177,30 @@ def test_calculate_change(self) -> None:
         )
 
         # no scalesets, but need work
-        change = calculate_change(
-            pool, scalesets=[], scheduled_worksets=0, in_use_nodes=0
-        )
+        change = calculate_change(pool, [], 0)
         self.assertEqual(change.change_size, 0)
 
-        change = calculate_change(
-            pool, scalesets=[], scheduled_worksets=10, in_use_nodes=0
-        )
+        change = calculate_change(pool, [], 10)
         self.assertEqual(change.change_size, 10)
 
-        # 3 unused node, 0 used nodes, 10 upcoming jobs
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=10,
-            in_use_nodes=0,
-        )
+        # start with 3, end with 10
+        change = calculate_change(pool, [scaleset_1, scaleset_2], 10)
         self.assertEqual(change.change_size, 7)
 
-        # 2 unused node, 1 used nodes, 10 upcoming jobs
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=10,
-            in_use_nodes=1,
-        )
-        self.assertEqual(change.change_size, 8)
-
-        # 2 unused node, 1 used nodes, 0 upcoming jobs
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=0,
-            in_use_nodes=1,
-        )
+        # start with 3, end with 1
+        change = calculate_change(pool, [scaleset_1, scaleset_2], 1)
         self.assertEqual(change.change_size, -2)
 
-        # 2 unused node, 1 used nodes, 0 upcoming jobs
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=0,
-            in_use_nodes=1,
-        )
-        self.assertEqual(change.change_size, -2)
-
-        # now set the minimum size
+        # verify min_size
         pool.autoscale.min_size = 5
 
-        # min size needs 3 more nodes
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=0,
-            in_use_nodes=0,
-        )
+        change = calculate_change(pool, [scaleset_1, scaleset_2], 0)
         self.assertEqual(change.change_size, 2)
 
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=10,
-            in_use_nodes=0,
-        )
+        change = calculate_change(pool, [scaleset_1, scaleset_2], 10)
         self.assertEqual(change.change_size, 7)
 
-        # now set the minimum size
+        # verify max_size
         pool.autoscale.max_size = 6
-        change = calculate_change(
-            pool,
-            scalesets=[scaleset_1, scaleset_2],
-            scheduled_worksets=100,
-            in_use_nodes=0,
-        )
+        change = calculate_change(pool, [scaleset_1, scaleset_2], 100)
         self.assertEqual(change.change_size, 3)
diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py
index d26a1daf0b..98df1363ae 100644
--- a/src/pytypes/onefuzztypes/models.py
+++ b/src/pytypes/onefuzztypes/models.py
@@ -595,8 +595,11 @@ class NodeTasks(BaseModel):
 
 class AutoScaleConfig(BaseModel):
     image: str
-    max_size: int = Field(default=1000, le=1000, ge=0)  # max size of pool
-    min_size: int = Field(default=0, le=1000, ge=0)  # min size of pool
+    max_size: int = Field(default=1000, ge=0, description="maximum size of the pool")
+    min_size: int = Field(default=0, ge=0, description="minimum size of the pool")
+    extra_available_size: Optional[int] = Field(
+        ge=0, description="number of unused but available nodes"
+    )
     region: Optional[Region]
     scaleset_size: int = Field(default=1, description="unused")
     spot_instances: bool = Field(default=False)
@@ -607,6 +610,7 @@ def check_data(cls, values: Any) -> Any:
         if values["min_size"] > values["max_size"]:
             raise ValueError("The pool min_size is greater than max_size")
+
         return values

From 9976de95f604c94e11ba32308049c5ba6795bf6f Mon Sep 17 00:00:00 2001
From: Brian Caswell
Date: Wed, 14 Jul 2021 15:07:48 -0400
Subject: [PATCH 66/66] use available size

---
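Note: this patch exposes the new knob end to end. A hedged usage sketch via the
SDK, based on the integration-test change below (the pool name and region are
placeholders; positional arguments follow create_autoscale's signature in
api.py):

from onefuzz.api import Onefuzz
from onefuzztypes.enums import OS
from onefuzztypes.primitives import PoolName, Region

of = Onefuzz()
pool = of.pools.create_autoscale(
    PoolName("example-pool"),
    OS.linux,
    region=Region("eastus"),
    extra_available_size=5,  # keep 5 idle-but-ready nodes on hand
)
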
 docs/webhook_events.md                    | 20 ++++++++++++++++----
 src/cli/onefuzz/api.py                    |  2 ++
 src/integration-tests/integration-test.py |  2 +-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/docs/webhook_events.md b/docs/webhook_events.md
index 36985a67a1..71863bc0a2 100644
--- a/docs/webhook_events.md
+++ b/docs/webhook_events.md
@@ -1225,20 +1225,26 @@ Each event will be submitted via HTTP POST to the user provided URL.
                     "title": "Ephemeral Os Disks",
                     "type": "boolean"
                 },
+                "extra_available_size": {
+                    "description": "number of unused but available nodes",
+                    "minimum": 0,
+                    "title": "Extra Available Size",
+                    "type": "integer"
+                },
                 "image": {
                     "title": "Image",
                     "type": "string"
                 },
                 "max_size": {
                     "default": 1000,
-                    "maximum": 1000,
+                    "description": "maximum size of the pool",
                     "minimum": 0,
                     "title": "Max Size",
                     "type": "integer"
                 },
                 "min_size": {
                     "default": 0,
-                    "maximum": 1000,
+                    "description": "minimum size of the pool",
                     "minimum": 0,
                     "title": "Min Size",
                     "type": "integer"
@@ -4772,20 +4778,26 @@ Each event will be submitted via HTTP POST to the user provided URL.
                     "title": "Ephemeral Os Disks",
                     "type": "boolean"
                 },
+                "extra_available_size": {
+                    "description": "number of unused but available nodes",
+                    "minimum": 0,
+                    "title": "Extra Available Size",
+                    "type": "integer"
+                },
                 "image": {
                     "title": "Image",
                     "type": "string"
                 },
                 "max_size": {
                     "default": 1000,
-                    "maximum": 1000,
+                    "description": "maximum size of the pool",
                     "minimum": 0,
                     "title": "Max Size",
                     "type": "integer"
                 },
                 "min_size": {
                     "default": 0,
-                    "maximum": 1000,
+                    "description": "minimum size of the pool",
                     "minimum": 0,
                     "title": "Min Size",
                     "type": "integer"
diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py
index 3df47f41ed..9e55042124 100644
--- a/src/cli/onefuzz/api.py
+++ b/src/cli/onefuzz/api.py
@@ -1186,6 +1186,7 @@ def create_autoscale(
         arch: enums.Architecture = enums.Architecture.x86_64,
         min_size: int = models.AutoScaleConfig.__fields__["min_size"].default,
         max_size: int = models.AutoScaleConfig.__fields__["max_size"].default,
+        extra_available_size: Optional[int] = None,
         image: Optional[str] = None,
         vm_sku: Optional[str] = "Standard_D2s_v3",
         region: Optional[primitives.Region] = None,
@@ -1204,6 +1205,7 @@ def create_autoscale(
             image=image,
             max_size=max_size,
             min_size=min_size,
+            extra_available_size=extra_available_size,
             region=region,
             spot_instances=spot_instances,
             vm_sku=vm_sku,
diff --git a/src/integration-tests/integration-test.py b/src/integration-tests/integration-test.py
index c45cfadfdb..0f5949b8a1 100755
--- a/src/integration-tests/integration-test.py
+++ b/src/integration-tests/integration-test.py
@@ -223,7 +223,7 @@ def setup(
             name = PoolName(f"testpool-{entry.name}-{self.test_id}")
             self.logger.info("creating pool: %s:%s", entry.name, name)
             self.pools[entry] = self.of.pools.create_autoscale(
-                name, entry, region=region
+                name, entry, region=region, extra_available_size=5
            )
 
     def launch(