diff --git a/src/api-service/__app__/onefuzzlib/pools.py b/src/api-service/__app__/onefuzzlib/pools.py
index 25ece1ca5d..b16a03e0a0 100644
--- a/src/api-service/__app__/onefuzzlib/pools.py
+++ b/src/api-service/__app__/onefuzzlib/pools.py
@@ -62,6 +62,8 @@
 from .extension import fuzz_extensions
 from .orm import MappingIntStrAny, ORMMixin, QueryFilter
 
+NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)
+
 # Future work:
 #
 # Enabling autoscaling for the scalesets based on the pool work queues.
@@ -278,6 +280,18 @@ def set_halt(self) -> None:
         self.set_shutdown()
         self.stop()
 
+    @classmethod
+    def get_dead_nodes(
+        cls, scaleset_id: UUID, expiration_period: datetime.timedelta
+    ) -> List["Node"]:
+        time_filter = "heartbeat lt datetime'%s'" % (
+            (datetime.datetime.utcnow() - expiration_period).isoformat()
+        )
+        return cls.search(
+            query={"scaleset_id": [scaleset_id]},
+            raw_unchecked_filter=time_filter,
+        )
+
 
 class NodeTasks(BASE_NODE_TASK, ORMMixin):
     @classmethod
@@ -743,6 +757,11 @@ def cleanup_nodes(self) -> bool:
                 # only add nodes that are not already set to reschedule
                 to_reimage.append(node)
 
+        dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
+        for node in dead_nodes:
+            node.set_halt()
+            to_reimage.append(node)
+
         # Perform operations until they fail due to scaleset getting locked
         try:
             if to_delete:
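
The new `Node.get_dead_nodes` query works by interpolating a UTC cutoff timestamp into a raw table-storage filter (`heartbeat lt datetime'...'`) and passing it through `raw_unchecked_filter`, so any node whose last heartbeat predates the cutoff is halted and queued for reimage. The sketch below is a minimal, standalone illustration of that cutoff logic; `heartbeat_filter` and `is_dead` are hypothetical helper names for this example only and are not part of the diff.

```python
import datetime

# Mirrors NODE_EXPIRATION_TIME from the diff: a node silent for an hour
# is treated as dead. (Illustrative copy, not the library constant.)
NODE_EXPIRATION_TIME = datetime.timedelta(hours=1)


def heartbeat_filter(expiration_period: datetime.timedelta) -> str:
    # Build the same style of raw filter string used by get_dead_nodes:
    # match rows whose heartbeat is older than (now - expiration_period).
    cutoff = datetime.datetime.utcnow() - expiration_period
    return "heartbeat lt datetime'%s'" % cutoff.isoformat()


def is_dead(
    last_heartbeat: datetime.datetime,
    expiration_period: datetime.timedelta = NODE_EXPIRATION_TIME,
) -> bool:
    # Local, in-process equivalent of the server-side filter,
    # handy for unit-testing the expiry window.
    return last_heartbeat < datetime.datetime.utcnow() - expiration_period


if __name__ == "__main__":
    print(heartbeat_filter(NODE_EXPIRATION_TIME))
    stale = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    fresh = datetime.datetime.utcnow() - datetime.timedelta(minutes=5)
    assert is_dead(stale)
    assert not is_dead(fresh)
```

Because the cutoff is baked into the filter string rather than compared client-side, the comparison happens in the table query itself; the diff uses `datetime.datetime.utcnow().isoformat()` so the filter value is an ISO-8601 UTC timestamp.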