Skip to content
This repository has been archived by the owner on Nov 1, 2023. It is now read-only.

remove nodes from db upon reimage #1005

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 23 additions & 12 deletions src/api-service/__app__/onefuzzlib/workers/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@


class Node(BASE_NODE, ORMMixin):
# should only be set by Scaleset.reimage_nodes
# should only be unset during agent_registration POST
reimage_queued: bool = Field(default=False)

@classmethod
def create(
cls,
Expand Down Expand Up @@ -258,7 +254,8 @@ def can_process_new_work(self) -> bool:

if self.is_outdated():
logging.info(
"can_schedule agent and service versions differ, stopping node. "
"can_process_new_work agent and service versions differ, "
"stopping node. "
"machine_id:%s agent_version:%s service_version: %s",
self.machine_id,
self.version,
Expand All @@ -269,51 +266,65 @@ def can_process_new_work(self) -> bool:

if self.is_too_old():
logging.info(
"can_schedule node is too old. machine_id:%s", self.machine_id
"can_process_new_work node is too old. machine_id:%s", self.machine_id
)
self.stop(done=True)
return False

if self.state not in NodeState.can_process_new_work():
logging.info(
"can_process_new_work node not in appropriate state for new work"
"machine_id:%s state:%S",
self.machine_id,
self.state.name,
)
return False

if self.state in NodeState.ready_for_reset():
logging.info(
"can_schedule node is set for reset. machine_id:%s", self.machine_id
"can_process_new_work node is set for reset. machine_id:%s",
self.machine_id,
)
return False

if self.delete_requested:
logging.info(
"can_schedule is set to be deleted. machine_id:%s",
"can_process_new_work is set to be deleted. machine_id:%s",
self.machine_id,
)
self.stop(done=True)
return False

if self.reimage_requested:
logging.info(
"can_schedule is set to be reimaged. machine_id:%s",
"can_process_new_work is set to be reimaged. machine_id:%s",
self.machine_id,
)
self.stop(done=True)
return False

if self.could_shrink_scaleset():
logging.info("node scheduled to shrink. machine_id:%s", self.machine_id)
logging.info(
"can_process_new_work node scheduled to shrink. machine_id:%s",
self.machine_id,
)
self.set_halt()
return False

if self.scaleset_id:
scaleset = Scaleset.get_by_id(self.scaleset_id)
if isinstance(scaleset, Error):
logging.info(
"can_schedule - invalid scaleset. scaleset_id:%s machine_id:%s",
"can_process_new_work invalid scaleset. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
self.machine_id,
)
return False

if scaleset.state not in ScalesetState.available():
logging.info(
"can_schedule - scaleset not available for work. "
"can_process_new_work scaleset not available for work. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
self.machine_id,
Expand Down
14 changes: 6 additions & 8 deletions src/api-service/__app__/onefuzzlib/workers/scalesets.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,13 +372,7 @@ def cleanup_nodes(self) -> bool:
if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
node.set_halt()
to_delete.append(node)
elif node.reimage_queued:
# reset the reimage_queued flag, in case it's not done
# reimaging during the next cleanup_nodes cycle
node.reimage_queued = False
node.save()
else:
# only add nodes that are not already set to reschedule
to_reimage.append(node)

dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
Expand Down Expand Up @@ -561,6 +555,9 @@ def delete_nodes(self, nodes: List[Node]) -> None:
machine_ids,
)
delete_vmss_nodes(self.scaleset_id, machine_ids)
ranweiler marked this conversation as resolved.
Show resolved Hide resolved
for node in nodes:
if node.machine_id in machine_ids:
node.delete()
ranweiler marked this conversation as resolved.
Show resolved Hide resolved

def reimage_nodes(self, nodes: List[Node]) -> None:
if not nodes:
Expand Down Expand Up @@ -612,9 +609,10 @@ def reimage_nodes(self, nodes: List[Node]) -> None:
"unable to reimage nodes: %s:%s - %s"
% (self.scaleset_id, machine_ids, result)
)

for node in nodes:
node.reimage_queued = True
node.save()
if node.machine_id in machine_ids:
node.delete()

def set_shutdown(self, now: bool) -> None:
if self.state in [ScalesetState.halt, ScalesetState.shutdown]:
Expand Down
4 changes: 4 additions & 0 deletions src/pytypes/onefuzztypes/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,10 @@ def ready_for_reset(cls) -> List["NodeState"]:
# from the agent.
return [cls.done, cls.shutdown, cls.halt]

@classmethod
def can_process_new_work(cls) -> List["NodeState"]:
return [cls.free]


class GithubIssueState(Enum):
open = "open"
Expand Down