Skip to content
This repository has been archived by the owner on Nov 1, 2023. It is now read-only.

Commit

Permalink
remove nodes from db upon reimage (#1005)
Browse files Browse the repository at this point in the history
The flag `Node.reimage_queued` is intended to stop nodes from reimaging repeatedly.  

In #970, in order to work around Azure API failures, this flag was cycled if the node was already set to cleanup.  Unfortunately, reimaging can sometimes take a significant amount of time, causing this change to get nodes multiple times.

Instead of using `reimage_queued` as a flag, this PR deletes the node from the storage table upon reimage.  When the node registers OR the next time through `Scaleset.cleanup_nodes`, the Node will be recreated automatically, whichever comes first.
  • Loading branch information
bmc-msft authored Jun 23, 2021
1 parent 7c549e7 commit 5f8e423
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 20 deletions.
35 changes: 23 additions & 12 deletions src/api-service/__app__/onefuzzlib/workers/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@


class Node(BASE_NODE, ORMMixin):
# should only be set by Scaleset.reimage_nodes
# should only be unset during agent_registration POST
reimage_queued: bool = Field(default=False)

@classmethod
def create(
cls,
Expand Down Expand Up @@ -258,7 +254,8 @@ def can_process_new_work(self) -> bool:

if self.is_outdated():
logging.info(
"can_schedule agent and service versions differ, stopping node. "
"can_process_new_work agent and service versions differ, "
"stopping node. "
"machine_id:%s agent_version:%s service_version: %s",
self.machine_id,
self.version,
Expand All @@ -269,51 +266,65 @@ def can_process_new_work(self) -> bool:

if self.is_too_old():
logging.info(
"can_schedule node is too old. machine_id:%s", self.machine_id
"can_process_new_work node is too old. machine_id:%s", self.machine_id
)
self.stop(done=True)
return False

if self.state not in NodeState.can_process_new_work():
logging.info(
"can_process_new_work node not in appropriate state for new work"
"machine_id:%s state:%S",
self.machine_id,
self.state.name,
)
return False

if self.state in NodeState.ready_for_reset():
logging.info(
"can_schedule node is set for reset. machine_id:%s", self.machine_id
"can_process_new_work node is set for reset. machine_id:%s",
self.machine_id,
)
return False

if self.delete_requested:
logging.info(
"can_schedule is set to be deleted. machine_id:%s",
"can_process_new_work is set to be deleted. machine_id:%s",
self.machine_id,
)
self.stop(done=True)
return False

if self.reimage_requested:
logging.info(
"can_schedule is set to be reimaged. machine_id:%s",
"can_process_new_work is set to be reimaged. machine_id:%s",
self.machine_id,
)
self.stop(done=True)
return False

if self.could_shrink_scaleset():
logging.info("node scheduled to shrink. machine_id:%s", self.machine_id)
logging.info(
"can_process_new_work node scheduled to shrink. machine_id:%s",
self.machine_id,
)
self.set_halt()
return False

if self.scaleset_id:
scaleset = Scaleset.get_by_id(self.scaleset_id)
if isinstance(scaleset, Error):
logging.info(
"can_schedule - invalid scaleset. scaleset_id:%s machine_id:%s",
"can_process_new_work invalid scaleset. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
self.machine_id,
)
return False

if scaleset.state not in ScalesetState.available():
logging.info(
"can_schedule - scaleset not available for work. "
"can_process_new_work scaleset not available for work. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
self.machine_id,
Expand Down
14 changes: 6 additions & 8 deletions src/api-service/__app__/onefuzzlib/workers/scalesets.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,13 +372,7 @@ def cleanup_nodes(self) -> bool:
if ScalesetShrinkQueue(self.scaleset_id).should_shrink():
node.set_halt()
to_delete.append(node)
elif node.reimage_queued:
# reset the reimage_queued flag, in case it's not done
# reimaging during the next cleanup_nodes cycle
node.reimage_queued = False
node.save()
else:
# only add nodes that are not already set to reschedule
to_reimage.append(node)

dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
Expand Down Expand Up @@ -561,6 +555,9 @@ def delete_nodes(self, nodes: List[Node]) -> None:
machine_ids,
)
delete_vmss_nodes(self.scaleset_id, machine_ids)
for node in nodes:
if node.machine_id in machine_ids:
node.delete()

def reimage_nodes(self, nodes: List[Node]) -> None:
if not nodes:
Expand Down Expand Up @@ -612,9 +609,10 @@ def reimage_nodes(self, nodes: List[Node]) -> None:
"unable to reimage nodes: %s:%s - %s"
% (self.scaleset_id, machine_ids, result)
)

for node in nodes:
node.reimage_queued = True
node.save()
if node.machine_id in machine_ids:
node.delete()

def set_shutdown(self, now: bool) -> None:
if self.state in [ScalesetState.halt, ScalesetState.shutdown]:
Expand Down
4 changes: 4 additions & 0 deletions src/pytypes/onefuzztypes/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,10 @@ def ready_for_reset(cls) -> List["NodeState"]:
# from the agent.
return [cls.done, cls.shutdown, cls.halt]

@classmethod
def can_process_new_work(cls) -> List["NodeState"]:
return [cls.free]


class GithubIssueState(Enum):
open = "open"
Expand Down

0 comments on commit 5f8e423

Please sign in to comment.