-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
queue status: will show the current worker status #7903
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
import locale | ||
import logging | ||
import os | ||
import time | ||
from collections import defaultdict | ||
from typing import ( | ||
TYPE_CHECKING, | ||
|
@@ -124,22 +125,31 @@ def worker(self) -> "TemporaryWorker": | |
) | ||
|
||
def spawn_worker(self): | ||
from shortuuid import uuid | ||
|
||
from dvc_task.proc.process import ManagedProcess | ||
|
||
logger.debug("Spawning exp queue worker") | ||
wdir_hash = hashlib.sha256(self.wdir.encode("utf-8")).hexdigest()[:6] | ||
node_name = f"dvc-exp-{wdir_hash}-1@localhost" | ||
number = 1 | ||
node_name = f"dvc-exp-{wdir_hash}-{number}@localhost" | ||
worker_status = self.active_worker() | ||
while node_name in worker_status: | ||
number += 1 | ||
node_name = f"dvc-exp-{wdir_hash}-{number}@localhost" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The result There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is intentional. We want to make sure we are reusing node names (so for something like When the dvc-task |
||
|
||
cmd = ["exp", "queue-worker", node_name] | ||
name = "dvc-exp-worker" | ||
if logger.getEffectiveLevel() < logging.INFO: | ||
name = name + str(uuid()) | ||
name = f"dvc-exp-worker-{number}" | ||
logger.debug(f"start worker: {name}, node: {node_name}") | ||
if os.name == "nt": | ||
daemonize(cmd) | ||
else: | ||
ManagedProcess.spawn(["dvc"] + cmd, wdir=self.wdir, name=name) | ||
|
||
for _ in range(5): | ||
time.sleep(1) | ||
if node_name in self.active_worker(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The newly created node can only be detected some time later. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we need this check. We already expect that in some cases the new worker may not start at all (if the node name is already in use, it should exit immediately). |
||
return | ||
logger.debug(f"worker {name} node {node_name} didn't start in 5 sec") | ||
|
||
def put(self, *args, **kwargs) -> QueueEntry: | ||
"""Stash an experiment and add it to the queue.""" | ||
entry = self._stash_exp(*args, **kwargs) | ||
|
@@ -316,6 +326,12 @@ def logs( | |
) as fobj: | ||
ui.write(fobj.read()) | ||
|
||
def active_worker(self) -> Set: | ||
"""Return the current active celery worker""" | ||
status = self.celery.control.inspect().active() or {} | ||
logger.debug(f"Worker status: {status}") | ||
return {name for name in status if status[name]} | ||
|
||
|
||
class WorkspaceQueue(BaseStashQueue): | ||
def put(self, *args, **kwargs) -> QueueEntry: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't want to give each worker a new number. The `number` field should only be changing when we explicitly use `exp queue start --jobs <number>` (which is still disabled to always be 1).