-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enables running of multiple tasks in batches #1784
Changes from 1 commit
b31018c
d7c4c86
32fc576
9855228
096808e
a4421ea
81a639e
b68bd29
6382cc6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,6 +140,10 @@ def __init__(self, default=_no_value, is_global=False, significant=True, descrip | |
``positional=False`` for abstract base classes and similar cases. | ||
:param bool always_in_help: For the --help option in the command line | ||
parsing. Set true to always show in --help. | ||
:param function(iterable[A]) -> A batch_method: Method to combine an iterable of parsed | ||
parameter values into a single value. Used | ||
when receiving batched parameter lists from | ||
the scheduler. | ||
""" | ||
self._default = default | ||
self._batch_method = batch_method | ||
|
@@ -231,13 +235,13 @@ def parse_list(self, xs): | |
""" | ||
Parse a list of values from the scheduler. | ||
|
||
Only possible if this parameter has a batch method. This will combine the list into a single | ||
parameter value using batch method. | ||
Only possible if this is_batchable() is True. This will combine the list into a single | ||
parameter value using batch method. This should never need to be overridden. | ||
|
||
:param xs: list of values to parse and combine | ||
:return: the combined parsed values | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we also add in the docs that in most cases, this does not need to be overriden? (Or am I wrong about this?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right about that. Added to the docs. |
||
if self._batch_method is None: | ||
if not self.is_batchable(): | ||
raise NotImplementedError('No batch method found') | ||
elif not xs: | ||
raise ValueError('Empty parameter list passed to parse_list') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -232,6 +232,8 @@ def __init__(self, task_id, status, deps, resources=None, priority=0, family='', | |
def __repr__(self): | ||
return "Task(%r)" % vars(self) | ||
|
||
# TODO(2017-08-10) replace this function with direct calls to batchable | ||
# this only exists for backward compatibility | ||
def is_batchable(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The long term plan is just to replace this with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, added a TODO comment for a year from now since it'll probably be a while before this makes it to most end users. |
||
try: | ||
return self.batchable | ||
|
@@ -380,13 +382,11 @@ def get_active_tasks(self, status=None): | |
yield task | ||
|
||
def get_batch_running_tasks(self, batch_id): | ||
if batch_id is None: | ||
return [] | ||
else: | ||
return [ | ||
task for task in self.get_active_tasks(BATCH_RUNNING) | ||
if task.batch_id == batch_id | ||
] | ||
assert batch_id is not None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really like this style. :) |
||
return [ | ||
task for task in self.get_active_tasks(BATCH_RUNNING) | ||
if task.batch_id == batch_id | ||
] | ||
|
||
def get_running_tasks(self): | ||
return six.itervalues(self._status_tasks[RUNNING]) | ||
|
@@ -396,8 +396,6 @@ def get_pending_tasks(self): | |
for status in [PENDING, RUNNING]) | ||
|
||
def set_batcher(self, worker_id, family, batcher_args, max_batch_size): | ||
if max_batch_size is None: | ||
max_batch_size = float('inf') | ||
self._task_batchers.setdefault(worker_id, {}) | ||
self._task_batchers[worker_id][family] = (batcher_args, max_batch_size) | ||
|
||
|
@@ -645,7 +643,7 @@ def _update_priority(self, task, prio, worker): | |
self._update_priority(t, prio, worker) | ||
|
||
@rpc_method() | ||
def add_task_batcher(self, worker, task_family, batched_args, max_batch_size=None): | ||
def add_task_batcher(self, worker, task_family, batched_args, max_batch_size=float('inf')): | ||
self._state.set_batcher(worker, task_family, batched_args, max_batch_size) | ||
|
||
@rpc_method() | ||
|
@@ -694,8 +692,9 @@ def add_task(self, task_id=None, status=PENDING, runnable=True, | |
|
||
if tracking_url is not None or task.status != RUNNING: | ||
task.tracking_url = tracking_url | ||
for batch_task in self._state.get_batch_running_tasks(task.batch_id): | ||
batch_task.tracking_url = tracking_url | ||
if task.batch_id is not None: | ||
for batch_task in self._state.get_batch_running_tasks(task.batch_id): | ||
batch_task.tracking_url = tracking_url | ||
|
||
if batchable is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we avoid tribool logic here? Like now we have 3 possible values ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. batchable is False by default when a new task is created. It'll never be None in the internal representation in the scheduler. This just says not to override batchable when an add_task doesn't specify it. add_tasks for changing job statuses don't always contain every argument, so we don't want to change task.batchable if it isn't supplied. It's the same pattern we use for other parameters nearby in the same function. |
||
task.batchable = batchable | ||
|
@@ -705,8 +704,9 @@ def add_task(self, task_id=None, status=PENDING, runnable=True, | |
|
||
if expl is not None: | ||
task.expl = expl | ||
for batch_task in self._state.get_batch_running_tasks(task.batch_id): | ||
batch_task.expl = expl | ||
if task.batch_id is not None: | ||
for batch_task in self._state.get_batch_running_tasks(task.batch_id): | ||
batch_task.expl = expl | ||
|
||
if not (task.status in (RUNNING, BATCH_RUNNING) and status == PENDING) or new_deps: | ||
# don't allow re-scheduling of task while it is running, it must either fail or succeed first | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just created #1814 which is going to be able to help you.
As for referencing to a section in the documentation. See this example:
luigi/luigi/task.py
Line 591 in 298a0d1
.. _ConfigClasses:
)