1 error, 158 failed, 97 skipped, 3 808 passed in 13h 3m 18s
29 files, 29 suites, 13h 3m 18s ⏱️
4 064 tests: 3 808 ✅ passed, 97 💤 skipped, 158 ❌ failed, 1 🔥 errored
55 991 runs: 51 717 ✅ passed, 2 163 💤 skipped, 2 101 ❌ failed, 10 🔥 errored
Results for commit 28804b7.
Annotations
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_basic_merge[inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57160', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed\shuffle\tests\test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
distributed\shuffle\_shuffle.py:109: RuntimeError
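Note: the test body quoted in the traceback above can be exercised outside the CI harness. The following is a minimal sketch, assuming a development install of dask, dask-expr and distributed matching commit 28804b7; the LocalCluster/Client setup and the explicit "dataframe.shuffle.method": "p2p" config are stand-ins for the gen_cluster fixture and the P2P hash-join path asserted by the test, not part of the original test.
# Minimal local sketch of the failing test_basic_merge scenario (assumption:
# dask/dask-expr/distributed dev install at commit 28804b7). The cluster setup
# below replaces the gen_cluster fixture used by the real test.
import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster

if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        a = dd.repartition(A, [0, 4, 5])
        b = dd.repartition(B, [0, 2, 5])
        # Force the P2P shuffle backend that HashJoinP2P relies on.
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            joined = a.merge(b, left_on="y", right_on="y", how="inner")
            # On the failing CI runs this step raised:
            # RuntimeError: shuffle_barrier failed during shuffle <id>
            print(joined.compute())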
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_basic_merge[left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 0s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle b1b374543ccedf9726ce1c0fc6c357a0
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57184', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed\shuffle\tests\test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle b1b374543ccedf9726ce1c0fc6c357a0
distributed\shuffle\_shuffle.py:109: RuntimeError
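Note: in every parametrization the root error is raised by the scheduler plugin's barrier handler shown above: _extract_part_id only recognizes tuple-based run specs (optionally wrapped in a SubgraphCallable) and returns False otherwise, so the loop over the barrier task's dependents raises "Could not parse barrier dependents". The self-contained sketch below mirrors that control flow to make the failure mode concrete; FakeDependent, the string-based stand-in for the shuffle_unpack identity check, and the example specs are hypothetical illustrations, not distributed internals.
# Simplified, standalone mirror of the barrier-dependents check quoted from
# distributed/shuffle/_scheduler_plugin.py in the traceback above.
from dataclasses import dataclass
from typing import Any


def _extract_part_id(run_spec: Any) -> Any:
    # The real helper compares run_spec[0] against the rechunk_unpack /
    # shuffle_unpack functions and also unwraps SubgraphCallable layers;
    # anything that is not a plain tuple immediately yields False.
    if not isinstance(run_spec, tuple):
        return False
    if run_spec[0] == "shuffle_unpack":  # stand-in for the identity check
        return run_spec[2] if len(run_spec) == 4 else run_spec[1][1]
    return False


@dataclass
class FakeDependent:
    key: str
    run_spec: Any


def assign_restrictions(barrier_deps, worker_for):
    # Mirrors the loop in ShuffleSchedulerPlugin.barrier: every dependent must
    # yield a partial index, otherwise the barrier fails.
    restrictions = {}
    for dep in barrier_deps:
        if partial_index := _extract_part_id(dep.run_spec):
            restrictions[dep.key] = {worker_for[partial_index]}
        else:
            raise RuntimeError("Could not parse barrier dependents")
    return restrictions


ok = FakeDependent("unpack-1", ("shuffle_unpack", "shuffle-id", 1, "barrier-key"))
bad = FakeDependent("unpack-2", object())  # an opaque, non-tuple run spec

print(assign_restrictions([ok], {1: "tcp://worker-a:1234"}))
try:
    assign_restrictions([bad], {1: "tcp://worker-a:1234"})
except RuntimeError as exc:
    print(exc)  # -> Could not parse barrier dependents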
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_basic_merge[right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 0s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57211', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed\shuffle\tests\test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
distributed\shuffle\_shuffle.py:109: RuntimeError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_basic_merge[outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57235', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed\shuffle\tests\test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
distributed\shuffle\_shuffle.py:109: RuntimeError
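Note: to iterate on these failures outside CI, the affected tests can be selected by name. A small sketch using pytest's Python API follows; it assumes the working directory is a distributed checkout at commit 28804b7 (the equivalent command line is pytest distributed/shuffle/tests/test_merge.py -k "..." -v).
# Re-run only the failing merge tests from a local checkout (assumption:
# current directory is the distributed repository root at commit 28804b7).
import pytest

pytest.main(
    [
        "distributed/shuffle/tests/test_merge.py",
        "-k",
        "test_basic_merge or test_merge_p2p_shuffle_reused_dataframe_with_different_parameters",
        "-v",
    ]
)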
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_p2p_shuffle_reused_dataframe_with_different_parameters (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 5s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 4s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57259', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57260', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57262', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_different_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = (
ddf1.merge(ddf2, left_on="a", right_on="x")
# Vary the number of output partitions for the shuffles of dd2
.repartition(npartitions=20).merge(ddf2, left_on="b", right_on="x")
)
# Generate unique shuffle IDs if the input frame is the same but
# parameters differ. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed\shuffle\tests\test_merge.py:126:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
distributed\shuffle\_shuffle.py:109: RuntimeError
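The failures below all share this signature: a merge computed with the P2P shuffle method reaches shuffle_barrier, the scheduler plugin cannot map the barrier dependents back to output partitions ("Could not parse barrier dependents"), and the barrier call re-raises as "shuffle_barrier failed during shuffle <id>". For reference, here is a minimal, standalone sketch of the merge pattern these tests exercise. It assumes a local in-process cluster and reuses the small frames from the test_merge source shown further down; it is an illustration of the code path, not code taken from the test suite.
import pandas as pd
import dask
import dask.dataframe as dd
from distributed import Client

if __name__ == "__main__":
    # In-process cluster purely for illustration; the CI tests use gen_cluster instead.
    client = Client(processes=False)
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    a = dd.from_pandas(A, npartitions=2)
    b = dd.from_pandas(B, npartitions=2)
    # Force the P2P shuffle path; this is the code path that runs shuffle_barrier
    # and, on these CI runs, fails inside the scheduler plugin.
    with dask.config.set({"dataframe.shuffle.method": "p2p"}):
        joined = dd.merge(a, b, on="y", how="inner")
        print(joined.compute())
    client.close()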
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_p2p_shuffle_reused_dataframe_with_same_parameters (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 3s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57292', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57293', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57295', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_same_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
# This performs two shuffles:
# * ddf1 is shuffled on `a`
# * ddf2 is shuffled on `x`
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
ddf3 = ddf1.merge(
ddf2,
left_on="a",
right_on="x",
)
# This performs one shuffle:
# * ddf3 is shuffled on `b`
# We can reuse the shuffle of dd2 on `x` from the previous merge.
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = ddf2.merge(
ddf3,
left_on="x",
right_on="b",
)
# Generate unique shuffle IDs if the input frame is the same and all its
# parameters match. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed\shuffle\tests\test_merge.py:163:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
distributed\shuffle\_shuffle.py:109: RuntimeError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[True-inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57330', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed\shuffle\tests\test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
distributed\shuffle\_shuffle.py:109: RuntimeError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[True-outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57353', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed\shuffle\tests\test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
distributed\shuffle\_shuffle.py:109: RuntimeError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[True-left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57376', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed\shuffle\tests\test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
distributed\shuffle\_shuffle.py:109: RuntimeError
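Note: in every parametrization the index-based merge completes and it is the follow-up column merge (dd.merge(a, b, on="y", how=...)) under the p2p shuffle method that fails at test_merge.py:189. A minimal standalone reproduction, distilled from the test body above (assumes a local two-worker cluster rather than the gen_cluster fixture):
import pandas as pd
import dask
import dask.dataframe as dd
from distributed import Client, LocalCluster
if __name__ == "__main__":
    # Small local cluster standing in for the test's gen_cluster fixture.
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    a = dd.repartition(A, [0, 4, 5])
    b = dd.repartition(B, [0, 2, 5])
    with dask.config.set({"dataframe.shuffle.method": "p2p"}):
        # The index merge completes; the on-column merge is the step that fails
        # in this report with "shuffle_barrier failed during shuffle <id>".
        dd.merge(a, b, left_index=True, right_index=True, how="left").compute()
        joined = dd.merge(a, b, on="y", how="left")
        joined.compute()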
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[True-right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
[... imports, shuffle_transfer and shuffle_unpack from distributed/shuffle/_shuffle.py elided; identical to the listing above ...]
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
[... imports and ShuffleSchedulerPlugin setup (handler registration, start, shuffle_ids) from distributed/shuffle/_scheduler_plugin.py elided; the barrier handler that raises follows ...]
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57399', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed\shuffle\tests\test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
distributed\shuffle\_shuffle.py:109: RuntimeError
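Note: the scheduler-side error comes from the _extract_part_id helper shown in the listing above. It only understands run specs that are plain task tuples (possibly nested, or wrapped in a SubgraphCallable) whose callable is rechunk_unpack or shuffle_unpack; anything else falls through to False and triggers the "Could not parse barrier dependents" branch. A self-contained sketch of that control flow (the SubgraphCallable branch is omitted, and a hypothetical stand-in is used for shuffle_unpack so it runs without a scheduler):
from typing import Any
def unpack_stub(*args):
    # Hypothetical stand-in for distributed.shuffle._shuffle.shuffle_unpack.
    ...
def extract_part_id(run_spec: Any) -> Any:
    # Mirrors the captured logic: non-tuple run specs are rejected outright.
    if not isinstance(run_spec, tuple):
        return False
    if run_spec and run_spec[0] is unpack_stub:
        # Unfused task: (shuffle_unpack, id, output_partition, barrier_run_id)
        return run_spec[2] if len(run_spec) == 4 else run_spec[1][1]
    # Otherwise search nested tuples (fused tasks) recursively.
    for arg in run_spec:
        if ret := extract_part_id(arg):
            return ret
    return False
# A classic task tuple yields the output-partition index ...
assert extract_part_id((unpack_stub, "shuffle-id", 3, 42)) == 3
# ... while any other run-spec representation yields False, which is the case
# that ends in RuntimeError("Could not parse barrier dependents") above.
assert extract_part_id({"not": "a tuple"}) is False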
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[False-inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
[... chained traceback identical to test_merge[True-right] above: shuffle_barrier on the worker calls ShuffleWorkerPlugin.barrier, which forwards to ShuffleSchedulerPlugin.barrier on the scheduler and fails with RuntimeError("Could not parse barrier dependents") at distributed\shuffle\_scheduler_plugin.py:145 ...]
The above exception was the direct cause of the following exception:
[... client, scheduler and two-partition DataFrame fixtures identical to the entries above ...]
how = 'inner', disk = False
[... test body, the failing await list_eq(...) at distributed\shuffle\tests\test_merge.py:189, and the client-side re-raise in shuffle_barrier elided; identical to test_merge[True-right] above ...]
E RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
distributed\shuffle\_shuffle.py:109: RuntimeError
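Note: each entry shows the same two-level chain: the scheduler raises RuntimeError("Could not parse barrier dependents"), the worker re-raises it as RuntimeError("shuffle_barrier failed during shuffle <id>") via "raise ... from e", and that outer error is what the client sees. When reproducing locally, the underlying cause can usually be inspected through the exception's __cause__ chain (a sketch, assuming joined from the reproduction above and that the chain survives the client-side re-raise):
try:
    joined.compute()
except RuntimeError as err:
    print("worker-side error:", err)
    cause = err.__cause__
    while cause is not None:
        # Walk down to the scheduler-side "Could not parse barrier dependents".
        print("caused by:", type(cause).__name__, cause)
        cause = cause.__cause__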
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[False-outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
[... chained traceback identical to test_merge[True-right] above: shuffle_barrier on the worker calls ShuffleWorkerPlugin.barrier, which forwards to ShuffleSchedulerPlugin.barrier on the scheduler and fails with RuntimeError("Could not parse barrier dependents") at distributed\shuffle\_scheduler_plugin.py:145 ...]
The above exception was the direct cause of the following exception:
[... client, scheduler and two-partition DataFrame fixtures identical to the entries above ...]
how = 'outer', disk = False
[... test body, the failing await list_eq(...) at distributed\shuffle\tests\test_merge.py:189, and the client-side re-raise in shuffle_barrier elided; identical to test_merge[True-right] above ...]
E RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
distributed\shuffle\_shuffle.py:109: RuntimeError
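Note: the parametrizations in this report fail identically, so neither the join type nor the distributed.p2p.disk setting is a factor; the failure is specific to the p2p path selected by dataframe.shuffle.method. Until the barrier parsing is fixed, an affected merge can be pinned to the task-based shuffle instead (a workaround sketch using a and b from the reproduction above, not a fix for the underlying issue):
import dask
# "tasks" selects the graph-based shuffle instead of p2p for this block only.
with dask.config.set({"dataframe.shuffle.method": "tasks"}):
    result = dd.merge(a, b, on="y", how="left").compute()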
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[False-left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
[... chained traceback identical to test_merge[True-right] above: shuffle_barrier on the worker calls ShuffleWorkerPlugin.barrier, which forwards to ShuffleSchedulerPlugin.barrier on the scheduler and fails with RuntimeError("Could not parse barrier dependents") at distributed\shuffle\_scheduler_plugin.py:145 ...]
The above exception was the direct cause of the following exception:
[... client, scheduler and two-partition DataFrame fixtures identical to the entries above ...]
how = 'left', disk = False
[... test body, the failing await list_eq(...) at distributed\shuffle\tests\test_merge.py:189, and the client-side re-raise in shuffle_barrier elided; identical to test_merge[True-right] above ...]
E RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
distributed\shuffle\_shuffle.py:109: RuntimeError
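Note: to iterate on this locally, the failing test can be run in isolation using the module path given in the tracebacks (a sketch; assumes a development checkout of distributed with its test dependencies installed):
import pytest
# Run only test_merge from the failing module; -x stops at the first failing
# parametrization, -rA prints a result summary for every selected test.
raise SystemExit(pytest.main([
    "distributed/shuffle/tests/test_merge.py::test_merge",
    "-x",
    "-rA",
]))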
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge[False-right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
[... imports, shuffle_transfer and shuffle_unpack from distributed/shuffle/_shuffle.py elided; identical to the listing above ...]
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are just the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57502', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed\shuffle\tests\test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
distributed\shuffle\_shuffle.py:109: RuntimeError
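Note on the failure chain above: the scheduler-side barrier handler raises "Could not parse barrier dependents" while pinning unpack tasks, and the worker-side shuffle_barrier wrapper re-raises it as the RuntimeError reported for the test. The following is a minimal, standalone sketch of that wrap-and-chain pattern (illustrative only, not distributed's actual code); it shows why pytest prints the scheduler error as the direct cause.
class Reschedule(Exception):
    """Stand-in for distributed.exceptions.Reschedule (name reused for illustration only)."""
def _barrier(shuffle_id: str) -> int:
    # Stand-in for the worker-plugin call; it fails the same way the
    # scheduler plugin does in the tracebacks above.
    raise RuntimeError("Could not parse barrier dependents")
def shuffle_barrier_sketch(shuffle_id: str) -> int:
    try:
        return _barrier(shuffle_id)
    except Reschedule:
        raise  # rescheduling is control flow, not an error
    except Exception as e:
        # "from e" stores the original error on __cause__, which is what
        # produces the "direct cause of the following exception" output above.
        raise RuntimeError(f"shuffle_barrier failed during shuffle {shuffle_id}") from e
try:
    shuffle_barrier_sketch("deadbeef")
except RuntimeError as err:
    print(err)            # shuffle_barrier failed during shuffle deadbeef
    print(err.__cause__)  # Could not parse barrier dependents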
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_by_multiple_columns[inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle ed347283afc2063dd42c7fbce4b90ceb
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are just the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57525', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57526', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57528', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed\shuffle\tests\test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle ed347283afc2063dd42c7fbce4b90ceb
distributed\shuffle\_shuffle.py:109: RuntimeError
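For readers unfamiliar with the scheduler-plugin code quoted above: _extract_part_id recursively walks a dependent task's run_spec tuple to find the output-partition index of an unpack task, and the barrier fails when no dependent can be parsed. Below is a simplified, self-contained sketch of that kind of traversal; the task-spec layout and names here are illustrative only and do not mirror dask's real graph format.
from typing import Any
def unpack(shuffle_id: str, part: int, barrier_run_id: int) -> str:
    return f"{shuffle_id}[{part}]"          # placeholder "unpack" callable
def extract_part_id(spec: Any) -> Any:
    if not isinstance(spec, tuple):
        return False
    if spec and spec[0] is unpack:          # unfused case: (unpack, id, part, run_id)
        return spec[2]
    for arg in spec:                        # fused case: search nested arguments
        if (found := extract_part_id(arg)) is not False:
            return found
    return False
unfused = (unpack, "abc123", 7, 0)
fused = ("getitem", (unpack, "abc123", 3, 0), 0)
assert extract_part_id(unfused) == 7
assert extract_part_id(fused) == 3
print("extracted partition indices:", extract_part_id(unfused), extract_part_id(fused))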
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_by_multiple_columns[outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 42b1fb2af275859f3acb20b56d92aa7b
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are just the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57551', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57552', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57554', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed\shuffle\tests\test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 42b1fb2af275859f3acb20b56d92aa7b
distributed\shuffle\_shuffle.py:109: RuntimeError
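The parametrized failures above all hit the hash-join path (left_on/right_on) under the p2p shuffle method. Below is a rough, self-contained way to run the same kind of join locally, assuming dask and distributed are installed; whether it reproduces the barrier error depends on the installed versions, so treat it as a sketch of the scenario rather than a guaranteed reproducer.
import pandas as pd
import dask
import dask.dataframe as dd
from distributed import Client
if __name__ == "__main__":
    client = Client(n_workers=2, threads_per_worker=1)    # small local cluster
    pdl = pd.DataFrame({"a": list("abcdeabcde"), "c": range(10)})
    pdr = pd.DataFrame({"d": list("edcbaedcba"), "f": range(10)})
    ddl = dd.from_pandas(pdl, npartitions=3)
    ddr = dd.from_pandas(pdr, npartitions=2)
    with dask.config.set({"dataframe.shuffle.method": "p2p"}):
        joined = dd.merge(ddl, ddr, how="outer", left_on="a", right_on="d")
        result = joined.compute()                          # shuffle_barrier runs here
    print(result.sort_values(["a", "d"]).head())
    client.close()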
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_by_multiple_columns[left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle fdf462962d855192df53f2e0a37998c8
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are just the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57575', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57576', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57578', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed\shuffle\tests\test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle fdf462962d855192df53f2e0a37998c8
distributed\shuffle\_shuffle.py:109: RuntimeError
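Context for the "Could not parse barrier dependents" error in these runs: when parsing succeeds, the barrier handler builds a restrictions mapping that pins every unpack task to the worker holding its shuffled partition (shuffle.run_spec.worker_for in the quoted code). The snippet below is an illustrative, standalone sketch of that mapping; the task keys and worker addresses are made up for the example.
unpack_keys = {                       # task key -> output partition index
    "shuffle-unpack-abc123-0": 0,
    "shuffle-unpack-abc123-1": 1,
    "shuffle-unpack-abc123-2": 2,
}
worker_for = {                        # partition index -> worker address
    0: "tcp://127.0.0.1:57526",
    1: "tcp://127.0.0.1:57528",
    2: "tcp://127.0.0.1:57526",
}
restrictions = {key: {worker_for[part]} for key, part in unpack_keys.items()}
for key, workers in sorted(restrictions.items()):
    print(key, "->", workers)         # each unpack task is restricted to one worker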
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_by_multiple_columns[right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 32ef853a75040171fead12eacb19799a
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57598', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57599', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57601', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's a discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not (PANDAS_GE_200 and expected.index.empty),
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed\shuffle\tests\test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 32ef853a75040171fead12eacb19799a
distributed\shuffle\_shuffle.py:109: RuntimeError
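The hash join is the first step in test_merge_by_multiple_columns to hit the failing shuffle_barrier. A minimal local sketch of that path, assuming a two-worker LocalCluster and arbitrary toy data (nothing here is taken from the CI run):

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client

if __name__ == "__main__":
    client = Client(n_workers=2, threads_per_worker=1)  # small local test cluster
    pdl = pd.DataFrame({"a": list("abcdeabcde"), "c": range(10)})
    pdr = pd.DataFrame({"d": list("edcbaedcba"), "f": range(10)})
    ddl = dd.from_pandas(pdl, npartitions=2)
    ddr = dd.from_pandas(pdr, npartitions=3)
    with dask.config.set({"dataframe.shuffle.method": "p2p"}):
        # P2P hash join on columns; this is the dd.merge(..., left_on="a",
        # right_on="d") call that reaches shuffle_barrier in the traceback.
        result = dd.merge(ddl, ddr, how="right", left_on="a", right_on="d").compute()
    print(result.head())
    client.close()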
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_index_merge_p2p[inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle da3a31b8f0bbd04f35a02f313d67213f
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57633', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57634', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57636', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed\shuffle\tests\test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle da3a31b8f0bbd04f35a02f313d67213f
distributed\shuffle\_shuffle.py:109: RuntimeError
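Both the test_merge and test_index_merge_p2p failures bottom out in _extract_part_id returning False for a barrier dependent's run_spec. A simplified, standalone restatement of that shape check; _unpack is a hypothetical stand-in and the SubgraphCallable branch is omitted, so this is a sketch, not the real scheduler code:

def _unpack(shuffle_id, output_partition, barrier_run_id):
    """Stand-in for rechunk_unpack / shuffle_unpack."""
    raise NotImplementedError

def extract_part_id(run_spec):
    # Only plain task tuples headed by the unpack callable, directly or
    # nested inside another task tuple, are recognized.
    if not isinstance(run_spec, tuple):
        return False
    if run_spec and run_spec[0] is _unpack:
        return run_spec[2]  # the output-partition index
    for arg in run_spec:
        ret = extract_part_id(arg)
        if ret is not False:
            return ret
    return False

assert extract_part_id((_unpack, "shuffle-id", 3, 0)) == 3    # unfused dependent
assert extract_part_id(("unrecognized-spec", 1, 2)) is False  # falls through

A False result for any dependent is what makes the scheduler raise "Could not parse barrier dependents" in the tracebacks above.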
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_index_merge_p2p[left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle fb8f046915ba1d0549e4779b7771acb7
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57661', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57662', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57664', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed\shuffle\tests\test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle fb8f046915ba1d0549e4779b7771acb7
distributed\shuffle\_shuffle.py:109: RuntimeError
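For context on what the barrier handler is building when it raises: each unpack task that depends on the barrier should be restricted to the worker that will hold its output partition, via shuffle.run_spec.worker_for. A plain-dict sketch of that mapping; the task keys and addresses below are made up for illustration:

# partition index -> worker address (illustrative values only)
worker_for = {0: "tcp://worker-a:40000", 1: "tcp://worker-b:40000"}

# barrier dependent task key -> partition index pulled from its run_spec
dep_to_partition = {
    ("hash-join-unpack", 0): 0,
    ("hash-join-unpack", 1): 1,
}

# Shape of the `restrictions` dict the scheduler plugin tries to build; the
# RuntimeError above means a partition index could not be extracted, so the
# mapping is never completed.
restrictions = {key: {worker_for[part]} for key, part in dep_to_partition.items()}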
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_index_merge_p2p[right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle dec472840ea768be7034f50299547cd8
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that is performing this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path, we're just having the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57688', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57689', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57691', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed\shuffle\tests\test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle dec472840ea768be7034f50299547cd8
distributed\shuffle\_shuffle.py:109: RuntimeError
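The test_index_merge_p2p parametrizations all fail on the same index-vs-column merge. A minimal sketch mirroring that test (the two-worker cluster and the "outer" choice are arbitrary):

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client

if __name__ == "__main__":
    client = Client(n_workers=2, threads_per_worker=1)
    pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
    pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
    left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
    right = dd.from_pandas(pdf_right, npartitions=6)
    with dask.config.set({"dataframe.shuffle.method": "p2p"}):
        # Index-on-column P2P merge, as in the failing test above.
        result = left.merge(right, how="outer", left_index=True, right_on="a").compute()
    client.close()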
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_index_merge_p2p[outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle a48b78acdede03b0632f1948d98eb5e2
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57715', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57716', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57718', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed\shuffle\tests\test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle a48b78acdede03b0632f1948d98eb5e2
distributed\shuffle\_shuffle.py:109: RuntimeError
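Note on the scheduler-side frames above: _extract_part_id walks a dependent's run_spec and returns the output-partition index when the spec is a recognized unpack task (an unfused, length-4 tuple headed by rechunk_unpack or shuffle_unpack, or a fused form where the index sits at run_spec[1][1]), recursing into SubgraphCallable bodies and nested tuples, and returns False otherwise. False is also what a legitimate partition index of 0 looks like under the plain truthiness tests used in the plugin, and any run_spec that is not a plain tuple falls through to False as well; either situation ends in the "Could not parse barrier dependents" branch, and the traceback alone does not say which one these merge tests hit. Below is a minimal, self-contained sketch of that dispatch; fake_unpack and the tuple shapes are illustrative assumptions, not the library's actual task serialization.

def _extract_part_id_sketch(run_spec, unpack_funcs):
    # Mirrors the dispatch shown in the traceback: return the output-partition
    # index for a recognized unpack task, otherwise False. Unlike the original,
    # it compares against False explicitly so a partition index of 0 survives.
    if not isinstance(run_spec, tuple):
        return False
    if run_spec and any(run_spec[0] is f for f in unpack_funcs):
        if len(run_spec) == 4:
            # Unfused form, e.g. (unpack_func, shuffle_id, output_partition, barrier)
            return run_spec[2]
        # Fused form: the partition index sits one level down
        return run_spec[1][1]
    for arg in run_spec:
        if (found := _extract_part_id_sketch(arg, unpack_funcs)) is not False:
            return found
    return False

def fake_unpack(*args):
    # Hypothetical stand-in for shuffle_unpack / rechunk_unpack
    return None

assert _extract_part_id_sketch((fake_unpack, "shuffle-id", 3, "barrier"), (fake_unpack,)) == 3
assert _extract_part_id_sketch(("unrelated-task", 0), (fake_unpack,)) is False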
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_with_npartitions[4] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle b6997aff76c3e65a1bf458788738424b
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57743', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57744', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57746', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 4
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed\shuffle\tests\test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle b6997aff76c3e65a1bf458788738424b
distributed\shuffle\_shuffle.py:109: RuntimeError
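For anyone trying to poke at this failure outside CI, the following is a rough, standalone sketch of the same scenario as test_merge_with_npartitions (a p2p-shuffled merge with an explicit npartitions), assuming a local environment with dask, distributed and pandas installed. The frame contents and partition counts are copied from the test above; the two-worker Client is only an approximation of the gen_cluster fixture used in CI.

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client

if __name__ == "__main__":
    # Roughly mirrors gen_cluster(client=True): a small local cluster with two workers.
    with Client(n_workers=2, threads_per_worker=1) as client:
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            # On the failing CI runs this compute() is what surfaces
            # "RuntimeError: shuffle_barrier failed during shuffle <id>".
            result = left.merge(right, npartitions=4).compute()
        print(result.head())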
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_with_npartitions[5] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 26d6535645006c46fb8bf8378d0abf12
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57773', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57774', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57776', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 5
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed\shuffle\tests\test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 26d6535645006c46fb8bf8378d0abf12
distributed\shuffle\_shuffle.py:109: RuntimeError
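The loop that actually raises in these entries (the one iterating over barrier_deps above) is only building a restriction map: each unpack task that depends on the barrier gets pinned to the worker that will hold its output partition, via shuffle.run_spec.worker_for. A toy, self-contained version of that bookkeeping is sketched below; the dict standing in for worker_for and the string task keys are invented for illustration.

# Stand-in for shuffle.run_spec.worker_for: output partition -> worker address.
worker_for = {0: "tcp://127.0.0.1:40001", 1: "tcp://127.0.0.1:40002"}

# (task key, extracted partition index); None models a dependent whose
# run_spec could not be parsed, i.e. the case these tests are dying on.
barrier_dependents = [("unpack-a", 0), ("unpack-b", 1)]

restrictions = {}
for key, part in barrier_dependents:
    if part is None:
        raise RuntimeError("Could not parse barrier dependents")
    # Pin the unpack task to the worker holding its shuffled partition.
    restrictions[key] = {worker_for[part]}

print(restrictions)
# {'unpack-a': {'tcp://127.0.0.1:40001'}, 'unpack-b': {'tcp://127.0.0.1:40002'}}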
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
All 14 runs failed: test_merge_with_npartitions[10] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 5b3303d1b3116eeb80672eb2d9273073
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
> return get_worker_plugin().barrier(id, run_ids)
distributed\shuffle\_shuffle.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
raise error
distributed\utils.py:409: in f
result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import asyncio
import contextlib
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any
from dask.typing import Key
from distributed.core import ErrorMessage, OKMessage, error_message
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.metrics import time
from distributed.protocol.pickle import dumps
from distributed.protocol.serialize import ToPickle
from distributed.shuffle._core import (
RunSpecMessage,
SchedulerShuffleState,
ShuffleId,
ShuffleRunSpec,
ShuffleSpec,
barrier_key,
id_from_key,
)
from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.utils import log_errors
if TYPE_CHECKING:
from distributed.scheduler import (
Recs,
Scheduler,
TaskState,
TaskStateState,
WorkerState,
)
logger = logging.getLogger(__name__)
class ShuffleSchedulerPlugin(SchedulerPlugin):
"""
Shuffle plugin for the scheduler
This coordinates the individual worker plugins to ensure correctness
and collects heartbeat messages for the dashboard.
See Also
--------
ShuffleWorkerPlugin
"""
scheduler: Scheduler
active_shuffles: dict[ShuffleId, SchedulerShuffleState]
heartbeats: defaultdict[ShuffleId, dict]
_shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
_archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
_shift_counter: itertools.count[int]
def __init__(self, scheduler: Scheduler):
self.scheduler = scheduler
self.scheduler.handlers.update(
{
"shuffle_barrier": self.barrier,
"shuffle_get": self.get,
"shuffle_get_or_create": self.get_or_create,
"shuffle_restrict_task": self.restrict_task,
}
)
self.heartbeats = defaultdict(lambda: defaultdict(dict))
self.active_shuffles = {}
self.scheduler.add_plugin(self, name="shuffle")
self._shuffles = defaultdict(set)
self._archived_by_stimulus = defaultdict(set)
self._shift_counter = itertools.count()
async def start(self, scheduler: Scheduler) -> None:
worker_plugin = ShuffleWorkerPlugin()
await self.scheduler.register_worker_plugin(
None, dumps(worker_plugin), name="shuffle", idempotent=False
)
def shuffle_ids(self) -> set[ShuffleId]:
return set(self.active_shuffles)
async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
shuffle = self.active_shuffles[id]
if shuffle.run_id != run_id:
raise ValueError(f"{run_id=} does not match {shuffle}")
if not consistent:
logger.warning(
"Shuffle %s restarted due to data inconsistency during barrier",
shuffle.id,
)
return self._restart_shuffle(
shuffle.id,
self.scheduler,
stimulus_id=f"p2p-barrier-inconsistent-{time()}",
)
msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
broadcast_task = asyncio.create_task(
self.scheduler.broadcast(
msg=msg,
workers=list(shuffle.participating_workers),
)
)
barrier_task = self.scheduler.tasks[barrier_key(id)]
barrier_deps: set[TaskState] = barrier_task.dependents
from dask.optimization import SubgraphCallable
from distributed.shuffle._rechunk import rechunk_unpack
from distributed.shuffle._shuffle import shuffle_unpack
def _extract_part_id(run_spec: Any) -> Any:
if not isinstance(run_spec, tuple):
return False
# FIXME: This is extremely crude. The shuffle run / spec should
# likely expose a method that performs this check and returns
# the ID if possible.
if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
# Happy path: these are the unfused dependencies
if len(run_spec) == 4:
return run_spec[2]
return run_spec[1][1]
elif isinstance(run_spec, SubgraphCallable):
for tspec in run_spec.dsk.values():
if partial_index := _extract_part_id(tspec):
return partial_index
return False
else:
if any(ret := _extract_part_id(arg) for arg in run_spec):
return ret
return False
restrictions = {}
for dep in barrier_deps:
# Ensure the broadcast can proceed as needed without blocking
# the event loop here
if not broadcast_task.done():
await asyncio.sleep(0)
if partial_index := _extract_part_id(dep.run_spec):
worker = shuffle.run_spec.worker_for[partial_index]
restrictions[dep.key] = {worker}
else:
> raise RuntimeError("Could not parse barrier dependents")
E RuntimeError: Could not parse barrier dependents
distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57812', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57813', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57815', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 10
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed\shuffle\tests\test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
from __future__ import annotations
import logging
import os
from collections import defaultdict
from collections.abc import (
Callable,
Collection,
Generator,
Hashable,
Iterable,
Iterator,
Sequence,
)
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
import toolz
from tornado.ioloop import IOLoop
import dask
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph
from dask.layers import Layer
from dask.typing import Key
from distributed.core import PooledRPCCall
from distributed.exceptions import Reschedule
from distributed.metrics import context_meter
from distributed.shuffle._arrow import (
buffers_to_table,
check_dtype_support,
check_minimal_arrow_version,
convert_shards,
deserialize_table,
read_from_disk,
serialize_table,
)
from distributed.shuffle._core import (
NDIndex,
ShuffleId,
ShuffleRun,
ShuffleSpec,
barrier_key,
get_worker_plugin,
handle_transfer_errors,
handle_unpack_errors,
)
from distributed.shuffle._exceptions import DataUnavailable
from distributed.shuffle._limiter import ResourceLimiter
from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
from distributed.sizeof import sizeof
logger = logging.getLogger("distributed.shuffle")
if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa
# TODO import from typing (requires Python >=3.10)
from typing_extensions import TypeAlias
from dask.dataframe import DataFrame
def shuffle_transfer(
input: pd.DataFrame,
id: ShuffleId,
input_partition: int,
npartitions: int,
column: str,
meta: pd.DataFrame,
parts_out: set[int],
disk: bool,
drop_column: bool,
) -> int:
with handle_transfer_errors(id):
return get_worker_plugin().add_partition(
input,
input_partition,
spec=DataFrameShuffleSpec(
id=id,
npartitions=npartitions,
column=column,
meta=meta,
parts_out=parts_out,
disk=disk,
drop_column=drop_column,
),
)
def shuffle_unpack(
id: ShuffleId, output_partition: int, barrier_run_id: int
) -> pd.DataFrame:
with handle_unpack_errors(id):
return get_worker_plugin().get_output_partition(
id, barrier_run_id, output_partition
)
def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
try:
return get_worker_plugin().barrier(id, run_ids)
except Reschedule as e:
raise e
except Exception as e:
> raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E RuntimeError: shuffle_barrier failed during shuffle 5b3303d1b3116eeb80672eb2d9273073
distributed\shuffle\_shuffle.py:109: RuntimeError
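Finally, the two-level error in every one of these entries (a scheduler-side "Could not parse barrier dependents" chained into a worker-side "shuffle_barrier failed during shuffle <id>") comes from the try/except in shuffle_barrier shown above: Reschedule passes through, anything else is wrapped in a RuntimeError with the original as its __cause__, which is why pytest prints "The above exception was the direct cause of the following exception". A tiny self-contained sketch of that chaining, with stand-ins for the Reschedule type and the scheduler call:

class Reschedule(Exception):
    # Stand-in for distributed.exceptions.Reschedule
    pass

def _scheduler_barrier(shuffle_id):
    # Stand-in for the scheduler-side handler that fails in these tests.
    raise RuntimeError("Could not parse barrier dependents")

def shuffle_barrier_sketch(shuffle_id, run_ids):
    try:
        return _scheduler_barrier(shuffle_id)
    except Reschedule:
        raise
    except Exception as e:
        raise RuntimeError(f"shuffle_barrier failed during shuffle {shuffle_id}") from e

try:
    shuffle_barrier_sketch("example-shuffle-id", run_ids=[1])
except RuntimeError as err:
    print(err)            # shuffle_barrier failed during shuffle example-shuffle-id
    print(err.__cause__)  # Could not parse barrier dependents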