set restrictions during barrier on scheduler

GitHub Actions / Unit Test Results failed Jun 25, 2024 in 0s

1 error, 158 failed, 97 skipped, 3 808 passed in 13h 3m 18s

    29 files      29 suites   13h 3m 18s ⏱️
 4 064 tests  3 808 ✅    97 💤   158 ❌  1 🔥
55 991 runs  51 717 ✅ 2 163 💤 2 101 ❌ 10 🔥

Results for commit 28804b7.

Annotations

All 14 runs failed: test_basic_merge[inner] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57160', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_basic_merge(c, s, a, b, how):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        joined = a.merge(b, left_on="y", right_on="y", how=how)
    
        if dd._dask_expr_enabled():
            # Ensure we're using a hash join
            from dask_expr._merge import HashJoinP2P
    
            assert any(
                isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
            )
    
        expected = pd.merge(A, B, how, "y")
>       await list_eq(joined, expected)

distributed\shuffle\tests\test_merge.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6

distributed\shuffle\_shuffle.py:109: RuntimeError
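
The failing frame shown above is the restriction-setting logic this change adds to the scheduler-side barrier: once the workers have checked in, the scheduler walks the barrier task's dependents and tries to recover each dependent's output-partition index from its run_spec, so that the unpack task can be restricted to the worker holding that partition's shards. The merge tests in this report appear to hit the fall-through branch of that parser. The remaining parametrizations below fail identically, so a minimal, self-contained sketch of the parsing step is given once here; fake_unpack and extract_part_id are illustrative stand-ins, not the actual distributed.shuffle internals.

    # Hypothetical, simplified sketch of the run_spec parsing attempted during
    # the barrier (stand-in names; not the real distributed.shuffle code).

    def fake_unpack(shuffle_id, output_partition, barrier_run_id):
        """Stand-in for a P2P unpack callable such as shuffle_unpack."""

    def extract_part_id(run_spec):
        """Return the output-partition index, or False if the spec is unrecognized."""
        if not isinstance(run_spec, tuple):
            return False
        if run_spec and run_spec[0] is fake_unpack:
            # Unfused task tuple: (callable, shuffle_id, output_partition, barrier_run_id)
            return run_spec[2]
        # Otherwise recurse into the arguments, e.g. fused task tuples that
        # wrap the unpack call somewhere inside.
        for arg in run_spec:
            if (ret := extract_part_id(arg)) is not False:
                return ret
        return False

    # Unfused unpack task: the partition index is recovered and the task can be
    # pinned to worker_for[2].
    assert extract_part_id((fake_unpack, "shuffle-id", 2, 7)) == 2

    # A dependent whose callable the parser does not recognize falls through;
    # this is the branch that raises "Could not parse barrier dependents" above.
    assert extract_part_id((sum, ([1, 2, 3],))) is False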

All 14 runs failed: test_basic_merge[left] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 0s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle b1b374543ccedf9726ce1c0fc6c357a0
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57184', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_basic_merge(c, s, a, b, how):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        joined = a.merge(b, left_on="y", right_on="y", how=how)
    
        if dd._dask_expr_enabled():
            # Ensure we're using a hash join
            from dask_expr._merge import HashJoinP2P
    
            assert any(
                isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
            )
    
        expected = pd.merge(A, B, how, "y")
>       await list_eq(joined, expected)

distributed\shuffle\tests\test_merge.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle b1b374543ccedf9726ce1c0fc6c357a0

distributed\shuffle\_shuffle.py:109: RuntimeError
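
Each of these reports contains two chained tracebacks: the scheduler-side RuntimeError ("Could not parse barrier dependents") travels back over the RPC, and the worker-side shuffle_barrier wrapper re-raises it with raise ... from e, which is what produces the "The above exception was the direct cause of the following exception" divider. A generic, self-contained illustration of that chaining pattern (stand-in functions, not distributed code):

    # Generic illustration of the exception chaining seen in these reports:
    # the original error becomes __cause__ of the wrapping RuntimeError.

    def scheduler_side_barrier():
        # Stand-in for the scheduler handler that failed to parse dependents.
        raise RuntimeError("Could not parse barrier dependents")

    def shuffle_barrier_wrapper(shuffle_id):
        # Mirrors the wrap-and-reraise pattern used by shuffle_barrier.
        try:
            return scheduler_side_barrier()
        except Exception as e:
            raise RuntimeError(
                f"shuffle_barrier failed during shuffle {shuffle_id}"
            ) from e

    try:
        shuffle_barrier_wrapper("deadbeef")
    except RuntimeError as err:
        # The scheduler-side error is preserved as the direct cause.
        assert "Could not parse barrier dependents" in str(err.__cause__)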

All 14 runs failed: test_basic_merge[right] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 0s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57211', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_basic_merge(c, s, a, b, how):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        joined = a.merge(b, left_on="y", right_on="y", how=how)
    
        if dd._dask_expr_enabled():
            # Ensure we're using a hash join
            from dask_expr._merge import HashJoinP2P
    
            assert any(
                isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
            )
    
        expected = pd.merge(A, B, how, "y")
>       await list_eq(joined, expected)

distributed\shuffle\tests\test_merge.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221

distributed\shuffle\_shuffle.py:109: RuntimeError

All 14 runs failed: test_basic_merge[outer] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependents are plain, unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Yield to the event loop periodically so the broadcast task
                # can make progress while we work through the dependents
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57235', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_basic_merge(c, s, a, b, how):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        joined = a.merge(b, left_on="y", right_on="y", how=how)
    
        if dd._dask_expr_enabled():
            # Ensure we're using a hash join
            from dask_expr._merge import HashJoinP2P
    
            assert any(
                isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
            )
    
        expected = pd.merge(A, B, how, "y")
>       await list_eq(joined, expected)

distributed\shuffle\tests\test_merge.py:91: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62

distributed\shuffle\_shuffle.py:109: RuntimeError
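
The scheduler-side frame above raises because _extract_part_id did not recognise the run_spec of one of the barrier's dependents. The parsing is purely structural: it requires a tuple, compares the first element against the unpack callables, and the caller treats any falsy return value as a parse failure. The sketch below mirrors that structure with a hypothetical stand-in for the unpack callable (none of the names are from distributed itself) to make the fall-through paths explicit; it illustrates the logic quoted above, not the actual implementation.

    # Self-contained sketch of the structural run_spec parsing shown above.
    # `unpack_stub` and the example task tuples are hypothetical.
    def unpack_stub(*args):
        """Stand-in for the shuffle/rechunk unpack callables."""


    def extract_part_id(run_spec):
        """Return the output-partition index of an unpack task tuple, or False."""
        if not isinstance(run_spec, tuple):
            return False
        if run_spec[0] is unpack_stub:
            # Unfused form: (unpack, shuffle_id, output_partition, barrier)
            if len(run_spec) == 4:
                return run_spec[2]
            return run_spec[1][1]
        # Fused forms: recurse into nested arguments looking for an unpack task.
        for arg in run_spec:
            if ret := extract_part_id(arg):
                return ret
        return False


    # A recognised, unfused unpack task parses cleanly ...
    assert extract_part_id((unpack_stub, "shuffle-id", 3, "barrier-key")) == 3
    # ... while an unrecognised spec, or a falsy partition index such as 0,
    # evaluates falsy -- the same condition that raises
    # "Could not parse barrier dependents" in the frame above.
    assert extract_part_id(("something-else", 1, 2)) is False
    assert not extract_part_id((unpack_stub, "shuffle-id", 0, "barrier-key"))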

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge_p2p_shuffle_reused_dataframe_with_different_parameters (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 5s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 4s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependents are plain, unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Yield to the event loop periodically so the broadcast task
                # can make progress while we work through the dependents
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57259', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57260', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57262', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>

    @gen_cluster(client=True)
    async def test_merge_p2p_shuffle_reused_dataframe_with_different_parameters(c, s, a, b):
        pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
        pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
        ddf1 = dd.from_pandas(pdf1, npartitions=5)
        ddf2 = dd.from_pandas(pdf2, npartitions=10)
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            out = (
                ddf1.merge(ddf2, left_on="a", right_on="x")
                # Vary the number of output partitions for the shuffles of ddf2
                .repartition(npartitions=20).merge(ddf2, left_on="b", right_on="x")
            )
        # Generate unique shuffle IDs if the input frame is the same but
        # parameters differ. Reusing shuffles in merges is dangerous because of the
        # required coordination and complexity introduced through dynamic clusters.
        assert sum(id_from_key(k) is not None for k in out.dask) == 4
>       result = await c.compute(out)

distributed\shuffle\tests\test_merge.py:126: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75

distributed\shuffle\_shuffle.py:109: RuntimeError
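
For reference, the merge pattern this test exercises can be reproduced outside the gen_cluster harness on an ordinary local cluster. The sketch below follows the test body quoted above; the two-worker Client and the final compute() call are assumptions for local experimentation, not part of the CI setup.

    # Hedged local-reproduction sketch: two P2P merges against the same input
    # frame with different output partitioning, as in the test above.
    import dask
    import dask.dataframe as dd
    import pandas as pd
    from distributed import Client

    if __name__ == "__main__":
        client = Client(n_workers=2)  # small local cluster (assumption)
        pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
        pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
        ddf1 = dd.from_pandas(pdf1, npartitions=5)
        ddf2 = dd.from_pandas(pdf2, npartitions=10)

        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            out = (
                ddf1.merge(ddf2, left_on="a", right_on="x")
                # Vary the number of output partitions for the second shuffle
                .repartition(npartitions=20)
                .merge(ddf2, left_on="b", right_on="x")
            )
            # On an affected build this fails with
            # "RuntimeError: shuffle_barrier failed during shuffle <id>".
            print(out.compute().shape)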

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge_p2p_shuffle_reused_dataframe_with_same_parameters (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 3s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 3s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 4s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 3s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependents are plain, unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Yield to the event loop periodically so the broadcast task
                # can make progress while we work through the dependents
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57292', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57293', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57295', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>

    @gen_cluster(client=True)
    async def test_merge_p2p_shuffle_reused_dataframe_with_same_parameters(c, s, a, b):
        pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
        pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
        ddf1 = dd.from_pandas(pdf1, npartitions=5)
        ddf2 = dd.from_pandas(pdf2, npartitions=10)
    
        # This performs two shuffles:
        #   * ddf1 is shuffled on `a`
        #   * ddf2 is shuffled on `x`
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            ddf3 = ddf1.merge(
                ddf2,
                left_on="a",
                right_on="x",
            )
    
        # This performs one shuffle:
        #   * ddf3 is shuffled on `b`
    # We can reuse the shuffle of ddf2 on `x` from the previous merge.
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            out = ddf2.merge(
                ddf3,
                left_on="x",
                right_on="b",
            )
        # Generate unique shuffle IDs if the input frame is the same and all its
        # parameters match. Reusing shuffles in merges is dangerous because of the
        # required coordination and complexity introduced through dynamic clusters.
        assert sum(id_from_key(k) is not None for k in out.dask) == 4
>       result = await c.compute(out)

distributed\shuffle\tests\test_merge.py:163: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 64b40cf6ff1c9589a1265228d57c0f75

distributed\shuffle\_shuffle.py:109: RuntimeError
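
The assertion in this test counts how many keys in the merged collection's graph belong to a P2P shuffle. A small helper along those lines is sketched below; it reuses distributed.shuffle._core.id_from_key, the same internal function the test imports, and assumes the collection exposes its graph via .dask as in the test body.

    # Sketch: count P2P shuffle keys in a collection's graph, mirroring the
    # `sum(id_from_key(k) is not None for k in out.dask)` assertion above.
    from distributed.shuffle._core import id_from_key


    def count_p2p_shuffle_keys(collection) -> int:
        """Number of graph keys for which id_from_key yields a shuffle ID."""
        return sum(id_from_key(key) is not None for key in collection.dask)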

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge[True-inner] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependents are plain, unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Yield to the event loop periodically so the broadcast task
                # can make progress while we work through the dependents
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57330', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner', disk = True

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6

distributed\shuffle\_shuffle.py:109: RuntimeError
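
For context, the "Could not parse barrier dependents" error above comes from the _extract_part_id helper shown in the scheduler-plugin frame: it recognises tuple task specs whose callable is shuffle_unpack or rechunk_unpack (recursing into nested tuples and SubgraphCallables); anything else yields False and triggers the RuntimeError. Below is a minimal standalone sketch of the tuple path only, using an illustrative `unpack` stand-in rather than the real distributed internals:

    def unpack(shuffle_id, output_partition, barrier_run_id):
        """Stand-in for distributed's shuffle_unpack / rechunk_unpack."""

    def extract_part_id(run_spec):
        # Mirrors the tuple-handling branch of _extract_part_id shown above.
        if not isinstance(run_spec, tuple):
            return False
        if run_spec[0] is unpack:
            # (unpack, id, output_partition, barrier_run_id) -> output partition
            return run_spec[2] if len(run_spec) == 4 else run_spec[1][1]
        # Recurse into nested task tuples.
        if any(ret := extract_part_id(arg) for arg in run_spec):
            return ret
        return False

    print(extract_part_id((unpack, "shuffle-id", 3, 42)))  # 3: partition index recovered
    print(extract_part_id({"fused": "non-tuple spec"}))    # False: would raise "Could not parse barrier dependents"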

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge[True-outer] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
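
When the parsing does succeed, the loop above only builds a mapping from each dependent task key to the single worker that holds its output partition. Roughly, with made-up addresses and keys purely for illustration:

    worker_for = {0: "tcp://10.0.0.1:1234", 1: "tcp://10.0.0.2:1234"}  # hypothetical run_spec.worker_for
    dependents = {"unpack-abc-0": 0, "unpack-abc-1": 1}                # dependent key -> output partition
    restrictions = {key: {worker_for[part]} for key, part in dependents.items()}
    # {'unpack-abc-0': {'tcp://10.0.0.1:1234'}, 'unpack-abc-1': {'tcp://10.0.0.2:1234'}}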

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57353', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer', disk = True

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62

distributed\shuffle\_shuffle.py:109: RuntimeError
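
The "The above exception was the direct cause of the following exception" sections in this report are ordinary Python exception chaining: shuffle_barrier re-raises whatever the barrier RPC produced with "from e". A self-contained illustration (the messages are placeholders, not the library's output):

    def failing_barrier():
        raise ValueError("Could not parse barrier dependents")  # stands in for the scheduler-side error

    try:
        try:
            failing_barrier()
        except Exception as e:
            raise RuntimeError("shuffle_barrier failed during shuffle <id>") from e
    except RuntimeError as err:
        # The original error is preserved as __cause__, which is what the
        # report renders as "the direct cause of the following exception".
        assert isinstance(err.__cause__, ValueError)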

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge[True-left] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError
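
The "await asyncio.sleep(0)" in the loop above is a cooperative yield: it hands control back to the event loop so that the broadcast task started just before the loop can make progress while the barrier dependents are being processed. A minimal standalone version of that pattern (all names below are illustrative):

    import asyncio

    async def broadcast():
        await asyncio.sleep(0.01)          # stands in for scheduler.broadcast(...)
        return "inputs done"

    async def barrier_loop():
        task = asyncio.create_task(broadcast())
        for _ in range(10_000):            # stands in for iterating barrier dependents
            if not task.done():
                await asyncio.sleep(0)     # yield so the broadcast task can run
        return await task

    print(asyncio.run(barrier_loop()))     # "inputs done"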

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57376', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left', disk = True

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3

distributed\shuffle\_shuffle.py:109: RuntimeError

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge[True-right] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57399', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = True

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221

distributed\shuffle\_shuffle.py:109: RuntimeError
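
To chase a single parametrization locally rather than through CI, one option (assuming a development checkout of distributed with pytest installed) is to run its node id directly, e.g.:

    import pytest

    # Re-run only the failing parametrization; -x stops at the first failure.
    pytest.main(["distributed/shuffle/tests/test_merge.py::test_merge[True-right]", "-x", "-v"])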

Check warning on line 0 in distributed.shuffle.tests.test_merge

All 14 runs failed: test_merge[False-inner] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependencies are unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57422', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner', disk = False

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 9af47a5ca91e6062414f19852f840ae6

distributed\shuffle\_shuffle.py:109: RuntimeError
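
Each of these failures bottoms out in the same scheduler-side frame: _extract_part_id in ShuffleSchedulerPlugin.barrier only recognises a run_spec that is a tuple headed by rechunk_unpack or shuffle_unpack (possibly nested inside a SubgraphCallable or the tuple's arguments) and returns False for anything else, which trips the "Could not parse barrier dependents" RuntimeError. Note also that the calling loop only accepts a truthy result, so an output partition index of 0 would be indistinguishable from a parse failure under that check. The following is a minimal, self-contained sketch of the dispatch, using a hypothetical shuffle_unpack placeholder rather than the real task function, to show which run_spec shapes parse and which fall through:

    # Hypothetical stand-in for the real unpack task; only its identity matters here.
    def shuffle_unpack(id, output_partition, barrier_run_id):
        ...

    def _extract_part_id(run_spec):
        # Only tuples headed by a known unpack callable are recognised.
        if not isinstance(run_spec, tuple):
            return False
        if run_spec[0] is shuffle_unpack:
            # Unfused shape: (shuffle_unpack, id, output_partition, barrier_key)
            if len(run_spec) == 4:
                return run_spec[2]
            return run_spec[1][1]
        # Otherwise recurse into the tuple's arguments.
        if any(ret := _extract_part_id(arg) for arg in run_spec):
            return ret
        return False

    assert _extract_part_id((shuffle_unpack, "id", 3, "barrier-key")) == 3
    # Anything that is not one of the recognised tuple shapes falls through,
    # which is exactly the branch that raises in the scheduler plugin:
    assert _extract_part_id({"fused": "expression graph"}) is False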

All 14 runs failed: test_merge[False-outer] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependencies are unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57446', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer', disk = False

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle cd8c05dd69e45b42dca287c4e5affc62

distributed\shuffle\_shuffle.py:109: RuntimeError
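
When every barrier dependent does parse, the handler builds a worker-restriction map from shuffle.run_spec.worker_for so that each output partition is unpacked on the worker that buffered its shards; P2P correctness depends on this pinning, since the received shards exist on exactly one worker. The toy sketch below, with made-up task keys and worker addresses, only illustrates the shape of that mapping; in these failing runs the map can never be completed because a dependent does not parse.

    # Toy data: hypothetical output-partition -> worker assignment and task keys.
    worker_for = {0: "tcp://10.0.0.1:1234", 1: "tcp://10.0.0.2:1234"}
    dependents = {
        "shuffle_p2p-abc-0": 0,  # task key -> output partition index
        "shuffle_p2p-abc-1": 1,
    }

    # Pin each unpack task to the worker holding its shards, mirroring the
    # restrictions dict assembled in ShuffleSchedulerPlugin.barrier above.
    restrictions = {key: {worker_for[part]} for key, part in dependents.items()}
    assert restrictions["shuffle_p2p-abc-1"] == {"tcp://10.0.0.2:1234"}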

All 14 runs failed: test_merge[False-left] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependencies are unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57479', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left', disk = False

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 69ac58588df4022b70f1c6b5c56323d3

distributed\shuffle\_shuffle.py:109: RuntimeError
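
The client-visible message "shuffle_barrier failed during shuffle <id>" comes from the wrapping in shuffle_barrier shown above: Reschedule is re-raised untouched so the task can be rescheduled, while any other exception is wrapped with "raise ... from e", which is why pytest reports the scheduler-side RuntimeError as the direct cause of the client-side one. A minimal, generic sketch of that chaining, using stand-in classes rather than the distributed implementations:

    class Reschedule(Exception):
        # Stand-in for distributed.exceptions.Reschedule (control flow, not an error).
        pass

    def barrier():
        # Stand-in for the worker plugin's scheduler round-trip.
        raise RuntimeError("Could not parse barrier dependents")

    def shuffle_barrier(id):
        try:
            return barrier()
        except Reschedule:
            raise  # propagate control-flow exceptions unchanged
        except Exception as e:
            raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e

    try:
        shuffle_barrier("example-shuffle-id")
    except RuntimeError as e:
        # The original scheduler-side error is preserved as the direct cause.
        assert isinstance(e.__cause__, RuntimeError)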

All 14 runs failed: test_merge[False-right] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path: the dependencies are unfused unpack tasks
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57502', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
                   x      y
npartitions=2              
0              int64  int64
4                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
                   y      z
npartitions=2              
0              int64  int64
2                ...    ...
5                ...    ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = False

    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize("disk", [True, False])
    @gen_cluster(client=True)
    async def test_merge(c, s, a, b, how, disk):
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        a = dd.repartition(A, [0, 4, 5])
    
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        b = dd.repartition(B, [0, 2, 5])
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            with dask.config.set({"distributed.p2p.disk": disk}):
                joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
            res = await c.compute(joined)
            assert_eq(
                res,
                pd.merge(A, B, left_index=True, right_index=True, how=how),
            )
            joined = dd.merge(a, b, on="y", how=how)
>           await list_eq(joined, pd.merge(A, B, on="y", how=how))

distributed\shuffle\tests\test_merge.py:189: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 4c2016adc19b634b7fa31c47847ec221

distributed\shuffle\_shuffle.py:109: RuntimeError
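
The failing frame above bottoms out in the scheduler-side helper _extract_part_id, which only recognizes run specs shaped like plain tuples headed by rechunk_unpack or shuffle_unpack (or a SubgraphCallable wrapping such a tuple). The following is a simplified, self-contained sketch of that extraction and of the fall-through that triggers "Could not parse barrier dependents"; the stand-in shuffle_unpack and the example specs are invented for illustration and are not the project's real objects.

    def shuffle_unpack(id, output_partition, barrier_run_id):
        """Stand-in for the real unpack callable; only its identity matters here."""

    def extract_part_id(run_spec):
        # Simplified copy of the logic quoted above (SubgraphCallable branch omitted).
        if not isinstance(run_spec, tuple):
            return False
        if run_spec and run_spec[0] is shuffle_unpack:
            # Unfused dependency: (shuffle_unpack, id, output_partition, barrier_run_id)
            return run_spec[2] if len(run_spec) == 4 else run_spec[1][1]
        for arg in run_spec:
            if ret := extract_part_id(arg):
                return ret
        return False

    # Happy path: an unfused unpack task carries its output partition directly.
    assert extract_part_id((shuffle_unpack, "shuffle-id", 3, 42)) == 3

    # Fall-through: a dependent whose run_spec is neither such a tuple nor a
    # tuple containing one yields False, and the barrier handler then raises
    # RuntimeError("Could not parse barrier dependents"), as seen above.
    assert extract_part_id({"not": "a tuple"}) is False
    assert extract_part_id((print, "unrelated", "task")) is False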


All 14 runs failed: test_merge_by_multiple_columns[inner] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle ed347283afc2063dd42c7fbce4b90ceb
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we just have the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57525', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57526', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57528', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'

    @pytest.mark.slow
    @gen_cluster(client=True, timeout=120)
    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    async def test_merge_by_multiple_columns(c, s, a, b, how):
        # warnings here from pandas
        pdf1l = pd.DataFrame(
            {
                "a": list("abcdefghij"),
                "b": list("abcdefghij"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf1r = pd.DataFrame(
            {
                "d": list("abcdefghij"),
                "e": list("abcdefghij"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("abcdefghij"),
        )
    
        pdf2l = pd.DataFrame(
            {
                "a": list("abcdeabcde"),
                "b": list("abcabcabca"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf2r = pd.DataFrame(
            {
                "d": list("edcbaedcba"),
                "e": list("aaabbbcccd"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("fghijklmno"),
        )
    
        pdf3l = pd.DataFrame(
            {
                "a": list("aaaaaaaaaa"),
                "b": list("aaaaaaaaaa"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf3r = pd.DataFrame(
            {
                "d": list("aaabbbccaa"),
                "e": list("abbbbbbbbb"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("ABCDEFGHIJ"),
        )
    
        for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
            for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
                ddl = dd.from_pandas(pdl, lpart)
                ddr = dd.from_pandas(pdr, rpart)
    
                with dask.config.set({"dataframe.shuffle.method": "p2p"}):
                    expected = pdl.join(pdr, how=how)
                    assert_eq(
                        await c.compute(ddl.join(ddr, how=how)),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pdr.join(pdl, how=how)
                    assert_eq(
                        await c.compute(ddr.join(ddl, how=how)),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdl, pdr, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddl,
                                ddr,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdr, pdl, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddr,
                                ddl,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    # hash join
>                   await list_eq(
                        dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
                        pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
                    )

distributed\shuffle\tests\test_merge.py:351: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle ed347283afc2063dd42c7fbce4b90ceb

distributed\shuffle\_shuffle.py:109: RuntimeError
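
The two RuntimeErrors in each of these reports are one failure seen from two sides: the scheduler-side barrier handler raises, and the worker-side shuffle_barrier wrapper re-raises anything other than Reschedule as a new RuntimeError chained with "from e", which is why the scheduler error is shown as the direct cause in the client traceback. Below is a minimal sketch of that wrapping; scheduler_barrier and the Reschedule class here are stand-ins, not the real distributed APIs.

    class Reschedule(Exception):
        """Stand-in for distributed.exceptions.Reschedule."""

    def scheduler_barrier(shuffle_id, run_ids):
        # Stand-in for the scheduler RPC; pretend restriction parsing failed.
        raise RuntimeError("Could not parse barrier dependents")

    def shuffle_barrier(shuffle_id, run_ids):
        # Mirrors the wrapping in the quoted shuffle_barrier: Reschedule passes
        # through untouched, everything else is chained into a RuntimeError.
        try:
            return scheduler_barrier(shuffle_id, run_ids)
        except Reschedule:
            raise
        except Exception as e:
            raise RuntimeError(
                f"shuffle_barrier failed during shuffle {shuffle_id}"
            ) from e

    try:
        shuffle_barrier("ed347283afc2063dd42c7fbce4b90ceb", [0, 1])
    except RuntimeError as err:
        assert "shuffle_barrier failed" in str(err)
        assert "Could not parse barrier dependents" in str(err.__cause__)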


All 14 runs failed: test_merge_by_multiple_columns[outer] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 42b1fb2af275859f3acb20b56d92aa7b
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we just have the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57551', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57552', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57554', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'

    @pytest.mark.slow
    @gen_cluster(client=True, timeout=120)
    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    async def test_merge_by_multiple_columns(c, s, a, b, how):
        # warnings here from pandas
        pdf1l = pd.DataFrame(
            {
                "a": list("abcdefghij"),
                "b": list("abcdefghij"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf1r = pd.DataFrame(
            {
                "d": list("abcdefghij"),
                "e": list("abcdefghij"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("abcdefghij"),
        )
    
        pdf2l = pd.DataFrame(
            {
                "a": list("abcdeabcde"),
                "b": list("abcabcabca"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf2r = pd.DataFrame(
            {
                "d": list("edcbaedcba"),
                "e": list("aaabbbcccd"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("fghijklmno"),
        )
    
        pdf3l = pd.DataFrame(
            {
                "a": list("aaaaaaaaaa"),
                "b": list("aaaaaaaaaa"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf3r = pd.DataFrame(
            {
                "d": list("aaabbbccaa"),
                "e": list("abbbbbbbbb"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("ABCDEFGHIJ"),
        )
    
        for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
            for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
                ddl = dd.from_pandas(pdl, lpart)
                ddr = dd.from_pandas(pdr, rpart)
    
                with dask.config.set({"dataframe.shuffle.method": "p2p"}):
                    expected = pdl.join(pdr, how=how)
                    assert_eq(
                        await c.compute(ddl.join(ddr, how=how)),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pdr.join(pdl, how=how)
                    assert_eq(
                        await c.compute(ddr.join(ddl, how=how)),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdl, pdr, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddl,
                                ddr,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdr, pdl, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddr,
                                ddl,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's a discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    # hash join
>                   await list_eq(
                        dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
                        pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
                    )

distributed\shuffle\tests\test_merge.py:351: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 42b1fb2af275859f3acb20b56d92aa7b

distributed\shuffle\_shuffle.py:109: RuntimeError
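
When the extraction does succeed, the barrier handler maps each output partition to its assigned worker through the run spec's worker_for mapping and restricts the corresponding dependent task to that worker. The snippet below is an illustrative sketch of that bookkeeping only; the task keys, addresses, and worker_for contents are made up for the example.

    # Hypothetical partition -> worker assignment, as a run spec's worker_for
    # mapping would provide it.
    worker_for = {0: "tcp://127.0.0.1:57576", 1: "tcp://127.0.0.1:57578"}

    # Hypothetical barrier dependents: task key -> partition id extracted from
    # its run_spec by the helper shown in the tracebacks.
    extracted = {
        "('shuffle-unpack-abc', 0)": 0,
        "('shuffle-unpack-abc', 1)": 1,
    }

    # Restrict each dependent to the worker that holds its shuffled partition.
    restrictions = {key: {worker_for[part]} for key, part in extracted.items()}

    assert restrictions["('shuffle-unpack-abc', 1)"] == {"tcp://127.0.0.1:57578"}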


All 14 runs failed: test_merge_by_multiple_columns[left] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle fdf462962d855192df53f2e0a37998c8
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that performs this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we just have the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57575', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57576', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57578', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'

    @pytest.mark.slow
    @gen_cluster(client=True, timeout=120)
    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    async def test_merge_by_multiple_columns(c, s, a, b, how):
        # warnings here from pandas
        pdf1l = pd.DataFrame(
            {
                "a": list("abcdefghij"),
                "b": list("abcdefghij"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf1r = pd.DataFrame(
            {
                "d": list("abcdefghij"),
                "e": list("abcdefghij"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("abcdefghij"),
        )
    
        pdf2l = pd.DataFrame(
            {
                "a": list("abcdeabcde"),
                "b": list("abcabcabca"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf2r = pd.DataFrame(
            {
                "d": list("edcbaedcba"),
                "e": list("aaabbbcccd"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("fghijklmno"),
        )
    
        pdf3l = pd.DataFrame(
            {
                "a": list("aaaaaaaaaa"),
                "b": list("aaaaaaaaaa"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf3r = pd.DataFrame(
            {
                "d": list("aaabbbccaa"),
                "e": list("abbbbbbbbb"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("ABCDEFGHIJ"),
        )
    
        for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
            for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
                ddl = dd.from_pandas(pdl, lpart)
                ddr = dd.from_pandas(pdr, rpart)
    
                with dask.config.set({"dataframe.shuffle.method": "p2p"}):
                    expected = pdl.join(pdr, how=how)
                    assert_eq(
                        await c.compute(ddl.join(ddr, how=how)),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pdr.join(pdl, how=how)
                    assert_eq(
                        await c.compute(ddr.join(ddl, how=how)),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdl, pdr, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddl,
                                ddr,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdr, pdl, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddr,
                                ddl,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    # hash join
>                   await list_eq(
                        dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
                        pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
                    )

distributed\shuffle\tests\test_merge.py:351: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle fdf462962d855192df53f2e0a37998c8

distributed\shuffle\_shuffle.py:109: RuntimeError
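
The root cause in the traceback above is the scheduler-side helper _extract_part_id: it only recognizes run specs that are plain tuples containing rechunk_unpack or shuffle_unpack (plus an extra branch for SubgraphCallable), and anything else falls through to the "Could not parse barrier dependents" branch. Below is a minimal, self-contained sketch of that dispatch, using made-up stand-ins (fake_unpack and the example specs) rather than real dask task specs, to show which shapes succeed and which hit the error path; the SubgraphCallable and nested-layout branches are omitted here.

from typing import Any


def fake_unpack(shuffle_id: str, output_partition: int, barrier_run_id: int) -> int:
    # Stand-in for shuffle_unpack / rechunk_unpack in this illustration only.
    return output_partition


def extract_part_id(run_spec: Any) -> Any:
    # Mirrors the dispatch structure of the plugin helper in the traceback above.
    if not isinstance(run_spec, tuple):
        return False
    if run_spec and run_spec[0] is fake_unpack and len(run_spec) == 4:
        # Unfused unpack task: (callable, shuffle_id, output_partition, barrier_run_id)
        return run_spec[2]
    # Otherwise scan nested tuples for a recognizable sub-spec.
    if any(ret := extract_part_id(arg) for arg in run_spec):
        return ret
    return False


unfused = (fake_unpack, "shuffle-id", 3, 7)                 # recognized -> 3
fused = (lambda df: df, (fake_unpack, "shuffle-id", 3, 7))  # nested, still found -> 3
opaque = (lambda df: df, "some-other-argument")             # nothing recognizable

assert extract_part_id(unfused) == 3
assert extract_part_id(fused) == 3
assert extract_part_id(opaque) is False  # this shape is what raises in the plugin

Note the falsy-zero caveat the sketch shares with the helper above: an extracted output partition index of 0 would also be treated as "not found" and take the error branch.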

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_merge_by_multiple_columns[right] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 32ef853a75040171fead12eacb19799a
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57598', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57599', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57601', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'

    @pytest.mark.slow
    @gen_cluster(client=True, timeout=120)
    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    async def test_merge_by_multiple_columns(c, s, a, b, how):
        # warnings here from pandas
        pdf1l = pd.DataFrame(
            {
                "a": list("abcdefghij"),
                "b": list("abcdefghij"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf1r = pd.DataFrame(
            {
                "d": list("abcdefghij"),
                "e": list("abcdefghij"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("abcdefghij"),
        )
    
        pdf2l = pd.DataFrame(
            {
                "a": list("abcdeabcde"),
                "b": list("abcabcabca"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf2r = pd.DataFrame(
            {
                "d": list("edcbaedcba"),
                "e": list("aaabbbcccd"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("fghijklmno"),
        )
    
        pdf3l = pd.DataFrame(
            {
                "a": list("aaaaaaaaaa"),
                "b": list("aaaaaaaaaa"),
                "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            },
            index=list("abcdefghij"),
        )
        pdf3r = pd.DataFrame(
            {
                "d": list("aaabbbccaa"),
                "e": list("abbbbbbbbb"),
                "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
            },
            index=list("ABCDEFGHIJ"),
        )
    
        for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
            for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
                ddl = dd.from_pandas(pdl, lpart)
                ddr = dd.from_pandas(pdr, rpart)
    
                with dask.config.set({"dataframe.shuffle.method": "p2p"}):
                    expected = pdl.join(pdr, how=how)
                    assert_eq(
                        await c.compute(ddl.join(ddr, how=how)),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pdr.join(pdl, how=how)
                    assert_eq(
                        await c.compute(ddr.join(ddl, how=how)),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdl, pdr, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddl,
                                ddr,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    expected = pd.merge(
                        pdr, pdl, how=how, left_index=True, right_index=True
                    )
                    assert_eq(
                        await c.compute(
                            dd.merge(
                                ddr,
                                ddl,
                                how=how,
                                left_index=True,
                                right_index=True,
                            )
                        ),
                        expected,
                        # FIXME: There's an discrepancy with an empty index for
                        # pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
                        # Temporarily avoid index check until the discrepancy is fixed.
                        check_index=not (PANDAS_GE_200 and expected.index.empty),
                    )
    
                    # hash join
>                   await list_eq(
                        dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
                        pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
                    )

distributed\shuffle\tests\test_merge.py:351: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
    a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 32ef853a75040171fead12eacb19799a

distributed\shuffle\_shuffle.py:109: RuntimeError
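
Each of these reports shows two chained RuntimeErrors because shuffle_barrier re-raises with "from e": a Reschedule passes through untouched, while any other failure is wrapped with the shuffle id and keeps the scheduler-side error as its __cause__. A minimal sketch of that pattern, using stand-in names (_Reschedule, _barrier_rpc) rather than the real distributed APIs:

class _Reschedule(Exception):
    """Stand-in for distributed.exceptions.Reschedule."""


def _barrier_rpc(shuffle_id: str) -> int:
    # Stand-in for the worker-plugin/scheduler round trip; fails like in the log.
    raise RuntimeError("Could not parse barrier dependents")


def barrier(shuffle_id: str) -> int:
    try:
        return _barrier_rpc(shuffle_id)
    except _Reschedule:
        raise  # rescheduling must propagate untouched
    except Exception as e:
        # Chaining with "from e" is what produces the
        # "The above exception was the direct cause of the following exception"
        # layout seen in the tracebacks above.
        raise RuntimeError(f"shuffle_barrier failed during shuffle {shuffle_id}") from e


try:
    barrier("example-shuffle-id")
except RuntimeError as err:
    assert "example-shuffle-id" in str(err)
    assert isinstance(err.__cause__, RuntimeError)
    assert "Could not parse barrier dependents" in str(err.__cause__)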

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_index_merge_p2p[inner] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle da3a31b8f0bbd04f35a02f313d67213f
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57633', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57634', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57636', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_index_merge_p2p(c, s, a, b, how):
        pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
        pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
    
        left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
        right = dd.from_pandas(pdf_right, npartitions=6)
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            assert_eq(
>               await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
                pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
            )

distributed\shuffle\tests\test_merge.py:388: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle da3a31b8f0bbd04f35a02f313d67213f

distributed\shuffle\_shuffle.py:109: RuntimeError
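
For context on what the barrier handler is trying to build before it fails: once every barrier dependent's output-partition index has been extracted, the handler pins each unpack task to the worker that received that partition's data. A rough illustration with made-up task keys and worker addresses (in the real code the keys come from the scheduler's TaskState objects and the addresses from shuffle.run_spec.worker_for):

# Made-up example data for illustration only.
worker_for = {
    0: "tcp://10.0.0.1:40000",
    1: "tcp://10.0.0.2:40000",
    2: "tcp://10.0.0.1:40000",
}
dependents = [
    ("('shuffle_p2p-example', 0)", 0),  # (task key, extracted output partition)
    ("('shuffle_p2p-example', 1)", 1),
    ("('shuffle_p2p-example', 2)", 2),
]

# Pin each unpack task to the worker holding its output partition.
restrictions = {key: {worker_for[part]} for key, part in dependents}
assert restrictions["('shuffle_p2p-example', 1)"] == {"tcp://10.0.0.2:40000"}

When extraction fails for any dependent, no restriction can be derived and the handler raises instead, which is what surfaces as the chained shuffle_barrier error in these reports.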

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_index_merge_p2p[left] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle fb8f046915ba1d0549e4779b7771acb7
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57661', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57662', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57664', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_index_merge_p2p(c, s, a, b, how):
        pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
        pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
    
        left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
        right = dd.from_pandas(pdf_right, npartitions=6)
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            assert_eq(
>               await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
                pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
            )

distributed\shuffle\tests\test_merge.py:388: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle fb8f046915ba1d0549e4779b7771acb7

distributed\shuffle\_shuffle.py:109: RuntimeError
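
For anyone trying to reproduce this outside CI: a minimal local sketch, assuming a development install of dask/distributed at this commit plus pandas, that mirrors the test_index_merge_p2p body shown directly above (how="left"). The cluster size and the final print are illustrative choices, not part of the test itself.

    import dask
    import dask.dataframe as dd
    import pandas as pd
    from distributed import Client

    if __name__ == "__main__":
        # Two small workers, roughly matching the gen_cluster fixture used above.
        client = Client(n_workers=2, threads_per_worker=1)
        pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
        pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
        left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
        right = dd.from_pandas(pdf_right, npartitions=6)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            # On the branch under test this presumably reproduces
            # "RuntimeError: shuffle_barrier failed during shuffle <id>".
            print(left.merge(right, how="left", left_index=True, right_on="a").compute())
        client.close()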

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_index_merge_p2p[right] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle dec472840ea768be7034f50299547cd8
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57688', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57689', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57691', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_index_merge_p2p(c, s, a, b, how):
        pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
        pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
    
        left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
        right = dd.from_pandas(pdf_right, npartitions=6)
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            assert_eq(
>               await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
                pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
            )

distributed\shuffle\tests\test_merge.py:388: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle dec472840ea768be7034f50299547cd8

distributed\shuffle\_shuffle.py:109: RuntimeError
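
The scheduler-side RuntimeError above comes from _extract_part_id, which only recognizes a couple of task-spec layouts. The standalone illustration below shows those layouts; the concrete tuples are made-up stand-ins inferred from the len(run_spec) == 4 and run_spec[1][1] branches quoted in the traceback, not taken from the graph that actually failed.

    # Toy stand-in for rechunk_unpack / shuffle_unpack; only its position in the
    # tuple matters for this illustration.
    def fake_unpack(*args):
        return None

    # "Happy path": an unfused dependency laid out as
    # (callable, shuffle id, output partition, barrier run id).
    unfused_spec = (fake_unpack, "shuffle-id", 3, 42)
    assert len(unfused_spec) == 4 and unfused_spec[2] == 3

    # The other recognized layout keeps the partition at run_spec[1][1].
    nested_spec = (fake_unpack, ("shuffle-id", 3), 42)
    assert nested_spec[1][1] == 3

    # Anything else -- evidently including the specs these merge graphs produce --
    # falls through every branch, so barrier() raises
    # "Could not parse barrier dependents".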

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_index_merge_p2p[outer] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 1s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle a48b78acdede03b0632f1948d98eb5e2
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57715', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57716', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57718', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'

    @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
    @gen_cluster(client=True)
    async def test_index_merge_p2p(c, s, a, b, how):
        pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
        pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
    
        left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
        right = dd.from_pandas(pdf_right, npartitions=6)
    
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            assert_eq(
>               await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
                pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
            )

distributed\shuffle\tests\test_merge.py:388: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle a48b78acdede03b0632f1948d98eb5e2

distributed\shuffle\_shuffle.py:109: RuntimeError
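
For context on the restrictions dict being built when the error is raised: shuffle.run_spec.worker_for maps each output partition to the worker assigned to it, and the barrier handler pins the corresponding unpack task to that worker. A rough sketch of that mapping, assuming a simple round-robin assignment and hypothetical task keys; the real assignment policy lives in the shuffle spec and may differ.

    from itertools import cycle

    # Worker addresses taken from the cluster repr in the log above.
    workers = ["tcp://127.0.0.1:57716", "tcp://127.0.0.1:57718"]
    npartitions = 6

    # Hypothetical partition -> worker mapping (round-robin for illustration only).
    worker_for = dict(zip(range(npartitions), cycle(workers)))

    # The barrier handler then restricts each unpack task to a single worker.
    restrictions = {f"unpack-{i}": {worker_for[i]} for i in range(npartitions)}
    for key, allowed in sorted(restrictions.items()):
        print(key, allowed)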

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_merge_with_npartitions[4] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle b6997aff76c3e65a1bf458788738424b
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57743', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57744', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57746', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 4

    @pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
    @gen_cluster(client=True)
    async def test_merge_with_npartitions(c, s, a, b, npartitions):
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
    
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
    
        expected = pdf.merge(pdf)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
>           result = await c.compute(left.merge(right, npartitions=npartitions))

distributed\shuffle\tests\test_merge.py:408: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle b6997aff76c3e65a1bf458788738424b

distributed\shuffle\_shuffle.py:109: RuntimeError
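
The traceback chain above (worker plugin barrier -> sync -> shuffle_run.barrier -> scheduler.shuffle_barrier -> the plugin's barrier handler) is ordinary op-based handler dispatch. Below is a toy, dependency-free sketch of that registration and dispatch pattern; the classes and message shape are illustrative only, not the distributed comm layer.

    import asyncio

    class ToyScheduler:
        def __init__(self):
            self.handlers = {}

    class ToyShufflePlugin:
        def __init__(self, scheduler):
            # Mirrors scheduler.handlers.update({"shuffle_barrier": self.barrier, ...})
            scheduler.handlers["shuffle_barrier"] = self.barrier

        async def barrier(self, id, run_id, consistent):
            if not consistent:
                return {"op": "restart", "shuffle_id": id}
            return {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}

    async def main():
        scheduler = ToyScheduler()
        ToyShufflePlugin(scheduler)
        # A worker-side RPC arrives as op="shuffle_barrier" plus keyword arguments.
        reply = await scheduler.handlers["shuffle_barrier"](id="abc123", run_id=1, consistent=True)
        print(reply)

    asyncio.run(main())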

Check warning on line 0 in distributed.shuffle.tests.test_merge

@github-actions github-actions / Unit Test Results

All 14 runs failed: test_merge_with_npartitions[5] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 26d6535645006c46fb8bf8378d0abf12
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57773', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57774', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57776', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 5

    @pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
    @gen_cluster(client=True)
    async def test_merge_with_npartitions(c, s, a, b, npartitions):
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
    
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
    
        expected = pdf.merge(pdf)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
>           result = await c.compute(left.merge(right, npartitions=npartitions))

distributed\shuffle\tests\test_merge.py:408: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 26d6535645006c46fb8bf8378d0abf12

distributed\shuffle\_shuffle.py:109: RuntimeError
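
In both tracebacks above the root cause is the scheduler-side "Could not parse barrier dependents" error: ShuffleSchedulerPlugin.barrier walks every dependent of the barrier task and asks _extract_part_id for the output-partition index hidden in its run_spec, and a falsy result falls through to the RuntimeError. A simplified, self-contained sketch of that parsing (fake_unpack and extract_part_id are hypothetical stand-ins, the SubgraphCallable branch is omitted, and the tuple shapes follow the handling shown in the source above):

    def fake_unpack(shuffle_id, output_partition, barrier_run_id):
        # Stand-in for shuffle_unpack / rechunk_unpack
        return output_partition

    def extract_part_id(run_spec):
        if not isinstance(run_spec, tuple):
            return False
        if run_spec[0] is fake_unpack:
            # Unfused task tuple: (func, shuffle_id, output_partition, barrier_key)
            if len(run_spec) == 4:
                return run_spec[2]
            return run_spec[1][1]
        # Otherwise descend into the arguments looking for a nested unpack call
        if any(ret := extract_part_id(arg) for arg in run_spec):
            return ret
        return False

    unfused = (fake_unpack, "shuffle-id", 7, "barrier-key")
    assert extract_part_id(unfused) == 7            # happy path: index recovered

    fused = (len, (fake_unpack, "shuffle-id", 5, "barrier-key"))
    assert extract_part_id(fused) == 5              # recursion finds the nested call

    unrecognised = (sum, ("shuffle-id", 3))
    assert extract_part_id(unrecognised) is False   # no unpack call found -> barrier raises

    partition_zero = (fake_unpack, "shuffle-id", 0, "barrier-key")
    assert not extract_part_id(partition_zero)      # index 0 is falsy too

The last case is only an observation about the truthiness check shown in the source ("if partial_index := _extract_part_id(dep.run_spec)"): an output partition of 0 would also take the error branch, just like a run_spec that contains no recognisable unpack call.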

Check warning on line 0 in distributed.shuffle.tests.test_merge

github-actions / Unit Test Results

All 14 runs failed: test_merge_with_npartitions[10] (distributed.shuffle.tests.test_merge)

artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_expr-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-mindeps-default-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-numpy-ci1/pytest.xml [took 2s]
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 2s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 2s]
Raw output
RuntimeError: shuffle_barrier failed during shuffle 5b3303d1b3116eeb80672eb2d9273073
from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
>           return get_worker_plugin().barrier(id, run_ids)

distributed\shuffle\_shuffle.py:105: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\_worker_plugin.py:403: in barrier
    result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed\utils.py:435: in sync
    raise error
distributed\utils.py:409: in f
    result = yield future
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\tornado\gen.py:766: in run
    value = future.result()
distributed\shuffle\_worker_plugin.py:370: in _barrier
    return await shuffle_run.barrier(run_ids)
distributed\shuffle\_core.py:200: in barrier
    await self.scheduler.shuffle_barrier(
distributed\core.py:1397: in send_recv_from_rpc
    return await send_recv(comm=comm, op=key, **kwargs)
distributed\core.py:1181: in send_recv
    raise exc.with_traceback(tb)
distributed\core.py:972: in _handle_comm
    result = await result
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import asyncio
    import contextlib
    import itertools
    import logging
    from collections import defaultdict
    from typing import TYPE_CHECKING, Any
    
    from dask.typing import Key
    
    from distributed.core import ErrorMessage, OKMessage, error_message
    from distributed.diagnostics.plugin import SchedulerPlugin
    from distributed.metrics import time
    from distributed.protocol.pickle import dumps
    from distributed.protocol.serialize import ToPickle
    from distributed.shuffle._core import (
        RunSpecMessage,
        SchedulerShuffleState,
        ShuffleId,
        ShuffleRunSpec,
        ShuffleSpec,
        barrier_key,
        id_from_key,
    )
    from distributed.shuffle._exceptions import P2PConsistencyError, P2PIllegalStateError
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.utils import log_errors
    
    if TYPE_CHECKING:
        from distributed.scheduler import (
            Recs,
            Scheduler,
            TaskState,
            TaskStateState,
            WorkerState,
        )
    
    logger = logging.getLogger(__name__)
    
    
    class ShuffleSchedulerPlugin(SchedulerPlugin):
        """
        Shuffle plugin for the scheduler
        This coordinates the individual worker plugins to ensure correctness
        and collects heartbeat messages for the dashboard.
        See Also
        --------
        ShuffleWorkerPlugin
        """
    
        scheduler: Scheduler
        active_shuffles: dict[ShuffleId, SchedulerShuffleState]
        heartbeats: defaultdict[ShuffleId, dict]
        _shuffles: defaultdict[ShuffleId, set[SchedulerShuffleState]]
        _archived_by_stimulus: defaultdict[str, set[SchedulerShuffleState]]
        _shift_counter: itertools.count[int]
    
        def __init__(self, scheduler: Scheduler):
            self.scheduler = scheduler
            self.scheduler.handlers.update(
                {
                    "shuffle_barrier": self.barrier,
                    "shuffle_get": self.get,
                    "shuffle_get_or_create": self.get_or_create,
                    "shuffle_restrict_task": self.restrict_task,
                }
            )
            self.heartbeats = defaultdict(lambda: defaultdict(dict))
            self.active_shuffles = {}
            self.scheduler.add_plugin(self, name="shuffle")
            self._shuffles = defaultdict(set)
            self._archived_by_stimulus = defaultdict(set)
            self._shift_counter = itertools.count()
    
        async def start(self, scheduler: Scheduler) -> None:
            worker_plugin = ShuffleWorkerPlugin()
            await self.scheduler.register_worker_plugin(
                None, dumps(worker_plugin), name="shuffle", idempotent=False
            )
    
        def shuffle_ids(self) -> set[ShuffleId]:
            return set(self.active_shuffles)
    
        async def barrier(self, id: ShuffleId, run_id: int, consistent: bool) -> None:
            shuffle = self.active_shuffles[id]
            if shuffle.run_id != run_id:
                raise ValueError(f"{run_id=} does not match {shuffle}")
            if not consistent:
                logger.warning(
                    "Shuffle %s restarted due to data inconsistency during barrier",
                    shuffle.id,
                )
                return self._restart_shuffle(
                    shuffle.id,
                    self.scheduler,
                    stimulus_id=f"p2p-barrier-inconsistent-{time()}",
                )
    
            msg = {"op": "shuffle_inputs_done", "shuffle_id": id, "run_id": run_id}
            broadcast_task = asyncio.create_task(
                self.scheduler.broadcast(
                    msg=msg,
                    workers=list(shuffle.participating_workers),
                )
            )
            barrier_task = self.scheduler.tasks[barrier_key(id)]
            barrier_deps: set[TaskState] = barrier_task.dependents
            from dask.optimization import SubgraphCallable
    
            from distributed.shuffle._rechunk import rechunk_unpack
            from distributed.shuffle._shuffle import shuffle_unpack
    
            def _extract_part_id(run_spec: Any) -> Any:
                if not isinstance(run_spec, tuple):
                    return False
                # FIXME: This is extremely crude. The shuffle run / spec should
                # likely expose a method that is performing this check and returns
                # the ID if possible.
                if run_spec[0] is rechunk_unpack or run_spec[0] is shuffle_unpack:
                    # Happy path, we're just having the unfused dependencies
                    if len(run_spec) == 4:
                        return run_spec[2]
                    return run_spec[1][1]
                elif isinstance(run_spec, SubgraphCallable):
                    for tspec in run_spec.dsk.values():
                        if partial_index := _extract_part_id(tspec):
                            return partial_index
                    return False
                else:
                    if any(ret := _extract_part_id(arg) for arg in run_spec):
                        return ret
                    return False
    
            restrictions = {}
            for dep in barrier_deps:
                # Ensure the broadcast can proceed as needed without blocking
                # the event loop here
                if not broadcast_task.done():
                    await asyncio.sleep(0)
                if partial_index := _extract_part_id(dep.run_spec):
                    worker = shuffle.run_spec.worker_for[partial_index]
                    restrictions[dep.key] = {worker}
                else:
>                   raise RuntimeError("Could not parse barrier dependents")
E                   RuntimeError: Could not parse barrier dependents

distributed\shuffle\_scheduler_plugin.py:145: RuntimeError

The above exception was the direct cause of the following exception:

c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:57812', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:57813', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:57815', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 10

    @pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
    @gen_cluster(client=True)
    async def test_merge_with_npartitions(c, s, a, b, npartitions):
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
    
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
    
        expected = pdf.merge(pdf)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
>           result = await c.compute(left.merge(right, npartitions=npartitions))

distributed\shuffle\tests\test_merge.py:408: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\client.py:399: in _result
    raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    from __future__ import annotations
    
    import logging
    import os
    from collections import defaultdict
    from collections.abc import (
        Callable,
        Collection,
        Generator,
        Hashable,
        Iterable,
        Iterator,
        Sequence,
    )
    from concurrent.futures import ThreadPoolExecutor
    from dataclasses import dataclass
    from pathlib import Path
    from typing import TYPE_CHECKING, Any
    
    import toolz
    from tornado.ioloop import IOLoop
    
    import dask
    from dask.base import tokenize
    from dask.highlevelgraph import HighLevelGraph
    from dask.layers import Layer
    from dask.typing import Key
    
    from distributed.core import PooledRPCCall
    from distributed.exceptions import Reschedule
    from distributed.metrics import context_meter
    from distributed.shuffle._arrow import (
        buffers_to_table,
        check_dtype_support,
        check_minimal_arrow_version,
        convert_shards,
        deserialize_table,
        read_from_disk,
        serialize_table,
    )
    from distributed.shuffle._core import (
        NDIndex,
        ShuffleId,
        ShuffleRun,
        ShuffleSpec,
        barrier_key,
        get_worker_plugin,
        handle_transfer_errors,
        handle_unpack_errors,
    )
    from distributed.shuffle._exceptions import DataUnavailable
    from distributed.shuffle._limiter import ResourceLimiter
    from distributed.shuffle._worker_plugin import ShuffleWorkerPlugin
    from distributed.sizeof import sizeof
    
    logger = logging.getLogger("distributed.shuffle")
    if TYPE_CHECKING:
        import pandas as pd
        import pyarrow as pa
    
        # TODO import from typing (requires Python >=3.10)
        from typing_extensions import TypeAlias
    
        from dask.dataframe import DataFrame
    
    
    def shuffle_transfer(
        input: pd.DataFrame,
        id: ShuffleId,
        input_partition: int,
        npartitions: int,
        column: str,
        meta: pd.DataFrame,
        parts_out: set[int],
        disk: bool,
        drop_column: bool,
    ) -> int:
        with handle_transfer_errors(id):
            return get_worker_plugin().add_partition(
                input,
                input_partition,
                spec=DataFrameShuffleSpec(
                    id=id,
                    npartitions=npartitions,
                    column=column,
                    meta=meta,
                    parts_out=parts_out,
                    disk=disk,
                    drop_column=drop_column,
                ),
            )
    
    
    def shuffle_unpack(
        id: ShuffleId, output_partition: int, barrier_run_id: int
    ) -> pd.DataFrame:
        with handle_unpack_errors(id):
            return get_worker_plugin().get_output_partition(
                id, barrier_run_id, output_partition
            )
    
    
    def shuffle_barrier(id: ShuffleId, run_ids: list[int]) -> int:
        try:
            return get_worker_plugin().barrier(id, run_ids)
        except Reschedule as e:
            raise e
        except Exception as e:
>           raise RuntimeError(f"shuffle_barrier failed during shuffle {id}") from e
E           RuntimeError: shuffle_barrier failed during shuffle 5b3303d1b3116eeb80672eb2d9273073

distributed\shuffle\_shuffle.py:109: RuntimeError
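
For contrast with the failures above, when _extract_part_id does return an index, the barrier handler pins each dependent to the worker that holds that output partition (restrictions[dep.key] = {worker} in the plugin source shown above), which is the restriction-setting these tests exercise. A small sketch with hypothetical task keys and a hand-written worker_for mapping (the real mapping lives on shuffle.run_spec):

    # Hypothetical example data: each barrier dependent's key mapped to the output
    # partition it unpacks, plus a worker_for mapping like shuffle.run_spec.worker_for
    dependents = {
        ("unpack-abc", 0): 0,
        ("unpack-abc", 1): 1,
        ("unpack-abc", 2): 2,
    }
    worker_for = {
        0: "tcp://127.0.0.1:57774",
        1: "tcp://127.0.0.1:57776",
        2: "tcp://127.0.0.1:57774",
    }

    restrictions = {}
    for key, partial_index in dependents.items():
        # Pin each unpack task to the worker that received the shards
        # for its output partition
        restrictions[key] = {worker_for[partial_index]}

    assert restrictions[("unpack-abc", 1)] == {"tcp://127.0.0.1:57776"}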