Add back Worker.transition_fetch_missing #6112
```diff
@@ -3389,3 +3389,24 @@ async def test_tick_interval(c, s, a, b):
     while s.workers[a.address].metrics["event_loop_interval"] < 0.100:
         await asyncio.sleep(0.01)
     time.sleep(0.200)
+
+
+class BreakingWorker(Worker):
+    broke_once = False
+
+    def get_data(self, comm, **kwargs):
+        if not self.broke_once:
+            self.broke_once = True
+            raise OSError("fake error")
+        return super().get_data(comm, **kwargs)
+
+
+@pytest.mark.slow
+@gen_cluster(client=True, Worker=BreakingWorker)
+async def test_broken_comm(c, s, a, b):
+    df = dask.datasets.timeseries(
+        start="2000-01-01",
+        end="2000-01-10",
+    )
+    s = df.shuffle("id", shuffle="tasks")
+    await c.compute(s.size)
```
Kudos to @gjoseph92 and @nils-braun for the test.
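The test drives many inter-worker transfers through a task-based shuffle; each worker's first `get_data` call raises `OSError`, so at least one peer fetch fails and the requesting worker has to recover. A minimal sketch of reproducing the same failure injection outside the test suite, assuming `LocalCluster` accepts a `worker_class` argument (it does in distributed releases of this era):

```python
# Untested sketch: run the same failure injection on a local cluster.
import asyncio

import dask
from distributed import Client, LocalCluster, Worker


class BreakingWorker(Worker):
    # Same injection as the test above: fail the first get_data call,
    # then delegate to the normal implementation.
    broke_once = False

    def get_data(self, comm, **kwargs):
        if not self.broke_once:
            self.broke_once = True
            raise OSError("fake error")
        return super().get_data(comm, **kwargs)


async def main():
    async with LocalCluster(
        n_workers=2, worker_class=BreakingWorker, asynchronous=True
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            df = dask.datasets.timeseries(start="2000-01-01", end="2000-01-10")
            shuffled = df.shuffle("id", shuffle="tasks")
            # The first inter-worker fetch fails; the computation should
            # still complete once the worker recovers.
            print(await client.compute(shuffled.size))


asyncio.run(main())
```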
```diff
@@ -2671,6 +2671,9 @@ def ensure_communicating(self) -> None:
             if ts.state != "fetch":
                 continue
 
+            if self.validate:
+                assert ts.who_has
+
             workers = [w for w in ts.who_has if w not in self.in_flight_workers]
             if not workers:
                 assert ts.priority is not None
```
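For context on the assertion: a task in `fetch` state is expected to always have at least one known replica, and the `fetch -> missing` transition that this PR adds back (per its title) is what maintains that invariant. A rough sketch of what such a transition plausibly looks like; the attribute name `_missing_dep_flight` and the return convention are assumptions, not the PR's exact code:

```python
# Hypothetical sketch of the restored transition; actual names and the
# recommendations/messages return convention in distributed.worker.Worker
# may differ.
def transition_fetch_missing(self, ts, *, stimulus_id):
    # A fetch task whose who_has just became empty cannot be gathered;
    # park it until new replicas are discovered, preserving the invariant
    # that every task still in "fetch" has a non-empty who_has.
    ts.state = "missing"
    self._missing_dep_flight.add(ts)
    return {}, []
```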
```diff
@@ -2999,7 +3002,13 @@ async def gather_dep(
             for d in has_what:
                 ts = self.tasks[d]
                 ts.who_has.remove(worker)
+
+                if not ts.who_has:
+                    recommendations[ts] = "missing"
```
Logging here might be helpful for future debugging. Probably shouldn't call it …

It also might be worth a comment on why we don't send …

I've added logging. I haven't added the comment. I felt that I didn't understand the reasoning here sufficiently well to argue for one way or the other.

There are two things where this code path becomes relevant:

1. The remote worker actually died.
2. The connection failed intermittently but the remote worker is still alive.

The scheduler will detect the dead worker eventually and reschedule the task. By not sending the …

We currently cannot distinguish 1. and 2., so we need to find a middle ground. Purging data is safe because we can reacquire this information. Sending …

Now that the …

This explanation is basically what I was looking for in the comment.
```diff
+                    logger.info(
+                        "Lost worker connection to %s caused task %s to go missing",
+                        worker,
+                        ts.key,
+                    )
         except Exception as e:
             logger.exception(e)
             if self.batched_stream and LOG_PDB:
```

Ah, please not …
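A sketch of why purging `who_has` is the safe half of that middle ground: replica locations can always be re-acquired from the scheduler, so a wrongly purged entry only costs an extra round trip. This is a hypothetical helper, assuming the scheduler exposes a `who_has` RPC keyed by task keys (the client API suggests it does, but the exact worker-side call is an assumption):

```python
# Untested sketch: re-acquire replica locations for tasks that went
# "missing" after a comm failure. The RPC shape is an assumption.
async def refresh_who_has(worker, keys):
    who_has = await worker.scheduler.who_has(keys=list(keys))
    for key, addresses in who_has.items():
        ts = worker.tasks.get(key)
        if ts is not None and addresses:
            # New replicas found: the task can leave "missing" and be
            # fetched again on the next ensure_communicating pass.
            ts.who_has.update(addresses)
```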
I am somewhat ok with this test, since it does reliably trigger the behavior. But I think @fjetter was hoping to see a more minimized case.

I agree with that desire. I encourage folks to work on that. I think that this suffices.

This test does not reliably trigger the condition for me. I do hit it, but it is not deterministic.

I can increase the data volume and it will become more and more likely. I don't have a deterministic test. I think it would be good to have one. I think that this suffices, though.
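One hypothetical direction for a more deterministic reproduction: pin a single producer task to one worker and its consumer to the other, so exactly one inter-worker fetch happens and the single injected `OSError` is guaranteed to hit it. This is an untested sketch; `inc` is the trivial helper from `distributed.utils_test`, `BreakingWorker` is the class from the diff above, and whether this reliably drives the task through the `missing` path still depends on retry timing:

```python
from distributed.utils_test import gen_cluster, inc


# Hypothetical minimized test (untested): with only one replica of x,
# the failed fetch empties who_has on worker b, which is exactly the
# recommendations[ts] = "missing" path added in this PR.
@gen_cluster(client=True, Worker=BreakingWorker)
async def test_broken_comm_minimized(c, s, a, b):
    x = c.submit(inc, 1, workers=[a.address])  # data lives only on a
    y = c.submit(inc, x, workers=[b.address])  # b must fetch x from a
    assert await y == 3
```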