[Data] ConcurrencyCapBackpressurePolicy - Only increase threshold (#58023)

srinathk10 · web-flow · commit 4ba962c7889d · 2025-10-22T19:27:45.000-07:00
> Thank you for contributing to Ray! 🚀 > Please review the [Ray Contribution Guide](https://docs.ray.io/en/master/ray-contribute/getting-involved.html) before opening a pull request. > ⚠️ Remove these instructions before submitting your PR. > 💡 Tip: Mark as draft if you want early feedback, or ready for review when it's complete. ## Description > Briefly describe what this PR accomplishes and why it's needed. ### [Data] ConcurrencyCapBackpressurePolicy - Only increase threshold When `_update_queue_threshold` adjusts the queue threshold used to cap concurrency based on the current queued bytes: - Only allow increasing the threshold or maintaining it. - Never decrease the threshold, because the steady state of queued bytes is not known. ## Related issues > Link related issues: "Fixes #1234", "Closes #1234", or "Related to #1234". ## Additional information > Optional: Add implementation details, API changes, usage examples, screenshots, etc. --------- Signed-off-by: Srinath Krishnamachari <srinath.krishnamachari@anyscale.com> Signed-off-by: Srinath Krishnamachari <68668616+srinathk10@users.noreply.github.com>
diff --git a/python/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py b/python/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py
@@ -208,8 +208,7 @@ def _update_queue_threshold(
         Motivation: Adaptive thresholds prevent both over-throttling (too aggressive) and
         under-throttling (too permissive). The logic balances responsiveness with stability:
         - Fast upward response to pressure spikes (immediate threshold increase)
-        - Gradual downward response to prevent oscillation (EWMA smoothing)
-        - Complete reset when idle (threshold = 0) to avoid stuck high thresholds
+        - Thresholds only increase, never decrease, since we don't know if low pressure is steady state
 
         Args:
             op: Operator whose threshold is being updated.
@@ -219,56 +218,23 @@ def _update_queue_threshold(
             The updated threshold in bytes.
 
         Examples:
-            # Example 1: First sample (bootstrap)
+            # Bootstrap: first sample sets threshold
             # Input: current_queue_size_bytes = 1000, level_prev = 0, dev_prev = 0
-            # EWMA: level = 1000, dev = 0 (first sample)
-            # Base: 1000 + 4*0 = 1000
-            # Threshold: max(1000, 1000) = 1000
-            # prev_threshold = 0, threshold = 1000
+            # EWMA: level = 1000, dev = 0
+            # Base: 1000 + 4*0 = 1000, Threshold: max(1000, 1000) = 1000
             # Result: 1000 (bootstrap)
 
-            # Example 2: Upward adjustment (immediate)
+            # Pressure increase: threshold updated immediately
             # Input: current_queue_size_bytes = 1500, level_prev = 1000, dev_prev = 100
-            # EWMA: level = 1000 + 0.2*(1500-1000) = 1100, dev = 100 + 0.2*(500-100) = 180
-            # Base: 1100 + 4*180 = 1820
-            # Threshold: max(1820, 1500) = 1820
-            # prev_threshold = 1000, threshold = 1820
-            # Result: 1820 (immediate upward)
+            # EWMA: level = 1100, dev = 180, Base: 1100 + 4*180 = 1820
+            # Threshold: max(1820, 1500) = 1820, prev_threshold = 1000
+            # Result: 1820 (threshold increased)
 
-            # Example 3: Downward adjustment (smoothed)
+            # Pressure decrease: threshold maintained (no decrease)
             # Input: current_queue_size_bytes = 100, level_prev = 200, dev_prev = 50
-            # EWMA: level = 200 + 0.2*(100-200) = 180, dev = 50 + 0.2*(100-50) = 60
-            # Base: 180 + 4*60 = 420
-            # Threshold: max(420, 100) = 420
-            # prev_threshold = 500, threshold = 420
-            # smoothed = asymmetric_ewma(500, 420) = 500 + 0.2*(420-500) = 484
-            # Result: 484 (gradual downward adjustment using asymmetric EWMA)
-
-            # Example 4: System becomes idle
-            # Input: current_queue_size_bytes = 0, level_prev = 200, dev_prev = 50
-            # EWMA: level = 200 + 0.2*(0-200) = 160, dev = 50 + 0.2*(200-50) = 80
-            # Base: 160 + 4*80 = 480
-            # Threshold: max(480, 0) = 480
-            # prev_threshold = 484, threshold = 480
-            # smoothed = asymmetric_ewma(484, 480) = 484 + 0.2*(480-484) = 483
-            # Result: 483 (gradual downward adjustment using asymmetric EWMA)
-
-            # Example 5: Continued idle (gradual decay)
-            # Input: current_queue_size_bytes = 0, level_prev = 160, dev_prev = 80
-            # EWMA: level = 160 + 0.2*(0-160) = 128, dev = 80 + 0.2*(160-80) = 96
-            # Base: 128 + 4*96 = 512
-            # Threshold: max(512, 0) = 512
-            # prev_threshold = 483, threshold = 512
-            # Result: 512 (gradual upward, EWMA still adjusting using asymmetric EWMA)
-
-            # Example 6: After many idle samples (threshold finally resets)
-            # Input: current_queue_size_bytes = 0, level_prev = 50, dev_prev = 10
-            # EWMA: level = 50 + 0.2*(0-50) = 40, dev = 10 + 0.2*(50-10) = 18
-            # Base: 40 + 4*18 = 112
-            # Threshold: max(112, 0) = 112
-            # prev_threshold = 200, threshold = 112
-            # smoothed = asymmetric_ewma(200, 112) = 200 + 0.2*(112-200) = 182
-            # Result: 182 (gradual downward adjustment using asymmetric EWMA)
+            # EWMA: level = 180, dev = 60, Base: 180 + 4*60 = 420
+            # Threshold: max(420, 100) = 420, prev_threshold = 500
+            # Result: 500 (threshold maintained, no decrease)
 
         """
         hist = self._queue_history[op]
@@ -303,26 +269,21 @@ def _update_queue_threshold(
         # Step 3: fast ramp-up
         threshold = max(1, int(max(base, q)))
 
-        # Step 4: cache & return with gentle downward response using EWMA_ALPHA
+        # Step 4: cache & return
         prev_threshold = self._queue_thresholds[op]
 
         # Bootstrap
         if prev_threshold == 0:
             self._queue_thresholds[op] = max(1, threshold)
             return self._queue_thresholds[op]
 
-        # Upward: apply immediately
-        if threshold >= prev_threshold:
+        # Only increase threshold when there's clear pressure
+        if threshold > prev_threshold:
             self._queue_thresholds[op] = max(1, threshold)
             return self._queue_thresholds[op]
 
-        # Downward: smooth using asymmetric EWMA
-        # Prevents oscillation by allowing gradual downward adjustments
-        # Uses same asymmetric behavior as EWMA: slow to adjust downward
-        # Example: prev_threshold=200, threshold=100 -> smoothed using asymmetric EWMA
-        smoothed = int(self._update_ewma_asymmetric(prev_threshold, threshold))
-        self._queue_thresholds[op] = max(1, smoothed)
-        return self._queue_thresholds[op]
+        # Keep existing threshold when pressure decreases
+        return prev_threshold
 
     def _effective_cap(self, op: "PhysicalOperator") -> int:
         """Compute a reduced concurrency cap via a tiny {-1,0,+1,+2} controller.
diff --git a/python/ray/data/tests/test_backpressure_policies.py b/python/ray/data/tests/test_backpressure_policies.py
@@ -229,46 +229,54 @@ def test_update_queue_threshold_asymmetric_ewma(self):
         self.assertGreater(policy._q_level_nbytes[mock_op], 150.0)
         self.assertLess(policy._q_level_nbytes[mock_op], 200.0)
 
-    def test_update_queue_threshold_downward_smoothing(self):
-        """Test threshold update with downward smoothing logic."""
+    def test_update_queue_threshold_no_decrease(self):
+        """Test that thresholds are never decreased, only maintained or increased."""
         mock_op = MagicMock()
         policy = ConcurrencyCapBackpressurePolicy(
             DataContext.get_current(),
             {mock_op: MagicMock()},
             MagicMock(),
         )
 
-        # Set up initial state with high threshold and very low level/dev to force downward adjustment
+        # Set up initial state with high threshold
         policy._queue_thresholds[mock_op] = 200
         policy._q_level_nbytes[mock_op] = 10.0  # Very low level
         policy._q_level_dev[mock_op] = 1.0  # Very low deviation
         policy._queue_history[mock_op] = deque([10, 11, 12, 13, 14, 15])
 
-        # Test downward adjustment (should be smoothed)
-        # threshold = max(10 + 4*1, 150) = 150, which is < 200, so should be smoothed
+        # Test that threshold is maintained when calculated threshold is lower
         threshold = policy._update_queue_threshold(mock_op, 150)
 
-        # Should be smoothed between 200 and 150
-        self.assertLess(threshold, 200)
-        self.assertGreaterEqual(threshold, 150)
+        # Should maintain the existing threshold (no decrease)
+        self.assertEqual(threshold, 200)
+        self.assertEqual(policy._queue_thresholds[mock_op], 200)
 
-        # Test that the method works correctly - just verify it doesn't crash
-        # and returns a reasonable threshold value
-        mock_op2 = MagicMock()
-        policy2 = ConcurrencyCapBackpressurePolicy(
+        # Test with even lower queue size
+        threshold_small = policy._update_queue_threshold(mock_op, 50)
+        self.assertEqual(threshold_small, 200)  # Still maintained
+        self.assertEqual(policy._queue_thresholds[mock_op], 200)
+
+    def test_update_queue_threshold_increase(self):
+        """Test that thresholds are increased when calculated threshold is higher."""
+        mock_op = MagicMock()
+        policy = ConcurrencyCapBackpressurePolicy(
             DataContext.get_current(),
-            {mock_op2: MagicMock()},
+            {mock_op: MagicMock()},
             MagicMock(),
         )
-        policy2._queue_thresholds[mock_op2] = 200
-        policy2._q_level_nbytes[mock_op2] = 10.0
-        policy2._q_level_dev[mock_op2] = 1.0
-        policy2._queue_history[mock_op2] = deque([10, 11, 12, 13, 14, 15])
 
-        threshold_small = policy2._update_queue_threshold(mock_op2, 50)
+        # Set up initial state with moderate threshold
+        policy._queue_thresholds[mock_op] = 100
+        policy._q_level_nbytes[mock_op] = 50.0
+        policy._q_level_dev[mock_op] = 20.0
+        policy._queue_history[mock_op] = deque([50, 60, 70, 80, 90, 100])
+
+        # Test that threshold is increased when calculated threshold is higher
+        threshold = policy._update_queue_threshold(mock_op, 200)
 
-        # Just verify it returns a reasonable threshold (at least as high as input)
-        self.assertGreaterEqual(threshold_small, 50)
+        # Should increase the threshold
+        self.assertGreaterEqual(threshold, 200)
+        self.assertGreaterEqual(policy._queue_thresholds[mock_op], 200)
 
     def test_effective_cap_calculation_with_trend(self):
         """Test effective cap calculation with different trend scenarios."""
@@ -474,7 +482,7 @@ def test_threshold_calculation_formula(self):
                 self.assertAlmostEqual(threshold, expected, places=5)
 
     def test_threshold_update_logic_comprehensive(self):
-        """Test comprehensive threshold update logic including bootstrap, upward, and downward cases."""
+        """Test comprehensive threshold update logic including bootstrap, upward, and no-decrease cases."""
         mock_op = MagicMock()
         policy = ConcurrencyCapBackpressurePolicy(
             DataContext.get_current(),
@@ -489,7 +497,7 @@ def test_threshold_update_logic_comprehensive(self):
         # Bootstrap: threshold = max(level + K_DEV * dev, q_now) = max(100 + 4*0, 100) = 100
         self.assertEqual(threshold1, 100)
 
-        # Test 2: Upward adjustment (threshold >= prev_threshold)
+        # Test 2: Upward adjustment (threshold > prev_threshold)
         policy._queue_thresholds[mock_op] = 100
         policy._q_level_nbytes[mock_op] = 50.0
         policy._q_level_dev[mock_op] = 10.0
@@ -499,17 +507,13 @@ def test_threshold_update_logic_comprehensive(self):
         # Just verify it's >= 200 (upward adjustment)
         self.assertGreaterEqual(threshold2, 200)
 
-        # Test 3: Downward adjustment (threshold < prev_threshold)
+        # Test 3: No decrease (threshold < prev_threshold, should maintain existing)
         policy._queue_thresholds[mock_op] = 200
-        policy._q_level_nbytes[
-            mock_op
-        ] = 10.0  # Very low level to force downward adjustment
+        policy._q_level_nbytes[mock_op] = 10.0  # Very low level
         policy._q_level_dev[mock_op] = 1.0  # Very low deviation
         policy._queue_history[mock_op] = deque([10, 11, 12, 13, 14, 15])
         threshold3 = policy._update_queue_threshold(mock_op, 150)
-        # threshold = max(10 + 4*1, 150) = 150, which is < 200, so should be smoothed
-        self.assertLess(threshold3, 200)
-        self.assertGreaterEqual(threshold3, 150)
+        self.assertEqual(threshold3, 200)
 
         # Test 4: Zero threshold case
         fresh_mock_op = MagicMock()
@@ -521,8 +525,8 @@ def test_threshold_update_logic_comprehensive(self):
         fresh_policy._queue_thresholds[fresh_mock_op] = 0
         fresh_policy._queue_history[fresh_mock_op] = deque([0])
         # Fresh policy starts with clean EWMA state
-        threshold5 = fresh_policy._update_queue_threshold(fresh_mock_op, 0)
-        self.assertEqual(threshold5, 1)  # Should round up to 1
+        threshold4 = fresh_policy._update_queue_threshold(fresh_mock_op, 0)
+        self.assertEqual(threshold4, 1)  # Should round up to 1
 
     def test_trend_and_effective_cap_formulas(self):
         """Test trend calculation and effective cap formulas."""