
Commit f896cd6

Merge conflicts
Signed-off-by: joshlee <joshlee@anyscale.com>
2 parents 37278bb + 1622ff8

File tree: 56 files changed (+1179 −634 lines)


.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -71,14 +71,14 @@
 /python/ray/data/llm.py @ray-project/ray-llm
 /python/ray/dashboard/modules/metrics/dashboards/serve_llm_dashboard_panels.py @ray-project/ray-llm
 /python/ray/dashboard/modules/metrics/dashboards/serve_llm_grafana_dashboard_base.json @ray-project/ray-llm
-/doc/source/serve/llm/ @ray-project/ray-llm

 # Ray Serve
 /python/ray/serve/ @ray-project/ray-serve
 /java/serve/ @ray-project/ray-serve
 /src/ray/protobuf/serve.proto @ray-project/ray-serve
 /python/ray/dashboard/modules/serve/ @ray-project/ray-serve
 /doc/source/serve/ @ray-project/ray-serve @ray-project/ray-docs
+/doc/source/serve/llm/ @ray-project/ray-llm @ray-project/ray-docs

 # ML Docker Dependencies
 /python/requirements/ml/dl-cpu-requirements.txt @richardliaw @matthewdeng

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 10 additions & 40 deletions
@@ -1,45 +1,15 @@
-<!-- Thank you for contributing to Ray! 🚀 -->
-<!-- Please review https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before opening a pull request. -->
-<!-- 💡 Tip: Mark as draft if you want early feedback, or ready for review when it's complete -->
+> Thank you for contributing to Ray! 🚀
+> Please review the [Ray Contribution Guide](https://docs.ray.io/en/master/ray-contribute/getting-involved.html) before opening a pull request.

-## Description
-
-<!-- Briefly describe what this PR accomplishes and why it's needed -->
-
-## Related issues
-
-<!-- Link related issues: "Fixes #1234", "Closes #1234", or "Related to #1234" -->
-
-## Types of change
+> ⚠️ Remove these instructions before submitting your PR.

-- [ ] Bug fix 🐛
-- [ ] New feature ✨
-- [ ] Enhancement 🚀
-- [ ] Code refactoring 🔧
-- [ ] Documentation update 📖
-- [ ] Chore 🧹
-- [ ] Style 🎨
+> 💡 Tip: Mark as draft if you want early feedback, or ready for review when it's complete.

-## Checklist
-
-**Does this PR introduce breaking changes?**
-- [ ] Yes ⚠️
-- [ ] No
-<!-- If yes, describe what breaks and how users should migrate -->
-
-**Testing:**
-- [ ] Added/updated tests for my changes
-- [ ] Tested the changes manually
-- [ ] This PR is not tested ❌ _(please explain why)_
-
-**Code Quality:**
-- [ ] Signed off every commit (`git commit -s`)
-- [ ] Ran pre-commit hooks ([setup guide](https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#lint-and-formatting))
-
-**Documentation:**
-- [ ] Updated documentation (if applicable) ([contribution guide](https://docs.ray.io/en/latest/ray-contribute/docs.html))
-- [ ] Added new APIs to `doc/source/` (if applicable)
+## Description
+> Briefly describe what this PR accomplishes and why it's needed.

-## Additional context
+## Related issues
+> Link related issues: "Fixes #1234", "Closes #1234", or "Related to #1234".

-<!-- Optional: Add screenshots, examples, performance impact, breaking change details -->
+## Additional information
+> Optional: Add implementation details, API changes, usage examples, screenshots, etc.

doc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -552,6 +552,7 @@ doctest(

 doctest(
     name = "doctest[core]",
+    size = "large",
     files = glob(
         include = [
             "source/ray-core/**/*.md",

doc/source/ray-overview/examples/llamafactory-llm-fine-tune/notebooks/dpo_qlora.ipynb

Lines changed: 2 additions & 0 deletions
@@ -134,6 +134,8 @@
 "\n",
 "### Configure LLaMA-Factory with Ray\n",
 "\n",
+"**Note**: To customize the training configuration, edit `train-configs/dpo_qlora.yaml`. \n",
+"\n",
 "```yaml\n",
 "# dpo_qlora.yaml\n",
 "\n",

doc/source/ray-overview/examples/llamafactory-llm-fine-tune/notebooks/kto_lora.ipynb

Lines changed: 2 additions & 0 deletions
@@ -145,6 +145,8 @@
 "\n",
 "### Configure LLaMA-Factory with Ray\n",
 "\n",
+"**Note**: To customize the training configuration, edit `train-configs/kto_lora.yaml`. \n",
+"\n",
 "```yaml\n",
 "# kto_lora.yaml\n",
 "\n",

doc/source/ray-overview/examples/llamafactory-llm-fine-tune/notebooks/sft_lora_deepspeed.ipynb

Lines changed: 4 additions & 2 deletions
@@ -160,7 +160,9 @@
 "- **Gated models:** If your base model has gated access (for example, Llama) on HuggingFace, set `HF_TOKEN` in the runtime environment.\n",
 "- **GPU selection:** The config sets `accelerator_type` to `L40S`, but you can switch to other GPUs such as `A100-40GB` or any other GPU with comparable or more VRAM, depending on your cloud availability.\n",
 "\n",
-"### LLaMA-Factory + Ray configuration\n",
+"### Configure LLaMA-Factory with Ray\n",
+"\n",
+"**Note**: To customize the training configuration, edit `train-configs/sft_lora_deepspeed.yaml`. \n",
 "\n",
 "```yaml\n",
 "# sft_lora_deepspeed.yaml\n",
@@ -209,7 +211,7 @@
 "### ray\n",
 "ray_run_name: qwen2.5_32b_lora_sft\n",
 "ray_storage_path: /mnt/cluster_storage/\n",
-"ray_num_workers: 4 # Number of GPUs to use.\n",
+"ray_num_workers: 4 # Number of GPUs to use\n",
 "resources_per_worker:\n",
 " GPU: 1\n",
 " accelerator_type:L40S: 0.001 # Use this to simply specify a GPU type (not guaranteed on the same node). You can use A100-40G if L40S is not available. \n",

doc/source/ray-overview/examples/llamafactory-llm-fine-tune/train-configs/sft_lora_deepspeed.yaml

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ ddp_timeout: 180000000
 ### ray
 ray_run_name: qwen2.5_32b_lora_sft
 ray_storage_path: /mnt/cluster_storage/
-ray_num_workers: 4 # Number of GPUs to use.
+ray_num_workers: 4 # Number of GPUs to use
 resources_per_worker:
   GPU: 1
   accelerator_type:L40S: 0.001 # Use this to simply specify a GPU type (not guaranteed on the same node). You can use A100-40G if L40S is not available.

doc/source/serve/advanced-guides/advanced-autoscaling.md

Lines changed: 41 additions & 1 deletion
@@ -669,4 +669,44 @@ In your policy, access custom metrics via:
 The number of data points stored for each replica depends on the [`look_back_period_s`](../api/doc/ray.serve.config.AutoscalingConfig.look_back_period_s.rst) (the sliding window size) and [`metrics_interval_s`](../api/doc/ray.serve.config.AutoscalingConfig.metrics_interval_s.rst) (the metric recording interval).
 * **`ctx.aggregated_metrics[metric_name]`** — A time-weighted average computed from the raw metric values for each replica.

-> Today, aggregation is a time-weighted average. In future releases, additional aggregation options may be supported.
+
+### Application level autoscaling
+
+By default, each deployment in Ray Serve autoscales independently. When you have multiple deployments that need to scale in a coordinated way—such as deployments that share backend resources, depend on each other, or need load-aware routing—you can define an **application-level autoscaling policy**. This policy makes scaling decisions for all deployments within an application simultaneously.
+
+#### Define an application level policy
+
+An application-level autoscaling policy is a function that takes a dictionary mapping each `DeploymentID` to an [`AutoscalingContext`](../api/doc/ray.serve.config.AutoscalingContext.rst) (one per deployment) and returns a tuple of `(decisions, policy_state)`. Each context contains metrics and bounds for one deployment, and the policy returns target replica counts for all deployments.
+
+The following example shows a policy that scales deployments based on their relative load, ensuring that downstream deployments have enough capacity for upstream traffic:
+
+```{literalinclude} ../doc_code/autoscaling_policy.py
+:language: python
+:start-after: __begin_application_level_autoscaling_policy__
+:end-before: __end_application_level_autoscaling_policy__
+```
+
+#### Configure application level autoscaling
+
+To use an application-level policy, first define your deployments:
+
+```{literalinclude} ../doc_code/application_level_autoscaling.py
+:language: python
+:start-after: __serve_example_begin__
+:end-before: __serve_example_end__
+```
+
+Then specify the application-level policy in your application config:
+
+```{literalinclude} ../doc_code/application_level_autoscaling.yaml
+:language: yaml
+:emphasize-lines: 4-5
+```
+
+:::{note}
+Programmatic configuration of application-level autoscaling policies through `serve.run()` will be supported in a future release.
+:::
+
+:::{note}
+When you specify both a deployment-level policy and an application-level policy, the application-level policy takes precedence. Ray Serve logs a warning if you configure both.
+:::
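The `autoscaling_policy.py` file referenced by the literalinclude above isn't shown in this view, so the following is only a rough sketch of what a `coordinated_scaling_policy` with the documented `Dict[DeploymentID, AutoscalingContext] -> (decisions, policy_state)` shape could look like. It assumes `decisions` is a mapping from `DeploymentID` to a target replica count; the context attribute names used here (`total_num_requests`, `target_ongoing_requests`, `capacity_adjusted_min_replicas`, `capacity_adjusted_max_replicas`) and the `DeploymentID.name` field are assumptions to verify against the `AutoscalingContext` API reference, not the code added by this commit.

```python
import math
from typing import Any, Dict, Tuple


def coordinated_scaling_policy(
    contexts: Dict["DeploymentID", "AutoscalingContext"],
) -> Tuple[Dict["DeploymentID", int], Dict[str, Any]]:
    """Hypothetical sketch: make one scaling decision for the whole application."""
    decisions: Dict["DeploymentID", int] = {}
    for deployment_id, ctx in contexts.items():
        # Size each deployment to its own load first (assumed context fields).
        desired = math.ceil(ctx.total_num_requests / max(ctx.target_ongoing_requests, 1))
        desired = max(
            ctx.capacity_adjusted_min_replicas,
            min(ctx.capacity_adjusted_max_replicas, desired),
        )
        decisions[deployment_id] = desired

    # Coordination step: keep the downstream Model at least as large as the
    # upstream Preprocessor so traffic never outruns downstream capacity
    # (illustrative rule only; assumes DeploymentID exposes a .name field).
    by_name = {dep_id.name: dep_id for dep_id in contexts}
    if "Preprocessor" in by_name and "Model" in by_name:
        decisions[by_name["Model"]] = max(
            decisions[by_name["Model"]], decisions[by_name["Preprocessor"]]
        )

    # The second element is policy state that Serve passes back on the next call.
    return decisions, {}
```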
doc/source/serve/doc_code/application_level_autoscaling.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# __serve_example_begin__
+import time
+from ray import serve
+
+
+@serve.deployment
+class Preprocessor:
+    def __call__(self, input_data: str) -> str:
+        # Simulate preprocessing work
+        time.sleep(0.05)
+        return f"preprocessed_{input_data}"
+
+
+@serve.deployment
+class Model:
+    def __call__(self, preprocessed_data: str) -> str:
+        # Simulate model inference (takes longer than preprocessing)
+        time.sleep(0.1)
+        return f"result_{preprocessed_data}"
+
+
+@serve.deployment
+class Driver:
+    def __init__(self, preprocessor, model):
+        self._preprocessor = preprocessor
+        self._model = model
+
+    async def __call__(self, input_data: str) -> str:
+        # Coordinate preprocessing and model inference
+        preprocessed = await self._preprocessor.remote(input_data)
+        result = await self._model.remote(preprocessed)
+        return result
+
+
+app = Driver.bind(Preprocessor.bind(), Model.bind())
+# __serve_example_end__
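As a quick local sanity check of the example above (not part of the commit), the app can be run with `serve.run`; the module name `application_level_autoscaling` is assumed from the YAML config that follows, and this only exercises the deployment graph, not the application-level policy (which, per the note above, isn't configurable through `serve.run` yet).

```python
from ray import serve

# Assumes the example file is importable as `application_level_autoscaling`.
from application_level_autoscaling import app

handle = serve.run(app)
# DeploymentResponse.result() blocks until the Driver deployment responds.
print(handle.remote("input").result())  # -> "result_preprocessed_input"
serve.shutdown()
```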
doc/source/serve/doc_code/application_level_autoscaling.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+applications:
+- name: MyApp
+  import_path: application_level_autoscaling:app
+  autoscaling_policy:
+    policy_function: autoscaling_policy:coordinated_scaling_policy
+  deployments:
+  - name: Preprocessor
+    autoscaling_config:
+      min_replicas: 1
+      max_replicas: 10
+  - name: Model
+    autoscaling_config:
+      min_replicas: 2
+      max_replicas: 20
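A brief usage note, not part of the commit: a config like the one above follows the standard Serve config-file workflow (for example, `serve deploy <config>.yaml` against a running cluster), and the `policy_function` value uses the same `module:attribute` import-path format as `import_path`, so `autoscaling_policy.py` presumably needs to be importable on the worker's Python path just like the application module.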
