Commit a4971e4

Author: Pradyun Ramadorai (committed)
Merge PR vllm-project#24957: Fix varlen issue in Qwen3-Next MTP implementation
Fixes CUDA illegal memory access errors during Qwen3-Next speculative decoding by implementing proper varlen sequence handling and CUDA graph batch size fixes.

Key changes from upstream PR vllm-project#24957:
- Enhanced GDNAttentionMetadata with num_actual_tokens field
- Fixed CUDA graph batch size calculation for speculative decoding scenarios
- Added varlen sequence support to causal_conv1d operations
- Improved token accounting across MTP verification paths

Resolves issues with:
- Multi-token prediction verification with unaligned speculative tokens
- Variable-length sequence processing in continuous batching
- CUDA memory allocation errors in graph capture

Co-authored-by: upstream contributors from PR vllm-project#24957
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
2 parents fec361a + b667c7e commit a4971e4
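
For orientation, the heart of the varlen fix can be sketched as follows. GDNAttentionMetadata and num_actual_tokens are named in the commit message above; the field layout, the build_metadata helper, and the padding scheme below are illustrative assumptions, not the actual vLLM implementation.

# Hedged sketch of the varlen token accounting described in the commit
# message. With speculative decoding, each request may schedule a variable
# number of tokens (verified + draft), so kernels need a cu_seqlens-style
# prefix-sum array, and CUDA-graph replay needs a padded batch size tracked
# separately from the number of tokens actually scheduled.
from dataclasses import dataclass

import torch


@dataclass
class GDNAttentionMetadata:
    num_actual_tokens: int    # tokens really scheduled this step (named in the commit)
    cu_seqlens: torch.Tensor  # (num_seqs + 1,) prefix sum of per-sequence token counts


def build_metadata(seq_lens: list[int], cudagraph_batch_sizes: list[int]):
    """Build varlen metadata and pick a padded size for CUDA-graph replay.

    Hypothetical helper: vLLM's real metadata builders differ.
    """
    cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
    cu_seqlens[1:] = torch.cumsum(torch.tensor(seq_lens, dtype=torch.int32), dim=0)
    num_actual_tokens = int(cu_seqlens[-1])
    # A captured CUDA graph replays only at fixed sizes; pad the token count
    # up to the smallest captured size rather than assuming one token per
    # request, which breaks once spec-decode schedules extra draft tokens.
    padded_size = next(s for s in sorted(cudagraph_batch_sizes) if s >= num_actual_tokens)
    return GDNAttentionMetadata(num_actual_tokens, cu_seqlens), padded_size

A varlen-aware kernel such as the causal_conv1d path would then slice its input via cu_seqlens and stop at num_actual_tokens, ignoring the padded tail instead of reading past the real data (the reported illegal memory accesses).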

File tree: 140 files changed, +3932 −1401 lines


.buildkite/nightly-benchmarks/nightly-descriptions.md

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 

.buildkite/test-pipeline.yaml

Lines changed: 66 additions & 12 deletions

@@ -571,44 +571,98 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models
+  - tests/models/test_initialization.py
   commands:
-  - pytest -v -s models/test_transformers.py
-  - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_utils.py
-  - pytest -v -s models/test_vision.py
-  - pytest -v -s models/test_initialization.py
+  # Run a subset of model initialization tests
+  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
 
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+  # Only when vLLM model source is modified - test initialization of a large
+  # subset of supported models (the complement of the small subset in the above
+  # test.) Also run if model initialization test file is modified
+  - pytest -v -s models/test_initialization.py \
+    -k 'not test_can_initialize_small_subset' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  commands:
+  - pytest -v -s models/test_transformers.py \
+    models/test_registry.py \
+    models/test_utils.py \
+    models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language
   commands:
+  # Test standard language models, excluding a subset of slow tests
   - pip freeze | grep -E 'torch'
-  - pytest -v -s models/language -m core_model
+  - pytest -v -s models/language -m 'core_model and (not slow_test)'
 
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+  # Shard slow subset of standard language models tests. Only run when model
+  # source is modified, or when specified test files are modified
+  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/language -m 'core_model and slow_test' \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language/generation
   commands:
   # Install fast path packages for testing against transformers
   # Note: also needed to run plamo2 model in vLLM
   - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
   - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-  - pytest -v -s models/language/generation -m hybrid_model
+  # Shard hybrid language model tests
+  - pytest -v -s models/language/generation \
+    -m hybrid_model \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
.github/CODEOWNERS

Lines changed: 4 additions & 1 deletion

@@ -101,4 +101,7 @@ mkdocs.yaml @hmellor
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
-/vllm/tests/v1/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
.github/workflows/bc-lint.yml

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@ on:
       - opened
       - synchronize
       - reopened
+      - labeled
+      - unlabeled
 
 jobs:
   bc_lint:

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

README.md

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
