2 changes: 1 addition & 1 deletion benchmark/config/countdown-template.yaml
@@ -54,7 +54,7 @@ explorer:
   rollout_model:
     engine_num: 2
     tensor_parallel_size: 1
-    enforce_eager: true
+    enforce_eager: false
     enable_prefix_caching: false
     enable_chunked_prefill: false
     gpu_memory_utilization: 0.9
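For context: `enforce_eager: false` lets vLLM capture CUDA graphs instead of always running the model in eager PyTorch mode, trading extra startup time and memory for faster decoding. A minimal sketch of the same flag via the public `vllm.LLM` constructor (the model name is illustrative, not part of this PR):

```python
from vllm import LLM

# enforce_eager=False (the new default in this PR) permits CUDA graph
# capture; enforce_eager=True would force pure eager execution.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", enforce_eager=False)
```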
2 changes: 1 addition & 1 deletion docs/sphinx_doc/source/tutorial/example_step_wise.md
@@ -140,7 +140,7 @@ explorer:
     engine_num: 2
     tensor_parallel_size: 2
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     dtype: bfloat16
     seed: 42
     gpu_memory_utilization: 0.7
2 changes: 1 addition & 1 deletion docs/sphinx_doc/source_zh/tutorial/example_step_wise.md
@@ -135,7 +135,7 @@ explorer:
     engine_num: 2
     tensor_parallel_size: 2
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     dtype: bfloat16
     seed: 42
     gpu_memory_utilization: 0.7
2 changes: 1 addition & 1 deletion examples/agentscope_react/gsm8k.yaml
@@ -43,7 +43,7 @@ explorer:
     engine_num: 4
     tensor_parallel_size: 1
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     enable_openai_api: true
     enable_history: true
     enable_auto_tool_choice: true
(separate file; path not captured in this view)
@@ -44,7 +44,7 @@ explorer:
     engine_num: 4
     tensor_parallel_size: 1
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     enable_openai_api: true
     enable_history: true
     dtype: bfloat16
(separate file; path not captured in this view)
@@ -44,7 +44,7 @@ explorer:
     engine_num: 4
     tensor_parallel_size: 1
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     enable_openai_api: true
     enable_history: true
     dtype: bfloat16
(separate file; path not captured in this view)
@@ -44,7 +44,7 @@ explorer:
     engine_num: 4
     tensor_parallel_size: 1
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     enable_openai_api: true
     enable_history: true
     dtype: bfloat16
(separate file; path not captured in this view)
@@ -68,7 +68,7 @@ explorer:
     engine_num: 4
     tensor_parallel_size: 1
     enable_prefix_caching: false
-    enforce_eager: true
+    enforce_eager: false
     dtype: bfloat16
     seed: 42
     gpu_memory_utilization: 0.7
2 changes: 1 addition & 1 deletion trinity/common/config.py
@@ -447,7 +447,7 @@ class InferenceModelConfig:
     engine_num: int = 1
     tensor_parallel_size: int = 1
     use_v1: bool = True
-    enforce_eager: bool = True
+    enforce_eager: bool = False
     enable_prefix_caching: bool = False
     enable_chunked_prefill: bool = False
     gpu_memory_utilization: float = 0.9
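Since the `InferenceModelConfig` default is now `False`, CUDA graph capture is on unless a config opts out. If graph capture misbehaves on a particular setup, eager mode can still be requested per experiment; a sketch following the config shape shown in the diffs above:

```yaml
explorer:
  rollout_model:
    engine_num: 2
    tensor_parallel_size: 1
    enforce_eager: true  # opt back into eager mode, overriding the new default
```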
7 changes: 7 additions & 0 deletions trinity/common/models/vllm_model.py
@@ -53,6 +53,13 @@ def __init__(
         os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
         if get_vllm_version() >= parse_version("0.11.0"):
             os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+        if not config.enforce_eager:
+            # Avoid torch.compile cache conflicts when multiple models are
+            # started simultaneously. Remove this once the following PR is
+            # released: https://github.com/vllm-project/vllm/pull/27616
+            os.environ["VLLM_CACHE_ROOT"] = os.path.expanduser(
+                f"~/.cache/vllm/{config.bundle_indices}"
+            )
         self.default_sampling_params = vllm.SamplingParams(
             n=1,
             temperature=0.0,
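For context: with eager mode off, vLLM writes torch.compile artifacts under its cache root, and engines started simultaneously would race on the shared `~/.cache/vllm` directory; the added block gives each engine a private cache keyed by its bundle indices. A standalone sketch of the same pattern (`VLLM_CACHE_ROOT` is the environment variable vLLM reads for its cache root; the `isolate_vllm_cache` helper and the `"engine-0"` value are illustrative):

```python
import os

def isolate_vllm_cache(engine_id: str) -> None:
    """Point this process's vLLM compilation cache at a private directory.

    Must run before the vLLM engine is initialized, since the cache
    root is read at startup.
    """
    os.environ["VLLM_CACHE_ROOT"] = os.path.expanduser(f"~/.cache/vllm/{engine_id}")

# Each rollout engine process picks a unique suffix, e.g. its bundle
# indices as in the patch above.
isolate_vllm_cache("engine-0")
```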