
Commit 4f7381a

youkaichao authored and Robert Shaw committed
[ci][distributed] fix device count call
[ci][distributed] fix some cuda init that makes it necessary to use spawn (vllm-project#5991)
1 parent cea9f6b commit 4f7381a
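In short, the commit does two things: test skip conditions now call cuda_device_count_stateless() from vllm.utils instead of torch.cuda.device_count(), and anything that initializes CUDA in the parent process (HuggingFace reference models, multimodal asset conversion, heavyweight imports) is deferred until after the vLLM engine has forked its workers. A minimal sketch of the resulting test shape, assuming the repo's vllm_runner / hf_runner / example_prompts fixtures; the test name, model, and token count are illustrative:

import pytest

# Unlike torch.cuda.device_count(), this helper does not initialize CUDA
# in the calling process, so fork-based worker processes stay usable.
from vllm.utils import cuda_device_count_stateless


@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_tp2_ordering_sketch(vllm_runner, hf_runner, example_prompts):
    # Run vLLM first: the mp backend with the default fork start method
    # needs a parent process that has not touched CUDA yet.
    with vllm_runner("facebook/opt-125m", dtype="half",
                     tensor_parallel_size=2) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, 5)

    # Only afterwards load the HuggingFace reference model, which does
    # initialize CUDA in this process.
    with hf_runner("facebook/opt-125m", dtype="half") as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, 5)

    assert len(hf_outputs) == len(vllm_outputs)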

File tree

6 files changed: +83, -51 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 11 deletions
@@ -45,9 +45,6 @@ steps:
   num_gpus: 2
   commands:
   - bash ../.buildkite/download-images.sh
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py

@@ -60,8 +57,7 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  # FIXIT: find out why TP is failing with mp backend on phi3-v
-  # - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

@@ -71,9 +67,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.

@@ -225,9 +218,6 @@ steps:
   gpu: a100
   num_gpus: 4
   commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
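For context on the removed workaround: exporting VLLM_WORKER_MULTIPROC_METHOD=spawn forced worker processes to start from a fresh interpreter, which sidesteps the fact that a fork()ed child cannot reuse a CUDA context created by its parent. A standalone illustration of that failure mode (not code from this repo; it needs a GPU to reproduce):

import multiprocessing

import torch


def child() -> None:
    # In a child forked after the parent initialized CUDA, this typically
    # fails with "RuntimeError: Cannot re-initialize CUDA in forked subprocess".
    torch.zeros(1, device="cuda")


if __name__ == "__main__":
    torch.zeros(1, device="cuda")  # parent initializes CUDA here

    proc = multiprocessing.get_context("fork").Process(target=child)
    proc.start()
    proc.join()

    # Using the "spawn" context instead avoids the error because the child
    # starts with a clean interpreter, at the cost of re-importing everything.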

tests/conftest.py

Lines changed: 16 additions & 5 deletions
@@ -5,25 +5,29 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
+                    TypedDict, TypeVar)

 import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+                          AutoTokenizer, BatchEncoding)

 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalData
+else:
+    # it will call torch.cuda.device_count()
+    MultiModalData = None
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu

@@ -63,6 +67,10 @@ def for_hf(self) -> Image.Image:
         return self.pil_image

     def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
+        from vllm.multimodal.image import ImagePixelData
         image_input_type = vision_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType

@@ -217,6 +225,9 @@ def __init__(
         )

         try:
+            # don't put this import at the top level
+            # it will call torch.cuda.device_count()
+            from transformers import AutoProcessor  # noqa: F401
             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
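The conftest changes all follow the same pattern: keep CUDA-touching imports out of module scope, expose the name to type checkers only, and import lazily inside the function that needs it. A condensed sketch of that pattern as used here (the helper body is illustrative; the real for_vllm also handles the image-features input type):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved by static type checkers only; nothing runs at import time.
    from vllm.multimodal import MultiModalData
else:
    # The real import would end up calling torch.cuda.device_count().
    MultiModalData = None


def to_multimodal_data(pil_image) -> MultiModalData:
    # Deferred import: torch.cuda.device_count() is only reached when this
    # helper runs, i.e. after the caller has already created the vLLM engine.
    from vllm.multimodal.image import ImagePixelData
    return ImagePixelData(pil_image)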

tests/distributed/test_basic_distributed_correctness.py

Lines changed: 10 additions & 5 deletions
@@ -19,7 +19,8 @@
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless

 from tests.nm_utils.utils_skip import should_skip_test_group
 from ..models.utils import check_outputs_equal

@@ -37,7 +38,7 @@
 @pytest.mark.skip("Upstream test that compares 'golden' results from fp16 "
                   "model with TP, which is an invalid test strategy due to "
                   "numerical precision on GPU.")
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])

@@ -52,16 +53,20 @@ def test_models(
 ) -> None:
     distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
     with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=2,
                      distributed_executor_backend=distributed_executor_backend
                      ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
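The reordering only helps if nothing else initializes CUDA in the parent before vllm_runner starts its workers. A hedged sketch of an extra guard one could add (hypothetical fixture, not part of this commit) to catch regressions early:

import pytest
import torch


@pytest.fixture(autouse=True)
def cuda_must_not_be_initialized():
    # torch.cuda.is_initialized() is a cheap check and does not itself
    # initialize CUDA, so it is safe to call in the parent process.
    assert not torch.cuda.is_initialized(), (
        "CUDA was initialized before the test body ran; fork-based "
        "multiprocessing workers would inherit an unusable context.")
    yield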

tests/distributed/test_chunked_prefill_distributed.py

Lines changed: 10 additions & 4 deletions
@@ -18,7 +18,8 @@
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal

@@ -37,7 +38,7 @@
 @pytest.mark.skip("Upstream test that compares 'golden' results from fp16 "
                   "model with TP, which is an invalid test strategy due to "
                   "numerical precision on GPU.")
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])

@@ -60,8 +61,10 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(
         model,

@@ -74,6 +77,9 @@ def test_models(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,

tests/models/test_llava.py

Lines changed: 20 additions & 10 deletions
@@ -93,28 +93,38 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]

-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
-
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(model_id,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)

+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+
     check_outputs_equal(
         hf_outputs,
         [

tests/models/test_phi3v.py

Lines changed: 26 additions & 16 deletions
@@ -101,23 +101,11 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]

-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
-            max_tokens,
-            images=hf_images,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(model_id,
                      max_model_len=2048,

@@ -126,10 +114,32 @@ def run_test(
                      enforce_eager=True,
                      distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<|image_1|>",
+                      "<|image|>" * vlm_config.image_feature_size + "<s>")
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)

+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
     check_outputs_equal(
         hf_outputs,
         [
