|
@@ -4,23 +4,15 @@
 
 import json
 import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch
 
 import pytest
-import torch
 import yaml
 from transformers import AutoTokenizer
 
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
 
-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product
 
 
 # Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser): |
     assert "-O.mode" in caplog_vllm.text
 
 
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-        with set_current_vllm_config(cfg):
-            from vllm.attention import Attention
-
-            ctx = {
-                "layers.0.self_attn": Attention(32, 128, 0.1),
-            }
-            kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-            bind_kv_cache(ctx, kv_cache)
-            assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-            assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
-
-
 def test_model_specification(
     parser_with_config, cli_config_file, cli_config_file_with_model
 ):
|