 
 Run `pytest tests/compile/test_aclgraph.py`.
 """
-
 import pytest
 import torch
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
+
+from tests.e2e.conftest import VllmRunner
 
-MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
 
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [1])
 def test_models(
     model: str,
     max_tokens: int,
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    return
-
-    prompts = "The president of the United States is"
+    prompts = ["The president of the United States is"]
 
     sampling_params = SamplingParams(
         max_tokens=max_tokens,
         temperature=0.0,
     )
 
-    vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True)
-    output_chunked = vllm_model.generate(prompts, sampling_params)
-    logprobs_chunked = output_chunked.outputs[0].logprobs
-    del vllm_model
-    torch.npu.empty_cache()
+    with VllmRunner(model, long_prefill_token_threshold=20,
+                    enforce_eager=True) as vllm_model:
+        output1 = vllm_model.generate(prompts, sampling_params)
+
+    with VllmRunner(model,
+                    enforce_eager=True,
+                    additional_config={
+                        'ascend_scheduler_config': {
+                            'enabled': True
+                        },
+                    }) as vllm_model:
+        output2 = vllm_model.generate(prompts, sampling_params)
+
+    # Extract the generated token IDs for comparison
+    token_ids1 = output1[0][0][0]
+    token_ids2 = output2[0][0][0]
+
+    print(f"Token IDs 1: {token_ids1}")
+    print(f"Token IDs 2: {token_ids2}")
+
+    # Convert token IDs to tensors and calculate cosine similarity
+    # Take the length of the shorter sequence to ensure consistent dimensions
+    min_len = min(len(token_ids1), len(token_ids2))
+
+    tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
+    tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
 
-    vllm_model = LLM(model,
-                     enforce_eager=True,
-                     additional_config={
-                         'ascend_scheduler_config': {
-                             'enabled': True
-                         },
-                     })
-    output = vllm_model.generate(prompts, sampling_params)
-    logprobs = output.outputs[0].logprobs
-    del vllm_model
-    torch.npu.empty_cache()
+    # Calculate similarity using torch.cosine_similarity
+    similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
+    print(f"Token IDs cosine similarity: {similarity.item()}")
 
-    logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(),
-                                                  logprobs.flatten(),
-                                                  dim=0)
-    assert logprobs_similarity > 0.95
+    assert similarity > 0.95
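For reference, here is a minimal standalone sketch of the comparison step the new test performs, with made-up token IDs in place of real model output. It assumes `VllmRunner.generate()` returns, per prompt, a tuple of (sampled token-id lists, decoded strings), which is why the test indexes `output1[0][0][0]`; that return shape is an assumption about this repo's e2e conftest, not something shown in the diff.

```python
import torch

# Hypothetical token IDs standing in for the two generation runs.
token_ids1 = [791, 4872, 315, 279, 3723]
token_ids2 = [791, 4872, 315, 279, 3723]

# Truncate to the shorter sequence so both tensors have the same length.
min_len = min(len(token_ids1), len(token_ids2))
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)

# Cosine similarity of 1.0 means the two runs produced identical token IDs;
# the test only requires it to exceed 0.95.
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
assert similarity > 0.95
```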