 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
 
+IS_ASYNC = False
+
 
 @pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
     with vllm_runner(MODEL) as vllm_model:
         yield vllm_model
 
 
-@pytest.mark.skip_global_cleanup
-def test_stop_basic(vllm_model):
-    _test_stopping(vllm_model.model.llm_engine,
+def _test_stopping(llm_engine: LLMEngine,
+                   expected_output: str,
+                   expected_reason: Any,
+                   stop: Optional[List[str]] = None,
+                   stop_token_ids: Optional[List[int]] = None,
+                   include_in_output: bool = False,
+                   use_async_output_proc: bool = False) -> None:
+    llm_engine.add_request(
+        "id", "A story about vLLM:\n",
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=MAX_TOKENS,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            include_stop_str_in_output=include_in_output,
+        ), None)
+
+    output: Optional[CompletionOutput] = None
+    output_text = ""
+    stop_reason = None
+
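+    # When async output processing is enabled, request outputs lag the
+    # scheduler by one step, so take one extra engine step before polling.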
+    if use_async_output_proc:
+        llm_engine.step()
+
+    while llm_engine.has_unfinished_requests():
+        (request_output, ) = llm_engine.step()
+        (output, ) = request_output.outputs
+
+        # Ensure we don't backtrack
+        assert output.text.startswith(output_text)
+        output_text = output.text
+        stop_reason = output.stop_reason
+
+    assert output is not None
+    assert output_text == expected_output
+    assert stop_reason == expected_reason
+
+
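+# Flip the scheduler's async output processing flag in place so the same
+# session-scoped engine can be exercised in both modes.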
+def _set_async_mode(llm_engine, is_async):
+    llm_engine.scheduler[0].use_async_output_proc = is_async
+
+
+def _stop_basic(llm_engine, is_async):
+    _test_stopping(llm_engine,
                    stop=["."],
                    include_in_output=False,
                    expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=".")
+                   expected_reason=".",
+                   use_async_output_proc=is_async)
 
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                    stop=["."],
                    include_in_output=True,
                    expected_output="VLLM is a 100% volunteer organization.",
-                   expected_reason=".")
+                   expected_reason=".",
+                   use_async_output_proc=is_async)
 
 
-@pytest.mark.skip_global_cleanup
-def test_stop_multi_tokens(vllm_model):
+def _stop_multi_tokens(llm_engine, is_async):
     _test_stopping(
-        vllm_model.model.llm_engine,
+        llm_engine,
         stop=["group of peo", "short"],
         include_in_output=False,
         expected_output="VLLM is a 100% volunteer organization. We are a ",
-        expected_reason="group of peo")
+        expected_reason="group of peo",
+        use_async_output_proc=is_async)
 
     _test_stopping(
-        vllm_model.model.llm_engine,
+        llm_engine,
         stop=["group of peo", "short"],
         include_in_output=True,
         expected_output=
         "VLLM is a 100% volunteer organization. We are a group of peo",
-        expected_reason="group of peo")
+        expected_reason="group of peo",
+        use_async_output_proc=is_async)
 
 
-@pytest.mark.skip_global_cleanup
-def test_stop_partial_token(vllm_model):
-    _test_stopping(vllm_model.model.llm_engine,
+def _stop_partial_token(llm_engine, is_async):
+    _test_stopping(llm_engine,
                    stop=["gani"],
                    include_in_output=False,
                    expected_output="VLLM is a 100% volunteer or",
-                   expected_reason="gani")
+                   expected_reason="gani",
+                   use_async_output_proc=is_async)
 
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                    stop=["gani"],
                    include_in_output=True,
                    expected_output="VLLM is a 100% volunteer organi",
-                   expected_reason="gani")
+                   expected_reason="gani",
+                   use_async_output_proc=is_async)
 
 
-@pytest.mark.skip_global_cleanup
-def test_stop_token_id(vllm_model):
+def _stop_token_id(llm_engine, is_async):
     # token id 13013 => " organization"
 
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                    stop_token_ids=[13013],
                    include_in_output=False,
                    expected_output="VLLM is a 100% volunteer",
-                   expected_reason=13013)
+                   expected_reason=13013,
+                   use_async_output_proc=is_async)
 
-    _test_stopping(vllm_model.model.llm_engine,
+    _test_stopping(llm_engine,
                    stop_token_ids=[13013],
                    include_in_output=True,
                    expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=13013)
+                   expected_reason=13013,
+                   use_async_output_proc=is_async)
 
 
-def _test_stopping(llm_engine: LLMEngine,
-                   expected_output: str,
-                   expected_reason: Any,
-                   stop: Optional[List[str]] = None,
-                   stop_token_ids: Optional[List[int]] = None,
-                   include_in_output: bool = False) -> None:
-    llm_engine.add_request(
-        "id", "A story about vLLM:\n",
-        SamplingParams(
-            temperature=0.0,
-            max_tokens=MAX_TOKENS,
-            stop=stop,
-            stop_token_ids=stop_token_ids,
-            include_stop_str_in_output=include_in_output,
-        ), None)
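+# Each test below runs its scenario twice against the session-scoped engine:
+# first with async output processing enabled, then with it disabled.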
+@pytest.mark.skip_global_cleanup
+def test_stop_basic(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_basic(vllm_model.model.llm_engine, is_async=True)
 
-    output: Optional[CompletionOutput] = None
-    output_text = ""
-    stop_reason = None
-    while llm_engine.has_unfinished_requests():
-        (request_output, ) = llm_engine.step()
-        (output, ) = request_output.outputs
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_basic(vllm_model.model.llm_engine, is_async=False)
 
-        # Ensure we don't backtrack
-        assert output.text.startswith(output_text)
-        output_text = output.text
-        stop_reason = output.stop_reason
 
-    assert output is not None
-    assert output_text == expected_output
-    assert stop_reason == expected_reason
+@pytest.mark.skip_global_cleanup
+def test_stop_multi_tokens(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
+
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_partial_token(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_partial_token(vllm_model.model.llm_engine, is_async=True)
+
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_partial_token(vllm_model.model.llm_engine, is_async=False)
+
+
+@pytest.mark.skip_global_cleanup
+def test_stop_token_id(vllm_model):
+    _set_async_mode(vllm_model.model.llm_engine, True)
+    _stop_token_id(vllm_model.model.llm_engine, is_async=True)
+
+    _set_async_mode(vllm_model.model.llm_engine, False)
+    _stop_token_id(vllm_model.model.llm_engine, is_async=False)