diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 7f8ac0e31a..0f63516861 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -114,6 +114,7 @@ jobs: pytest -sv tests/singlecard/test_offline_inference.py pytest -sv tests/ops pytest -sv tests/compile + pytest -sv tests/singlecard --ignore=tests/singlecard/test_offline_inference.py else pytest -sv -k "QwQ" tests/multicard/test_offline_inference_distributed.py pytest -sv tests/ops @@ -127,6 +128,7 @@ jobs: if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then pytest -sv tests/singlecard/test_offline_inference.py pytest -sv tests/ops + pytest -sv tests/singlecard --ignore=tests/singlecard/test_offline_inference.py else pytest -sv -k "QwQ" tests/multicard/test_offline_inference_distributed.py pytest -sv -k "DeepSeek" tests/multicard/test_offline_inference_distributed.py diff --git a/requirements-dev.txt b/requirements-dev.txt index 4fb45d11ba..68c6cb03b9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,3 +9,4 @@ ray types-jsonschema xgrammar zmq +sentence_transformers diff --git a/tests/conftest.py b/tests/conftest.py index 78ffe8f4e6..3474c361f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,15 +18,24 @@ # import gc -from typing import List, Optional, Tuple, TypeVar, Union +import os +from typing import Any, List, Optional, Tuple, TypeVar, Union import numpy as np import pytest import torch +import torch.nn as nn +import torch.nn.functional as F from PIL import Image +from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, + BatchEncoding, BatchFeature) +from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm import LLM, SamplingParams -from vllm.config import TaskOption +from vllm.config import TaskOption, _get_and_verify_dtype +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt +from vllm.inputs.data import to_enc_dec_tuple_list from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import is_list_of @@ -45,6 +54,10 @@ _M = TypeVar("_M") +_TEST_DIR = os.path.dirname(__file__) +_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] +_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] + _PromptMultiModalInput = Union[List[_M], List[List[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] @@ -52,11 +65,453 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] +def _read_prompts(filename: str) -> list[str]: + with open(filename) as f: + prompts = f.readlines() + return prompts + + +@pytest.fixture +def example_prompts() -> list[str]: + prompts = [] + for filename in _TEST_PROMPTS: + prompts += _read_prompts(filename) + return prompts + + def cleanup_dist_env_and_memory(): destroy_model_parallel() destroy_distributed_environment() gc.collect() torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") + + +class HfRunner: + + def get_default_device(self): + from vllm.platforms import current_platform + + return ("cpu" if current_platform.is_cpu() else "npu") + + def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + + if device is None: + device = self.device + + if isinstance(x, 
dict): + return {k: self.wrap_device(v, device) for k, v in x.items()} + + if hasattr(x, "device") and x.device.type == device: + return x + + return x.to(device) + + def __init__( + self, + model_name: str, + dtype: str = "auto", + *, + model_kwargs: Optional[dict[str, Any]] = None, + is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, + skip_tokenizer_init: bool = False, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, + ) -> None: + self.model_name = model_name + + self.config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=True, + ) + self.device = self.get_default_device() + self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype) + + model_kwargs = model_kwargs if model_kwargs is not None else {} + model_kwargs.setdefault("torch_dtype", torch_dtype) + + if is_sentence_transformer: + # Lazy init required for AMD CI + from sentence_transformers import SentenceTransformer + + self.model = SentenceTransformer( + model_name, + device=self.device, + model_kwargs=model_kwargs, + trust_remote_code=True, + ) + elif is_cross_encoder: + # Lazy init required for AMD CI + from sentence_transformers import CrossEncoder + + self.model = CrossEncoder( + model_name, + device=self.device, + automodel_args=model_kwargs, + trust_remote_code=True, + ) + else: + model = auto_cls.from_pretrained( + model_name, + trust_remote_code=True, + **model_kwargs, + ) + + if (getattr(model, "quantization_method", None) != "bitsandbytes" + and len({p.device + for p in model.parameters()}) < 2): + model = model.to(self.device) + + self.model = model + + if not skip_tokenizer_init: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor # noqa: F401 + self.processor = AutoProcessor.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + if skip_tokenizer_init: + self.tokenizer = self.processor.tokenizer + + def get_inputs( + self, + prompts: list[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> list[Union[BatchFeature, BatchEncoding]]: + if images is not None: + assert len(prompts) == len(images) + + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + all_inputs: list[Union[BatchFeature, BatchEncoding]] = [] + for i, prompt in enumerate(prompts): + processor_kwargs: dict[str, Any] = { + "text": prompt, + "return_tensors": "pt", + } + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: + audio, sr = audio_tuple + processor_kwargs["audio"] = audio + processor_kwargs["sampling_rate"] = sr + + inputs = self.processor(**processor_kwargs) + if isinstance(inputs, BatchFeature): + inputs = inputs.to(dtype=self.dtype) + + all_inputs.append(inputs) + + return all_inputs + + def classify(self, prompts: list[str]) -> list[str]: + # output is final logits + all_inputs = self.get_inputs(prompts) + outputs = [] + for inputs in all_inputs: + output = self.model(**self.wrap_device(inputs)) + logits = output.logits.softmax(dim=-1)[0].tolist() + outputs.append(logits) + + 
return outputs + + def generate( + self, + prompts: list[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + **kwargs: Any, + ) -> list[tuple[list[list[int]], list[str]]]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + outputs: list[tuple[list[list[int]], list[str]]] = [] + for inputs in all_inputs: + output_ids = self.model.generate( + **self.wrap_device(inputs), + use_cache=True, + **kwargs, + ) + output_str = self.processor.batch_decode( + output_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + output_ids = output_ids.cpu().tolist() + outputs.append((output_ids, output_str)) + return outputs + + def generate_greedy( + self, + prompts: list[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + **kwargs: Any, + ) -> list[tuple[list[int], str]]: + outputs = self.generate(prompts, + do_sample=False, + max_new_tokens=max_tokens, + images=images, + videos=videos, + audios=audios, + **kwargs) + + return [(output_ids[0], output_str[0]) + for output_ids, output_str in outputs] + + def generate_beam_search( + self, + prompts: list[str], + beam_width: int, + max_tokens: int, + ) -> list[tuple[list[list[int]], list[str]]]: + outputs = self.generate(prompts, + do_sample=False, + max_new_tokens=max_tokens, + num_beams=beam_width, + num_return_sequences=beam_width) + for i in range(len(outputs)): + output_ids, output_str = outputs[i] + for j in range(len(output_ids)): + output_ids[j] = [ + x for x in output_ids[j] + if x != self.tokenizer.pad_token_id + ] + outputs[i] = (output_ids, output_str) + return outputs + + def generate_greedy_logprobs( + self, + prompts: list[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + **kwargs: Any, + ) -> list[list[torch.Tensor]]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + all_logprobs: list[list[torch.Tensor]] = [] + for inputs in all_inputs: + output = self.model.generate( + **self.wrap_device(inputs), + use_cache=True, + do_sample=False, + max_new_tokens=max_tokens, + output_hidden_states=True, + return_dict_in_generate=True, + **kwargs, + ) + seq_logprobs = self._hidden_states_to_seq_logprobs( + output.hidden_states) + all_logprobs.append(seq_logprobs) + return all_logprobs + + def _hidden_states_to_seq_logprobs( + self, + hidden_states: tuple[tuple[torch.Tensor, ...], ...], + ) -> list[torch.Tensor]: + output_embeddings = self.model.get_output_embeddings() + + seq_logprobs: list[torch.Tensor] = [] + for _, hidden_state in enumerate(hidden_states): + last_hidden_states = hidden_state[-1][0] + logits = torch.matmul( + last_hidden_states.to(output_embeddings.weight.device), + output_embeddings.weight.t(), + ) + if getattr(output_embeddings, "bias", None) is not None: + logits += output_embeddings.bias.unsqueeze(0) + logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) + seq_logprobs.append(logprobs) + + return seq_logprobs + + def _hidden_states_to_logprobs( + self, + hidden_states: tuple[tuple[torch.Tensor, ...], ...], + num_logprobs: int, + ) -> tuple[list[dict[int, float]], int]: + seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) + output_len = len(hidden_states) + + # convert to dict + seq_logprobs_lst: 
list[dict[int, float]] = [] + for tok_idx, tok_logprobs in enumerate(seq_logprobs): + # drop prompt logprobs + if tok_idx == 0: + tok_logprobs = tok_logprobs[-1, :].reshape(1, -1) + topk = tok_logprobs.topk(num_logprobs) + + tok_logprobs_dct = {} + for token_id, logprob in zip(topk.indices[0], topk.values[0]): + tok_logprobs_dct[token_id.item()] = logprob.item() + + seq_logprobs_lst.append(tok_logprobs_dct) + + return ( + seq_logprobs_lst, + output_len, + ) + + def generate_greedy_logprobs_limit( + self, + prompts: list[str], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + **kwargs: Any, + ) -> list[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + all_output_strs: list[str] = [] + + for inputs in all_inputs: + output = self.model.generate( + **self.wrap_device(inputs), + use_cache=True, + do_sample=False, + max_new_tokens=max_tokens, + output_hidden_states=True, + return_dict_in_generate=True, + **kwargs, + ) + + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(output.hidden_states, + num_logprobs) + + all_logprobs.append(seq_logprobs_lst) + seq_ids = output.sequences[0] + output_len = len(seq_logprobs_lst) + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + + def generate_encoder_decoder_greedy_logprobs_limit( + self, + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = None, + **kwargs: Any, + ) -> list[TokensTextLogprobs]: + ''' + Greedy logprobs generation for vLLM encoder/decoder models + ''' + + all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + all_output_strs: list[str] = [] + + for i, (encoder_prompt, decoder_prompt) in enumerate( + to_enc_dec_tuple_list(encoder_decoder_prompts)): + processor_kwargs: dict[str, Any] = { + "text": encoder_prompt, + "return_tensors": "pt", + } + if images is not None and images[i] is not None: + processor_kwargs["images"] = images[i] + + encoder_inputs = self.processor(**processor_kwargs) + encoder_inputs = self.wrap_device(encoder_inputs) + + if decoder_prompt is None: + decoder_input_ids = None + else: + decoder_inputs = self.tokenizer(decoder_prompt, + return_tensors="pt") + decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) + + output = self.model.generate( + decoder_input_ids=decoder_input_ids, + use_cache=True, + do_sample=False, + max_new_tokens=max_tokens, + output_hidden_states=True, + return_dict_in_generate=True, + **encoder_inputs, + **kwargs, + ) + + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(output.decoder_hidden_states, + num_logprobs) + + all_logprobs.append(seq_logprobs_lst) + seq_ids = output.sequences[0] + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, 
output_logprobs in outputs] + + def encode(self, prompts: list[str], *args, + **kwargs) -> list[list[torch.Tensor]]: + return self.model.encode(prompts, *args, **kwargs) + + def predict(self, prompts: list[list[str]]) -> torch.Tensor: + return self.model.predict(prompts, convert_to_tensor=True) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + del self.model + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="session") +def hf_runner(): + return HfRunner class VllmRunner: diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt new file mode 100644 index 0000000000..e1b97bc6ee --- /dev/null +++ b/tests/prompts/example.txt @@ -0,0 +1,8 @@ +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. +Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. +Compare and contrast artificial intelligence with human intelligence in terms of processing information. +Describe the basic components of a neural network and how it can be trained. +Write a short story about a robot that dreams for the first time. +Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. +Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' diff --git a/tests/prompts/summary.txt b/tests/prompts/summary.txt new file mode 100644 index 0000000000..2f947a264c --- /dev/null +++ b/tests/prompts/summary.txt @@ -0,0 +1 @@ +Subtitles: for our annual races at Knockhill Circuit.Today\'s racing comes from the Porsche Carrera Cup Great Britainand the Legends Cars Elite Cup with JLM.It\'s the latter who get us underway with their first race of the day,and joining me in the commentary box is Paul O\'Neill.First race of the day for the Legends.Jonty Norman has drawn pole position,with Matt Knight alongside.Marcus Pett on Row 2 with Daniel Pooley.Declan Burke is next up, and then Tyler Read, on Row 3.He\'s leading the rookie championship at the moment.Chris Needham on Row 4 with Luke Simmons.Andrew Rogerson and Gareth Sheridan on Row 5.Sixth row, Peter Barrable, with Charlie Budd.Row 7, Jack Parker, fourth in the championship right now.Nick Price is next to him.Will Gibson, who looks like he\'s out of the championship contention now,with Oli Schlup alongside.Then Ben McNeice and Flight Lieutenant Matt Isherwood.Robert Barrable, championship leader, he\'s on Row 10.Then Brent Bowie from Kieran Beattie and Nick Bridgeman.Mike Schlup on Row 12, followed by Ryan McLeish,who won the day overall yesterday.Mark Beaty, Row 13, with Andy Bird.Then it\'s Ben Higgins and Nathan Anthony.Connor Mills and Paul Musselle complete Row 15.And completing the grid is James Newbery.Here we go, with Race number 1 of the day,the final day of the first ever Legends Cars Elite Cup with JLM.And on the front row, it\'s Jonty Norman in grey,Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett,who goes left of shot in the gunmetal carto challenge for the lead.Marcus Pett, the man from Boston in Lincolnshire,goes through into lead position.Very definitely a fancied championship runnerbut hasn\'t quite had the rub of the green this weekend.And they all pile into McIntyre\'s for the first time.And this is where we look for driving standards.James Newbery brakes at the 
back.He\'s got Paul Musselle immediately in front of him.Those two had an interesting battle yesterdayinvolving a little bit of contact, I think,but they\'re both all right at the moment, as they clear the chicane for the first time.Marcus Pett is away.The difference you\'ll see in Legends Cars racing todayis that for this meeting,the bump drafting that we\'ve seen in the pasthas been ruled out for this round,and it\'s under review for the future.But look at the battle for second position, three wide,as Marcus Pett comes in front of the crowds here.Matt Knight on the inside, Dan Pooley on the outside in 32.Dan Pooley challenging for third. He had a strong day yesterday -he was up in the top ten, which was great to see.The man from March.That third car there, eclipsed at the moment,comes out of the slipstream.Dan repaired his own car after Croft,and that of Kieran Beaty,so I know Kieran wanted to thank him for that. He\'s been working hard.And Pooley side by side with Matt Knight.We\'ve got the 13, Chris Needham car, up there in the mix as well.The three top guys in the...Ryan McLeish getting very sideways there,the Scot in the 71 car.The first time we\'ve seen him on our ITV coverage.He\'s not a guest driver this week.I suppose you could technically call him a guest,but he\'s fully championship registeredand took a splendid win yesterday - overall win and race win.Overall on points.Sorry, Paul, gets a chance to get you in.That\'s Jack Parker!Oh, what\'s happened there?So, this was the start. They\'re all still warming the tyres up,ready for the lights to go green,which they do... around about now.And they get going.And then there was a car, wasn\'t there?Oh, I tell you what, that could\'ve ended up really nastyas it snaked up the grass.Yeah, I\'ll tell you what, the moment when the lights went outwas when Marcus Pett broke ranks.That was a very, very meticulous start from Marcus Pett.The blue car here is Tyler Read, top rookie,who looks like he\'s going down the inside of Daniel Pooley,so he\'s gonna make a space here.So, Dan Pooley has lost second position.It\'s Marcus Pett still out front. Matt Knight...I was saying to the drivers,"Don\'t go away if you\'re in the lead because you won\'t get any coverage." Pett\'s down the road, isn\'t he? Look at the gap he\'s got. Yeah.He\'s got three seconds. It\'s gonna be more than that.What I was quite concerned about was the damp part of the circuitdown at the hairpin, where you need to be down the inside of peopleto get the braking done,but these guys seem to be all respecting...Not track limits, but they\'re respecting each other around usbecause I was quite concerned about coming here,but this is quite synonymous with Legends racing at Knockhill.And look at this now. Knight has got...Look at that. I remember Marcus getting his first race win,which was at Snetterton years ago.It\'s always fantastic to see a first-time winner.And Tyler Read is giving him a great workout.Matt Knight back in third.It\'s between the top two at the moment. Oh! Tyler goes wide.He\'s throwing the car around.Marcus Pett, looking a little bit smoother in the 79,was very frustrated yesterday, but Read\'s all over him.Yeah, but look at this now.You\'ve got third, fourth, fifth and sixth.This is gonna be absolutely spectacular!Tyler Read\'s gone! What\'s gone on?!Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Is he gonna come back into it?Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. 
He runs out wide.Oli Schlup\'s coming through.Schlup hasn\'t had a win yet in Legends cars, so he\'s queueing up.They\'re coming onto the last lap.This could be a key moment for Oli Schlup,who\'s back in third in the K-Seal car.Across the line.Marcus Pett soaking up the pressure brilliantly so far.But does he need to be in front as they come onto the last lap?I don\'t know, but I think Read must have missed a gear,as someone\'s exited stage left.Look at that, back in the mix!It\'s now six for the lead. Can Pett hold on?Championship leader Robert Barrablehas come through from about three rows from the back,and he\'s at the back of the train.Barrable here is gonna extend his championship leadand start towards the front of the grid for Race 2.Barrable, the Irishman, he\'s there.The white car with the green and orange stripeson the nose cone of the car.But it\'s Marcus Pett out front at the moment... Oh!Matt Isherwood\'s rejoined at the back in the black and green.Isherwood\'s got back at them. Matt Knight\'s having a go.Along Railway Straight.Schlup would normally bump draft him. He can\'t do that on the rules.But look at Marcus Pett.Fairly wide-ish line in. Good defensive stuff from Pett.It\'s all about the run up to the hill now.And Marcus Pett is gonna take the win, I think.Here they come, up towards the line. Pett from Matt Knight.It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Third position goes to Oli Schlup, who is delighted with that.Then it was Tyler Read. Great race from him.Robert Barrable, though...Barrable, from 19th on the grid, without bump drafting,comes through into fifth placeahead of the excellent recovery from Flight Lieutenant Matt Isherwood.Dan Pooley seventh. Another great result for Dan Pooley.So much to take away from those last racing laps.Oh, and those last four lapsis exactly why we have these Legends on the TOCA package.That was exceptional.Marcus Pett looked like a dead cert not to finish first,but congratulations to you. That was brilliant.But Barrable, after exiting stage leftwhen he caught the back of everybody and got right up there...There\'s too much to talk about. Let\'s just talk about this guy.Pett, you are a legend, mate. Well done.Cracking. It is a lad and dad.Literally, Marcus and his dad, Robert, they look after the car.It is lad and dad. We hear that mentioned in other formulas,but genuinely, that is all it is.It is very difficult for drivers like that and teams like thatto come and race on this stage.It is a big thing. And he\'s such a smashing guy.And his dad as well. Really delighted with the win.Super stuff by Matt Knight. brilliant from Oli Schlup.Fantastic as well from Tyler Read.And on the front row,it\'s Jonty Norman in grey, Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett.Bit of a shemozzle at the back.Two cars hooked up, which is not good to see.Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. He runs out wide.Oli Schlup\'s coming through.And Marcus Pett is gonna take the win, I think. Pett from Matt Knight. 
It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Here\'s how they finished.Marcus Pett takes another win in the Legends Cars Elite Cup with JLM.READS INFOREADS INFOREADS INFOREADS INFOREADS INFOREADS INFOProblems in that race for Ryan McLeish, yesterday\'s winner.Charlie Budd in 30th.And the other driver having problems, obviously,from that first stoppage, Brent Bowie.Marcus, that was a tough racebecause there was a red flag in the middle of it.Actually, the first bit, you got away,but it was a full reset,and pressure throughout to the chequered flag.Yeah, definitely.We had an ideal start and managed to build up a lead early on,which was great, but when you\'re in that position,the last thing you want to see is a red flag. iming line at the end of lap one.So, Gus Burton leads the way.Big, big dive by Foster on the inside,to go back ahead of Wylie.He goes off the road and back on again.He\'s all sideways.And diving up on the outside line comes Ryan Ratcliffe.Wylie here battling with one of the Pro category cars,but behind him, all the Pro-Am opposition crawling all over him.Well, that was dramatic stuff, wasn\'t it?Round the outside of Turn 1, put Harry Foster in the wrong place.That was Max Bird going wide, number 44, the pink and blue car.So that\'s just haemorrhaged places in Pro-Am.And he\'s the... Oh, a puncture.There\'s somebody with a puncture. Is that Angus Whiteside? Possibly.Let\'s see.I think it is. And you\'ve got this damp patch on the inside,on the braking there, just at the final into the hairpin.This has been a dramatic start to this race for Porsches.Absolutely right.Coming up over the timing line, Gus Burton leads the way.Nine tenths of a second to the good.Big effort being made by Jason Lockwoodin the yellow and orange car in the background, look,to try to get up the inside line, then diving down towards Turn 1.Goes ahead of Oliver White, the very experienced Formula 4 champion.In the silver car, Oliver White, back into Carrera Cup.Remember, he did a full season last year.Good to have him back on the grid.As the cars clamber their way up over the kerb,through the chicane.But Gus Burton saying to everybody, "I\'m back." He leads.Yeah, a dramatic way for Gus Burton to come back to this championship.Remember, he started this year with Century Motorsport but then ducked out of the championship prior to Thruxton.He\'s still competing in the Supercup series with Fach Auto.As there in the pits, getting a new rear left tyre, is Angus Whiteside.But Gus Burton absolutely on it.Very quick in testing here during the week.They tested on Wednesday and on Friday.Gus Burton very quick in...And he\'s really enjoying life now.Back in the championship with the NAPA Racing UK supportand with a different team, Nick Tandy\'s JTR outfit.And he\'s done the fastest lap of the race, as he leads.He is not in the championship fight, but he wants to win races.Car off. It\'s Max Bird again.So, Max Bird, the Pro-Am championship leader,three times a winner in class this year,off the road and back on again.But that\'s gonna throw him way, way down the order.This race is going from bad to worse for him.It\'s just completely unfolded for poor Max Bird.That\'s the curse of having our camera on board, I think,but it\'s just unravelled after a great qualifying.Now, you were talking about Gus Burton\'s start,and it is going to be investigated after the race.OK. Well, it\'ll take a lot of camera action analysisto look at it. This is on board with Bird.Round Turn 1.All OK there. 
Very close... Goes to the outside.That\'s dangerous cos you can get knocked wide,and that\'s exactly what happens.The man he was trying to get past, Josh Stanton,who spent last night trackside at Cowdenbeath watching stock cars.I\'m not suggesting for a moment he\'s learnt how to defend,but he was enjoying himself, watching a different form of racing.I think all the best people were at Cowdenbeath, weren\'t they?Nick Tandy was, and others. Oh!As there, absolutely on the giddy limit, is Harry Foster,making his way in sixth place.Down towards the hairpin.He\'s dropped back from that leading quintet,but he\'s keeping Ross Wylie at bay.Ross Wylie, there, creeping into shot, leads now Pro-Amahead of Ryan Ratcliffe.And Josh Stanton is third in Pro-Am, last year\'s Am champion.Yeah, and Ross Wylie the only Scottish driver in the race. A lot of support for him,from local sponsors as well as the public.Buoyed by his recent run at the British Grand Prix at Supercup,and thoroughly loving racing at his home circuit, Ross Wylie.Track is nicely dry.There was some threats of possible rain.We had rain yesterday during qualifying.They actually only got one runon their slick tyres yesterday in qualifyingbefore the rain arrived, and that set the grid.So, Gus Burton\'s lead growing all the time.1.3 seconds now, that margin over Adam Smalley.As Max Bird tries to fight back in Pro-Am.Gets up the inside line there.So, that puts him ahead of David Stirling.So, he\'s split the second and third Am fightas he tries to recover.Yeah, but he\'s lost a lot of ground with that momenton the outside of McIntyre\'s.It\'s getting a lot darker overhead at Knockhill,even though there is a break in the cloud.A big effort there from the lapped car of Angus Whiteside.He\'s not fighting for position, he\'s trying to unlap himself.But just wonder whether we might get so f the right of McIntyre\'s,up towards Butcher\'s, then the chicane.And looking to try and maintain this 100% recordin the Team Parker Racing-run car in Am.Yeah. David Fairbrother in second place,but some 11 seconds behind in the Am category.But he will take another podium.His second in the championship, too, Justin Sherwood.The race leader 2.5 seconds to the good, Gus Burton.Other battles still to be resolved.What\'s going on in Pro-Am? Ross Wylie leads.He\'s fallen back behind Josh Malin overall. That was the move.Josh Malin through on the inside at the hairpin.Ross Wylie, in a sense, content to let that happen - gave him room -because that\'s not his battle, but what it does meanis that Ryan Ratcliffe, his class rival,is directly behind him.This is William Aspin versus Max Bird for sixth in Pro-Am.And a very determined Max Bird goes one side, get his nose chopped off.Will Aspin, the man from Florence, defends on the other side.They\'re absolutely together, almost touching.Here comes Max Bird.Oh, but he can\'t find a way through there.Angus Whiteside is now getting in on the act.Round the outside goes Max Bird, but they both take it wide,and through goes Angus Whiteside on the inside.Doesn\'t affect the race order.Whiteside unlaps himself from those two cars. Will Aspin stays ahead. Max Bird tries to fight back.Down towards Duffus Dip.Ignore the car in the lead of this battle packbecause it\'s not on the lead lap.But then Aspin under attack.Max Bird tries to get up alongside himfor the inside line coming into McIntyre\'s.He is on the inside, and he is ahead now.Yeah. 
And behind him, there was a car completely off on the grassafter Turn 1.So I do think that section of the track is a little slippery,for whatever reason. Maybe it just hasn\'t quite dried out.But this was a great battle between Max Bird and Will Aspin.So, drivers, in one or two cases,setting personal best lap times last time around,suggesting that the road is drying still.The cars are getting lighter on fuel anyway.Down at the hairpin comes the recovering Max Bird,as over the line goes Harry Foster, being chased by Josh Malin.Josh up into seventh overall.A top six could be on - he\'s only half a second back.Yeah, it\'s not far away, is it?And still plenty of laps left in this race.You probably noticed through that Turn 1the drivers are not riding the big kerb on the inside.That\'s because it\'s a new kerb that\'s been put in, actually,to raise the level of the kerbback to the level it was before the track got resurfaced twice.But with the resurfacing twice,it had raised the track surface by 80mm,and the drivers found they were, in previous years,able to use that kerb.Now? Not so much.So, there going through is Oliver Wight in the silver car,down towards the hairpin.Jason Lockwood ahead of him.Jason for EXCELR8, and he is running in 12 at the moment,which is potentially going to be his best finish of the year.It\'s been a tough season for Jason,but he could be on for his best results thus far.However, Gus Burton has rather dominated this,and look at the gap that he\'s pulled.Adam Smalley, as we suggested earlier,might be thinking about banking points,but it doesn\'t look as though he\'s been able to do anything at allabout that JTR car ahead.No. In terms of pure speed,he hasn\'t been able to threaten Gus Burton at all, has he? Gus Burton has led every race.As he\'s now passing David Fairbrotherat the back of the field.But he\'s had this race under control.But unfortunately, he\'s got this investigation after the racefor a possible false start hanging over him.And if, if, if anything is found, and it\'s a false start,normally that\'s a ten-second penalty,and he\'s not ten seconds ahead,so there is gonna be a postscript to this story, that\'s for sure.Now, this is Henry Dawes, Ollie Jacksoncoming through the chicane.Dawes goes wide, goes through the gravel,goes over the grass, loses a place,gets it all sideways, but just about saves it by the end of the straight.Yeah, nearly lost it on the wet grass.Oh. Harry Foster.This is passing David Fairbrother again, further back.So, this is Smalley versus Matty Graham for second place.So, this gap has come r. \n\n Your task is to create long detailed paragraph-by-paragraph summary. Detailed paragraph-by-paragraph summary of the text above: \ No newline at end of file diff --git a/tests/singlecard/embedding/__init__.py b/tests/singlecard/embedding/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/singlecard/embedding/test_embedding.py b/tests/singlecard/embedding/test_embedding.py new file mode 100644 index 0000000000..6601f4628e --- /dev/null +++ b/tests/singlecard/embedding/test_embedding.py @@ -0,0 +1,95 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/models/embedding/language/test_embedding.py +# +"""Compare the embedding outputs of HF and vLLM models. + +Run `pytest tests/singlecard/embedding/test_embedding.py`. +""" +import os +from typing import Any, Dict + +import pytest + +from tests.singlecard.embedding.utils import check_embeddings_close + +env = os.environ.copy() +# the current process might initialize cuda, +# to be safe, we should use spawn method +env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + +MODELSCOPE_CACHE = "/root/.cache/modelscope/hub/models/" + + +@pytest.mark.parametrize( + "model", + [ + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5"), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), + # pytest.param("intfloat/multilingual-e5-small"), + # pytest.param("iic/gte-Qwen2-7B-instruct"), + # # # [Decoder-only] + # pytest.param("BAAI/bge-multilingual-gemma2"), + # pytest.param("intfloat/e5-mistral-7b-instruct"), + # pytest.param("iic/gte-Qwen2-1.5B-instruct"), + # pytest.param("QwenCollection/Qwen2-7B-Instruct-embed-base"), + # # [Cross-Encoder] + # pytest.param("sentence-transformers/stsb-roberta-base-v2"), + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model, + dtype: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + m.setenv("PYTORCH_NPU_ALLOC_CONF", "max_split_size_mb:256") + vllm_extra_kwargs: Dict[str, Any] = {} + + # The example_prompts has ending "\n", for example: + # "Write a short story about a robot that dreams for the first time.\n" + # sentence_transformers will strip the input texts, see: + # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159 + # This makes the input_ids different between hf_model and vllm_model. + # So we need to strip the input texts to avoid test failing. + example_prompts = [str(s).strip() for s in example_prompts] + + with vllm_runner(model, + task="embed", + dtype=dtype, + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) + + with hf_runner(MODELSCOPE_CACHE + model, + dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_outputs = hf_model.encode(example_prompts) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/singlecard/embedding/test_scoring.py b/tests/singlecard/embedding/test_scoring.py new file mode 100644 index 0000000000..7fe49df5b7 --- /dev/null +++ b/tests/singlecard/embedding/test_scoring.py @@ -0,0 +1,241 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/models/embedding/language/test_scoring.py +# +"""Compare the scoring outputs of HF and vLLM models. + +Run `pytest tests/singlecard/embedding/test_scoring.py`. +""" +import math +import os + +import pytest +import torch +import torch.nn.functional as F + +env = os.environ.copy() +# the current process might initialize npu, +# to be safe, we should use spawn method +env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' +MODELSCOPE_CACHE = "/root/.cache/modelscope/hub/models/" + +MODELS = [ + # "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert + "BAAI/bge-reranker-v2-m3", # Roberta +] + +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module", params=MODELS) +def model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str, + monkeypatch: pytest.MonkeyPatch): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(model_name, + task="score", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + with hf_runner(MODELSCOPE_CACHE + model_name, + dtype=dtype, + is_cross_encoder=True) as hf_model: + hf_outputs = hf_model.predict([text_pair]).tolist() + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str, + monkeypatch: pytest.MonkeyPatch): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(model_name, + task="score", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + with hf_runner(MODELSCOPE_CACHE + model_name, + dtype=dtype, + is_cross_encoder=True) as hf_model: + hf_outputs = hf_model.predict(text_pairs).tolist() + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str, + monkeypatch: pytest.MonkeyPatch): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(model_name, + task="score", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + with hf_runner(MODELSCOPE_CACHE + model_name, + dtype=dtype, + is_cross_encoder=True) as hf_model: + 
hf_outputs = hf_model.predict(text_pairs).tolist() + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str, monkeypatch: pytest.MonkeyPatch): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + with hf_runner(MODELSCOPE_CACHE + emb_model_name, + dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str, monkeypatch: pytest.MonkeyPatch): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + with hf_runner(MODELSCOPE_CACHE + emb_model_name, + dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str, monkeypatch: pytest.MonkeyPatch): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + with hf_runner(MODELSCOPE_CACHE + emb_model_name, + dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/tests/singlecard/embedding/utils.py b/tests/singlecard/embedding/utils.py new file mode 100644 index 0000000000..39c1103de4 --- /dev/null +++ b/tests/singlecard/embedding/utils.py @@ -0,0 +1,56 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/models/embedding/utils.py +# + +from collections.abc import Sequence + +import torch +import torch.nn.functional as F + + +def check_embeddings_close( + *, + embeddings_0_lst: Sequence[list[float]], + embeddings_1_lst: Sequence[list[float]], + name_0: str, + name_1: str, + tol: float = 1e-3, +) -> None: + assert len(embeddings_0_lst) == len(embeddings_1_lst) + + for prompt_idx, (embeddings_0, embeddings_1) in enumerate( + zip(embeddings_0_lst, embeddings_1_lst)): + assert len(embeddings_0) == len(embeddings_1), ( + f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}") + + sim = F.cosine_similarity(torch.tensor(embeddings_0), + torch.tensor(embeddings_1), + dim=0) + + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{embeddings_0[:16]!r}" + f"\n{name_1}:\t{embeddings_1[:16]!r}") + + assert sim >= 1 - tol, fail_msg + + +def matryoshka_fy(tensor, dimensions): + tensor = torch.tensor(tensor) + tensor = tensor[..., :dimensions] + tensor = F.normalize(tensor, p=2, dim=1) + return tensor diff --git a/tests/singlecard/multi_step/__init__.py b/tests/singlecard/multi_step/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/singlecard/multi_step/test_correctness_llm.py b/tests/singlecard/multi_step/test_correctness_llm.py new file mode 100644 index 0000000000..3c7ef6b0cf --- /dev/null +++ b/tests/singlecard/multi_step/test_correctness_llm.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm/tests/multi_step/test_correctness_llm.py +# +"""Test the LLMEngine with multi-step-decoding""" +from typing import Optional + +import pytest + +from tests.model_utils import check_logprobs_close + +MODELS = [ + "Qwen/Qwen2.5-0.5B-Instruct", +] +NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps +NUM_PROMPTS = [10] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("max_tokens", [5]) +@pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) +@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) +@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) +def test_multi_step_llm_w_prompt_logprobs( + vllm_runner, + example_prompts, + model: str, + dtype: str, + tp_size: int, + max_tokens: int, + enforce_eager: bool, + num_scheduler_steps: int, + num_prompts: int, + num_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], +) -> None: + """Test prompt logprobs with multi-step scheduling via sync LLM Engine. + Set up a vLLM engine instance w/ single-step scheduling as a ground-truth + reference. + Prompt both engines with the same example prompts. + Validate: + * All generated logprobs are very close + Args: + vllm_runner: vLLM model runner fixture + example_prompts: test fixture providing example prompts + model: model under test (same for single- and multi-step engines) + dtype: tensor datatype for engine to utilize + tp_size: degree of tensor-parallelism + max_tokens: the maximum number of tokens to generate + enforce_eager: whether to enforce eager execution + num_scheduler_steps: for multi-step scheduling, GPU-side steps per + GPU -> CPU output transfer + num_prompts: number of example prompts under test + num_logprobs: corresponds to the `logprobs` argument to the OpenAI + completions endpoint; `None` -> no logprobs + num_prompt_logprobs: number of logprobs to return for each prompt token; + note that this argument is not supported by the + OpenAI completions endpoint. 
+ """ + + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts + + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) + + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) + + check_logprobs_close( + outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_baseline", + name_1="vllm_multistep", + ) diff --git a/tests/singlecard/prefix_caching/__init__.py b/tests/singlecard/prefix_caching/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/singlecard/prefix_caching/core_utils.py b/tests/singlecard/prefix_caching/core_utils.py new file mode 100644 index 0000000000..3713c23ae2 --- /dev/null +++ b/tests/singlecard/prefix_caching/core_utils.py @@ -0,0 +1,273 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/prefix_caching/core_utils.py +# + +import time +from collections import defaultdict +from collections.abc import Sequence as GenericSequence +from typing import Any, Optional + +from vllm import SamplingParams +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.inputs import EncoderDecoderInputs, token_inputs +from vllm.lora.request import LoRARequest +from vllm.sequence import (Logprob, Sequence, SequenceGroup, + SequenceGroupMetadata) + + +def create_dummy_prompt( + request_id: str, + prompt_length: int = -1, + block_size: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, + prompt_tokens: Optional[list[int]] = None, + min_tokens: int = 0, + max_tokens: int = 16, +) -> tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = prompt_length + + if prompt_tokens is None: + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". 
+ prompt_tokens = list(range(prompt_length)) + + prompt_str = " ".join([str(t) for t in prompt_tokens]) + prompt = Sequence( + int(request_id), + inputs=token_inputs(prompt_tokens, prompt=prompt_str), + block_size=block_size, + ) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[prompt], + arrival_time=time.time(), + sampling_params=SamplingParams(max_tokens=max_tokens, + min_tokens=min_tokens), + lora_request=lora_request, + ) + + return prompt, seq_group + + +def create_dummy_lora_sequence(request_id: int, token_ids: list[int], + block_size: int, lora_int_id: int) -> Sequence: + return Sequence(seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + lora_request=LoRARequest(lora_name="dummy", + lora_path="/dummy", + lora_int_id=lora_int_id)) + + +def create_dummy_sequence(request_id: int, token_ids: list[int], + block_size: int) -> Sequence: + return Sequence( + seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + ) + + +def create_dummy_prompt_encoder_decoder( + request_id: str, + decoder_prompt_length: int, + encoder_prompt_length: int, + block_size: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, +) -> tuple[Sequence, Sequence, SequenceGroup]: + if not block_size: + block_size = decoder_prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". Note that the prompt string + # doesn't actually match the tokens + decoder_prompt_tokens = list(range(decoder_prompt_length)) + decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) + encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) + + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), + } + + decoder_prompt = Sequence(int(request_id), + inputs=inputs["decoder"], + block_size=block_size) + + encoder_prompt = Sequence(int(request_id), + inputs=inputs["encoder"], + block_size=block_size) + + seq_group = SequenceGroup(request_id=request_id, + seqs=[decoder_prompt], + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) + + return decoder_prompt, encoder_prompt, seq_group + + +def create_seq_group( + seq_prompt_len: int = 1024, + seq_output_lens: GenericSequence[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: + + assert len(seq_output_lens) > 0 + + if sampling_params is None: + sampling_params = SamplingParams() + + prompt_token_ids = [0] * seq_prompt_len + + seqs: list[Sequence] = [] + for seq_id_offset, output_len in enumerate(seq_output_lens): + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=token_inputs(prompt_token_ids), + block_size=16, + ) + + for i in range(output_len): + seq.append_token_id( + token_id=i, + logprobs={i: Logprob(0.0)}, + ) + seqs.append(seq) + + seq_group = SequenceGroup( + request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + ) + + return seq_group + + +def create_seq_group_encoder_decoder( + seq_prompt_len: int = 1024, + seq_output_lens: GenericSequence[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: + + assert len(seq_output_lens) > 0 + + if sampling_params is None: 
+ sampling_params = SamplingParams() + + prompt_token_ids = [0] * seq_prompt_len + + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), + } + + seqs = [] + for seq_id_offset, output_len in enumerate(seq_output_lens): + # Construct decoder input sequences + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) + + for i in range(output_len): + seq.append_token_id( + token_id=i, + logprobs={i: Logprob(0.0)}, + ) + seqs.append(seq) + + # Encoder input sequence + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) + + return SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) + + +def round_up_to_next_block(seq_len: int, block_size: int) -> int: + return (seq_len + block_size - 1) // block_size + + +# Helper functions for scheduler tests + + +def get_sequence_groups(scheduler_output): + return [s.seq_group for s in scheduler_output.scheduled_seq_groups] + + +def append_new_token(out, token_id: int): + seq_groups = get_sequence_groups(out) + for seq_group in seq_groups: + for seq in seq_group.get_seqs(): + seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + +def schedule_and_update_computed_tokens(scheduler): + metas, out, _ = scheduler.schedule() + for s in out.scheduled_seq_groups: + s.seq_group.update_num_computed_tokens(s.token_chunk_size) + return metas, out + + +def append_new_token_seq(seq: Sequence, token_id: int): + seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + +def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int): + seq_group.update_num_computed_tokens(token_chunk_size) + for seq in seq_group.get_seqs(): + seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + +class SchedulerProxy: + """ + A proxy class to forward calls to the scheduler. + """ + + def __init__(self, scheduler: Scheduler): + self.scheduler_ = scheduler + self.call_history: dict[str, list[Any]] = defaultdict(list) + + def __getattr__(self, name: str) -> Any: + + def wrapper(*args, **kwargs): + result = getattr(self.scheduler_, name)(*args, **kwargs) + self.call_history[name].append((args, kwargs, result)) + return result + + return wrapper + + def last_schedule_ret( + self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: + _, _, ret = self.call_history["schedule"][-1] + return ret diff --git a/tests/singlecard/prefix_caching/test_prefix_caching.py b/tests/singlecard/prefix_caching/test_prefix_caching.py new file mode 100644 index 0000000000..0423644cbf --- /dev/null +++ b/tests/singlecard/prefix_caching/test_prefix_caching.py @@ -0,0 +1,222 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm/tests/prefix_caching/test_prefix_caching.py
+#
+"""Compare the outputs with and without prefix caching.
+
+Run `pytest tests/singlecard/prefix_caching/test_prefix_caching.py`.
+"""
+
+from __future__ import annotations
+
+import pytest
+from vllm import SamplingParams, TokensPrompt
+from vllm.core.scheduler import Scheduler
+from vllm.engine.llm_engine import LLMEngine
+
+from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal
+from tests.prefix_caching.core_utils import SchedulerProxy, create_dummy_prompt
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    This module relies on V0 internals, so set VLLM_USE_V1=0.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
+MODELS = [
+    "Qwen/Qwen2.5-0.5B-Instruct",
+]
+
+UNSTABLE_PROMPT_SEQUENCE = [
+    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
+    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
+    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
+    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
+    ([0] * 588) + ([8] * 1539),
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("cached_position", [0, 1])
+@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
+@pytest.mark.parametrize("block_size", [16])
+def test_mixed_requests(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    cached_position: int,
+    enable_chunked_prefill: bool,
+    block_size: int,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """
+    Test the case where some sequences hit the prefix cache and
+    others do not. The cached position determines where the cached
+    sequence sits within the batch of prefills.
+    """
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    cached_prompt = example_prompts[cached_position]
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enable_prefix_caching=True,
+            enable_chunked_prefill=enable_chunked_prefill,
+            block_size=block_size,
+    ) as vllm_model:
+        # Run the first prompt so the cache is populated
+        vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
+
+        # Run all the prompts
+        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
+        req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
+
+        # Verify the number of cached tokens
+        for i in range(len(req_outputs)):
+            if i == cached_position:
+                expected_num_cached_tokens = (
+                    len(req_outputs[i].prompt_token_ids) //
+                    block_size) * block_size
+            else:
+                expected_num_cached_tokens = 0
+            assert (
+                req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
+
+        vllm_outputs = [(
+            output.prompt_token_ids + list(output.outputs[0].token_ids),
+            output.prompt + output.outputs[0].text,
+        ) for output in req_outputs]
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+def test_unstable_prompt_sequence(
+    vllm_runner,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    with vllm_runner(
+            "Qwen/Qwen2.5-0.5B-Instruct",
+            enable_chunked_prefill=True,
+            enable_prefix_caching=True,
+            max_model_len=4096,
+    ) as vllm_model:
+        for prompt in UNSTABLE_PROMPT_SEQUENCE:
+            vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
+                                SamplingParams(max_tokens=1))
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_fully_cached_prefill_needs_uncached_token(model):
+    block_size = 128
+    max_num_batched_tokens = 16
+    num_output_tokens = 5
+    # Make a vLLM engine
+    runner = VllmRunner(
+        model_name=model,
+        gpu_memory_utilization=0.7,
+        enable_chunked_prefill=True,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        block_size=block_size,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_num_seqs=max_num_batched_tokens,
+    )
+    engine: LLMEngine = runner.model.llm_engine
+
+    scheduler: Scheduler = SchedulerProxy(engine.scheduler[0])  # type: ignore
+    engine.scheduler[0] = scheduler
+
+    # seqA
+    seqA_tokens = list(range(2 * block_size))
+    seqA, seq_groupA = create_dummy_prompt(
+        request_id="0",
+        prompt_tokens=seqA_tokens,
+        max_tokens=num_output_tokens,
+        block_size=block_size,
+    )
+
+    scheduler.add_seq_group(seq_groupA)
+
+    assert seqA.data.get_num_computed_tokens() == 0
+
+    # Prefill seqA
+    while not seqA.is_finished():
+        engine.step()
+
+    # seqB
+    seqB_tokens = [t + 1 for t in seqA_tokens]  # shift by 1
+    seqB, seq_groupB = create_dummy_prompt(
+        request_id="1",
+        prompt_tokens=seqB_tokens,
+        max_tokens=num_output_tokens,
+        block_size=block_size,
+    )
+
+    # seqC is the same as seqA
+    seqC, seq_groupC = create_dummy_prompt(
+        request_id="2",
+        prompt_tokens=seqA_tokens,
+        max_tokens=num_output_tokens,
+        block_size=block_size,
+    )
+
+    scheduler.add_seq_group(seq_groupB)
+    scheduler.add_seq_group(seq_groupC)
+
+    # Even though seqC is fully cached, it should not be prefilled, since we
+    # require at least 1 uncached token per prefill.
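+    # The SchedulerProxy wrapper records every schedule() call, so after each
+    # engine.step() the assertions below can inspect the most recent
+    # SchedulerOutputs via last_schedule_ret().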
+    engine.step()
+
+    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
+    assert len(sched_out.scheduled_seq_groups) == 1
+    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
+            seq_groupB.request_id)
+    assert (sched_out.scheduled_seq_groups[0].token_chunk_size ==
+            max_num_batched_tokens)
+
+    # Once seqB is finished, seqC can be prefilled.
+    while not seqB.is_finished():
+        engine.step()
+        sched_metas, sched_out, _ = scheduler.last_schedule_ret()
+        assert len(sched_out.scheduled_seq_groups) == 1
+        assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
+                seq_groupB.request_id)
+
+    engine.step()
+    sched_metas, sched_out, _ = scheduler.last_schedule_ret()
+    assert len(sched_out.scheduled_seq_groups) == 1
+    assert (sched_out.scheduled_seq_groups[0].seq_group.request_id ==
+            seq_groupC.request_id)
+    assert sched_out.scheduled_seq_groups[0].token_chunk_size == len(
+        seqA_tokens)
diff --git a/tests/singlecard/test_chunk_prefill.py b/tests/singlecard/test_chunk_prefill.py
new file mode 100644
index 0000000000..533fe8ebe5
--- /dev/null
+++ b/tests/singlecard/test_chunk_prefill.py
@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/models/test_chunked_prefill.py
+#
+"""Test e2e chunked prefill.
+
+Chunked prefill is enabled by enable_chunked_prefill=True.
+If the prefill size exceeds max_num_batched_tokens,
+prefill requests are split into chunks.
+
+Run `pytest tests/singlecard/test_chunk_prefill.py`.
+"""
+import pytest
+
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("enforce_eager", [True])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset the distributed env properly. Use a value > 1 only when testing locally.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    enforce_eager: bool,
+    tensor_parallel_size: int,
+) -> None:
+
+    max_num_seqs = chunked_prefill_token_size
+    max_num_batched_tokens = chunked_prefill_token_size
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=True,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    print(vllm_outputs)
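
For reference, the options exercised through vllm_runner/VllmRunner in these tests are ordinary vLLM engine arguments, so the same behaviour can be reproduced with the plain offline LLM API. The sketch below is illustrative only and not part of this patch; the model name and token budget are simply reused from the tests above.

from vllm import LLM, SamplingParams

# Illustrative sketch (not part of the patch): enable chunked prefill and
# prefix caching directly on the offline LLM entry point.
llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    dtype="half",
    enforce_eager=True,
    enable_chunked_prefill=True,    # prefills larger than the budget are split
    max_num_batched_tokens=16,      # per-step token budget that forces chunking
    enable_prefix_caching=True,     # reuse KV-cache blocks shared across prompts
    block_size=16,
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)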