diff --git a/tests/unit_tests/worker/test_hpu_input_batch.py b/tests/unit_tests/worker/test_hpu_input_batch.py
index 3ea19e11..1beba5a7 100644
--- a/tests/unit_tests/worker/test_hpu_input_batch.py
+++ b/tests/unit_tests/worker/test_hpu_input_batch.py
@@ -15,6 +15,7 @@
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
 from vllm_gaudi.v1.worker.hpu_input_batch import InputBatch, CachedRequestState
 
@@ -37,7 +38,7 @@ def _compare_objs(obj1, obj2, skip: Sequence = ("logitsprocs", "batch_update_bui
 
         is_same = False
         if isinstance(a, torch.Tensor):
-            if (a.numel() == 0 or b.numel() == 0):
+            if a.numel() == 0 or b.numel() == 0:
                 is_same = (a.numel() == 0 and b.numel() == 0)
             elif torch.allclose(a, b):
                 is_same = True
@@ -53,6 +54,8 @@ def _compare_objs(obj1, obj2, skip: Sequence = ("logitsprocs", "batch_update_bui
             is_same = True  # if we make it here must be same
         elif a == b:
             is_same = True
+        elif isinstance(a, CpuGpuBuffer):
+            is_same = np.allclose(a.np, b.np) and torch.allclose(a.gpu, b.gpu)
         assert is_same, f"Attribute {attr_name} is different"\
             f" in {obj1} and {obj2}: {a} != {b}"
 
diff --git a/tests/unit_tests/worker/test_hpu_model_runner.py b/tests/unit_tests/worker/test_hpu_model_runner.py
index cd8daf86..1c0cc2b3 100644
--- a/tests/unit_tests/worker/test_hpu_model_runner.py
+++ b/tests/unit_tests/worker/test_hpu_model_runner.py
@@ -157,7 +157,7 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
     if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids[0]):
         return False
     num_blocks = block_table.num_blocks_per_row[req_index]
-    return (block_table.block_table_np[req_index, :num_blocks] == req_state.block_ids[0]).all()
+    return (block_table.block_table.np[req_index, :num_blocks] == req_state.block_ids[0]).all()
 
 
 def test_update_states_new_request(model_runner, dist_init):
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 51487653..5f49776b 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -43,7 +43,7 @@
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.sampling_params import SamplingType
-from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv, is_pin_memory_available, LazyLoader)
 from vllm_gaudi.utils import (HPUCompileConfig, is_fake_hpu, async_h2d_copy)
 from vllm_gaudi.v1.attention.backends.hpu_attn import HPUAttentionMetadataV1
@@ -728,9 +728,7 @@ def __init__(
             logger.info("Bucketing is OFF.")
         self._PAD_SLOT_ID = -1
         self._PAD_BLOCK_ID = -1
-        self._tokenizer = init_tokenizer_from_configs(model_config=vllm_config.model_config,
-                                                      scheduler_config=vllm_config.scheduler_config,
-                                                      lora_config=vllm_config.lora_config).tokenizer
+        self._tokenizer = init_tokenizer_from_configs(model_config=vllm_config.model_config)
 
         # TODO(madamczyk-intel): add a knob for that
         # TODO(madamczyk-intel): debug why increasing it lowers acc