Skip to content

Failure while trying to start vLLM on latest version of ART #97

@corbt

Description

@corbt

I'm getting the stack trace below while trying to run the latest version of ART with a LocalBackend on an H100. I suspect a dependency version mismatch is causing this failure, although I'm unsure exactly how to diagnose it.

(roflbot-train-002, pid=2251) INFO 05-02 08:26:39 model_runner.py:1115] Loading model weights took 10.6007 GB
(roflbot-train-002, pid=2251) INFO 05-02 08:26:39 punica_selector.py:18] Using PunicaWrapperGPU.
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/core.py", line 34, in wrapper
(roflbot-train-002, pid=2251)     return fn(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/core.py", line 1914, in load
(roflbot-train-002, pid=2251)     return semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy,
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/semantic.py", line 1142, in load
(roflbot-train-002, pid=2251)     if ptr.type.is_ptr() and ptr.type.element_ty.is_block():
(roflbot-train-002, pid=2251) AttributeError: 'tuple_type' object has no attribute 'is_ptr'
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 34:22:
(roflbot-train-002, pid=2251)         SPLIT_K: Parameter signifying parallelism in the K dimension. 
(roflbot-train-002, pid=2251)         CAST_TYPE: if True, cast the values from the A matrix to the B
(roflbot-train-002, pid=2251)           matrix dtype.
(roflbot-train-002, pid=2251)         b_dtype: datatype of the B matrix
(roflbot-train-002, pid=2251)     """
(roflbot-train-002, pid=2251)     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
(roflbot-train-002, pid=2251)     for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)):
(roflbot-train-002, pid=2251)         if EVEN_K:
(roflbot-train-002, pid=2251)             tiled_a = tl.load(a_ptr)
(roflbot-train-002, pid=2251)             tiled_b = tl.load(b_ptr)
(roflbot-train-002, pid=2251)         else:
(roflbot-train-002, pid=2251)             tiled_a = tl.load(a_ptr,
(roflbot-train-002, pid=2251)                       ^
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 78:18:
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     # Identify A and B block pointers
(roflbot-train-002, pid=2251)     offset_k = tl.arange(0, BLOCK_K)
(roflbot-train-002, pid=2251)     a_ptr = (cur_input_ptr + ram[:, None] * input_d1_stride +
(roflbot-train-002, pid=2251)              offset_k[None, :] * input_d2_stride, )
(roflbot-train-002, pid=2251)     b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index +
(roflbot-train-002, pid=2251)              offset_k[:, None] * cur_lora_d2_stride +
(roflbot-train-002, pid=2251)              rbn[None, :] * cur_lora_d1_stride)
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     # Compute the block matrix product.
(roflbot-train-002, pid=2251)     SPLIT_K = 1
(roflbot-train-002, pid=2251)     accumulator = mm_k(a_ptr, b_ptr, input_d2_stride, cur_lora_d2_stride,
(roflbot-train-002, pid=2251)                   ^
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth_zoo/vllm_utils.py", line 1024, in load_vllm
(roflbot-train-002, pid=2251)     llm = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/state.py", line 75, in _from_engine_args
(roflbot-train-002, pid=2251)     return from_engine_args(engine_args, *args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
(roflbot-train-002, pid=2251)     engine = cls(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
(roflbot-train-002, pid=2251)     self.engine = self._engine_class(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
(roflbot-train-002, pid=2251)     super().__init__(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
(roflbot-train-002, pid=2251)     self._initialize_kv_caches()
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 421, in _initialize_kv_caches
(roflbot-train-002, pid=2251)     self.model_executor.determine_num_available_blocks())
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 102, in determine_num_available_blocks
(roflbot-train-002, pid=2251)     results = self.collective_rpc("determine_num_available_blocks")
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
(roflbot-train-002, pid=2251)     answer = run_method(self.driver_worker, method, args, kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/utils.py", line 2196, in run_method
(roflbot-train-002, pid=2251)     return func(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251)     return func(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/worker.py", line 229, in determine_num_available_blocks
(roflbot-train-002, pid=2251)     self.model_runner.profile_run()
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/multi_step_model_runner.py", line 669, in profile_run
(roflbot-train-002, pid=2251)     return self._base_model_runner.profile_run()
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251)     return func(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1235, in profile_run
(roflbot-train-002, pid=2251)     self._dummy_run(max_num_batched_tokens, max_num_seqs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1346, in _dummy_run
(roflbot-train-002, pid=2251)     self.execute_model(model_input, kv_caches, intermediate_tensors)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251)     return func(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1724, in execute_model
(roflbot-train-002, pid=2251)     hidden_or_intermediate_states = model_executable(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251)     return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251)     return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 486, in forward
(roflbot-train-002, pid=2251)     hidden_states = self.model(input_ids, positions, kv_caches,
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 172, in __call__
(roflbot-train-002, pid=2251)     return self.forward(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 348, in forward
(roflbot-train-002, pid=2251)     hidden_states, residual = layer(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251)     return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251)     return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 247, in forward
(roflbot-train-002, pid=2251)     hidden_states = self.self_attn(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251)     return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251)     return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 176, in forward
(roflbot-train-002, pid=2251)     qkv, _ = self.qkv_proj(hidden_states)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251)     return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251)     return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/layers.py", line 523, in forward
(roflbot-train-002, pid=2251)     output_parallel = self.apply(input_, bias)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/layers.py", line 398, in apply
(roflbot-train-002, pid=2251)     self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 266, in add_lora_linear
(roflbot-train-002, pid=2251)     self.add_expand(y,
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 165, in add_expand
(roflbot-train-002, pid=2251)     self._apply_expand_prefill(y,
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in _apply_expand_prefill
(roflbot-train-002, pid=2251)     sgmv_expand(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/_ops.py", line 1116, in __call__
(roflbot-train-002, pid=2251)     return self._op(*args, **(kwargs or {}))
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251)     return func(*args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/ops/triton_ops/sgmv_expand.py", line 192, in _sgmv_expand
(roflbot-train-002, pid=2251)     _sgmv_expand_kernel[grid](
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/runtime/jit.py", line 347, in <lambda>
(roflbot-train-002, pid=2251)     return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/runtime/jit.py", line 569, in run
(roflbot-train-002, pid=2251)     kernel = self.compile(src, target=target, options=options.__dict__)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/compiler/compiler.py", line 278, in compile
(roflbot-train-002, pid=2251)     module = src.make_ir(options, codegen_fns, module_map, context)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/compiler/compiler.py", line 81, in make_ir
(roflbot-train-002, pid=2251)     return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 63:4:
(roflbot-train-002, pid=2251)     lora_index = tl.load(lora_indices + cur_batch)
(roflbot-train-002, pid=2251)     if lora_index == -1:
(roflbot-train-002, pid=2251)         return
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     m_offset = tl.load(b_seq_start_loc + cur_batch)
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     cta_m_len = min(BLOCK_M, M - (pid_m * BLOCK_M))
(roflbot-train-002, pid=2251)     cta_m_offset = m_offset + (pid_m * BLOCK_M)
(roflbot-train-002, pid=2251)     offset_m = tl.arange(0, BLOCK_M)
(roflbot-train-002, pid=2251)     ram = cta_m_offset + tl.max_contiguous(
(roflbot-train-002, pid=2251)         tl.multiple_of(offset_m % cta_m_len, BLOCK_M), BLOCK_M)
(roflbot-train-002, pid=2251)     do_expand_kernel(
(roflbot-train-002, pid=2251)     ^
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) During handling of the above exception, another exception occurred:
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/train_roflbot/train.py", line 108, in <module>
(roflbot-train-002, pid=2251)     asyncio.run(run_training(model))
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/nest_asyncio.py", line 30, in run
(roflbot-train-002, pid=2251)     return loop.run_until_complete(task)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/nest_asyncio.py", line 98, in run_until_complete
(roflbot-train-002, pid=2251)     return f.result()
(roflbot-train-002, pid=2251)   File "/root/miniconda3/lib/python3.10/asyncio/futures.py", line 201, in result
(roflbot-train-002, pid=2251)     raise self._exception.with_traceback(self._exception_tb)
(roflbot-train-002, pid=2251)   File "/root/miniconda3/lib/python3.10/asyncio/tasks.py", line 234, in __step
(roflbot-train-002, pid=2251)     result = coro.throw(exc)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/train_roflbot/train.py", line 25, in run_training
(roflbot-train-002, pid=2251)     await model.register(backend)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/model.py", line 176, in register
(roflbot-train-002, pid=2251)     base_url, api_key = await backend._prepare_backend_for_training(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/backend.py", line 193, in _prepare_backend_for_training
(roflbot-train-002, pid=2251)     await service.start_openai_server(config=config)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/mp_actors/traceback.py", line 27, in async_wrapper
(roflbot-train-002, pid=2251)     raise e.with_traceback(streamlined_traceback())
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/service.py", line 51, in start_openai_server
(roflbot-train-002, pid=2251)     self.state.trainer.save_model(lora_path)
(roflbot-train-002, pid=2251)   File "/root/miniconda3/lib/python3.10/functools.py", line 981, in __get__
(roflbot-train-002, pid=2251)     val = self.func(instance)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/service.py", line 39, in state
(roflbot-train-002, pid=2251)     return ModelState(self.config)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/state.py", line 80, in __init__
(roflbot-train-002, pid=2251)     unsloth.FastLanguageModel.from_pretrained(**config.get("init_args", {})),
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/loader.py", line 363, in from_pretrained
(roflbot-train-002, pid=2251)     model, tokenizer = dispatch_model.from_pretrained(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/qwen2.py", line 87, in from_pretrained
(roflbot-train-002, pid=2251)     return FastLlamaModel.from_pretrained(
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/llama.py", line 1819, in from_pretrained
(roflbot-train-002, pid=2251)     llm = load_vllm(**load_vllm_kwargs)
(roflbot-train-002, pid=2251)   File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth_zoo/vllm_utils.py", line 1051, in load_vllm
(roflbot-train-002, pid=2251)     raise RuntimeError(error)
(roflbot-train-002, pid=2251) RuntimeError: at 63:4:
(roflbot-train-002, pid=2251)     lora_index = tl.load(lora_indices + cur_batch)
(roflbot-train-002, pid=2251)     if lora_index == -1:
(roflbot-train-002, pid=2251)         return
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     m_offset = tl.load(b_seq_start_loc + cur_batch)
(roflbot-train-002, pid=2251) 
(roflbot-train-002, pid=2251)     cta_m_len = min(BLOCK_M, M - (pid_m * BLOCK_M))
(roflbot-train-002, pid=2251)     cta_m_offset = m_offset + (pid_m * BLOCK_M)
(roflbot-train-002, pid=2251)     offset_m = tl.arange(0, BLOCK_M)
(roflbot-train-002, pid=2251)     ram = cta_m_offset + tl.max_contiguous(
(roflbot-train-002, pid=2251)         tl.multiple_of(offset_m % cta_m_len, BLOCK_M), BLOCK_M)
(roflbot-train-002, pid=2251)     do_expand_kernel(
(roflbot-train-002, pid=2251)     ^

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions