-
Notifications
You must be signed in to change notification settings - Fork 599
Closed
Description
I'm getting the stack trace below when trying to run the latest version of ART with a LocalBackend on an H100. I suspect a dependency version mismatch is causing this failure, but I'm not sure how to diagnose it.
(roflbot-train-002, pid=2251) INFO 05-02 08:26:39 model_runner.py:1115] Loading model weights took 10.6007 GB
(roflbot-train-002, pid=2251) INFO 05-02 08:26:39 punica_selector.py:18] Using PunicaWrapperGPU.
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/core.py", line 34, in wrapper
(roflbot-train-002, pid=2251) return fn(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/core.py", line 1914, in load
(roflbot-train-002, pid=2251) return semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy,
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/language/semantic.py", line 1142, in load
(roflbot-train-002, pid=2251) if ptr.type.is_ptr() and ptr.type.element_ty.is_block():
(roflbot-train-002, pid=2251) AttributeError: 'tuple_type' object has no attribute 'is_ptr'
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 34:22:
(roflbot-train-002, pid=2251) SPLIT_K: Parameter signifying parallelism in the K dimension.
(roflbot-train-002, pid=2251) CAST_TYPE: if True, cast the values from the A matrix to the B
(roflbot-train-002, pid=2251) matrix dtype.
(roflbot-train-002, pid=2251) b_dtype: datatype of the B matrix
(roflbot-train-002, pid=2251) """
(roflbot-train-002, pid=2251) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
(roflbot-train-002, pid=2251) for k in range(tl.cdiv(K, BLOCK_K * SPLIT_K)):
(roflbot-train-002, pid=2251) if EVEN_K:
(roflbot-train-002, pid=2251) tiled_a = tl.load(a_ptr)
(roflbot-train-002, pid=2251) tiled_b = tl.load(b_ptr)
(roflbot-train-002, pid=2251) else:
(roflbot-train-002, pid=2251) tiled_a = tl.load(a_ptr,
(roflbot-train-002, pid=2251) ^
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 78:18:
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) # Identify A and B block pointers
(roflbot-train-002, pid=2251) offset_k = tl.arange(0, BLOCK_K)
(roflbot-train-002, pid=2251) a_ptr = (cur_input_ptr + ram[:, None] * input_d1_stride +
(roflbot-train-002, pid=2251) offset_k[None, :] * input_d2_stride, )
(roflbot-train-002, pid=2251) b_ptr = (cur_lora_ptr + cur_lora_d0_stride * lora_index +
(roflbot-train-002, pid=2251) offset_k[:, None] * cur_lora_d2_stride +
(roflbot-train-002, pid=2251) rbn[None, :] * cur_lora_d1_stride)
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) # Compute the block matrix product.
(roflbot-train-002, pid=2251) SPLIT_K = 1
(roflbot-train-002, pid=2251) accumulator = mm_k(a_ptr, b_ptr, input_d2_stride, cur_lora_d2_stride,
(roflbot-train-002, pid=2251) ^
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) The above exception was the direct cause of the following exception:
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth_zoo/vllm_utils.py", line 1024, in load_vllm
(roflbot-train-002, pid=2251) llm = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/state.py", line 75, in _from_engine_args
(roflbot-train-002, pid=2251) return from_engine_args(engine_args, *args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
(roflbot-train-002, pid=2251) engine = cls(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
(roflbot-train-002, pid=2251) self.engine = self._engine_class(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
(roflbot-train-002, pid=2251) super().__init__(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
(roflbot-train-002, pid=2251) self._initialize_kv_caches()
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 421, in _initialize_kv_caches
(roflbot-train-002, pid=2251) self.model_executor.determine_num_available_blocks())
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 102, in determine_num_available_blocks
(roflbot-train-002, pid=2251) results = self.collective_rpc("determine_num_available_blocks")
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
(roflbot-train-002, pid=2251) answer = run_method(self.driver_worker, method, args, kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/utils.py", line 2196, in run_method
(roflbot-train-002, pid=2251) return func(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251) return func(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/worker.py", line 229, in determine_num_available_blocks
(roflbot-train-002, pid=2251) self.model_runner.profile_run()
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/multi_step_model_runner.py", line 669, in profile_run
(roflbot-train-002, pid=2251) return self._base_model_runner.profile_run()
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251) return func(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1235, in profile_run
(roflbot-train-002, pid=2251) self._dummy_run(max_num_batched_tokens, max_num_seqs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1346, in _dummy_run
(roflbot-train-002, pid=2251) self.execute_model(model_input, kv_caches, intermediate_tensors)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251) return func(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1724, in execute_model
(roflbot-train-002, pid=2251) hidden_or_intermediate_states = model_executable(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251) return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251) return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 486, in forward
(roflbot-train-002, pid=2251) hidden_states = self.model(input_ids, positions, kv_caches,
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 172, in __call__
(roflbot-train-002, pid=2251) return self.forward(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 348, in forward
(roflbot-train-002, pid=2251) hidden_states, residual = layer(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251) return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251) return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 247, in forward
(roflbot-train-002, pid=2251) hidden_states = self.self_attn(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251) return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251) return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py", line 176, in forward
(roflbot-train-002, pid=2251) qkv, _ = self.qkv_proj(hidden_states)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
(roflbot-train-002, pid=2251) return self._call_impl(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
(roflbot-train-002, pid=2251) return forward_call(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/layers.py", line 523, in forward
(roflbot-train-002, pid=2251) output_parallel = self.apply(input_, bias)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/layers.py", line 398, in apply
(roflbot-train-002, pid=2251) self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 266, in add_lora_linear
(roflbot-train-002, pid=2251) self.add_expand(y,
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 165, in add_expand
(roflbot-train-002, pid=2251) self._apply_expand_prefill(y,
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in _apply_expand_prefill
(roflbot-train-002, pid=2251) sgmv_expand(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/_ops.py", line 1116, in __call__
(roflbot-train-002, pid=2251) return self._op(*args, **(kwargs or {}))
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(roflbot-train-002, pid=2251) return func(*args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/vllm/lora/ops/triton_ops/sgmv_expand.py", line 192, in _sgmv_expand
(roflbot-train-002, pid=2251) _sgmv_expand_kernel[grid](
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/runtime/jit.py", line 347, in <lambda>
(roflbot-train-002, pid=2251) return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/runtime/jit.py", line 569, in run
(roflbot-train-002, pid=2251) kernel = self.compile(src, target=target, options=options.__dict__)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/compiler/compiler.py", line 278, in compile
(roflbot-train-002, pid=2251) module = src.make_ir(options, codegen_fns, module_map, context)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/triton/compiler/compiler.py", line 81, in make_ir
(roflbot-train-002, pid=2251) return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
(roflbot-train-002, pid=2251) triton.compiler.errors.CompilationError: at 63:4:
(roflbot-train-002, pid=2251) lora_index = tl.load(lora_indices + cur_batch)
(roflbot-train-002, pid=2251) if lora_index == -1:
(roflbot-train-002, pid=2251) return
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) m_offset = tl.load(b_seq_start_loc + cur_batch)
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) cta_m_len = min(BLOCK_M, M - (pid_m * BLOCK_M))
(roflbot-train-002, pid=2251) cta_m_offset = m_offset + (pid_m * BLOCK_M)
(roflbot-train-002, pid=2251) offset_m = tl.arange(0, BLOCK_M)
(roflbot-train-002, pid=2251) ram = cta_m_offset + tl.max_contiguous(
(roflbot-train-002, pid=2251) tl.multiple_of(offset_m % cta_m_len, BLOCK_M), BLOCK_M)
(roflbot-train-002, pid=2251) do_expand_kernel(
(roflbot-train-002, pid=2251) ^
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) During handling of the above exception, another exception occurred:
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) Traceback (most recent call last):
(roflbot-train-002, pid=2251) File "/root/sky_workdir/train_roflbot/train.py", line 108, in <module>
(roflbot-train-002, pid=2251) asyncio.run(run_training(model))
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/nest_asyncio.py", line 30, in run
(roflbot-train-002, pid=2251) return loop.run_until_complete(task)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/nest_asyncio.py", line 98, in run_until_complete
(roflbot-train-002, pid=2251) return f.result()
(roflbot-train-002, pid=2251) File "/root/miniconda3/lib/python3.10/asyncio/futures.py", line 201, in result
(roflbot-train-002, pid=2251) raise self._exception.with_traceback(self._exception_tb)
(roflbot-train-002, pid=2251) File "/root/miniconda3/lib/python3.10/asyncio/tasks.py", line 234, in __step
(roflbot-train-002, pid=2251) result = coro.throw(exc)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/train_roflbot/train.py", line 25, in run_training
(roflbot-train-002, pid=2251) await model.register(backend)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/model.py", line 176, in register
(roflbot-train-002, pid=2251) base_url, api_key = await backend._prepare_backend_for_training(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/backend.py", line 193, in _prepare_backend_for_training
(roflbot-train-002, pid=2251) await service.start_openai_server(config=config)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/mp_actors/traceback.py", line 27, in async_wrapper
(roflbot-train-002, pid=2251) raise e.with_traceback(streamlined_traceback())
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/service.py", line 51, in start_openai_server
(roflbot-train-002, pid=2251) self.state.trainer.save_model(lora_path)
(roflbot-train-002, pid=2251) File "/root/miniconda3/lib/python3.10/functools.py", line 981, in __get__
(roflbot-train-002, pid=2251) val = self.func(instance)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/service.py", line 39, in state
(roflbot-train-002, pid=2251) return ModelState(self.config)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/art/local/state.py", line 80, in __init__
(roflbot-train-002, pid=2251) unsloth.FastLanguageModel.from_pretrained(**config.get("init_args", {})),
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/loader.py", line 363, in from_pretrained
(roflbot-train-002, pid=2251) model, tokenizer = dispatch_model.from_pretrained(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/qwen2.py", line 87, in from_pretrained
(roflbot-train-002, pid=2251) return FastLlamaModel.from_pretrained(
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth/models/llama.py", line 1819, in from_pretrained
(roflbot-train-002, pid=2251) llm = load_vllm(**load_vllm_kwargs)
(roflbot-train-002, pid=2251) File "/root/sky_workdir/.venv/lib/python3.10/site-packages/unsloth_zoo/vllm_utils.py", line 1051, in load_vllm
(roflbot-train-002, pid=2251) raise RuntimeError(error)
(roflbot-train-002, pid=2251) RuntimeError: at 63:4:
(roflbot-train-002, pid=2251) lora_index = tl.load(lora_indices + cur_batch)
(roflbot-train-002, pid=2251) if lora_index == -1:
(roflbot-train-002, pid=2251) return
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) m_offset = tl.load(b_seq_start_loc + cur_batch)
(roflbot-train-002, pid=2251)
(roflbot-train-002, pid=2251) cta_m_len = min(BLOCK_M, M - (pid_m * BLOCK_M))
(roflbot-train-002, pid=2251) cta_m_offset = m_offset + (pid_m * BLOCK_M)
(roflbot-train-002, pid=2251) offset_m = tl.arange(0, BLOCK_M)
(roflbot-train-002, pid=2251) ram = cta_m_offset + tl.max_contiguous(
(roflbot-train-002, pid=2251) tl.multiple_of(offset_m % cta_m_len, BLOCK_M), BLOCK_M)
(roflbot-train-002, pid=2251) do_expand_kernel(
(roflbot-train-002, pid=2251) ^
Metadata
Metadata
Assignees
Labels
No labels