Skip to content

Commit

Permalink
Merge branch 'main' into 05-23-engine-config-in-constructor
Browse files Browse the repository at this point in the history
  • Loading branch information
tqchen authored May 24, 2024
2 parents 98885a9 + 988e9f0 commit aaa1447
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 5 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated 46 files
+1 −0 CONTRIBUTORS.md
+1 −0 include/tvm/runtime/device_api.h
+15 −1 python/tvm/_ffi/runtime_ctypes.py
+29 −12 python/tvm/dlight/gpu/gemv.py
+3 −0 python/tvm/dlight/gpu/general_reduction.py
+2 −0 python/tvm/dlight/gpu/rmsnorm.py
+4 −0 python/tvm/dlight/gpu/transpose.py
+2 −0 python/tvm/dlight/gpu/utils.py
+10 −0 python/tvm/micro/testing/aot_test_utils.py
+68 −1 python/tvm/relay/op/strategy/arm_cpu.py
+24 −0 python/tvm/testing/utils.py
+0 −1 python/tvm/tir/tensor_intrin/__init__.py
+359 −3 python/tvm/tir/tensor_intrin/arm_cpu.py
+4 −1 python/tvm/topi/arm_cpu/__init__.py
+26 −0 python/tvm/topi/arm_cpu/arm_utils.py
+6 −4 python/tvm/topi/arm_cpu/dense.py
+75 −0 python/tvm/topi/arm_cpu/dense_alter_op.py
+124 −0 python/tvm/topi/arm_cpu/matmul.py
+1 −1 python/tvm/topi/x86/dense_alter_op.py
+3 −1 src/arith/const_int_bound.cc
+1 −2 src/arith/scalable_expression.h
+1 −3 src/relay/backend/te_compiler_cache.cc
+1 −0 src/relay/op/nn/nn.cc
+6 −0 src/runtime/cuda/cuda_device_api.cc
+6 −0 src/runtime/opencl/opencl_device_api.cc
+53 −12 src/runtime/relax_vm/paged_kv_cache.cc
+4 −0 src/runtime/rocm/rocm_device_api.cc
+5 −0 src/runtime/vulkan/vulkan_device_api.cc
+5 −1 src/support/pipe.h
+4 −2 src/target/llvm/codegen_aarch64.cc
+1 −1 src/target/source/source_module.cc
+5 −1 src/tir/schedule/ir_comparator.cc
+49 −21 tests/python/all-platform-minimal-test/test_runtime_ndarray.py
+44 −2 tests/python/codegen/test_target_codegen_aarch64.py
+63 −50 tests/python/dlight/test_gpu_gemv.py
+0 −94 tests/python/integration/test_arm_aprofile.py
+1 −1 tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py
+53 −0 tests/python/relay/strategy/arm_cpu/scalable_utils.py
+88 −3 tests/python/relay/strategy/arm_cpu/test_dense.py
+118 −0 tests/python/relay/strategy/arm_cpu/test_matmul.py
+52 −3 tests/python/relay/strategy/test_select_implementation.py
+56 −0 tests/python/relay/test_pass_alter_op_layout.py
+40 −9 tests/python/topi/test_topi_conv2d_nhwc.py
+17 −3 tests/python/topi/test_topi_matmul.py
+15 −0 web/src/runtime.ts
+26 −3 web/src/webgpu.ts
3 changes: 3 additions & 0 deletions cpp/serve/engine_actions/eagle_new_request_prefill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,9 @@ class EagleNewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
*/
void MatchPrefixCache(EngineState estate, PrefillInput* input) final {
RequestStateEntry rsentry = input->rsentry;
if (estate->prefix_cache->mode == PrefixCacheMode::kDisable) {
return;
}
if (rsentry->parent_idx == -1 && rsentry->status == RequestStateStatus::kPending &&
!estate->prefix_cache->HasSequence(rsentry->mstates[0]->internal_id)) {
IntTuple tokens = GetConcatPrefillInputData(rsentry->mstates[0]);
Expand Down
3 changes: 3 additions & 0 deletions cpp/serve/engine_actions/new_request_prefill.cc
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ class NewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
*/
void MatchPrefixCache(EngineState estate, PrefillInput* input) final {
RequestStateEntry rsentry = input->rsentry;
if (estate->prefix_cache->mode == PrefixCacheMode::kDisable) {
return;
}
if (rsentry->parent_idx == -1 && rsentry->status == RequestStateStatus::kPending &&
!estate->prefix_cache->HasSequence(rsentry->mstates[0]->internal_id)) {
IntTuple tokens = GetConcatPrefillInputData(rsentry->mstates[0]);
Expand Down
10 changes: 10 additions & 0 deletions cpp/serve/prefix_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ class PrefixCacheImpl : public PrefixCacheObj {
lru_counter_ = 0;
}

/*!
* \brief The prefix cache mode reported by this implementation (kRadix).
* \note NOTE(review): this is a static data member that hides the member of the
* same name declared in PrefixCacheObj; it does not override it. It is selected
* only when the static type of the accessing expression is PrefixCacheImpl.
* TODO confirm that callers do not read `mode` through the base type, where the
* base-class value would be returned instead.
*/
static const PrefixCacheMode mode = PrefixCacheMode::kRadix;

private:
void ReuseRecyclingSequence(int64_t seq_id) {
CHECK(seq_states_.at(seq_id) == SequenceState::kRecycling);
Expand Down Expand Up @@ -383,6 +388,11 @@ class NoPrefixCache : public PrefixCacheObj {
* \brief Reset the prefix cache to initial status. Do nothing and return.
*/
void Reset() final {}

/*!
* \brief The prefix cache mode reported by this implementation (kDisable).
* \note NOTE(review): this static data member hides the member of the same name
* declared in PrefixCacheObj rather than overriding it. Both carry the same
* value here, so an access through either static type reads kDisable.
*/
static const PrefixCacheMode mode = PrefixCacheMode::kDisable;
};

TVM_REGISTER_OBJECT_TYPE(NoPrefixCache);
Expand Down
5 changes: 5 additions & 0 deletions cpp/serve/prefix_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@ class PrefixCacheObj : public Object {
*/
virtual void Reset() = 0;

/*!
* \brief The prefix cache mode. The base class declares it as kDisable;
* PrefixCacheImpl and NoPrefixCache each declare their own static member with
* the same name.
* \note NOTE(review): a static data member is not virtual. An access such as
* `obj->mode` is resolved from the static type of `obj`, so reading it through
* a PrefixCacheObj pointer or reference yields kDisable regardless of the
* concrete subclass. TODO confirm how callers reach this member; a virtual
* accessor may be what is intended.
*/
static const PrefixCacheMode mode = PrefixCacheMode::kDisable;

static constexpr const uint32_t _type_index = TypeIndex::kDynamic;
static constexpr const char* _type_key = "mlc.serve.PrefixCache";
TVM_DECLARE_BASE_OBJECT_INFO(PrefixCacheObj, Object)
Expand Down
8 changes: 5 additions & 3 deletions python/mlc_llm/serve/engine_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,12 +543,14 @@ def __del__(self):

def terminate(self):
"""Terminate the engine."""
if self._terminated:
if hasattr(self, '_terminated') and self._terminated:
return
self._terminated = True
self._ffi["exit_background_loop"]()
self._background_loop_thread.join()
self._background_stream_back_loop_thread.join()
if hasattr(self, '_background_loop_thread'):
self._background_loop_thread.join()
if hasattr(self, '_background_stream_back_loop_thread'):
self._background_stream_back_loop_thread.join()

def _debug_call_func_on_all_worker(self, func_name: str) -> None:
"""Call the given global function on all workers. Only for debug purpose."""
Expand Down
2 changes: 1 addition & 1 deletion tests/python/serve/test_serve_engine_prefix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_basic_engine_system_prompt():
engine_config=EngineConfig(
max_total_sequence_length=4096,
prefix_cache_max_num_recycling_seqs=5,
),
)
)
test_engine_system_prompt(engine)

Expand Down

0 comments on commit aaa1447

Please sign in to comment.