Merge branch 'main' into 05-23-engine-config-in-constructor

mlc-ai · May 24, 2024 · aaa1447
2 parents 98885a9 + 988e9f0
commit aaa1447
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 5 deletions.
diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/cpp/serve/engine_actions/eagle_new_request_prefill.cc b/cpp/serve/engine_actions/eagle_new_request_prefill.cc
@@ -364,6 +364,9 @@ class EagleNewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
    */
   void MatchPrefixCache(EngineState estate, PrefillInput* input) final {
     RequestStateEntry rsentry = input->rsentry;
+    if (estate->prefix_cache->mode == PrefixCacheMode::kDisable) {
+      return;
+    }
     if (rsentry->parent_idx == -1 && rsentry->status == RequestStateStatus::kPending &&
         !estate->prefix_cache->HasSequence(rsentry->mstates[0]->internal_id)) {
       IntTuple tokens = GetConcatPrefillInputData(rsentry->mstates[0]);

diff --git a/cpp/serve/engine_actions/new_request_prefill.cc b/cpp/serve/engine_actions/new_request_prefill.cc
@@ -250,6 +250,9 @@ class NewRequestPrefillActionObj : public BatchPrefillBaseActionObj {
    */
   void MatchPrefixCache(EngineState estate, PrefillInput* input) final {
     RequestStateEntry rsentry = input->rsentry;
+    if (estate->prefix_cache->mode == PrefixCacheMode::kDisable) {
+      return;
+    }
     if (rsentry->parent_idx == -1 && rsentry->status == RequestStateStatus::kPending &&
         !estate->prefix_cache->HasSequence(rsentry->mstates[0]->internal_id)) {
       IntTuple tokens = GetConcatPrefillInputData(rsentry->mstates[0]);

diff --git a/cpp/serve/prefix_cache.cc b/cpp/serve/prefix_cache.cc
@@ -237,6 +237,11 @@ class PrefixCacheImpl : public PrefixCacheObj {
     lru_counter_ = 0;
   }
 
+  /*!
+   * \brief The prefix cache mode.
+   */
+  static const PrefixCacheMode mode = PrefixCacheMode::kRadix;
+
  private:
   void ReuseRecyclingSequence(int64_t seq_id) {
     CHECK(seq_states_.at(seq_id) == SequenceState::kRecycling);
@@ -383,6 +388,11 @@ class NoPrefixCache : public PrefixCacheObj {
    * \brief Reset the prefix cache to initial status. Do nothing and return.
    */
   void Reset() final {}
+
+  /*!
+   * \brief The prefix cache mode.
+   */
+  static const PrefixCacheMode mode = PrefixCacheMode::kDisable;
 };
 
 TVM_REGISTER_OBJECT_TYPE(NoPrefixCache);

diff --git a/cpp/serve/prefix_cache.h b/cpp/serve/prefix_cache.h
@@ -116,6 +116,11 @@ class PrefixCacheObj : public Object {
    */
   virtual void Reset() = 0;
 
+  /*!
+   * \brief The prefix cache mode.
+   */
+  static const PrefixCacheMode mode = PrefixCacheMode::kDisable;
+
   static constexpr const uint32_t _type_index = TypeIndex::kDynamic;
   static constexpr const char* _type_key = "mlc.serve.PrefixCache";
   TVM_DECLARE_BASE_OBJECT_INFO(PrefixCacheObj, Object)

diff --git a/python/mlc_llm/serve/engine_base.py b/python/mlc_llm/serve/engine_base.py
@@ -543,12 +543,14 @@ def __del__(self):
 
     def terminate(self):
         """Terminate the engine."""
-        if self._terminated:
+        if hasattr(self, '_terminated') and self._terminated:
             return
         self._terminated = True
         self._ffi["exit_background_loop"]()
-        self._background_loop_thread.join()
-        self._background_stream_back_loop_thread.join()
+        if hasattr(self, '_background_loop_thread'):
+            self._background_loop_thread.join()
+        if hasattr(self, '_background_stream_back_loop_thread'):
+            self._background_stream_back_loop_thread.join()
 
     def _debug_call_func_on_all_worker(self, func_name: str) -> None:
         """Call the given global function on all workers. Only for debug purpose."""

diff --git a/tests/python/serve/test_serve_engine_prefix_cache.py b/tests/python/serve/test_serve_engine_prefix_cache.py
@@ -78,7 +78,7 @@ def test_basic_engine_system_prompt():
         engine_config=EngineConfig(
             max_total_sequence_length=4096,
             prefix_cache_max_num_recycling_seqs=5,
-        ),
+        )
     )
     test_engine_system_prompt(engine)
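Changed files, presumably inside the 3rdparty/tvm submodule update (the paths below are TVM source paths; confirm against the submodule diff):
+1 −0		CONTRIBUTORS.md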
+1 −0		include/tvm/runtime/device_api.h
+15 −1		python/tvm/_ffi/runtime_ctypes.py
+29 −12		python/tvm/dlight/gpu/gemv.py
+3 −0		python/tvm/dlight/gpu/general_reduction.py
+2 −0		python/tvm/dlight/gpu/rmsnorm.py
+4 −0		python/tvm/dlight/gpu/transpose.py
+2 −0		python/tvm/dlight/gpu/utils.py
+10 −0		python/tvm/micro/testing/aot_test_utils.py
+68 −1		python/tvm/relay/op/strategy/arm_cpu.py
+24 −0		python/tvm/testing/utils.py
+0 −1		python/tvm/tir/tensor_intrin/__init__.py
+359 −3		python/tvm/tir/tensor_intrin/arm_cpu.py
+4 −1		python/tvm/topi/arm_cpu/__init__.py
+26 −0		python/tvm/topi/arm_cpu/arm_utils.py
+6 −4		python/tvm/topi/arm_cpu/dense.py
+75 −0		python/tvm/topi/arm_cpu/dense_alter_op.py
+124 −0		python/tvm/topi/arm_cpu/matmul.py
+1 −1		python/tvm/topi/x86/dense_alter_op.py
+3 −1		src/arith/const_int_bound.cc
+1 −2		src/arith/scalable_expression.h
+1 −3		src/relay/backend/te_compiler_cache.cc
+1 −0		src/relay/op/nn/nn.cc
+6 −0		src/runtime/cuda/cuda_device_api.cc
+6 −0		src/runtime/opencl/opencl_device_api.cc
+53 −12		src/runtime/relax_vm/paged_kv_cache.cc
+4 −0		src/runtime/rocm/rocm_device_api.cc
+5 −0		src/runtime/vulkan/vulkan_device_api.cc
+5 −1		src/support/pipe.h
+4 −2		src/target/llvm/codegen_aarch64.cc
+1 −1		src/target/source/source_module.cc
+5 −1		src/tir/schedule/ir_comparator.cc
+49 −21		tests/python/all-platform-minimal-test/test_runtime_ndarray.py
+44 −2		tests/python/codegen/test_target_codegen_aarch64.py
+63 −50		tests/python/dlight/test_gpu_gemv.py
+0 −94		tests/python/integration/test_arm_aprofile.py
+1 −1		tests/python/meta_schedule/test_meta_schedule_postproc_rewrite_tensorize.py
+53 −0		tests/python/relay/strategy/arm_cpu/scalable_utils.py
+88 −3		tests/python/relay/strategy/arm_cpu/test_dense.py
+118 −0		tests/python/relay/strategy/arm_cpu/test_matmul.py
+52 −3		tests/python/relay/strategy/test_select_implementation.py
+56 −0		tests/python/relay/test_pass_alter_op_layout.py
+40 −9		tests/python/topi/test_topi_conv2d_nhwc.py
+17 −3		tests/python/topi/test_topi_matmul.py
+15 −0		web/src/runtime.ts
+26 −3		web/src/webgpu.ts