Upstream merge Nov20 (includes ft_group quantization support) #71

Merged
161 commits merged on Nov 22, 2023
Commits
898db76
[API] Add GenerationConfig (#1024)
davidpissarra Oct 8, 2023
ad3a6b9
Fix two bugs in kv-cache backtrack loop (#856)
shenberg Oct 8, 2023
6e40c21
[Build] Added --pdb flag to build.py, drop into pdb on error (#1017)
Lunderberg Oct 8, 2023
bae37b3
[Android] Use `AlertDialog` instead of `Toast` (#1039)
cyx-6 Oct 8, 2023
b44f679
Add doc for ChatConfig, ConvConfig, GenerationConfig, BuildArgs (#1040)
CharlieFRuan Oct 9, 2023
3a9849a
[Android] Add Llama2 q4f16_0 (#1041)
spectrometerHBH Oct 9, 2023
bed9e60
[Docs] Model prebuilts tracking page revamp (#1000)
CharlieFRuan Oct 9, 2023
c02fdaf
Update compile_models.rst (#1038)
yongjer Oct 9, 2023
85001ed
Support for the Stable LM 3B model (#1008)
jeethu Oct 9, 2023
a032d40
[Docs] Iterate model prebuilts docs (#1043)
CharlieFRuan Oct 9, 2023
a58605f
Update README.md
junrushao Oct 9, 2023
bdd9d9b
[CPP] Separate common utils out from llm_chat.cc (#1044)
MasterJH5574 Oct 9, 2023
20131fb
Update README.md (#1045)
junrushao Oct 9, 2023
1e6fb11
add verbose stats to mlc-chat REST API (#1049)
denise-k Oct 11, 2023
b9179cf
[Transform] Apply split_rotary optimization on prefill (#1033)
Lunderberg Oct 12, 2023
98ebd28
[Docs] Add `mlc.ai/package` to `DEPENDENCY INSTALLATION` group (#1055)
LeshengJin Oct 12, 2023
bfaa5b9
Revert "[Transform] Apply split_rotary optimization on prefill (#1033…
MasterJH5574 Oct 12, 2023
ca8c11b
[BugFix] Set the right `max_sequence_length` for both Llama-1 and Lla…
sunggg Oct 13, 2023
edab9b5
[Doc] Use -U instead of --force-reinstall (#1062)
junrushao Oct 13, 2023
d854105
[Model] Initial batching support for Llama (#1048)
MasterJH5574 Oct 14, 2023
c2b8cbc
Fix Stable LM 3B build (#1061)
jeethu Oct 14, 2023
481cd92
[Core] Remove duplication in MODEL.get_model calls (#1054)
Lunderberg Oct 14, 2023
8184431
[ParamManager] Cleanup creation of quantization IRModule (#1053)
Lunderberg Oct 14, 2023
9010d48
Minor typo fix (#1064)
jeethu Oct 15, 2023
b0bfc88
Add links to Python API Reference (#1068)
junrushao Oct 15, 2023
204860b
[Fix] ChatModule incorrect temperature buffer shape (#1070)
MasterJH5574 Oct 15, 2023
d202077
[ParamManager] Added progress bar for get_item/set_item (#1063)
Lunderberg Oct 16, 2023
9872c48
[Python] Extract common device str parse function in ChatModule (#1074)
MasterJH5574 Oct 16, 2023
3aefd9f
[Bugfix] Compilation Error in q4f32_1 (#1078)
junrushao Oct 17, 2023
2625945
Establish `mlc_chat.compiler` (#1082)
junrushao Oct 19, 2023
56a8004
Update README.md for Multi-GPU (#1090)
junrushao Oct 19, 2023
b0373d1
Support lib_path override in C++. Improvements on docs and error mess…
rickzx Oct 19, 2023
830656f
StreamIterator (#1057)
varshith15 Oct 19, 2023
9bf5723
Update `benchmark.py` according to #1086 (#1091)
junrushao Oct 19, 2023
62d0c03
Disable Disco for q4f16_ft and q8f16_ft quantization (#1094)
LeshengJin Oct 20, 2023
cf39bf6
[Format] Apply isort and black for `python/` (#1097)
junrushao Oct 20, 2023
e9b85ce
More formatting (#1099)
junrushao Oct 21, 2023
03c641a
Enable Python Linter (#1098)
junrushao Oct 21, 2023
46d11e6
Add Basic Pylint and Mypy Tooling (#1100)
junrushao Oct 21, 2023
6159cc4
[CI] Add clang-format (#1103)
junrushao Oct 22, 2023
16dd2ae
[Slim-LM] Smart path finding for config and weight (#1088)
LeshengJin Oct 23, 2023
f57c9c9
[Transform] Provide IRModule transform for rewrite_attention (#1052)
Lunderberg Oct 23, 2023
e5927ce
[ParamManager] Use BundleModelParams for transform_dequantize (#1056)
Lunderberg Oct 23, 2023
7ae8c6d
[Slim-LM] Introduce HFLoad for loading Pytorch and SafeTensor weights…
LeshengJin Oct 23, 2023
5a7dcd8
[WINDOWS] reduce noise in windows build (#1115)
tqchen Oct 24, 2023
61179a0
Add CLI commands for compilation (#1109)
junrushao Oct 24, 2023
8ce7793
Auto updated submodule references
Oct 24, 2023
488017d
fix mismatched argument name (#1117)
Sing-Li Oct 24, 2023
206103b
[Docs] Add doc for max and mean gen len, shift factor; and buildArgs …
CharlieFRuan Oct 24, 2023
2aa6809
Revert "[ParamManager] Use BundleModelParams for transform_dequantize…
junrushao Oct 24, 2023
9cb8e8e
Remove inaccurate warning message (#1121)
junrushao Oct 24, 2023
9166edb
[REST] OpenAI compatible Rest API (#1107)
Kartik14 Oct 24, 2023
a4279e3
Add --opt flag parsing to CLI (#1123)
junrushao Oct 25, 2023
973f9fc
[ParamManager][Redo] Use BundleModelParams for transform_dequantize (…
Lunderberg Oct 25, 2023
24f795e
added details to windows installation (#1133)
goutham2688 Oct 27, 2023
2c492e5
Grammatical and Typographical improvements (#1139)
tmsagarofficial Oct 28, 2023
2ec0cc8
Minor enhancements to `ChatModule` (#1132)
YuchenJin Oct 28, 2023
27ac5ac
Updating tvm install docs (#1143)
David-Sharma Oct 29, 2023
2b6d832
Make the help info consistent with program name (#1137)
fennecJ Oct 29, 2023
878ae84
Support parameter packing (#1146)
junrushao Oct 29, 2023
c0c3a8d
[Slim-LM] Enable Group Quant (#1129)
zxybazh Oct 29, 2023
2193767
Enable Mypy and Pylint in mlc_chat Python Package (#1149)
junrushao Oct 29, 2023
0a25374
Migrate Compiler Passes (#1150)
junrushao Oct 30, 2023
1a79a53
Compile Model Preset without External `config.json` (#1151)
junrushao Oct 30, 2023
ba67835
Update attention layer (#1153)
junrushao Oct 30, 2023
fee2cb5
Add batched Llama model definition using vLLM paged attention (#1134)
masahi Oct 30, 2023
ece97b1
[Transform][Redo] Apply split_rotary optimization on prefill (#1125)
Lunderberg Oct 30, 2023
b190578
Apply rewrite for normal attention and MQA (#1138)
Lunderberg Oct 30, 2023
8ca0176
[Rest] Fix emoji handling in Rest API. (#1142)
YuchenJin Oct 30, 2023
3cf5605
[Utility] Check for isinstance(exc, Exception) before entering pdb (#…
Lunderberg Oct 30, 2023
0a9d6c7
[Utils] Remove conversion to numpy array in utils.save_params (#1083)
Lunderberg Oct 30, 2023
425a2cb
[Fix][REST] Use lowered-cased "app" (#1159)
junrushao Oct 30, 2023
9076d01
[Rest] Document emoji handling (#1160)
YuchenJin Oct 31, 2023
b5bfa5b
Enable group quant transform with nn.Module (#1154)
cyx-6 Oct 31, 2023
8438b27
Misc Cleanups of Compilation Pipeline (#1165)
junrushao Oct 31, 2023
02d1e57
Support CUDA Multi-Arch Compilation (#1166)
junrushao Oct 31, 2023
e0cd3f6
[Bugfix] Cannot find global function `mlc.llm_chat_create` (#1167)
junrushao Oct 31, 2023
f5b2e88
Fix RWKV Support (#1136)
BBuf Nov 1, 2023
200653a
Auto updated submodule references
Nov 1, 2023
9831135
Fix Android app Permission denied error on Android 10 (#1175)
anibohara2000 Nov 1, 2023
1757777
[SLM] Fix group quantization (#1172)
cyx-6 Nov 1, 2023
2ca7d15
[Fix] TIR block name of dequantization (#1177)
junrushao Nov 2, 2023
53060af
[SLM][AutoLLM] Enable Command Line Weight Conversion (#1170)
zxybazh Nov 2, 2023
2dc8183
[Fix][SLM] Update q4f16 quantization with the new mutator name rule (…
LeshengJin Nov 3, 2023
6ae02dd
[Model Support][SWA] Add support for sliding window attention for Mis…
CharlieFRuan Nov 3, 2023
4716704
Add Python API for Weight Conversion (#1182)
junrushao Nov 4, 2023
9d20575
Merge `llama_config.CONFIG` into `MODEL_PRESETS` (#1188)
junrushao Nov 4, 2023
5d1dc34
Merge llama_config.py into llama_model.py (#1189)
junrushao Nov 4, 2023
4832c2f
Add CodeLlama as part of model presets (#1190)
junrushao Nov 4, 2023
78424f0
[Docs] Clarify zstd installation on Windows (#1191)
junrushao Nov 4, 2023
5d63f7e
[Docs] Clarify zstd installation on Windows (#1196)
junrushao Nov 4, 2023
3417505
Support overriding `--max-sequence-length` in command line (#1197)
junrushao Nov 5, 2023
0e08845
[RestAPI] Added docs (#1193)
anibohara2000 Nov 5, 2023
145a984
[API] ```llm-vscode``` extension support (#1198)
davidpissarra Nov 5, 2023
3413d17
[Fix] Use `fabs` as floating point abs function in C++ (#1202)
junrushao Nov 5, 2023
7ccb51a
Integrating MLC runtime with the new compilation workflow (#1203)
junrushao Nov 6, 2023
65478c8
[Fix] Remove Redundant Warnings (#1204)
junrushao Nov 6, 2023
01d4339
Try fix macOS build with picojson (#1206)
junrushao Nov 6, 2023
51d6f9c
Try fix macOS build with picojson again (#1207)
junrushao Nov 6, 2023
a7f1183
Auto updated submodule references
Nov 6, 2023
e2c99a8
[Fix] Keep update-to-date with upstream API change (#1209)
junrushao Nov 6, 2023
e00220c
Detect `mtriple` via LLVM (#1211)
junrushao Nov 6, 2023
9869ca6
Fix Python3.8 compatibility breakage (#1210)
Lunderberg Nov 6, 2023
4042626
[Slim-LM] Enable loading from AWQ pre-quantized weight. (#1114)
LeshengJin Nov 6, 2023
be1c18b
[Bugfix] Fix Cannot import name '_LIB' from 'mlc_chat.base' (#1214)
CharlieFRuan Nov 7, 2023
1015aae
[SLM] Support `q3f16_1` and `q4f32_1` (#1215)
cyx-6 Nov 8, 2023
1a6fadd
Make the Compilation Working E2E (#1218)
junrushao Nov 8, 2023
616ca42
[Mistral][SWA] Add sliding window to metadata (#1217)
CharlieFRuan Nov 8, 2023
e52f449
Support for `chatml` format conversation (for TinyLlama-1.1B-Chat-v0.…
acalatrava Nov 8, 2023
fbe75e3
Add Rust Support for MLC-LLM (#1213)
YuchenJin Nov 8, 2023
beca2ab
[Bugfix] Remove dependency on openai_api in chat module (#1222)
CharlieFRuan Nov 8, 2023
9ee5705
Bake in RAM Usage in the Generated DSO (#1224)
junrushao Nov 8, 2023
069181c
[Fix] ChatModule python messages and offset types (#1220)
YuchenJin Nov 8, 2023
f1bc951
[Fix] Variable Upperbound Should be Injected before Build Pipeline (#…
junrushao Nov 8, 2023
834811f
[MultiGPU] Support pre-sharded model weights (#1096)
Lunderberg Nov 9, 2023
45bf1c5
[AWQ] e2e awq-quantized model (#1229)
LeshengJin Nov 10, 2023
d08b009
[SLM] Support `q0f16` and `q0f32` (#1228)
cyx-6 Nov 10, 2023
fab4486
[Core][Llama] Argument `max_vocab_size` and `max_batch_size` (#1076)
MasterJH5574 Nov 11, 2023
cd71665
[Llama] Support batched prefill (#1233)
MasterJH5574 Nov 11, 2023
a21c759
[Core] Skip PrimExpr index int32 downcasting for batching (#1234)
MasterJH5574 Nov 11, 2023
cb68e7b
Auto updated submodule references
Nov 12, 2023
1400cd9
Update index.rst (#1236)
a7k3 Nov 12, 2023
c2082d8
Update android.rst (#1237)
a7k3 Nov 12, 2023
26fd019
Correct typo in cuda device name for rust chat model (#1241)
malramsay64 Nov 13, 2023
ab2a05b
Generating mlc-chat-config.json (#1238)
junrushao Nov 13, 2023
d24379c
Rename `--config` to `--model` and Consolidate CLI Messages (#1244)
junrushao Nov 13, 2023
4021785
Specify argument "dest" in argparse (#1245)
junrushao Nov 13, 2023
5005772
Add more stats during quantization (#1246)
junrushao Nov 13, 2023
34c15f2
ensure that max_gen_len is set properly in mlc_chat_config (#1249)
denise-k Nov 13, 2023
7da81a4
[Fix] Memory usage statistics (#1252)
LeshengJin Nov 13, 2023
cd4a8ed
Introduce mlc_chat subcommands (#1251)
junrushao Nov 13, 2023
8305b22
Update mlc-chat-config.json (#1254)
junrushao Nov 14, 2023
5e02cac
[Rust] Support multiple prompts (#1253)
YuchenJin Nov 14, 2023
77a4b69
[UI] Correct "convert_weight_only" to "convert_weights_only" (#1227)
Lunderberg Nov 14, 2023
12efd45
Add a downloader from HuggingFace (#1258)
junrushao Nov 14, 2023
1dbfac5
[Fix] Add prefix_tokens to `ConvConfig` in Python to match C++ implem…
YuchenJin Nov 14, 2023
8d9effe
[nn.Module] Mistral implementation (#1230)
davidpissarra Nov 15, 2023
8304d4c
Add `mlc_chat.__main__` as command line entrypoint (#1263)
junrushao Nov 15, 2023
64e3410
[Rust] Improve ergonomics of `generate` function in `ChatModule` (#1…
YuchenJin Nov 15, 2023
2c00373
[Fix] mistral `max_gen_len` (#1264)
davidpissarra Nov 15, 2023
ceb27d5
Rename `max-sequence-length` to `context-window-size` (#1265)
junrushao Nov 15, 2023
17aa5bf
Auto updated submodule references
Nov 16, 2023
fde2e85
Fix group quantization shape infer (#1273)
cyx-6 Nov 16, 2023
4a137d3
Continuous Model Delivery (#1272)
junrushao Nov 16, 2023
2600b9a
Auto updated submodule references
Nov 17, 2023
31910dd
Enhance Model Delivery (#1283)
junrushao Nov 17, 2023
fb7a224
add python, rest api test (#1278)
Kartik14 Nov 18, 2023
d3b7aad
Enable Jenkins CI (#1292)
Hzfengsy Nov 19, 2023
ad1933a
Merge remote-tracking branch 'mlc-ai/main' into merge-nov20
Nov 19, 2023
cf67fb7
fix
Nov 19, 2023
5fac856
Update android.rst (#1289)
a7k3 Nov 19, 2023
56ec8bd
more fix
Nov 20, 2023
49f75d2
Consolidate Logics for GPU Detection (#1297)
junrushao Nov 20, 2023
01daa64
[CI] Fix lint concurrent clone issue (#1299)
MasterJH5574 Nov 20, 2023
418b9a9
Auto updated submodule references
Nov 20, 2023
b4ba7ca
[Feature] Prefill chunking for non-SWA models (#1280)
davidpissarra Nov 20, 2023
488f65d
Compatible with chatglm (#979)
qc903113684 Nov 20, 2023
2fd1bf5
Add q4/q8_ft_group quantization mode (#1284)
vinx13 Nov 21, 2023
e75736c
Merge remote-tracking branch 'mlc-ai/main' into merge-nov20
Nov 21, 2023
bbed8cf
fix
Nov 21, 2023
aed3412
restore multi gpu support for FT quant
Nov 21, 2023
87 changes: 0 additions & 87 deletions .github/workflows/lint.yml

This file was deleted.

91 changes: 91 additions & 0 deletions ci/jenkinsfile.groovy
@@ -0,0 +1,91 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import org.jenkinsci.plugins.pipeline.modeldefinition.Utils

image = 'mlcaidev/ci-cpu:caab922'
docker_run = "bash ci/bash.sh ${image}"

def per_exec_ws(folder) {
return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
}

def init_git(submodule = false) {
checkout scm
if (submodule) {
retry(5) {
timeout(time: 2, unit: 'MINUTES') {
sh(script: 'git submodule update --init --recursive -f', label: 'Update git submodules')
}
}
}
}

stage('Lint') {
parallel(
'isort': {
node('CPU-SMALL') {
ws(per_exec_ws('mlc-llm-lint-isort')) {
init_git()
sh(script: "ls", label: 'debug')
sh(script: "${docker_run} conda env export --name ci-lint", label: 'Checkout version')
sh(script: "${docker_run} bash ci/task/isort.sh", label: 'Lint')
}
}
},
'black': {
node('CPU-SMALL') {
ws(per_exec_ws('mlc-llm-lint-black')) {
init_git()
sh(script: "ls", label: 'debug')
sh(script: "${docker_run} conda env export --name ci-lint", label: 'Checkout version')
sh(script: "${docker_run} bash ci/task/black.sh", label: 'Lint')
}
}
},
'mypy': {
node('CPU-SMALL') {
ws(per_exec_ws('mlc-llm-lint-mypy')) {
init_git()
sh(script: "ls", label: 'debug')
sh(script: "${docker_run} conda env export --name ci-lint", label: 'Checkout version')
sh(script: "${docker_run} bash ci/task/mypy.sh", label: 'Lint')
}
}
},
'pylint': {
node('CPU-SMALL') {
ws(per_exec_ws('mlc-llm-lint-pylint')) {
init_git()
sh(script: "ls", label: 'debug')
sh(script: "${docker_run} conda env export --name ci-lint", label: 'Checkout version')
sh(script: "${docker_run} bash ci/task/pylint.sh", label: 'Lint')
}
}
},
'clang-format': {
node('CPU-SMALL') {
ws(per_exec_ws('mlc-llm-lint-clang-format')) {
init_git()
sh(script: "ls", label: 'debug')
sh(script: "${docker_run} conda env export --name ci-lint", label: 'Checkout version')
sh(script: "${docker_run} bash ci/task/clang-format.sh", label: 'Lint')
}
}
},
)
}
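
For reference, the five stages above all run the same pattern, `bash ci/bash.sh ${image} bash ci/task/<task>.sh`, inside the `mlcaidev/ci-cpu:caab922` container. A minimal local sketch of the same steps (assuming Docker and this repository's ci/ scripts are available; not part of the PR itself):

import subprocess

IMAGE = "mlcaidev/ci-cpu:caab922"  # same CI image the Jenkinsfile pins

# Mirror the Jenkins lint stages: bash ci/bash.sh ${image} bash ci/task/<task>.sh
for task in ("isort", "black", "mypy", "pylint", "clang-format"):
    subprocess.run(
        ["bash", "ci/bash.sh", IMAGE, "bash", f"ci/task/{task}.sh"],
        check=True,
    )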
2 changes: 1 addition & 1 deletion ci/task/mypy.sh
@@ -8,4 +8,4 @@ export PYTHONPATH="./python:$PYTHONPATH"

set -x

mypy ./python/ ./tests/python/
mypy --install-types --non-interactive ./python/ ./tests/python/
2 changes: 1 addition & 1 deletion ci/task/pylint.sh
@@ -9,7 +9,7 @@ export PYTHONPATH="./python:$PYTHONPATH"
set -x

# TVM Unity is a dependency to this testing
pip install --quiet --pre -U -f https://mlc.ai/wheels mlc-ai-nightly
pip install --quiet --pre -U -f https://mlc.ai/wheels mlc-ai-nightly requests

pylint --jobs $NUM_THREADS ./python/
pylint --jobs $NUM_THREADS --recursive=y ./tests/python/
59 changes: 28 additions & 31 deletions cpp/llm_chat.cc
@@ -317,25 +317,33 @@ class LLMChat {
return os.str();
}

bool UpdateMaxWindowSizeFromMetadata() {
void UpdateConfigFromMetadata() {
if (ft_.use_disco) {
return false;
}
if (this->sliding_window_ != -1) {
return false;
return;
}

PackedFunc fget_metadata = ft_.mod_get_func("get_metadata");
if (fget_metadata == nullptr) {
return false;
return;
}
ObjectRef ret = fget_metadata();
std::string metadata_str = std::string(Downcast<String>(ret));
picojson::value metadata_info;
picojson::parse(metadata_info, std::string(metadata_str));
auto metadata = metadata_info.get<picojson::object>();

ICHECK(metadata["max_window_size"].is<int64_t>());
max_window_size_ = std::min(max_window_size_, metadata["max_window_size"].get<int64_t>());
return true;

if (metadata.count("prefill_chunk_size")) {
ICHECK(metadata["prefill_chunk_size"].is<int64_t>());
prefill_chunk_size_ =
std::min(prefill_chunk_size_, metadata["prefill_chunk_size"].get<int64_t>());
}
if (metadata.count("sliding_window")) {
ICHECK(metadata["sliding_window"].is<int64_t>());
sliding_window_ = std::min(sliding_window_, metadata["sliding_window"].get<int64_t>());
}
}

/*!
@@ -410,21 +418,12 @@ class LLMChat {
<< "Cannot specify both sliding_window and max_window_size.";
this->sliding_window_ = config["sliding_window"].get<int64_t>();
CHECK(this->sliding_window_ > 0) << "Sliding window size needs to be positive";
CHECK(config.count("sliding_window_chunk_size"))
CHECK(config.count("prefill_chunk_size"))
<< "Need to specify chunk size if using sliding window attention.";
}
if (config.count("sliding_window_chunk_size")) {
CHECK(config["sliding_window_chunk_size"].is<int64_t>());
this->sliding_window_chunk_size_ = config["sliding_window_chunk_size"].get<int64_t>();
CHECK(this->sliding_window_chunk_size_ > 0)
<< "Sliding window chunk size needs to be positive";
CHECK(config.count("sliding_window")) << "Need to specify sliding window size.";
}
if (config.count("model_name")) {
CHECK(config["model_name"].is<std::string>());
this->model_name_ = config["model_name"].get<std::string>();
} else {
CHECK(partial_update) << "Key \"model_name\" not found.";
if (config.count("prefill_chunk_size")) {
CHECK(config["prefill_chunk_size"].is<int64_t>());
this->prefill_chunk_size_ = config["prefill_chunk_size"].get<int64_t>();
}
if (config.count("top_p")) {
CHECK(config["top_p"].is<double>());
@@ -513,8 +512,8 @@ class LLMChat {
// so there is no explicit abi dependency on these extra
// classes other than basic tvm runtime.
this->ft_.Init(reload_lib, device_, this->num_shards_);
UpdateConfigFromMetadata();
if (this->sliding_window_ == -1) {
UpdateMaxWindowSizeFromMetadata();
CHECK(max_window_size_ != std::numeric_limits<int64_t>::max())
<< "Key \"max_window_size\" not found.";
}
@@ -807,9 +806,8 @@ class LLMChat {
if (ft_.use_disco) {
LOG(FATAL) << "NotImplementedError: Distributed inference is not supported for this model";
}
if (this->sliding_window_ != -1) {
LOG(FATAL)
<< "NotImplementedError: Sliding window attention does not support separate embedding";
if (this->prefill_chunk_size_ != -1) {
LOG(FATAL) << "NotImplementedError: Separate embedding does not support chunking";
}
NDArray embedding = Downcast<NDArray>(
EmbedStep(inp, append_conversation, place_in_prompt, generation_config_str));
@@ -832,10 +830,10 @@ class LLMChat {

int32_t new_seq_len = total_seq_len_;
NDArray logits_on_device;
if (this->sliding_window_ != -1) {
// Use chunking if we use sliding window attention (see Mistral paper figure 3).
for (int64_t begin = 0; begin < token_len; begin += this->sliding_window_chunk_size_) {
int64_t end = std::min(token_len, begin + this->sliding_window_chunk_size_);
if (this->prefill_chunk_size_ > 0) {
// Perform chunking.
for (int64_t begin = 0; begin < token_len; begin += this->prefill_chunk_size_) {
int64_t end = std::min(token_len, begin + this->prefill_chunk_size_);
std::vector<int32_t> chunk =
std::vector<int32_t>(prompt_tokens.begin() + begin, prompt_tokens.begin() + end);
new_seq_len += static_cast<int64_t>(chunk.size());
Expand All @@ -844,6 +842,7 @@ class LLMChat {
ICHECK_EQ(new_seq_len, total_seq_len_ + token_len) << "Expect chunking process all tokens";
} else {
// Otherwise, prefill entire prompt at once.
CHECK(sliding_window_ == -1) << "Expect chunking with sliding window attention";
new_seq_len += token_len;
logits_on_device = this->ForwardTokens(prompt_tokens, new_seq_len);
}
@@ -1356,16 +1355,14 @@ class LLMChat {
//----------------------------
// Conversation
//----------------------------
// model name
std::string model_name_;
// conversation
Conversation conversation_;
// total sequence len,
int64_t total_seq_len_{0};
// max window size, mean and max generation length, sliding window
// If we use sliding window, max window size is its default max() value
int64_t max_window_size_{std::numeric_limits<int64_t>::max()}, mean_gen_len_{128},
max_gen_len_{512}, sliding_window_{-1}, sliding_window_chunk_size_{-1};
max_gen_len_{512}, sliding_window_{-1}, prefill_chunk_size_{-1};
// size of the vocab table
int64_t vocab_size_;
// number of shards in distributed inference
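
The change above renames `sliding_window_chunk_size` to `prefill_chunk_size` and lets any model that declares a chunk size prefill its prompt in fixed-size pieces, not only sliding-window (Mistral-style) models. A minimal Python sketch of the same control flow, for illustration only (the real implementation is the C++ shown above):

from typing import Callable, List, Optional, Tuple

def chunked_prefill(
    prompt_tokens: List[int],
    prefill_chunk_size: int,   # -1 means "no chunking", as in llm_chat.cc
    sliding_window: int,       # -1 means "no sliding window"
    forward_tokens: Callable[[List[int], int], object],
    total_seq_len: int = 0,
) -> Tuple[Optional[object], int]:
    """Feed prompt_tokens to forward_tokens in chunks of prefill_chunk_size."""
    logits = None
    new_seq_len = total_seq_len
    if prefill_chunk_size > 0:
        # Process the prompt chunk by chunk (see Mistral paper, figure 3).
        for begin in range(0, len(prompt_tokens), prefill_chunk_size):
            chunk = prompt_tokens[begin : begin + prefill_chunk_size]
            new_seq_len += len(chunk)
            logits = forward_tokens(chunk, new_seq_len)
        assert new_seq_len == total_seq_len + len(prompt_tokens)
    else:
        # No chunking: sliding-window models are expected to always chunk.
        assert sliding_window == -1, "Expect chunking with sliding window attention"
        new_seq_len += len(prompt_tokens)
        logits = forward_tokens(prompt_tokens, new_seq_len)
    return logits, new_seq_len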
6 changes: 3 additions & 3 deletions docs/deploy/android.rst
@@ -33,7 +33,7 @@ Prerequisite
TVM_NDK_CC: $ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android24-clang
# Example on Windows
ANDROID_NDK: $HOME/Library/Android/sdk/ndk/25.2.9519653
TVM_NDK_CC: $ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android24-clang
TVM_NDK_CC: $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android24-clang

**JDK**, such as OpenJDK >= 17, to compile Java bindings of TVM Unity runtime. It could be installed via Homebrew on macOS, apt on Ubuntu or other package managers. Set up the following environment variable:

@@ -164,6 +164,6 @@ Instructions have been provided to build an Android App with MLC LLM in previous
.. code-block:: bash

adb install android/MLCChat/app/release/app-release.apk
adb push dist/${MODEL_NAME}-${QUANTIZATION}/params /data/local/tmp/${MODEL_NAME}/
adb push dist/${MODEL_NAME}-${QUANTIZATION}/params /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}/
adb shell "mkdir -p /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
adb shell "mv /data/local/tmp/${MODEL_NAME} /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/${MODEL_NAME}"
adb shell "mv /data/local/tmp/${MODEL_NAME}-${QUANTIZATION} /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -151,7 +151,7 @@ It is recommended to have at least 6GB free VRAM to run it.
- Redmi Note 12 Pro with Snapdragon 685
- Google Pixel phones

**Tutorial and source code**. The source code of the iOS app is fully `open source <https://github.com/mlc-ai/mlc-llm/tree/main/android>`__,
**Tutorial and source code**. The source code of the android app is fully `open source <https://github.com/mlc-ai/mlc-llm/tree/main/android>`__,
and a :doc:`tutorial <deploy/android>` is included in documentation.

.. figure:: https://blog.mlc.ai/img/android/android-recording.gif
13 changes: 7 additions & 6 deletions mlc_llm/build.py
@@ -40,17 +40,18 @@ def main():
# Post processing of arguments
parsed_args = core._parse_args(parsed_args) # pylint: disable=protected-access

# if num_shard>1 without -convert-weight-only or --build-model-only, we implicitly run it sequentially
if parsed_args.num_shards > 1 and not (parsed_args.build_model_only or parsed_args.convert_weight_only):
# if num_shard>1 without -convert-weight-only or --build-model-only, we implicitly run it sequentially
if parsed_args.num_shards > 1 and not (parsed_args.build_model_only or parsed_args.convert_weights_only):
parsed_args.build_model_only = True
parsed_args.convert_weight_only = False # just to be explicit
parsed_args.convert_weights_only = False # just to be explicit
core.build_model_from_args(parsed_args)

parsed_args.build_model_only = False
parsed_args.convert_weight_only = True
parsed_args.convert_weights_only = True
core.build_model_from_args(parsed_args)
else:
core.build_model_from_args(parsed_args)



if __name__ == "__main__":
main()
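
Taken together with commits 2fd1bf5 ("Add q4/q8_ft_group quantization mode") and aed3412 ("restore multi gpu support for FT quant"), a hypothetical sketch of driving this build entrypoint with the new mode follows. The mode name `q4f16_ft_group` and the exact flag spellings are assumptions inferred from the existing q4f16_ft mode; verify against `python3 -m mlc_llm.build --help`:

import sys
from mlc_llm import build

# Hypothetical invocation only; argument names are assumptions, not part of this PR.
sys.argv = [
    "build.py",
    "--model", "Llama-2-7b-chat-hf",
    "--quantization", "q4f16_ft_group",  # assumed name of the new mode (#1284)
    "--target", "cuda",
    "--num-shards", "2",  # multi-GPU FT-quant path restored in commit aed3412
]
build.main()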