android slm refactor
Kartik14 committed Dec 26, 2023
1 parent 7684c07 commit fb50e54
Showing 7 changed files with 56 additions and 48 deletions.
16 changes: 12 additions & 4 deletions android/library/prepare_model_lib.py
@@ -2,6 +2,7 @@
import os
from tvm.contrib import ndk


def main():
app_config = json.load(open("src/main/assets/app-config.json", "r"))
target = "android"
@@ -10,10 +11,17 @@ def main():
tar_list = []

for local_id in app_config["model_libs"]:
path = os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar")
if not os.path.isfile(path):
raise RuntimeError(f"Cannot find {path}")
tar_list.append(path)
paths = [
os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
]
valid_paths = [p for p in paths if os.path.isfile(p)]
if len(valid_paths) == 0:
raise RuntimeError(
f"Cannot find lib for {local_id} in the following candidate path: {paths}"
)
tar_list.append(valid_paths[0])

ndk.create_staticlib(os.path.join("build", "model_lib", "libmodel_android.a"), tar_list)
print(f"Creating lib from {tar_list}..")
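For reference, the lookup order introduced by this change can be read as a small standalone helper. The sketch below is distilled from the diff above; find_model_lib is a hypothetical name, not a function in the repository:

import os

def find_model_lib(artifact_path: str, local_id: str, target: str = "android") -> str:
    # Candidate locations, checked in order; the first tarball that exists wins.
    candidates = [
        os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
        os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
        os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
    ]
    valid = [p for p in candidates if os.path.isfile(p)]
    if not valid:
        raise RuntimeError(f"Cannot find lib for {local_id} in candidate paths: {candidates}")
    return valid[0]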
13 changes: 4 additions & 9 deletions android/library/src/main/assets/app-config.json
@@ -1,21 +1,16 @@
{
"model_libs": [
"Llama-2-7b-chat-hf-q4f16_0",
"Llama-2-7b-chat-hf-q4f16_1",
"RedPajama-INCITE-Chat-3B-v1-q4f16_1"
],
"model_list": [
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-Llama-2-7b-chat-hf-q4f16_0/",
"local_id": "Llama-2-7b-chat-hf-q4f16_0"
"model_url": "https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC",
"local_id": "Llama-2-7b-chat-hf-q4f16_1-MLC"
},
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-Llama-2-7b-chat-hf-q4f16_1/",
"local_id": "Llama-2-7b-chat-hf-q4f16_1"
},
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_1/",
"local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_1"
"model_url": "https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC",
"local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC"
}
],
"add_model_samples": []
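To illustrate how the two sections of app-config.json relate, here is a hypothetical snippet (not part of the commit) that loads the file the same way prepare_model_lib.py does and prints each downloadable model next to its local_id:

import json

# Load the app configuration shipped in the APK's assets.
with open("src/main/assets/app-config.json", "r") as f:
    app_config = json.load(f)

# "model_libs" names the compiled libraries baked into the APK.
print("bundled libs:", app_config["model_libs"])

# "model_list" entries are downloaded over the network at run-time.
for entry in app_config["model_list"]:
    print(f'{entry["local_id"]} <- {entry["model_url"]}')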
55 changes: 28 additions & 27 deletions docs/deploy/android.rst
@@ -85,43 +85,45 @@ To deploy models on Android with reasonable performance, one has to cross-compil
git clone https://huggingface.co/meta-llama/$MODEL_NAME \
./dist/models/
**Compile Android-capable models**. Install TVM Unity compiler as a Python package, and then run the command below:
**Compile Android-capable models**. Install the TVM Unity compiler as a Python package, and then compile the model for Android using the following commands:

.. code-block:: bash
# Show help message
python3 -m mlc_llm.build --help
# Compile a PyTorch model
python3 -m mlc_llm.build \
--target android \
--max-seq-len 768 \
--model ./dist/models/$MODEL_NAME \
--quantization $QUANTIZATION
# 1. convert_weight: convert the model weights
mlc_chat convert_weight ./dist/models/$MODEL_NAME/ --quantization $QUANTIZATION -o dist/$MODEL_NAME-$QUANTIZATION-MLC/
This generates the directory ``./dist/$MODEL_NAME-$QUANTIZATION``, which contains the necessary components to run the model, as explained below.
# 2. gen_config: create mlc-chat-config.json
mlc_chat gen_config ./dist/models/$MODEL_NAME/ --quantization $QUANTIZATION \
--conv-template llama-2 --context-window-size 768 -o dist/${MODEL_NAME}-${QUANTIZATION}-MLC/
**Expected output format**. By default models are placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}``, and the result consists of 3 major components:
# 3. compile: compile the model library with the specification in mlc-chat-config.json
mlc_chat compile ./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/mlc-chat-config.json \
--device android -o ./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar
- Runtime configuration: It configures conversation templates (including system prompts), repetition penalty, sampling parameters (such as temperature and top-p probability), maximum sequence length, etc. It is usually named ``mlc-chat-config.json`` and sits under ``params/`` alongside the tokenizer configurations.
- Model lib: The compiled library that uses the mobile GPU. It is usually named ``${MODEL_NAME}-${QUANTIZATION}-android.tar``, for example, ``Llama-2-7b-chat-hf-q4f16_0-android.tar``.
- Model weights: The model weights are sharded as ``params_shard_*.bin`` under ``params/``, and the metadata is stored in ``ndarray-cache.json``.
This generates the directory ``./dist/$MODEL_NAME-$QUANTIZATION-MLC``, which contains the necessary components to run the model, as explained below.

**Expected output format**. By default models are placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}-MLC``, and the result consists of 3 major components:

- Runtime configuration: It configures conversation templates (including system prompts), repetition penalty, sampling parameters (such as temperature and top-p probability), maximum sequence length, etc. It is usually named ``mlc-chat-config.json`` and sits alongside the tokenizer configurations.
- Model lib: The compiled library that uses the mobile GPU. It is usually named ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar``, for example, ``Llama-2-7b-chat-hf-q4f16_1-MLC-android.tar``.
- Model weights: The model weights are sharded as ``params_shard_*.bin``, and the metadata is stored in ``ndarray-cache.json``.
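
As a quick sanity check, the presence of all three components can be verified with a short script. This is a sketch, assuming the default output layout described above (the directory name is illustrative):

.. code-block:: python

   import glob
   import os

   # Assumed default output directory from the steps above.
   out_dir = "./dist/Llama-2-7b-chat-hf-q4f16_1-MLC"

   assert os.path.isfile(os.path.join(out_dir, "mlc-chat-config.json"))  # runtime config
   assert glob.glob(os.path.join(out_dir, "*-android.tar"))              # model lib
   assert glob.glob(os.path.join(out_dir, "params_shard_*.bin"))         # model weights
   assert os.path.isfile(os.path.join(out_dir, "ndarray-cache.json"))    # weight metadata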

Create Android Project using Compiled Models
--------------------------------------------

The source code for MLC LLM is available under ``android/``, including scripts to build dependencies and the main app under ``android/MLCChat/`` that can be opened in Android Studio. Enter the directory first:
The source code for MLC LLM is available under ``android/``, including scripts to build dependencies. Enter the directory first:

.. code-block:: bash
cd ./android/
**Build necessary dependencies.** Configure the list of models the app comes with using the JSON file below, which, by default, is configured to use both Llama2-7B and RedPajama-3B:
**Build necessary dependencies.** Configure the list of models the app ships with using the JSON file ``app-config.json``. The ``model_libs`` field lists the model libraries that are bundled with and supported by the APK. Each ``model_url`` entry points to a model that is not bundled with the APK but is downloaded from the Internet at run-time. By default, the app is configured to use both the Llama2-7B and RedPajama-3B models. To change the configuration, edit ``app-config.json``:

.. code-block:: bash
vim ./MLCChat/app/src/main/assets/app-config.json
vim ./library/src/main/assets/app-config.json
Then bundle the Android library ``${MODEL_NAME}-${QUANTIZATION}-android.tar`` compiled by ``mlc_llm.build`` in the previous steps, with TVM Unity's Java runtime, by running the commands below:
Then bundle the Android library ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar`` compiled by ``mlc_chat compile`` in the previous steps, with TVM Unity's Java runtime, by running the commands below:

.. code-block:: bash
@@ -135,13 +137,12 @@ which generates the two files below:
./build/output/arm64-v8a/libtvm4j_runtime_packed.so
./build/output/tvm4j_core.jar
The model execution logic for mobile GPUs is incorporated into ``libtvm4j_runtime_packed.so``, while ``tvm4j_core.jar`` is a lightweight (~60 KB) `Java binding <https://tvm.apache.org/docs/reference/api/javadoc/>`_ to it. Copy them to the right path so the Android project can find them:
The model execution logic for mobile GPUs is incorporated into ``libtvm4j_runtime_packed.so``, while ``tvm4j_core.jar`` is a lightweight (~60 KB) `Java binding <https://tvm.apache.org/docs/reference/api/javadoc/>`_ to it.

.. code-block:: bash
cp -a ./build/output/. ./MLCChat/app/src/main/libs
.. note::
❗ ``./prepare_libs.sh`` expects the Android library, named ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar``, to be placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/``, ``./dist/libs/``, or ``./dist/prebuilt/lib``. If you have compiled the model with a different name, please rename it accordingly.

**Build the Android app**. Open folder ``./android/MLCChat`` as an Android Studio Project. Connect your Android device to your machine. In the menu bar of Android Studio, click "Build → Make Project". Once the build is finished, click "Run → Run 'app'" and you will see the app launched on your phone.
**Build the Android app**. Open folder ``./android`` as an Android Studio Project. Connect your Android device to your machine. In the menu bar of Android Studio, click "Build → Make Project". Once the build is finished, click "Run → Run 'app'" and you will see the app launched on your phone.

.. note::
❗ This app cannot run in an emulator, so a physical phone is required, because MLC LLM needs an actual mobile GPU to run at a meaningful speed.
@@ -151,7 +152,7 @@ Incorporate Model Weights

Previous sections provided instructions to build an Android app with MLC LLM, but the resulting app downloads weights from HuggingFace at run-time, as configured under `model_url` in `app-config.json`. However, it can be desirable to bundle the weights into the app to avoid downloading them over the network. In this section, we provide a simple ADB-based walkthrough that hopefully helps with further development.

**Generating APK**. Enter Android Studio, and click "Build → Generate Signed Bundle/APK" to build an APK for release. If it is the first time you generate an APK, you will need to create a key according to `the official guide from Android <https://developer.android.com/studio/publish/app-signing#generate-key>`_. This APK will be placed under ``android/MLCChat/app/release/app-release.apk``.
**Generating APK**. In Android Studio, click "Build → Generate Signed Bundle/APK" to build an APK for release. If this is your first time generating an APK, you will need to create a key according to `the official guide from Android <https://developer.android.com/studio/publish/app-signing#generate-key>`_. The APK will be placed under ``android/app/release/app-release.apk``.

**Install ADB and USB debugging**. Enable "USB debugging" in your phone's developer-mode settings. In the SDK manager, install `Android SDK Platform-Tools <https://developer.android.com/studio/releases/platform-tools>`_. Add the platform-tools path to the environment variable ``PATH``. Run the following commands, and if ADB is installed correctly, your phone will appear as a device:

@@ -163,7 +164,7 @@ Instructions have been provided to build an Android App with MLC LLM in previous

.. code-block:: bash
adb install android/MLCChat/app/release/app-release.apk
adb push dist/${MODEL_NAME}-${QUANTIZATION}/params /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}/
adb install android/app/release/app-release.apk
adb push dist/${MODEL_NAME}-${QUANTIZATION}-MLC/ /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}-MLC/
adb shell "mkdir -p /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
adb shell "mv /data/local/tmp/${MODEL_NAME}-${QUANTIZATION} /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
adb shell "mv /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}-MLC/ /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
4 changes: 3 additions & 1 deletion ios/prepare_model_lib.py
@@ -2,6 +2,7 @@
import os
from tvm.contrib import cc


def main():
app_config = json.load(open("MLCChat/app-config.json", "r"))
target = "iphone"
@@ -12,7 +13,8 @@ def main():
for local_id in app_config["model_libs"]:
paths = [
os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar")
os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
]
valid_paths = [p for p in paths if os.path.isfile(p)]
if not valid_paths:
6 changes: 3 additions & 3 deletions python/mlc_chat/cli/compile.py
@@ -105,11 +105,11 @@ def _check_system_lib_prefix(prefix: str) -> str:
target, build_func = detect_target_and_host(parsed.device, parsed.host)
parsed.model_type = detect_model_type(parsed.model_type, parsed.model)
parsed.quantization = detect_quantization(parsed.quantization, parsed.model)
parsed.system_lib_prefix = detect_system_lib_prefix(
parsed.device, parsed.system_lib_prefix, parsed.model_type.name, parsed.quantization.name
)
with open(parsed.model, "r", encoding="utf-8") as config_file:
config = json.load(config_file)
parsed.system_lib_prefix = detect_system_lib_prefix(
parsed.device, parsed.system_lib_prefix, config["model_lib"]
)

compile(
config=config,
4 changes: 4 additions & 0 deletions python/mlc_chat/compiler/gen_config.py
@@ -24,6 +24,8 @@ class MLCChatConfig:  # pylint: disable=too-many-instance-attributes
"""Fields in the dumped `mlc-chat-config.json` file."""

model_type: str
model_lib: str
local_id: str
quantization: str
model_config: Dict[str, Any]
vocab_size: int
@@ -92,6 +94,8 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
).apply(model_config)
mlc_chat_config = MLCChatConfig(
model_type=model.name,
model_lib=output.name,
local_id=output.name,
quantization=quantization.name,
model_config=model_config.asdict(),
vocab_size=model_config.vocab_size,
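The practical effect is that mlc-chat-config.json now records which compiled library it pairs with. The snippet below is a sketch of the two new fields in a dumped config; the values are illustrative, and both fields are filled from output.name:

import json

# Illustrative values only; gen_config fills both new fields from the
# output directory name (output.name).
config_fields = {
    "model_type": "llama",
    "model_lib": "Llama-2-7b-chat-hf-q4f16_1-MLC",  # used to derive the system lib prefix
    "local_id": "Llama-2-7b-chat-hf-q4f16_1-MLC",   # identifies the model at run-time
    "quantization": "q4f16_1",
}
print(json.dumps(config_fields, indent=2))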
6 changes: 2 additions & 4 deletions python/mlc_chat/support/auto_target.py
@@ -268,9 +268,7 @@ def tvm_callback_cuda_compile(code, target):  # pylint: disable=unused-argument
return ptx


def detect_system_lib_prefix(
target_hint: str, prefix_hint: str, model_name: str, quantization: str
) -> str:
def detect_system_lib_prefix(target_hint: str, prefix_hint: str, model_lib: str) -> str:
"""Detect the iOS / Android system lib prefix to identify the library needed to load the app.
Parameters
@@ -282,7 +280,7 @@ def detect_system_lib_prefix(
The hint for the system lib prefix.
"""
if prefix_hint == "auto" and target_hint in ["iphone", "android"]:
prefix = f"{model_name}_{quantization}_".replace("-", "_")
prefix = f"{model_lib}_".replace("-", "_")
logger.warning(
"%s is automatically picked from the filename, %s, this allows us to use the filename "
"as the model_lib in android/iOS builds. Please avoid renaming the .tar file when "
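Concretely, the system lib prefix is now derived from the single model_lib string recorded in mlc-chat-config.json, rather than rebuilt from the model name and quantization. A minimal sketch of the new rule, with derive_prefix as a hypothetical stand-in for the updated line:

def derive_prefix(model_lib: str) -> str:
    # Mirrors the updated logic: dashes become underscores, with a trailing "_".
    return f"{model_lib}_".replace("-", "_")

# "Llama-2-7b-chat-hf-q4f16_1-MLC" -> "Llama_2_7b_chat_hf_q4f16_1_MLC_"
print(derive_prefix("Llama-2-7b-chat-hf-q4f16_1-MLC"))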
