android slm refactor
Kartik14 committed Dec 26, 2023
1 parent 7684c07 commit fb50e54
Showing 7 changed files with 56 additions and 48 deletions.
16 changes: 12 additions & 4 deletions android/library/prepare_model_lib.py
@@ -2,6 +2,7 @@
import os
from tvm.contrib import ndk


def main():
app_config = json.load(open("src/main/assets/app-config.json", "r"))
target = "android"
@@ -10,10 +11,17 @@ def main():
tar_list = []

for local_id in app_config["model_libs"]:
path = os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar")
if not os.path.isfile(path):
raise RuntimeError(f"Cannot find {path}")
tar_list.append(path)
paths = [
os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
]
valid_paths = [p for p in paths if os.path.isfile(p)]
if len(valid_paths) == 0:
raise RuntimeError(
f"Cannot find lib for {local_id} in the following candidate path: {paths}"
)
tar_list.append(valid_paths[0])

ndk.create_staticlib(os.path.join("build", "model_lib", "libmodel_android.a"), tar_list)
print(f"Creating lib from {tar_list}..")
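For reference, the lookup order introduced by this change can be read as a small standalone helper. The sketch below is distilled from the diff above; find_model_lib is a hypothetical name, not a function in the repository:

import os

def find_model_lib(artifact_path: str, local_id: str, target: str = "android") -> str:
    # Candidate locations, checked in order; the first tarball that exists wins.
    candidates = [
        os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
        os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
        os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
    ]
    valid = [p for p in candidates if os.path.isfile(p)]
    if not valid:
        raise RuntimeError(f"Cannot find lib for {local_id} in candidate paths: {candidates}")
    return valid[0]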
13 changes: 4 additions & 9 deletions android/library/src/main/assets/app-config.json
@@ -1,21 +1,16 @@
{
"model_libs": [
"Llama-2-7b-chat-hf-q4f16_0",
"Llama-2-7b-chat-hf-q4f16_1",
"RedPajama-INCITE-Chat-3B-v1-q4f16_1"
],
"model_list": [
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-Llama-2-7b-chat-hf-q4f16_0/",
"local_id": "Llama-2-7b-chat-hf-q4f16_0"
"model_url": "https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC",
"local_id": "Llama-2-7b-chat-hf-q4f16_1-MLC"
},
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-Llama-2-7b-chat-hf-q4f16_1/",
"local_id": "Llama-2-7b-chat-hf-q4f16_1"
},
{
"model_url": "https://huggingface.co/mlc-ai/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_1/",
"local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_1"
"model_url": "https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC",
"local_id": "RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC"
}
],
"add_model_samples": []
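To illustrate how the two sections of app-config.json relate, here is a hypothetical snippet (not part of the commit) that loads the file the same way prepare_model_lib.py does and prints each downloadable model next to its local_id:

import json

# Load the app configuration shipped in the APK's assets.
with open("src/main/assets/app-config.json", "r") as f:
    app_config = json.load(f)

# "model_libs" names the compiled libraries baked into the APK.
print("bundled libs:", app_config["model_libs"])

# "model_list" entries are downloaded over the network at run-time.
for entry in app_config["model_list"]:
    print(f'{entry["local_id"]} <- {entry["model_url"]}')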
55 changes: 28 additions & 27 deletions docs/deploy/android.rst
@@ -85,43 +85,45 @@ To deploy models on Android with reasonable performance, one has to cross-compil
git clone https://huggingface.co/meta-llama/$MODEL_NAME \
./dist/models/
**Compile Android-capable models**. Install TVM Unity compiler as a Python package, and then run the command below:
**Compile Android-capable models**. Install the TVM Unity compiler as a Python package, and then compile the model for Android using the following commands:

.. code-block:: bash
# Show help message
python3 -m mlc_llm.build --help
# Compile a PyTorch model
python3 -m mlc_llm.build \
--target android \
--max-seq-len 768 \
--model ./dist/models/$MODEL_NAME \
--quantization $QUANTIZATION
# 1. convert_weight: convert the model weights
mlc_chat convert_weight ./dist/models/$MODEL_NAME/ --quantization $QUANTIZATION -o dist/$MODEL_NAME-$QUANTIZATION-MLC/
This generates the directory ``./dist/$MODEL_NAME-$QUANTIZATION``, which contains the necessary components to run the model, as explained below.
# 2. gen_config: create mlc-chat-config.json
mlc_chat gen_config ./dist/models/$MODEL_NAME/ --quantization $QUANTIZATION \
--conv-template llama-2 --context-window-size 768 -o dist/${MODEL_NAME}-${QUANTIZATION}-MLC/
**Expected output format**. By default models are placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}``, and the result consists of 3 major components:
# 3. compile: compile the model library with the specification in mlc-chat-config.json
mlc_chat compile ./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/mlc-chat-config.json \
--device android -o ./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar
- Runtime configuration: It configures conversation templates (including system prompts), repetition penalty, sampling parameters (such as temperature and top-p probability), maximum sequence length, etc. It is usually named ``mlc-chat-config.json`` and sits under ``params/`` alongside the tokenizer configurations.
- Model lib: The compiled library that uses the mobile GPU. It is usually named ``${MODEL_NAME}-${QUANTIZATION}-android.tar``, for example, ``Llama-2-7b-chat-hf-q4f16_0-android.tar``.
- Model weights: The model weights are sharded as ``params_shard_*.bin`` under ``params/``, and the metadata is stored in ``ndarray-cache.json``.
This generates the directory ``./dist/$MODEL_NAME-$QUANTIZATION-MLC``, which contains the necessary components to run the model, as explained below.

**Expected output format**. By default models are placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}-MLC``, and the result consists of 3 major components:

- Runtime configuration: It configures conversation templates (including system prompts), repetition penalty, sampling parameters (such as temperature and top-p probability), maximum sequence length, etc. It is usually named ``mlc-chat-config.json`` and sits alongside the tokenizer configurations.
- Model lib: The compiled library that uses the mobile GPU. It is usually named ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar``, for example, ``Llama-2-7b-chat-hf-q4f16_1-MLC-android.tar``.
- Model weights: The model weights are sharded as ``params_shard_*.bin``, and the metadata is stored in ``ndarray-cache.json``.
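
As a quick sanity check, the presence of all three components can be verified with a short script. This is a sketch, assuming the default output layout described above (the directory name is illustrative):

.. code-block:: python

   import glob
   import os

   # Assumed default output directory from the steps above.
   out_dir = "./dist/Llama-2-7b-chat-hf-q4f16_1-MLC"

   assert os.path.isfile(os.path.join(out_dir, "mlc-chat-config.json"))  # runtime config
   assert glob.glob(os.path.join(out_dir, "*-android.tar"))              # model lib
   assert glob.glob(os.path.join(out_dir, "params_shard_*.bin"))         # model weights
   assert os.path.isfile(os.path.join(out_dir, "ndarray-cache.json"))    # weight metadata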

Create Android Project using Compiled Models
--------------------------------------------

The source code for MLC LLM is available under ``android/``, including scripts to build dependencies and the main app under ``android/MLCChat/`` that can be opened in Android Studio. Enter the directory first:
The source code for MLC LLM is available under ``android/``, including scripts to build dependencies. Enter the directory first:

.. code-block:: bash
cd ./android/
**Build necessary dependencies.** Configure the list of models the app comes with using the JSON file below, which, by default, is configured to use both Llama2-7B and RedPajama-3B:
**Build necessary dependencies.** Configure the list of models the app ships with using the JSON file ``app-config.json``. The ``model_libs`` field lists the model libraries that are bundled with and supported by the APK. Each ``model_url`` entry points to a model that is not bundled with the APK but is downloaded from the Internet at run-time. By default, the app is configured to use both the Llama2-7B and RedPajama-3B models. To change the configuration, edit ``app-config.json``:

.. code-block:: bash
vim ./MLCChat/app/src/main/assets/app-config.json
vim ./library/src/main/assets/app-config.json
Then bundle the Android library ``${MODEL_NAME}-${QUANTIZATION}-android.tar`` compiled by ``mlc_llm.build`` in the previous steps, with TVM Unity's Java runtime, by running the commands below:
Then bundle the Android library ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar`` compiled by ``mlc_chat compile`` in the previous steps, with TVM Unity's Java runtime, by running the commands below:

.. code-block:: bash
@@ -135,13 +137,12 @@ which generates the two files below:
./build/output/arm64-v8a/libtvm4j_runtime_packed.so
./build/output/tvm4j_core.jar
The model execution logic for mobile GPUs is incorporated into ``libtvm4j_runtime_packed.so``, while ``tvm4j_core.jar`` is a lightweight (~60 KB) `Java binding <https://tvm.apache.org/docs/reference/api/javadoc/>`_ to it. Copy them to the right path so the Android project can find them:
The model execution logic for mobile GPUs is incorporated into ``libtvm4j_runtime_packed.so``, while ``tvm4j_core.jar`` is a lightweight (~60 KB) `Java binding <https://tvm.apache.org/docs/reference/api/javadoc/>`_ to it.

.. code-block:: bash
cp -a ./build/output/. ./MLCChat/app/src/main/libs
.. note::
❗ ``./prepare_libs.sh`` expects the Android library, named ``${MODEL_NAME}-${QUANTIZATION}-MLC-android.tar``, to be placed under ``./dist/${MODEL_NAME}-${QUANTIZATION}-MLC/``, ``./dist/libs/``, or ``./dist/prebuilt/lib``. If you have compiled the model with a different name, please rename it accordingly.

**Build the Android app**. Open folder ``./android/MLCChat`` as an Android Studio Project. Connect your Android device to your machine. In the menu bar of Android Studio, click "Build → Make Project". Once the build is finished, click "Run → Run 'app'" and you will see the app launched on your phone.
**Build the Android app**. Open folder ``./android`` as an Android Studio Project. Connect your Android device to your machine. In the menu bar of Android Studio, click "Build → Make Project". Once the build is finished, click "Run → Run 'app'" and you will see the app launched on your phone.

.. note::
❗ This app cannot run in an emulator, so a physical phone is required, because MLC LLM needs an actual mobile GPU to run at a meaningful speed.
@@ -151,7 +152,7 @@ Incorporate Model Weights

Previous sections provided instructions to build an Android app with MLC LLM, but the resulting app downloads weights from HuggingFace at run-time, as configured under `model_url` in `app-config.json`. However, it can be desirable to bundle the weights into the app to avoid downloading them over the network. In this section, we provide a simple ADB-based walkthrough that hopefully helps with further development.

**Generating APK**. Enter Android Studio, and click "Build → Generate Signed Bundle/APK" to build an APK for release. If it is the first time you generate an APK, you will need to create a key according to `the official guide from Android <https://developer.android.com/studio/publish/app-signing#generate-key>`_. This APK will be placed under ``android/MLCChat/app/release/app-release.apk``.
**Generating APK**. In Android Studio, click "Build → Generate Signed Bundle/APK" to build an APK for release. If this is your first time generating an APK, you will need to create a key according to `the official guide from Android <https://developer.android.com/studio/publish/app-signing#generate-key>`_. The APK will be placed under ``android/app/release/app-release.apk``.

**Install ADB and USB debugging**. Enable "USB debugging" in your phone's developer-mode settings. In the SDK manager, install `Android SDK Platform-Tools <https://developer.android.com/studio/releases/platform-tools>`_. Add the platform-tools path to the environment variable ``PATH``. Run the following commands, and if ADB is installed correctly, your phone will appear as a device:

@@ -163,7 +164,7 @@ Instructions have been provided to build an Android App with MLC LLM in previous

.. code-block:: bash
adb install android/MLCChat/app/release/app-release.apk
adb push dist/${MODEL_NAME}-${QUANTIZATION}/params /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}/
adb install android/app/release/app-release.apk
adb push dist/${MODEL_NAME}-${QUANTIZATION}-MLC/ /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}-MLC/
adb shell "mkdir -p /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
adb shell "mv /data/local/tmp/${MODEL_NAME}-${QUANTIZATION} /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
adb shell "mv /data/local/tmp/${MODEL_NAME}-${QUANTIZATION}-MLC/ /storage/emulated/0/Android/data/ai.mlc.mlcchat/files/"
4 changes: 3 additions & 1 deletion ios/prepare_model_lib.py
@@ -2,6 +2,7 @@
import os
from tvm.contrib import cc


def main():
app_config = json.load(open("MLCChat/app-config.json", "r"))
target = "iphone"
@@ -12,7 +13,8 @@ def main():
for local_id in app_config["model_libs"]:
paths = [
os.path.join(artifact_path, local_id, f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar")
os.path.join(artifact_path, "libs", f"{local_id}-{target}.tar"),
os.path.join(artifact_path, "prebuilt", "lib", f"{local_id}-{target}.tar"),
]
valid_paths = [p for p in paths if os.path.isfile(p)]
if not valid_paths:
6 changes: 3 additions & 3 deletions python/mlc_chat/cli/compile.py
@@ -105,11 +105,11 @@ def _check_system_lib_prefix(prefix: str) -> str:
target, build_func = detect_target_and_host(parsed.device, parsed.host)
parsed.model_type = detect_model_type(parsed.model_type, parsed.model)
parsed.quantization = detect_quantization(parsed.quantization, parsed.model)
parsed.system_lib_prefix = detect_system_lib_prefix(
parsed.device, parsed.system_lib_prefix, parsed.model_type.name, parsed.quantization.name
)
with open(parsed.model, "r", encoding="utf-8") as config_file:
config = json.load(config_file)
parsed.system_lib_prefix = detect_system_lib_prefix(
parsed.device, parsed.system_lib_prefix, config["model_lib"]
)

compile(
config=config,
4 changes: 4 additions & 0 deletions python/mlc_chat/compiler/gen_config.py
@@ -24,6 +24,8 @@ class MLCChatConfig:  # pylint: disable=too-many-instance-attributes
"""Fields in the dumped `mlc-chat-config.json` file."""

model_type: str
model_lib: str
local_id: str
quantization: str
model_config: Dict[str, Any]
vocab_size: int
@@ -92,6 +94,8 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
).apply(model_config)
mlc_chat_config = MLCChatConfig(
model_type=model.name,
model_lib=output.name,
local_id=output.name,
quantization=quantization.name,
model_config=model_config.asdict(),
vocab_size=model_config.vocab_size,
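The practical effect is that mlc-chat-config.json now records which compiled library it pairs with. The snippet below is a sketch of the two new fields in a dumped config; the values are illustrative, and both fields are filled from output.name:

import json

# Illustrative values only; gen_config fills both new fields from the
# output directory name (output.name).
config_fields = {
    "model_type": "llama",
    "model_lib": "Llama-2-7b-chat-hf-q4f16_1-MLC",  # used to derive the system lib prefix
    "local_id": "Llama-2-7b-chat-hf-q4f16_1-MLC",   # identifies the model at run-time
    "quantization": "q4f16_1",
}
print(json.dumps(config_fields, indent=2))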
6 changes: 2 additions & 4 deletions python/mlc_chat/support/auto_target.py
@@ -268,9 +268,7 @@ def tvm_callback_cuda_compile(code, target):  # pylint: disable=unused-argument
return ptx


def detect_system_lib_prefix(
target_hint: str, prefix_hint: str, model_name: str, quantization: str
) -> str:
def detect_system_lib_prefix(target_hint: str, prefix_hint: str, model_lib: str) -> str:
"""Detect the iOS / Android system lib prefix to identify the library needed to load the app.
Parameters
@@ -282,7 +280,7 @@ def detect_system_lib_prefix(
The hint for the system lib prefix.
"""
if prefix_hint == "auto" and target_hint in ["iphone", "android"]:
prefix = f"{model_name}_{quantization}_".replace("-", "_")
prefix = f"{model_lib}_".replace("-", "_")
logger.warning(
"%s is automatically picked from the filename, %s, this allows us to use the filename "
"as the model_lib in android/iOS builds. Please avoid renaming the .tar file when "
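Concretely, the system lib prefix is now derived from the single model_lib string recorded in mlc-chat-config.json, rather than rebuilt from the model name and quantization. A minimal sketch of the new rule, with derive_prefix as a hypothetical stand-in for the updated line:

def derive_prefix(model_lib: str) -> str:
    # Mirrors the updated logic: dashes become underscores, with a trailing "_".
    return f"{model_lib}_".replace("-", "_")

# "Llama-2-7b-chat-hf-q4f16_1-MLC" -> "Llama_2_7b_chat_hf_q4f16_1_MLC_"
print(derive_prefix("Llama-2-7b-chat-hf-q4f16_1-MLC"))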
