2 changes: 1 addition & 1 deletion backends/qualcomm/_passes/layout_transform.py
@@ -103,8 +103,8 @@ class LayoutTransform(ExportPass):
exir_ops.edge.aten.pow.Tensor_Scalar,
exir_ops.edge.aten.prelu.default,
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.round.default,
exir_ops.edge.aten.relu.default,
exir_ops.edge.aten.round.default,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.split_with_sizes.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
6 changes: 4 additions & 2 deletions backends/qualcomm/quantizer/annotators.py
@@ -278,7 +278,9 @@ def annotate_masked_fill(node: Node, quantization_config: QuantizationConfig) ->
)


@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor])
@register_annotator(
[torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul_.Tensor]
)
def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None:
annotate_binary(node, quantization_config)

@@ -1311,7 +1313,7 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
)


@register_annotator([torch.ops.aten.zeros.default])
@register_annotator([torch.ops.aten.zeros.default, torch.ops.aten.zeros_like.default])
def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None:
if _is_annotated([node]) or not _is_float_tensor(node):
return
11 changes: 7 additions & 4 deletions backends/qualcomm/quantizer/custom_annotation.py
@@ -153,7 +153,9 @@ def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
)


def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
def annotate_matmul_16a8w( # noqa: C901
gm: torch.fx.GraphModule, annotate_conv=True
) -> None:
"""
This function is specific for matmul op 16a8w.
For k, we will tag such as the below, and
@@ -317,9 +319,10 @@ def annotate_matmul_input1(node: Node):
# The arguments of cat op: (the past kv cache, the new kv cache)
node = node.args[0][1]
elif node.target == torch.ops.aten.conv2d.default:
annotate_conv2d(
node, quantization_config=quantization_config_8a4w_per_channel
)
if annotate_conv:
annotate_conv2d(
node, quantization_config=quantization_config_8a4w_per_channel
)
break
elif node.target in [torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor]:
break
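The new `annotate_conv` flag above lets callers keep the 16a8w matmul tagging while skipping the 8a4w per-channel conv2d annotation. A minimal sketch of how a caller might use it, assuming the usual pattern where each custom annotation is a callable that receives the traced `GraphModule` (the import path is inferred from this file's location; the surrounding quantizer/export flow is not shown):

```python
from functools import partial

# Import path inferred from backends/qualcomm/quantizer/custom_annotation.py.
from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_matmul_16a8w,
)

# Bind annotate_conv=False so conv2d nodes are left untouched; matmul nodes
# still get the 16a8w annotation described in the function's docstring.
custom_annotations = (partial(annotate_matmul_16a8w, annotate_conv=False),)

# Each entry is later applied to the traced module, e.g.:
#   for annotate in custom_annotations:
#       annotate(graph_module)
```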
8 changes: 8 additions & 0 deletions backends/qualcomm/scripts/build.sh
@@ -85,6 +85,7 @@ if [ "$BUILD_AARCH64" = true ]; then
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DQNN_SDK_ROOT=$QNN_SDK_ROOT \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DANDROID_ABI='arm64-v8a' \
@@ -104,6 +105,9 @@ if [ "$BUILD_AARCH64" = true ]; then
-DANDROID_ABI='arm64-v8a' \
-DANDROID_PLATFORM=android-30 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DBUILD_TESTING=OFF \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
@@ -134,6 +138,7 @@ if [ "$BUILD_X86_64" = true ]; then
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-S $PRJ_ROOT \
-B $BUILD_ROOT \
@@ -157,6 +162,9 @@ if [ "$BUILD_X86_64" = true ]; then
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DBUILD_TESTING=OFF \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-B$EXAMPLE_ROOT

cmake --build $EXAMPLE_ROOT -j$BUILD_JOB_NUMBER
63 changes: 61 additions & 2 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -4049,7 +4049,7 @@ def test_llama3_2_1b(self):
"16a4w",
"--temperature",
"0",
"--llama_model",
"--decoder_model",
"llama3_2",
"--model_mode",
"hybrid",
@@ -4129,7 +4129,7 @@ def test_llama_stories_110m(self):
"16a4w",
"--temperature",
"0",
"--llama_model",
"--decoder_model",
"stories110m",
"--model_mode",
"hybrid",
@@ -4171,6 +4171,65 @@ def test_llama_stories_110m(self):
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai

def test_qwen2_5(self):
if not self.required_envs():
self.skipTest("missing required envs")

prompt = "My favourite condiment is "
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
f"{prompt}",
"--ptq",
"16a8w",
"--decoder_model",
"qwen2_5",
"--model_mode",
"hybrid",
"--prefill_ar_len",
"32",
"--max_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

# Accuracy is bad for now. Just check user's prompt is returned.
golden_start_with = "My favourite condiment is "
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
model_out = msg["result"][0]
self.assertTrue(
model_out.startswith(golden_start_with),
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
)
self.assertGreaterEqual(msg["inference_speed"], 95) # Lanai


class TestExampleOssScript(TestQNN):
def test_albert(self):
14 changes: 14 additions & 0 deletions examples/models/qwen2_5/config/0_5b_config.json
@@ -0,0 +1,14 @@
{
"dim": 896,
"ffn_dim_multiplier": 1,
"hidden_dim": 4864,
"n_heads": 14,
"n_kv_heads": 2,
"n_layers": 24,
"norm_eps": 1e-06,
"rope_theta": 1000000.0,
"use_scaled_rope": false,
"vocab_size": 151936,
"use_hf_rope": true,
"attention_qkv_bias": true
}
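The fields above follow the llama-style model-args naming used elsewhere under `examples/models`. A quick, purely illustrative sanity check of what the Qwen2.5 0.5B numbers imply (not part of the export flow):

```python
import json

# Path as added in this PR; adjust to the location of your checkout.
with open("examples/models/qwen2_5/config/0_5b_config.json") as f:
    cfg = json.load(f)

head_dim = cfg["dim"] // cfg["n_heads"]          # 896 // 14 = 64
kv_groups = cfg["n_heads"] // cfg["n_kv_heads"]  # 14 // 2 = 7 (grouped-query attention)
print(head_dim, kv_groups, cfg["vocab_size"])    # -> 64 7 151936
```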
4 changes: 2 additions & 2 deletions examples/qualcomm/CMakeLists.txt
@@ -77,8 +77,8 @@ target_include_directories(

# add tokenizers
add_subdirectory(
${EXECUTORCH_ROOT}/extension/llm/tokenizers
${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers
${EXECUTORCH_ROOT}/extension/llm/runner
${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/runner
)

# build qnn_executor_runner
9 changes: 9 additions & 0 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# model sharding with custom op
set(CUSTOM_OP_SRCS_FILE
"${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
@@ -63,14 +64,22 @@ target_link_libraries(
executorch_core
extension_data_loader
extension_flat_tensor
extension_llm_runner
extension_module
extension_tensor
tokenizers
gflags
custom_ops
quantized_ops_lib
quantized_kernels
tokenizers
)

target_include_directories(
qnn_llama_runner
PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)

target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
set_target_properties(
qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
3 changes: 2 additions & 1 deletion examples/qualcomm/oss_scripts/llama/README.md
@@ -1,10 +1,11 @@
# Summary

## Overview
This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
This file provides instructions for running LLM decoder models with different parameters via the Qualcomm HTP backend. We currently support the following models:
1. LLAMA2 Stories 110M
2. LLAMA3.2 1B
3. LLAMA3.2 3B
4. QWEN2.5 0.5B

We offer the following modes to execute the model:

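For Qwen2.5 specifically, the new `test_qwen2_5` case above drives the same `llama.py` entry point; a hedged sketch of an equivalent standalone invocation, with the flags copied from that test (build folder, SoC model, and device serial are placeholders to fill in):

```python
import subprocess

# Flags mirror test_qwen2_5 in backends/qualcomm/tests/test_qnn_delegate.py;
# the last three values are placeholders, not defaults.
cmd = [
    "python", "examples/qualcomm/oss_scripts/llama/llama.py",
    "--decoder_model", "qwen2_5",
    "--ptq", "16a8w",
    "--model_mode", "hybrid",
    "--prefill_ar_len", "32",
    "--max_seq_len", "128",
    "--prompt", "My favourite condiment is ",
    "--build_folder", "<build_folder>",
    "--model", "<soc_model>",
    "--device", "<device_serial>",
]
subprocess.run(cmd, check=True)
```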