diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh
index ad3b491a992..a5f194ba0b9 100644
--- a/.ci/scripts/test_qnn_static_llama.sh
+++ b/.ci/scripts/test_qnn_static_llama.sh
@@ -41,6 +41,10 @@ exit_code1=$?
 $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
+# Check BC
+bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
+exit_code3=$?
+
 # Check the exit codes and print messages
 if [ $exit_code1 -ne 0 ]; then
   echo "Static Llama compile only with weight sharing test failed. $exit_code1."
@@ -50,8 +54,12 @@ if [ $exit_code2 -ne 0 ]; then
   echo "Static Llama accuracy test failed. $exit_code2."
 fi
 
+if [ $exit_code3 -ne 0 ]; then
+  echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3."
+fi
+
 # Return failure if either program failed
-if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
   exit 1
 else
   exit 0
diff --git a/backends/qualcomm/bc/test_qnn_static_llama_bc.sh b/backends/qualcomm/bc/test_qnn_static_llama_bc.sh
new file mode 100644
index 00000000000..c76485a664c
--- /dev/null
+++ b/backends/qualcomm/bc/test_qnn_static_llama_bc.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+
+llama_artifacts="."
+PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"
+
+# Download stories260K.pt and tokenizer from Hugging Face
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
+curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
+# Create params.json file
+touch params.json
+echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
+
+# Checks e2e accuracy with a freshly compiled PTE
+expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
+exit_code1=$?
+
+# Checks accuracy with the precompiled PTE
+output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
+exit_code2=$?
+
+if [[ "$output" == "$expected" ]]; then
+  echo "[BACKWARD COMPATIBILITY CHECK] Output matches expected result."
+else
+  echo "[BACKWARD COMPATIBILITY CHECK] Output mismatch!"
+  echo "[BACKWARD COMPATIBILITY CHECK] Expected: $expected"
+  echo "[BACKWARD COMPATIBILITY CHECK] Actual: $output"
+  exit 1
+fi
+
+# Check the exit codes and print messages
+if [ $exit_code1 -ne 0 ]; then
+  echo "Static Llama e2e accuracy test failed. $exit_code1."
+fi
+
+if [ $exit_code2 -ne 0 ]; then
+  echo "Static Llama precompiled PTE execution test failed. $exit_code2."
+fi
+
+# Return failure if either program failed
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+  exit 1
+else
+  exit 0
+fi
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 2f580cb71b2..51480661bd3 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4094,6 +4094,84 @@ def test_llama3_2_1b(self):
         if not self.compile_only and not self.enable_x86_64:
             self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
 
+    def test_llama_stories_260k(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories260K.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--decoder_model",
+            "stories260k",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    print(f"Model CI result:{model_out[: len(golden_start_with)]}")
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 2020000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 1600)  # Lanai
+
     def test_llama_stories_110m(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
diff --git a/examples/qualcomm/oss_scripts/llama/artifacts/README.md b/examples/qualcomm/oss_scripts/llama/artifacts/README.md
new file mode 100644
index 00000000000..f0e96aee711
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/artifacts/README.md
@@ -0,0 +1,47 @@
+# Artifacts folder for LLaMA backward compatibility validation
+This folder contains the stories260K (a smaller LLaMA variant) .pte artifact for backward compatibility (BC) validation in CI pipelines.
+
+Model source: [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
+
+## Purpose
+The .pte files stored here serve as reference PTEs to ensure that changes to ExecuTorch do not introduce backward-incompatible changes.
+
+These files are used in CI to:
+1. Compile the story llama PTE with the previous (n-1) commit.
+2. Run and validate it with the current (n) commit.
+
+We use the stories260K model because it is a minimal LLaMA variant, making it ideal for efficient validation in CI pipelines.
+
+## File Structure
+- stories260k_hybrid_llama_qnn.pte: precompiled story llama PTE used for backward compatibility validation.
+## Updating Artifacts
+To update the .pte file, follow these steps:
+
+1. Check out the latest commit before all your changes.
+
+2. Download and prepare the stories260K model:
+
+```bash
+# tokenizer.model & stories260K.pt:
+wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt"
+wget -O tokenizer.model "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model"
+
+# tokenizer.bin:
+python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+
+# params.json:
+echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
+```
+
+3. Run the following command to regenerate and update the .pte file:
+
+```bash
+# Compile with weight sharing disabled, since x86 does not support weight sharing.
+python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./examples/qualcomm/oss_scripts/llama/artifacts --llama_artifacts . --enable_x86_64 --compile_only
+
+```
+4. Commit the stories260k_hybrid_llama_qnn.pte file to the repository.
+
+5. Update this README if necessary, then commit your changes.
+
+Note: The .pte file is large (~2MB). In the future, we may host it on Hugging Face and download it during CI to reduce repository size.
diff --git a/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte
new file mode 100644
index 00000000000..198b96e5b9b
Binary files /dev/null and b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte differ
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index 21a61e33992..388662940f2 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -616,6 +616,9 @@ def compile(args, pte_filename, tokenizer):
     if "model" in state_dict:
         state_dict = state_dict["model"]
 
+    if args.decoder_model == "stories260k":
+        state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
     # Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
     def permute(w, heads):
         dim_0 = w.size(0)
@@ -751,7 +754,7 @@ def permute(w, heads):
                 annotate_conv=args.ptq != "16a8w",
             ),
         )
-        if args.decoder_model == "stories110m":
+        if args.decoder_model in {"stories110m", "stories260k"}:
             custom_annotations = custom_annotations + (
                 annotate_linear_16a8w_in_affine_layer,
             )
@@ -946,7 +949,7 @@ def post_process():
             f"--model_path {pte_path}",
             f"--seq_len {seq_len}",
             f"--output_path {args.artifact}/outputs/outputs.txt",
-            f"--performance_output_path {performance_output_path}",
+            f"--performance_output_path {args.artifact}/{performance_output_path}",
             f"--kv_updater ShiftPointer",
             runner_args,
         ]
@@ -995,7 +998,9 @@ def post_process():
     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
         inference_speed = 0
-        with open(f"{args.artifact}/{performance_output_path}", "r") as f:
+        with open(
+            f"{os.path.abspath(args.artifact)}/{performance_output_path}", "r"
+        ) as f:
             inference_speed = float(f.read())
 
         pte_size = os.path.getsize(pte_path)
@@ -1033,8 +1038,8 @@ def _build_parser():
 
     parser.add_argument(
         "--decoder_model",
-        choices=["stories110m", "llama3_2", "qwen2_5"],
-        help="The Llama model to export. Current available options are: [stories110m, llama3_2, qwen2_5]",
+        choices=["stories260k", "stories110m", "llama3_2", "qwen2_5"],
+        help="The Llama model to export. Current available options are: [stories260k, stories110m, llama3_2, qwen2_5]",
         required=True,
     )
 
@@ -1208,16 +1213,19 @@ def export_llama(args) -> None:
     else:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
 
+    if args.decoder_model == "stories260k":
+        pte_filename = f"{args.decoder_model}_" + pte_filename
+
     tokenizer = None
     runtime_tokenizer_path, decoder_model_version = "", ""
-    if args.decoder_model == "stories110m":
+    if args.decoder_model in {"stories110m", "stories260k"}:
         tokenizer = get_tokenizer(args.tokenizer_model)
         assert isinstance(
             tokenizer, SentencePieceTokenizer
-        ), f"Wrong tokenizer provided for stories110m."
+        ), "Wrong tokenizer provided for stories models."
         assert (
             args.tokenizer_bin is not None
-        ), "Please provide tokenizer_bin for stories110m."
+        ), "Please provide tokenizer_bin for stories models."
         runtime_tokenizer_path = args.tokenizer_bin
         decoder_model_version = "llama2"
     elif args.decoder_model == "llama3_2":
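A minimal usage sketch (not part of the patch) for exercising the new backward-compatibility check locally. It assumes the working directory is the ExecuTorch repo root and that `build-x86/` already contains an x86_64 build with the QNN backend enabled, since the script hard-codes `--build_folder build-x86/` and `--executorch_root .`:

```bash
# Invoke the BC check the same way .ci/scripts/test_qnn_static_llama.sh does.
# PYTHON_EXECUTABLE is optional; the script falls back to python3.
PYTHON_EXECUTABLE=python3 bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
# Exit status 0 means the freshly compiled stories260K PTE and the committed
# stories260k_hybrid_llama_qnn.pte produced matching "Model CI result:" lines.
echo "BC check exit status: $?"
```

The script exits non-zero when either test run fails or when the precompiled PTE's output diverges from the freshly compiled one, which the CI wrapper surfaces through `exit_code3`.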