From 87a3c54fa1eed5d2c14593faa9ff53f5be7f4b5f Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 15:35:01 -0800
Subject: [PATCH 01/13] Add ANE static llama CI test and preserve reciprocal op in CoreML llama export

---
 .ci/scripts/test_ane_static_llama.sh          | 51 +++++++++++++++++++
 .github/workflows/pull.yml                    | 28 ++++++++++
 examples/apple/coreml/llama/export.py         |  1 +
 .../apple/coreml/llama/llama_transformer.py   |  6 ++-
 4 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 .ci/scripts/test_ane_static_llama.sh

diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh
new file mode 100644
index 00000000000..b671a6cf5e7
--- /dev/null
+++ b/.ci/scripts/test_ane_static_llama.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+# Fall back to python3 when the caller leaves PYTHON_EXECUTABLE unset or empty.
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
+
+which "${PYTHON_EXECUTABLE}"
+
+# Quote the path so a checkout directory containing spaces or glob characters still works.
+pushd "${EXECUTORCH_ROOT}/examples/apple/coreml/llama"
+
+# Download stories llama110m artifacts
+download_stories_model_artifacts
+
+python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
+
+
+python run.py -m model.pte -t tokenizer.model --prompt "Once upon a time," --temperature 0.0 &> tmp.txt
+tail -n +6 tmp.txt &> output.txt
+
+cat output.txt
+
+printf 'Once upon a time,there was a little girl named L ily . She loved to play outside in the sun sh ine . One day , she saw ' &> expected.txt
+
+
+if diff output.txt expected.txt > /dev/null; then
+    echo "Output matches."
+else
+    echo "Output does not match."
+    echo "\n\nExpected:"
+    cat expected.txt
+
+    echo "\n\nGot:"
+    cat output.txt
+
+    echo "\n\nDiff:"
+    diff output.txt expected.txt
+    exit 1
+fi
+
+popd
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 5cc0d3c597b..6cbf8ae7db8 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -495,6 +495,34 @@ jobs:
         # Test static llama weight sharing and accuracy
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Setup MacOS dependencies as there is no Docker support on MacOS atm
+        PYTHON_EXECUTABLE=python \
+        EXECUTORCH_BUILD_PYBIND=ON \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
+        ${CONDA_RUN} --no-capture-output \
+        .ci/scripts/setup-macos.sh "$@"
+
+        # Install llama3_2_vision dependencies.
+        PYTHON_EXECUTABLE=python \
+        ${CONDA_RUN} --no-capture-output \
+        ./examples/models/llama3_2_vision/install_requirements.sh
+
+        # Test ANE llama
+        sh .ci/scripts/test_ane_static_llama.sh
+
   test-qnn-models-linux:
     name: test-qnn-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py
index c0f60529895..f440dc878d4 100644
--- a/examples/apple/coreml/llama/export.py
+++ b/examples/apple/coreml/llama/export.py
@@ -203,6 +203,7 @@ def main() -> None:
             torch.ops.aten.scaled_dot_product_attention.default,
             # preserve norm op for numerical stability
             torch.ops.aten.linalg_vector_norm.default,
+            torch.ops.aten.reciprocal.default,
         ],
         compile_config=EdgeCompileConfig(
             _check_ir_validity=False,
diff --git a/examples/apple/coreml/llama/llama_transformer.py b/examples/apple/coreml/llama/llama_transformer.py
index 2ce4c1d2b5b..3c371da4c00 100644
--- a/examples/apple/coreml/llama/llama_transformer.py
+++ b/examples/apple/coreml/llama/llama_transformer.py
@@ -134,8 +134,10 @@ def _norm(self, x):
         # We have yet to do large scale evaluations on the numeric stability of this solution, but note that
         # it appears better than what exists currently (removing FP32 casts and using FP16)
         rms_norm_eps0 = (
-            x * torch.sqrt(torch.tensor(self.dim, dtype=x.dtype))
-        ) / torch.linalg.vector_norm(x, dim=-1, keepdim=True)
+            x
+            * torch.sqrt(torch.tensor(self.dim, dtype=x.dtype))
+            * torch.reciprocal(torch.linalg.vector_norm(x, dim=-1, keepdim=True))
+        )
         return rms_norm_eps0
 
     def forward(self, x):

From ee6a29419df996c366efb3b64bf21b559115066c Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 15:52:44 -0800
Subject: [PATCH 02/13] Export BUILD_TOOL=cmake in test-static-llama-ane job

---
 .github/workflows/pull.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 6cbf8ae7db8..5666d5d6f41 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -505,6 +505,7 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
+        export BUILD_TOOL="cmake"
         bash .ci/scripts/setup-conda.sh
         eval "$(conda shell.bash hook)"
 

From 9ec6231beb7a4fd69e0e3056d5b29068074a801e Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 16:26:48 -0800
Subject: [PATCH 03/13] Pass BUILD_TOOL inline to setup-macos.sh and drop MPS/XNNPACK CMake flags

---
 .github/workflows/pull.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 5666d5d6f41..74dba00540c 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -505,14 +505,14 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
-        export BUILD_TOOL="cmake"
         bash .ci/scripts/setup-conda.sh
         eval "$(conda shell.bash hook)"
 
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
+        BUILD_TOOL=cmake \
         PYTHON_EXECUTABLE=python \
         EXECUTORCH_BUILD_PYBIND=ON \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
         ${CONDA_RUN} --no-capture-output \
         .ci/scripts/setup-macos.sh "$@"
 

From 0c627f11b8cd1ff5a96cc46d5f4e196272c60c79 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 16:43:22 -0800
Subject: [PATCH 04/13] Pass explicit positional arguments to setup-macos.sh in ANE job

---
 .github/workflows/pull.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 74dba00540c..7acb9a949c1 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -509,12 +509,12 @@ jobs:
         eval "$(conda shell.bash hook)"
 
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
-        BUILD_TOOL=cmake \
         PYTHON_EXECUTABLE=python \
         EXECUTORCH_BUILD_PYBIND=ON \
         CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
         ${CONDA_RUN} --no-capture-output \
-        .ci/scripts/setup-macos.sh "$@"
+        .ci/scripts/setup-macos.sh cmake debug false
+
 
         # Install llama3_2_vision dependencies.
         PYTHON_EXECUTABLE=python \

From 4d4b585c0565b734bfd339043c1ca9a9cc49f3b6 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 17:00:26 -0800
Subject: [PATCH 05/13] Use named flags when invoking setup-macos.sh in ANE job

---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 7acb9a949c1..a596d620321 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -513,7 +513,7 @@ jobs:
         EXECUTORCH_BUILD_PYBIND=ON \
         CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
         ${CONDA_RUN} --no-capture-output \
-        .ci/scripts/setup-macos.sh cmake debug false
+        .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Debug --editable false
 
 
         # Install llama3_2_vision dependencies.

From 7b57ae3dfb51fdcf2b7b889de9e4574f6a82bd13 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 17:13:29 -0800
Subject: [PATCH 06/13] Run CoreML install_requirements.sh in ANE job

---
 .github/workflows/pull.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index a596d620321..7b7324fb7f7 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -521,6 +521,9 @@ jobs:
         ${CONDA_RUN} --no-capture-output \
         ./examples/models/llama3_2_vision/install_requirements.sh
 
+        # Install coreml
+        sh ./backends/apple/coreml/scripts/install_requirements.sh
+
         # Test ANE llama
         sh .ci/scripts/test_ane_static_llama.sh
 

From 1ba0f20628e8af11157e87ab5e74fc16f01faf02 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 17:57:57 -0800
Subject: [PATCH 07/13] Install coremltools with pip instead of the CoreML requirements script

---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 7b7324fb7f7..16c81a3c0b6 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -522,7 +522,7 @@ jobs:
         ./examples/models/llama3_2_vision/install_requirements.sh
 
         # Install coreml
-        sh ./backends/apple/coreml/scripts/install_requirements.sh
+        pip install coremltools
 
         # Test ANE llama
         sh .ci/scripts/test_ane_static_llama.sh

From 74ecaaf4a4f376278590962fd99c1c90c6b48f03 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 20:12:54 -0800
Subject: [PATCH 08/13] Run top-level install_requirements.sh in ANE job

---
 .github/workflows/pull.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 16c81a3c0b6..df1b4f9ece4 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -523,6 +523,7 @@ jobs:
 
         # Install coreml
         pip install coremltools
+        sh install_requirements.sh
 
         # Test ANE llama
         sh .ci/scripts/test_ane_static_llama.sh

From 37339518cca9dde3899e3836ff47007afaf68742 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 20:33:39 -0800
Subject: [PATCH 09/13] Comment out setup-macos.sh steps and install via install_executorch.py --pybind coreml

---
 .github/workflows/pull.yml | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index df1b4f9ece4..e4b7a49be5e 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -508,22 +508,25 @@ jobs:
         bash .ci/scripts/setup-conda.sh
         eval "$(conda shell.bash hook)"
 
-        # Setup MacOS dependencies as there is no Docker support on MacOS atm
-        PYTHON_EXECUTABLE=python \
-        EXECUTORCH_BUILD_PYBIND=ON \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
-        ${CONDA_RUN} --no-capture-output \
-        .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Debug --editable false
+        # # Setup MacOS dependencies as there is no Docker support on MacOS atm
+        # PYTHON_EXECUTABLE=python \
+        # EXECUTORCH_BUILD_PYBIND=ON \
+        # CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
+        # ${CONDA_RUN} --no-capture-output \
+        # .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Debug --editable false
 
 
-        # Install llama3_2_vision dependencies.
-        PYTHON_EXECUTABLE=python \
-        ${CONDA_RUN} --no-capture-output \
-        ./examples/models/llama3_2_vision/install_requirements.sh
+        # # Install llama3_2_vision dependencies.
+        # PYTHON_EXECUTABLE=python \
+        # ${CONDA_RUN} --no-capture-output \
+        # ./examples/models/llama3_2_vision/install_requirements.sh
 
         # Install coreml
         pip install coremltools
         sh install_requirements.sh
+        sh examples/models/llama/install_requirements.sh
+        python install_executorch.py --pybind coreml
+
 
         # Test ANE llama
         sh .ci/scripts/test_ane_static_llama.sh

From 1bb62e3f9a01ffcdefbc1c64174f37b913e3a081 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 20:42:05 -0800
Subject: [PATCH 10/13] Install CoreML backend requirements before building; install llama requirements last

---
 .github/workflows/pull.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index e4b7a49be5e..aa8a890636f 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -524,8 +524,9 @@ jobs:
         # Install coreml
         pip install coremltools
         sh install_requirements.sh
-        sh examples/models/llama/install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
         python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
 
 
         # Test ANE llama

From 030a6b3fa72bc79bff225bfccf8f3b951b7d9407 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 20:49:33 -0800
Subject: [PATCH 11/13] Remove commented-out setup steps and redundant coremltools install from ANE job

---
 .github/workflows/pull.yml | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index aa8a890636f..82a80179d11 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -508,27 +508,12 @@ jobs:
         bash .ci/scripts/setup-conda.sh
         eval "$(conda shell.bash hook)"
 
-        # # Setup MacOS dependencies as there is no Docker support on MacOS atm
-        # PYTHON_EXECUTABLE=python \
-        # EXECUTORCH_BUILD_PYBIND=ON \
-        # CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
-        # ${CONDA_RUN} --no-capture-output \
-        # .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Debug --editable false
-
-
-        # # Install llama3_2_vision dependencies.
-        # PYTHON_EXECUTABLE=python \
-        # ${CONDA_RUN} --no-capture-output \
-        # ./examples/models/llama3_2_vision/install_requirements.sh
-
-        # Install coreml
-        pip install coremltools
+        # Install requirements
         sh install_requirements.sh
         sh backends/apple/coreml/scripts/install_requirements.sh
         python install_executorch.py --pybind coreml
         sh examples/models/llama/install_requirements.sh
 
-
         # Test ANE llama
         sh .ci/scripts/test_ane_static_llama.sh
 

From c6532995cba41904904bd324713e3641f92bd38a Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 3 Mar 2025 21:06:48 -0800
Subject: [PATCH 12/13] Drop run and output comparison from ANE static llama test, keeping export only

---
 .ci/scripts/test_ane_static_llama.sh | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh
index b671a6cf5e7..c83c522d629 100644
--- a/.ci/scripts/test_ane_static_llama.sh
+++ b/.ci/scripts/test_ane_static_llama.sh
@@ -24,28 +24,4 @@ download_stories_model_artifacts
 
 python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
 
-
-python run.py -m model.pte -t tokenizer.model --prompt "Once upon a time," --temperature 0.0 &> tmp.txt
-tail -n +6 tmp.txt &> output.txt
-
-cat output.txt
-
-printf 'Once upon a time,there was a little girl named L ily . She loved to play outside in the sun sh ine . One day , she saw ' &> expected.txt
-
-
-if diff output.txt expected.txt > /dev/null; then
-    echo "Output matches."
-else
-    echo "Output does not match."
-    echo "\n\nExpected:"
-    cat expected.txt
-
-    echo "\n\nGot:"
-    cat output.txt
-
-    echo "\n\nDiff:"
-    diff output.txt expected.txt
-    exit 1
-fi
-
 popd

From 83ae7aadf6b7d74b2d1146083a8012f1cd82a134 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Tue, 4 Mar 2025 09:07:11 -0800
Subject: [PATCH 13/13] Move test-static-llama-ane job from pull.yml to trunk.yml

---
 .github/workflows/pull.yml  | 22 ----------------------
 .github/workflows/trunk.yml | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 82a80179d11..5cc0d3c597b 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -495,28 +495,6 @@ jobs:
         # Test static llama weight sharing and accuracy
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
 
-  test-static-llama-ane:
-    name: test-static-llama-ane
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    with:
-      runner: macos-m1-stable
-      python-version: '3.11'
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      script: |
-        set -eux
-        bash .ci/scripts/setup-conda.sh
-        eval "$(conda shell.bash hook)"
-
-        # Install requirements
-        sh install_requirements.sh
-        sh backends/apple/coreml/scripts/install_requirements.sh
-        python install_executorch.py --pybind coreml
-        sh examples/models/llama/install_requirements.sh
-
-        # Test ANE llama
-        sh .ci/scripts/test_ane_static_llama.sh
-
   test-qnn-models-linux:
     name: test-qnn-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 410e95d9a84..c003f050ba0 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -229,6 +229,28 @@ jobs:
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install base and CoreML requirements, build ExecuTorch with CoreML pybindings, then llama requirements
+        sh install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
+        python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
+
+        # Test ANE llama; invoked with bash because the script relies on bash features (BASH_SOURCE, [[ ]], pushd)
+        bash .ci/scripts/test_ane_static_llama.sh
+
   test-llama-runner-macos:
     name: test-llama-runner-mac
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main