Skip to content

Commit a523e23

Browse files
committed
Update on "[ExecuTorch] Support bf16 for binary logical ops"
Differential Revision: [D63486223](https://our.internmc.facebook.com/intern/diff/D63486223/) [ghstack-poisoned]
2 parents 9adaaf0 + 557d1b6 commit a523e23

File tree

560 files changed

+1399
-4393
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

560 files changed

+1399
-4393
lines changed

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"
213213
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
214214

215215

216-
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
216+
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
217217
# Check build tool.
218218
echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
219219
if [[ "${BUILD_TOOL}" == "buck2" ]]; then

.github/workflows/apple.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ on:
1313
- install_requirements.sh
1414
- backends/apple/**
1515
- build/build_apple_frameworks.sh
16+
- build/build_apple_llm_demo.sh
1617
- build/create_frameworks.sh
1718
- build/test_ios_ci.sh
1819
- examples/demo-apps/apple_ios/**
@@ -215,3 +216,70 @@ jobs:
215216
shasum -a 256 "${FILENAME}"
216217
${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read
217218
done
219+
220+
build-benchmark-app:
221+
name: build-benchmark-app
222+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
223+
secrets: inherit
224+
with:
225+
runner: macos-latest-xlarge
226+
python-version: '3.11'
227+
submodules: 'true'
228+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
229+
upload-artifact: ios-apps
230+
secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
231+
timeout: 90
232+
script: |
233+
set -eux
234+
235+
echo "::group::Setting up CI environment"
236+
.ci/scripts/setup-conda.sh
237+
238+
BUILD_TOOL=cmake
239+
# Setup MacOS dependencies as there is no Docker support on MacOS atm
240+
GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
241+
.ci/scripts/setup-macos.sh "${BUILD_TOOL}"
242+
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
243+
244+
# Setup Apple certificate for iOS development
245+
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \
246+
BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \
247+
KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \
248+
.ci/scripts/setup-ios.sh
249+
250+
# Install CoreML Backend Requirements
251+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
252+
backends/apple/coreml/scripts/install_requirements.sh
253+
254+
# Install MPS Backend Requirements
255+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
256+
backends/apple/mps/install_requirements.sh
257+
echo "::endgroup::"
258+
259+
echo "::group::Build ExecuTorch iOS frameworks"
260+
FRAMEWORKS=(
261+
"executorch"
262+
"backend_coreml"
263+
"backend_mps"
264+
"backend_xnnpack"
265+
"kernels_custom"
266+
"kernels_optimized"
267+
"kernels_portable"
268+
"kernels_quantized"
269+
)
270+
271+
# Build Release iOS Frameworks
272+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
273+
build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack
274+
275+
mkdir -p extension/apple/Benchmark/Frameworks
276+
for FRAMEWORK in "${FRAMEWORKS[@]}"; do (
277+
cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/
278+
) done
279+
echo "::endgroup::"
280+
281+
echo "::group::Build ExecuTorch benchmark app"
282+
mkdir -p extension/apple/Benchmark/Models
283+
${CONDA_RUN} --no-capture-output \
284+
build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME}
285+
echo "::endgroup::"

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.hypothesis
22
buck-out/
3-
cmake-out/
3+
cmake-out*
4+
.DS_Store
45
cmake-android-out/
56
cmake-out-android/
67
cmake-ios-out/

backends/arm/test/runner_utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import logging
88
import os
9+
import re
910
import shutil
1011
import subprocess
1112
import tempfile
@@ -229,7 +230,9 @@ def run_corstone300(
229230
os.path.join(self.intermediate_path, f"{name}.bin"),
230231
)
231232
elf_path = os.path.join(
232-
"cmake-out", "arm_semihosting_executor_runner", "arm_executor_runner"
233+
"cmake-out",
234+
"arm_semihosting_executor_runner_corstone-300",
235+
"arm_executor_runner",
233236
)
234237
assert os.path.exists(
235238
elf_path
@@ -266,7 +269,12 @@ def run_corstone300(
266269
]
267270
result = _run_cmd(command_args, check=False)
268271
result_stdout = result.stdout.decode()
269-
if "Hard fault" in result_stdout or len(result.stderr) > 0:
272+
273+
error_regex = r"(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)"
274+
275+
# Check for errors in the output
276+
# regex to check for error or fault messages in stdout from FVP
277+
if re.compile(error_regex, re.MULTILINE).search(result_stdout):
270278
raise RuntimeError(
271279
f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}"
272280
)

backends/arm/test/setup_testing.sh

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,30 @@ ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u
1313

1414
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
1515
et_build_dir=${et_root_dir}/cmake-out
16-
build_test_dir=${et_build_dir}/arm_semihosting_executor_runner
16+
build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner
1717
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
1818

1919
# Build Arm Baremetal executor_runner in semihosting mode.
2020
# Put in backends/arm/test/res to be used by unit tests.
2121
function build_semihosting_executorch_runner() {
22+
target_board=$1
23+
build_test_dir=${build_root_test_dir}_${target_board}
24+
echo "[${FUNCNAME[0]}] Configuring ${target_board}"
25+
if [[ ${target_board} == "corstone-300" ]]; then
26+
local target_cpu=cortex-m55
27+
elif [[ ${target_board} == "corstone-320" ]]; then
28+
local target_cpu=cortex-m85
29+
else
30+
echo "[${FUNCNAME[0]}] ERROR: Invalid target_board specified!"
31+
exit 1
32+
fi
2233
cd ${et_root_dir}/examples/arm/executor_runner
2334
pwd
2435
mkdir -p ${build_test_dir}
2536
cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \
26-
-DTARGET_CPU=cortex-m55 \
37+
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
38+
-DTARGET_CPU=${target_cpu} \
39+
-DTARGET_BOARD=${target_board} \
2740
-DSEMIHOSTING=ON \
2841
-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \
2942
-B ${build_test_dir} \
@@ -40,4 +53,6 @@ function build_semihosting_executorch_runner() {
4053
find ${build_test_dir} -name "arm_executor_runner"
4154
}
4255

43-
build_semihosting_executorch_runner
56+
build_semihosting_executorch_runner corstone-300
57+
58+
build_semihosting_executorch_runner corstone-320

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111

1212
#include <cmath>
1313

14-
using Tensor = exec_aten::Tensor;
14+
using executorch::aten::Tensor;
15+
using executorch::runtime::getLeadingDims;
1516
using executorch::runtime::KernelRuntimeContext;
1617

1718
namespace impl {

backends/cadence/reference/operators/quantized_linear_out.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ namespace impl {
1313
namespace reference {
1414
namespace native {
1515

16-
using Tensor = exec_aten::Tensor;
16+
using executorch::aten::Tensor;
17+
using executorch::runtime::getLeadingDims;
1718
using executorch::runtime::KernelRuntimeContext;
1819

1920
void quantized_linear_out(

backends/cadence/reference/operators/quantized_matmul_out.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ namespace impl {
1313
namespace reference {
1414
namespace native {
1515

16-
using Tensor = exec_aten::Tensor;
16+
using executorch::aten::Tensor;
17+
using executorch::runtime::getLeadingDims;
1718
using executorch::runtime::KernelRuntimeContext;
1819

1920
// The quantized matmul. The quantized matmul accumulates in a wider register,

backends/vulkan/runtime/api/containers/ParamsBuffer.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,29 @@ class ParamsBuffer final {
5656
}
5757
// Fill the uniform buffer with data in block
5858
{
59-
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::MemoryAccessType::WRITE);
59+
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite);
6060
Block* data_ptr = mapping.template data<Block>();
6161

6262
*data_ptr = block;
6363
}
6464
}
65+
66+
template <typename T>
67+
T read() const {
68+
T val;
69+
if (sizeof(val) != nbytes_) {
70+
VK_THROW(
71+
"Attempted to store value from ParamsBuffer to type of different size");
72+
}
73+
// Read value from uniform buffer and store in val
74+
{
75+
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kRead);
76+
T* data_ptr = mapping.template data<T>();
77+
78+
val = *data_ptr;
79+
}
80+
return val;
81+
}
6582
};
6683

6784
} // namespace api

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -277,10 +277,11 @@ vTensorStorage::vTensorStorage(
277277
storage_type_,
278278
dtype,
279279
allocate_memory)),
280-
last_access_{} {}
280+
last_access_{},
281+
has_copies_{false} {}
281282

282283
vTensorStorage::vTensorStorage(
283-
const vTensorStorage& other,
284+
vTensorStorage& other,
284285
const int64_t buffer_offset)
285286
: context_(other.context_),
286287
storage_type_{other.storage_type_},
@@ -289,7 +290,10 @@ vTensorStorage::vTensorStorage(
289290
buffer_offset_{buffer_offset},
290291
image_(other.image_),
291292
buffer_(other.buffer_, buffer_offset),
292-
last_access_{other.last_access_} {}
293+
last_access_{other.last_access_},
294+
has_copies_{false} {
295+
other.has_copies_ = true;
296+
}
293297

294298
vTensorStorage::~vTensorStorage() {
295299
flush();
@@ -312,6 +316,21 @@ void vTensorStorage::transition(
312316
vkapi::PipelineStageFlags prev_stage = last_access_.stage;
313317
vkapi::MemoryAccessFlags prev_access = last_access_.access;
314318

319+
// If the underlying resource is a copy of another tensor's resource the
320+
// last_access may not be accurate, since the original storage may have been
321+
// written to as part of the original tensor. Likewise, if the underlying
322+
// resource has copies, then the resource may have been updated as part of the
323+
// view tensors.
324+
//
325+
// If the resource is a copy, or has copies of it, then cowardly assume that
326+
// it has previously been written to as part of a compute shader before the
327+
// current access event so that the appropriate memory barriers may be
328+
// inserted.
329+
if (is_copy() || has_copies_) {
330+
prev_stage = vkapi::PipelineStage::COMPUTE;
331+
prev_access = vkapi::kWrite;
332+
}
333+
315334
const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
316335

317336
VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -358,6 +377,13 @@ void vTensorStorage::transition(
358377
last_access_.access = cur_access;
359378
}
360379

380+
bool vTensorStorage::is_copy() const {
381+
if (storage_type_ == utils::kBuffer) {
382+
return buffer_.is_copy();
383+
}
384+
return image_.is_copy();
385+
}
386+
361387
bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {
362388
if (storage_type_ == utils::kBuffer) {
363389
return buffer_.is_copy_of(other.buffer_);
@@ -418,7 +444,8 @@ vTensor::vTensor(
418444
}
419445
}
420446

421-
vTensor::vTensor(const vTensor& other)
447+
// NOLINTNEXTLINE
448+
vTensor::vTensor(vTensor& other)
422449
: dtype_(other.dtype_),
423450
// Copy tensor size metadata
424451
sizes_(other.sizes_.begin(), other.sizes_.end()),
@@ -443,7 +470,7 @@ vTensor::vTensor(const vTensor& other)
443470
storage_(other.storage_) {}
444471

445472
vTensor::vTensor(
446-
const vTensor& other,
473+
vTensor& other,
447474
const std::vector<int64_t>& sizes,
448475
const std::vector<int64_t>& dim_order,
449476
const int64_t offset_numel)
@@ -671,6 +698,14 @@ void vTensor::virtual_reconfigure(
671698
update_metadata();
672699
}
673700

701+
void vTensor::virtual_clone(const vTensor& other) {
702+
VK_CHECK_COND(is_view_of(other));
703+
sizes_ = other.sizes_;
704+
dim_order_ = other.dim_order_;
705+
axis_map_ = other.axis_map_;
706+
packed_dim_ = other.packed_dim_;
707+
}
708+
674709
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
675710
VK_CHECK_COND(
676711
new_sizes.size() == dim_order_.size(),

0 commit comments

Comments (0)