Commit a3491bb

Merge branch 'main' into tosa_dialect_rescale
2 parents fc6e3f6 + 8946d80

57 files changed: +2586 -1276 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e8f76b4295584c4328e7fd7971c131cb341c7438
+467660923a5a25e4718e1d6697b93ff1bab4e807

.github/workflows/metal.yml

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+name: Test Metal Backend
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-metal-builds:
+    name: test-executorch-metal-build
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Test ExecuTorch Metal build"
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
+        echo "::endgroup::"
+
+  export-voxtral-metal-artifact:
+    name: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      upload-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Setup Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        echo "::group::Setup Optimum-ExecuTorch"
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        ${CONDA_RUN} pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "metal" \
+            --dtype bfloat16 \
+            --max_seq_len 1024 \
+            --output_dir ./
+        ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_metal_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  test-voxtral-metal-e2e:
+    name: test-voxtral-metal-e2e
+    needs: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      download-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sw_vers
+          # Print RAM in GB
+          RAM_BYTES=$(sysctl -n hw.memsize)
+          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
+          echo "Available RAM (GB): $RAM_GB"
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          # Print number of GPU cores (Apple Silicon)
+          if command -v system_profiler &> /dev/null; then
+            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
+            if [ -z "$GPU_CORES" ]; then
+              # Fallback: try to parse "Core Count" from Apple GPU section
+              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
+            fi
+            echo "GPU Cores: ${GPU_CORES:-Unknown}"
+          else
+            echo "system_profiler not available, cannot determine GPU cores."
+          fi
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Create Test Audio File"
+        say -o call_samantha_hall.aiff "Call Samantha Hall"
+        afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        ${CONDA_RUN} cmake --preset llm \
+            -DEXECUTORCH_BUILD_METAL=ON \
+            -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Bcmake-out -S.
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release
+
+        ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Sexamples/models/voxtral \
+            -Bcmake-out/examples/models/voxtral/
+        ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        set +e
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_metal_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path call_samantha_hall.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "Samantha"; then
+          echo "Expected output 'Samantha' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ xcuserdata/
 /include/
 /share/
 /version.py
-*.csv
 *_etdump
 
 # Android

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ executorch
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
 │ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
-│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends-xnnpack.md">doc</a>.
+│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.
 ├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.

README-wheel.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ The `executorch` pip package is in beta.
 The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
-* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
+* Only the [XNNPACK backend delegate](docs/source/backends/xnnpack/xnnpack-overview.md) is linked into the prebuilt module.
 * \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
   are also linked into the prebuilt module.
 

backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 4 deletions
@@ -53,7 +53,6 @@ def _validate_ref_impl_exists() -> None:
     # 1. be removed
     # 2. have a reference implementation added to ref_implementations.py
     _WARN_ONLY = {
-        "cadence::quantized_w8a32_linear",
         "cadence::quantized_add",  # We should only support per_tensor variant, should remove
         "cadence::_softmax_f32_f32",
         "cadence::requantize",  # We should only support per_tensor variant, should remove
@@ -2706,6 +2705,9 @@ def quantized_w8a32_linear_meta(
     # output comes in empty with shape [leading_dims, out_dim]
     src_shape = list(src.shape)
     weight_shape = weight.shape
+    assert (src_shape[-1] % 4) == 0
+    if len(src_shape) >= 2:
+        assert src_shape[-2] == 1
     assert len(weight_shape) == 2
     assert src_shape[-1] == weight_shape[-1]
     src_shape[-1] = weight_shape[0]
@@ -2720,12 +2722,12 @@ def quantized_w8a32_conv_meta(
     bias: torch.Tensor,
     b_scale: float,
 ) -> torch.Tensor:
-    # src comes in shape [batch, in_channel, in_length]
-    # weight comes in shape [out_ch, in_ch, kernel_dim]
+    # src comes in shape [batch, in_length, in_channels]
+    # weight comes in shape [kernel_dim, out_ch, in_ch]
     # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
     assert len(src.shape) == 3
 
-    out_channels, in_channels, kernel_size = weight.shape
+    kernel_size, out_channels, in_channels = weight.shape
     assert kernel_size == 3
     assert (out_channels % 4) == 0
     assert (in_channels % 4) == 0
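
For context, the shape constraints added to quantized_w8a32_linear_meta amount to: the activation's last dimension must be a multiple of 4, any second-to-last dimension must be 1 (vector-matrix only), and the output shape is the input shape with its last dimension replaced by the weight's leading dimension. A minimal standalone sketch of that logic; the helper name and example sizes are illustrative, not part of the diff:

import torch

# Illustrative re-statement of the shape checks added to quantized_w8a32_linear_meta.
def w8a32_linear_out_shape(src: torch.Tensor, weight: torch.Tensor) -> list[int]:
    src_shape = list(src.shape)
    assert (src_shape[-1] % 4) == 0           # last (feature) dim must be a multiple of 4
    if len(src_shape) >= 2:
        assert src_shape[-2] == 1             # only vector-matrix multiplication
    assert len(weight.shape) == 2
    assert src_shape[-1] == weight.shape[-1]  # feature dim must match weight's last dim
    src_shape[-1] = weight.shape[0]           # output: [leading_dims, out_dim]
    return src_shape

print(w8a32_linear_out_shape(torch.zeros(1, 8), torch.zeros(16, 8)))  # [1, 16]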

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 1 addition & 1 deletion
@@ -397,7 +397,7 @@ def get_args_and_kwargs_mixed_w8a32_conv(
     )
     transposed_weights = graph_module.graph.call_function(
         torch.ops.aten.permute.default,
-        (weights_inputs[0], [2, 0, 1]),  # NCL -> NLC
+        (weights_inputs[0], [2, 0, 1]),  # NCL -> LNC
     )
 
     args = (
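
The comment correction reflects what permute with dims [2, 0, 1] actually does: applied to a 3-D tensor laid out as (N, C, L), the dimensions come out as (L, N, C), i.e. NCL becomes LNC rather than NLC. A quick sanity check in plain PyTorch, with arbitrary shapes:

import torch

# permute with dims [2, 0, 1] maps (N, C, L) -> (L, N, C): NCL -> LNC.
w = torch.randn(8, 4, 3)       # e.g. (out_ch, in_ch, kernel_dim)
w_lnc = w.permute(2, 0, 1)     # -> (kernel_dim, out_ch, in_ch)
print(tuple(w.shape), "->", tuple(w_lnc.shape))  # (8, 4, 3) -> (3, 8, 4)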

backends/cadence/aot/ref_implementations.py

Lines changed: 40 additions & 5 deletions
@@ -854,18 +854,23 @@ def quantized_w8a32_conv(
     if len(weight.shape) != 3:
         raise ValueError("Weight tensor must be 3D")
 
-    out_channels, in_channels, kernel_size = weight.shape
+    kernel_size, out_channels, in_channels = weight.shape
     if kernel_size != 3:
         raise ValueError("Kernel size must be 3")
     if (out_channels % 4) != 0:
         raise ValueError("Out channels must be a multiple of 4")
     if (in_channels % 4) != 0:
         raise ValueError("In channels must be a multiple of 4")
 
-    # src comes in shape [batch, in_channel, in_length]
-    # weight comes in shape [out_ch, in_ch, kernel_dim]
-    # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
-    # Dequantize weight using scale
+    assert weight.dtype == torch.int8
+    assert bias.dtype == torch.int8
+
+    # To make compliant with torch (LCN -> NCL format)
+    weight = weight.permute(1, 2, 0).contiguous()
+
+    # channels last to channels first
+    src = src.permute(0, 2, 1).contiguous()
+
     dequant_weight = weight.float() * w_scale
 
     # Dequantize bias using scale
@@ -884,6 +889,36 @@
     return output
 
 
+@impl_tracked(m, "quantized_w8a32_linear")
+def quantized_w8a32_linear(
+    src: torch.Tensor,
+    weight: torch.Tensor,
+    w_scale: float,
+    bias: torch.Tensor,
+    b_scale: float,
+) -> torch.Tensor:
+    # src comes in shape [leading_dims, in_dim]
+    # weight comes in shape [in_dim, out_dim]
+    # output comes in empty with shape [leading_dims, out_dim]
+    assert weight.dtype == torch.int8
+    assert bias.dtype == torch.int8
+    if len(src.shape) >= 2:
+        assert src.shape[-2] == 1, "Only supporting vector-matrix multiplication"
+
+    # need to transpose to make compliant with torch linear (in, out -> out, in)
+    weight = weight.transpose(1, 0).contiguous()
+    dequant_weight = weight.float() * w_scale
+    dequant_bias = bias.float() * b_scale
+
+    output = torch.nn.functional.linear(
+        src.float(),
+        dequant_weight,
+        dequant_bias,
+    )
+
+    return output
+
+
 @impl_tracked(m, "quantized_conv2d_nhwc.per_tensor")
 def quantized_conv2d_nhwc_per_tensor(
     input_tensor: torch.Tensor,

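As a rough mental model, the new quantized_w8a32_linear reference dequantizes the int8 weight and bias with their float scales and then defers to torch.nn.functional.linear. A small self-contained sketch of that behavior outside the Cadence op registry; tensor sizes and scale values are made up for illustration:

import torch

# Standalone approximation of the quantized_w8a32_linear reference above.
src = torch.randn(1, 1, 8)                                     # [leading_dims, 1, in_dim]
weight = torch.randint(-128, 128, (8, 16), dtype=torch.int8)   # [in_dim, out_dim]
bias = torch.randint(-128, 128, (16,), dtype=torch.int8)
w_scale, b_scale = 0.05, 0.1

dequant_weight = weight.transpose(1, 0).contiguous().float() * w_scale  # -> [out_dim, in_dim]
dequant_bias = bias.float() * b_scale
out = torch.nn.functional.linear(src.float(), dequant_weight, dequant_bias)
print(out.shape)  # torch.Size([1, 1, 16])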