Commit a3491bb

Merge branch 'main' into tosa_dialect_rescale
2 parents fc6e3f6 + 8946d80

57 files changed: +2586 -1276 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e8f76b4295584c4328e7fd7971c131cb341c7438
+467660923a5a25e4718e1d6697b93ff1bab4e807

.github/workflows/metal.yml

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+name: Test Metal Backend
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-metal-builds:
+    name: test-executorch-metal-build
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Test ExecuTorch Metal build"
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
+        echo "::endgroup::"
+
+  export-voxtral-metal-artifact:
+    name: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      upload-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Setup Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        echo "::group::Setup Optimum-ExecuTorch"
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        ${CONDA_RUN} pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "metal" \
+            --dtype bfloat16 \
+            --max_seq_len 1024 \
+            --output_dir ./
+        ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_metal_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  test-voxtral-metal-e2e:
+    name: test-voxtral-metal-e2e
+    needs: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      download-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sw_vers
+          # Print RAM in GB
+          RAM_BYTES=$(sysctl -n hw.memsize)
+          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
+          echo "Available RAM (GB): $RAM_GB"
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          # Print number of GPU cores (Apple Silicon)
+          if command -v system_profiler &> /dev/null; then
+            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
+            if [ -z "$GPU_CORES" ]; then
+              # Fallback: try to parse "Core Count" from Apple GPU section
+              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
+            fi
+            echo "GPU Cores: ${GPU_CORES:-Unknown}"
+          else
+            echo "system_profiler not available, cannot determine GPU cores."
+          fi
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Create Test Audio File"
+        say -o call_samantha_hall.aiff "Call Samantha Hall"
+        afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        ${CONDA_RUN} cmake --preset llm \
+            -DEXECUTORCH_BUILD_METAL=ON \
+            -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Bcmake-out -S.
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release
+
+        ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Sexamples/models/voxtral \
+            -Bcmake-out/examples/models/voxtral/
+        ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        set +e
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_metal_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path call_samantha_hall.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "Samantha"; then
+          echo "Expected output 'Samantha' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ xcuserdata/
 /include/
 /share/
 /version.py
-*.csv
 *_etdump
 
 # Android

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ executorch
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
 │ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
-│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends-xnnpack.md">doc</a>.
+│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.
 ├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.

README-wheel.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ The `executorch` pip package is in beta.
 The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
-* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
+* Only the [XNNPACK backend delegate](docs/source/backends/xnnpack/xnnpack-overview.md) is linked into the prebuilt module.
 * \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
   are also linked into the prebuilt module.
 

backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 4 deletions
@@ -53,7 +53,6 @@ def _validate_ref_impl_exists() -> None:
     # 1. be removed
     # 2. have a reference implementation added to ref_implementations.py
     _WARN_ONLY = {
-        "cadence::quantized_w8a32_linear",
         "cadence::quantized_add",  # We should only support per_tensor variant, should remove
         "cadence::_softmax_f32_f32",
         "cadence::requantize",  # We should only support per_tensor variant, should remove
@@ -2706,6 +2705,9 @@ def quantized_w8a32_linear_meta(
     # output comes in empty with shape [leading_dims, out_dim]
     src_shape = list(src.shape)
     weight_shape = weight.shape
+    assert (src_shape[-1] % 4) == 0
+    if len(src_shape) >= 2:
+        assert src_shape[-2] == 1
     assert len(weight_shape) == 2
     assert src_shape[-1] == weight_shape[-1]
     src_shape[-1] = weight_shape[0]
@@ -2720,12 +2722,12 @@ def quantized_w8a32_conv_meta(
     bias: torch.Tensor,
     b_scale: float,
 ) -> torch.Tensor:
-    # src comes in shape [batch, in_channel, in_length]
-    # weight comes in shape [out_ch, in_ch, kernel_dim]
+    # src comes in shape [batch, in_length, in_channels]
+    # weight comes in shape [kernel_dim, out_ch, in_ch]
     # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
     assert len(src.shape) == 3
 
-    out_channels, in_channels, kernel_size = weight.shape
+    kernel_size, out_channels, in_channels = weight.shape
     assert kernel_size == 3
     assert (out_channels % 4) == 0
     assert (in_channels % 4) == 0
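
For context, the shape constraints added to quantized_w8a32_linear_meta amount to: the activation's last dimension must be a multiple of 4, any second-to-last dimension must be 1 (vector-matrix only), and the output shape is the input shape with its last dimension replaced by the weight's leading dimension. A minimal standalone sketch of that logic; the helper name and example sizes are illustrative, not part of the diff:

import torch

# Illustrative re-statement of the shape checks added to quantized_w8a32_linear_meta.
def w8a32_linear_out_shape(src: torch.Tensor, weight: torch.Tensor) -> list[int]:
    src_shape = list(src.shape)
    assert (src_shape[-1] % 4) == 0           # last (feature) dim must be a multiple of 4
    if len(src_shape) >= 2:
        assert src_shape[-2] == 1             # only vector-matrix multiplication
    assert len(weight.shape) == 2
    assert src_shape[-1] == weight.shape[-1]  # feature dim must match weight's last dim
    src_shape[-1] = weight.shape[0]           # output: [leading_dims, out_dim]
    return src_shape

print(w8a32_linear_out_shape(torch.zeros(1, 8), torch.zeros(16, 8)))  # [1, 16]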

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 1 addition & 1 deletion
@@ -397,7 +397,7 @@ def get_args_and_kwargs_mixed_w8a32_conv(
     )
     transposed_weights = graph_module.graph.call_function(
         torch.ops.aten.permute.default,
-        (weights_inputs[0], [2, 0, 1]),  # NCL -> NLC
+        (weights_inputs[0], [2, 0, 1]),  # NCL -> LNC
     )
 
     args = (
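
The comment correction reflects what permute with dims [2, 0, 1] actually does: applied to a 3-D tensor laid out as (N, C, L), the dimensions come out as (L, N, C), i.e. NCL becomes LNC rather than NLC. A quick sanity check in plain PyTorch, with arbitrary shapes:

import torch

# permute with dims [2, 0, 1] maps (N, C, L) -> (L, N, C): NCL -> LNC.
w = torch.randn(8, 4, 3)       # e.g. (out_ch, in_ch, kernel_dim)
w_lnc = w.permute(2, 0, 1)     # -> (kernel_dim, out_ch, in_ch)
print(tuple(w.shape), "->", tuple(w_lnc.shape))  # (8, 4, 3) -> (3, 8, 4)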

backends/cadence/aot/ref_implementations.py

Lines changed: 40 additions & 5 deletions
@@ -854,18 +854,23 @@ def quantized_w8a32_conv(
     if len(weight.shape) != 3:
         raise ValueError("Weight tensor must be 3D")
 
-    out_channels, in_channels, kernel_size = weight.shape
+    kernel_size, out_channels, in_channels = weight.shape
     if kernel_size != 3:
         raise ValueError("Kernel size must be 3")
     if (out_channels % 4) != 0:
         raise ValueError("Out channels must be a multiple of 4")
     if (in_channels % 4) != 0:
         raise ValueError("In channels must be a multiple of 4")
 
-    # src comes in shape [batch, in_channel, in_length]
-    # weight comes in shape [out_ch, in_ch, kernel_dim]
-    # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
-    # Dequantize weight using scale
+    assert weight.dtype == torch.int8
+    assert bias.dtype == torch.int8
+
+    # To make compliant with torch (LCN -> NCL format)
+    weight = weight.permute(1, 2, 0).contiguous()
+
+    # channels last to channels first
+    src = src.permute(0, 2, 1).contiguous()
+
     dequant_weight = weight.float() * w_scale
 
     # Dequantize bias using scale
@@ -884,6 +889,36 @@
     return output
 
 
+@impl_tracked(m, "quantized_w8a32_linear")
+def quantized_w8a32_linear(
+    src: torch.Tensor,
+    weight: torch.Tensor,
+    w_scale: float,
+    bias: torch.Tensor,
+    b_scale: float,
+) -> torch.Tensor:
+    # src comes in shape [leading_dims, in_dim]
+    # weight comes in shape [in_dim, out_dim]
+    # output comes in empty with shape [leading_dims, out_dim]
+    assert weight.dtype == torch.int8
+    assert bias.dtype == torch.int8
+    if len(src.shape) >= 2:
+        assert src.shape[-2] == 1, "Only supporting vector-matrix multiplication"
+
+    # need to transpose to make compliant with torch linear (in, out -> out, in)
+    weight = weight.transpose(1, 0).contiguous()
+    dequant_weight = weight.float() * w_scale
+    dequant_bias = bias.float() * b_scale
+
+    output = torch.nn.functional.linear(
+        src.float(),
+        dequant_weight,
+        dequant_bias,
+    )
+
+    return output
+
+
 @impl_tracked(m, "quantized_conv2d_nhwc.per_tensor")
 def quantized_conv2d_nhwc_per_tensor(
     input_tensor: torch.Tensor,

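As a rough mental model, the new quantized_w8a32_linear reference dequantizes the int8 weight and bias with their float scales and then defers to torch.nn.functional.linear. A small self-contained sketch of that behavior outside the Cadence op registry; tensor sizes and scale values are made up for illustration:

import torch

# Standalone approximation of the quantized_w8a32_linear reference above.
src = torch.randn(1, 1, 8)                                     # [leading_dims, 1, in_dim]
weight = torch.randint(-128, 128, (8, 16), dtype=torch.int8)   # [in_dim, out_dim]
bias = torch.randint(-128, 128, (16,), dtype=torch.int8)
w_scale, b_scale = 0.05, 0.1

dequant_weight = weight.transpose(1, 0).contiguous().float() * w_scale  # -> [out_dim, in_dim]
dequant_bias = bias.float() * b_scale
out = torch.nn.functional.linear(src.float(), dequant_weight, dequant_bias)
print(out.shape)  # torch.Size([1, 1, 16])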