Skip to content

Commit a523e23

Browse files
committed
Update on "[ExecuTorch] Support bf16 for binary logical ops"
Differential Revision: [D63486223](https://our.internmc.facebook.com/intern/diff/D63486223/) [ghstack-poisoned]
2 parents 9adaaf0 + 557d1b6 commit a523e23

File tree

560 files changed

+1399
-4393
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

560 files changed

+1399
-4393
lines changed

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"
213213
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
214214

215215

216-
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
216+
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
217217
# Check build tool.
218218
echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
219219
if [[ "${BUILD_TOOL}" == "buck2" ]]; then

.github/workflows/apple.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ on:
1313
- install_requirements.sh
1414
- backends/apple/**
1515
- build/build_apple_frameworks.sh
16+
- build/build_apple_llm_demo.sh
1617
- build/create_frameworks.sh
1718
- build/test_ios_ci.sh
1819
- examples/demo-apps/apple_ios/**
@@ -215,3 +216,70 @@ jobs:
215216
shasum -a 256 "${FILENAME}"
216217
${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read
217218
done
219+
220+
build-benchmark-app:
221+
name: build-benchmark-app
222+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
223+
secrets: inherit
224+
with:
225+
runner: macos-latest-xlarge
226+
python-version: '3.11'
227+
submodules: 'true'
228+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
229+
upload-artifact: ios-apps
230+
secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
231+
timeout: 90
232+
script: |
233+
set -eux
234+
235+
echo "::group::Setting up CI environment"
236+
.ci/scripts/setup-conda.sh
237+
238+
BUILD_TOOL=cmake
239+
# Setup MacOS dependencies as there is no Docker support on MacOS atm
240+
GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
241+
.ci/scripts/setup-macos.sh "${BUILD_TOOL}"
242+
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
243+
244+
# Setup Apple certificate for iOS development
245+
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \
246+
BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \
247+
KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \
248+
.ci/scripts/setup-ios.sh
249+
250+
# Install CoreML Backend Requirements
251+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
252+
backends/apple/coreml/scripts/install_requirements.sh
253+
254+
# Install MPS Backend Requirements
255+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
256+
backends/apple/mps/install_requirements.sh
257+
echo "::endgroup::"
258+
259+
echo "::group::Build ExecuTorch iOS frameworks"
260+
FRAMEWORKS=(
261+
"executorch"
262+
"backend_coreml"
263+
"backend_mps"
264+
"backend_xnnpack"
265+
"kernels_custom"
266+
"kernels_optimized"
267+
"kernels_portable"
268+
"kernels_quantized"
269+
)
270+
271+
# Build Release iOS Frameworks
272+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
273+
build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack
274+
275+
mkdir -p extension/apple/Benchmark/Frameworks
276+
for FRAMEWORK in "${FRAMEWORKS[@]}"; do (
277+
cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/
278+
) done
279+
echo "::endgroup::"
280+
281+
echo "::group::Build ExecuTorch benchmark app"
282+
mkdir -p extension/apple/Benchmark/Models
283+
${CONDA_RUN} --no-capture-output \
284+
build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME}
285+
echo "::endgroup::"

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.hypothesis
22
buck-out/
3-
cmake-out/
3+
cmake-out*
4+
.DS_Store
45
cmake-android-out/
56
cmake-out-android/
67
cmake-ios-out/

backends/arm/test/runner_utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import logging
88
import os
9+
import re
910
import shutil
1011
import subprocess
1112
import tempfile
@@ -229,7 +230,9 @@ def run_corstone300(
229230
os.path.join(self.intermediate_path, f"{name}.bin"),
230231
)
231232
elf_path = os.path.join(
232-
"cmake-out", "arm_semihosting_executor_runner", "arm_executor_runner"
233+
"cmake-out",
234+
"arm_semihosting_executor_runner_corstone-300",
235+
"arm_executor_runner",
233236
)
234237
assert os.path.exists(
235238
elf_path
@@ -266,7 +269,12 @@ def run_corstone300(
266269
]
267270
result = _run_cmd(command_args, check=False)
268271
result_stdout = result.stdout.decode()
269-
if "Hard fault" in result_stdout or len(result.stderr) > 0:
272+
273+
error_regex = r"(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)"
274+
275+
# Check for errors in the output
276+
# regex to check for error or fault messages in stdout from FVP
277+
if re.compile(error_regex, re.MULTILINE).search(result_stdout):
270278
raise RuntimeError(
271279
f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}"
272280
)

backends/arm/test/setup_testing.sh

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,30 @@ ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u
1313

1414
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
1515
et_build_dir=${et_root_dir}/cmake-out
16-
build_test_dir=${et_build_dir}/arm_semihosting_executor_runner
16+
build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner
1717
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
1818

1919
# Build Arm Baremetal executor_runner in semihosting mode.
2020
# Put in backends/arm/test/res to be used by unit tests.
2121
function build_semihosting_executorch_runner() {
22+
target_board=$1
23+
build_test_dir=${build_root_test_dir}_${target_board}
24+
echo "[${FUNCNAME[0]}] Configuring ${target_board}"
25+
if [[ ${target_board} == "corstone-300" ]]; then
26+
local target_cpu=cortex-m55
27+
elif [[ ${target_board} == "corstone-320" ]]; then
28+
local target_cpu=cortex-m85
29+
else
30+
echo "[${FUNCNAME[0]}] ERROR: Invalid target_board specified!"
31+
exit 1
32+
fi
2233
cd ${et_root_dir}/examples/arm/executor_runner
2334
pwd
2435
mkdir -p ${build_test_dir}
2536
cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \
26-
-DTARGET_CPU=cortex-m55 \
37+
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
38+
-DTARGET_CPU=${target_cpu} \
39+
-DTARGET_BOARD=${target_board} \
2740
-DSEMIHOSTING=ON \
2841
-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \
2942
-B ${build_test_dir} \
@@ -40,4 +53,6 @@ function build_semihosting_executorch_runner() {
4053
find ${build_test_dir} -name "arm_executor_runner"
4154
}
4255

43-
build_semihosting_executorch_runner
56+
build_semihosting_executorch_runner corstone-300
57+
58+
build_semihosting_executorch_runner corstone-320

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111

1212
#include <cmath>
1313

14-
using Tensor = exec_aten::Tensor;
14+
using executorch::aten::Tensor;
15+
using executorch::runtime::getLeadingDims;
1516
using executorch::runtime::KernelRuntimeContext;
1617

1718
namespace impl {

backends/cadence/reference/operators/quantized_linear_out.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ namespace impl {
1313
namespace reference {
1414
namespace native {
1515

16-
using Tensor = exec_aten::Tensor;
16+
using executorch::aten::Tensor;
17+
using executorch::runtime::getLeadingDims;
1718
using executorch::runtime::KernelRuntimeContext;
1819

1920
void quantized_linear_out(

backends/cadence/reference/operators/quantized_matmul_out.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ namespace impl {
1313
namespace reference {
1414
namespace native {
1515

16-
using Tensor = exec_aten::Tensor;
16+
using executorch::aten::Tensor;
17+
using executorch::runtime::getLeadingDims;
1718
using executorch::runtime::KernelRuntimeContext;
1819

1920
// The quantized matmul. The quantized matmul accumulates in a wider register,

backends/vulkan/runtime/api/containers/ParamsBuffer.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,29 @@ class ParamsBuffer final {
5656
}
5757
// Fill the uniform buffer with data in block
5858
{
59-
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::MemoryAccessType::WRITE);
59+
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite);
6060
Block* data_ptr = mapping.template data<Block>();
6161

6262
*data_ptr = block;
6363
}
6464
}
65+
66+
template <typename T>
67+
T read() const {
68+
T val;
69+
if (sizeof(val) != nbytes_) {
70+
VK_THROW(
71+
"Attempted to store value from ParamsBuffer to type of different size");
72+
}
73+
// Read value from uniform buffer and store in val
74+
{
75+
vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kRead);
76+
T* data_ptr = mapping.template data<T>();
77+
78+
val = *data_ptr;
79+
}
80+
return val;
81+
}
6582
};
6683

6784
} // namespace api

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -277,10 +277,11 @@ vTensorStorage::vTensorStorage(
277277
storage_type_,
278278
dtype,
279279
allocate_memory)),
280-
last_access_{} {}
280+
last_access_{},
281+
has_copies_{false} {}
281282

282283
vTensorStorage::vTensorStorage(
283-
const vTensorStorage& other,
284+
vTensorStorage& other,
284285
const int64_t buffer_offset)
285286
: context_(other.context_),
286287
storage_type_{other.storage_type_},
@@ -289,7 +290,10 @@ vTensorStorage::vTensorStorage(
289290
buffer_offset_{buffer_offset},
290291
image_(other.image_),
291292
buffer_(other.buffer_, buffer_offset),
292-
last_access_{other.last_access_} {}
293+
last_access_{other.last_access_},
294+
has_copies_{false} {
295+
other.has_copies_ = true;
296+
}
293297

294298
vTensorStorage::~vTensorStorage() {
295299
flush();
@@ -312,6 +316,21 @@ void vTensorStorage::transition(
312316
vkapi::PipelineStageFlags prev_stage = last_access_.stage;
313317
vkapi::MemoryAccessFlags prev_access = last_access_.access;
314318

319+
// If the underlying resource is a copy of another tensor's resource the
320+
// last_access may not be accurate, since the original storage may have been
321+
// written to as part of the original tensor. Likewise, if the underlying
322+
// resource has copies, then the resource may have been updated as part of the
323+
// view tensors.
324+
//
325+
// If the resource is a copy, or has copies of it, then cowardly assume that
326+
// it has previously been written to as part of a compute shader before the
327+
// current access event so that the appropriate memory barriers may be
328+
// inserted.
329+
if (is_copy() || has_copies_) {
330+
prev_stage = vkapi::PipelineStage::COMPUTE;
331+
prev_access = vkapi::kWrite;
332+
}
333+
315334
const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
316335

317336
VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -358,6 +377,13 @@ void vTensorStorage::transition(
358377
last_access_.access = cur_access;
359378
}
360379

380+
bool vTensorStorage::is_copy() const {
381+
if (storage_type_ == utils::kBuffer) {
382+
return buffer_.is_copy();
383+
}
384+
return image_.is_copy();
385+
}
386+
361387
bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {
362388
if (storage_type_ == utils::kBuffer) {
363389
return buffer_.is_copy_of(other.buffer_);
@@ -418,7 +444,8 @@ vTensor::vTensor(
418444
}
419445
}
420446

421-
vTensor::vTensor(const vTensor& other)
447+
// NOLINTNEXTLINE
448+
vTensor::vTensor(vTensor& other)
422449
: dtype_(other.dtype_),
423450
// Copy tensor size metadata
424451
sizes_(other.sizes_.begin(), other.sizes_.end()),
@@ -443,7 +470,7 @@ vTensor::vTensor(const vTensor& other)
443470
storage_(other.storage_) {}
444471

445472
vTensor::vTensor(
446-
const vTensor& other,
473+
vTensor& other,
447474
const std::vector<int64_t>& sizes,
448475
const std::vector<int64_t>& dim_order,
449476
const int64_t offset_numel)
@@ -671,6 +698,14 @@ void vTensor::virtual_reconfigure(
671698
update_metadata();
672699
}
673700

701+
void vTensor::virtual_clone(const vTensor& other) {
702+
VK_CHECK_COND(is_view_of(other));
703+
sizes_ = other.sizes_;
704+
dim_order_ = other.dim_order_;
705+
axis_map_ = other.axis_map_;
706+
packed_dim_ = other.packed_dim_;
707+
}
708+
674709
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
675710
VK_CHECK_COND(
676711
new_sizes.size() == dim_order_.size(),

0 commit comments

Comments (0)