Skip to content

Commit aff141e

Browse files
authored
Move CPU kernels out of experimental
Differential Revision: D80958790 Pull Request resolved: #2868
1 parent 9d01b43 commit aff141e

File tree

136 files changed

+4247
-1875
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

136 files changed

+4247
-1875
lines changed

.github/workflows/torchao_experimental_test.yml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,28 +55,29 @@ jobs:
5555
python torchao/experimental/tests/test_quant_passes.py
5656
pytest -s test/prototype/test_dynamic_activation_lut.py
5757
pytest -s test/quantization/quantize_/workflows/intx/test_intx_opaque_tensor.py
58-
- name: Run kernels/cpu/aarch64/tests
58+
- name: torchao/csrc/cpu - build and run C++ tests
5959
if: runner.os == 'macOS'
6060
run: |
6161
conda activate venv
62-
pushd torchao/experimental/kernels/cpu/aarch64/tests
62+
pushd torchao/csrc/cpu
6363
sh build_and_run_tests.sh
64-
rm -rf /tmp/cmake-out
64+
rm -rf cmake-out
6565
popd
66-
- name: Run torchao/experimental/ops/tests
66+
- name: torchao/csrc/cpu - build benchmarks
6767
if: runner.os == 'macOS'
6868
run: |
6969
conda activate venv
70-
pushd torchao/experimental/ops/tests
71-
sh build_and_run_tests.sh
72-
rm -rf /tmp/cmake-out
70+
pushd torchao/csrc/cpu
71+
sh build_and_run_benchmarks.sh build_only
72+
rm -rf cmake-out
7373
popd
74-
- name: ET ops build
74+
- name: torchao/csrc/cpu - build shared_kernels with ExecuTorch
7575
if: runner.os == 'macOS'
7676
run: |
7777
conda activate venv
78-
pushd torchao/experimental
79-
sh build_torchao_ops.sh executorch
78+
pushd torchao/csrc/cpu
79+
sh build_shared_kernels.sh executorch
80+
rm -rf cmake-out
8081
popd
8182
8283
# test-mps-ops:

setup.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def read_version(file_path="version.txt"):
7373
# ├── USE_CPU_KERNELS="1" + Linux → Include optimized CPU kernels (AVX512, etc.)
7474
# └── ARM64 + macOS → Auto-enable experimental builds (build_macos_arm_auto)
7575
#
76-
# Level 3: Experimental builds (cmake-based)
76+
# Level 3: Shared CPU kernel builds (cmake-based)
7777
# ├── BUILD_TORCHAO_EXPERIMENTAL="1" → Force experimental builds
7878
# ├── build_macos_arm_auto → Auto-enable on ARM64 macOS
7979
# └── When enabled, provides access to:
@@ -322,6 +322,19 @@ def build_cmake(self, ext):
322322
ext_filename = os.path.basename(self.get_ext_filename(ext.name))
323323
ext_basename = os.path.splitext(ext_filename)[0]
324324

325+
print(
326+
"CMAKE COMMAND",
327+
[
328+
"cmake",
329+
ext.cmake_lists_dir,
330+
]
331+
+ ext.cmake_args
332+
+ [
333+
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
334+
"-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
335+
],
336+
)
337+
325338
subprocess.check_call(
326339
[
327340
"cmake",
@@ -473,10 +486,22 @@ def get_extensions():
473486

474487
# Collect C++ source files
475488
sources = list(glob.glob(os.path.join(extensions_dir, "**/*.cpp"), recursive=True))
489+
490+
# Exclude C++ CPU sources that are built by CMake
491+
cpu_cmake_sources = glob.glob(
492+
os.path.join(extensions_dir, "cpu", "torch_free_kernels", "**", "*.cpp"),
493+
recursive=True,
494+
)
495+
cpu_cmake_sources += glob.glob(
496+
os.path.join(extensions_dir, "cpu", "shared_kernels", "**", "*.cpp"),
497+
recursive=True,
498+
)
499+
sources = [s for s in sources if s not in cpu_cmake_sources]
500+
476501
if not use_cpu_kernels or not is_linux:
477502
# Remove csrc/cpu/*.cpp
478503
excluded_sources = list(
479-
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=True)
504+
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=False)
480505
)
481506
sources = [s for s in sources if s not in excluded_sources]
482507

@@ -614,6 +639,7 @@ def get_extensions():
614639

615640
ext_modules = []
616641
if len(sources) > 0:
642+
print("SOURCES", sources)
617643
# Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
618644
sources = [
619645
s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
@@ -701,7 +727,7 @@ def get_extensions():
701727
)
702728
)
703729

704-
# Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
730+
# Build CMakeLists from /torchao/csrc/cpu - additional options become available: TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_ENABLE_ARM_NEON_DOT, TORCHAO_ENABLE_ARM_I8MM, TORCHAO_PARALLEL_BACKEND
705731
if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
706732
build_options = BuildOptions()
707733

@@ -714,24 +740,20 @@ def bool_to_on_off(value):
714740

715741
ext_modules.append(
716742
CMakeExtension(
717-
"torchao._experimental_aten_ops",
718-
cmake_lists_dir="torchao/experimental",
743+
"torchao._C_cpu_shared_kernels_aten",
744+
cmake_lists_dir="torchao/csrc/cpu",
719745
cmake_args=(
720746
[
721747
f"-DCMAKE_BUILD_TYPE={'Debug' if use_debug_mode() else 'Release'}",
722748
f"-DTORCHAO_BUILD_CPU_AARCH64={bool_to_on_off(build_options.build_cpu_aarch64)}",
723749
f"-DTORCHAO_BUILD_KLEIDIAI={bool_to_on_off(build_options.build_kleidi_ai)}",
724-
f"-DTORCHAO_BUILD_MPS_OPS={bool_to_on_off(build_options.build_experimental_mps)}",
725750
f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(build_options.enable_arm_neon_dot)}",
726751
f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(build_options.enable_arm_i8mm)}",
727752
f"-DTORCHAO_PARALLEL_BACKEND={build_options.parallel_backend}",
753+
"-DTORCHAO_BUILD_TESTS=OFF",
754+
"-DTORCHAO_BUILD_BENCHMARKS=OFF",
728755
"-DTorch_DIR=" + torch_dir,
729756
]
730-
+ (
731-
["-DCMAKE_INSTALL_PREFIX=cmake-out"]
732-
if build_options.build_experimental_mps
733-
else []
734-
)
735757
),
736758
)
737759
)

torchao/csrc/cpu/CMakeLists.txt

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
cmake_minimum_required(VERSION 3.19)
8+
include(CMakeDependentOption)
9+
10+
project(torchao)
11+
12+
set(CMAKE_CXX_STANDARD 17)
13+
14+
if (NOT CMAKE_BUILD_TYPE)
15+
set(CMAKE_BUILD_TYPE Release)
16+
endif()
17+
18+
# Platform options
19+
option(TORCHAO_BUILD_ATEN_OPS "Building torchao ops for ATen." ON)
20+
option(TORCHAO_BUILD_EXECUTORCH_OPS "Building torchao ops for ExecuTorch." OFF)
21+
option(TORCHAO_BUILD_CPU_AARCH64 "Build torchao's CPU aarch64 kernels" OFF)
22+
option(TORCHAO_BUILD_KLEIDIAI "Download, build, and link against Arm KleidiAI library (arm64 only)" OFF)
23+
option(TORCHAO_ENABLE_ARM_NEON_DOT "Enable ARM Neon Dot Product extension" OFF)
24+
option(TORCHAO_ENABLE_ARM_I8MM "Enable ARM 8-bit Integer Matrix Multiply instructions" OFF)
25+
option(TORCHAO_BUILD_TESTS "Build tests" OFF)
26+
option(TORCHAO_BUILD_BENCHMARKS "Build benchmarks" OFF)
27+
28+
# Set default compiler options
29+
add_compile_options("-fPIC" "-Wall" "-Werror" "-Wno-deprecated")
30+
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
31+
add_compile_options(
32+
"-Wno-error=unknown-pragmas"
33+
"-Wno-array-parameter"
34+
"-Wno-maybe-uninitialized"
35+
"-Wno-sign-compare"
36+
)
37+
elseif (APPLE)
38+
add_compile_options("-Wno-shorten-64-to-32")
39+
endif()
40+
41+
42+
43+
if (NOT TARGET cpuinfo)
44+
cmake_policy(PUSH)
45+
cmake_policy(VERSION 3.5) # cpuinfo requires CMake 3.5
46+
47+
# For some reason cpuinfo package has unused functions/variables
48+
# TODO (T215533422): fix upstream
49+
add_compile_options(-Wno-unused-function -Wno-unused-variable)
50+
51+
# set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
52+
include(FetchContent)
53+
set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)
54+
set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "" FORCE)
55+
set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
56+
FetchContent_Declare(cpuinfo
57+
GIT_REPOSITORY https://github.com/pytorch/cpuinfo.git
58+
GIT_TAG c61fe919607bbc534d7a5a5707bdd7041e72c5ff
59+
)
60+
FetchContent_MakeAvailable(
61+
cpuinfo)
62+
63+
cmake_policy(POP)
64+
endif()
65+
66+
if (TORCHAO_BUILD_TESTS)
67+
include(FetchContent)
68+
FetchContent_Declare(
69+
googletest
70+
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
71+
)
72+
FetchContent_MakeAvailable(googletest)
73+
endif()
74+
75+
if (TORCHAO_BUILD_BENCHMARKS)
76+
include(FetchContent)
77+
FetchContent_Declare(googlebenchmark
78+
GIT_REPOSITORY https://github.com/google/benchmark.git
79+
GIT_TAG main) # need main for benchmark::benchmark
80+
81+
set(BENCHMARK_ENABLE_TESTING OFF)
82+
FetchContent_MakeAvailable(
83+
googlebenchmark)
84+
endif()
85+
86+
if(NOT TORCHAO_INCLUDE_DIRS)
87+
set(TORCHAO_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
88+
endif()
89+
90+
if(NOT DEFINED TORCHAO_PARALLEL_BACKEND)
91+
set(TORCHAO_PARALLEL_BACKEND aten_openmp)
92+
endif()
93+
94+
# Load CMake helper modules (CMakePrintHelpers and shared_kernels/Utils.cmake)
95+
96+
include(CMakePrintHelpers)
97+
include(${CMAKE_CURRENT_SOURCE_DIR}/shared_kernels/Utils.cmake)
98+
99+
message("TORCHAO_INCLUDE_DIRS: ${TORCHAO_INCLUDE_DIRS}")
100+
include_directories(${TORCHAO_INCLUDE_DIRS})
101+
102+
103+
# Build fallback kernels
104+
add_subdirectory(torch_free_kernels/fallback)
105+
106+
# Build cpu/aarch64 kernels
107+
if(TORCHAO_BUILD_CPU_AARCH64)
108+
message(STATUS "Building with cpu/aarch64")
109+
add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)
110+
111+
if(TORCHAO_ENABLE_ARM_NEON_DOT)
112+
message(STATUS "Building with ARM NEON dot product support")
113+
add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
114+
add_compile_options("-march=armv8.4-a+dotprod")
115+
endif()
116+
117+
if(TORCHAO_ENABLE_ARM_I8MM)
118+
message(STATUS "Building with ARM I8MM support")
119+
add_compile_definitions(TORCHAO_ENABLE_ARM_I8MM)
120+
add_compile_options("-march=armv8.6-a")
121+
endif()
122+
123+
if(TORCHAO_BUILD_KLEIDIAI)
124+
message(STATUS "Building with Arm KleidiAI library")
125+
add_compile_definitions(TORCHAO_ENABLE_KLEIDI)
126+
if (NOT TARGET kleidiai)
127+
include(FetchContent)
128+
# KleidiAI is an open-source library that provides optimized
129+
# performance-critical routines, also known as micro-kernels, for artificial
130+
# intelligence (AI) workloads tailored for Arm® CPUs.
131+
set(KLEIDIAI_BUILD_TESTS OFF CACHE BOOL "" FORCE)
132+
set(KLEIDIAI_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
133+
FetchContent_Declare(kleidiai
134+
GIT_REPOSITORY https://git.gitlab.arm.com/kleidi/kleidiai.git
135+
GIT_TAG v1.12.0
136+
)
137+
FetchContent_MakeAvailable(kleidiai)
138+
endif()
139+
endif()
140+
141+
# Defines torchao_kernels_aarch64
142+
add_subdirectory(torch_free_kernels/aarch64)
143+
endif()
144+
145+
# Build ATen ops
146+
if(TORCHAO_BUILD_ATEN_OPS)
147+
find_package(Torch REQUIRED)
148+
set(_torchao_op_srcs_aten)
149+
list(APPEND _torchao_op_srcs_aten
150+
shared_kernels/embedding_xbit/op_embedding_xbit_aten.cpp
151+
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
152+
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp
153+
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
154+
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_aten.cpp
155+
)
156+
list(TRANSFORM _torchao_op_srcs_aten PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
157+
158+
# Use the Python extension name if provided
159+
add_library(torchao_ops_aten SHARED ${_torchao_op_srcs_aten})
160+
if(DEFINED TORCHAO_CMAKE_EXT_SO_NAME)
161+
message(STATUS "Setting output name to: ${TORCHAO_CMAKE_EXT_SO_NAME}.so")
162+
set_target_properties(torchao_ops_aten PROPERTIES
163+
OUTPUT_NAME ${TORCHAO_CMAKE_EXT_SO_NAME}
164+
PREFIX "" # Remove "lib" prefix for Python extensions
165+
SUFFIX ".so" # Add ".so" suffix for Python extensions
166+
)
167+
endif()
168+
169+
target_link_torchao_parallel_backend(torchao_ops_aten "${TORCHAO_PARALLEL_BACKEND}")
170+
if (TORCHAO_BUILD_CPU_AARCH64)
171+
target_link_libraries(torchao_ops_aten PRIVATE torchao_kernels_aarch64)
172+
if (TORCHAO_BUILD_KLEIDIAI)
173+
target_link_libraries(torchao_ops_aten PRIVATE kleidiai)
174+
endif()
175+
endif()
176+
target_link_libraries(torchao_ops_aten PRIVATE cpuinfo)
177+
target_include_directories(torchao_ops_aten PRIVATE "${TORCH_INCLUDE_DIRS}")
178+
target_link_libraries(torchao_ops_aten PRIVATE "${TORCH_LIBRARIES}")
179+
target_compile_definitions(torchao_ops_aten PRIVATE TORCHAO_SHARED_KERNELS_BUILD_ATEN=1)
180+
181+
if (TORCHAO_BUILD_TESTS)
182+
add_subdirectory(shared_kernels/tests)
183+
endif()
184+
185+
if (TORCHAO_BUILD_BENCHMARKS)
186+
add_subdirectory(shared_kernels/benchmarks)
187+
endif()
188+
189+
# Install ATen targets
190+
install(
191+
TARGETS torchao_ops_aten
192+
EXPORT _targets
193+
DESTINATION lib
194+
)
195+
endif()
196+
197+
198+
# Build ExecuTorch ops
199+
if(TORCHAO_BUILD_EXECUTORCH_OPS)
200+
# ExecuTorch package is not required, but EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES must
201+
# be defined and EXECUTORCH_LIBRARIES must include the following libraries installed by ExecuTorch:
202+
# libexecutorch.a
203+
# libextension_threadpool.a
204+
# libcpuinfo.a
205+
# libpthreadpool.a
206+
if(NOT DEFINED EXECUTORCH_INCLUDE_DIRS AND NOT DEFINED EXECUTORCH_LIBRARIES)
207+
message(WARNING "EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES are not defined. Looking for ExecuTorch.")
208+
find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
209+
endif()
210+
set(_torchao_op_srcs_executorch)
211+
list(APPEND _torchao_op_srcs_executorch
212+
shared_kernels/embedding_xbit/op_embedding_xbit_executorch.cpp
213+
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
214+
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch.cpp
215+
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
216+
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_executorch.cpp)
217+
218+
list(TRANSFORM _torchao_op_srcs_executorch PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
219+
add_library(torchao_ops_executorch STATIC ${_torchao_op_srcs_executorch})
220+
221+
target_compile_definitions(torchao_ops_executorch PRIVATE TORCHAO_SHARED_KERNELS_BUILD_EXECUTORCH=1)
222+
223+
# This links to ExecuTorch
224+
target_link_torchao_parallel_backend(torchao_ops_executorch executorch)
225+
if (TORCHAO_BUILD_CPU_AARCH64)
226+
target_link_libraries(torchao_ops_executorch PRIVATE torchao_kernels_aarch64)
227+
if (TORCHAO_BUILD_KLEIDIAI)
228+
target_link_libraries(torchao_ops_executorch PRIVATE kleidiai)
229+
endif()
230+
endif()
231+
target_link_libraries(torchao_ops_executorch PRIVATE cpuinfo)
232+
endif()

0 commit comments

Comments
 (0)