-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdefault.nix
156 lines (142 loc) · 5.03 KB
/
default.nix
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Package expression for ROCm's composable_kernel, fetched from GitHub at a
# pinned revision and configured through cmakeFlags (cmake and ninja are both
# in nativeBuildInputs).
#
# Tunable arguments:
#   buildTests    - when true: adds a "test" output, passes GOOGLETEST_DIR to
#                   CMake, leaves the test-related lines of the upstream
#                   CMakeLists untouched, and moves $out/bin/test_* into the
#                   "test" output after install.
#   buildExamples - when true: adds an "example" output, leaves
#                   add_subdirectory(example) untouched, and moves
#                   $out/bin/example_* into the "example" output.
#   gpuTargets    - list of GPU architecture names; joined with ";" and passed
#                   as -DGPU_ARCHS. An empty list omits the flag.
{
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,
  cmake,
  rocm-cmake,
  rocm-merged-llvm,
  clr,
  rocm-device-libs,
  rocminfo,
  hipify,
  git,
  gtest,
  zstd,
  ninja,
  buildTests ? false,
  buildExamples ? false,
  # FIXME: I can't get this to build for gfx1030
  # Shape of the value: gpuTargets = [ "gfx803" "gfx900" "gfx1030" ... ]
  gpuTargets ? [
    "gfx908"
    "gfx90a"
  ],
}:

stdenv.mkDerivation (finalAttrs: {
  pname = "composable_kernel";
  # Keep this PEP 440 compatible: the ck4inductor python package uses the same
  # version string.
  version = "6.4.0a20241220";

  # "test" and "example" only exist when the matching build switch is on.
  outputs =
    [ "out" ]
    ++ lib.optional buildTests "test"
    ++ lib.optional buildExamples "example";

  patches = [
    # Disabled: for Gentoo this patch gives a significant build-time speedup,
    # but no speedup was observed here -- possibly because our LLVM has been
    # patched to fix the amdgpu-early-inline-all issues?
    # ./disable-amdgpu-inline.patch
  ];

  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "composable_kernel";
    rev = "07339c738396ebeae57374771ded4dcf11bddf1e";
    hash = "sha256-EvEBxlOpQ71BF57VW79WBo/cdxAwTKFXFMiYKyGyyEs=";
  };

  nativeBuildInputs = [
    git
    cmake
    rocminfo
    rocm-cmake
    clr
    zstd
    hipify
    ninja
  ];

  buildInputs = [ ];

  enableParallelBuilding = true;
  requiredSystemFeatures = [ "big-parallel" ];

  # Environment variables exported into the build.
  env = {
    ROCM_PATH = clr;
    HIP_CLANG_PATH = "${rocm-merged-llvm}/bin";
    # The same linker options are repeated in all three variables; the
    # compiler flag variables additionally request -O3.
    LDFLAGS = "-Wl,--as-needed,--gc-sections,--compress-debug-sections=zstd";
    CFLAGS = "-O3 -Wl,--as-needed,--gc-sections,--compress-debug-sections=zstd";
    CXXFLAGS = "-O3 -Wl,--as-needed,--gc-sections,--compress-debug-sections=zstd";
  };

  cmakeFlags =
    [
      "-DCMAKE_MODULE_PATH=${clr}/hip/cmake"
      "-DCMAKE_BUILD_TYPE=Release"
      "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW"
      "-DCMAKE_INTERPROCEDURAL_OPTIMIZATION=TRUE"
      # "-DDL_KERNELS=ON"
      "-DCK_USE_CODEGEN=ON"
      # Manually define CMAKE_INSTALL_<DIR>
      # See: https://github.com/NixOS/nixpkgs/pull/197838
      "-DCMAKE_INSTALL_BINDIR=bin"
      "-DCMAKE_INSTALL_LIBDIR=lib"
      "-DCMAKE_INSTALL_INCLUDEDIR=include"
      "-DBUILD_DEV=OFF"
      "-DROCM_PATH=${clr}"
      "-DCMAKE_HIP_COMPILER_ROCM_ROOT=${clr}"
      # FP8 can build for 908/90a, but the build is very slow and the
      # resulting kernels are huge and unusably slow.
      "-DCK_USE_FP8_ON_UNSUPPORTED_ARCH=OFF"
    ]
    # Only restrict the architectures when the caller named some.
    ++ lib.optional (gpuTargets != [ ]) "-DGPU_ARCHS=${lib.concatStringsSep ";" gpuTargets}"
    # Point the test build at the gtest sources.
    # NOTE(review): this flag used to carry the remark "Custom linker names";
    # what that refers to is not evident from this file -- confirm or drop.
    ++ lib.optional buildTests "-DGOOGLETEST_DIR=${gtest.src}";

  # There seem to be no flags to build the subprojects selectively, so the
  # unwanted ones are cut out of the CMake files instead. The profiler
  # subdirectory is always removed; test and example only when not requested.
  postPatch = lib.concatStrings [
    ''
      export HIP_DEVICE_LIB_PATH=${rocm-device-libs}/amdgcn/bitcode
    ''
    (lib.optionalString (!buildTests) ''
      substituteInPlace CMakeLists.txt \
      --replace-fail "add_subdirectory(test)" ""
      substituteInPlace codegen/CMakeLists.txt \
      --replace-fail "include(ROCMTest)" ""
    '')
    (lib.optionalString (!buildExamples) ''
      substituteInPlace CMakeLists.txt \
      --replace-fail "add_subdirectory(example)" ""
    '')
    ''
      substituteInPlace CMakeLists.txt \
      --replace-fail "add_subdirectory(profiler)" ""
    ''
  ];

  # Clamp parallelism based on free memory at build start to avoid OOM.
  #
  # The memory budget (APPX_GB) is MemAvailable plus a quarter of SwapFree,
  # capped at MemTotal, all read from /proc/meminfo and scaled by 1024*1024.
  # Compile jobs are limited to 1 + APPX_GB / 3 and link jobs to
  # 1 + APPX_GB / 7; neither may exceed the incoming NIX_BUILD_CORES. The
  # results are exported as NIX_BUILD_CORES and handed to CMake as
  # CK_PARALLEL_COMPILE_JOBS / CK_PARALLEL_LINK_JOBS. Finally -l<nproc> is
  # appended to the make and ninja flag arrays.
  preConfigure = ''
    export NINJA_SUMMARIZE_BUILD=1
    export NINJA_STATUS="[%r jobs | %P %f/%t @ %o/s | %w | ETA %W ] "
    MEM_GB_TOTAL=$(awk '/MemTotal/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
    MEM_GB_FREE=$(awk '/MemAvailable/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
    SWAP_GB_FREE=$(awk '/SwapFree/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
    APPX_GB=$((MEM_GB_FREE + SWAP_GB_FREE / 4))
    APPX_GB=$((APPX_GB > MEM_GB_TOTAL ? MEM_GB_TOTAL : APPX_GB))
    MAX_CORES=$((1 + APPX_GB / 3))
    MAX_CORES_LINK=$((1 + APPX_GB / 7))
    MAX_CORES_LINK=$((MAX_CORES_LINK > NIX_BUILD_CORES ? NIX_BUILD_CORES : MAX_CORES_LINK))
    export NIX_BUILD_CORES="$((NIX_BUILD_CORES > MAX_CORES ? MAX_CORES : NIX_BUILD_CORES))"
    echo "Picked new core limits NIX_BUILD_CORES=$NIX_BUILD_CORES MAX_CORES_LINK=$MAX_CORES_LINK based on available mem: $APPX_GB GB"
    cmakeFlagsArray+=(
    "-DCK_PARALLEL_LINK_JOBS=$MAX_CORES_LINK"
    "-DCK_PARALLEL_COMPILE_JOBS=$NIX_BUILD_CORES"
    )
    makeFlagsArray+=("-l$(nproc)")
    ninjaFlagsArray+=("-l$(nproc)")
  '';

  # Compress the installed libdevice_*_operations.a archives with zstd
  # (--rm removes the uncompressed originals), then move test and example
  # binaries into their own outputs when those outputs are enabled.
  # NOTE(review): the reason for compressing the archives is not stated in
  # this file; presumably to shrink the output -- confirm.
  postInstall = lib.concatStrings [
    ''
      zstd --rm $out/lib/libdevice_*_operations.a
    ''
    (lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/test_* $test/bin
    '')
    (lib.optionalString buildExamples ''
      mkdir -p $example/bin
      mv $out/bin/example_* $example/bin
    '')
  ];

  # The update script is parameterised from this derivation's own pname and
  # from the owner/repo of its src.
  passthru.updateScript = rocmUpdateScript {
    name = finalAttrs.pname;
    inherit (finalAttrs.src) owner repo;
  };

  meta = {
    description = "Performance portable programming model for machine learning tensor operators";
    homepage = "https://github.com/ROCm/composable_kernel";
    license = [ lib.licenses.mit ];
    maintainers = lib.teams.rocm.members;
    platforms = lib.platforms.linux;
    # Broken when this package's minor version differs from the minor version
    # of stdenv.cc, or once the package version reaches 7.0.0.
    broken =
      lib.versions.minor finalAttrs.version != lib.versions.minor stdenv.cc.version
      || lib.versionAtLeast finalAttrs.version "7.0.0";
  };
})