1
- # Copyright (c) 2020 , NVIDIA CORPORATION. All rights reserved.
1
+ # Copyright (c) 2021 , NVIDIA CORPORATION. All rights reserved.
2
2
#
3
3
# Licensed under the Apache License, Version 2.0 (the "License");
4
4
# you may not use this file except in compliance with the License.
@@ -16,12 +16,23 @@ project(FasterTransformer LANGUAGES CXX CUDA)
16
16
17
17
find_package (CUDA 10.1 REQUIRED)
18
18
19
- option (BUILD_TRT "Build in TensorRT mode" OFF )
20
19
option (BUILD_TF "Build in TensorFlow mode" OFF )
21
- option (BUILD_THE "Build in PyTorch eager mode" OFF )
22
- option (BUILD_THS "Build in TorchScript class mode" OFF )
20
+ option (BUILD_PYT "Build in PyTorch TorchScript class mode" OFF )
21
+ option (BUILD_GPT "Build project with gpt" ON ) # TODO Set default to OFF
23
22
24
- if (BUILD_THS)
23
+ if (BUILD_GPT)
24
+ message (STATUS "Add DBUILD_GPT, requires MPI and NCCL" )
25
+ add_definitions ("-DBUILD_GPT" )
26
+ set (CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR} /cmake/Modules)
27
+ find_package (MPI REQUIRED)
28
+ find_package (NCCL REQUIRED)
29
+ #if(${NCCL_VERSION} LESS 2.7)
30
+ # message(FATAL_ERROR "NCCL_VERSION ${NCCL_VERSION} is less than 2.7")
31
+ #endif()
32
+ set (CMAKE_MODULE_PATH "" ) # prevent the bugs for pytorch building
33
+ endif ()
34
+
35
+ if (BUILD_PYT)
25
36
if (DEFINED ENV{NVIDIA_PYTORCH_VERSION})
26
37
if ($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03" )
27
38
message (FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode." )
@@ -32,7 +43,11 @@ if(BUILD_THS)
32
43
endif ()
33
44
endif ()
34
45
35
- set (CXX_STD "11" CACHE STRING "C++ standard" )
46
+ if (BUILD_PYT OR BUILD_GPT)
47
+ set (CXX_STD "14" CACHE STRING "C++ standard" )
48
+ else ()
49
+ set (CXX_STD "11" CACHE STRING "C++ standard" )
50
+ endif ()
36
51
37
52
set (CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR} )
38
53
@@ -42,57 +57,106 @@ if(BUILD_TF AND NOT TF_PATH)
42
57
message (FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on." )
43
58
endif ()
44
59
45
- set (TRT_PATH "" CACHE STRING "TensorRT path" )
46
-
47
- if (BUILD_TRT AND NOT TRT_PATH)
48
- message (FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on." )
49
- endif ()
50
-
51
60
list (APPEND CMAKE_MODULE_PATH ${CUDA_PATH} /lib64)
52
61
53
62
if (${CUDA_VERSION} GREATER_EQUAL 11.0)
54
63
message (STATUS "Add DCUDA11_MODE" )
55
64
add_definitions ("-DCUDA11_MODE" )
56
65
endif ()
57
66
67
+ # profiling
68
+ option (USE_NVTX "Whether or not to use nvtx" OFF )
69
+ if (USE_NVTX)
70
+ message (STATUS "NVTX is enabled." )
71
+ add_definitions ("-DUSE_NVTX" )
72
+ endif ()
73
+
58
74
# setting compiler flags
59
- set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} " )
75
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} " )
60
76
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} " )
61
- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall" )
62
-
63
- if (SM STREQUAL 80 OR
64
- SM STREQUAL 86 OR
65
- SM STREQUAL 70 OR
66
- SM STREQUAL 75 OR
67
- SM STREQUAL 61 OR
68
- SM STREQUAL 60)
69
- #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
70
- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM} ,code=\\\" sm_${SM} ,compute_${SM} \\\" " )
71
- if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
72
- set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA" )
73
- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA" )
74
- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA" )
77
+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl" )
78
+
79
+ # if (SM STREQUAL 80 OR
80
+ # SM STREQUAL 86 OR
81
+ # SM STREQUAL 70 OR
82
+ # SM STREQUAL 75 OR
83
+ # SM STREQUAL 61 OR
84
+ # SM STREQUAL 60)
85
+ # #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
86
+ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
87
+ # if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
88
+ # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
89
+ # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
90
+ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
91
+ # endif()
92
+ # message("-- Assign GPU architecture (sm=${SM})")
93
+
94
+ # else()
95
+ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
96
+ # -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
97
+ # -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
98
+ # ")
99
+ # # -rdc=true")
100
+ # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
101
+ # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
102
+ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
103
+ # message("-- Assign GPU architecture (sm=70,75)")
104
+ # endif()
105
+
106
+ set (SM_SETS 52 60 61 70 75 80)
107
+ set (USING_WMMA False )
108
+ set (FIND_SM False )
109
+
110
+ foreach (SM_NUM IN LISTS SM_SETS)
111
+ string (FIND "${SM} " "${SM_NUM} " SM_POS)
112
+ if (SM_POS GREATER -1)
113
+ if (FIND_SM STREQUAL False )
114
+ set (ENV{TORCH_CUDA_ARCH_LIST} "" )
115
+ endif ()
116
+ set (FIND_SM True )
117
+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM} ,code=\\\" sm_${SM_NUM} ,compute_${SM_NUM} \\\" " )
118
+
119
+ if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86)
120
+ set (USING_WMMA True )
121
+ endif ()
122
+
123
+ if (BUILD_PYT)
124
+ string (SUBSTRING ${SM_NUM} 0 1 SM_MAJOR)
125
+ string (SUBSTRING ${SM_NUM} 1 1 SM_MINOR)
126
+ set (ENV{TORCH_CUDA_ARCH_LIST} "$ENV{TORCH_CUDA_ARCH_LIST} \; ${SM_MAJOR} .${SM_MINOR} " )
127
+ endif ()
128
+
129
+ set (CMAKE_CUDA_ARCHITECTURES ${SM_NUM} )
130
+ message ("-- Assign GPU architecture (sm=${SM_NUM} )" )
75
131
endif ()
76
- if (BUILD_THE OR BUILD_THS)
77
- string (SUBSTRING ${SM} 0 1 SM_MAJOR)
78
- string (SUBSTRING ${SM} 1 1 SM_MINOR)
79
- set (ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR} .${SM_MINOR} " )
132
+ endforeach ()
133
+
134
+ if (USING_WMMA STREQUAL True )
135
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA" )
136
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA" )
137
+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA" )
138
+ message ("-- Use WMMA" )
80
139
endif ()
81
- message ("-- Assign GPU architecture (sm=${SM} )" )
82
140
83
- else ()
84
- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
85
- -gencode=arch=compute_70,code=\\\" sm_70,compute_70\\\" \
86
- -gencode=arch=compute_75,code=\\\" sm_75,compute_75\\\" \
87
- " )
88
- # -rdc=true")
89
- set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA" )
90
- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA" )
91
- set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA" )
92
- if (BUILD_THE OR BUILD_THS)
93
- set (ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5" )
141
+ if (NOT (FIND_SM STREQUAL True ))
142
+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
143
+ -gencode=arch=compute_70,code=\\\" sm_70,compute_70\\\" \
144
+ -gencode=arch=compute_75,code=\\\" sm_75,compute_75\\\" \
145
+ -gencode=arch=compute_80,code=\\\" sm_80,compute_80\\\" \
146
+ " )
147
+ # -rdc=true")
148
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA" )
149
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA" )
150
+ set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA" )
151
+ if (BUILD_PYT)
152
+ set (ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0" )
153
+ endif ()
154
+ set (CMAKE_CUDA_ARCHITECTURES 70 75 80)
155
+ message ("-- Assign GPU architecture (sm=70,75,80)" )
94
156
endif ()
95
- message ("-- Assign GPU architecture (sm=70,75)" )
157
+
158
+ if (BUILD_PYT)
159
+ set (TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST} )
96
160
endif ()
97
161
98
162
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0" )
@@ -128,21 +192,14 @@ if(BUILD_TF)
128
192
list (APPEND COMMON_LIB_DIRS ${TF_PATH} )
129
193
endif ()
130
194
131
- if (BUILD_TRT)
132
- list (APPEND COMMON_HEADER_DIRS ${TRT_PATH} /include )
133
- list (APPEND COMMON_LIB_DIRS ${TRT_PATH} /lib)
134
- endif ()
135
-
136
195
set (PYTHON_PATH "python" CACHE STRING "Python path" )
137
- if (BUILD_THS )
196
+ if (BUILD_PYT )
138
197
execute_process (COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
139
198
RESULT_VARIABLE _PYTHON_SUCCESS
140
199
OUTPUT_VARIABLE TORCH_VERSION)
141
200
if (TORCH_VERSION VERSION_LESS "1.5.0" )
142
201
message (FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode." )
143
202
endif ()
144
- endif ()
145
- if (BUILD_THE OR BUILD_THS)
146
203
execute_process (COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
147
204
print(os.path.dirname(torch.__file__),end='');"
148
205
RESULT_VARIABLE _PYTHON_SUCCESS
@@ -152,34 +209,25 @@ print(os.path.dirname(torch.__file__),end='');"
152
209
endif ()
153
210
list (APPEND CMAKE_PREFIX_PATH ${TORCH_DIR} )
154
211
find_package (Torch REQUIRED)
155
-
156
212
execute_process (COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
157
- print(sysconfig.get_python_inc());
158
- print(sysconfig.get_config_var('SO'));"
213
+ print(sysconfig.get_python_inc());"
159
214
RESULT_VARIABLE _PYTHON_SUCCESS
160
- OUTPUT_VARIABLE _PYTHON_VALUES )
215
+ OUTPUT_VARIABLE PY_INCLUDE_DIR )
161
216
if (NOT _PYTHON_SUCCESS MATCHES 0)
162
217
message (FATAL_ERROR "Python config Error." )
163
218
endif ()
164
- string (REGEX REPLACE ";" "\\\\ ;" _PYTHON_VALUES ${_PYTHON_VALUES} )
165
- string (REGEX REPLACE "\n " ";" _PYTHON_VALUES ${_PYTHON_VALUES} )
166
- list (GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
167
- list (GET _PYTHON_VALUES 1 PY_SUFFIX)
168
219
list (APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR} )
169
-
170
- execute_process (COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
171
- RESULT_VARIABLE _PYTHON_SUCCESS
172
- OUTPUT_VARIABLE TORCH_LINK)
173
- if (NOT _PYTHON_SUCCESS MATCHES 0)
174
- message (FATAL_ERROR "PyTorch link config Error." )
175
- endif ()
176
220
endif ()
177
221
222
+ list (APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH} )
178
223
179
224
include_directories (
180
225
${COMMON_HEADER_DIRS}
181
226
)
182
227
228
+ # set path of mpi
229
+ list (APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
230
+
183
231
link_directories (
184
232
${COMMON_LIB_DIRS}
185
233
)
@@ -196,7 +244,7 @@ if(BUILD_TF)
196
244
)
197
245
endif ()
198
246
199
- if (BUILD_THE OR BUILD_THS )
247
+ if (BUILD_PYT )
200
248
add_custom_target (copy ALL COMMENT "Copying pytorch test scripts" )
201
249
add_custom_command (TARGET copy
202
250
POST_BUILD
@@ -205,3 +253,110 @@ if(BUILD_THE OR BUILD_THS)
205
253
COMMAND cp ${PROJECT_SOURCE_DIR} /sample/tensorflow/utils/translation/test .* ${PROJECT_BINARY_DIR} /pytorch/translation/data/
206
254
)
207
255
endif ()
256
+
257
+ ########################################
258
+
259
+ if (BUILD_GPT)
260
+ # Following feature requires cmake 3.15
261
+ # TODO Remove this part or modify such that we can run it under cmake 3.10
262
+ cmake_minimum_required (VERSION 3.15 FATAL_ERROR)
263
+ add_library (transformer-static STATIC
264
+ $<TARGET_OBJECTS:encoder>
265
+ $<TARGET_OBJECTS:cuda_kernels>
266
+ $<TARGET_OBJECTS:transformer_kernels>
267
+ $<TARGET_OBJECTS:nvtx_utils>
268
+ $<TARGET_OBJECTS:cuda_int8_kernels>
269
+ $<TARGET_OBJECTS:attention_kernels>
270
+ $<TARGET_OBJECTS:trt_fused_multi_head_attention>
271
+ $<TARGET_OBJECTS:encoder_gemm_func>
272
+ $<TARGET_OBJECTS:encoder_igemm_func>
273
+ $<TARGET_OBJECTS:decoder>
274
+ $<TARGET_OBJECTS:decoding>
275
+ $<TARGET_OBJECTS:topk>
276
+ $<TARGET_OBJECTS:online_softmax_beamsearch>
277
+ $<TARGET_OBJECTS:nccl_utils>)
278
+ set_property (TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON )
279
+ set_property (TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON )
280
+ target_link_libraries (transformer-static PUBLIC -lcublas -lcudart -lcurand -lnccl -lmpi nvtx_utils)
281
+
282
+ add_library (transformer-shared SHARED
283
+ $<TARGET_OBJECTS:encoder>
284
+ $<TARGET_OBJECTS:cuda_kernels>
285
+ $<TARGET_OBJECTS:transformer_kernels>
286
+ $<TARGET_OBJECTS:nvtx_utils>
287
+ $<TARGET_OBJECTS:cuda_int8_kernels>
288
+ $<TARGET_OBJECTS:attention_kernels>
289
+ $<TARGET_OBJECTS:trt_fused_multi_head_attention>
290
+ $<TARGET_OBJECTS:encoder_gemm_func>
291
+ $<TARGET_OBJECTS:encoder_igemm_func>
292
+ $<TARGET_OBJECTS:decoder>
293
+ $<TARGET_OBJECTS:decoding>
294
+ $<TARGET_OBJECTS:topk>
295
+ $<TARGET_OBJECTS:online_softmax_beamsearch>
296
+ $<TARGET_OBJECTS:nccl_utils>
297
+ $<TARGET_OBJECTS:gpt_triton_backend>)
298
+ ## add_library(transformer-shared SHARED $<TARGET_OBJECTS:encoder>)
299
+ set_target_properties (transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON )
300
+ set_target_properties (transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON )
301
+ set_target_properties (transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
302
+ target_link_libraries (transformer-shared PUBLIC ${NCCL_LIBRARIES} ${MPI_LIBRARIES} -lcublas -lcublasLt -lcudart -lcurand )
303
+
304
+ include (GNUInstallDirs)
305
+ set (INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR} /cmake/FasterTransformer)
306
+
307
+ include (CMakePackageConfigHelpers)
308
+ configure_package_config_file(
309
+ ${CMAKE_CURRENT_LIST_DIR} /cmake/FasterTransformerConfig.cmake.in
310
+ ${CMAKE_CURRENT_BINARY_DIR} /FasterTransformerConfig.cmake
311
+ INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
312
+ )
313
+
314
+ install (
315
+ FILES
316
+ ${CMAKE_CURRENT_BINARY_DIR} /FasterTransformerConfig.cmake
317
+ DESTINATION ${INSTALL_CONFIGDIR}
318
+ )
319
+
320
+ install (
321
+ TARGETS
322
+ transformer-shared
323
+ EXPORT
324
+ transformer-shared-targets
325
+ LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX} /lib
326
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX} /lib
327
+ )
328
+
329
+ install (
330
+ EXPORT
331
+ transformer-shared-targets
332
+ FILE
333
+ FasterTransformerTargets.cmake
334
+ DESTINATION
335
+ ${INSTALL_CONFIGDIR}
336
+ )
337
+
338
+ file (GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh" )
339
+ foreach ( file ${HEADER_FILES} )
340
+ file ( RELATIVE_PATH rfile ${CMAKE_CURRENT_SOURCE_DIR} ${file} )
341
+ get_filename_component ( dir ${rfile} DIRECTORY )
342
+ install ( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX} /include /${dir} )
343
+ endforeach ()
344
+
345
+
346
+ ################################################################################
347
+ add_executable (gpt sample/cpp/gpt_sample.cc )
348
+ target_link_libraries (gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static )
349
+ # target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding)
350
+
351
+ export (
352
+ EXPORT
353
+ transformer-shared-targets
354
+ FILE
355
+ ${CMAKE_CURRENT_BINARY_DIR} /FasterTransformerTargets.cmake
356
+ NAMESPACE
357
+ TritonCore::
358
+ )
359
+
360
+ export (PACKAGE FasterTransformer)
361
+
362
+ endif () # BUILD_GPT
0 commit comments