Skip to content

Commit 0280a66

Browse files
committed
Add the v4.0 codes
1 parent 67bd758 commit 0280a66

File tree

203 files changed

+139218
-10604
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

203 files changed

+139218
-10604
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
*~
2+
*.o
3+
*build*/
4+
*.pyc

CMakeLists.txt

Lines changed: 223 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -16,12 +16,23 @@ project(FasterTransformer LANGUAGES CXX CUDA)
1616

1717
find_package(CUDA 10.1 REQUIRED)
1818

19-
option(BUILD_TRT "Build in TensorRT mode" OFF)
2019
option(BUILD_TF "Build in TensorFlow mode" OFF)
21-
option(BUILD_THE "Build in PyTorch eager mode" OFF)
22-
option(BUILD_THS "Build in TorchScript class mode" OFF)
20+
option(BUILD_PYT "Build in PyTorch TorchScript class mode" OFF)
21+
option(BUILD_GPT "Build project with gpt" ON) # TODO Set default to OFF
2322

24-
if(BUILD_THS)
23+
if(BUILD_GPT)
24+
message(STATUS "Add DBUILD_GPT, requires MPI and NCCL")
25+
add_definitions("-DBUILD_GPT")
26+
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
27+
find_package(MPI REQUIRED)
28+
find_package(NCCL REQUIRED)
29+
#if(${NCCL_VERSION} LESS 2.7)
30+
# message(FATAL_ERROR "NCCL_VERSION ${NCCL_VERSION} is less than 2.7")
31+
#endif()
32+
set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building
33+
endif()
34+
35+
if(BUILD_PYT)
2536
if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
2637
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
2738
message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
@@ -32,7 +43,11 @@ if(BUILD_THS)
3243
endif()
3344
endif()
3445

35-
set(CXX_STD "11" CACHE STRING "C++ standard")
46+
if(BUILD_PYT OR BUILD_GPT)
47+
set(CXX_STD "14" CACHE STRING "C++ standard")
48+
else()
49+
set(CXX_STD "11" CACHE STRING "C++ standard")
50+
endif()
3651

3752
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
3853

@@ -42,57 +57,106 @@ if(BUILD_TF AND NOT TF_PATH)
4257
message(FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on.")
4358
endif()
4459

45-
set(TRT_PATH "" CACHE STRING "TensorRT path")
46-
47-
if(BUILD_TRT AND NOT TRT_PATH)
48-
message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on.")
49-
endif()
50-
5160
list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)
5261

5362
if (${CUDA_VERSION} GREATER_EQUAL 11.0)
5463
message(STATUS "Add DCUDA11_MODE")
5564
add_definitions("-DCUDA11_MODE")
5665
endif()
5766

67+
# profiling
68+
option(USE_NVTX "Whether or not to use nvtx" OFF)
69+
if(USE_NVTX)
70+
message(STATUS "NVTX is enabled.")
71+
add_definitions("-DUSE_NVTX")
72+
endif()
73+
5874
# setting compiler flags
59-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
75+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
6076
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
61-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")
62-
63-
if (SM STREQUAL 80 OR
64-
SM STREQUAL 86 OR
65-
SM STREQUAL 70 OR
66-
SM STREQUAL 75 OR
67-
SM STREQUAL 61 OR
68-
SM STREQUAL 60)
69-
#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
70-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
71-
if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
72-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
73-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
74-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
77+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl")
78+
79+
# if (SM STREQUAL 80 OR
80+
# SM STREQUAL 86 OR
81+
# SM STREQUAL 70 OR
82+
# SM STREQUAL 75 OR
83+
# SM STREQUAL 61 OR
84+
# SM STREQUAL 60)
85+
# #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
86+
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
87+
# if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
88+
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
89+
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
90+
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
91+
# endif()
92+
# message("-- Assign GPU architecture (sm=${SM})")
93+
94+
# else()
95+
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
96+
# -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
97+
# -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
98+
# ")
99+
# # -rdc=true")
100+
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
101+
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
102+
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
103+
# message("-- Assign GPU architecture (sm=70,75)")
104+
# endif()
105+
106+
set(SM_SETS 52 60 61 70 75 80)
107+
set(USING_WMMA False)
108+
set(FIND_SM False)
109+
110+
foreach(SM_NUM IN LISTS SM_SETS)
111+
string(FIND "${SM}" "${SM_NUM}" SM_POS)
112+
if(SM_POS GREATER -1)
113+
if(FIND_SM STREQUAL False)
114+
set(ENV{TORCH_CUDA_ARCH_LIST} "")
115+
endif()
116+
set(FIND_SM True)
117+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"")
118+
119+
if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86)
120+
set(USING_WMMA True)
121+
endif()
122+
123+
if(BUILD_PYT)
124+
string(SUBSTRING ${SM_NUM} 0 1 SM_MAJOR)
125+
string(SUBSTRING ${SM_NUM} 1 1 SM_MINOR)
126+
set(ENV{TORCH_CUDA_ARCH_LIST} "$ENV{TORCH_CUDA_ARCH_LIST}\;${SM_MAJOR}.${SM_MINOR}")
127+
endif()
128+
129+
set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM})
130+
message("-- Assign GPU architecture (sm=${SM_NUM})")
75131
endif()
76-
if(BUILD_THE OR BUILD_THS)
77-
string(SUBSTRING ${SM} 0 1 SM_MAJOR)
78-
string(SUBSTRING ${SM} 1 1 SM_MINOR)
79-
set(ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR}.${SM_MINOR}")
132+
endforeach()
133+
134+
if(USING_WMMA STREQUAL True)
135+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
136+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
137+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
138+
message("-- Use WMMA")
80139
endif()
81-
message("-- Assign GPU architecture (sm=${SM})")
82140

83-
else()
84-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
85-
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
86-
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
87-
")
88-
# -rdc=true")
89-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
90-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
91-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
92-
if(BUILD_THE OR BUILD_THS)
93-
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5")
141+
if(NOT (FIND_SM STREQUAL True))
142+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
143+
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
144+
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
145+
-gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \
146+
")
147+
# -rdc=true")
148+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
149+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
150+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
151+
if(BUILD_PYT)
152+
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0")
153+
endif()
154+
set(CMAKE_CUDA_ARCHITECTURES 70 75 80)
155+
message("-- Assign GPU architecture (sm=70,75,80)")
94156
endif()
95-
message("-- Assign GPU architecture (sm=70,75)")
157+
158+
if(BUILD_PYT)
159+
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
96160
endif()
97161

98162
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
@@ -128,21 +192,14 @@ if(BUILD_TF)
128192
list(APPEND COMMON_LIB_DIRS ${TF_PATH})
129193
endif()
130194

131-
if(BUILD_TRT)
132-
list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
133-
list(APPEND COMMON_LIB_DIRS ${TRT_PATH}/lib)
134-
endif()
135-
136195
set(PYTHON_PATH "python" CACHE STRING "Python path")
137-
if(BUILD_THS)
196+
if(BUILD_PYT)
138197
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
139198
RESULT_VARIABLE _PYTHON_SUCCESS
140199
OUTPUT_VARIABLE TORCH_VERSION)
141200
if (TORCH_VERSION VERSION_LESS "1.5.0")
142201
message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
143202
endif()
144-
endif()
145-
if(BUILD_THE OR BUILD_THS)
146203
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
147204
print(os.path.dirname(torch.__file__),end='');"
148205
RESULT_VARIABLE _PYTHON_SUCCESS
@@ -152,34 +209,25 @@ print(os.path.dirname(torch.__file__),end='');"
152209
endif()
153210
list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
154211
find_package(Torch REQUIRED)
155-
156212
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
157-
print(sysconfig.get_python_inc());
158-
print(sysconfig.get_config_var('SO'));"
213+
print(sysconfig.get_python_inc());"
159214
RESULT_VARIABLE _PYTHON_SUCCESS
160-
OUTPUT_VARIABLE _PYTHON_VALUES)
215+
OUTPUT_VARIABLE PY_INCLUDE_DIR)
161216
if (NOT _PYTHON_SUCCESS MATCHES 0)
162217
message(FATAL_ERROR "Python config Error.")
163218
endif()
164-
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
165-
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
166-
list(GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
167-
list(GET _PYTHON_VALUES 1 PY_SUFFIX)
168219
list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})
169-
170-
execute_process(COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
171-
RESULT_VARIABLE _PYTHON_SUCCESS
172-
OUTPUT_VARIABLE TORCH_LINK)
173-
if (NOT _PYTHON_SUCCESS MATCHES 0)
174-
message(FATAL_ERROR "PyTorch link config Error.")
175-
endif()
176220
endif()
177221

222+
list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
178223

179224
include_directories(
180225
${COMMON_HEADER_DIRS}
181226
)
182227

228+
# set path of mpi
229+
list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
230+
183231
link_directories(
184232
${COMMON_LIB_DIRS}
185233
)
@@ -196,7 +244,7 @@ if(BUILD_TF)
196244
)
197245
endif()
198246

199-
if(BUILD_THE OR BUILD_THS)
247+
if(BUILD_PYT)
200248
add_custom_target(copy ALL COMMENT "Copying pytorch test scripts")
201249
add_custom_command(TARGET copy
202250
POST_BUILD
@@ -205,3 +253,110 @@ if(BUILD_THE OR BUILD_THS)
205253
COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/utils/translation/test.* ${PROJECT_BINARY_DIR}/pytorch/translation/data/
206254
)
207255
endif()
256+
257+
########################################
258+
259+
if(BUILD_GPT)
260+
# Following feature requires cmake 3.15
261+
# TODO Remove this part or modify such that we can run it under cmake 3.10
262+
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
263+
add_library(transformer-static STATIC
264+
$<TARGET_OBJECTS:encoder>
265+
$<TARGET_OBJECTS:cuda_kernels>
266+
$<TARGET_OBJECTS:transformer_kernels>
267+
$<TARGET_OBJECTS:nvtx_utils>
268+
$<TARGET_OBJECTS:cuda_int8_kernels>
269+
$<TARGET_OBJECTS:attention_kernels>
270+
$<TARGET_OBJECTS:trt_fused_multi_head_attention>
271+
$<TARGET_OBJECTS:encoder_gemm_func>
272+
$<TARGET_OBJECTS:encoder_igemm_func>
273+
$<TARGET_OBJECTS:decoder>
274+
$<TARGET_OBJECTS:decoding>
275+
$<TARGET_OBJECTS:topk>
276+
$<TARGET_OBJECTS:online_softmax_beamsearch>
277+
$<TARGET_OBJECTS:nccl_utils>)
278+
set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON)
279+
set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
280+
target_link_libraries(transformer-static PUBLIC -lcublas -lcudart -lcurand -lnccl -lmpi nvtx_utils)
281+
282+
add_library(transformer-shared SHARED
283+
$<TARGET_OBJECTS:encoder>
284+
$<TARGET_OBJECTS:cuda_kernels>
285+
$<TARGET_OBJECTS:transformer_kernels>
286+
$<TARGET_OBJECTS:nvtx_utils>
287+
$<TARGET_OBJECTS:cuda_int8_kernels>
288+
$<TARGET_OBJECTS:attention_kernels>
289+
$<TARGET_OBJECTS:trt_fused_multi_head_attention>
290+
$<TARGET_OBJECTS:encoder_gemm_func>
291+
$<TARGET_OBJECTS:encoder_igemm_func>
292+
$<TARGET_OBJECTS:decoder>
293+
$<TARGET_OBJECTS:decoding>
294+
$<TARGET_OBJECTS:topk>
295+
$<TARGET_OBJECTS:online_softmax_beamsearch>
296+
$<TARGET_OBJECTS:nccl_utils>
297+
$<TARGET_OBJECTS:gpt_triton_backend>)
298+
## add_library(transformer-shared SHARED $<TARGET_OBJECTS:encoder>)
299+
set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
300+
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
301+
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
302+
target_link_libraries(transformer-shared PUBLIC ${NCCL_LIBRARIES} ${MPI_LIBRARIES} -lcublas -lcublasLt -lcudart -lcurand )
303+
304+
include(GNUInstallDirs)
305+
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer)
306+
307+
include(CMakePackageConfigHelpers)
308+
configure_package_config_file(
309+
${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in
310+
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
311+
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
312+
)
313+
314+
install(
315+
FILES
316+
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
317+
DESTINATION ${INSTALL_CONFIGDIR}
318+
)
319+
320+
install(
321+
TARGETS
322+
transformer-shared
323+
EXPORT
324+
transformer-shared-targets
325+
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
326+
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
327+
)
328+
329+
install(
330+
EXPORT
331+
transformer-shared-targets
332+
FILE
333+
FasterTransformerTargets.cmake
334+
DESTINATION
335+
${INSTALL_CONFIGDIR}
336+
)
337+
338+
file(GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh")
339+
foreach ( file ${HEADER_FILES} )
340+
file( RELATIVE_PATH rfile ${CMAKE_CURRENT_SOURCE_DIR} ${file} )
341+
get_filename_component( dir ${rfile} DIRECTORY )
342+
install( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/${dir} )
343+
endforeach()
344+
345+
346+
################################################################################
347+
add_executable(gpt sample/cpp/gpt_sample.cc )
348+
target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static)
349+
# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding)
350+
351+
export(
352+
EXPORT
353+
transformer-shared-targets
354+
FILE
355+
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake
356+
NAMESPACE
357+
TritonCore::
358+
)
359+
360+
export(PACKAGE FasterTransformer)
361+
362+
endif() # BUILD_GPT

0 commit comments

Comments
 (0)