Skip to content

Commit

Permalink
Add the v4.0 codes
Browse files · Browse the repository at this point in the history
  • Loading branch information
byshiue committed Apr 4, 2021
1 parent 67bd758 commit 0280a66
Show file tree
Hide file tree
Showing 203 changed files with 139,218 additions and 10,604 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*~
*.o
*build*/
*.pyc
291 changes: 223 additions & 68 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,12 +16,23 @@ project(FasterTransformer LANGUAGES CXX CUDA)

find_package(CUDA 10.1 REQUIRED)

option(BUILD_TRT "Build in TensorRT mode" OFF)
option(BUILD_TF "Build in TensorFlow mode" OFF)
option(BUILD_THE "Build in PyTorch eager mode" OFF)
option(BUILD_THS "Build in TorchScript class mode" OFF)
option(BUILD_PYT "Build in PyTorch TorchScript class mode" OFF)
option(BUILD_GPT "Build project with gpt" ON) # TODO Set default to OFF

if(BUILD_THS)
if(BUILD_GPT)
message(STATUS "Add DBUILD_GPT, requires MPI and NCCL")
add_definitions("-DBUILD_GPT")
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
find_package(MPI REQUIRED)
find_package(NCCL REQUIRED)
#if(${NCCL_VERSION} LESS 2.7)
# message(FATAL_ERROR "NCCL_VERSION ${NCCL_VERSION} is less than 2.7")
#endif()
set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building
endif()

if(BUILD_PYT)
if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
Expand All @@ -32,7 +43,11 @@ if(BUILD_THS)
endif()
endif()

set(CXX_STD "11" CACHE STRING "C++ standard")
if(BUILD_PYT OR BUILD_GPT)
set(CXX_STD "14" CACHE STRING "C++ standard")
else()
set(CXX_STD "11" CACHE STRING "C++ standard")
endif()

set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

Expand All @@ -42,57 +57,106 @@ if(BUILD_TF AND NOT TF_PATH)
message(FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on.")
endif()

set(TRT_PATH "" CACHE STRING "TensorRT path")

if(BUILD_TRT AND NOT TRT_PATH)
message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on.")
endif()

list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)

if (${CUDA_VERSION} GREATER_EQUAL 11.0)
message(STATUS "Add DCUDA11_MODE")
add_definitions("-DCUDA11_MODE")
endif()

# profiling
option(USE_NVTX "Whether or not to use nvtx" OFF)
if(USE_NVTX)
message(STATUS "NVTX is enabled.")
add_definitions("-DUSE_NVTX")
endif()

# setting compiler flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")

if (SM STREQUAL 80 OR
SM STREQUAL 86 OR
SM STREQUAL 70 OR
SM STREQUAL 75 OR
SM STREQUAL 61 OR
SM STREQUAL 60)
#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl")

# if (SM STREQUAL 80 OR
# SM STREQUAL 86 OR
# SM STREQUAL 70 OR
# SM STREQUAL 75 OR
# SM STREQUAL 61 OR
# SM STREQUAL 60)
# #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
# if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
# endif()
# message("-- Assign GPU architecture (sm=${SM})")

# else()
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
# -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
# -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
# ")
# # -rdc=true")
# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
# message("-- Assign GPU architecture (sm=70,75)")
# endif()

set(SM_SETS 52 60 61 70 75 80)
set(USING_WMMA False)
set(FIND_SM False)

foreach(SM_NUM IN LISTS SM_SETS)
string(FIND "${SM}" "${SM_NUM}" SM_POS)
if(SM_POS GREATER -1)
if(FIND_SM STREQUAL False)
set(ENV{TORCH_CUDA_ARCH_LIST} "")
endif()
set(FIND_SM True)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"")

if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86)
set(USING_WMMA True)
endif()

if(BUILD_PYT)
string(SUBSTRING ${SM_NUM} 0 1 SM_MAJOR)
string(SUBSTRING ${SM_NUM} 1 1 SM_MINOR)
set(ENV{TORCH_CUDA_ARCH_LIST} "$ENV{TORCH_CUDA_ARCH_LIST}\;${SM_MAJOR}.${SM_MINOR}")
endif()

set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM})
message("-- Assign GPU architecture (sm=${SM_NUM})")
endif()
if(BUILD_THE OR BUILD_THS)
string(SUBSTRING ${SM} 0 1 SM_MAJOR)
string(SUBSTRING ${SM} 1 1 SM_MINOR)
set(ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR}.${SM_MINOR}")
endforeach()

if(USING_WMMA STREQUAL True)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
message("-- Use WMMA")
endif()
message("-- Assign GPU architecture (sm=${SM})")

else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
")
# -rdc=true")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
if(BUILD_THE OR BUILD_THS)
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5")
if(NOT (FIND_SM STREQUAL True))
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
-gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \
")
# -rdc=true")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
if(BUILD_PYT)
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0")
endif()
set(CMAKE_CUDA_ARCHITECTURES 70 75 80)
message("-- Assign GPU architecture (sm=70,75,80)")
endif()
message("-- Assign GPU architecture (sm=70,75)")

if(BUILD_PYT)
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
endif()

set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
Expand Down Expand Up @@ -128,21 +192,14 @@ if(BUILD_TF)
list(APPEND COMMON_LIB_DIRS ${TF_PATH})
endif()

if(BUILD_TRT)
list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
list(APPEND COMMON_LIB_DIRS ${TRT_PATH}/lib)
endif()

set(PYTHON_PATH "python" CACHE STRING "Python path")
if(BUILD_THS)
if(BUILD_PYT)
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE TORCH_VERSION)
if (TORCH_VERSION VERSION_LESS "1.5.0")
message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
endif()
endif()
if(BUILD_THE OR BUILD_THS)
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
print(os.path.dirname(torch.__file__),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
Expand All @@ -152,34 +209,25 @@ print(os.path.dirname(torch.__file__),end='');"
endif()
list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
find_package(Torch REQUIRED)

execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
print(sysconfig.get_python_inc());
print(sysconfig.get_config_var('SO'));"
print(sysconfig.get_python_inc());"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES)
OUTPUT_VARIABLE PY_INCLUDE_DIR)
if (NOT _PYTHON_SUCCESS MATCHES 0)
message(FATAL_ERROR "Python config Error.")
endif()
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
list(GET _PYTHON_VALUES 1 PY_SUFFIX)
list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})

execute_process(COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE TORCH_LINK)
if (NOT _PYTHON_SUCCESS MATCHES 0)
message(FATAL_ERROR "PyTorch link config Error.")
endif()
endif()

list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})

include_directories(
${COMMON_HEADER_DIRS}
)

# set path of mpi
list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)

link_directories(
${COMMON_LIB_DIRS}
)
Expand All @@ -196,7 +244,7 @@ if(BUILD_TF)
)
endif()

if(BUILD_THE OR BUILD_THS)
if(BUILD_PYT)
add_custom_target(copy ALL COMMENT "Copying pytorch test scripts")
add_custom_command(TARGET copy
POST_BUILD
Expand All @@ -205,3 +253,110 @@ if(BUILD_THE OR BUILD_THS)
COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/utils/translation/test.* ${PROJECT_BINARY_DIR}/pytorch/translation/data/
)
endif()

########################################

if(BUILD_GPT)
# Following feature requires cmake 3.15
# TODO Remove this part or modify such that we can run it under cmake 3.10
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
add_library(transformer-static STATIC
$<TARGET_OBJECTS:encoder>
$<TARGET_OBJECTS:cuda_kernels>
$<TARGET_OBJECTS:transformer_kernels>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:cuda_int8_kernels>
$<TARGET_OBJECTS:attention_kernels>
$<TARGET_OBJECTS:trt_fused_multi_head_attention>
$<TARGET_OBJECTS:encoder_gemm_func>
$<TARGET_OBJECTS:encoder_igemm_func>
$<TARGET_OBJECTS:decoder>
$<TARGET_OBJECTS:decoding>
$<TARGET_OBJECTS:topk>
$<TARGET_OBJECTS:online_softmax_beamsearch>
$<TARGET_OBJECTS:nccl_utils>)
set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(transformer-static PUBLIC -lcublas -lcudart -lcurand -lnccl -lmpi nvtx_utils)

add_library(transformer-shared SHARED
$<TARGET_OBJECTS:encoder>
$<TARGET_OBJECTS:cuda_kernels>
$<TARGET_OBJECTS:transformer_kernels>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:cuda_int8_kernels>
$<TARGET_OBJECTS:attention_kernels>
$<TARGET_OBJECTS:trt_fused_multi_head_attention>
$<TARGET_OBJECTS:encoder_gemm_func>
$<TARGET_OBJECTS:encoder_igemm_func>
$<TARGET_OBJECTS:decoder>
$<TARGET_OBJECTS:decoding>
$<TARGET_OBJECTS:topk>
$<TARGET_OBJECTS:online_softmax_beamsearch>
$<TARGET_OBJECTS:nccl_utils>
$<TARGET_OBJECTS:gpt_triton_backend>)
## add_library(transformer-shared SHARED $<TARGET_OBJECTS:encoder>)
set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(transformer-shared PUBLIC ${NCCL_LIBRARIES} ${MPI_LIBRARIES} -lcublas -lcublasLt -lcudart -lcurand )

include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer)

include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)

install(
TARGETS
transformer-shared
EXPORT
transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
)

install(
EXPORT
transformer-shared-targets
FILE
FasterTransformerTargets.cmake
DESTINATION
${INSTALL_CONFIGDIR}
)

file(GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh")
foreach ( file ${HEADER_FILES} )
file( RELATIVE_PATH rfile ${CMAKE_CURRENT_SOURCE_DIR} ${file} )
get_filename_component( dir ${rfile} DIRECTORY )
install( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/${dir} )
endforeach()


################################################################################
add_executable(gpt sample/cpp/gpt_sample.cc )
target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static)
# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding)

export(
EXPORT
transformer-shared-targets
FILE
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake
NAMESPACE
TritonCore::
)

export(PACKAGE FasterTransformer)

endif() # BUILD_GPT
Loading

0 comments on commit 0280a66

Please sign in to comment.