Skip to content

Commit

Permalink
updated build system
Browse files Browse the repository at this point in the history
  • Loading branch information
DiamonDinoia committed Jul 25, 2024
1 parent d0ce11e commit 73f937b
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 16 deletions.
17 changes: 9 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.19)
cmake_minimum_required(VERSION 3.23)

project(FINUFFT VERSION 2.2.0 LANGUAGES C CXX)

Expand Down Expand Up @@ -46,7 +46,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS AND NOT DEFINED FINUFFT_ARC
endif ()
set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")

set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)")
# All options go here
# sphinx tag (don't remove): @cmake_opts_start
option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF)
Expand Down Expand Up @@ -271,25 +271,26 @@ if (FINUFFT_USE_CUDA)
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
add_subdirectory(src/cuda)
if (BUILD_TESTING AND FINUFFT_BUILD_TESTS)
if (FINUFFT_BUILD_TESTS)
add_subdirectory(perftest/cuda)
add_subdirectory(test/cuda)
endif ()

list(APPEND INSTALL_TARGETS cufinufft)
endif ()

# Add tests defined in their own directory
if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU)
if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS)
add_subdirectory(test)
add_subdirectory(perftest)
endif ()

if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA)
add_subdirectory(test/cuda)
if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU)
add_subdirectory(examples)
endif ()

if (FINUFFT_BUILD_EXAMPLES)
add_subdirectory(examples)
# Build the CUDA examples only when examples are requested and the CUDA backend
# is enabled. The backend switch tested elsewhere in this file is
# FINUFFT_USE_CUDA; no FINUFFT_USE_GPU variable is set anywhere visible here,
# and an undefined variable evaluates to false in if(), which made this branch
# unreachable.
if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CUDA)
    add_subdirectory(examples/cuda)
endif ()

if (FINUFFT_BUILD_FORTRAN)
Expand Down
6 changes: 6 additions & 0 deletions perftest/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
add_executable(cuperftest cuperftest.cu)
target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(cuperftest PUBLIC cufinufft)
# FINUFFT_CUDA_ARCHITECTURES is a ;-separated list (e.g. "60;70;75"). It must be
# quoted: an unquoted expansion is split into one argument per element, which
# breaks the PROPERTY/VALUE pairing expected by set_target_properties().
set_target_properties(cuperftest PROPERTIES
    LINKER_LANGUAGE CUDA
    CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
)

#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
185 changes: 185 additions & 0 deletions perftest/cuda/bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import matplotlib.pyplot as plt
import os
import subprocess
import pandas as pd
import numpy as np
import io
cwd = os.getcwd()


# function that runs a command line command and returns the output
# it also takes a list of arguments to pass to the command
def run_command(command, args):
    """Run ``command`` with ``args`` and return its captured output.

    Parameters
    ----------
    command : str
        Executable name or path.
    args : list of str
        Arguments placed after ``command`` on the command line.

    Returns
    -------
    tuple of (str, str)
        The process's stdout and stderr, decoded as text.

    Raises
    ------
    subprocess.CalledProcessError
        If the process exits with a non-zero status. The captured stdout and
        stderr are printed first. Previously the error was swallowed and the
        function returned ``None``, so callers that unpack the result failed
        with an unrelated ``TypeError``.
    """
    cmd = [command] + args
    print("Running command:", ' '.join(cmd))
    try:
        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except subprocess.CalledProcessError as e:
        print('stdout output:\n', e.stdout)
        print('stderr output:\n', e.stderr)
        print("Error executing command:", e)
        # Propagate the real failure instead of implicitly returning None.
        raise
    return result.stdout, result.stderr


# function that builds a flat list of command-line arguments from a dictionary of arguments

def build_args(args):
    """Flatten an option mapping into an argv-style list.

    Every ``key: value`` pair contributes two consecutive entries, the key
    followed by its value, in the mapping's iteration order.
    """
    return [token for option, value in args.items() for token in (option, value)]


# function

# example command to run:
# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
# example arguments
# Options for the benchmarked cuperftest runs. build_args() emits them in
# insertion order; "--tol" and "--method" are overwritten by the benchmark loop.
args = {
    "--prec": "f",
    "--n_runs": "5",
    "--method": "0",
    "--sort": "1",
    "--N1": "16777216",
    # alternative grid kept for reference: "--N1": "256", "--N2": "256", "--N3": "256"
    "--kerevalmethod": "1",
    "--M": "1E8",
    "--tol": "1E-6",
}

# Collected results; the benchmark loop appends one entry per list for every
# (tolerance, method) run.
data = {
    'method': [],
    'throughput': [],
    'tolerance': [],
    'exec': [],
}

# Small problem that is profiled once before the measured runs.
warmup = {
    "--prec": "f",
    "--n_runs": "1",
    "--method": "0",
    "--N1": "256",
    "--N2": "256",
    "--M": "256",
    "--tol": "1E-1",
}

cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup)
print("Warmup")
stdout, stderr = run_command("nsys", cmd)
if stderr != '':
    # Any stderr output is treated as fatal. Exit with a non-zero status so the
    # failure is visible to callers (the script used to exit with status 0).
    print(stderr)
    raise SystemExit(1)
print("Benchmarking")

# Exclusive upper bound of the tolerance exponent sweep: 1E-1 .. 1E-15 when
# "--prec" is "d", 1E-1 .. 1E-6 otherwise.
max_range = 16 if args["--prec"] == "d" else 7

# Display labels for the cuperftest "--method" codes, used as column names in
# the results table.
method_labels = {'0': 'auto', '1': 'GM', '2': 'SM', '4': 'BLOCK'}

# Sweep the tolerance exponent; for each tolerance profile both methods with
# nsys and derive a throughput figure from the exported GPU trace.
for i in range(1, max_range):
    # Zero-padded exponent, e.g. "1E-03".
    args["--tol"] = f"1E-{i:02d}"
    print("Running with tol = 1E-" + str(i))
    for method in ['2', '1']:
        args["--method"] = method
        data['method'].append(method_labels[method])
        print("Method " + data['method'][-1])

        cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args)
        stdout, stderr = run_command("nsys", cmd)
        if stderr != '':
            # Any stderr output is treated as fatal; exit with a non-zero
            # status so the failure is visible (this used to exit with 0).
            print(stderr)
            raise SystemExit(1)

        # Lines starting with "#" are echoed as-is; the first seven remaining
        # lines are parsed as CSV.
        lines = stdout.splitlines()
        conf = [x for x in lines if x.startswith("#")]
        print('\n'.join(conf))
        rows = [x for x in lines if not x.startswith("#")][:7]
        if rows[0].startswith("bin"):
            # Leading "bin..." line is not part of the CSV table.
            print(rows[0])
            rows = rows[1:]

        # Convert the remaining output to a dataframe from the CSV string.
        dt = pd.read_csv(io.StringIO('\n'.join(rows)), sep=',')
        # Each filter is expected to match a single row, so sum() extracts its value.
        setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum()
        exec_rate = dt[dt["event"].str.contains("exec")]['nupts/s'].sum()
        print(f'setpts pts/s: {setpts}')
        print(f'exec pts/s: {exec_rate}')

        # Export the profile just recorded to CSV reports (prefix "cuperftest").
        cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep",
               "--format=csv", "--output", "cuperftest"]
        run_command("nsys", cmd)

        dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv")
        # Total duration of rows whose name contains "fft" but not "cufinufft".
        is_fft = dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")
        total_fft = dt[is_fft]['Duration (ns)'].sum()
        print(f'total_fft: {total_fft}')

        # Keep only the spreading kernels.
        dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")]
        # NOTE(review): dt is already restricted to spread kernels here, yet the
        # FFT time is still subtracted -- confirm this is intended.
        total_spread = dt['Duration (ns)'].sum() - total_fft
        print(f'total_spread: {total_spread}')
        if total_fft > total_spread:
            print("Warning: total_fft > total_spread")

        # Points per second: M points * n_runs over the kernel time in nanoseconds.
        throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread
        print(f'throughput: {throughput}')
        data['throughput'].append(throughput)
        data['tolerance'].append(args['--tol'])
        data['exec'].append(exec_rate)


# Reshape the results to one row per tolerance with (metric, method) columns.
df = pd.DataFrame(data)
pivot_df = df.pivot(index='tolerance', columns='method')
# Only the throughput columns are plotted; drop the 'exec' columns.
pivot_df.drop(('exec', 'GM'), axis=1, inplace=True)
pivot_df.drop(('exec', 'SM'), axis=1, inplace=True)
print(pivot_df)

# Grouped bar chart: one group per tolerance, one bar per method.
pivot_df.plot(kind='bar', figsize=(10, 7))

# Tighten the y-axis around the observed throughput range.
sm_throughput = pivot_df[('throughput', 'SM')]
gm_throughput = pivot_df[('throughput', 'GM')]
min_val = min(sm_throughput.min(), gm_throughput.min())
max_val = max(sm_throughput.max(), gm_throughput.max())
print(min_val, max_val)
plt.ylim(min_val * .90, max_val * 1.1)

plt.xlabel("Tolerance")
plt.ylabel("Points/s")
plt.title('Throughput by Tolerance and Method')
plt.legend(title='Method')
plt.tight_layout()

# Save before showing: with an interactive backend plt.show() blocks and the
# figure is gone once its window is closed, so a later savefig() would write an
# empty figure.
plt.savefig("bench.png")
plt.savefig("bench.svg")
plt.savefig("bench.pdf")
plt.show()
13 changes: 13 additions & 0 deletions perftest/cuda/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Runs ./cuperftest (expected in the current directory) over several grid sizes.
# Every active run uses --prec d, --n_runs 5, --M 2e6, --method 0 and --tol 1e-4;
# only the grid dimensions (--N1/--N2/--N3) change between runs.
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
# Disabled runs (larger grids, --tol 1e-10):
#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10
39 changes: 31 additions & 8 deletions src/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp
${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
set(PRECISION_DEPENDENT_SRC
Expand All @@ -22,13 +17,34 @@ set(CUFINUFFT_INCLUDE_DIRS
)
set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE)

# flush denormals to zero and enable verbose PTXAS output
set(FINUFFT_CUDA_FLAGS
-ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage
-Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr
)

add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC})
target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
# Property values are quoted so each occupies exactly one argument:
# FINUFFT_CUDA_ARCHITECTURES may be a ;-separated list (e.g. "60;70;75") and an
# unquoted expansion would be split, breaking the PROPERTY/VALUE pairing. An
# empty FINUFFT_SHARED_LINKING would likewise drop its argument if unquoted.
set_target_properties(
    cufinufft_common_objects PROPERTIES
    POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}"
    CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
    CUDA_SEPARABLE_COMPILATION ON
)

target_compile_options(cufinufft_common_objects PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>)
target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17)

add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC})
target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
# Property values are quoted so each occupies exactly one argument:
# FINUFFT_CUDA_ARCHITECTURES may be a ;-separated list (e.g. "60;70;75") and an
# unquoted expansion would be split, breaking the PROPERTY/VALUE pairing. An
# empty FINUFFT_SHARED_LINKING would likewise drop its argument if unquoted.
set_target_properties(
    cufinufft_objects PROPERTIES
    POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}"
    CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
    CUDA_SEPARABLE_COMPILATION ON
)
target_compile_features(cufinufft_objects PRIVATE cxx_std_17)
target_compile_options(cufinufft_objects PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>)

if (FINUFFT_SHARED_LINKING)
add_library(cufinufft SHARED
Expand Down Expand Up @@ -56,5 +72,12 @@ else ()
target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static CUDA::nvToolsExt)
endif ()

target_compile_options(cufinufft PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>)
# Collect the public cufinufft headers relative to this project's root, matching
# the PROJECT_SOURCE_DIR usage for the sources above. CMAKE_SOURCE_DIR is the
# top of the whole build tree, so the glob would match nothing when this project
# is added as a subproject of another build.
file(GLOB CUFINUFFT_PUBLIC_HEADERS "${PROJECT_SOURCE_DIR}/include/cufinufft*.h")
set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}")
# Property values are quoted so each occupies exactly one argument:
# FINUFFT_CUDA_ARCHITECTURES may be a ;-separated list (e.g. "60;70;75") and an
# unquoted expansion would be split, breaking the PROPERTY/VALUE pairing. An
# empty FINUFFT_SHARED_LINKING would likewise drop its argument if unquoted.
set_target_properties(
    cufinufft PROPERTIES
    PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}"
    POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}"
    CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
    CUDA_SEPARABLE_COMPILATION ON
)

0 comments on commit 73f937b

Please sign in to comment.