updated build system

DiamonDinoia · Jul 25, 2024 · 73f937b · 73f937b
1 parent d0ce11e
commit 73f937b
Show file tree

Hide file tree

Showing 5 changed files with 244 additions and 16 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.23)
 
 project(FINUFFT VERSION 2.2.0 LANGUAGES C CXX)
 
@@ -46,7 +46,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS AND NOT DEFINED FINUFFT_ARC
 endif ()
 set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
 set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")
-
+set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)")
 # All options go here
 # sphinx tag (don't remove): @cmake_opts_start
 option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF)
@@ -271,25 +271,26 @@ if (FINUFFT_USE_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)
-    if (BUILD_TESTING AND FINUFFT_BUILD_TESTS)
+    if (FINUFFT_BUILD_TESTS)
         add_subdirectory(perftest/cuda)
+        add_subdirectory(test/cuda)
     endif ()
 
     list(APPEND INSTALL_TARGETS cufinufft)
 endif ()
 
 # Add tests defined in their own directory
-if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU)
+if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS)
     add_subdirectory(test)
     add_subdirectory(perftest)
 endif ()
 
-if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA)
-    add_subdirectory(test/cuda)
+if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU)
+    add_subdirectory(examples)
 endif ()
 
-if (FINUFFT_BUILD_EXAMPLES)
-    add_subdirectory(examples)
+if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CUDA) # same option that gates src/cuda above
+    add_subdirectory(examples/cuda)
 endif ()
 
 if (FINUFFT_BUILD_FORTRAN)

diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
@@ -1,3 +1,9 @@
 add_executable(cuperftest cuperftest.cu)
 target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 target_link_libraries(cuperftest PUBLIC cufinufft)
+set_target_properties(cuperftest PROPERTIES
+        LINKER_LANGUAGE CUDA
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" # quoted: the value may be a ;-list such as 60;70;75
+)
+
+#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py
@@ -0,0 +1,185 @@
+import matplotlib.pyplot as plt
+import os
+import subprocess
+import pandas as pd
+import numpy as np
+import io
+cwd = os.getcwd()
+
+
+# function that runs a command line command and returns the output
+# it also takes a list of arguments to pass to the command
+def run_command(command, args):
+    # convert command and args to a string
+    try:
+        cmd = [command] + args
+        print("Running command:", ' '.join(cmd))
+        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        return result.stdout, result.stderr
+    except subprocess.CalledProcessError as e:
+        print('stdout output:\n', e.stdout)
+        print('stderr output:\n', e.stderr)
+        print("Error executing command:", e)
+
+
+# function that builds a string from a dictionary of arguments
+
+def build_args(args):
+    args_list = []
+    for key, value in args.items():
+        args_list.append(key)
+        args_list.append(value)
+    return args_list
+
+
+# function
+
+# example command to run:
+# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
+# example arguments
+args = {"--prec": "f",
+        "--n_runs": "5",
+        "--method": "0",
+        "--sort": "1",
+        "--N1": "16777216",
+        # "--N1": "256",
+        # "--N2": "256",
+        # "--N3": "256",
+        "--kerevalmethod": "1",
+        "--M": "1E8",
+        "--tol": "1E-6"}
+# results collected by the benchmark loop below: one entry per (tolerance, method) run
+data = {
+    'method': [],
+    'throughput': [],
+    'tolerance': [],
+    # 'setpts': [],
+    'exec': [],
+}
+warmup = {"--prec": "f",
+        "--n_runs": "1",
+        "--method": "0",
+        "--N1": "256",
+        "--N2": "256",
+        # "--N3": "256",
+        "--M": "256",
+        "--tol": "1E-1"}
+cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup)
+print("Warmup")
+stdout, stderr = run_command("nsys", cmd)
+print("Benchmarking")
+if stderr != '':
+    print(stderr)
+    exit(0)
+max_range = 16 if args["--prec"] == "d" else 7
+
+for i in range(1, max_range):
+    args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i)
+    print("Running with tol = 1E-" + str(i))
+    for method in ['2', '1']:
+        args["--method"] = method
+        if method == '0':
+            data['method'].append('auto')
+        elif method == '1':
+            data['method'].append('GM')
+        elif method == '2':
+            data['method'].append('SM')
+        elif method == '4':
+            data['method'].append('BLOCK')
+        print("Method " + data['method'][-1])
+        cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args)
+        stdout, stderr = run_command("nsys", cmd)
+        if stderr != '':
+            print(stderr)
+            exit(0)
+        # skip all lines starting with # in stdout
+        conf = [x for x in stdout.splitlines() if x.startswith("#")]
+        print('\n'.join(conf))
+        stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7]
+        if stdout[0].startswith("bin"):
+            print(stdout[0])
+            stdout = stdout[1:]
+
+        stdout = '\n'.join(stdout)
+        # convert stdout to a dataframe from csv string
+        dt = pd.read_csv(io.StringIO(stdout), sep=',')
+        setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value
+        exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value
+        print(f'setpts pts/s: {setpts}')
+        print(f'exec pts/s: {exec}')
+        cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep",
+               "--format=csv", "--output", "cuperftest"]
+        stdout, _ = run_command("nsys", cmd)
+        # remove format from cmd
+        cmd = cmd[:-3]
+        # print(run_command("nsys", cmd))
+        # print(csv)
+        dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv")
+        # print(dt)
+        # sum the "Total Time" column of the ones that contain "fft" in name
+        # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")])
+        total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum()
+        print(f'total_fft: {total_fft}')
+        # drop all the rows with spread not in "Name"
+        dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")]
+        # print(dt)
+        # exit(0)
+        # sort dt by column "Time (%)"
+        total_spread = dt['Duration (ns)'].sum() - total_fft
+        print(f'total_spread: {total_spread}')
+        if total_fft > total_spread:
+            print("Warning: total_fft > total_spread")
+            # exit(0)
+        # pt/s
+        throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread
+        print(f'throughput: {throughput}')
+        data['throughput'].append(throughput)
+        data['tolerance'].append(args['--tol'])
+        # data['setpts'].append(setpts)
+        data['exec'].append(exec)
+
+
+df = pd.DataFrame(data)
+# Pivot the DataFrame
+pivot_df = df.pivot(index='tolerance', columns='method')
+# print(pivot_df)
+# scale the throughput SM by GM
+# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
+# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM']
+# scale setpts SM by GM
+# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
+# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM']
+# remove the GM column
+# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
+pivot_df.drop(('exec', 'GM'), axis=1, inplace=True)
+pivot_df.drop(('exec', 'SM'), axis=1, inplace=True)
+print(pivot_df)
+# Plot
+pivot_df.plot(kind='bar', figsize=(10, 7))
+# Find the minimum throughput value
+min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
+max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
+print(min_val, max_val)
+plt.ylim(min_val * .90, max_val * 1.1)
+# plt.ylim(.8, 1.2)
+
+# Calculate the smallest power of 10
+# min_pow_10 = 10 ** np.floor(np.log10(min_throughput))
+
+# Adjust the plot's y-axis limits
+# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009)  # Adding 10% for upper margin
+
+# plot an horizontal line at 1 with label "GM"
+# plt.axhline(y=1, color='k', linestyle='--', label='GM')
+plt.xlabel('Tolerance')
+plt.ylabel('Throughput')
+plt.title('Throughput by Tolerance and Method')
+plt.legend(title='Method')
+plt.tight_layout()
+plt.show()
+plt.xlabel("Tolerance")
+plt.ylabel("Points/s")
+plt.savefig("bench.png")
+plt.savefig("bench.svg")
+plt.savefig("bench.pdf")
+plt.show()
diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh
@@ -0,0 +1,13 @@
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
+#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
@@ -1,8 +1,3 @@
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
 set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp
         ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
 set(PRECISION_DEPENDENT_SRC
@@ -22,13 +17,34 @@ set(CUFINUFFT_INCLUDE_DIRS
 )
 set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE)
 
+# CUDA compile flags: flush denormals to zero, allow fused multiply-add, verbose PTXAS/resource-usage output, line info
+set(FINUFFT_CUDA_FLAGS
+        -ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage
+        -Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr
+)
+
 add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC})
 target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(
+        cufinufft_common_objects PROPERTIES
+        POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}" # quoted: stays one argument even if empty
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" # quoted: may be a ;-list such as 60;70;75
+        CUDA_SEPARABLE_COMPILATION ON
+)
+# The flag list is quoted so it stays inside the generator expression as one argument.
+target_compile_options(cufinufft_common_objects PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>")
+target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17)
 
 add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC})
 target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_target_properties(
+        cufinufft_objects PROPERTIES
+        POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}" # quoted: stays one argument even if empty
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" # quoted: may be a ;-list such as 60;70;75
+        CUDA_SEPARABLE_COMPILATION ON
+)
+target_compile_features(cufinufft_objects PRIVATE cxx_std_17)
+target_compile_options(cufinufft_objects PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>")
 
 if (FINUFFT_SHARED_LINKING)
     add_library(cufinufft SHARED
@@ -56,5 +72,12 @@ else ()
     target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static CUDA::nvToolsExt)
 endif ()
 
+target_compile_options(cufinufft PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>")
 file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h")
-set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}")
+set_target_properties(
+        cufinufft PROPERTIES
+        PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}"
+        POSITION_INDEPENDENT_CODE "${FINUFFT_SHARED_LINKING}" # quoted: stays one argument even if empty
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" # quoted: may be a ;-list such as 60;70;75
+        CUDA_SEPARABLE_COMPILATION ON
+)