Make.Generic.Options
#############################################################################################################
# HPL-GPU 2.0 Options File for generic configuration
#
# This file lists all mandatory options at the top, followed by optional tuning options at the end.
# Some options can be configured via the traditional -DHPL... interface as well as via the new HPL_PARAMDEFS
# interface. In such cases, both variants are provided for reference, with the new interface commented out
# and marked "#NEW#".
#############################################################################################################
#Any custom options here
#HPL_DEFS +=
#This option defines whether HPL-GPU is compiled with MPI for multi-node support
HPL_CONFIG_MPI = 0
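#For example, a multi-node build would enable MPI here (a sketch; assumes an MPI toolchain is available):
#HPL_CONFIG_MPI = 1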
#This option defines the verbosity level of HPL-GPU. The setting ranges from 0 (no output) to 4 (very verbose); the default is 3. For many-node runs, the suggested setting is 1.
HPL_CONFIG_VERBOSE = 3
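#For example, a quieter setting for many-node runs, per the note above:
#HPL_CONFIG_VERBOSE = 1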
#Add git commit hashes to binary
HPL_GIT_STATUS = 1
#Select the backend for CALDGEMM; the default is opencl (possible options: cal, cuda, opencl, cpu)
HPL_CALDGEMM_BACKEND = opencl
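#For example, selecting the CUDA backend for NVIDIA GPUs might look like this (a sketch; the right choice depends on your hardware):
#HPL_CALDGEMM_BACKEND = cuda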
#############################################################################################################
#All the following are optional tuning options
#############################################################################################################
#Use link-time optimization and aggressive compiler optimization flags
HPL_USE_LTO = 1
#HPL_AGGRESSIVE_OPTIMIZATION = 1
#Use the AVX LASWP implementation
HPL_DEFS += -DHPL_LASWP_AVX
#This setting links HPL against libcpufreq, allowing it to change CPU frequencies at runtime (the setcpufreq() calls in the examples below presumably rely on this). This can be used to obtain better efficiency.
#HPL_DEFS += -DHPL_CPUPOWER
#These settings can alter certain parameters at runtime. The first setting applies inside HPL, the second to CALDGEMM parameters. The example alters the CPU frequency over time and disables some GPUs towards the end of the run for better efficiency.
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE="if (j == startrow) setcpufreq(3000000, 3000000);"
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE_CALDGEMM="cal_dgemm->SetNumberDevices(factorize_first_iteration || global_m_remain > 50000 ? 4 : (global_m_remain > 35000 ? 3 : (global_m_remain > 25000 ? 2 : 1)));if (curcpufreq >= 2200000 && global_m_remain > 100000) setcpufreq(2200000, 1200000); else if (global_m_remain < 50000 && global_m_remain >= 50000 - K) setcpufreq(2700000, 1200000); else if (global_m_remain < 70000 && global_m_remain >= 70000 - K) setcpufreq(2400000, 2400000); else if (global_m_remain < 85000 && global_m_remain >= 85000 - K) setcpufreq(2200000, 2200000); else if (global_m_remain < 95000 && global_m_remain >= 95000 - K) setcpufreq(2000000, 2000000);"
#This second example adjusts the AsyncDGEMMThreshold based on the remaining matrix size, and adapts the pipelined mid marker (-Aq) dynamically
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE_CALDGEMM="if (factorize_first_iteration) {cal_info.AsyncDGEMMThreshold = 256;} else if (global_m_remain < 125000) {cal_info.AsyncDGEMMThreshold = 480;} else if (global_m_remain < 155000) {cal_info.AsyncDGEMMThreshold = 1920;} else {cal_info.AsyncDGEMMThreshold = 3840;} if (factorize_first_iteration || global_m_remain > 50000) cal_info.PipelinedMidMarker = 25000; else cal_info.PipelinedMidMarker = global_m_remain / 2 + 1000;"
#As above, but this time we modify some factorization parameters over time. (Relevant for older AMD processors)
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE="ALGO->nbmin = j > 45000 ? 64 : 32;"
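#A minimal sketch of the hook mechanism for reference: the macro body is C++ code injected into HPL's
#iteration loop, so parameters can be changed conditionally on the current row j (the values here are hypothetical):
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE="if (j == startrow) ALGO->nbmin = 64;"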
#Temperature threshold (degrees Celsius) at which the run is aborted. Requires CALDGEMM to be compiled with ADL support.
#HPL_DEFS += -DHPL_GPU_TEMPERATURE_THRESHOLD=92.