#############################################################################################################
# HPL-GPU 2.0 Options File for generic configuration
#
# This file lists all mandatory options at the top, and optional tuning options in the end.
#############################################################################################################
#This option defines whether HPL-GPU is compiled with MPI for multi-node support
HPL_CONFIG_MPI = 1
#This option defines the verbosity level of HPL-GPU. The setting ranges from 0 (no output) to 4 (very verbose); the default is 3. For many-node runs the suggested setting is 1.
HPL_CONFIG_VERBOSE = 3
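#For many-node runs, the suggested setting would thus be:
#HPL_CONFIG_VERBOSE = 1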
#Select the backend for CALDGEMM; the default is opencl.
HPL_DEFS += -DHPL_CALDGEMM_BACKEND=opencl
#In case the selected backend is OpenCL, this setting chooses the third-party library file that provides the OpenCL kernels.
HPL_CALBEDEFS += ((caldgemm_opencl::caldgemm_config_backend_opencl*)cal_info.config_backend)->kernelLib=\"amddgemm.so\";
#This sets the GPU ratio, i.e. the fraction of the workload performed by the GPU. The following options exist: 1.0: all work is performed by the GPU; 0.0 <= GPURatio < 1.0: fixed GPU
#ratio; -1.0: fully automatic GPU ratio calculation; -1.0 < GPURatio < 0.0: the absolute value defines the initial GPU ratio, which is then adapted dynamically during runtime. This
#means e.g. GPURatio = -0.9 will start with a ratio of 0.9 and then adapt it dynamically. The suggested settings are: 1.0 for GPU-only and for efficiency-optimized cases; in any other
#case a negative initial setting is suggested. Please see also the advanced GPURatio options below.
HPL_CALDEFS += cal_info.GPURatio = 1.0;
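#For the dynamic variant described above, one would instead set a negative initial ratio, e.g. starting at 0.9 and adapting from there:
#HPL_CALDEFS += cal_info.GPURatio = -0.9;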
#This defines the CPU core used for the management thread and for the MPI communication. If unset, this is autodetected.
HPL_CALDEFS += cal_info.PinMainThread = 0;
HPL_DEFS += -DHPL_MPI_AFFINITY="{1}"
#Local matrix size below which alternate lookahead is activated. Usually you want to leave this infinitely high. For optimized efficiency, always activate it (i.e. keep the value very high). For optimized performance, you can lower this setting in some cases; the optimal value must be tuned manually.
HPL_CALDEFS += cal_info.AlternateLookahead = 10000000;
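#For performance tuning, a lower cutoff can be tried instead; the value below is purely a hypothetical starting point for manual tuning:
#HPL_CALDEFS += cal_info.AlternateLookahead = 40000;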
#############################################################################################################
#All the following are optional tuning options
#############################################################################################################
#HPL-GPU can pass the command line parameters of dgemm_bench on to CALDGEMM. These parameters are evaluated last, overwriting all other settings; in this example they restrict CALDGEMM to 2 GPUs.
#HPL_DEFS += -DHPL_CALDGEMM_PARAM="-Y 2"
#You can reorder the GPU device numbering. In general, it is good to interleave NUMA nodes, i.e. if you have 2 NUMA nodes and 4 GPUs, with GPUs 0 and 1 on node 0 and GPUs 2 and 3 on node 1, the
#setting below is suggested. Keep in mind that the altered numbering affects other settings relative to GPU numbering, e.g. GPU_ALLOC_MAPPING.
#HPL_DEFS += -DHPL_GPU_DEVICE_IDS="{0,2,1,3}"
#Define the CPU cores used to allocate GPU-related memory. You should choose cores on the correct NUMA nodes. For the 4-GPU example above and two 10-core CPUs, the following would be good:
#HPL_DEFS += -DHPL_GPU_ALLOC_MAPPING="{0,10,0,10}"
#Exclude some CPU cores from CALDGEMM / HPL-GPU usage
#HPL_DEFS += -DHPL_GPU_EXCLUDE_CORES="{8,9,18,19}"
#These options can define cutoff points (remaining local matrix size) below which the Lookahead and Lookahead 2 features are turned off. (Needed on older AMD processors for better performance).
#HPL_DEFS += -DHPL_DISABLE_LOOKAHEAD=6000 -DHPL_LOOKAHEAD2_TURNOFF=4000
#This setting links HPL to libcpufreq, allowing CPU frequencies to be changed at runtime. This can be used to obtain better efficiency.
#HPL_DEFS += -DHPL_CPUFREQ
#These settings can alter certain parameters at runtime. The first setting applies inside HPL, the second to CALDGEMM parameters. The example alters the CPU frequency over time and disables some GPUs towards the end of the run for better efficiency.
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE="if (j == startrow) setcpufreq(3000000, 3000000);"
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE_CALDGEMM="cal_dgemm->SetNumberDevices(global_m_remain < 25000 ? 1 : (global_m_remain < 35000 ? 2 : (global_m_remain < 50000 ? 3 : 4)));if (curcpufreq >= 2200000 && global_m_remain > 100000) setcpufreq(2200000, 1200000); else if (global_m_remain < 50000 && global_m_remain >= 50000 - K) setcpufreq(2700000, 1200000); else if (global_m_remain < 70000 && global_m_remain >= 70000 - K) setcpufreq(2400000, 2400000); else if (global_m_remain < 85000 && global_m_remain >= 85000 - K) setcpufreq(2200000, 2200000); else if (global_m_remain < 95000 && global_m_remain >= 95000 - K) setcpufreq(2000000, 2000000);"
#As above, but this time we modify some factorization parameters over time. (Relevant for older AMD processors).
#HPL_DEFS += -DHPL_CUSTOM_PARAMETER_CHANGE="ALGO->nbmin = j > 45000 ? 64 : 32;"
#Tool to find the duration of the core phase of HPL, needed to measure power consumption and power efficiency.
#HPL_DEFS += -DHPL_DURATION_FIND_HELPER
#In multi-node runs, the factorization causes significant CPU load on some, but not all, nodes. CALDGEMM tries to take this into account in its automatic GPU ratio calculation, but sometimes this fails.
#In this case, the following setting can define a minimum GPU ratio for iterations in which the node performs the factorization.
#See also GPURatioMax, GPURatioMarginTime, GPURatioMarginTimeDuringFact, GPURatioLookaheadSizeMod, GPURatioPenalties, GPURatioPenaltyFactor. If you do not want them to interfere with the ratio calculation, set them all to 0!
#HPL_CALDEFS += cal_info.GPURatioDuringFact = 0.9;
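#A sketch of disabling them all, assuming each is a plain cal_info member that can be set to 0 in the same way as GPURatioDuringFact:
#HPL_CALDEFS += cal_info.GPURatioMax = 0;cal_info.GPURatioMarginTime = 0;cal_info.GPURatioMarginTimeDuringFact = 0;cal_info.GPURatioLookaheadSizeMod = 0;cal_info.GPURatioPenalties = 0;cal_info.GPURatioPenaltyFactor = 0;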
#Offload some steps of the factorization to the GPU. Offloading is active as soon as the local matrix size is lower than the defined parameter.
#For best efficiency, you want this always enabled, i.e. set the parameter very high. For best performance, the parameter must be tuned manually.
#HPL_DEFS += -DHPL_CALDGEMM_ASYNC_FACT_DGEMM=2000000 -DHPL_CALDGEMM_ASYNC_FACT_FIRST
#HPL_CALDEFS += cal_info.AsyncSideQueue = true;
#Same as above, but for the large Update-DTRSM. The optimal values for performance and efficiency are usually identical. The value must be tuned manually.
#HPL_DEFS += -DHPL_CALDGEMM_ASYNC_DTRSM=70000
#HPL_CALDEFS += cal_info.AsyncDTRSM = true;
#Preallocate some data to avoid unnecessary dynamic mallocs, and define the number of BBuffers to save memory. PreallocData should be larger than local matrix size / Nb; max_bbuffers should be larger than local matrix size / (number of GPUs * Nb).
#HPL_CALDEFS += cal_info.PreallocData = 60;cal_info.max_bbuffers = 17;
#Minimize the CPU workload as soon as the local matrix size is below this threshold. Only applicable if GPURatio < 1.0. This can be used to help the GPU ratio estimation, and it forces GPURatio to 1.0 for small matrices.
#HPL_CALDEFS += cal_info.MinimizeCPUPart = 2000000;
#Temperature threshold at which the run is aborted. Requires CALDGEMM to be compiled with ADL support.
#HPL_DEFS += -DHPL_GPU_TEMPERATURE_THRESHOLD=92.