From 0f8aa7eecf4eab21d00455e8f4f69348f1a01bfa Mon Sep 17 00:00:00 2001
From: Kuhu Shukla
Date: Thu, 1 Aug 2024 10:55:31 -0500
Subject: [PATCH] Address review comments

Signed-off-by: Kuhu Shukla
---
Review notes (text between the "---" line and the diff is ignored by git am
and is not part of the commit):

What this patch does
  * Renames the private helper toMB to toMiB and updates its call sites in
    computeRmmPoolSize and in the "Initializing RMM" log line.
  * Builds the shared prefix of the two pool-allocation errors once, as
    errorPhrase, instead of repeating it at both throw sites.

Adjustments made to the added ('+') lines during review
  * errorPhrase: added the missing separators after the gpu.free value and
    the allocFraction value, dropped the unbalanced "(=", and printed the
    reserve through toMiB so every term of
    "(gpu.free - reserve) * allocFraction" carries the same unit.
  * Restored spaces lost at string-concatenation boundaries
    ("gpu.total *" + "minAllocFraction"/"maxAllocFraction",
    "GPU has" + "enough", "pool allocation" + "does", "enough" + "free").
  * Labels printed next to toMiB(...) values now say MiB instead of MB.
  * Only '+' lines were modified, so hunk line counts and the diffstat are
    unchanged. The post-image blob id on the "index" line predates these
    adjustments.
  * Line breaks and indentation were restored from a whitespace-collapsed
    copy of the patch. Context-line indentation is a best-effort
    reconstruction; use "git apply --ignore-whitespace" if it does not match.

Left as-is (candidates for a follow-up)
  * The "reserve >= max" error prints conf.rmmAllocFraction next to
    RMM_ALLOC_MAX_FRACTION (unchanged context line). This looks like it
    should be conf.rmmAllocMaxFraction -- please confirm.
  * toMiB divides the Long by 1024 before the floating-point division, so
    the reported value is truncated to whole KiB.

 .../spark/rapids/GpuDeviceManager.scala       | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
index 595cdcbff46..16f7e94f4c7 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
@@ -216,7 +216,7 @@ object GpuDeviceManager extends Logging {
     }
   }
 
-  private def toMB(x: Long): Double = x / 1024 / 1024.0
+  private def toMiB(x: Long): Double = x / 1024 / 1024.0
 
   private def computeRmmPoolSize(conf: RapidsConf, info: CudaMemInfo): Long = {
     def truncateToAlignment(x: Long): Long = x & ~511L
@@ -238,41 +238,39 @@ object GpuDeviceManager extends Logging {
         }
       var poolAllocation = truncateToAlignment(
         (conf.rmmAllocFraction * (info.free - reserveAmount)).toLong)
+      val errorPhrase = "The pool allocation of " +
+        s"${toMiB(poolAllocation)} MiB (gpu.free: ${toMiB(info.free)} MiB, " +
+        s"${RapidsConf.RMM_ALLOC_FRACTION}: ${conf.rmmAllocFraction}, " +
+        s"${RapidsConf.RMM_ALLOC_RESERVE}: ${toMiB(reserveAmount)} MiB => " +
+        s"(gpu.free - reserve) * allocFraction = ${toMiB(poolAllocation)} MiB) was "
       if (poolAllocation < minAllocation) {
-        throw new IllegalArgumentException(s"The pool allocation of " +
-          s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," +
-          s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," +
-          s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " +
-          s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" +
-          s"was less than allocation of ${toMB(minAllocation)} MiB (gpu.total: " +
-          s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " +
+        throw new IllegalArgumentException(errorPhrase +
+          s"less than allocation of ${toMiB(minAllocation)} MiB (gpu.total: " +
+          s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " +
           s"${conf.rmmAllocMinFraction} => gpu.total *" +
-          s"minAllocFraction = ${toMB(minAllocation)} MiB). Please ensure that the GPU has" +
+          s" minAllocFraction = ${toMiB(minAllocation)} MiB). Please ensure that the GPU has " +
           s"enough free memory, or adjust configuration accordingly.")
       }
       if (maxAllocation < poolAllocation) {
-        throw new IllegalArgumentException(s"The pool allocation of " +
-          s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," +
-          s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," +
-          s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " +
-          s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" +
-          s"was more than allocation of ${toMB(maxAllocation)} MiB (gpu.total: " +
-          s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " +
+        throw new IllegalArgumentException(errorPhrase +
+          s"more than allocation of ${toMiB(maxAllocation)} MiB (gpu.total: " +
+          s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " +
           s"${conf.rmmAllocMaxFraction} => gpu.total *" +
-          s"maxAllocFraction = ${toMB(maxAllocation)} MiB). Please ensure that pool allocation" +
+          s" maxAllocFraction = ${toMiB(maxAllocation)} MiB). Please ensure that pool allocation " +
           s"does not exceed maximum allocation and adjust configuration accordingly.")
       }
       if (reserveAmount >= maxAllocation) {
-        throw new IllegalArgumentException(s"RMM reserve memory (${toMB(reserveAmount)} MB) " +
-          s"larger than maximum pool size (${toMB(maxAllocation)} MB). Check the settings for " +
+        throw new IllegalArgumentException(s"RMM reserve memory (${toMiB(reserveAmount)} MiB) " +
+          s"larger than maximum pool size (${toMiB(maxAllocation)} MiB). Check the settings for " +
           s"${RapidsConf.RMM_ALLOC_MAX_FRACTION} (=${conf.rmmAllocFraction}) and " +
           s"${RapidsConf.RMM_ALLOC_RESERVE} (=$reserveAmount)")
       }
       val adjustedMaxAllocation = truncateToAlignment(maxAllocation - reserveAmount)
       if (poolAllocation > adjustedMaxAllocation) {
-        logWarning(s"RMM pool allocation (${toMB(poolAllocation)} MB) does not leave enough free " +
-          s"memory for reserve memory (${toMB(reserveAmount)} MB), lowering the pool size to " +
-          s"${toMB(adjustedMaxAllocation)} MB to accommodate the requested reserve amount.")
+        logWarning(s"RMM pool allocation (${toMiB(poolAllocation)} MiB) does not leave enough " +
+          s"free memory for reserve memory (${toMiB(reserveAmount)} MiB), lowering the pool " +
+          s"size to ${toMiB(adjustedMaxAllocation)} MiB to " +
+          s"accommodate the requested reserve amount.")
         poolAllocation = adjustedMaxAllocation
       }
 
@@ -356,7 +354,7 @@ object GpuDeviceManager extends Logging {
       deviceId = Some(gpuId)
 
       logInfo(s"Initializing RMM${features.mkString(" ", " ", "")} " +
-          s"pool size = ${toMB(poolAllocation)} MB on gpuId $gpuId")
+          s"pool size = ${toMiB(poolAllocation)} MiB on gpuId $gpuId")
 
       if (Cuda.isPtdsEnabled()) {
         logInfo("Using per-thread default stream")