Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions deploy/sdk/src/dynamo/sdk/cli/allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,7 @@ def get_resource_envs(
f"GPU {stat['index']} ({stat['name']}): "
f"Memory: {format_memory_gb(stat['free_memory'])} free / "
f"{format_memory_gb(stat['total_memory'])} total, "
f"Utilization: {stat['gpu_utilization']}%, "
f"Temperature: {stat['temperature']}°C"
f"Utilization: {stat['gpu_utilization']}% "
)
except Exception as e:
logger.debug(f"Failed to get GPU stats: {e}")
Expand Down
2 changes: 1 addition & 1 deletion deploy/sdk/src/dynamo/sdk/cli/circus.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
if port_env:
return int(port_env)
else:
with reserve_free_port() as port:
with reserve_free_port() as port: # type: ignore
return port


Expand Down
72 changes: 35 additions & 37 deletions deploy/sdk/src/dynamo/sdk/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
import logging
import os
import pathlib
import random
import socket
from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union
from typing import Any, DefaultDict, Dict, Iterator, Protocol, TextIO, Union

import typer
import yaml
Expand Down Expand Up @@ -59,47 +58,46 @@ def dynamo_address(self) -> tuple[str, str]:
...


class PortReserver:
def __init__(self, host: str = "localhost"):
self.host = host
self.socket: socket.socket | None = None
self.port: int | None = None

def __enter__(self) -> int:
try:
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.bind((self.host, 0))
_, self.port = self.socket.getsockname()
return self.port
except socket.error as e:
self.close_socket()
logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
raise

def __exit__(self, exc_type, exc_val, exc_tb):
self.close_socket()

def close_socket(self):
try:
if self.socket:
self.socket.close()
except socket.error as e:
logger.warning(f"Error while closing socket: {str(e)}")
# Don't re-raise the exception as this is cleanup code
return True


@contextlib.contextmanager
def reserve_free_port(
host: str = "localhost",
port: int | None = None,
prefix: Optional[str] = None,
max_retry: int = 50,
enable_so_reuseport: bool = False,
) -> Iterator[int]:
"""
detect free port and reserve until exit the context
Detect free port and reserve until exit the context.
Returns a context manager that yields the reserved port.
"""
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
if enable_so_reuseport:
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
raise RuntimeError("Failed to set SO_REUSEPORT.") from None

if prefix is not None:
prefix_num = int(prefix) * 10 ** (5 - len(prefix))
suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
for _ in range(max_retry):
suffix = random.randint(0, suffix_range)
port = int(f"{prefix_num + suffix}")
try:
sock.bind((host, port))
break
except OSError:
continue
else:
raise RuntimeError(
f"Cannot find free port with prefix {prefix} after {max_retry} retries."
) from None
else:
if port:
sock.bind((host, port))
else:
sock.bind((host, 0))
try:
yield sock.getsockname()[1]
finally:
sock.close()
with PortReserver(host) as port:
yield port


def save_dynamo_state(
Expand Down
111 changes: 1 addition & 110 deletions deploy/sdk/src/dynamo/sdk/lib/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def __init__(self, index: int, total_memory: int, name: str, uuid: str):
self.name = name
self.uuid = uuid
self.available = True # Can be set to False if GPU is reserved/in use
self.temperature = 0 # in Celsius
self.utilization = 0 # in percent (0-100)
self.processes: list[GPUProcess] = []

Expand Down Expand Up @@ -142,14 +141,6 @@ def _discover_gpus(self):
index=i, total_memory=memory_info.total, name=name, uuid=uuid
)

# Get additional GPU information if available
try:
gpu_info.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
logger.debug(f"Could not get temperature for GPU {i}")

try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
gpu_info.utilization = utilization.gpu
Expand All @@ -173,7 +164,7 @@ def _discover_gpus(self):
logger.warning(f"Error discovering GPUs: {e}")

def update_gpu_stats(self):
"""Update GPU statistics (utilization, memory, temperature, etc.)."""
"""Update GPU statistics (utilization, memory etc.)."""
if not self._initialized:
return

Expand All @@ -185,14 +176,6 @@ def update_gpu_stats(self):
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu.total_memory = memory_info.total

# Update temperature
try:
gpu.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
pass

# Update utilization
try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
Expand Down Expand Up @@ -242,97 +225,6 @@ def get_gpu_memory(self, index: int) -> tuple[int, int]:
logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
return (0, 0)

def get_gpu_utilization(self, index: int) -> int:
"""
Return GPU utilization percentage for a specific GPU.

Args:
index: GPU index

Returns:
GPU utilization percentage (0-100)
"""
if not self._initialized or index >= len(self.gpus):
return 0

try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
return utilization.gpu # Returns GPU utilization percentage (0-100)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
return 0

def get_gpu_temperature(self, index: int) -> int:
"""
Return GPU temperature for a specific GPU.

Args:
index: GPU index

Returns:
GPU temperature in Celsius
"""
if not self._initialized or index >= len(self.gpus):
return 0

try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
return 0

def get_gpu_processes(self, index: int) -> list[GPUProcess]:
"""
Return processes running on a specific GPU.

Args:
index: GPU index

Returns:
List of processes running on the GPU
"""
if not self._initialized or index >= len(self.gpus):
return []

try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
return [
GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
]
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
return []

def get_best_gpu_for_memory(self, required_memory: int) -> int:
"""
Return the index of the GPU with the most available memory that meets the requirement.

Args:
required_memory: Required memory in bytes

Returns:
GPU index, or -1 if no suitable GPU was found
"""
if not self._initialized:
return -1

best_gpu = -1
max_free = 0

for gpu in self.gpus:
if not gpu.available:
continue

_, free = self.get_gpu_memory(gpu.index)
if free > required_memory and free > max_free:
max_free = free
best_gpu = gpu.index

return best_gpu

def reset_allocations(self):
"""Reset all GPU allocations."""
self._gpu_fractions = []
Expand Down Expand Up @@ -365,7 +257,6 @@ def get_gpu_stats(self) -> list[dict[str, t.Any]]:
if total_memory > 0
else 0,
"gpu_utilization": gpu.utilization,
"temperature": gpu.temperature,
"process_count": len(gpu.processes),
"processes": [
{
Expand Down
Loading