ai-dynamo · biswapanda · Jun 17, 2025 · Jun 16, 2025 · Jun 16, 2025 · Jun 17, 2025
@@ -233,8 +233,7 @@ def get_resource_envs(
                                 f"GPU {stat['index']} ({stat['name']}): "
                                 f"Memory: {format_memory_gb(stat['free_memory'])} free / "
                                 f"{format_memory_gb(stat['total_memory'])} total, "
-                                f"Utilization: {stat['gpu_utilization']}%, "
-                                f"Temperature: {stat['temperature']}°C"
+                                f"Utilization: {stat['gpu_utilization']}% "
                             )
                     except Exception as e:
                         logger.debug(f"Failed to get GPU stats: {e}")

@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
     if port_env:
         return int(port_env)
     else:
-        with reserve_free_port() as port:
+        with reserve_free_port() as port:  # type: ignore
             return port
 
 

@@ -23,9 +23,8 @@
 import logging
 import os
 import pathlib
-import random
 import socket
-from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union
+from typing import Any, DefaultDict, Dict, Iterator, Protocol, TextIO, Union
 
 import typer
 import yaml
@@ -59,47 +58,46 @@ def dynamo_address(self) -> tuple[str, str]:
         ...
 
 
+class PortReserver:
+    def __init__(self, host: str = "localhost"):
+        self.host = host
+        self.socket: socket.socket | None = None
+        self.port: int | None = None
+
+    def __enter__(self) -> int:
+        try:
+            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            self.socket.bind((self.host, 0))
+            _, self.port = self.socket.getsockname()
+            return self.port
+        except socket.error as e:
+            self.close_socket()
+            logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
+            raise
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close_socket()
+
+    def close_socket(self):
+        try:
+            if self.socket:
+                self.socket.close()
+        except socket.error as e:
+            logger.warning(f"Error while closing socket: {str(e)}")
+            # Don't re-raise the exception as this is cleanup code
+            return True
+
+
 @contextlib.contextmanager
 def reserve_free_port(
     host: str = "localhost",
-    port: int | None = None,
-    prefix: Optional[str] = None,
-    max_retry: int = 50,
-    enable_so_reuseport: bool = False,
 ) -> Iterator[int]:
     """
-    detect free port and reserve until exit the context
+    Detect free port and reserve until exit the context.
+    Returns a context manager that yields the reserved port.
     """
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    if enable_so_reuseport:
-        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
-        if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
-            raise RuntimeError("Failed to set SO_REUSEPORT.") from None
-
-    if prefix is not None:
-        prefix_num = int(prefix) * 10 ** (5 - len(prefix))
-        suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
-        for _ in range(max_retry):
-            suffix = random.randint(0, suffix_range)
-            port = int(f"{prefix_num + suffix}")
-            try:
-                sock.bind((host, port))
-                break
-            except OSError:
-                continue
-        else:
-            raise RuntimeError(
-                f"Cannot find free port with prefix {prefix} after {max_retry} retries."
-            ) from None
-    else:
-        if port:
-            sock.bind((host, port))
-        else:
-            sock.bind((host, 0))
-    try:
-        yield sock.getsockname()[1]
-    finally:
-        sock.close()
+    with PortReserver(host) as port:
+        yield port
 
 
 def save_dynamo_state(

@@ -67,7 +67,6 @@ def __init__(self, index: int, total_memory: int, name: str, uuid: str):
         self.name = name
         self.uuid = uuid
         self.available = True  # Can be set to False if GPU is reserved/in use
-        self.temperature = 0  # in Celsius
         self.utilization = 0  # in percent (0-100)
         self.processes: list[GPUProcess] = []
 
@@ -142,14 +141,6 @@ def _discover_gpus(self):
                     index=i, total_memory=memory_info.total, name=name, uuid=uuid
                 )
 
-                # Get additional GPU information if available
-                try:
-                    gpu_info.temperature = pynvml.nvmlDeviceGetTemperature(
-                        handle, pynvml.NVML_TEMPERATURE_GPU
-                    )
-                except pynvml.NVMLError:
-                    logger.debug(f"Could not get temperature for GPU {i}")
-
                 try:
                     utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                     gpu_info.utilization = utilization.gpu
@@ -173,7 +164,7 @@ def _discover_gpus(self):
             logger.warning(f"Error discovering GPUs: {e}")
 
     def update_gpu_stats(self):
-        """Update GPU statistics (utilization, memory, temperature, etc.)."""
+        """Update GPU statistics (utilization, memory etc.)."""
         if not self._initialized:
             return
 
@@ -185,14 +176,6 @@ def update_gpu_stats(self):
                 memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                 gpu.total_memory = memory_info.total
 
-                # Update temperature
-                try:
-                    gpu.temperature = pynvml.nvmlDeviceGetTemperature(
-                        handle, pynvml.NVML_TEMPERATURE_GPU
-                    )
-                except pynvml.NVMLError:
-                    pass
-
                 # Update utilization
                 try:
                     utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
@@ -242,97 +225,6 @@ def get_gpu_memory(self, index: int) -> tuple[int, int]:
             logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
             return (0, 0)
 
-    def get_gpu_utilization(self, index: int) -> int:
-        """
-        Return GPU utilization percentage for a specific GPU.
-
-        Args:
-            index: GPU index
-
-        Returns:
-            GPU utilization percentage (0-100)
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return 0
-
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-            return utilization.gpu  # Returns GPU utilization percentage (0-100)
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
-            return 0
-
-    def get_gpu_temperature(self, index: int) -> int:
-        """
-        Return GPU temperature for a specific GPU.
-
-        Args:
-            index: GPU index
-
-        Returns:
-            GPU temperature in Celsius
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return 0
-
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
-            return 0
-
-    def get_gpu_processes(self, index: int) -> list[GPUProcess]:
-        """
-        Return processes running on a specific GPU.
-
-        Args:
-            index: GPU index
-
-        Returns:
-            List of processes running on the GPU
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return []
-
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
-            return [
-                GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
-            ]
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
-            return []
-
-    def get_best_gpu_for_memory(self, required_memory: int) -> int:
-        """
-        Return the index of the GPU with the most available memory that meets the requirement.
-
-        Args:
-            required_memory: Required memory in bytes
-
-        Returns:
-            GPU index, or -1 if no suitable GPU was found
-        """
-        if not self._initialized:
-            return -1
-
-        best_gpu = -1
-        max_free = 0
-
-        for gpu in self.gpus:
-            if not gpu.available:
-                continue
-
-            _, free = self.get_gpu_memory(gpu.index)
-            if free > required_memory and free > max_free:
-                max_free = free
-                best_gpu = gpu.index
-
-        return best_gpu
-
     def reset_allocations(self):
         """Reset all GPU allocations."""
         self._gpu_fractions = []
@@ -365,7 +257,6 @@ def get_gpu_stats(self) -> list[dict[str, t.Any]]:
                     if total_memory > 0
                     else 0,
                     "gpu_utilization": gpu.utilization,
-                    "temperature": gpu.temperature,
                     "process_count": len(gpu.processes),
                     "processes": [
                         {