Skip to content

Commit

Permalink
Merge branch 'main' into feature/workflows_profiler
Browse files Browse the repository at this point in the history
  • Loading branch information
PawelPeczek-Roboflow committed Oct 4, 2024
2 parents 480bc62 + 74b0eb4 commit 0139543
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 2 deletions.
15 changes: 14 additions & 1 deletion inference/core/interfaces/http/http_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from fastapi.staticfiles import StaticFiles
from fastapi_cprofile.profiler import CProfileMiddleware
from prometheus_fastapi_instrumentator import Instrumentator
from prometheus_fastapi_instrumentator import metrics as prom_metrics
from starlette.convertors import StringConvertor, register_url_convertor
from starlette.middleware.base import BaseHTTPMiddleware

Expand Down Expand Up @@ -187,6 +188,10 @@
MessageToBigError,
)
from inference.core.managers.base import ModelManager
from inference.core.managers.metrics import (
prom_cpu_utilization_total,
prom_gpu_utilization_total,
)
from inference.core.roboflow_api import (
get_roboflow_dataset_type,
get_roboflow_workspace,
Expand Down Expand Up @@ -504,7 +509,15 @@ def __init__(
)

if ENABLE_PROMETHEUS:
Instrumentator().expose(app, endpoint="/metrics")
instrumentator = Instrumentator().instrument(app)
instrumentator.add(prom_cpu_utilization_total())
instrumentator.add(prom_gpu_utilization_total())
instrumentator.add(
prom_metrics.latency(
buckets=(1,),
)
)
instrumentator.expose(app, endpoint="/metrics")

if METLO_KEY:
app.add_middleware(
Expand Down
55 changes: 55 additions & 0 deletions inference/core/managers/metrics.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
import os
import platform
import re
import socket
import time
import uuid
from typing import Callable

import GPUtil
from prometheus_client import REGISTRY, Gauge
from prometheus_fastapi_instrumentator.metrics import Info

from inference.core.cache import cache
from inference.core.logger import logger
from inference.core.version import __version__

previous_cpu_total = None
previous_time = None
NUM_CPU_CORES = os.cpu_count()


def get_model_metrics(
inference_server_id: str, model_id: str, min: float = -1, max: float = float("inf")
Expand Down Expand Up @@ -99,3 +109,48 @@ def get_inference_results_for_model(
inference_results.append({"request_time": score, "inference": result})

return inference_results


def prom_cpu_utilization_total() -> Callable[[Info], None]:
    """Build a Prometheus instrumentation hook reporting process CPU utilization.

    Returns a callable suitable for ``Instrumentator.add`` that, each time it
    runs, derives an approximate CPU utilization percentage from the delta of
    the ``process_cpu_seconds_total`` counter between invocations and publishes
    it on the ``process_cpu_utilization_total`` gauge.
    """
    cpu_utilization_gauge = Gauge(
        "process_cpu_utilization_total", "Total CPU utilization"
    )

    def instrumentation(info: Info) -> None:
        # State lives in module-level globals so the rate computation
        # survives across invocations of this hook.
        global previous_cpu_total, previous_time
        cpu_metric = REGISTRY.get_sample_value("process_cpu_seconds_total")
        if cpu_metric is None:
            # Counter not exposed on this platform - nothing to report yet.
            return
        current_time = time.time()
        if previous_cpu_total is None:
            # First sample: only record the baseline, no rate to emit.
            previous_time = current_time
            previous_cpu_total = cpu_metric
        else:
            cpu_delta = cpu_metric - previous_cpu_total
            time_delta = current_time - previous_time
            # os.cpu_count() may return None (per stdlib docs); fall back to
            # a single core to avoid a TypeError in the normalisation below.
            num_cores = NUM_CPU_CORES or 1
            if time_delta > 0:
                cpu_utilization_percent = 100 * (cpu_delta / time_delta) / num_cores
                cpu_utilization_gauge.set(cpu_utilization_percent)
            # Always advance the baseline, even when time_delta was 0.
            previous_cpu_total = cpu_metric
            previous_time = current_time

    return instrumentation


def prom_gpu_utilization_total() -> Callable[[Info], None]:
    """Create an instrumentation hook exporting per-GPU health gauges.

    The returned callable samples every GPU visible to GPUtil and publishes
    its load and memory utilization (as percentages) plus temperature
    (Celsius), each labelled by GPU id.
    """
    load_gauge = Gauge("gpu_load_percentage", "GPU Load", ["gpu_id"])
    memory_gauge = Gauge(
        "gpu_memory_utilization", "GPU Memory Utilization", ["gpu_id"]
    )
    temperature_gauge = Gauge(
        "gpu_temperature_celsius", "GPU Temperature", ["gpu_id"]
    )

    def instrumentation(info: Info) -> None:
        # When no GPUs are present the loop simply does nothing.
        for gpu in GPUtil.getGPUs():
            device_label = gpu.id
            load_gauge.labels(gpu_id=device_label).set(gpu.load * 100)
            memory_gauge.labels(gpu_id=device_label).set(gpu.memoryUtil * 100)
            temperature_gauge.labels(gpu_id=device_label).set(gpu.temperature)

    return instrumentation
3 changes: 2 additions & 1 deletion requirements/requirements.gpu.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
onnxruntime-gpu<=1.15.1
onnxruntime-gpu<=1.15.1
GPUtil==1.4.0

0 comments on commit 0139543

Please sign in to comment.