Skip to content

Commit

Permalink
Add classes to model pod metrics and project invoices.
Browse files Browse the repository at this point in the history
The class `Pod` represents a pod's metrics and exposes methods to get the
service unit and runtime of a pod. The service unit information is returned as
a namedtuple. The method to get runtime will allow us to ignore certain
hours, though that will be added in a later commit.

The class `ProjectInvoice` represents invoice data for a project. It has a
method that takes in a pod and aggregates its usage data. Another method
`generate_invoice_rows` will return the CSV data to be exported.

The caller write_metrics_by_namespace has been updated to use these new classes.

Tests have been updated to work with the new changes. There are some duplicate
constants that I will sort out in the next few commits.
  • Loading branch information
naved001 committed Sep 27, 2024
1 parent f159566 commit fe56aae
Show file tree
Hide file tree
Showing 3 changed files with 264 additions and 172 deletions.
203 changes: 203 additions & 0 deletions openshift_metrics/invoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import math
from dataclasses import dataclass, field
from collections import namedtuple
from typing import List
from decimal import Decimal, ROUND_HALF_UP

# GPU types
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"

# GPU Resource - MIG Geometries
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"

# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"

# NOTE(review): "determinig_resource" is misspelled, but renaming the field
# would break any caller that accesses it by attribute name — left as-is.
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])


class Pod:
    """A pod's resource requests over one time interval."""

    def __init__(
        self,
        start_time: int,
        duration: int,
        cpu_request: Decimal,
        gpu_request: Decimal,
        memory_request: Decimal,
        gpu_type: str,
        gpu_resource: str,
    ):
        # start_time and duration are in seconds (get_runtime divides by 3600)
        self.start_time = start_time
        self.end_time = start_time + duration
        self.cpu_request = cpu_request
        self.memory_request = memory_request
        self.gpu_request = gpu_request
        self.gpu_type = gpu_type
        self.gpu_resource = gpu_resource

    @staticmethod
    def get_service_unit(
        cpu_count, memory_count, gpu_count, gpu_type, gpu_resource
    ) -> ServiceUnit:
        """
        Classify a resource request as a service unit.

        Returns a ServiceUnit(su_type, su_count, determinig_resource) where
        su_count is how many SUs of that type the request amounts to and the
        last field names the resource ("CPU", "GPU", or "RAM") that drove it.
        """
        # pods that requested a specific GPU but weren't scheduled may report 0 GPU
        if gpu_resource is not None and gpu_count == 0:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        # pods in weird states
        if cpu_count == 0 or memory_count == 0:
            return ServiceUnit(SU_UNKNOWN, 0, "CPU")

        known_gpu_su = {
            GPU_A100: SU_A100_GPU,
            GPU_A100_SXM4: SU_A100_SXM4_GPU,
            GPU_V100: SU_V100_GPU,
        }

        # MIG slices of an A100-SXM4; all currently billed as the unknown-MIG SU
        A100_SXM4_MIG = {
            MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
            MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
            MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
        }

        # GPU count for some configs is -1 for math reasons, in reality it is 0:
        # dividing gpu_count 0 by -1 keeps the GPU multiplier at (-)0 so it can
        # never win the max() below for CPU-only SUs.
        su_config = {
            SU_CPU: {"gpu": -1, "cpu": 1, "ram": 4},
            SU_A100_GPU: {"gpu": 1, "cpu": 24, "ram": 74},
            SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
            SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
            SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
            SU_UNKNOWN_MIG_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
            SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
        }

        if gpu_resource is None and gpu_count == 0:
            su_type = SU_CPU
        elif gpu_type is not None and gpu_resource == WHOLE_GPU:
            su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
        elif gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
            su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
        else:
            # unrecognized combination of GPU type and GPU resource
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
        gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
        memory_multiplier = memory_count / su_config[su_type]["ram"]

        # the most-constrained resource determines the SU count
        su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

        # no fractional SUs for GPU SUs
        if su_type != SU_CPU:
            su_count = math.ceil(su_count)

        if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
            determining_resource = "GPU"
        elif cpu_multiplier >= gpu_multiplier and cpu_multiplier >= memory_multiplier:
            determining_resource = "CPU"
        else:
            determining_resource = "RAM"

        return ServiceUnit(su_type, su_count, determining_resource)

    def get_runtime(self, ignore_hours=None) -> Decimal:
        """Return the pod's runtime in hours.

        ignore_hours is accepted but not yet used; excluding certain hours
        is planned for a later change.
        """
        return Decimal(self.end_time - self.start_time) / 3600


@dataclass()
class Rates:
    """Billing rates for each billable service-unit type.

    NOTE(review): values are presumably a currency amount per SU-hour
    (they are multiplied by hours in ProjectInvoce.generate_invoice_rows)
    — confirm units against the billing documentation.
    """

    # rate for SU_CPU service units
    cpu: Decimal
    # rate for SU_A100_GPU service units
    gpu_a100: Decimal
    # rate for SU_A100_SXM4_GPU service units
    gpu_a100sxm4: Decimal
    # rate for SU_V100_GPU service units
    gpu_v100: Decimal


@dataclass
class ProjectInvoce:
    """Represents the invoicing data for a project.

    NOTE(review): the class name (``ProjectInvoce``) and the ``intitution``
    field are misspelled, but both are public interface (dataclass init
    keywords / importable name) and are kept as-is so callers keep working.
    """

    invoice_month: str
    project: str
    project_id: str
    pi: str
    invoice_email: str
    invoice_address: str
    intitution: str
    institution_specific_code: str
    rates: Rates
    # accumulated SU-hours per SU type; values become Decimal once pods
    # are added because Pod.get_runtime returns Decimal hours
    su_hours: dict = field(
        default_factory=lambda: {
            SU_CPU: 0,
            SU_A100_GPU: 0,
            SU_A100_SXM4_GPU: 0,
            SU_V100_GPU: 0,
            SU_UNKNOWN_GPU: 0,
            SU_UNKNOWN_MIG_GPU: 0,
            SU_UNKNOWN: 0,
        }
    )

    def add_pod(self, pod: Pod) -> None:
        """Aggregate a pod's usage into this invoice's SU-hour totals."""
        su_type, su_count, _ = Pod.get_service_unit(
            cpu_count=pod.cpu_request,
            memory_count=pod.memory_request,
            gpu_count=pod.gpu_request,
            gpu_type=pod.gpu_type,
            gpu_resource=pod.gpu_resource,
        )
        duration_in_hours = pod.get_runtime()
        self.su_hours[su_type] += su_count * duration_in_hours

    def get_rate(self, su_type) -> Decimal:
        """Return the billing rate for su_type; unbillable types rate at 0."""
        billable_rates = {
            SU_CPU: self.rates.cpu,
            SU_A100_GPU: self.rates.gpu_a100,
            SU_A100_SXM4_GPU: self.rates.gpu_a100sxm4,
            SU_V100_GPU: self.rates.gpu_v100,
        }
        return billable_rates.get(su_type, Decimal(0))

    def generate_invoice_rows(self, report_month) -> List[list]:
        """Return one CSV row (list of field values) per SU type with
        nonzero accumulated hours.

        Hours are billed in whole units (rounded up); cost is rounded to
        cents with half-up rounding.
        """
        rows = []
        for su_type, hours in self.su_hours.items():
            if hours > 0:
                hours = math.ceil(hours)
                rate = self.get_rate(su_type)
                cost = (rate * hours).quantize(Decimal(".01"), rounding=ROUND_HALF_UP)
                row = [
                    report_month,
                    self.project,
                    self.project_id,
                    self.pi,
                    self.invoice_email,
                    self.invoice_address,
                    self.intitution,
                    self.institution_specific_code,
                    hours,
                    su_type,
                    rate,
                    cost,
                ]
                rows.append(row)
        return rows
40 changes: 20 additions & 20 deletions openshift_metrics/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import tempfile
from unittest import TestCase, mock

from openshift_metrics import utils
from openshift_metrics import utils, invoice
import os

class TestGetNamespaceAnnotations(TestCase):
Expand Down Expand Up @@ -302,116 +302,116 @@ def test_write_metrics_by_namespace_decimal(self, mock_gna):
class TestGetServiceUnit(TestCase):

def test_cpu_only(self):
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(4, 16, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 4)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_SXM4_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 3)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu_high_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 2)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_low_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_zero_count(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_known_mig_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_unknown_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu_known_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_zero_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 0, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_zero_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.5, 0.5, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.5)
self.assertEqual(determining_resource, "CPU")

def test_fractional_su_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.1, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.25)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_decimal_return_type(self):
from decimal import Decimal
_, su_count, _ = utils.get_service_unit(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
_, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
self.assertIsInstance(su_count, Decimal)
self.assertEqual(su_count, Decimal('2.025'))

def test_not_decimal_return_type_when_gpu_su_type(self):
from decimal import Decimal
su_type, su_count, _ = utils.get_service_unit(Decimal("1"), Decimal("76"), Decimal("1"), utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("76"), Decimal("1"), utils.GPU_A100, utils.WHOLE_GPU)
# for GPU SUs, we always round up to the nearest integer
self.assertIsInstance(su_count, int)
self.assertEqual(su_count, 2)
Expand Down
Loading

0 comments on commit fe56aae

Please sign in to comment.