Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add classes to model pod metrics and project invoices. #78

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 203 additions & 0 deletions openshift_metrics/invoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import math
from dataclasses import dataclass, field
from collections import namedtuple
from typing import List
from decimal import Decimal, ROUND_HALF_UP

# GPU types (as reported by the cluster's node labels)
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"

# GPU Resource - MIG Geometries
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"

# SU (Service Unit) type names; these strings appear verbatim on invoices.
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
# NOTE(review): "Openshift" here is capitalized differently from the other SU
# names; left unchanged because the value is billing-visible output.
SU_UNKNOWN = "Openshift Unknown"

# Result of Pod.get_service_unit: the SU type name, how many SUs were
# consumed, and which resource ("CPU"/"RAM"/"GPU") determined that count.
# Fixed a typo in the third field name (was "determinig_resource"); all
# access in this file is positional, so the rename is backward-compatible
# for tuple-style consumers.
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determining_resource"])


class Pod:
    """A single pod's resource requests over one contiguous time interval."""

    def __init__(
        self,
        start_time: int,
        duration: int,
        cpu_request: Decimal,
        gpu_request: Decimal,
        memory_request: Decimal,
        gpu_type: str,
        gpu_resource: str,
    ):
        """Record the pod's interval and resource requests.

        :param start_time: interval start as a Unix timestamp (seconds)
        :param duration: interval length in seconds
        :param cpu_request: requested CPU cores
        :param gpu_request: requested GPU count
        :param memory_request: requested memory (GB — presumably; confirm units at caller)
        :param gpu_type: GPU model label (e.g. GPU_A100), or None for CPU-only pods
        :param gpu_resource: GPU resource name (WHOLE_GPU or a MIG geometry), or None
        """
        self.start_time = start_time
        self.end_time = start_time + duration
        self.cpu_request = cpu_request
        self.memory_request = memory_request
        self.gpu_request = gpu_request
        self.gpu_type = gpu_type
        self.gpu_resource = gpu_resource

    @staticmethod
    def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource):
        """Classify a resource request into a ServiceUnit.

        Returns a ServiceUnit(su_type, su_count, determining_resource).
        Consistently returns the namedtuple on every path (the original
        returned bare tuples from the early-exit branches).
        """
        # pods that requested a specific GPU but weren't scheduled may report 0 GPU
        if gpu_resource is not None and gpu_count == 0:
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        # pods in weird states
        if cpu_count == 0 or memory_count == 0:
            return ServiceUnit(SU_UNKNOWN, 0, "CPU")

        known_gpu_su = {
            GPU_A100: SU_A100_GPU,
            GPU_A100_SXM4: SU_A100_SXM4_GPU,
            GPU_V100: SU_V100_GPU,
        }

        # MIG geometries of the A100-SXM4 are not individually priced yet,
        # so every geometry maps to the generic "unknown MIG" SU.
        A100_SXM4_MIG = {
            MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
            MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
            MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
        }

        # Per-SU capacity. GPU count for non-GPU configs is -1 so that the
        # gpu multiplier (gpu_count / -1 with gpu_count == 0) stays 0 and
        # never dominates the max() below.
        su_config = {
            SU_CPU: {"gpu": -1, "cpu": 1, "ram": 4},
            SU_A100_GPU: {"gpu": 1, "cpu": 24, "ram": 74},
            SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
            SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
            SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
            SU_UNKNOWN_MIG_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
            SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
        }

        if gpu_resource is None and gpu_count == 0:
            su_type = SU_CPU
        elif gpu_type is not None and gpu_resource == WHOLE_GPU:
            su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
        elif gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
            su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
        else:
            # Unrecognized gpu_type/gpu_resource combination
            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")

        cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
        gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
        memory_multiplier = memory_count / su_config[su_type]["ram"]

        # The dominant resource dictates how many SUs are charged.
        su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

        # no fractional SUs for GPU SUs
        if su_type != SU_CPU:
            su_count = math.ceil(su_count)

        if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
            determining_resource = "GPU"
        elif cpu_multiplier >= gpu_multiplier and cpu_multiplier >= memory_multiplier:
            determining_resource = "CPU"
        else:
            determining_resource = "RAM"

        return ServiceUnit(su_type, su_count, determining_resource)

    def get_runtime(self, ignore_hours=None):
        """Return the pod's runtime in hours as a Decimal.

        NOTE(review): ``ignore_hours`` is accepted but currently unused;
        kept so the signature stays compatible — confirm intended semantics.
        """
        return Decimal(self.end_time - self.start_time) / 3600


@dataclass
class Rates:
    """Hourly billing rates (per SU-hour) for each chargeable SU type."""

    cpu: Decimal  # rate for SU_CPU
    gpu_a100: Decimal  # rate for SU_A100_GPU
    gpu_a100sxm4: Decimal  # rate for SU_A100_SXM4_GPU
    gpu_v100: Decimal  # rate for SU_V100_GPU


@dataclass
class ProjectInvoce:
    """Represents the invoicing data for a project.

    NOTE(review): the class name ("ProjectInvoce") and the ``intitution``
    field are misspelled; preserved here because callers reference them by
    these names — consider renaming later with a deprecation alias.
    """

    invoice_month: str
    project: str
    project_id: str
    pi: str
    invoice_email: str
    invoice_address: str
    # Institution name (field-name typo kept for interface compatibility).
    intitution: str
    institution_specific_code: str
    rates: Rates
    # Accumulated SU-hours keyed by SU type. Every known type starts at
    # zero so generate_invoice_rows can iterate without key checks.
    su_hours: dict = field(
        default_factory=lambda: {
            SU_CPU: 0,
            SU_A100_GPU: 0,
            SU_A100_SXM4_GPU: 0,
            SU_V100_GPU: 0,
            SU_UNKNOWN_GPU: 0,
            SU_UNKNOWN_MIG_GPU: 0,
            SU_UNKNOWN: 0,
        }
    )

    def add_pod(self, pod: Pod) -> None:
        """Classify *pod* and accumulate its SU-hours into this invoice."""
        su_type, su_count, _ = Pod.get_service_unit(
            cpu_count=pod.cpu_request,
            memory_count=pod.memory_request,
            gpu_count=pod.gpu_request,
            gpu_type=pod.gpu_type,
            gpu_resource=pod.gpu_resource,
        )
        duration_in_hours = pod.get_runtime()
        self.su_hours[su_type] += su_count * duration_in_hours

    def get_rate(self, su_type) -> Decimal:
        """Return the hourly rate for *su_type*; unpriced types bill at 0."""
        # Dispatch table instead of the original if-chain; behavior is
        # identical, including the Decimal(0) fallback for unknown SUs.
        rate_by_su = {
            SU_CPU: self.rates.cpu,
            SU_A100_GPU: self.rates.gpu_a100,
            SU_A100_SXM4_GPU: self.rates.gpu_a100sxm4,
            SU_V100_GPU: self.rates.gpu_v100,
        }
        return rate_by_su.get(su_type, Decimal(0))

    def generate_invoice_rows(self, report_month) -> List[list]:
        """Build one invoice row per SU type with non-zero usage.

        Hours are rounded up to whole hours before charging, and the cost
        is rounded to cents (half-up). Returns a list of row lists (the
        original annotation said ``List[str]``, but each row is a list).
        """
        rows = []
        for su_type, hours in self.su_hours.items():
            if hours > 0:
                # Bill whole hours only; round up the accumulated usage.
                billed_hours = math.ceil(hours)
                rate = self.get_rate(su_type)
                cost = (rate * billed_hours).quantize(Decimal(".01"), rounding=ROUND_HALF_UP)
                row = [
                    report_month,
                    self.project,
                    self.project_id,
                    self.pi,
                    self.invoice_email,
                    self.invoice_address,
                    self.intitution,
                    self.institution_specific_code,
                    billed_hours,
                    su_type,
                    rate,
                    cost,
                ]
                rows.append(row)
        return rows
40 changes: 20 additions & 20 deletions openshift_metrics/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import tempfile
from unittest import TestCase, mock

from openshift_metrics import utils
from openshift_metrics import utils, invoice
import os

class TestGetNamespaceAnnotations(TestCase):
Expand Down Expand Up @@ -302,116 +302,116 @@ def test_write_metrics_by_namespace_decimal(self, mock_gna):
class TestGetServiceUnit(TestCase):

def test_cpu_only(self):
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(4, 16, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 4)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_SXM4_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 3)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu_high_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 2)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_low_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_zero_count(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_known_mig_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_unknown_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu_known_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_zero_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(1, 0, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_zero_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(8, 64, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.5, 0.5, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.5)
self.assertEqual(determining_resource, "CPU")

def test_fractional_su_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None, None)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.1, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.25)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, determining_resource = invoice.Pod.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_decimal_return_type(self):
from decimal import Decimal
_, su_count, _ = utils.get_service_unit(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
_, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("8.1"), Decimal("0"), None, None)
self.assertIsInstance(su_count, Decimal)
self.assertEqual(su_count, Decimal('2.025'))

def test_not_decimal_return_type_when_gpu_su_type(self):
from decimal import Decimal
su_type, su_count, _ = utils.get_service_unit(Decimal("1"), Decimal("76"), Decimal("1"), utils.GPU_A100, utils.WHOLE_GPU)
su_type, su_count, _ = invoice.Pod.get_service_unit(Decimal("1"), Decimal("76"), Decimal("1"), utils.GPU_A100, utils.WHOLE_GPU)
# for GPU SUs, we always round up to the nearest integer
self.assertIsInstance(su_count, int)
self.assertEqual(su_count, 2)
Expand Down
Loading
Loading