return resource pool types in config endpoint (#2182)
zubenkoivan authored Apr 30, 2024
1 parent 0f679f1 · commit f16d605
Showing 10 changed files with 251 additions and 90 deletions.
32 changes: 31 additions & 1 deletion platform_api/api.py
@@ -51,7 +51,7 @@
from .orchestrator.jobs_storage import JobsStorage, PostgresJobsStorage
from .orchestrator.jobs_storage.base import JobStorageTransactionError
from .postgres import make_async_engine
from .resource import Preset
from .resource import Preset, ResourcePoolType
from .user import authorized_user, untrusted_user
from .utils.update_notifier import (
Notifier,
@@ -164,6 +164,10 @@ def _convert_cluster_config_to_payload(
) -> dict[str, Any]:
cluster_config = user_cluster_config.config
orgs = user_cluster_config.orgs
resource_pool_types = [
self._convert_resource_pool_type_to_payload(r)
for r in cluster_config.orchestrator.resource_pool_types
]
presets = [
self._convert_preset_to_payload(preset)
for preset in cluster_config.orchestrator.presets
@@ -177,6 +181,7 @@
"metrics_url": str(cluster_config.ingress.metrics_url),
"disks_url": str(cluster_config.ingress.disks_url),
"buckets_url": str(cluster_config.ingress.buckets_url),
"resource_pool_types": resource_pool_types,
"resource_presets": presets,
"orgs": orgs,
"timezone": str(cluster_config.timezone),
@@ -189,6 +194,31 @@ def _convert_cluster_config_to_payload(
result["users_url"] = str(self._config.auth.public_endpoint_url)
return result

def _convert_resource_pool_type_to_payload(
self, resource_pool_type: ResourcePoolType
) -> dict[str, Any]:
payload: dict[str, Any] = {
"name": resource_pool_type.name,
"min_size": resource_pool_type.min_size,
"max_size": resource_pool_type.max_size,
"cpu": resource_pool_type.cpu,
"memory": resource_pool_type.memory,
"disk_size": resource_pool_type.disk_size,
}
if resource_pool_type.nvidia_gpu is not None:
payload["nvidia_gpu"] = resource_pool_type.nvidia_gpu
if resource_pool_type.amd_gpu is not None:
payload["amd_gpu"] = resource_pool_type.amd_gpu
if resource_pool_type.tpu:
payload["tpu"] = {
"types": resource_pool_type.tpu.types,
"software_versions": resource_pool_type.tpu.software_versions,
"ipv4_cidr_block": resource_pool_type.tpu.ipv4_cidr_block,
}
if resource_pool_type.is_preemptible:
payload["is_preemptible"] = resource_pool_type.is_preemptible
return payload

def _convert_preset_to_payload(self, preset: Preset) -> dict[str, Any]:
payload: dict[str, Any] = {
"name": preset.name,
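For context, a minimal sketch of what the new serializer emits for one of the pools configured in the integration fixtures further down; the values come straight from this diff, and the optional keys (nvidia_gpu, amd_gpu, tpu, is_preemptible) appear only when set on the ResourcePoolType:

# Illustration only: the dict _convert_resource_pool_type_to_payload returns
# for the "gpu" pool defined in tests/integration/conftest.py below.
gpu_pool_payload = {
    "name": "gpu",
    "min_size": 1,
    "max_size": 2,
    "cpu": 1.0,
    "memory": 2048 * 10**6,
    "disk_size": 150 * 10**9,
    "nvidia_gpu": 1,
    "amd_gpu": 2,
}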
12 changes: 3 additions & 9 deletions platform_api/cluster_config_factory.py
@@ -134,20 +134,14 @@ def _create_tpu_preset(
)

def _create_resource_pool_type(self, payload: dict[str, Any]) -> ResourcePoolType:
cpu = payload.get("cpu")

memory = _get_memory_with_deprecated_mb(payload, "memory")
available_memory = _get_memory_with_deprecated_mb(payload, "available_memory")
return ResourcePoolType(
name=payload["name"],
nvidia_gpu=payload.get("nvidia_gpu"),
amd_gpu=payload.get("amd_gpu"),
is_preemptible=payload.get("is_preemptible"),
cpu=cpu,
available_cpu=payload.get("available_cpu") or cpu,
memory=memory,
available_memory=available_memory or memory,
disk_gb=payload.get("disk_size_gb"),
cpu=payload.get("available_cpu") or payload.get("cpu"),
memory=payload.get("available_memory") or payload.get("memory"),
disk_size=payload.get("disk_size"),
min_size=payload.get("min_size"),
max_size=payload.get("max_size"),
tpu=self._create_tpu_resource(payload.get("tpu")),
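A small standalone illustration of the precedence this hunk introduces when parsing a cluster-config payload (the sample values are invented; the field names are the ones used above):

# The "available_*" values win when present; otherwise the plain
# "cpu"/"memory" values are used, mirroring the expressions in the hunk.
payload = {
    "name": "cpu",
    "cpu": 2.0,
    "available_cpu": 1.0,
    "memory": 4096 * 10**6,
    "available_memory": 2048 * 10**6,
    "disk_size": 150 * 10**9,
}
cpu = payload.get("available_cpu") or payload.get("cpu")           # 1.0
memory = payload.get("available_memory") or payload.get("memory")  # 2048 * 10**6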
6 changes: 3 additions & 3 deletions platform_api/orchestrator/job_request.py
@@ -235,11 +235,11 @@ def require_gpu(self) -> bool:
return bool(self.nvidia_gpu or self.amd_gpu)

def check_fit_into_pool_type(self, pool_type: ResourcePoolType) -> bool:
if not pool_type.available_cpu or not pool_type.available_memory:
if not pool_type.cpu or not pool_type.memory:
return False
return (
self.cpu <= pool_type.available_cpu
and self.memory <= pool_type.available_memory
self.cpu <= pool_type.cpu
and self.memory <= pool_type.memory
and self._check_gpu(pool_type)
and self._check_tpu(pool_type)
)
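A minimal standalone restatement of the cpu/memory part of the updated check; the gpu/tpu checks are unchanged and omitted, and "fits" stands in for the method above, whose owning class is not named in this hunk. The ResourcePoolType construction mirrors the fixtures below:

from platform_api.resource import ResourcePoolType

def fits(req_cpu: float, req_memory: int, pool: ResourcePoolType) -> bool:
    # A pool with no cpu/memory configured can never satisfy a request.
    if not pool.cpu or not pool.memory:
        return False
    return req_cpu <= pool.cpu and req_memory <= pool.memory

pool = ResourcePoolType(name="cpu", cpu=1.0, memory=2048 * 10**6)
assert fits(0.5, 1024 * 10**6, pool)
assert not fits(2.0, 1024 * 10**6, pool)  # cpu request exceeds the pool's cpu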
4 changes: 2 additions & 2 deletions platform_api/orchestrator/kube_orchestrator.py
@@ -480,8 +480,8 @@ def _update_pod_container_resources(
) -> PodDescriptor:
if not pod.resources:
return pod
max_node_cpu = max(p.available_cpu or 0 for p in pool_types)
max_node_memory = max(p.available_memory or 0 for p in pool_types)
max_node_cpu = max(p.cpu or 0 for p in pool_types)
max_node_memory = max(p.memory or 0 for p in pool_types)
max_node_nvidia_gpu = max(p.nvidia_gpu or 0 for p in pool_types)
max_node_amd_gpu = max(p.amd_gpu or 0 for p in pool_types)
pod_nvidia_gpu = pod.resources.nvidia_gpu or 0
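A small standalone sketch of the capacity lookup this hunk changes; the clamping of pod resources to these maxima happens further down in the method and is not shown in the diff, so the sketch stops at the lookup. The pools mirror the integration fixtures:

from platform_api.resource import ResourcePoolType

# Largest per-node capacities across all configured pools, treating unset
# fields as zero, exactly as the hunk does.
pools = [
    ResourcePoolType(name="cpu", cpu=1.0, memory=2048 * 10**6),
    ResourcePoolType(name="gpu", cpu=1.0, memory=2048 * 10**6, nvidia_gpu=1, amd_gpu=2),
]
max_node_cpu = max(p.cpu or 0 for p in pools)                # 1.0
max_node_memory = max(p.memory or 0 for p in pools)          # 2048 * 10**6
max_node_nvidia_gpu = max(p.nvidia_gpu or 0 for p in pools)  # 1
max_node_amd_gpu = max(p.amd_gpu or 0 for p in pools)        # 2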
4 changes: 1 addition & 3 deletions platform_api/resource.py
@@ -64,12 +64,10 @@ class ResourcePoolType:
name: str = field(default_factory=lambda: str(uuid.uuid4()))
is_preemptible: Optional[bool] = False
cpu: Optional[float] = None
available_cpu: Optional[float] = None
memory: Optional[int] = None
available_memory: Optional[int] = None
nvidia_gpu: Optional[int] = None
amd_gpu: Optional[int] = None
disk_gb: Optional[int] = None
disk_size: Optional[int] = None
min_size: Optional[int] = None
max_size: Optional[int] = None
tpu: Optional[TPUResource] = None
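For reference, the shape of ResourcePoolType after this change, reconstructed from the context lines above; the frozen-dataclass decorator and anything outside the visible hunk are assumptions:

import uuid
from dataclasses import dataclass, field
from typing import Optional

# Reconstructed sketch: only fields visible in the hunk are listed, and the
# decorator is assumed rather than shown in this diff.
@dataclass(frozen=True)
class ResourcePoolType:
    name: str = field(default_factory=lambda: str(uuid.uuid4()))
    is_preemptible: Optional[bool] = False
    cpu: Optional[float] = None
    memory: Optional[int] = None
    nvidia_gpu: Optional[int] = None
    amd_gpu: Optional[int] = None
    disk_size: Optional[int] = None
    min_size: Optional[int] = None
    max_size: Optional[int] = None
    tpu: Optional["TPUResource"] = None  # TPUResource is defined elsewhere in resource.py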
30 changes: 18 additions & 12 deletions tests/integration/conftest.py
@@ -159,31 +159,35 @@ def _f(**kwargs: Any) -> OrchestratorConfig:
"resource_pool_types": [
ResourcePoolType(
name="cpu",
min_size=1,
max_size=2,
cpu=1.0,
available_cpu=1.0,
memory=2048 * 10**6,
available_memory=2048 * 10**6,
disk_size=150 * 10**9,
),
ResourcePoolType(
name="cpu-p",
min_size=1,
max_size=2,
cpu=1.0,
available_cpu=1.0,
memory=2048 * 10**6,
available_memory=2048 * 10**6,
disk_size=150 * 10**9,
is_preemptible=True,
),
ResourcePoolType(
min_size=1,
max_size=2,
cpu=100,
available_cpu=100,
memory=500_000 * 10**6,
available_memory=500_000 * 10**6,
disk_size=150 * 10**9,
),
ResourcePoolType(
name="tpu",
min_size=1,
max_size=2,
cpu=1.0,
available_cpu=1.0,
memory=2048 * 10**6,
available_memory=2048 * 10**6,
disk_size=150 * 10**9,
tpu=TPUResource(
ipv4_cidr_block="1.1.1.1/32",
types=("v2-8",),
@@ -192,11 +196,13 @@ def _f(**kwargs: Any) -> OrchestratorConfig:
),
ResourcePoolType(
name="gpu",
min_size=1,
max_size=2,
cpu=1.0,
available_cpu=1.0,
memory=2048 * 10**6,
available_memory=2048 * 10**6,
disk_size=150 * 10**9,
nvidia_gpu=1,
amd_gpu=2,
),
],
"presets": [
@@ -339,8 +345,8 @@ async def _create(
labels[kube_config.node_label_preemptible] = "true"
capacity = {
"pods": "110",
"cpu": int(pool_type.available_cpu or 0),
"memory": f"{pool_type.available_memory}",
"cpu": int(pool_type.cpu or 0),
"memory": f"{pool_type.memory}",
"nvidia.com/gpu": pool_type.nvidia_gpu or 0,
}
taints = [
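As an illustration of the second hunk, the Kubernetes node capacity dict built for the "gpu" pool above comes out roughly like this; the values follow directly from the fixture, and the rest of the node spec is outside the diff:

# Capacity for a node created from the "gpu" pool type.
capacity = {
    "pods": "110",
    "cpu": 1,                     # int(pool_type.cpu or 0), cpu == 1.0
    "memory": f"{2048 * 10**6}",  # f"{pool_type.memory}"
    "nvidia.com/gpu": 1,          # pool_type.nvidia_gpu or 0
}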
150 changes: 150 additions & 0 deletions tests/integration/test_api.py
@@ -249,6 +249,56 @@ async def test_config(
"metrics_url": "https://neu.ro/api/v1/metrics",
"disks_url": "https://neu.ro/api/v1/disk",
"buckets_url": "https://neu.ro/api/v1/buckets",
"resource_pool_types": [
{
"name": "cpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "cpu-p",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"is_preemptible": True,
},
{
"name": mock.ANY,
"min_size": 1,
"max_size": 2,
"cpu": 100,
"memory": 500_000 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "tpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"tpu": {
"ipv4_cidr_block": "1.1.1.1/32",
"types": ["v2-8"],
"software_versions": ["1.14"],
},
},
{
"name": "gpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"nvidia_gpu": 1,
"amd_gpu": 2,
},
],
"resource_presets": [
{
"name": "gpu-small",
@@ -501,6 +551,56 @@ async def test_config__with_orgs_and_projects(
"metrics_url": "https://neu.ro/api/v1/metrics",
"disks_url": "https://neu.ro/api/v1/disk",
"buckets_url": "https://neu.ro/api/v1/buckets",
"resource_pool_types": [
{
"name": "cpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "cpu-p",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"is_preemptible": True,
},
{
"name": mock.ANY,
"min_size": 1,
"max_size": 2,
"cpu": 100,
"memory": 500_000 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "tpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"tpu": {
"ipv4_cidr_block": "1.1.1.1/32",
"types": ["v2-8"],
"software_versions": ["1.14"],
},
},
{
"name": "gpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"nvidia_gpu": 1,
"amd_gpu": 2,
},
],
"resource_presets": [
{
"name": "gpu-small",
@@ -687,6 +787,56 @@ async def test_config_with_oauth(
"metrics_url": "https://neu.ro/api/v1/metrics",
"disks_url": "https://neu.ro/api/v1/disk",
"buckets_url": "https://neu.ro/api/v1/buckets",
"resource_pool_types": [
{
"name": "cpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "cpu-p",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"is_preemptible": True,
},
{
"name": mock.ANY,
"min_size": 1,
"max_size": 2,
"cpu": 100,
"memory": 500_000 * 10**6,
"disk_size": 150 * 10**9,
},
{
"name": "tpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"tpu": {
"ipv4_cidr_block": "1.1.1.1/32",
"types": ["v2-8"],
"software_versions": ["1.14"],
},
},
{
"name": "gpu",
"min_size": 1,
"max_size": 2,
"cpu": 1.0,
"memory": 2048 * 10**6,
"disk_size": 150 * 10**9,
"nvidia_gpu": 1,
"amd_gpu": 2,
},
],
"resource_presets": [
{
"name": "gpu-small",
(Diffs for the remaining changed files were not loaded.)
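Finally, a hedged client-side sketch of reading the new field from the config endpoint; the /api/v1/config path, the bearer-token auth, and the top-level "clusters" wrapper key are assumptions, since the tests above only assert the per-cluster body:

# Hypothetical client; endpoint path, auth scheme and "clusters" key are
# assumptions not confirmed by this diff.
import asyncio
import aiohttp

async def list_resource_pools(base_url: str, token: str) -> None:
    headers = {"Authorization": f"Bearer {token}"}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(f"{base_url}/api/v1/config") as resp:
            payload = await resp.json()
    for cluster in payload.get("clusters", []):
        for pool in cluster.get("resource_pool_types", []):
            print(pool["name"], pool.get("cpu"), pool.get("memory"))

# asyncio.run(list_resource_pools("https://neu.ro", "<token>"))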
