Use explicit GPU makes. Add support for intel GPU (#2192)
parikls authored Oct 21, 2024
1 parent 58b302f commit 007ffc7
Showing 20 changed files with 273 additions and 49 deletions.
20 changes: 18 additions & 2 deletions platform_api/api.py
@@ -216,6 +216,14 @@ def _convert_resource_pool_type_to_payload(
payload["nvidia_gpu"] = resource_pool_type.nvidia_gpu
if resource_pool_type.amd_gpu is not None:
payload["amd_gpu"] = resource_pool_type.amd_gpu
if resource_pool_type.intel_gpu is not None:
payload["intel_gpu"] = resource_pool_type.intel_gpu
if resource_pool_type.nvidia_gpu_model is not None:
payload["nvidia_gpu_model"] = resource_pool_type.nvidia_gpu_model
if resource_pool_type.amd_gpu_model is not None:
payload["amd_gpu_model"] = resource_pool_type.amd_gpu_model
if resource_pool_type.intel_gpu_model is not None:
payload["intel_gpu_model"] = resource_pool_type.intel_gpu_model
if resource_pool_type.tpu:
payload["tpu"] = {
"types": resource_pool_type.tpu.types,
@@ -247,8 +255,16 @@ def _convert_preset_to_payload(self, preset: Preset) -> dict[str, Any]:
payload["gpu"] = preset.nvidia_gpu
if preset.amd_gpu is not None:
payload["amd_gpu"] = preset.amd_gpu
if preset.gpu_model is not None:
payload["gpu_model"] = preset.gpu_model
if preset.intel_gpu is not None:
payload["intel_gpu"] = preset.intel_gpu
nvidia_gpu_model = preset.nvidia_gpu_model or preset.gpu_model
if nvidia_gpu_model is not None:
payload["gpu_model"] = nvidia_gpu_model
payload["nvidia_gpu_model"] = nvidia_gpu_model
if preset.amd_gpu_model is not None:
payload["amd_gpu_model"] = preset.amd_gpu_model
if preset.intel_gpu_model is not None:
payload["intel_gpu_model"] = preset.intel_gpu_model
if preset.tpu:
payload["tpu"] = {
"type": preset.tpu.type,
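The preset conversion keeps the deprecated `gpu_model` key alive while adding the explicit `nvidia_gpu_model`. A minimal sketch of that fallback, using a hypothetical trimmed-down `LegacyPreset` and an invented model string rather than the real `Preset` class:

```python
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class LegacyPreset:
    """Hypothetical stand-in for platform_api's Preset, trimmed to two fields."""

    nvidia_gpu_model: Optional[str] = None
    gpu_model: Optional[str] = None  # deprecated spelling


def gpu_model_payload(preset: LegacyPreset) -> dict[str, Any]:
    payload: dict[str, Any] = {}
    # prefer the explicit make-specific field, fall back to the deprecated one
    nvidia_gpu_model = preset.nvidia_gpu_model or preset.gpu_model
    if nvidia_gpu_model is not None:
        payload["gpu_model"] = nvidia_gpu_model  # legacy key, kept for old clients
        payload["nvidia_gpu_model"] = nvidia_gpu_model  # new explicit key
    return payload


# an old-style preset still produces both keys
assert gpu_model_payload(LegacyPreset(gpu_model="nvidia-tesla-k80")) == {
    "gpu_model": "nvidia-tesla-k80",
    "nvidia_gpu_model": "nvidia-tesla-k80",
}
```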
13 changes: 12 additions & 1 deletion platform_api/cluster_config_factory.py
@@ -76,6 +76,7 @@ def _create_presets(self, payload: dict[str, Any]) -> list[Preset]:
memory = _get_memory_with_deprecated_mb(preset, "memory")
if memory is None:
raise ValueError("memory is not set for resource preset")
nvidia_gpu_model = preset.get("nvidia_gpu_model") or preset.get("gpu_model")
result.append(
Preset(
name=preset["name"],
@@ -88,7 +89,11 @@ def _create_presets(self, payload: dict[str, Any]) -> list[Preset]:
or preset.get("is_preemptible_node_required", False),
nvidia_gpu=preset.get("nvidia_gpu"),
amd_gpu=preset.get("amd_gpu"),
gpu_model=preset.get("gpu_model"),
intel_gpu=preset.get("intel_gpu"),
gpu_model=nvidia_gpu_model,
nvidia_gpu_model=nvidia_gpu_model,
amd_gpu_model=preset.get("amd_gpu_model"),
intel_gpu_model=preset.get("intel_gpu_model"),
tpu=self._create_tpu_preset(preset.get("tpu")),
is_external_job=preset.get("is_external_job", False),
resource_pool_names=preset.get("resource_pool_names", ()),
@@ -144,6 +149,12 @@ def _create_resource_pool_type(self, payload: dict[str, Any]) -> ResourcePoolTyp
name=payload["name"],
nvidia_gpu=payload.get("nvidia_gpu"),
amd_gpu=payload.get("amd_gpu"),
intel_gpu=payload.get("intel_gpu"),
nvidia_gpu_model=(
payload.get("nvidia_gpu_model") or payload.get("gpu_model")
),
amd_gpu_model=payload.get("amd_gpu_model"),
intel_gpu_model=payload.get("intel_gpu_model"),
is_preemptible=payload.get("is_preemptible"),
cpu=payload.get("available_cpu") or payload.get("cpu"),
memory=payload.get("available_memory") or payload.get("memory"),
5 changes: 4 additions & 1 deletion platform_api/handlers/job_request_builder.py
@@ -73,7 +73,10 @@ def create_resources_from_payload(payload: dict[str, Any]) -> ContainerResources
),
nvidia_gpu=payload.get("nvidia_gpu"),
amd_gpu=payload.get("amd_gpu"),
gpu_model_id=payload.get("gpu_model"),
intel_gpu=payload.get("intel_gpu"),
nvidia_gpu_model=payload.get("nvidia_gpu_model") or payload.get("gpu_model"),
amd_gpu_model=payload.get("amd_gpu_model"),
intel_gpu_model=payload.get("intel_gpu_model"),
shm=payload.get("shm"),
tpu=tpu,
)
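The same `or`-based fallback is applied when parsing request payloads. A small sketch (model strings invented) of the precedence it implies, including the truthiness caveat that an empty explicit value also falls through to the legacy key:

```python
from typing import Any, Optional


def resolve_nvidia_model(payload: dict[str, Any]) -> Optional[str]:
    # same precedence as create_resources_from_payload:
    # the explicit key wins, otherwise fall back to the legacy "gpu_model"
    return payload.get("nvidia_gpu_model") or payload.get("gpu_model")


assert resolve_nvidia_model({"gpu_model": "nvidia-tesla-v100"}) == "nvidia-tesla-v100"
assert (
    resolve_nvidia_model(
        {"nvidia_gpu_model": "nvidia-a100", "gpu_model": "nvidia-tesla-v100"}
    )
    == "nvidia-a100"
)
# `or` tests truthiness, not presence, so an empty string also falls back
assert resolve_nvidia_model({"nvidia_gpu_model": "", "gpu_model": "nvidia-a100"}) == "nvidia-a100"
```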
23 changes: 19 additions & 4 deletions platform_api/handlers/jobs_handler.py
@@ -218,8 +218,16 @@ def _set_preset_resources(payload: dict[str, Any]) -> dict[str, Any]:
container_resources["nvidia_gpu"] = preset.nvidia_gpu
if preset.amd_gpu:
container_resources["amd_gpu"] = preset.amd_gpu
if preset.gpu_model:
container_resources["gpu_model"] = preset.gpu_model
if preset.intel_gpu:
container_resources["intel_gpu"] = preset.intel_gpu
nvidia_gpu_model = preset.nvidia_gpu_model or preset.gpu_model
if nvidia_gpu_model:
container_resources["gpu_model"] = nvidia_gpu_model
container_resources["nvidia_gpu_model"] = nvidia_gpu_model
if preset.amd_gpu_model:
container_resources["amd_gpu_model"] = preset.amd_gpu_model
if preset.intel_gpu_model:
container_resources["intel_gpu_model"] = preset.intel_gpu_model
if preset.tpu:
container_resources["tpu"] = {
"type": preset.tpu.type,
@@ -386,8 +394,15 @@ def convert_job_container_to_json(container: Container) -> dict[str, Any]:
resources["gpu"] = container.resources.nvidia_gpu
if container.resources.amd_gpu is not None:
resources["amd_gpu"] = container.resources.amd_gpu
if container.resources.gpu_model_id:
resources["gpu_model"] = container.resources.gpu_model_id
if container.resources.intel_gpu is not None:
resources["intel_gpu"] = container.resources.intel_gpu
if container.resources.nvidia_gpu_model:
resources["gpu_model"] = container.resources.nvidia_gpu_model
resources["nvidia_gpu_model"] = container.resources.nvidia_gpu_model
if container.resources.amd_gpu_model:
resources["amd_gpu_model"] = container.resources.amd_gpu_model
if container.resources.intel_gpu_model:
resources["intel_gpu_model"] = container.resources.intel_gpu_model
if container.resources.shm is not None:
resources["shm"] = container.resources.shm
if container.resources.tpu:
9 changes: 7 additions & 2 deletions platform_api/handlers/validators.py
@@ -225,10 +225,15 @@ def check_memory_keys(data: Any) -> Any:
common_resources_validator
+ t.Dict(
{
t.Key("gpu", optional=True): t.Int(gte=0), # TODO: deprecated, remove
# TODO: deprecated, remove
t.Key("gpu", to_name="nvidia_gpu", optional=True): t.Int(gte=0),
t.Key("nvidia_gpu", optional=True): t.Int(gte=0),
t.Key("amd_gpu", optional=True): t.Int(gte=0),
t.Key("gpu_model", optional=True): t.String,
t.Key("intel_gpu", optional=True): t.Int(gte=0),
t.Key("gpu_model", to_name="nvidia_gpu_model", optional=True): t.String,
t.Key("nvidia_gpu_model", optional=True): t.String,
t.Key("amd_gpu_model", optional=True): t.String,
t.Key("intel_gpu_model", optional=True): t.String,
}
),
]
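The validators rely on trafaret's `to_name` to rename the deprecated keys during validation, so downstream code only ever sees the explicit spellings. A minimal sketch of that renaming, assuming the trafaret library these validators are built on (payload values invented):

```python
import trafaret as t

resources_validator = t.Dict(
    {
        # the deprecated spellings are renamed to the explicit ones on the fly
        t.Key("gpu", to_name="nvidia_gpu", optional=True): t.Int(gte=0),
        t.Key("gpu_model", to_name="nvidia_gpu_model", optional=True): t.String,
    }
)

# an old-style payload comes out with the new key names
assert resources_validator.check({"gpu": 2, "gpu_model": "nvidia-tesla-k80"}) == {
    "nvidia_gpu": 2,
    "nvidia_gpu_model": "nvidia-tesla-k80",
}
```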
13 changes: 11 additions & 2 deletions platform_api/orchestrator/job.py
@@ -410,6 +410,10 @@ def has_nvidia_gpu(self) -> bool:
def has_amd_gpu(self) -> bool:
return bool(self.request.container.resources.amd_gpu)

@property
def has_intel_gpu(self) -> bool:
return bool(self.request.container.resources.intel_gpu)

def get_run_time(
self,
*,
@@ -645,8 +649,9 @@ def price_credits_per_hour(self) -> Decimal:
preset = self.preset
if preset:
return preset.credits_per_hour
# Default cost is maximal cost through all presets
# If there is no presets, that it is badly configured cluster in general
# Default cost is the maximal cost across all presets.
# If there are no presets, the cluster is badly configured in general,
# and it is safe to assume zero cost
result = max(
(preset.credits_per_hour for preset in self._orchestrator_config.presets),
@@ -710,6 +715,10 @@ def has_nvidia_gpu(self) -> bool:
def has_amd_gpu(self) -> bool:
return self._record.has_amd_gpu

@property
def has_intel_gpu(self) -> bool:
return self._record.has_intel_gpu

@property
def status(self) -> JobStatus:
return self._status_history.current.status
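The `price_credits_per_hour` hunk above is cut off just after `result = max(`, but the rewritten comment describes the intended behaviour: the default price is the maximum across all presets, and zero when none are configured. A sketch of that max-with-default idiom, with invented values:

```python
from decimal import Decimal

presets_credits = [Decimal("1.5"), Decimal("4"), Decimal("0.5")]

# maximal cost across all presets; zero when the cluster has none configured
assert max(presets_credits, default=Decimal("0")) == Decimal("4")
assert max([], default=Decimal("0")) == Decimal("0")
```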
79 changes: 66 additions & 13 deletions platform_api/orchestrator/job_request.py
@@ -196,7 +196,10 @@ class ContainerResources:
memory: int
nvidia_gpu: Optional[int] = None
amd_gpu: Optional[int] = None
gpu_model_id: Optional[str] = None # TODO: deprecated, remove
intel_gpu: Optional[int] = None
nvidia_gpu_model: Optional[str] = None
amd_gpu_model: Optional[str] = None
intel_gpu_model: Optional[str] = None
shm: Optional[bool] = None
tpu: Optional[ContainerTPUResource] = None

@@ -214,7 +217,12 @@ def from_primitive(cls, payload: dict[str, Any]) -> "ContainerResources":
),
nvidia_gpu=payload.get("nvidia_gpu") or payload.get("gpu"),
amd_gpu=payload.get("amd_gpu"),
gpu_model_id=payload.get("gpu_model_id"),
intel_gpu=payload.get("intel_gpu"),
nvidia_gpu_model=(
payload.get("nvidia_gpu_model") or payload.get("gpu_model_id")
),
amd_gpu_model=payload.get("amd_gpu_model"),
intel_gpu_model=payload.get("intel_gpu_model"),
shm=payload.get("shm"),
tpu=tpu,
)
@@ -226,8 +234,16 @@ def to_primitive(self) -> dict[str, Any]:
payload["gpu"] = self.nvidia_gpu
if self.amd_gpu is not None:
payload["amd_gpu"] = self.amd_gpu
if self.gpu_model_id:
payload["gpu_model_id"] = self.gpu_model_id
if self.intel_gpu is not None:
payload["intel_gpu"] = self.intel_gpu
if self.nvidia_gpu_model:
# TODO: gpu_model_id is deprecated; kept for backward compatibility
payload["gpu_model_id"] = self.nvidia_gpu_model
payload["nvidia_gpu_model"] = self.nvidia_gpu_model
if self.amd_gpu_model:
payload["amd_gpu_model"] = self.amd_gpu_model
if self.intel_gpu_model:
payload["intel_gpu_model"] = self.intel_gpu_model
if self.shm is not None:
payload["shm"] = self.shm
if self.tpu:
@@ -236,7 +252,7 @@ def to_primitive(self) -> dict[str, Any]:

@property
def require_gpu(self) -> bool:
return bool(self.nvidia_gpu or self.amd_gpu)
return bool(self.nvidia_gpu or self.amd_gpu or self.intel_gpu)

def check_fit_into_pool_type(self, pool_type: ResourcePoolType) -> bool:
if not pool_type.cpu or not pool_type.memory:
@@ -258,27 +274,41 @@ def check_fit_into_preset(self, preset: Preset) -> bool:

def _check_gpu(self, entry: Union[ResourcePoolType, Preset]) -> bool:
if not self.require_gpu:
# container does not need GPU. we are good regardless of presence
# of GPU in the pool type.
# container does not need GPU.
# we are good regardless of the presence of GPU in the pool type.
return True

# container needs GPU

if not entry.nvidia_gpu and not entry.amd_gpu:
if self.nvidia_gpu and not self._gpu_match(
resources_gpu=self.nvidia_gpu,
resources_gpu_model=self.nvidia_gpu_model,
entry_gpu=entry.nvidia_gpu,
entry_gpu_model=entry.nvidia_gpu_model,
):
return False

if (entry.nvidia_gpu or 0) < (self.nvidia_gpu or 0):
if self.amd_gpu and not self._gpu_match(
resources_gpu=self.amd_gpu,
resources_gpu_model=self.amd_gpu_model,
entry_gpu=entry.amd_gpu,
entry_gpu_model=entry.amd_gpu_model,
):
return False

if (entry.amd_gpu or 0) < (self.amd_gpu or 0):
if self.intel_gpu and not self._gpu_match(
resources_gpu=self.intel_gpu,
resources_gpu_model=self.intel_gpu_model,
entry_gpu=entry.intel_gpu,
entry_gpu_model=entry.intel_gpu_model,
):
return False

return True

def _check_tpu(self, pool_type: ResourcePoolType) -> bool:
if not self.tpu:
# container does not need TPU. we are good regardless of presence
# of TPU in the pool type.
# container does not need TPU.
# we are good regardless of the presence of TPU in the pool type.
return True

# container needs TPU
@@ -291,6 +321,29 @@ def _check_tpu(self, pool_type: ResourcePoolType) -> bool:
and self.tpu.software_version in pool_type.tpu.software_versions
)

@staticmethod
def _gpu_match(
resources_gpu: int,
resources_gpu_model: Optional[str],
entry_gpu: Optional[int],
entry_gpu_model: Optional[str],
) -> bool:
"""
Ensures that the resource GPU requirement matches
the GPUs of the entry (preset or resource pool)
"""
if not entry_gpu:
# entry has no GPUs of this make
return False
if entry_gpu < resources_gpu:
# entry has fewer GPUs of this make than the resources require
return False
if not resources_gpu_model:
# resources do not require a specific GPU model, so any model matches
return True
# resources require a specific model, so compare the models
return entry_gpu_model == resources_gpu_model

def _check_tpu_preset(self, preset: Preset) -> bool:
if not self.tpu:
# container does not need TPU. we are good regardless of presence
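Because each GPU make is now checked independently, a container asking for one make can no longer land on a pool that only offers another, and a model-specific request must match the entry's model exactly. A standalone copy of the `_gpu_match` rule with a few illustrative cases (model names invented):

```python
from typing import Optional


def gpu_match(
    resources_gpu: int,
    resources_gpu_model: Optional[str],
    entry_gpu: Optional[int],
    entry_gpu_model: Optional[str],
) -> bool:
    # standalone copy of ContainerResources._gpu_match, for illustration
    if not entry_gpu:
        return False  # entry has no GPUs of this make at all
    if entry_gpu < resources_gpu:
        return False  # not enough GPUs of this make
    if not resources_gpu_model:
        return True  # any model of this make will do
    return entry_gpu_model == resources_gpu_model


assert gpu_match(1, None, 2, "nvidia-a100")  # count-only request fits any model
assert not gpu_match(1, "nvidia-a100", 2, "nvidia-tesla-k80")  # model mismatch
assert not gpu_match(1, None, None, None)  # entry lacks this make entirely
```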
5 changes: 4 additions & 1 deletion platform_api/orchestrator/jobs_poller.py
@@ -78,7 +78,10 @@ def _parse_resources(data: Mapping[str, Any]) -> ContainerResources:
memory=data["memory"],
nvidia_gpu=data.get("nvidia_gpu"),
amd_gpu=data.get("amd_gpu"),
gpu_model_id=data.get("gpu_model"),
intel_gpu=data.get("intel_gpu"),
nvidia_gpu_model=data.get("nvidia_gpu_model") or data.get("gpu_model"),
amd_gpu_model=data.get("amd_gpu_model"),
intel_gpu_model=data.get("intel_gpu_model"),
shm=data.get("shm"),
tpu=tpu,
)
12 more changed files not shown.
