
Docker GPU environment #4618

Merged: 9 commits, Aug 29, 2019
golem/envs/docker/gpu.py (new file): 143 additions, 0 deletions
@@ -0,0 +1,143 @@
from copy import copy
from logging import getLogger, Logger
from typing import Any, ClassVar, Dict, List, Optional

from dataclasses import dataclass, field

from golem.core.common import update_dict
from golem.envs import (
CounterId,
CounterUsage,
EnvId,
EnvSupportStatus
)
from golem.envs.docker import DockerRuntimePayload
from golem.envs.docker.cpu import (
DockerCPUEnvironment,
DockerCPUConfig,
DockerCPURuntime,
)
from golem.envs.docker.vendor import nvidia

logger = getLogger(__name__)


@dataclass
class DockerGPUConfig(DockerCPUConfig):
# GPU vendor identifier
gpu_vendor: str = 'UNKNOWN'
# GPU device list
gpu_devices: List[str] = field(default_factory=list)
# Enabled GPU device capabilities
gpu_caps: List[str] = field(default_factory=list)
# GPU device and driver constraints
gpu_requirements: Dict[str, str] = field(default_factory=dict)

def validate(self) -> None:
pass

def container_config(self) -> Dict[str, Any]:
raise NotImplementedError


@dataclass
class DockerNvidiaGPUConfig(DockerGPUConfig):

gpu_vendor: str = field(
default=nvidia.VENDOR)
gpu_devices: List[str] = field(
default_factory=lambda: copy(nvidia.DEFAULT_DEVICES))
gpu_caps: List[str] = field(
default_factory=lambda: copy(nvidia.DEFAULT_CAPABILITIES))
gpu_requirements: Dict[str, str] = field(
default_factory=lambda: copy(nvidia.DEFAULT_REQUIREMENTS))

def validate(self) -> None:
nvidia.validate_devices(self.gpu_devices)
nvidia.validate_capabilities(self.gpu_caps)
nvidia.validate_requirements(self.gpu_requirements)

def container_config(self) -> Dict[str, Any]:
environment = {
# Golem
'GPU_ENABLED': '1',
'GPU_VENDOR': self.gpu_vendor,
# nvidia-container-runtime
'NVIDIA_VISIBLE_DEVICES': ','.join(self.gpu_devices),
'NVIDIA_DRIVER_CAPABILITIES': ','.join(self.gpu_caps),
}

# pylint: disable=no-member
for req, val in self.gpu_requirements.items():
environment[f'NVIDIA_REQUIRE_{req.upper()}'] = val
# pylint: enable=no-member

return dict(
runtime='nvidia',
environment=environment,
)


class DockerGPURuntime(DockerCPURuntime):

def usage_counters(self) -> Dict[CounterId, CounterUsage]:
raise NotImplementedError


class DockerGPUEnvironment(DockerCPUEnvironment):

ENV_ID: ClassVar[EnvId] = 'docker_gpu'
ENV_DESCRIPTION: ClassVar[str] = 'Docker environment using GPU'

BENCHMARK_IMAGE = 'golemfactory/gpu_benchmark:1.0'

# Enforce DockerGPUConfig config class type (DockerCPUConfig in super)
def __init__( # pylint: disable=useless-super-delegation
self,
config: DockerGPUConfig,
env_logger: Optional[Logger] = None,
) -> None:
super().__init__(config, env_logger or logger)

@classmethod
def supported(cls) -> EnvSupportStatus:
if not nvidia.is_supported():
return EnvSupportStatus(False, "No supported GPU found")
return super().supported()

@classmethod
def parse_config(cls, config_dict: Dict[str, Any]) -> DockerGPUConfig:
if config_dict['gpu_vendor'] == nvidia.VENDOR:
return DockerNvidiaGPUConfig(**config_dict)
return DockerGPUConfig(**config_dict)

@classmethod
def _validate_config(cls, config: DockerCPUConfig) -> None:
if not isinstance(config, DockerGPUConfig):
raise ValueError(f"Invalid config class: '{config.__class__}'")

super()._validate_config(config)
config.validate()

def _create_container_config(
self,
config: DockerCPUConfig,
payload: DockerRuntimePayload,
) -> Dict[str, Any]:
if not isinstance(config, DockerGPUConfig):
raise ValueError(f"Invalid config class: '{config.__class__}'")

container_config = super()._create_container_config(config, payload)
update_dict(container_config, config.container_config())
return container_config

def _create_runtime(
self,
config: DockerCPUConfig,
payload: DockerRuntimePayload,
) -> DockerCPURuntime:
container_config = self._create_container_config(config, payload)
return DockerGPURuntime(
container_config,
self._port_mapper,
runtime_logger=self._logger)
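For illustration (not part of the PR), a hedged usage sketch of the classes above. It assumes the golem package from this branch is importable and that the remaining DockerCPUConfig fields have defaults; the work dir path is hypothetical. It shows the vendor dispatch in parse_config() and the container options the NVIDIA config contributes via container_config().

from pathlib import Path

from golem.envs.docker.gpu import DockerGPUEnvironment, DockerNvidiaGPUConfig

# parse_config() dispatches on the 'gpu_vendor' key.
config = DockerGPUEnvironment.parse_config({
    'work_dirs': [Path('/tmp/golem')],  # hypothetical work dir
    'gpu_vendor': 'NVIDIA',
})
assert isinstance(config, DockerNvidiaGPUConfig)

config.validate()  # raises ValueError for unknown devices/caps/requirements
print(config.container_config())
# With the defaults from golem/envs/docker/vendor/nvidia.py this should yield:
# {'runtime': 'nvidia',
#  'environment': {'GPU_ENABLED': '1',
#                  'GPU_VENDOR': 'NVIDIA',
#                  'NVIDIA_VISIBLE_DEVICES': 'all',
#                  'NVIDIA_DRIVER_CAPABILITIES': 'compute,graphics,utility'}}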
golem/envs/docker/non_hypervised.py: 28 additions, 0 deletions
@@ -1,5 +1,8 @@
from typing import Any, Dict

from golem.docker.hypervisor.dummy import DummyHypervisor
from golem.envs.docker.cpu import DockerCPUEnvironment
from golem.envs.docker.gpu import DockerGPUEnvironment, DockerGPUConfig


class NonHypervisedDockerCPUEnvironment(DockerCPUEnvironment):
@@ -12,3 +15,28 @@ class is to use Docker CPU Environment alongside with DockerManager. """
@classmethod
def _get_hypervisor_class(cls):
return DummyHypervisor


class NonHypervisedDockerGPUEnvironment(DockerGPUEnvironment):
""" This is a temporary class that never uses a hypervisor. It just assumes
that Docker VM is properly configured if needed. The purpose of this
    class is to use Docker GPU Environment alongside DockerManager. """

# TODO: Remove when DockerManager is removed

@classmethod
def _get_hypervisor_class(cls):
return DummyHypervisor

@classmethod
def default(
cls,
config_dict: Dict[str, Any]
) -> 'NonHypervisedDockerGPUEnvironment':
from golem.envs.docker.vendor import nvidia
config_dict = dict(config_dict)
config_dict['gpu_vendor'] = nvidia.VENDOR
docker_config = DockerGPUConfig.from_dict(config_dict)
        # Let static checkers know this is a DockerGPUConfig instance
assert isinstance(docker_config, DockerGPUConfig)
return cls(docker_config)
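For illustration (not part of the PR), a sketch of the intended call site, mirroring the use in taskserver.py further down; the work-dir path is hypothetical and DockerGPUConfig.from_dict() is assumed to accept the same keys as DockerCPUConfig.

from golem.envs.docker.non_hypervised import NonHypervisedDockerGPUEnvironment

# default() copies the dict, forces gpu_vendor to NVIDIA and builds the config,
# so a plain CPU-style config dict can be reused as-is.
env = NonHypervisedDockerGPUEnvironment.default(
    {'work_dirs': ['/path/to/task_computer_root']})  # hypothetical path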
golem/envs/docker/vendor/nvidia.py (new file): 94 additions, 0 deletions
@@ -0,0 +1,94 @@
import re
from typing import Dict, List

# FIXME: move the nvgpu module out of the apps folder
from apps.core import nvgpu


VENDOR = 'NVIDIA'

DEFAULT_DEVICES = ['all']
SPECIAL_DEVICES = {
'void', # or empty or unset: the same behavior as runc
'none', # no GPUs accessible, but driver capabilities will be enabled
'all', # all GPUs accessible
}
DEVICE_INDEX_REGEX = re.compile(r"^(\d+)$")
DEVICE_NAME_REGEX = re.compile(r"^(GPU-[a-fA-F0-9\-]+)$")

DEFAULT_CAPABILITIES = ['compute', 'graphics', 'utility']
SPECIAL_CAPABILITIES = {
'all', # enable all available driver capabilities
}
CAPABILITIES = {
'compute', # required for CUDA and OpenCL applications
'compat32', # required for running 32-bit applications
'graphics', # required for running OpenGL and Vulkan applications
'utility', # required for using nvidia-smi and NVML
'video', # required for using the Video Codec SDK
'display', # required for leveraging X11 display
}

DEFAULT_REQUIREMENTS: Dict[str, str] = dict()
REQUIREMENTS = {
'cuda', # constraint on the CUDA driver version
'driver', # constraint on the driver version
'arch', # constraint on the compute architectures of the selected GPUs
'brand', # constraint on the brand of the selected GPUs (e.g. GeForce)
}


def is_supported() -> bool:
return nvgpu.is_supported()


def validate_devices(devices: List[str]) -> None:
if not devices:
raise ValueError(f"Missing {VENDOR} GPUs: {devices}")

special_count = sum([d in SPECIAL_DEVICES for d in devices])
has_mixed_devices = special_count > 0 and len(devices) > 1

if special_count > 1 or has_mixed_devices:
raise ValueError(f"Mixed {VENDOR} GPU devices: {devices}")
# Only a "special" device name was provided
if special_count > 0:
return

# All device names are device indexes
if all([DEVICE_INDEX_REGEX.match(d) for d in devices]):
return
# All device names are in a form of a UUID
if all([DEVICE_NAME_REGEX.match(d) for d in devices]):
return

raise ValueError(f"Invalid {VENDOR} GPU device names: {devices}")


def validate_capabilities(caps: List[str]) -> None:
if not caps:
raise ValueError(f"Missing {VENDOR} GPU caps: {caps}")

special_count = sum([c in SPECIAL_CAPABILITIES for c in caps])
has_mixed_caps = special_count > 0 and len(caps) > 1

if special_count > 1 or has_mixed_caps:
raise ValueError(f"Mixed {VENDOR} GPU caps: {caps}")
# Only a "special" capability was provided
if special_count > 0:
return

# All capability names are known
if not all([c in CAPABILITIES for c in caps]):
raise ValueError(f"Invalid {VENDOR} GPU caps: {caps}")


def validate_requirements(requirements: Dict[str, str]) -> None:
""" Validate requirement names and check if a value was provided """
for name, val in requirements.items():
if name not in REQUIREMENTS:
raise ValueError(
f"Invalid {VENDOR} GPU requirement name: '{name}'")
if not val:
raise ValueError(
f"Invalid {VENDOR} GPU requirement value: '{name}'='{val}'")
golem/task/taskcomputer.py: 14 additions, 5 deletions
@@ -23,6 +23,7 @@
from golem.docker.task_thread import DockerTaskThread
from golem.envs import EnvId, EnvStatus
from golem.envs.docker.cpu import DockerCPUConfig, DockerCPUEnvironment
from golem.envs.docker.gpu import DockerGPUConfig, DockerGPUEnvironment
from golem.hardware import scale_memory, MemSize
from golem.manager.nodestatesnapshot import ComputingSubtaskStateSnapshot
from golem.resource.dirmanager import DirManager
@@ -409,19 +410,27 @@ def change_config(
assert not self._is_computing()
self._work_dir = work_dir

-        # FIXME: Decide how to properly configure environments
-        docker_env = self._env_manager.environment(DockerCPUEnvironment.ENV_ID)
-        docker_env.update_config(DockerCPUConfig(
+        config_dict = dict(
            work_dirs=[work_dir],
            cpu_count=config_desc.num_cores,
            memory_mb=scale_memory(
                config_desc.max_memory_size,
                unit=MemSize.kibi,
                to_unit=MemSize.mebi
            )
-        ))
-        return defer.succeed(None)
+        )

+        # FIXME: Decide how to properly configure environments
+        docker_cpu = self._env_manager.environment(DockerCPUEnvironment.ENV_ID)
+        docker_cpu.update_config(DockerCPUConfig(**config_dict))

+        if self._env_manager.enabled(DockerGPUEnvironment.ENV_ID):
+            docker_gpu = self._env_manager.environment(
+                DockerGPUEnvironment.ENV_ID)
+            # TODO: GPU options in config_dict
+            docker_gpu.update_config(DockerGPUConfig(**config_dict))

+        return defer.succeed(None)

class TaskComputer: # pylint: disable=too-many-instance-attributes
""" TaskComputer is responsible for task computations that take
golem/task/taskserver.py: 18 additions, 4 deletions
@@ -39,10 +39,13 @@
SupportStatus,
UnsupportReason,
)
-from golem.envs import Environment as NewEnv
+from golem.envs import Environment as NewEnv, EnvSupportStatus
from golem.envs.auto_setup import auto_setup
from golem.envs.docker.cpu import DockerCPUConfig
-from golem.envs.docker.non_hypervised import NonHypervisedDockerCPUEnvironment
+from golem.envs.docker.non_hypervised import (
+    NonHypervisedDockerCPUEnvironment,
+    NonHypervisedDockerGPUEnvironment,
+)
from golem.model import TaskPayment
from golem.network.hyperdrive.client import HyperdriveAsyncClient
from golem.network.transport import msg_queue
@@ -130,16 +133,27 @@ def __init__(self,
self.config_desc = config_desc

os.makedirs(self.get_task_computer_root(), exist_ok=True)
-        docker_cpu_config = DockerCPUConfig(
-            work_dirs=[Path(self.get_task_computer_root())])

+        docker_config_dict = dict(work_dirs=[self.get_task_computer_root()])
+        docker_cpu_config = DockerCPUConfig.from_dict(docker_config_dict)
docker_cpu_env = auto_setup(
NonHypervisedDockerCPUEnvironment(docker_cpu_config))

new_env_manager = EnvironmentManager()
new_env_manager.register_env(
docker_cpu_env,
DockerTaskApiPayloadBuilder,
)

+        docker_gpu_status = NonHypervisedDockerGPUEnvironment.supported()
+        if docker_gpu_status == EnvSupportStatus(True):
+            docker_gpu_env = auto_setup(
+                NonHypervisedDockerGPUEnvironment.default(docker_config_dict))
+            new_env_manager.register_env(
+                docker_gpu_env,
+                DockerTaskApiPayloadBuilder,
+            )

self.node = node
self.task_archiver = task_archiver
self.task_keeper = TaskHeaderKeeper(
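For illustration (not part of the PR), a sketch of the support gate used in TaskServer.__init__ above: supported() is defined in gpu.py earlier in this diff and reports "No supported GPU found" when nvidia.is_supported() is false, so on machines without an NVIDIA GPU the docker_gpu environment is simply never registered.

from golem.envs import EnvSupportStatus
from golem.envs.docker.non_hypervised import NonHypervisedDockerGPUEnvironment

status = NonHypervisedDockerGPUEnvironment.supported()
if status == EnvSupportStatus(True):
    # Same equality check as in TaskServer.__init__ above.
    print('registering docker_gpu environment')
else:
    print(f'docker_gpu not registered: {status}')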