Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] Add support for associating scheduler resources with an environment #3152

Merged
merged 19 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion docs/config_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ ReFrame can launch containerized applications, but you need to configure properl
Custom Job Scheduler Resources
==============================

ReFrame allows you to define custom scheduler resources for each partition that you can then transparently access through the :attr:`~reframe.core.pipeline.RegressionTest.extra_resources` attribute of a regression test.
ReFrame allows you to define custom scheduler resources for each partition or environment, which can then be transparently accessed through the :attr:`~reframe.core.pipeline.RegressionTest.extra_resources` attribute of a regression test or the :attr:`~environments.resources` attribute of an environment.
vkarak marked this conversation as resolved.
Show resolved Hide resolved

.. py:attribute:: systems.partitions.resources.name

Expand Down Expand Up @@ -766,6 +766,27 @@ ReFrame allows you to define custom scheduler resources for each partition that
The backend assumes a ``qsub`` option, if the options passed in these attributes start with a ``-``.


.. py:attribute:: systems.partitions.env_resources.name

:required: Yes

The name of the resource.
This name is used to request the resource through the :attr:`~environments.resources` attribute of a programming environment.

.. versionadded:: 4.6


.. py:attribute:: systems.partitions.env_resources.options

:required: No
:default: ``[]``

A list of options to be passed to this partition’s job scheduler when this resource is requested by a programming environment.
This is analogous to the :attr:`~config.systems.partitions.resources.options` parameter, but it defines resources that are specific to a programming environment rather than to a test.

.. versionadded:: 4.6


Environment Configuration
=========================

Expand Down Expand Up @@ -948,6 +969,16 @@ They are associated with `system partitions <#system-partition-configuration>`__
It first looks for definitions for the current partition, then for the containing system and, finally, for global definitions (the ``*`` pseudo-system).


.. py:attribute:: environments.resources

:required: No
:default: ``{}``

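The scheduler resources associated with this environment, given as a mapping from resource names to the values used to instantiate the options of each resource.
This is similar to a regression test's :attr:`~reframe.core.pipeline.RegressionTest.extra_resources`, except that the resource names refer to the ``env_resources`` defined in the system partition; names that are not defined there produce no scheduler options.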
vkarak marked this conversation as resolved.
Show resolved Hide resolved

.. versionadded:: 4.6


.. _logging-config-reference:

Logging Configuration
Expand Down
12 changes: 12 additions & 0 deletions reframe/core/environments.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def __init__(self,
cxxflags=None,
fflags=None,
ldflags=None,
resources=None,
**kwargs):
super().__init__(name, modules, env_vars, extras, features,
prepare_cmds)
Expand All @@ -254,6 +255,7 @@ def __init__(self,
self._cxxflags = cxxflags or []
self._fflags = fflags or []
self._ldflags = ldflags or []
self._resources = resources or {}

@property
def cc(self):
Expand Down Expand Up @@ -326,3 +328,13 @@ def nvcc(self):
:type: :class:`str`
'''
return self._nvcc

@property
def resources(self):
    '''Scheduler resources requested by this environment.

    This is the mapping that was passed as ``resources`` when the
    environment was created, or an empty dictionary if nothing (or an
    empty value) was passed.

    .. versionadded:: 4.6.0

    :type: :class:`Dict[str, object]`
    '''
    env_resources = self._resources
    return env_resources
17 changes: 15 additions & 2 deletions reframe/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,7 +1863,10 @@ def compile(self):
# build_job_opts. We want any user supplied options to be able to
# override those set by the framework.
resources_opts = self._map_resources_to_jobopts()
self._build_job.options = resources_opts + self._build_job.options
env_resources_opts = self._map_env_resources_to_jobopts()
self._build_job.options = (
env_resources_opts + resources_opts + self._build_job.options
)
with osext.change_dir(self._stagedir):
# Prepare build job
build_commands = [
Expand Down Expand Up @@ -2011,7 +2014,10 @@ def _get_cp_env():
# job_opts. We want any user supplied options to be able to
# override those set by the framework.
resources_opts = self._map_resources_to_jobopts()
self._job.options = resources_opts + self._job.options
env_resources_opts = self._map_env_resources_to_jobopts()
self._job.options = (
env_resources_opts + resources_opts + self._job.options
)
with osext.change_dir(self._stagedir):
try:
self.logger.debug('Generating the run script')
Expand Down Expand Up @@ -2041,6 +2047,13 @@ def _map_resources_to_jobopts(self):

return resources_opts

def _map_env_resources_to_jobopts(self):
    '''Translate the current environment's resources to job options.

    Each ``(name, values)`` entry of the current environment's resources
    is instantiated through the current partition's
    ``get_env_resource()`` and the resulting options are concatenated in
    iteration order.
    '''
    return [
        opt
        for rname, rvalues in self._current_environ.resources.items()
        for opt in self._current_partition.get_env_resource(rname,
                                                            **rvalues)
    ]

@final
def compile_complete(self):
'''Check if the build phase has completed.
Expand Down
34 changes: 30 additions & 4 deletions reframe/core/systems.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ class SystemPartition(jsonext.JSONSerializable):
def __init__(self, *, parent, name, sched_type, launcher_type,
descr, access, container_runtime, container_environs,
resources, local_env, environs, max_jobs, prepare_cmds,
processor, devices, extras, features, time_limit):
processor, devices, extras, features, time_limit,
env_resources):
getlogger().debug(f'Initializing system partition {name!r}')
self._parent_system = parent
self._name = name
Expand All @@ -183,6 +184,7 @@ def __init__(self, *, parent, name, sched_type, launcher_type,
self._max_jobs = max_jobs
self._prepare_cmds = prepare_cmds
self._resources = {r['name']: r['options'] for r in resources}
self._env_resources = {r['name']: r['options'] for r in env_resources}
self._processor = ProcessorInfo(processor)
self._devices = [DeviceInfo(d) for d in devices]
self._extras = extras
Expand Down Expand Up @@ -349,6 +351,21 @@ def get_resource(self, name, **values):

return ret

def get_env_resource(self, name, **values):
    '''Instantiate the managed environment resource ``name`` with ``values``.

    Every option template registered for ``name`` is formatted with the
    keyword arguments in ``values``. Templates that reference a
    placeholder missing from ``values`` are skipped, and a ``name`` that
    is not registered yields an empty list.

    :meta private:
    '''

    options = []
    for template in self._env_resources.get(name, []):
        try:
            rendered = template.format(**values)
        except KeyError:
            # A placeholder has no value; leave this option out
            continue

        options.append(rendered)

    return options

def environment(self, name):
'''Return the partition environment named ``name``.'''

Expand Down Expand Up @@ -452,7 +469,14 @@ def json(self):
'options': options
}
for name, options in self._resources.items()
]
],
'env_resources': [
{
'name': name,
'options': options
}
for name, options in self._env_resources.items()
],
}

def __str__(self):
Expand Down Expand Up @@ -549,7 +573,8 @@ def create(cls, site_config):
cflags=site_config.get(f'environments/@{e}/cflags'),
cxxflags=site_config.get(f'environments/@{e}/cxxflags'),
fflags=site_config.get(f'environments/@{e}/fflags'),
ldflags=site_config.get(f'environments/@{e}/ldflags')
ldflags=site_config.get(f'environments/@{e}/ldflags'),
resources=site_config.get(f'environments/@{e}/resources')
) for e in site_config.get(f'{partid}/environs')
if any(re.match(pattern, e) for pattern in env_patt)
]
Expand All @@ -576,7 +601,8 @@ def create(cls, site_config):
devices=site_config.get(f'{partid}/devices'),
extras=site_config.get(f'{partid}/extras'),
features=site_config.get(f'{partid}/features'),
time_limit=site_config.get(f'{partid}/time_limit')
time_limit=site_config.get(f'{partid}/time_limit'),
env_resources=site_config.get(f'{partid}/env_resources')
)
)

Expand Down
27 changes: 27 additions & 0 deletions reframe/schemas/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,21 @@
"required": ["name"],
"additionalProperties": false
}
},
"env_resources": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"options": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["name"],
"additionalProperties": false
}
}
},
"required": ["name", "scheduler", "launcher"],
Expand Down Expand Up @@ -410,6 +425,16 @@
"type": "array",
"items": {"$ref": "#/defs/alphanum_string"}
},
"resources": {
"type": "object",
"propertyNames": {
"pattern": "^[a-zA-Z_][a-zA-Z0-9_]*$"
},
"additionalProperties": {
"type": "object",
"additionalProperties": true
}
},
"target_systems": {"$ref": "#/defs/system_ref"}
},
"required": ["name"],
Expand Down Expand Up @@ -616,6 +641,8 @@
"systems/partitions/features": [],
"systems/partitions/resources": [],
"systems/partitions/resources/options": [],
"systems/partitions/env_resources": [],
"systems/partitions/env_resources/options": [],
"systems/partitions/modules": [],
"systems/partitions/env_vars": [],
"systems/partitions/variables": [],
Expand Down
9 changes: 9 additions & 0 deletions unittests/resources/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ def hostname():
]
}
],
'env_resources': [
{
'name': 'uenv',
'options': [
'--mount={mount}',
'--file={file}'
],
vkarak marked this conversation as resolved.
Show resolved Hide resolved
},
],
'features': ['cuda', 'mpi'],
'extras': {
'gpu_arch': 'a100'
Expand Down
6 changes: 6 additions & 0 deletions unittests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,12 @@ def test_system_create(site_config):
assert resources_spec == ['#DW jobdw capacity=100GB',
'#DW stage_in source=/foo']

env_resources_spec = partition.get_env_resource(
'uenv', mount='mount_point', file='file_path'
)
assert env_resources_spec == ['--mount=mount_point',
'--file=file_path']

# Check processor info
assert partition.processor.info is not None
assert partition.processor.topology is not None
Expand Down
Loading