diff --git a/.gitignore b/.gitignore index f79b3369..ef44eef6 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,9 @@ pyslurm/*.pxi~ pyslurm/*.pxd~ pyslurm/*.so pyslurm/*.c +pyslurm/**/*.c +pyslurm/**/*.so +pyslurm/**/__pycache__ # Ignore vim swap files *.swp @@ -25,6 +28,7 @@ tests/*.pyc # Ignore pycache (Python 3) */__pycache__ +*/**/__pycache__ # Ignore job output files *.out diff --git a/pyslurm/__init__.py b/pyslurm/__init__.py index 177bf7cb..aa9e26c6 100644 --- a/pyslurm/__init__.py +++ b/pyslurm/__init__.py @@ -16,6 +16,51 @@ from .pyslurm import * from .__version__ import __version__ +from pyslurm.core.job import ( + Job, + Jobs, + JobStep, + JobSteps, + JobSubmitDescription, +) + +from pyslurm.core import db +from pyslurm.core.node import Node, Nodes + +import pyslurm.core.error +from pyslurm.core.error import ( + RPCError, +) + +# Utility time functions +from pyslurm.core.common.ctime import ( + timestr_to_secs, + timestr_to_mins, + secs_to_timestr, + mins_to_timestr, + date_to_timestamp, + timestamp_to_date, +) + +# General utility functions +from pyslurm.core.common import ( + uid_to_name, + gid_to_name, + user_to_uid, + group_to_gid, + expand_range_str, + humanize, + dehumanize, + nodelist_from_range_str, + nodelist_to_range_str, +) + +from pyslurm.core import slurmctld + +# Initialize slurm api +from pyslurm.api import slurm_init, slurm_fini +slurm_init() + def version(): return __version__ diff --git a/pyslurm/api.pxd b/pyslurm/api.pxd new file mode 100644 index 00000000..9b19ec9a --- /dev/null +++ b/pyslurm/api.pxd @@ -0,0 +1,26 @@ +######################################################################### +# api.pxd - pyslurm core API +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software 
def slurm_init(config_path=None):
    """Initialize the Slurm API.

    This function must be called first before certain RPC functions can be
    executed. slurm_init is automatically called when the pyslurm module is
    loaded.

    Args:
        config_path (str, optional):
            An absolute path to the slurm config file to use. The default is
            None, so libslurm will automatically detect its config.
    """
    # cstr.from_unicode() returns a char* whose lifetime depends on the
    # Python object it was derived from. Doing the str() conversion here
    # keeps that object referenced by this frame until slurm_init() has
    # returned, which also makes path-like objects safe to pass in.
    if config_path is not None:
        config_path = str(config_path)

    slurm.slurm_init(cstr.from_unicode(config_path))
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport xfree, try_xmalloc, xmalloc +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from pyslurm.core.common cimport cstr +from libc.stdlib cimport free + +cpdef uid_to_name(uint32_t uid, err_on_invalid=*, dict lookup=*) +cpdef gid_to_name(uint32_t gid, err_on_invalid=*, dict lookup=*) diff --git a/pyslurm/core/common/__init__.pyx b/pyslurm/core/common/__init__.pyx new file mode 100644 index 00000000..6ad5ae47 --- /dev/null +++ b/pyslurm/core/common/__init__.pyx @@ -0,0 +1,349 @@ +######################################################################### +# common/__init__.pyx - common/utility functions +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
cpdef gid_to_name(uint32_t gid, err_on_invalid=True, dict lookup={}):
    """Translate a GID to a Group-Name.

    Args:
        gid (int):
            Numeric group id to translate.
        err_on_invalid (bool, optional):
            When True (the default), a KeyError is raised for a gid that
            cannot be resolved. When False, None is returned instead.
        lookup (dict, optional):
            Mapping of gid to group name. If it is non-empty, it is used
            instead of querying the group database via getgrgid(). The dict
            is only read, never modified.

    Returns:
        str: The group name. None if gid is slurm.NO_VAL or slurm.INFINITE,
        or if it could not be resolved while err_on_invalid is False.

    Raises:
        KeyError: If the gid is unknown and err_on_invalid is True.
    """
    if gid == slurm.NO_VAL or gid == slurm.INFINITE:
        return None

    try:
        # A non-empty lookup table takes precedence over the group database.
        if lookup:
            return lookup[gid]

        return getgrgid(gid).gr_name
    except KeyError:
        if err_on_invalid:
            # Bare raise keeps the original traceback intact.
            raise

    return None
def expand_range_str(range_str):
    """Expand a ranged string of numbers to a list of unique values.

    Args:
        range_str (str):
            A range string, which can for example look like this:
            "1,2,3-10,11,15-20"

    Returns:
        list: List of unique values, in order of first appearance. An empty
        list is returned for an empty (or None) range_str.

    Raises:
        ValueError: If an element of range_str is neither a number nor a
            "start-end" range of numbers.
    """
    if not range_str:
        return []

    ret = []
    for mrange in range_str.split(","):
        start, sep, end = mrange.partition("-")
        start = int(start)

        if sep:
            ret.extend(range(start, int(end) + 1))
        else:
            ret.append(start)

    # Overlapping elements such as "1-3,2" would otherwise produce
    # duplicates. dict.fromkeys() drops them while preserving the order.
    return list(dict.fromkeys(ret))
def dehumanize(humanized_str, target="M", decimals=0):
    """Dehumanize a previously humanized value.

    Args:
        humanized_str (str):
            A humanized str, for example "5M" or "10T"
        target (str):
            Target unit. The default is "M" (Mebibytes). Allowed values are
            K,M,G,T,P,E,Z
        decimals (int):
            Amount of decimal places the result should have. Default is 0

    Returns:
        int: Dehumanized value. None if humanized_str is empty. A float is
        returned instead when decimals is non-zero. A value without any unit
        suffix is returned as a plain int, without conversion to target.

    Raises:
        ValueError: If the value has no unit suffix and is not an integer,
            or if the numeric part in front of the unit is not a number.
        KeyError: If target is not one of the allowed units.
    """
    if not humanized_str:
        return None

    # Build a regex character class from the known unit letters. They must
    # be joined without a separator, otherwise the separator itself becomes
    # part of the class and e.g. the space in "5 M" is taken for a unit.
    units_str = "".join(MEMORY_UNITS.keys())
    splitted = re.split(f'([{units_str}])', str(humanized_str))

    if len(splitted) == 1:
        # No unit suffix found, so only a plain integer is acceptable.
        try:
            return int(humanized_str)
        except ValueError as e:
            raise ValueError(
                f"Invalid value specified: {humanized_str}"
            ) from e

    # Thanks to the capture group, the unit letter follows the number.
    val = float(splitted[0])
    unit = splitted[1]

    val_in_bytes = val * MEMORY_UNITS[unit]
    val_in_target_size = float(val_in_bytes / MEMORY_UNITS[target])

    if not decimals:
        return round(val_in_target_size)
    else:
        return float(f"{val_in_target_size:.{decimals}f}")
def _sum_prop(obj, name, startval=0):
    """Add up one property over all values of a mapping.

    Args:
        obj (dict):
            Mapping whose values are the objects to inspect.
        name:
            Descriptor (for example a property) that is evaluated for each
            value via name.__get__(value).
        startval (optional):
            Initial value of the sum. Default is 0.

    Returns:
        The start value plus every descriptor result that is not None.
    """
    total = startval
    for item in obj.values():
        current = name.__get__(item)
        if current is None:
            # Unset properties do not contribute to the sum.
            continue
        total += current

    return total
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport xfree, try_xmalloc, xmalloc +from libc.string cimport memcpy, strlen + +cdef char *from_unicode(s) +cdef to_unicode(char *s, default=*) +cdef fmalloc(char **old, val) +cdef fmalloc2(char **p1, char **p2, val) +cdef free_array(char **arr, count) +cpdef list to_list(char *str_list) +cdef from_list(char **old, vals, delim=*) +cdef from_list2(char **p1, char **p2, vals, delim=*) +cpdef dict to_dict(char *str_dict, str delim1=*, str delim2=*) +cdef from_dict(char **old, vals, prepend=*, str delim1=*, str delim2=*) +cpdef dict to_gres_dict(char *gres) diff --git a/pyslurm/core/common/cstr.pyx b/pyslurm/core/common/cstr.pyx new file mode 100644 index 00000000..8301c994 --- /dev/null +++ b/pyslurm/core/common/cstr.pyx @@ -0,0 +1,287 @@ +######################################################################### +# common/cstr.pyx - pyslurm string functions +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
cdef char *from_unicode(s):
    """Convert Python3 str (unicode) to char* (no malloc)

    Args:
        s: Value to convert. Any falsy value (None, "", 0, ...) yields NULL;
            everything else is passed through str() first.

    Returns:
        char*: NULL for a falsy s, otherwise a char* obtained from the str
        object. No memory is allocated for it here.

    Note
        The lifetime of this char* depends on the lifetime of the equivalent
        python-object passed in. If the python-object is gone, the char* cannot
        be used safely anymore.

        NOTE(review): for an argument that is not already a str, str(s)
        presumably creates a new object that is referenced only by the local
        _s, so the returned pointer may already be dangling when the caller
        receives it. Callers should pass real str objects and keep them
        referenced - TODO confirm against Cython's str -> char* coercion
        rules.
    """
    if not s:
        return NULL

    # The char* returned below is derived from this str object.
    _s = str(s)
    return _s
cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="="):
    """Convert a char* key=value pair to dict.

    With a char* Slurm represents key-values pairs usually in the form of:
        key1=value1,key2=value2
    which can easily be converted to a dict.

    Args:
        str_dict (char*):
            The C-String holding the key-value pairs.
        delim1 (str, optional):
            Delimiter between the individual pairs. Default is ","
        delim2 (str, optional):
            Delimiter between a key and its value. Default is "="

    Returns:
        dict: The parsed pairs. Items that do not contain delim2 are
        skipped. Only the first delim2 of an item separates key and value,
        so a value may itself contain delim2.
    """
    cdef:
        str _str_dict = to_unicode(str_dict)
        str key, val
        dict out = {}

    # Only bail out when there is nothing to parse. A string without delim1
    # is still valid input: it simply holds one single key=value pair.
    if not _str_dict:
        return out

    for kv in _str_dict.split(delim1):
        if delim2 in kv:
            key, val = kv.split(delim2, 1)
            out[key] = val

    return out
The correct + format of this string will the be validated. + """ + cdef: + tmp_dict = {} if not vals else vals + list tmp = [] + + if not vals: + return None + + if isinstance(vals, str): + tmp_dict = validate_str_key_value_format(vals, delim1, delim2) + + for k, v in tmp_dict.items(): + if ((delim1 in k or delim2 in k) or + delim1 in v or delim2 in v): + raise ValueError( + f"Key or Value cannot contain either {delim1} or {delim2}. " + f"Got Key: {k} and Value: {v}." + ) + + tmp.append(f"{'' if not prepend else prepend}{k}{delim2}{v}") + + return delim1.join(tmp) + + +cdef from_dict(char **old, vals, prepend=None, + str delim1=",", str delim2="="): + fmalloc(old, dict_to_str(vals, prepend, delim1, delim2)) + + +cpdef dict to_gres_dict(char *gres): + """Parse a GRES string.""" + cdef: + dict output = {} + str gres_str = to_unicode(gres) + + if not gres_str or gres_str == "(null)": + return {} + + for item in re.split(",(?=[^,]+?:)", gres_str): + + # Remove the additional "gres" specifier if it exists + if "gres:" in item: + item = item.replace("gres:", "") + + gres_splitted = re.split( + ":(?=[^:]+?)", + item.replace("(", ":", 1).replace(")", "") + ) + + name, typ, cnt = gres_splitted[0], gres_splitted[1], 0 + + # Check if we have a gres type. + if typ.isdigit(): + cnt = typ + typ = None + else: + cnt = gres_splitted[2] + + # Dict Key-Name depends on if we have a gres type or not + name_and_typ = f"{name}:{typ}" if typ else name + + if not "IDX" in gres_splitted: + # Check if we need to parse the exact GRES index when coming from + # job_resources_t. 
def from_gres_dict(vals, typ=""):
    """Convert a GRES specification to a "gres:name[:type]:count" string.

    Args:
        vals (Union[dict, str, int]):
            One of:
            * a dict mapping "name" or "name:type" to a count
            * a comma-separated str of "name[:type]:count" items, where a
              leading "gres:" on the items is ignored
            * a plain count (int or digit-only str), which is combined with
              typ to "gres:<typ>:<count>"
        typ (str, optional):
            GRES name (for example "gpu") that is put in front of every
            specifier which does not already contain it.

    Returns:
        str: Comma-separated GRES string, or None if vals is empty.

    Raises:
        ValueError: If a specifier contains more than one colon, if an item
            of a str has no count, or if a count is not an integer.
    """
    if not vals:
        return None

    if isinstance(vals, dict):
        gres_dict = vals
    elif isinstance(vals, str) and not vals.isdigit():
        gres_dict = {}
        for gres_str in vals.replace("gres:", "").split(","):
            # The count is always the last colon-separated field.
            gres_and_type, cnt = gres_str.rsplit(":", 1)
            gres_dict[gres_and_type] = int(cnt)
    else:
        # Only a count was given, typ provides the GRES name.
        return f"gres:{typ}:{int(vals)}"

    final = []
    for gres_and_type, cnt in gres_dict.items():
        # Error immediately on specifications that contain more than one
        # colon, as it is wrong.
        if len(gres_and_type.split(":")) > 2:
            raise ValueError(f"Invalid specifier: '{gres_and_type}'")

        if typ not in gres_and_type:
            # typ is the GRES name, so it has to come first ("name:type"),
            # exactly like in the count-only case above.
            gres_and_type = f"{typ}:{gres_and_type}"

        final.append(f"gres:{gres_and_type}:{int(cnt)}")

    return ",".join(final)
+ +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.core.common cimport cstr +from libc.stdint cimport uint32_t + +cdef extern from 'time.h' nogil: + ctypedef long time_t + double difftime(time_t time1, time_t time2) + time_t time(time_t *t) diff --git a/pyslurm/core/common/ctime.pyx b/pyslurm/core/common/ctime.pyx new file mode 100644 index 00000000..fdf68834 --- /dev/null +++ b/pyslurm/core/common/ctime.pyx @@ -0,0 +1,213 @@ +######################################################################### +# ctime.pyx - wrappers around slurm time functions +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
def timestr_to_mins(timestr):
    """Convert Slurm Timestring to minutes

    Args:
        timestr (Union[str, int]):
            A Timestring compatible with Slurms time functions.

    Returns:
        int: Amount of time in minutes. slurm.NO_VAL is returned if timestr
        is None, and slurm.INFINITE if it is "unlimited".

    Raises:
        ValueError: If Slurm could not parse the timestring.
    """
    cdef:
        char *tmp = NULL
        uint32_t mins

    if timestr is None:
        return slurm.NO_VAL
    elif timestr == "unlimited":
        return slurm.INFINITE

    # cstr.from_unicode() returns a char* whose lifetime depends on the
    # Python object it was derived from. Converting to str here, and keeping
    # the result in a local, guarantees the object outlives the C call even
    # if something other than a str (for example an int) was passed in.
    timestr = str(timestr)
    tmp = cstr.from_unicode(timestr)
    mins = slurm.slurm_time_str2mins(tmp)

    if mins == slurm.NO_VAL:
        raise ValueError(f"Invalid Time Specification: {timestr}.")

    return mins
def date_to_timestamp(date, on_nodate=0):
    """Parse Date to Unix timestamp

    Args:
        date (Union[str, int, datetime.datetime]):
            A date to convert to a Unix timestamp.
        on_nodate (int, optional):
            Value that is returned if no date was given (None, "", 0).
            Default is 0.

    Returns:
        int: A unix timestamp

    Raises:
        ValueError: If Slurm could not parse the date.
    """
    cdef:
        time_t tmp_time
        char* tmp_char = NULL

    if not date:
        # time_t of 0, so the option will be ignored by slurmctld
        return on_nodate
    elif str(date).isdigit():
        # Allow the user to pass a timestamp directly.
        return int(date)
    elif isinstance(date, datetime.datetime):
        # Allow the user to pass a datetime.datetime object.
        return int(date.timestamp())

    # cstr.from_unicode() returns a char* whose lifetime depends on the
    # Python object it was derived from. Pin a real str in a local so the
    # pointer stays valid for the call below, whatever type was passed in.
    date = str(date)
    tmp_char = cstr.from_unicode(date)
    tmp_time = slurm.slurm_parse_time(tmp_char, 0)

    if not tmp_time:
        raise ValueError(f"Invalid Time Specification: {date}")

    return tmp_time
def _raw_time(time, default=None):
    """Return time unchanged, unless it carries no real value.

    Args:
        time (int):
            The raw time value to check.
        default (optional):
            Returned instead of time if time equals slurm.NO_VAL, 0 or
            slurm.INFINITE. Default is None.

    Returns:
        The unmodified time, or default.
    """
    for unset in (slurm.NO_VAL, 0, slurm.INFINITE):
        if time == unset:
            return default

    return time
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t + +cpdef u8(val, inf=*, noval=*, on_noval=*, zero_is_noval=*) +cpdef u16(val, inf=*, noval=*, on_noval=*, zero_is_noval=*) +cpdef u32(val, inf=*, noval=*, on_noval=*, zero_is_noval=*) +cpdef u64(val, inf=*, noval=*, on_noval=*, zero_is_noval=*) +cpdef u8_parse(uint8_t val, on_inf=*, on_noval=*, noval=*, zero_is_noval=*) +cpdef u16_parse(uint16_t val, on_inf=*, on_noval=*, noval=*, zero_is_noval=*) +cpdef u32_parse(uint32_t val, on_inf=*, on_noval=*, noval=*, zero_is_noval=*) +cpdef u64_parse(uint64_t val, on_inf=*, on_noval=*, noval=*, zero_is_noval=*) +cpdef u8_bool(val) +cpdef u16_bool(val) +cdef u8_parse_bool(uint8_t val) +cdef u16_parse_bool(uint16_t val) +cdef u64_parse_bool_flag(uint64_t flags, flag) +cdef u64_set_bool_flag(uint64_t *flags, boolean, flag_val) +cdef u16_parse_bool_flag(uint16_t flags, flag) +cdef u16_set_bool_flag(uint16_t *flags, boolean, flag_val) diff --git a/pyslurm/core/common/uint.pyx b/pyslurm/core/common/uint.pyx new file mode 100644 index 00000000..7418e109 --- /dev/null +++ b/pyslurm/core/common/uint.pyx @@ -0,0 +1,181 @@ +######################################################################### +# common/uint.pyx - functions dealing with parsing uint types +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + + +cpdef u8(val, inf=False, noval=slurm.NO_VAL8, on_noval=slurm.NO_VAL8, zero_is_noval=True): + """Try to convert arbitrary 'val' to uint8_t""" + if val is None or (val == 0 and zero_is_noval) or val == noval: + return on_noval + elif inf and val == "unlimited": + return slurm.INFINITE8 + else: + if isinstance(val, str) and val.isdigit(): + return int(val) + + return val + + +cpdef u8_parse(uint8_t val, on_inf="unlimited", on_noval=None, noval=slurm.NO_VAL8, zero_is_noval=True): + """Convert uint8_t to Python int (with a few situational parameters)""" + if val == noval or (val == 0 and zero_is_noval): + return on_noval + elif val == slurm.INFINITE8: + return on_inf + else: + return val + + +cpdef u16(val, inf=False, noval=slurm.NO_VAL16, on_noval=slurm.NO_VAL16, zero_is_noval=True): + """Try to convert arbitrary 'val' to uint16_t""" + if val is None or (val == 0 and zero_is_noval) or val == noval: + return on_noval + elif inf and val == "unlimited": + return slurm.INFINITE16 + else: + if isinstance(val, str) and val.isdigit(): + return int(val) + + return val + + +cpdef u16_parse(uint16_t val, on_inf="unlimited", on_noval=None, noval=slurm.NO_VAL16, zero_is_noval=True): + """Convert uint16_t to Python int (with a few situational parameters)""" + if val == noval or (val == 0 and zero_is_noval): + return on_noval + elif val == slurm.INFINITE16: + return on_inf + else: + return val + + +cpdef u32(val, inf=False, noval=slurm.NO_VAL, on_noval=slurm.NO_VAL, zero_is_noval=True): + """Try to convert arbitrary 'val' to uint32_t""" + if val is None or (val == 0 and zero_is_noval) or val == noval: + return on_noval + elif 
inf and val == "unlimited": + return slurm.INFINITE + else: + if isinstance(val, str) and val.isdigit(): + return int(val) + + return val + + +cpdef u32_parse(uint32_t val, on_inf="unlimited", on_noval=None, noval=slurm.NO_VAL, zero_is_noval=True): + """Convert uint32_t to Python int (with a few situational parameters)""" + if val == noval or (val == 0 and zero_is_noval): + return on_noval + elif val == slurm.INFINITE: + return on_inf + else: + return val + + +cpdef u64(val, inf=False, noval=slurm.NO_VAL64, on_noval=slurm.NO_VAL64, zero_is_noval=True): + """Try to convert arbitrary 'val' to uint64_t""" + if val is None or (val == 0 and zero_is_noval) or val == noval: + return on_noval + elif inf and val == "unlimited": + return slurm.INFINITE64 + else: + if isinstance(val, str) and val.isdigit(): + return int(val) + + return val + + +cpdef u64_parse(uint64_t val, on_inf="unlimited", on_noval=None, noval=slurm.NO_VAL64, zero_is_noval=True): + """Convert uint64_t to Python int (with a few situational parameters)""" + if val == noval or (val == 0 and zero_is_noval): + return on_noval + elif val == slurm.INFINITE64: + return on_inf + else: + return val + + +cpdef u8_bool(val): + if val is None: + return slurm.NO_VAL8 + elif val: + return 1 + else: + return 0 + + +cpdef u16_bool(val): + if val is None: + return slurm.NO_VAL16 + elif val: + return 1 + else: + return 0 + + +cdef u8_parse_bool(uint8_t val): + if not val or val == slurm.NO_VAL8: + return False + + return True + + +cdef u16_parse_bool(uint16_t val): + if not val or val == slurm.NO_VAL16: + return False + + return True + + +cdef u64_set_bool_flag(uint64_t *flags, boolean, flag_val): + if boolean: + flags[0] |= flag_val + else: + flags[0] &= ~flag_val + + +cdef u64_parse_bool_flag(uint64_t flags, flag): + """Return True if 'flag' is set in 'flags', False if it is not or if 'flags' is unset""" + # Keep the historical 32-bit NO_VAL check and additionally honour the + # 64-bit sentinel that matches the width of 'flags'. + if flags == slurm.NO_VAL or flags == slurm.NO_VAL64: + return False + + if flags & flag: + return True + else: + return False + + +cdef u16_set_bool_flag(uint16_t *flags, boolean, flag_val): + if boolean: + flags[0] |= flag_val + else:
+ flags[0] &= ~flag_val + + +cdef u16_parse_bool_flag(uint16_t flags, flag): + if flags == slurm.NO_VAL16: + return False + + if flags & flag: + return True + else: + return False diff --git a/pyslurm/core/db/__init__.pxd b/pyslurm/core/db/__init__.pxd new file mode 100644 index 00000000..e69de29b diff --git a/pyslurm/core/db/__init__.py b/pyslurm/core/db/__init__.py new file mode 100644 index 00000000..a742f72b --- /dev/null +++ b/pyslurm/core/db/__init__.py @@ -0,0 +1,37 @@ +######################################################################### +# db/__init__.py - database package __init__ file +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +from pyslurm.core.db.connection import Connection +from pyslurm.core.db.step import JobStep +from pyslurm.core.db.job import ( + Job, + Jobs, + JobSearchFilter, +) +from pyslurm.core.db.tres import ( + TrackableResource, + TrackableResources, +) +from pyslurm.core.db.qos import ( + QualitiesOfService, + QualityOfService, + QualityOfServiceSearchFilter, +) diff --git a/pyslurm/core/db/connection.pxd b/pyslurm/core/db/connection.pxd new file mode 100644 index 00000000..6ac2dfc6 --- /dev/null +++ b/pyslurm/core/db/connection.pxd @@ -0,0 +1,43 @@ +######################################################################### +# connection.pxd - pyslurm slurmdbd database connection +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from libc.stdint cimport uint16_t +from pyslurm.slurm cimport ( + slurmdb_connection_get, + slurmdb_connection_close, + slurmdb_connection_commit, +) + + +cdef class Connection: + """A connection to the slurmdbd. + + Attributes: + is_open (bool): + Whether the connection is open or closed. 
+ """ + cdef: + void *ptr + uint16_t flags diff --git a/pyslurm/core/db/connection.pyx b/pyslurm/core/db/connection.pyx new file mode 100644 index 00000000..ff32dd92 --- /dev/null +++ b/pyslurm/core/db/connection.pyx @@ -0,0 +1,79 @@ +######################################################################### +# connection.pyx - pyslurm slurmdbd database connection +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm.core.error import RPCError + + +cdef class Connection: + + def __cinit__(self): + self.ptr = NULL + self.flags = 0 + + def __init__(self): + raise RuntimeError("A new connection should be created through " + "calling Connection.open()") + + def __dealloc__(self): + self.close() + + @staticmethod + def open(): + """Open a new connection to the slurmdbd + + Raises: + RPCError: When opening the connection fails + + Returns: + (Connection): Connection to slurmdbd + """ + cdef Connection conn = Connection.__new__(Connection) + conn.ptr = slurmdb_connection_get(&conn.flags) + if not conn.ptr: + raise RPCError(msg="Failed to open connection to slurmdbd") + + return conn + + def close(self): + """Close the current connection.""" + if self.is_open: + slurmdb_connection_close(&self.ptr) + self.ptr = NULL + + def commit(self): + """Commit recent changes. + + Raises: + RPCError: When committing the changes fails + """ + if slurmdb_connection_commit(self.ptr, 1) == slurm.SLURM_ERROR: + raise RPCError(msg="Failed to commit database changes.") + + def rollback(self): + """Rollback recent changes. + + Raises: + RPCError: When rolling back the changes fails + """ + if slurmdb_connection_commit(self.ptr, 0) == slurm.SLURM_ERROR: + raise RPCError(msg="Failed to rollback database changes.") + + @property + def is_open(self): + """bool: Whether the connection is open or closed.""" + if self.ptr: + return True + else: + return False diff --git a/pyslurm/core/db/job.pxd b/pyslurm/core/db/job.pxd new file mode 100644 index 00000000..2b220a05 --- /dev/null +++ b/pyslurm/core/db/job.pxd @@ -0,0 +1,279 @@ +######################################################################### +# job.pxd - pyslurm slurmdbd job api +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +#
(at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + slurmdb_job_rec_t, + slurmdb_job_cond_t, + slurmdb_step_rec_t, + slurmdb_jobs_get, + slurmdb_destroy_job_cond, + slurmdb_destroy_job_rec, + slurmdb_destroy_step_rec, + slurm_destroy_selected_step, + slurm_selected_step_t, + slurm_list_create, + slurm_list_append, + try_xmalloc, + slurmdb_job_cond_def_start_end, + slurm_job_state_string, + slurm_job_reason_string, +) +from pyslurm.core.db.util cimport ( + SlurmList, + SlurmListItem, + make_char_list, +) +from pyslurm.core.db.step cimport JobStep, JobSteps +from pyslurm.core.db.stats cimport JobStats +from pyslurm.core.db.connection cimport Connection +from pyslurm.core.common cimport cstr +from pyslurm.core.db.qos cimport QualitiesOfService + + +cdef class JobSearchFilter: + """Search conditions for Slurm database Jobs. + + Args: + **kwargs: + Any valid attribute of the object. + + Attributes: + ids (list): + A list of Job ids to search for. + start_time (Union[str, int, datetime.datetime]): + Search for Jobs which started after this time. + end_time (Union[str, int, datetime.datetime]): + Search for Jobs which ended before this time. + accounts (list): + Search for Jobs with these account names. + association_ids (list): + Search for Jobs with these association ids. + clusters (list): + Search for Jobs running in these clusters. 
+ constraints (list): + Search for Jobs with these constraints. + cpus (int): + Search for Jobs with exactly this many CPUs. + Note: If you also specify max_cpus, then this value will act as + the minimum. + max_cpus (int): + Search for Jobs with no more than this amount of CPUs. + Note: This value has no effect without also setting cpus. + nodes (int): + Search for Jobs with exactly this many nodes. + Note: If you also specify max_nodes, then this value will act as + the minimum. + max_nodes (int): + Search for Jobs with no more than this amount of nodes. + Note: This value has no effect without also setting nodes. + qualities_of_service (list): + Search for Jobs with these Qualities of Service. + names (list): + Search for Jobs with these job names. + partitions (list): + Search for Jobs with these partition names. + groups (list): + Search for Jobs with these group names. You can specify the + groups either as strings or by their GID. + timelimit (Union[str, int]): + Search for Jobs with exactly this timelimit. + Note: If you also specify max_timelimit, then this value will act + as the minimum. + max_timelimit (Union[str, int]): + Search for Jobs which run no longer than this timelimit + Note: This value has no effect without also setting timelimit + users (list): + Search for Jobs with these user names. You can specify the + users either as strings or by their UID.
+ wckeys (list): + Search for Jobs with these WCKeys + nodelist (list): + Search for Jobs that ran on any of these Nodes + with_script (bool): + Instruct the slurmdbd to also send the job script(s) + Note: This requires explicitly specifying job ids, and is mutually + exclusive with with_env + with_env (bool): + Instruct the slurmdbd to also send the job environment(s) + Note: This requires explicitly specifying job ids, and is mutually + exclusive with with_script + """ + cdef slurmdb_job_cond_t *ptr + + cdef public: + ids + start_time + end_time + accounts + association_ids + clusters + constraints + cpus + max_cpus + nodes + max_nodes + qualities_of_service + names + partitions + groups + timelimit + max_timelimit + users + wckeys + nodelist + with_script + with_env + + +cdef class Jobs(dict): + """A collection of Database Jobs.""" + cdef: + SlurmList info + Connection db_conn + + +cdef class Job: + """A Slurm Database Job. + + Args: + job_id (int): + An Integer representing a Job-ID. + + Raises: + MemoryError: If malloc fails to allocate memory. + + Attributes: + steps (pyslurm.db.JobSteps): + Steps this Job has + stats (pyslurm.db.JobStats): + Utilization statistics of this Job + account (str): + Account of the Job. + admin_comment (str): + Admin comment for the Job. + num_nodes (int): + Amount of nodes this Job has allocated (if it is running) or + requested (if it is still pending). + array_id (int): + The master Array-Job ID. + array_tasks_parallel (int): + Max number of array tasks allowed to run simultaneously. + array_task_id (int): + Array Task ID of this Job if it is an Array-Job. + array_tasks_waiting (str): + Array Tasks that are still waiting. + association_id (int): + ID of the Association this job runs in.
+ block_id (str): + Name of the block used (for BlueGene Systems) + cluster (str): + Cluster this Job belongs to + constraints (str): + Constraints of the Job + container (str): + Path to OCI Container bundle + db_index (int): + Unique database index of the Job in the job table + derived_exit_code (int): + Highest exit code of all the Job steps + derived_exit_code_signal (int): + Signal of the derived exit code + comment (str): + Comment for the Job + elapsed_time (int): + Amount of seconds elapsed for the Job + eligible_time (int): + When the Job became eligible to run, as a unix timestamp + end_time (int): + When the Job ended, as a unix timestamp + exit_code (int): + Exit code of the job script or salloc. + exit_code_signal (int): + Signal of the exit code for this Job. + group_id (int): + ID of the group for this Job + group_name (str): + Name of the group for this Job + id (int): + ID of the Job + name (str): + Name of the Job + mcs_label (str): + MCS Label of the Job + nodelist (str): + Nodes this Job is using + partition (str): + Name of the Partition for this Job + priority (int): + Priority for the Job + quality_of_service (str): + Name of the Quality of Service for the Job + cpus (int): + Amount of CPUs the Job has/had allocated, or, if the Job is still + pending, this will reflect the amount requested. + memory (int): + Amount of memory the Job requested in total + reservation (str): + Name of the Reservation for this Job + script (str): + The batch script for this Job. 
+ Note: Only available if the "with_script" condition was given + start_time (int): + Time when the Job started, as a unix timestamp + state (str): + State of the Job + state_reason (str): + Last reason a Job was blocked from running + cancelled_by (str): + Name of the User who cancelled this Job + submit_time (int): + Time the Job was submitted, as a unix timestamp + submit_command (str): + Full command issued to submit the Job + suspended_time (int): + Amount of seconds the Job was suspended + system_comment (str): + Arbitrary System comment for the Job + time_limit (int): + Time limit of the Job in minutes + user_id (int): + UID of the User this Job belongs to + user_name (str): + Name of the User this Job belongs to + wckey (str): + Name of the WCKey for this Job + working_directory (str): + Working directory of the Job + """ + cdef: + slurmdb_job_rec_t *ptr + QualitiesOfService qos_data + + cdef public: + JobSteps steps + JobStats stats + + @staticmethod + cdef Job from_ptr(slurmdb_job_rec_t *in_ptr) diff --git a/pyslurm/core/db/job.pyx b/pyslurm/core/db/job.pyx new file mode 100644 index 00000000..d66f789e --- /dev/null +++ b/pyslurm/core/db/job.pyx @@ -0,0 +1,598 @@ +######################################################################### +# job.pyx - pyslurm slurmdbd job api +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from os import WIFSIGNALED, WIFEXITED, WTERMSIG, WEXITSTATUS +from pyslurm.core.error import RPCError +from pyslurm.core.db.tres cimport TrackableResources, TrackableResource +from pyslurm.core import slurmctld +from pyslurm.core.common.uint import * +from pyslurm.core.common.ctime import ( + date_to_timestamp, + timestr_to_mins, + _raw_time, +) +from pyslurm.core.common import ( + gid_to_name, + group_to_gid, + user_to_uid, + uid_to_name, + nodelist_to_range_str, + instance_to_dict, +) + + +cdef class JobSearchFilter: + + def __cinit__(self): + self.ptr = NULL + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + def __dealloc__(self): + self._dealloc() + + def _dealloc(self): + slurmdb_destroy_job_cond(self.ptr) + self.ptr = NULL + + def _alloc(self): + self._dealloc() + self.ptr = try_xmalloc(sizeof(slurmdb_job_cond_t)) + if not self.ptr: + raise MemoryError("xmalloc failed for slurmdb_job_cond_t") + + self.ptr.db_flags = slurm.SLURMDB_JOB_FLAG_NOTSET + self.ptr.flags |= slurm.JOBCOND_FLAG_NO_TRUNC + + def _parse_qos(self): + if not self.qualities_of_service: + return None + + qos_id_list = [] + qos = QualitiesOfService.load() + for q in self.qualities_of_service: + if isinstance(q, int): + qos_id_list.append(q) + elif q in qos: + qos_id_list.append(str(qos[q].id)) + else: + raise ValueError(f"QoS {q} does not exist") + + return qos_id_list + + def _parse_groups(self): + if not self.groups: + return None + + gid_list = [] + for group in self.groups: + if isinstance(group, int): + gid_list.append(group) + else: + gid_list.append(group_to_gid(group)) + + return gid_list + + def _parse_users(self): + if not self.users: + return 
None + + uid_list = [] + for user in self.users: + # UIDs (ints or numeric strings) are taken as-is, anything else is + # treated as a user name and resolved to its UID. + if isinstance(user, int) or (isinstance(user, str) + and user.isdigit()): + uid_list.append(int(user)) + else: + uid_list.append(user_to_uid(user)) + + return uid_list + + def _parse_clusters(self): + if not self.clusters: + # Get the local cluster name + # This is a requirement for some other parameters to function + # correctly, like self.nodelist + slurm_conf = slurmctld.Config.load() + return [slurm_conf.cluster] + elif self.clusters == "all": + return None + else: + return self.clusters + + def _parse_state(self): + # TODO: implement + return None + + def _create(self): + self._alloc() + cdef: + slurmdb_job_cond_t *ptr = self.ptr + slurm_selected_step_t *selected_step + + ptr.usage_start = date_to_timestamp(self.start_time) + ptr.usage_end = date_to_timestamp(self.end_time) + slurmdb_job_cond_def_start_end(ptr) + ptr.cpus_min = u32(self.cpus, on_noval=0) + ptr.cpus_max = u32(self.max_cpus, on_noval=0) + ptr.nodes_min = u32(self.nodes, on_noval=0) + ptr.nodes_max = u32(self.max_nodes, on_noval=0) + ptr.timelimit_min = u32(timestr_to_mins(self.timelimit), on_noval=0) + ptr.timelimit_max = u32(timestr_to_mins(self.max_timelimit), + on_noval=0) + make_char_list(&ptr.acct_list, self.accounts) + make_char_list(&ptr.associd_list, self.association_ids) + make_char_list(&ptr.cluster_list, self._parse_clusters()) + make_char_list(&ptr.constraint_list, self.constraints) + make_char_list(&ptr.jobname_list, self.names) + make_char_list(&ptr.groupid_list, self._parse_groups()) + make_char_list(&ptr.userid_list, self._parse_users()) + make_char_list(&ptr.wckey_list, self.wckeys) + make_char_list(&ptr.partition_list, self.partitions) + make_char_list(&ptr.qos_list, self._parse_qos()) + make_char_list(&ptr.state_list, self._parse_state()) + + if self.nodelist: + cstr.fmalloc(&ptr.used_nodes, + nodelist_to_range_str(self.nodelist)) + + if self.ids: + # These are only allowed by the slurmdbd when specific jobs are + # requested.
+ if self.with_script and self.with_env: + raise ValueError("with_script and with_env are mutually " + "exclusive") + + if self.with_script: + ptr.flags |= slurm.JOBCOND_FLAG_SCRIPT + elif self.with_env: + ptr.flags |= slurm.JOBCOND_FLAG_ENV + + ptr.step_list = slurm_list_create(slurm_destroy_selected_step) + already_added = [] + for i in self.ids: + job_id = u32(i) + if job_id in already_added: + continue + + selected_step = NULL + selected_step = try_xmalloc( + sizeof(slurm_selected_step_t)) + if not selected_step: + raise MemoryError("xmalloc failed for slurm_selected_step_t") + + selected_step.array_task_id = slurm.NO_VAL + selected_step.het_job_offset = slurm.NO_VAL + selected_step.step_id.step_id = slurm.NO_VAL + selected_step.step_id.job_id = job_id + slurm_list_append(ptr.step_list, selected_step) + already_added.append(job_id) + + +cdef class Jobs(dict): + + def __init__(self, *args, **kwargs): + # TODO: ability to initialize with existing job objects + pass + + @staticmethod + def load(search_filter=None): + """Load Jobs from the Slurm Database + + Implements the slurmdb_jobs_get RPC. + + Args: + search_filter (pyslurm.db.JobSearchFilter): + A search filter that the slurmdbd will apply when retrieving + Jobs from the database. + + Raises: + RPCError: When getting the Jobs from the Database was not + successful + """ + cdef: + Jobs jobs = Jobs() + Job job + JobSearchFilter cond + SlurmListItem job_ptr + QualitiesOfService qos_data + + if search_filter: + cond = search_filter + else: + cond = JobSearchFilter() + + cond._create() + jobs.db_conn = Connection.open() + jobs.info = SlurmList.wrap(slurmdb_jobs_get(jobs.db_conn.ptr, + cond.ptr)) + if jobs.info.is_null: + raise RPCError(msg="Failed to get Jobs from slurmdbd") + + qos_data = QualitiesOfService.load(name_is_key=False, + db_connection=jobs.db_conn) + + # TODO: also get trackable resources with slurmdb_tres_get and store + # it in each job instance.
tres_alloc_str and tres_req_str only + # contain the numeric tres ids, but it probably makes more sense to + # convert them to their type names for the user in advance. + + # TODO: For multi-cluster support, remove duplicate federation jobs + # TODO: How to handle the possibility of duplicate job ids that could + # appear if IDs on a cluster are reset? + for job_ptr in SlurmList.iter_and_pop(jobs.info): + job = Job.from_ptr(job_ptr.data) + job.qos_data = qos_data + job._create_steps() + JobStats._sum_step_stats_for_job(job, job.steps) + jobs[job.id] = job + + return jobs + + +cdef class Job: + + def __cinit__(self): + self.ptr = NULL + + def __init__(self, job_id): + self._alloc_impl() + self.ptr.jobid = int(job_id) + + def __dealloc__(self): + self._dealloc_impl() + + def _dealloc_impl(self): + slurmdb_destroy_job_rec(self.ptr) + self.ptr = NULL + + def _alloc_impl(self): + if not self.ptr: + self.ptr = try_xmalloc( + sizeof(slurmdb_job_rec_t)) + if not self.ptr: + raise MemoryError("xmalloc failed for slurmdb_job_rec_t") + + @staticmethod + cdef Job from_ptr(slurmdb_job_rec_t *in_ptr): + cdef Job wrap = Job.__new__(Job) + wrap.ptr = in_ptr + wrap.steps = JobSteps.__new__(JobSteps) + wrap.stats = JobStats() + return wrap + + @staticmethod + def load(job_id, with_script=False, with_env=False): + """Load the information for a specific Job from the Database. + + Args: + job_id (int): + ID of the Job to be loaded. + + Returns: + (pyslurm.db.Job): Returns a new Job instance + + Raises: + RPCError: If requesting the information for the database Job was + not successful.
+ """ + # Jobs is keyed by the integer job id (jobs[job.id]), so normalize + # job_id once and use the same value for the filter and the lookup. + job_id = int(job_id) + jfilter = JobSearchFilter(ids=[job_id], + with_script=with_script, with_env=with_env) + jobs = Jobs.load(jfilter) + if not jobs or job_id not in jobs: + raise RPCError(msg=f"Job {job_id} does not exist") + + return jobs[job_id] + + def _create_steps(self): + cdef: + JobStep step + SlurmList step_list + SlurmListItem step_ptr + + step_list = SlurmList.wrap(self.ptr.steps, owned=False) + for step_ptr in SlurmList.iter_and_pop(step_list): + step = JobStep.from_ptr(step_ptr.data) + self.steps[step.id] = step + + def as_dict(self): + """Database Job information formatted as a dictionary. + + Returns: + (dict): Database Job information as dict + """ + cdef dict out = instance_to_dict(self) + + if self.stats: + out["stats"] = self.stats.as_dict() + + steps = out.pop("steps", {}) + out["steps"] = {} + for step_id, step in steps.items(): + out["steps"][step_id] = step.as_dict() + + return out + + @property + def account(self): + return cstr.to_unicode(self.ptr.account) + + @property + def admin_comment(self): + return cstr.to_unicode(self.ptr.admin_comment) + + @property + def num_nodes(self): + val = TrackableResources.find_count_in_str(self.ptr.tres_alloc_str, + slurm.TRES_NODE) + if val is not None: + # Job is already running and has nodes allocated + return val + else: + # Job is still pending, so we return the number of requested nodes + # instead. + val = TrackableResources.find_count_in_str(self.ptr.tres_req_str, + slurm.TRES_NODE) + return val + + @property + def array_id(self): + return u32_parse(self.ptr.array_job_id) + + @property + def array_tasks_parallel(self): + return u32_parse(self.ptr.array_max_tasks) + + @property + def array_task_id(self): + return u32_parse(self.ptr.array_task_id) + + @property + def array_tasks_waiting(self): + task_str = cstr.to_unicode(self.ptr.array_task_str) + if not task_str: + return None + + if "%" in task_str: + # We don't want this % character and everything after it + # in here, so remove it.
+ task_str = task_str[:task_str.rindex("%")] + + return task_str + + @property + def association_id(self): + return u32_parse(self.ptr.associd) + + @property + def block_id(self): + return cstr.to_unicode(self.ptr.blockid) + + @property + def cluster(self): + return cstr.to_unicode(self.ptr.cluster) + + @property + def constraints(self): + return cstr.to_list(self.ptr.constraints) + + @property + def container(self): + # Documented as a str (path to the OCI bundle), so return it as one + # string instead of splitting it into a list. + return cstr.to_unicode(self.ptr.container) + + @property + def db_index(self): + return u64_parse(self.ptr.db_index) + + @property + def derived_exit_code(self): + if (self.ptr.derived_ec == slurm.NO_VAL + or not WIFEXITED(self.ptr.derived_ec)): + return None + + return WEXITSTATUS(self.ptr.derived_ec) + + @property + def derived_exit_code_signal(self): + if (self.ptr.derived_ec == slurm.NO_VAL + or not WIFSIGNALED(self.ptr.derived_ec)): + return None + + return WTERMSIG(self.ptr.derived_ec) + + @property + def comment(self): + return cstr.to_unicode(self.ptr.derived_es) + + @property + def elapsed_time(self): + return _raw_time(self.ptr.elapsed) + + @property + def eligible_time(self): + return _raw_time(self.ptr.eligible) + + @property + def end_time(self): + return _raw_time(self.ptr.end) + + @property + def exit_code(self): + # TODO + return 0 + + @property + def exit_code_signal(self): + # TODO + return 0 + + # uint32_t flags + + # group_id and group_name are documented attributes of Job, so expose + # them as properties like the other fields. + @property + def group_id(self): + return u32_parse(self.ptr.gid, zero_is_noval=False) + + @property + def group_name(self): + return gid_to_name(self.ptr.gid) + + # uint32_t het_job_id + # uint32_t het_job_offset + + @property + def id(self): + return self.ptr.jobid + + @property + def name(self): + return cstr.to_unicode(self.ptr.jobname) + + # uint32_t lft + + @property + def mcs_label(self): + return cstr.to_unicode(self.ptr.mcs_label) + + @property + def nodelist(self): + return cstr.to_unicode(self.ptr.nodes) + + @property + def partition(self): + return cstr.to_unicode(self.ptr.partition) + + @property + def priority(self): + return
u32_parse(self.ptr.priority, zero_is_noval=False) + + @property + def quality_of_service(self): + _qos = self.qos_data.get(self.ptr.qosid, None) + if _qos: + return _qos.name + else: + return None + + @property + def cpus(self): + val = TrackableResources.find_count_in_str(self.ptr.tres_alloc_str, + slurm.TRES_CPU) + if val is not None: + # Job is already running and has cpus allocated + return val + else: + # Job is still pending, so we return the number of requested cpus + # instead. + return u32_parse(self.ptr.req_cpus) + + @property + def memory(self): + val = TrackableResources.find_count_in_str(self.ptr.tres_req_str, + slurm.TRES_MEM) + return val + + @property + def reservation(self): + return cstr.to_unicode(self.ptr.resv_name) + +# @property +# def reservation_id(self): +# return u32_parse(self.ptr.resvid) + + @property + def script(self): + return cstr.to_unicode(self.ptr.script) + + @property + def environment(self): + return cstr.to_dict(self.ptr.env, delim1="\n", delim2="=") + + @property + def start_time(self): + return _raw_time(self.ptr.start) + + @property + def state(self): + return cstr.to_unicode(slurm_job_state_string(self.ptr.state)) + + @property + def state_reason(self): + return cstr.to_unicode(slurm_job_reason_string + (self.ptr.state_reason_prev)) + + @property + def cancelled_by(self): + return uid_to_name(self.ptr.requid) + + @property + def submit_time(self): + return _raw_time(self.ptr.submit) + + @property + def submit_command(self): + return cstr.to_unicode(self.ptr.submit_line) + + @property + def suspended_time(self): + # Seconds the Job was suspended; elapsed_time already reports + # self.ptr.elapsed. + return _raw_time(self.ptr.suspended) + + @property + def system_comment(self): + return cstr.to_unicode(self.ptr.system_comment) + + @property + def time_limit(self): + # TODO: Perhaps we should just find out what the actual PartitionLimit + # is?
+ return _raw_time(self.ptr.timelimit, "PartitionLimit") + + @property + def user_id(self): + return u32_parse(self.ptr.uid, zero_is_noval=False) + + @property + def user_name(self): + # Theres also a ptr->user + # https://github.com/SchedMD/slurm/blob/6365a8b7c9480c48678eeedef99864d8d3b6a6b5/src/sacct/print.c#L1946 + return uid_to_name(self.ptr.uid) + + # TODO: used gres + + @property + def wckey(self): + return cstr.to_unicode(self.ptr.wckey) + +# @property +# def wckey_id(self): +# return u32_parse(self.ptr.wckeyid) + + @property + def working_directory(self): + return cstr.to_unicode(self.ptr.work_dir) + +# @property +# def tres_allocated(self): +# return TrackableResources.from_str(self.ptr.tres_alloc_str) + +# @property +# def tres_requested(self): +# return TrackableResources.from_str(self.ptr.tres_req_str) diff --git a/pyslurm/core/db/qos.pxd b/pyslurm/core/db/qos.pxd new file mode 100644 index 00000000..3ba59dc6 --- /dev/null +++ b/pyslurm/core/db/qos.pxd @@ -0,0 +1,65 @@ +######################################################################### +# qos.pxd - pyslurm slurmdbd qos api +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

# Declarations (C-level attributes and cdef methods) for the slurmdbd
# Quality of Service wrappers implemented in qos.pyx.

from pyslurm cimport slurm
from pyslurm.slurm cimport (
    slurmdb_qos_rec_t,
    slurmdb_qos_cond_t,
    slurmdb_destroy_qos_rec,
    slurmdb_destroy_qos_cond,
    slurmdb_qos_get,
    slurm_preempt_mode_num,
    try_xmalloc,
)
from pyslurm.core.db.util cimport (
    SlurmList,
    SlurmListItem,
    make_char_list,
)
from pyslurm.core.db.connection cimport Connection
from pyslurm.core.common cimport cstr


# dict subclass that holds QualityOfService objects.
cdef class QualitiesOfService(dict):
    cdef:
        # Wrapped list returned by slurmdb_qos_get.
        SlurmList info
        # Database connection that was used to load the records.
        Connection db_conn


# Search conditions used when querying QoS records.
cdef class QualityOfServiceSearchFilter:
    # C condition struct built from the public attributes below.
    cdef slurmdb_qos_cond_t *ptr

    # Untyped (Python object) attributes, settable from Python code.
    cdef public:
        names
        ids
        descriptions
        preempt_modes
        with_deleted


# Wrapper around a single slurmdb_qos_rec_t.
cdef class QualityOfService:
    cdef slurmdb_qos_rec_t *ptr

    # Wrap an existing record pointer without allocating a new one.
    @staticmethod
    cdef QualityOfService from_ptr(slurmdb_qos_rec_t *in_ptr)
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

from pyslurm.core.error import RPCError
from pyslurm.core.common import (
    instance_to_dict,
)


cdef class QualitiesOfService(dict):
    """Dictionary of QualityOfService objects loaded from slurmdbd."""

    def __init__(self):
        pass

    @staticmethod
    def load(search_filter=None, name_is_key=True, db_connection=None):
        """Load Quality of Service records from the Slurm database.

        Args:
            search_filter (QualityOfServiceSearchFilter, optional):
                Conditions the records have to match. An empty filter is
                used when this is omitted.
            name_is_key (bool, optional):
                When True (the default) the result is keyed by the QoS
                name, otherwise by the QoS id.
            db_connection (Connection, optional):
                Connection to use. When omitted, a new one is opened with
                Connection.open().

        Returns:
            (QualitiesOfService): The loaded records.

        Raises:
            RPCError: If slurmdb_qos_get did not return a list.
        """
        cdef:
            QualitiesOfService qos_dict = QualitiesOfService()
            QualityOfService qos
            QualityOfServiceSearchFilter cond
            SlurmListItem qos_ptr
            Connection conn = db_connection

        if search_filter:
            cond = search_filter
        else:
            cond = QualityOfServiceSearchFilter()

        # (Re)build the C condition struct from the filter's attributes.
        cond._create()
        qos_dict.db_conn = Connection.open() if not conn else conn
        qos_dict.info = SlurmList.wrap(slurmdb_qos_get(qos_dict.db_conn.ptr,
                                                       cond.ptr))
        if qos_dict.info.is_null:
            raise RPCError(msg="Failed to get QoS data from slurmdbd")

        for qos_ptr in SlurmList.iter_and_pop(qos_dict.info):
            # The list item carries an untyped pointer, so it has to be
            # cast explicitly before it can be wrapped.
            qos = QualityOfService.from_ptr(<slurmdb_qos_rec_t*>qos_ptr.data)
            if name_is_key:
                qos_dict[qos.name] = qos
            else:
                qos_dict[qos.id] = qos

        return qos_dict


cdef class QualityOfServiceSearchFilter:
    """Search conditions for querying Quality of Service records.

    Any keyword argument passed to the constructor is set as an attribute
    of the same name (names, ids, descriptions, preempt_modes,
    with_deleted).
    """

    def __cinit__(self):
        self.ptr = NULL

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __dealloc__(self):
        self._dealloc()

    def _dealloc(self):
        """Destroy the C condition struct and reset the pointer."""
        slurmdb_destroy_qos_cond(self.ptr)
        self.ptr = NULL

    def _alloc(self):
        """Allocate a fresh C condition struct, discarding any old one.

        Raises:
            MemoryError: If the allocation failed.
        """
        self._dealloc()
        self.ptr = <slurmdb_qos_cond_t*>try_xmalloc(sizeof(slurmdb_qos_cond_t))
        if not self.ptr:
            raise MemoryError("xmalloc failed for slurmdb_qos_cond_t")

    def _parse_preempt_modes(self):
        """Translate self.preempt_modes into a numeric bitmask.

        Returns:
            (int): 0 when no modes are set, the value itself when it is
                already an int, otherwise the OR-ed result of
                slurm_preempt_mode_num for every given mode name.

        Raises:
            ValueError: If a mode name is not known to Slurm.
        """
        if not self.preempt_modes:
            return 0

        if isinstance(self.preempt_modes, int):
            return self.preempt_modes

        modes = self.preempt_modes
        if isinstance(modes, str):
            # A single mode name; iterating it directly would walk over
            # its characters.
            modes = [modes]

        out = 0
        for mode in modes:
            _mode = slurm_preempt_mode_num(mode)
            if _mode == slurm.NO_VAL16:
                raise ValueError(f"Unknown preempt mode: {mode}")

            # NOTE(review): presumably OFF cannot be expressed as a bit in
            # the mask, which is why COND_OFF is used instead - confirm.
            if _mode == slurm.PREEMPT_MODE_OFF:
                _mode = slurm.PREEMPT_MODE_COND_OFF

            out |= _mode

        return out

    def _create(self):
        """Build the C condition struct from the Python attributes."""
        self._alloc()
        cdef slurmdb_qos_cond_t *ptr = self.ptr

        make_char_list(&ptr.name_list, self.names)
        make_char_list(&ptr.id_list, self.ids)
        make_char_list(&ptr.description_list, self.descriptions)
        ptr.preempt_mode = self._parse_preempt_modes()
        ptr.with_deleted = 1 if bool(self.with_deleted) else 0


cdef class QualityOfService:
    """A single Quality of Service record of the Slurm database."""

    def __cinit__(self):
        self.ptr = NULL

    def __init__(self, name=None):
        self._alloc_impl()
        self.name = name

    def __dealloc__(self):
        self._dealloc_impl()

    def _dealloc_impl(self):
        """Destroy the wrapped record and reset the pointer."""
        slurmdb_destroy_qos_rec(self.ptr)
        self.ptr = NULL

    def _alloc_impl(self):
        """Allocate the wrapped record if there is none yet.

        Raises:
            MemoryError: If the allocation failed.
        """
        if not self.ptr:
            self.ptr = <slurmdb_qos_rec_t*>try_xmalloc(
                    sizeof(slurmdb_qos_rec_t))
            if not self.ptr:
                raise MemoryError("xmalloc failed for slurmdb_qos_rec_t")

    @staticmethod
    cdef QualityOfService from_ptr(slurmdb_qos_rec_t *in_ptr):
        # __new__ is used so that __init__ does not allocate a second
        # record; the instance simply wraps in_ptr.
        cdef QualityOfService wrap = QualityOfService.__new__(QualityOfService)
        wrap.ptr = in_ptr
        return wrap

    def as_dict(self):
        """Database QualityOfService information formatted as a dictionary.

        Returns:
            (dict): Database QualityOfService information as dict
        """
        return instance_to_dict(self)

    @staticmethod
    def load(name):
        """Load the information for a specific Quality of Service.

        Args:
            name (str):
                Name of the Quality of Service to be loaded.

        Returns:
            (pyslurm.db.QualityOfService): Returns a new QualityOfService
                instance.

        Raises:
            RPCError: If requesting the information from the database was not
                successful.
        """
        qfilter = QualityOfServiceSearchFilter(names=[name])
        qos_data = QualitiesOfService.load(qfilter)
        if not qos_data or name not in qos_data:
            raise RPCError(msg=f"QualityOfService {name} does not exist")

        return qos_data[name]

    @property
    def name(self):
        return cstr.to_unicode(self.ptr.name)

    @name.setter
    def name(self, val):
        cstr.fmalloc(&self.ptr.name, val)

    @property
    def description(self):
        return cstr.to_unicode(self.ptr.description)

    @property
    def id(self):
        return self.ptr.id
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

from pyslurm cimport slurm
from pyslurm.slurm cimport (
    try_xmalloc,
    slurmdb_stats_t,
    slurmdb_job_rec_t,
)
from pyslurm.core.db.tres cimport TrackableResources
from pyslurm.core.db.step cimport JobStep, JobSteps
from pyslurm.core.db.job cimport Job
from pyslurm.core.common cimport cstr


cdef class JobStats:
    """Statistics for a Slurm Job or Step.

    Note:
        For more information also see the sacct manpage.

    Attributes:
        consumed_energy (int):
            Total amount of energy consumed, in joules
        elapsed_cpu_time (int):
            Total amount of time used (Elapsed time * cpu count) in seconds.
            This is not the real CPU-Efficiency, but rather the total amount
            of cpu-time the CPUs were occupied for
        avg_cpu_time (int):
            Average CPU-Time (System + User) in seconds of all tasks
        avg_cpu_frequency (int):
            Average weighted CPU-Frequency of all tasks, in Kilohertz
        avg_disk_read (int):
            Average number of bytes read by all tasks
        avg_disk_write (int):
            Average number of bytes written by all tasks
        avg_page_faults (int):
            Average number of page faults by all tasks
        avg_resident_memory (int):
            Average Resident Set Size (RSS) in bytes of all tasks
        avg_virtual_memory (int):
            Average Virtual Memory Size (VSZ) in bytes of all tasks
        max_disk_read (int):
            Highest peak number of bytes read by all tasks
        max_disk_read_node (str):
            Name of the Node where max_disk_read occurred
        max_disk_read_task (int):
            ID of the Task where max_disk_read occurred
        max_disk_write (int):
            Highest peak number of bytes written by all tasks
        max_disk_write_node (str):
            Name of the Node where max_disk_write occurred
        max_disk_write_task (int):
            ID of the Task where max_disk_write occurred
        max_page_faults (int):
            Highest peak number of page faults by all tasks
        max_page_faults_node (str):
            Name of the Node where max_page_faults occurred
        max_page_faults_task (int):
            ID of the Task where max_page_faults occurred
        max_resident_memory (int):
            Highest peak Resident Set Size (RSS) in bytes by all tasks
        max_resident_memory_node (str):
            Name of the Node where max_resident_memory occurred
        max_resident_memory_task (int):
            ID of the Task where max_resident_memory occurred
        max_virtual_memory (int):
            Highest peak Virtual Memory Size (VSZ) in bytes by all tasks
        max_virtual_memory_node (str):
            Name of the Node where max_virtual_memory occurred
        max_virtual_memory_task (int):
            ID of the Task where max_virtual_memory occurred
        min_cpu_time (int):
            Lowest CPU-Time (System + User) in seconds of all tasks
        min_cpu_time_node (str):
            Name of the Node where min_cpu_time occurred
        min_cpu_time_task (int):
            ID of the Task where min_cpu_time occurred
        total_cpu_time (int):
            Sum of user_cpu_time and system_cpu_time, in seconds
        user_cpu_time (int):
            Amount of Time spent in user space, in seconds
        system_cpu_time (int):
            Amount of Time spent in kernel space, in seconds
    """
    cdef slurmdb_job_rec_t *job

    # Untyped (Python object) attributes: the *_node and *_task values are
    # initialised to None by __init__, everything else to 0.
    cdef public:
        consumed_energy
        elapsed_cpu_time
        avg_cpu_time
        avg_cpu_frequency
        avg_disk_read
        avg_disk_write
        avg_page_faults
        avg_resident_memory
        avg_virtual_memory
        max_disk_read
        max_disk_read_node
        max_disk_read_task
        max_disk_write
        max_disk_write_node
        max_disk_write_task
        max_page_faults
        max_page_faults_node
        max_page_faults_task
        max_resident_memory
        max_resident_memory_node
        max_resident_memory_task
        max_virtual_memory
        max_virtual_memory_node
        max_virtual_memory_task
        min_cpu_time
        min_cpu_time_node
        min_cpu_time_task
        total_cpu_time
        user_cpu_time
        system_cpu_time

    # Build the statistics of a single step from its slurmdb_stats_t data.
    @staticmethod
    cdef JobStats from_step(JobStep step)
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

from pyslurm.core.common import (
    nodelist_from_range_str,
    instance_to_dict,
)


cdef class JobStats:

    def __init__(self):
        # Every statistic starts at zero so that the values of several
        # steps can be accumulated ...
        for attr in instance_to_dict(self):
            setattr(self, attr, 0)

        # ... except for the node/task locations, which are unknown until
        # they have been read from a step.
        self.max_disk_read_node = None
        self.max_disk_read_task = None
        self.max_disk_write_node = None
        self.max_disk_write_task = None
        self.max_page_faults_node = None
        self.max_page_faults_task = None
        self.max_resident_memory_node = None
        self.max_resident_memory_task = None
        self.max_virtual_memory_node = None
        self.max_virtual_memory_task = None
        self.min_cpu_time_node = None
        self.min_cpu_time_task = None

    def as_dict(self):
        """Statistics formatted as a dictionary.

        Returns:
            (dict): The statistics as a dict
        """
        return instance_to_dict(self)

    @staticmethod
    cdef JobStats from_step(JobStep step):
        """Create the statistics for a single Job step.

        Args:
            step (JobStep):
                The step whose slurmdb_stats_t data should be parsed.

        Returns:
            (JobStats): Statistics of the step. Values that are not
                present in the TRES usage strings are reported as 0.
        """
        cdef JobStats wrap = JobStats()
        if not &step.ptr.stats:
            return wrap

        cdef:
            list nodes = nodelist_from_range_str(
                    cstr.to_unicode(step.ptr.nodes))
            # CPU times found in the TRES usage strings are divided by
            # this value (presumably milliseconds -> seconds; confirm).
            cpu_time_adj = 1000
            slurmdb_stats_t *ptr = &step.ptr.stats

        if ptr.consumed_energy != slurm.NO_VAL64:
            wrap.consumed_energy = ptr.consumed_energy

        wrap.avg_cpu_time = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_ave, slurm.TRES_CPU) / cpu_time_adj

        elapsed = step.elapsed_time if step.elapsed_time else 0
        cpus = step.cpus if step.cpus else 0
        wrap.elapsed_cpu_time = elapsed * cpus

        ave_freq = int(ptr.act_cpufreq)
        if ave_freq != slurm.NO_VAL:
            wrap.avg_cpu_frequency = ptr.act_cpufreq

        wrap.avg_disk_read = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_ave, slurm.TRES_FS_DISK)
        wrap.avg_disk_write = TrackableResources.find_count_in_str(
                ptr.tres_usage_out_ave, slurm.TRES_FS_DISK)
        wrap.avg_page_faults = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_ave, slurm.TRES_PAGES)
        wrap.avg_resident_memory = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_ave, slurm.TRES_MEM)
        wrap.avg_virtual_memory = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_ave, slurm.TRES_VMEM)

        wrap.max_disk_read = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max, slurm.TRES_FS_DISK)
        max_disk_read_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_nodeid, slurm.TRES_FS_DISK)
        wrap.max_disk_read_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_taskid, slurm.TRES_FS_DISK)

        wrap.max_disk_write = TrackableResources.find_count_in_str(
                ptr.tres_usage_out_max, slurm.TRES_FS_DISK)
        max_disk_write_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_out_max_nodeid, slurm.TRES_FS_DISK)
        wrap.max_disk_write_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_out_max_taskid, slurm.TRES_FS_DISK)

        # The page fault maximum was previously never read, so it always
        # stayed at its initial value.
        wrap.max_page_faults = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max, slurm.TRES_PAGES)
        max_page_faults_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_nodeid, slurm.TRES_PAGES)
        wrap.max_page_faults_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_taskid, slurm.TRES_PAGES)

        wrap.max_resident_memory = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max, slurm.TRES_MEM)
        max_resident_memory_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_nodeid, slurm.TRES_MEM)
        wrap.max_resident_memory_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_taskid, slurm.TRES_MEM)

        wrap.max_virtual_memory = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max, slurm.TRES_VMEM)
        max_virtual_memory_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_nodeid, slurm.TRES_VMEM)
        wrap.max_virtual_memory_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_max_taskid, slurm.TRES_VMEM)

        wrap.min_cpu_time = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_min, slurm.TRES_CPU) / cpu_time_adj
        min_cpu_time_nodeid = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_min_nodeid, slurm.TRES_CPU)
        wrap.min_cpu_time_task = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_min_taskid, slurm.TRES_CPU)

        wrap.total_cpu_time = TrackableResources.find_count_in_str(
                ptr.tres_usage_in_tot, slurm.TRES_CPU)

        if nodes:
            # The node ids are used as indexes into the step's nodelist.
            wrap.max_disk_write_node = nodes[max_disk_write_nodeid]
            wrap.max_disk_read_node = nodes[max_disk_read_nodeid]
            wrap.max_page_faults_node = nodes[max_page_faults_nodeid]
            wrap.max_resident_memory_node = nodes[max_resident_memory_nodeid]
            wrap.max_virtual_memory_node = nodes[max_virtual_memory_nodeid]
            wrap.min_cpu_time_node = nodes[min_cpu_time_nodeid]

        if step.ptr.user_cpu_sec != slurm.NO_VAL64:
            wrap.user_cpu_time = step.ptr.user_cpu_sec

        if step.ptr.sys_cpu_sec != slurm.NO_VAL64:
            wrap.system_cpu_time = step.ptr.sys_cpu_sec

        return wrap

    @staticmethod
    def _sum_step_stats_for_job(Job job, JobSteps steps):
        """Aggregate the statistics of all steps into the Job's statistics.

        The result is written to job.stats in place; nothing is returned.

        Args:
            job (Job):
                The Job whose stats attribute receives the aggregation.
            steps (JobSteps):
                The steps of the Job.
        """
        cdef:
            JobStats job_stats = job.stats
            JobStats step_stats = None

        for step in steps.values():
            step_stats = step.stats

            job_stats.consumed_energy += step_stats.consumed_energy
            job_stats.avg_cpu_time += step_stats.avg_cpu_time
            job_stats.avg_cpu_frequency += step_stats.avg_cpu_frequency
            job_stats.avg_disk_read += step_stats.avg_disk_read
            job_stats.avg_disk_write += step_stats.avg_disk_write
            job_stats.avg_page_faults += step_stats.avg_page_faults

            if step_stats.max_disk_read >= job_stats.max_disk_read:
                job_stats.max_disk_read = step_stats.max_disk_read
                job_stats.max_disk_read_node = step_stats.max_disk_read_node
                job_stats.max_disk_read_task = step_stats.max_disk_read_task

            if step_stats.max_disk_write >= job_stats.max_disk_write:
                job_stats.max_disk_write = step_stats.max_disk_write
                job_stats.max_disk_write_node = step_stats.max_disk_write_node
                job_stats.max_disk_write_task = step_stats.max_disk_write_task

            if step_stats.max_page_faults >= job_stats.max_page_faults:
                job_stats.max_page_faults = step_stats.max_page_faults
                job_stats.max_page_faults_node = step_stats.max_page_faults_node
                job_stats.max_page_faults_task = step_stats.max_page_faults_task

            if step_stats.max_resident_memory >= job_stats.max_resident_memory:
                job_stats.max_resident_memory = step_stats.max_resident_memory
                job_stats.max_resident_memory_node = step_stats.max_resident_memory_node
                job_stats.max_resident_memory_task = step_stats.max_resident_memory_task
                # The job-level average is set to the highest step maximum.
                job_stats.avg_resident_memory = job_stats.max_resident_memory

            if step_stats.max_virtual_memory >= job_stats.max_virtual_memory:
                job_stats.max_virtual_memory = step_stats.max_virtual_memory
                job_stats.max_virtual_memory_node = step_stats.max_virtual_memory_node
                job_stats.max_virtual_memory_task = step_stats.max_virtual_memory_task
                job_stats.avg_virtual_memory = job_stats.max_virtual_memory

            # NOTE(review): this keeps the largest per-step minimum, not
            # the smallest one - confirm that this is intended.
            if step_stats.min_cpu_time >= job_stats.min_cpu_time:
                job_stats.min_cpu_time = step_stats.min_cpu_time
                job_stats.min_cpu_time_node = step_stats.min_cpu_time_node
                job_stats.min_cpu_time_task = step_stats.min_cpu_time_task

        if job.ptr.tot_cpu_sec != slurm.NO_VAL64:
            job_stats.total_cpu_time = job.ptr.tot_cpu_sec

        if job.ptr.user_cpu_sec != slurm.NO_VAL64:
            job_stats.user_cpu_time = job.ptr.user_cpu_sec

        if job.ptr.sys_cpu_sec != slurm.NO_VAL64:
            job_stats.system_cpu_time = job.ptr.sys_cpu_sec

        elapsed = job.elapsed_time if job.elapsed_time else 0
        cpus = job.cpus if job.cpus else 0
        job_stats.elapsed_cpu_time = elapsed * cpus

        # Only the CPU frequency is averaged over the number of steps; the
        # other avg_* values remain sums.
        step_count = len(steps)
        if step_count:
            job_stats.avg_cpu_frequency /= step_count
cdef class JobStep:
    """A Slurm Database Job-step.

    Attributes:
        stats (pyslurm.db.JobStats):
            Utilization statistics for this Step
        num_nodes (int):
            Amount of nodes this Step has allocated
        cpus (int):
            Amount of CPUs the Step has/had allocated
        memory (int):
            Amount of memory the Step requested
        container (str):
            Path to OCI Container bundle
        elapsed_time (int):
            Amount of seconds elapsed for the Step
        end_time (int):
            When the Step ended, as a unix timestamp
        eligible_time (int):
            When the Step became eligible to run, as a unix timestamp.
            The implementation reports the same value as start_time.
        start_time (int):
            Time when the Step started, as a unix timestamp
        exit_code (int):
            Exit code of the step. Not implemented yet, always None.
        ntasks (int):
            Number of tasks the Step uses
        cpu_frequency_min (str):
            Minimum CPU-Frequency requested for the Step
        cpu_frequency_max (str):
            Maximum CPU-Frequency requested for the Step
        cpu_frequency_governor (str):
            CPU-Frequency Governor requested for the Step
        nodelist (str):
            Nodes this Step is using
        id (Union[str, int]):
            ID of the Step
        job_id (int):
            ID of the Job this Step is a part of
        name (str):
            Name of the Step
        state (str):
            State of the Step
        cancelled_by (str):
            Name of the User who cancelled this Step
        submit_command (str):
            Full command issued to start the Step
        suspended_time (int):
            Amount of seconds the Step was suspended.
            NOTE(review): the implementation currently reads the elapsed
            field for this value - confirm.
    """
    # The wrapped database record; destroyed in __dealloc__.
    cdef slurmdb_step_rec_t *ptr
    cdef public JobStats stats

    # Wrap an existing record pointer and compute its statistics.
    @staticmethod
    cdef JobStep from_ptr(slurmdb_step_rec_t *step)
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

from os import WIFSIGNALED, WIFEXITED, WTERMSIG, WEXITSTATUS
from pyslurm.core.error import RPCError
from pyslurm.core.db.tres cimport TrackableResources, TrackableResource
from pyslurm.core.common.uint import *
from pyslurm.core.common.ctime import _raw_time
from pyslurm.core.common import (
    gid_to_name,
    uid_to_name,
    instance_to_dict,
)
from pyslurm.core.job.util import cpu_freq_int_to_str
from pyslurm.core.job.step import humanize_step_id


cdef class JobStep:

    def __cinit__(self):
        self.ptr = NULL

    def __init__(self):
        raise RuntimeError("You can not instantiate this class directly "
                           " at the moment")

    def __dealloc__(self):
        slurmdb_destroy_step_rec(self.ptr)
        self.ptr = NULL

    @staticmethod
    cdef JobStep from_ptr(slurmdb_step_rec_t *step):
        # __new__ bypasses __init__, which deliberately raises.
        cdef JobStep wrap = JobStep.__new__(JobStep)
        wrap.ptr = step
        wrap.stats = JobStats.from_step(wrap)
        return wrap

    def as_dict(self):
        """Database Job-step information formatted as a dictionary.

        Returns:
            (dict): Database Job-step information as dict, with the
                statistics stored as a nested dict under "stats".
        """
        cdef dict out = instance_to_dict(self)
        out["stats"] = self.stats.as_dict()
        return out

    @property
    def num_nodes(self):
        nnodes = u32_parse(self.ptr.nnodes)
        if not nnodes and self.ptr.tres_alloc_str:
            # Fall back to the node count of the allocated TRES.
            return TrackableResources.find_count_in_str(
                    self.ptr.tres_alloc_str, slurm.TRES_NODE)
        else:
            return nnodes

    @property
    def cpus(self):
        req_cpus = TrackableResources.find_count_in_str(
                self.ptr.tres_alloc_str, slurm.TRES_CPU)

        if req_cpus == slurm.INFINITE64:
            return 0

        return req_cpus
#       if req_cpus == slurm.INFINITE64 and step.job_ptr:
#           tres_alloc_str = cstr.to_unicode(step.job_ptr.tres_alloc_str)
#           req_cpus = TrackableResources.find_count_in_str(tres_alloc_str,
#                                                           slurm.TRES_CPU)
#           if not req_cpus:
#               tres_req_str = cstr.to_unicode(step.job_ptr.tres_req_str)
#               req_cpus = TrackableResources.find_count_in_str(tres_req_str,
#                                                               slurm.TRES_CPU)

    @property
    def memory(self):
        val = TrackableResources.find_count_in_str(self.ptr.tres_alloc_str,
                                                   slurm.TRES_MEM)
        return val

    # Only in Parent Job available:
    # resvcpu?

    @property
    def container(self):
        return cstr.to_unicode(self.ptr.container)

    @property
    def elapsed_time(self):
        # seconds
        return _raw_time(self.ptr.elapsed)

    @property
    def end_time(self):
        return _raw_time(self.ptr.end)

    @property
    def eligible_time(self):
        # Reported as the start time of the step.
        return _raw_time(self.ptr.start)

    @property
    def start_time(self):
        return _raw_time(self.ptr.start)

    @property
    def exit_code(self):
        # TODO
        return None

    @property
    def ntasks(self):
        return u32_parse(self.ptr.ntasks)

    @property
    def cpu_frequency_min(self):
        return cpu_freq_int_to_str(self.ptr.req_cpufreq_min)

    @property
    def cpu_frequency_max(self):
        return cpu_freq_int_to_str(self.ptr.req_cpufreq_max)

    @property
    def cpu_frequency_governor(self):
        return cpu_freq_int_to_str(self.ptr.req_cpufreq_gov)

    @property
    def nodelist(self):
        return cstr.to_unicode(self.ptr.nodes)

    @property
    def id(self):
        return humanize_step_id(self.ptr.step_id.step_id)

    @property
    def job_id(self):
        return self.ptr.step_id.job_id

    @property
    def name(self):
        return cstr.to_unicode(self.ptr.stepname)

#    @property
#    def distribution(self):
#        # ptr.task_dist
#        pass

    @property
    def state(self):
        return cstr.to_unicode(slurm_job_state_string(self.ptr.state))

    @property
    def cancelled_by(self):
        return uid_to_name(self.ptr.requid)

    @property
    def submit_command(self):
        return cstr.to_unicode(self.ptr.submit_line)

    @property
    def suspended_time(self):
        # Read the dedicated "suspended" field of the step record; using
        # "elapsed" here would just duplicate elapsed_time.
        return _raw_time(self.ptr.suspended)
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

# Declarations for the slurmdbd Trackable Resources (TRES) wrappers
# implemented in tres.pyx.

from pyslurm cimport slurm
from pyslurm.core.common cimport cstr
from libc.stdint cimport uint64_t
from pyslurm.slurm cimport (
    slurmdb_tres_rec_t,
    slurmdb_destroy_tres_rec,
    slurmdb_find_tres_count_in_string,
    try_xmalloc,
)


# dict subclass created from a TRES string.
cdef class TrackableResources(dict):
    # The TRES string this collection was created from.
    cdef public raw_str

    # Parse a TRES string into a collection.
    @staticmethod
    cdef TrackableResources from_str(char *tres_str)


# Wrapper around a single slurmdb_tres_rec_t.
cdef class TrackableResource:
    cdef slurmdb_tres_rec_t *ptr

    # Wrap an existing record pointer without allocating a new one.
    @staticmethod
    cdef TrackableResource from_ptr(slurmdb_tres_rec_t *in_ptr)
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+from pyslurm.core.common.uint import *
+
+
+cdef class TrackableResources(dict):
+    """A dict of TrackableResource objects, keyed by TRES id."""
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    cdef TrackableResources from_str(char *tres_str):
+        """Parse a TRES string into a TrackableResources collection.
+
+        Args:
+            tres_str (char*):
+                TRES string as handed out by Slurm.
+
+        Returns:
+            (TrackableResources): The collection. It is empty when the
+                string is NULL/empty, otherwise it holds one
+                TrackableResource per entry parsed from the string.
+        """
+        cdef:
+            TrackableResources tres_collection
+            TrackableResource tres
+            str raw_str = cstr.to_unicode(tres_str)
+            dict tres_dict
+
+        tres_collection = TrackableResources.__new__(TrackableResources)
+        if not raw_str:
+            return tres_collection
+
+        tres_collection.raw_str = raw_str
+        tres_dict = cstr.to_dict(tres_str)
+        for tres_id, val in tres_dict.items():
+            tres = TrackableResource(tres_id)
+            tres.ptr.count = val
+            # Actually store the record; previously it was built and then
+            # dropped on the next iteration.
+            tres_collection[tres_id] = tres
+
+        # Return the collection, not the last (possibly unbound) record.
+        return tres_collection
+
+    @staticmethod
+    def find_count_in_str(tres_str, typ):
+        """Look up the count of one TRES type inside a TRES string.
+
+        Args:
+            tres_str (str):
+                The TRES string to search.
+            typ:
+                TRES identifier passed through to
+                slurmdb_find_tres_count_in_string.
+
+        Returns:
+            (int): The count, or 0 if the string is empty or Slurm reports
+                INFINITE64 / NO_VAL64 for it.
+        """
+        if not tres_str:
+            return 0
+
+        cdef uint64_t tmp
+        tmp = slurmdb_find_tres_count_in_string(tres_str, typ)
+        if tmp == slurm.INFINITE64 or tmp == slurm.NO_VAL64:
+            return 0
+        else:
+            return tmp
+
+
+cdef class TrackableResource:
+    """Wrapper around a single slurmdb_tres_rec_t."""
+
+    def __cinit__(self):
+        self.ptr = NULL
+
+    def __init__(self, tres_id):
+        self._alloc_impl()
+        self.ptr.id = tres_id
+
+    def __dealloc__(self):
+        self._dealloc_impl()
+
+    def _alloc_impl(self):
+        # Allocate the record once; raises MemoryError if xmalloc fails.
+        if not self.ptr:
+            self.ptr = <slurmdb_tres_rec_t*>try_xmalloc(
+                    sizeof(slurmdb_tres_rec_t))
+            if not self.ptr:
+                raise MemoryError("xmalloc failed for slurmdb_tres_rec_t")
+
+    def _dealloc_impl(self):
+        # Hand the record back to Slurm and forget the pointer.
+        slurmdb_destroy_tres_rec(self.ptr)
+        self.ptr = NULL
+
+    @staticmethod
+    cdef TrackableResource from_ptr(slurmdb_tres_rec_t *in_ptr):
+        # Wrap an existing pointer without copying it.
+        # NOTE(review): __dealloc__ destroys the wrapped pointer, so callers
+        # presumably must hand over ownership - confirm at the call sites.
+        cdef TrackableResource wrap = TrackableResource.__new__(TrackableResource)
+        wrap.ptr = in_ptr
+        return wrap
+
+    @property
+    def id(self):
+        return self.ptr.id
+
+    @property
+    def name(self):
+        return cstr.to_unicode(self.ptr.name)
+
+    @property
+    def type(self):
+        return cstr.to_unicode(self.ptr.type)
+
+    @property
+    def count(self):
+        return u64_parse(self.ptr.count)
+
+    # rec_count
+    # alloc_secs
diff --git a/pyslurm/core/db/util.pxd b/pyslurm/core/db/util.pxd
new file mode
100644 index 00000000..deb71ed4 --- /dev/null +++ b/pyslurm/core/db/util.pxd @@ -0,0 +1,65 @@ +######################################################################### +# util.pxd - pyslurm slurmdbd util functions +######################################################################### +# Copyright (C) 2022 Toni Harzendorf +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+from pyslurm cimport slurm
+from pyslurm.core.common cimport cstr
+from pyslurm.slurm cimport (
+    ListIterator,
+    List,
+    slurm_list_iterator_create,
+    slurm_list_iterator_destroy,
+    slurm_list_iterator_reset,
+    slurm_list_count,
+    slurm_list_next,
+    slurm_list_destroy,
+    slurm_list_create,
+    slurm_list_pop,
+    slurm_list_append,
+    slurm_xfree_ptr,
+)
+
+# Convert a slurm List into a Python list.
+cdef slurm_list_to_pylist(List in_list)
+# Replace the List behind in_list with a new one built from vals.
+cdef make_char_list(List *in_list, vals)
+
+
+# Wrapper around a single, untyped element of a slurm List.
+cdef class SlurmListItem:
+    # Raw pointer to the list element.
+    cdef void *data
+
+    @staticmethod
+    cdef SlurmListItem from_ptr(void *item)
+
+
+# Wrapper around a slurm List and an iterator over it.
+cdef class SlurmList:
+    cdef:
+        # The wrapped slurm List and the iterator currently in use (if any).
+        List info
+        ListIterator itr
+
+    cdef readonly:
+        # owned: whether this wrapper destroys the List on deallocation.
+        # itr_cnt: number of items handed out by the current iteration.
+        # cnt: number of items tracked for the list.
+        owned
+        int itr_cnt
+        int cnt
+
+    # Wrap an existing List.
+    @staticmethod
+    cdef SlurmList wrap(List, owned=*)
+
+    # Create a new, empty List using the given delete function.
+    @staticmethod
+    cdef SlurmList create(slurm.ListDelF delf, owned=*)
diff --git a/pyslurm/core/db/util.pyx b/pyslurm/core/db/util.pyx
new file mode 100644
index 00000000..2560c4b0
--- /dev/null
+++ b/pyslurm/core/db/util.pyx
@@ -0,0 +1,188 @@
+#########################################################################
+# util.pyx - pyslurm slurmdbd util functions
+#########################################################################
+# Copyright (C) 2023 Toni Harzendorf 
+#
+# This file is part of PySlurm
+#
+# PySlurm is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# PySlurm is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with PySlurm; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+
+cdef make_char_list(List *in_list, vals):
+    """Replace the List behind in_list with a char* list built from vals.
+
+    Nothing happens (and the existing list is kept) when vals is empty.
+    """
+    if not vals:
+        return None
+
+    # Make a new SlurmList wrapper with the values
+    cdef SlurmList slist = SlurmList(vals)
+
+    # Make sure the previous list is deallocated
+    if in_list[0]:
+        slurm_list_destroy(in_list[0])
+
+    # Assign the pointer from slist to in_list, and give up ownership of slist
+    in_list[0] = slist.info
+    slist.owned = False
+
+
+cdef slurm_list_to_pylist(List in_list):
+    """Convert a slurm char* List to a Python list, without owning it."""
+    return SlurmList.wrap(in_list, owned=False).to_pylist()
+
+
+cdef class SlurmListItem:
+    """A single, untyped element taken from a slurm List."""
+
+    def __cinit__(self):
+        self.data = NULL
+
+    @staticmethod
+    cdef SlurmListItem from_ptr(void *item):
+        cdef SlurmListItem wrap = SlurmListItem.__new__(SlurmListItem)
+        wrap.data = item
+        return wrap
+
+    @property
+    def has_data(self):
+        """bool: Whether this item points at any data."""
+        if self.data:
+            return True
+        else:
+            return False
+
+    def to_str(self):
+        # Mostly for debugging purposes. Can only be used "safely" if we have
+        # a char* list
+        cdef char* entry = <char*>self.data
+        return cstr.to_unicode(entry)
+
+
+cdef class SlurmList:
+    """Convenience Wrapper around slurms List type"""
+    def __cinit__(self):
+        self.info = NULL
+        self.itr = NULL
+        self.itr_cnt = 0
+        self.cnt = 0
+        self.owned = True
+
+    def __init__(self, vals=None):
+        # New char* list whose entries are released via slurm_xfree_ptr.
+        self.info = slurm_list_create(slurm_xfree_ptr)
+        self.append(vals)
+
+    def __dealloc__(self):
+        self._dealloc_itr()
+        self._dealloc_list()
+
+    def _dealloc_list(self):
+        # Only destroy the list if this wrapper owns it.
+        if self.info is not NULL and self.owned:
+            slurm_list_destroy(self.info)
+            self.cnt = 0
+            self.info = NULL
+
+    def _dealloc_itr(self):
+        if self.itr:
+            slurm_list_iterator_destroy(self.itr)
+            self.itr_cnt = 0
+            self.itr = NULL
+
+    def __iter__(self):
+        # Always start over with a fresh iterator.
+        self._dealloc_itr()
+        if not self.is_null:
+            self.itr = slurm_list_iterator_create(self.info)
+
+        return self
+
+    def __next__(self):
+        if self.is_null or self.is_itr_null:
+            raise StopIteration
+
+        if self.itr_cnt < self.cnt:
+            self.itr_cnt += 1
+            return SlurmListItem.from_ptr(slurm_list_next(self.itr))
+
+        # Exhausted: release the iterator before signalling the end.
+        self._dealloc_itr()
+        raise StopIteration
+
+    @staticmethod
+    def iter_and_pop(SlurmList li):
+        """Yield every item of li while popping it off the list."""
+        while li.cnt > 0:
+            yield SlurmListItem.from_ptr(slurm_list_pop(li.info))
+            li.cnt -= 1
+
+    @staticmethod
+    cdef SlurmList create(slurm.ListDelF delfunc, owned=True):
+        cdef SlurmList wrapper = SlurmList.__new__(SlurmList)
+        wrapper.info = slurm_list_create(delfunc)
+        wrapper.owned = owned
+        return wrapper
+
+    @staticmethod
+    cdef SlurmList wrap(List li, owned=True):
+        cdef SlurmList wrapper = SlurmList.__new__(SlurmList)
+        if not li:
+            return wrapper
+
+        wrapper.info = li
+        wrapper.cnt = slurm_list_count(li)
+        wrapper.owned = owned
+        return wrapper
+
+    def to_pylist(self):
+        """Return the entries of a char* list as a Python list.
+
+        Entries consisting only of digits are converted to int, empty
+        entries are skipped.
+        """
+        cdef:
+            SlurmListItem item
+            list out = []
+
+        for item in self:
+            if not item.has_data:
+                continue
+
+            pystr = cstr.to_unicode(<char*>item.data)
+            if pystr:
+                out.append(int(pystr) if pystr.isdigit() else pystr)
+
+        return out
+
+    def append(self, vals):
+        """Append one or more values to the list as strings.
+
+        Args:
+            vals:
+                A single value, or a list/tuple/set of values. Each value
+                is converted with str(). None, False and empty strings are
+                skipped; the number 0 is a valid value and is appended.
+        """
+        cdef char *entry = NULL
+
+        if vals is None:
+            return None
+
+        to_add = vals
+        if not isinstance(vals, (list, tuple, set, frozenset)):
+            # A single value: anything that can't be casted to str will
+            # error below anyways
+            to_add = [vals]
+
+        for val in to_add:
+            # Skip only "no value" markers. A plain truthiness test would
+            # also drop 0 (e.g. uid 0 or job id 0).
+            if val is None or val is False or val == "":
+                continue
+
+            entry = NULL
+            cstr.fmalloc(&entry, str(val))
+            slurm_list_append(self.info, entry)
+            self.cnt += 1
+
+    @property
+    def is_itr_null(self):
+        """bool: Whether no iterator is currently active."""
+        if not self.itr:
+            return True
+        else:
+            return False
+
+    @property
+    def is_null(self):
+        """bool: Whether no slurm List is wrapped."""
+        if not self.info:
+            return True
+        else:
+            return False
diff --git a/pyslurm/core/error.pyx b/pyslurm/core/error.pyx
new file mode 100644
index 00000000..69130abd
--- /dev/null
+++ b/pyslurm/core/error.pyx
@@ -0,0 +1,100 @@
+#########################################################################
+# error.pyx - pyslurm error utilities
+#########################################################################
+# Copyright (C) 2022 Toni Harzendorf 
+#
+# This file is part of PySlurm
+#
+# PySlurm is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# PySlurm is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with PySlurm; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+from pyslurm.core.common cimport cstr
+from pyslurm cimport slurm
+from pyslurm.slurm cimport slurm_get_errno
+
+
+def slurm_strerror(errno):
+    """Convert a slurm errno to a string.
+
+    Args:
+        errno (int):
+            The error number for which the string representation should be
+            returned.
+
+    Returns:
+        (str): String representation of errno.
+    """
+    return cstr.to_unicode(slurm.slurm_strerror(errno))
+
+
+def slurm_errno():
+    """Get the current slurm errno.
+
+    Returns:
+        (int): Current slurm errno
+    """
+    return slurm_get_errno()
+
+
+def get_last_slurm_error():
+    """Get the last slurm error that occurred as a tuple of errno and string.
+
+    Returns:
+        errno (int): The error number
+        errno_str (str): The errno converted to a String
+    """
+    errno = slurm_errno()
+
+    if errno == slurm.SLURM_SUCCESS:
+        return (errno, 'Success')
+    else:
+        return (errno, slurm_strerror(errno))
+
+
+class RPCError(Exception):
+    """Exception for handling Slurm RPC errors.
+
+    Args:
+        errno (int):
+            A slurm error number returned by RPC functions. Default is
+            slurm.SLURM_ERROR, which will get the last slurm error
+            automatically.
+        msg (str):
+            An optional, custom error description. If this is set, the errno
+            will not be translated to its string representation.
+    """
+    def __init__(self, errno=slurm.SLURM_ERROR, msg=None):
+        self.msg = msg
+        self.errno = errno
+
+        if not msg:
+            if errno == slurm.SLURM_ERROR:
+                # Generic failure: look up what actually went wrong.
+                self.errno, self.msg = get_last_slurm_error()
+            else:
+                self.msg = slurm_strerror(errno)
+
+        super().__init__(self.msg)
+
+
+def verify_rpc(errno):
+    """Verify a Slurm RPC
+
+    Args:
+        errno (int):
+            A Slurm error value
+
+    Raises:
+        RPCError: If errno is not slurm.SLURM_SUCCESS.
+    """
+    if errno != slurm.SLURM_SUCCESS:
+        raise RPCError(errno)
diff --git a/pyslurm/core/job/__init__.pxd b/pyslurm/core/job/__init__.pxd
new file mode 100644
index 00000000..e69de29b
diff --git a/pyslurm/core/job/__init__.py b/pyslurm/core/job/__init__.py
new file mode 100644
index 00000000..ccc396e2
--- /dev/null
+++ b/pyslurm/core/job/__init__.py
@@ -0,0 +1,3 @@
+from .job import Job, Jobs
+from .step import JobStep, JobSteps
+from .submission import JobSubmitDescription
diff --git a/pyslurm/core/job/job.pxd b/pyslurm/core/job/job.pxd
new file mode 100644
index 00000000..c41c8ced
--- /dev/null
+++ b/pyslurm/core/job/job.pxd
@@ -0,0 +1,387 @@
+#########################################################################
+# job.pxd - interface to retrieve slurm job informations
+#########################################################################
+# Copyright (C) 2023 Toni Harzendorf 
+#
+# This file is part of PySlurm
+#
+# PySlurm is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# PySlurm is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with PySlurm; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+from pyslurm.core.common cimport cstr, ctime
+from pyslurm.core.common.uint cimport *
+from pyslurm.core.common.ctime cimport time_t
+
+from libc.string cimport memcpy, memset
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t
+from libc.stdlib cimport free
+
+from pyslurm.core.job.submission cimport JobSubmitDescription
+from pyslurm.core.job.step cimport JobSteps, JobStep
+
+from pyslurm cimport slurm
+from pyslurm.slurm cimport (
+    working_cluster_rec,
+    slurm_msg_t,
+    job_id_msg_t,
+    slurm_msg_t_init,
+    return_code_msg_t,
+    slurm_send_recv_controller_msg,
+    slurm_free_return_code_msg,
+    slurm_free_job_info_msg,
+    slurm_free_job_info,
+    slurm_load_job,
+    slurm_load_jobs,
+    job_info_msg_t,
+    slurm_job_info_t,
+    slurm_job_state_string,
+    slurm_job_reason_string,
+    slurm_job_share_string,
+    slurm_job_batch_script,
+    slurm_get_job_stdin,
+    slurm_get_job_stdout,
+    slurm_get_job_stderr,
+    slurm_signal_job,
+    slurm_kill_job,
+    slurm_resume,
+    slurm_suspend,
+    slurm_update_job,
+    slurm_notify_job,
+    slurm_requeue,
+    xfree,
+    try_xmalloc,
+)
+
+
+cdef class Jobs(dict):
+    """A collection of Job objects.
+
+    Args:
+        jobs (Union[list, dict], optional):
+            Jobs to initialize this collection with.
+        freeze (bool, optional):
+            Control whether this collection is "frozen" when reloading Job
+            information.
+
+    Attributes:
+        memory (int):
+            Total amount of memory for all Jobs in this collection, in
+            Mebibytes
+        cpus (int):
+            Total amount of cpus for all Jobs in this collection.
+        ntasks (int):
+            Total amount of tasks for all Jobs in this collection.
+        cpu_time (int):
+            Total amount of CPU-Time used by all the Jobs in the collection.
+            This is the result of multiplying the run_time with the amount of
+            cpus for each job.
+        freeze (bool):
+            If this is set to True and the reload() method is called, then
+            *ONLY* Jobs that already exist in this collection will be
+            reloaded. New Jobs that are discovered will not be added to this
+            collection, but old Jobs which have already been purged from the
+            Slurm controllers memory will not be removed either.
+            The default is False, so old jobs will be removed, and new Jobs
+            will be added - basically the same behaviour as doing Jobs.load().
+    """
+    cdef:
+        # Message returned by slurm_load_jobs, and a scratch job record.
+        job_info_msg_t *info
+        slurm_job_info_t tmp_info
+
+    cdef public:
+        freeze
+
+
+cdef class Job:
+    """A Slurm Job.
+
+    All attributes in this class are read-only.
+
+    Args:
+        job_id (int):
+            An Integer representing a Job-ID.
+
+    Raises:
+        MemoryError: If malloc fails to allocate memory.
+
+    Attributes:
+        steps (JobSteps):
+            Steps this Job has.
+            Before you can access the Steps data for a Job, you have to call
+            the reload() method of a Job instance or the load_steps() method
+            of a Jobs collection.
+        name (str):
+            Name of the Job
+        id (int):
+            Unique ID of the Job.
+        association_id (int):
+            ID of the Association this Job runs with.
+        account (str):
+            Name of the Account this Job runs with.
+        user_id (int):
+            UID of the User who submitted the Job.
+        user_name (str):
+            Name of the User who submitted the Job.
+        group_id (int):
+            GID of the Group that Job runs under.
+        group_name (str):
+            Name of the Group this Job runs under.
+        priority (int):
+            Priority of the Job.
+        nice (int):
+            Nice Value of the Job.
+        qos (str):
+            QOS Name of the Job.
+        min_cpus_per_node (int):
+            Minimum Amount of CPUs per Node the Job requested.
+        state (str):
+            State this Job is currently in.
+        state_reason (str):
+            A Reason explaining why the Job is in its current state.
+        is_requeueable (bool):
+            Whether the Job is requeueable or not.
+        requeue_count (int):
+            Amount of times the Job has been requeued.
+        is_batch_job (bool):
+            Whether the Job is a batch job or not.
+        node_reboot_required (bool):
+            Whether the Job requires the Nodes to be rebooted first.
+        dependencies (dict):
+            Dependencies the Job has to other Jobs.
+        time_limit (int):
+            Time-Limit, in minutes, for this Job.
+        time_limit_min (int):
+            Minimum Time-Limit in minutes for this Job.
+        submit_time (int):
+            Time the Job was submitted, as unix timestamp.
+        eligible_time (int):
+            Time the Job is eligible to start, as unix timestamp.
+        accrue_time (int):
+            Job accrue time, as unix timestamp
+        start_time (int):
+            Time this Job has started execution, as unix timestamp.
+        resize_time (int):
+            Time the job was resized, as unix timestamp.
+        deadline (int):
+            Time when a pending Job will be cancelled, as unix timestamp.
+        preempt_eligible_time (int):
+            Time the Job is eligible for preemption, as unix timestamp.
+        preempt_time (int):
+            Time the Job was signaled for preemption, as unix timestamp.
+        suspend_time (int):
+            Last Time the Job was suspended, as unix timestamp.
+        last_sched_evaluation_time (int):
+            Last time evaluated for Scheduling, as unix timestamp.
+        pre_suspension_time (int):
+            Amount of seconds the Job ran prior to suspension, as unix
+            timestamp
+        mcs_label (str):
+            MCS Label for the Job
+        partition (str):
+            Name of the Partition the Job runs in.
+        submit_host (str):
+            Name of the Host this Job was submitted from.
+        batch_host (str):
+            Name of the Host where the Batch-Script is executed.
+        num_nodes (int):
+            Amount of Nodes the Job has requested or allocated.
+        max_nodes (int):
+            Maximum amount of Nodes the Job has requested.
+        allocated_nodes (str):
+            Nodes the Job is currently using.
+            This is only valid when the Job is running. If the Job is pending,
+            it will always return None.
+        required_nodes (str):
+            Nodes the Job is explicitly requiring to run on.
+        excluded_nodes (str):
+            Nodes that are explicitly excluded for execution.
+        scheduled_nodes (str):
+            Nodes the Job is scheduled on by the slurm controller.
+        derived_exit_code (int):
+            The derived exit code for the Job.
+        derived_exit_code_signal (int):
+            Signal for the derived exit code.
+        exit_code (int):
+            Code with which the Job has exited.
+        exit_code_signal (int):
+            The signal which has led to the exit code of the Job.
+        batch_constraints (list):
+            Features that node(s) should have for the batch script.
+            Controls where it is possible to execute the batch-script of the
+            job. Also see 'constraints'
+        federation_origin (str):
+            Federation Origin
+        federation_siblings_active (int):
+            Federation siblings active
+        federation_siblings_viable (int):
+            Federation siblings viable
+        cpus (int):
+            Total amount of CPUs the Job is using.
+            If the Job is still pending, this will be the amount of requested
+            CPUs.
+        cpus_per_task (int):
+            Number of CPUs per Task used.
+        cpus_per_gpu (int):
+            Number of CPUs per GPU used.
+        boards_per_node (int):
+            Number of boards per Node.
+        sockets_per_board (int):
+            Number of sockets per board.
+        sockets_per_node (int):
+            Number of sockets per node.
+        cores_per_socket (int):
+            Number of cores per socket.
+        threads_per_core (int):
+            Number of threads per core.
+        ntasks (int):
+            Number of parallel processes.
+        ntasks_per_node (int):
+            Number of parallel processes per node.
+        ntasks_per_board (int):
+            Number of parallel processes per board.
+        ntasks_per_socket (int):
+            Number of parallel processes per socket.
+        ntasks_per_core (int):
+            Number of parallel processes per core.
+        ntasks_per_gpu (int):
+            Number of parallel processes per GPU.
+        delay_boot_time (int):
+            https://slurm.schedmd.com/sbatch.html#OPT_delay-boot, in minutes
+        constraints (list):
+            A list of features the Job requires nodes to have.
+            In contrast, the 'batch_constraints' option only focuses on the
+            initial batch-script placement. This option however means features
+            to restrict the list of nodes a job is able to execute on in
+            general beyond the initial batch-script.
+        cluster (str):
+            Name of the cluster the job is executing on.
+        cluster_constraints (list):
+            A List of features that a cluster should have.
+        reservation (str):
+            Name of the reservation this Job uses.
+        resource_sharing (str):
+            Mode controlling how a job shares resources with others.
+        requires_contiguous_nodes (bool):
+            Whether the Job has allocated a set of contiguous nodes.
+        licenses (list):
+            List of licenses the Job needs.
+        network (str):
+            Network specification for the Job.
+        command (str):
+            The command that is executed for the Job.
+        working_directory (str):
+            Path to the working directory for this Job.
+        admin_comment (str):
+            An arbitrary comment set by an administrator for the Job.
+        system_comment (str):
+            An arbitrary comment set by the slurmctld for the Job.
+        container (str):
+            The container this Job uses.
+        comment (str):
+            An arbitrary comment set for the Job.
+        standard_input (str):
+            The path to the file for the standard input stream.
+        standard_output (str):
+            The path to the log file for the standard output stream.
+        standard_error (str):
+            The path to the log file for the standard error stream.
+        required_switches (int):
+            Number of switches required.
+        max_wait_time_switches (int):
+            Amount of seconds to wait for the switches.
+        burst_buffer (str):
+            Burst buffer specification
+        burst_buffer_state (str):
+            Burst buffer state
+        cpu_frequency_min (Union[str, int]):
+            Minimum CPU-Frequency requested.
+        cpu_frequency_max (Union[str, int]):
+            Maximum CPU-Frequency requested.
+        cpu_frequency_governor (Union[str, int]):
+            CPU-Frequency Governor requested.
+        wckey (str):
+            Name of the WCKey this Job uses.
+        mail_user (list):
+            Users that should receive Mails for this Job.
+        mail_types (list):
+            Mail Flags specified by the User.
+        heterogeneous_id (int):
+            Heterogeneous job id.
+        heterogeneous_offset (int):
+            Heterogeneous job offset.
+        temporary_disk_per_node (int):
+            Temporary disk space in Mebibytes available per Node.
+        array_id (int):
+            The master Array-Job ID.
+        array_tasks_parallel (int):
+            Max number of array tasks allowed to run simultaneously.
+        array_task_id (int):
+            Array Task ID of this Job if it is an Array-Job.
+        array_tasks_waiting (str):
+            Array Tasks that are still waiting.
+        end_time (int):
+            Time at which this Job will end, as unix timestamp.
+        run_time (int):
+            Amount of seconds the Job has been running.
+        cores_reserved_for_system (int):
+            Amount of cores reserved for System use only.
+        threads_reserved_for_system (int):
+            Amount of Threads reserved for System use only.
+        memory (int):
+            Total Amount of Memory this Job has, in Mebibytes
+        memory_per_cpu (int):
+            Amount of Memory per CPU this Job has, in Mebibytes
+        memory_per_node (int):
+            Amount of Memory per Node this Job has, in Mebibytes
+        memory_per_gpu (int):
+            Amount of Memory per GPU this Job has, in Mebibytes
+        gres_per_node (dict):
+            Generic Resources (e.g. GPU) this Job is using per Node.
+        profile_types (list):
+            Types for which detailed accounting data is collected.
+        gres_binding (str):
+            Binding Enforcement of a Generic Resource (e.g. GPU).
+        kill_on_invalid_dependency (bool):
+            Whether the Job should be killed on an invalid dependency.
+        spreads_over_nodes (bool):
+            Whether the Job should be spread over as many nodes as possible.
+        power_options (list):
+            Options set for Power Management.
+        is_cronjob (bool):
+            Whether this Job is a cronjob.
+        cronjob_time (str):
+            The time specification for the Cronjob.
+        cpu_time (int):
+            Amount of CPU-Time used by the Job so far.
+            This is the result of multiplying the run_time with the amount of
+            cpus.
+    """
+    cdef:
+        # ptr: the wrapped slurm job record.
+        # passwd / groups: optional lookup tables for UID/GID -> name.
+        slurm_job_info_t *ptr
+        dict passwd
+        dict groups
+
+    cdef public JobSteps steps
+
+    cdef _calc_run_time(self)
+
+    @staticmethod
+    cdef _swap_data(Job dst, Job src)
+
+    @staticmethod
+    cdef Job from_ptr(slurm_job_info_t *in_ptr)
+
diff --git a/pyslurm/core/job/job.pyx b/pyslurm/core/job/job.pyx
new file mode 100644
index 00000000..1e160c80
--- /dev/null
+++ b/pyslurm/core/job/job.pyx
@@ -0,0 +1,1346 @@
+#########################################################################
+# job.pyx - interface to retrieve slurm job informations
+#########################################################################
+# Copyright (C) 2023 Toni Harzendorf 
+#
+# Note: Some functions in this File are annotated with additional Copyright
+# notices. These functions are:
+#
+# - get_batch_script
+# - get_resource_layout_per_node
+#
+# This file is part of PySlurm
+#
+# PySlurm is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# PySlurm is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with PySlurm; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# cython: c_string_type=unicode, c_string_encoding=default
+# cython: language_level=3
+
+from os import WIFSIGNALED, WIFEXITED, WTERMSIG, WEXITSTATUS
+import re
+from typing import Union
+from pyslurm.core.common import cstr, ctime
+from pyslurm.core.common.uint import *
+from pyslurm.core.job.util import *
+from pyslurm.core.error import (
+    RPCError,
+    verify_rpc,
+    slurm_errno,
+)
+from pyslurm.core.common.ctime import _raw_time
+from pyslurm.core.common import (
+    uid_to_name,
+    gid_to_name,
+    signal_to_num,
+    _getgrall_to_dict,
+    _getpwall_to_dict,
+    instance_to_dict,
+    _sum_prop,
+)
+
+
+cdef class Jobs(dict):
+
+    def __cinit__(self):
+        self.info = NULL
+
+    def __dealloc__(self):
+        slurm_free_job_info_msg(self.info)
+
+    def __init__(self, jobs=None, freeze=False):
+        self.freeze = freeze
+
+        if isinstance(jobs, dict):
+            self.update(jobs)
+        elif jobs is not None:
+            # Accept a mix of plain Job-IDs and Job instances.
+            for job in jobs:
+                if isinstance(job, int):
+                    self[job] = Job(job)
+                else:
+                    self[job.id] = job
+
+    @staticmethod
+    def load(preload_passwd_info=False, freeze=False):
+        """Retrieve all Jobs from the Slurm controller
+
+        Args:
+            preload_passwd_info (bool, optional): 
+                Decides whether to query passwd and groups information from
+                the system.
+                Could potentially speed up access to attributes of the Job
+                where a UID/GID is translated to a name. If True, the
+                information will be fetched and stored in each of the Job
+                instances.
+            freeze (bool, optional):
+                Decide whether this collection of Jobs should be "frozen".
+
+        Returns:
+            (Jobs): A collection of Job objects.
+
+        Raises:
+            RPCError: When getting all the Jobs from the slurmctld failed.
+            MemoryError: If malloc fails to allocate memory.
+        """
+        cdef:
+            dict passwd = {}
+            dict groups = {}
+            Jobs jobs = Jobs.__new__(Jobs)
+            int flags = slurm.SHOW_ALL | slurm.SHOW_DETAIL
+            Job job
+
+        verify_rpc(slurm_load_jobs(0, &jobs.info, flags))
+
+        # If requested, preload the passwd and groups database to potentially
+        # speedup lookups for an attribute in a Job, e.g. user_name or
+        # group_name.
+        if preload_passwd_info:
+            passwd = _getpwall_to_dict()
+            groups = _getgrall_to_dict()
+
+        # zero-out a dummy slurm_job_info_t
+        memset(&jobs.tmp_info, 0, sizeof(slurm_job_info_t))
+
+        # Put each job pointer into its own "Job" instance.
+        for cnt in range(jobs.info.record_count):
+            job = Job.from_ptr(&jobs.info.job_array[cnt])
+
+            # Prevent double free if xmalloc fails mid-loop and a MemoryError
+            # is raised by replacing it with a zeroed-out slurm_job_info_t.
+            jobs.info.job_array[cnt] = jobs.tmp_info
+
+            if preload_passwd_info:
+                job.passwd = passwd
+                job.groups = groups
+
+            jobs[job.id] = job
+
+        # At this point we memcpy'd all the memory for the Jobs. Setting this
+        # to 0 will prevent the slurm job free function to deallocate the
+        # memory for the individual jobs. This should be fine, because they
+        # are free'd automatically in __dealloc__ since the lifetime of each
+        # job-pointer is tied to the lifetime of its corresponding "Job"
+        # instance.
+        jobs.info.record_count = 0
+
+        jobs.freeze = freeze
+        return jobs
+
+    def reload(self):
+        """Reload the information for jobs in a collection.
+
+        Raises:
+            RPCError: When getting the Jobs from the slurmctld failed.
+        """
+        cdef Jobs reloaded_jobs = Jobs.load()
+
+        for jid in list(self.keys()):
+            if jid in reloaded_jobs:
+                # Put the new data in.
+                self[jid] = reloaded_jobs[jid]
+            elif not self.freeze:
+                # Remove this instance from the current collection, as the Job
+                # doesn't exist anymore.
+                del self[jid]
+
+        if not self.freeze:
+            for jid in reloaded_jobs:
+                if jid not in self:
+                    self[jid] = reloaded_jobs[jid]
+
+        return self
+
+    def load_steps(self):
+        """Load all Job steps for this collection of Jobs.
+
+        This function fills in the "steps" attribute for all Jobs in the
+        collection.
+
+        Note: 
+            Pending Jobs will be ignored, since they don't have any Steps yet.
+
+        Raises:
+            RPCError: When retrieving the Job information for all the Steps
+                failed.
+        """
+        cdef dict step_info = JobSteps.load_all()
+
+        for jid in self:
+            # Ignore any Steps from Jobs which do not exist in this
+            # collection.
+            if jid in step_info:
+                self[jid].steps = step_info[jid]
+
+    def as_list(self):
+        """Format the information as list of Job objects.
+
+        Returns:
+            (list): List of Job objects
+        """
+        return list(self.values())
+
+    @property
+    def memory(self):
+        return _sum_prop(self, Job.memory)
+
+    @property
+    def cpus(self):
+        return _sum_prop(self, Job.cpus)
+
+    @property
+    def ntasks(self):
+        return _sum_prop(self, Job.ntasks)
+
+    @property
+    def cpu_time(self):
+        return _sum_prop(self, Job.cpu_time)
+
+
+cdef class Job:
+
+    def __cinit__(self):
+        self.ptr = NULL
+
+    def __init__(self, job_id):
+        self._alloc_impl()
+        self.ptr.job_id = job_id
+        self.passwd = {}
+        self.groups = {}
+        self.steps = JobSteps.__new__(JobSteps)
+
+    def _alloc_impl(self):
+        # Allocate the job record once; raises MemoryError on failure.
+        if not self.ptr:
+            self.ptr = <slurm_job_info_t*>try_xmalloc(sizeof(slurm_job_info_t))
+            if not self.ptr:
+                raise MemoryError("xmalloc failed for job_info_t")
+
+    def _dealloc_impl(self):
+        slurm_free_job_info(self.ptr)
+        self.ptr = NULL
+
+    def __dealloc__(self):
+        self._dealloc_impl()
+
+    def __eq__(self, other):
+        return isinstance(other, Job) and self.id == other.id
+
+    @staticmethod
+    def load(job_id):
+        """Load information for a specific Job.
+
+        Implements the slurm_load_job RPC.
+
+        Note:
+            If the Job is not pending, the related Job steps will also be
+            loaded.
+
+        Args:
+            job_id (int):
+                An Integer representing a Job-ID.
+
+        Returns:
+            (pyslurm.Job): Returns a new Job instance
+
+        Raises:
+            RPCError: If requesting the Job information from the slurmctld was
+                not successful.
+            MemoryError: If malloc failed to allocate memory.
+
+        Examples:
+            >>> import pyslurm
+            >>> job = pyslurm.Job.load(9999)
+        """
+        cdef:
+            job_info_msg_t *info = NULL
+            Job wrap = Job.__new__(Job)
+
+        # Same defaults as __init__ and from_ptr set up, so that a pending
+        # Job (whose steps are not loaded below) does not end up with
+        # passwd/groups/steps left as None.
+        wrap.passwd = {}
+        wrap.groups = {}
+        wrap.steps = JobSteps.__new__(JobSteps)
+
+        try:
+            verify_rpc(slurm_load_job(&info, job_id, slurm.SHOW_DETAIL))
+
+            if info and info.record_count:
+                # Copy info
+                wrap._alloc_impl()
+                memcpy(wrap.ptr, &info.job_array[0], sizeof(slurm_job_info_t))
+                # The record now belongs to wrap; keep the message free
+                # function below from releasing its members.
+                info.record_count = 0
+
+                if not slurm.IS_JOB_PENDING(wrap.ptr):
+                    # Just ignore if the steps couldn't be loaded here.
+                    try:
+                        wrap.steps = JobSteps._load(wrap)
+                    except RPCError:
+                        pass
+            else:
+                raise RPCError(msg="RPC was successful but got no job data, "
+                               "this should never happen")
+        finally:
+            # Always release the message, whether or not loading succeeded.
+            slurm_free_job_info_msg(info)
+
+        return wrap
+
+    @staticmethod
+    cdef Job from_ptr(slurm_job_info_t *in_ptr):
+        # Create a Job that holds its own copy of the record at in_ptr.
+        cdef Job wrap = Job.__new__(Job)
+        wrap._alloc_impl()
+        wrap.passwd = {}
+        wrap.groups = {}
+        wrap.steps = JobSteps.__new__(JobSteps)
+        memcpy(wrap.ptr, in_ptr, sizeof(slurm_job_info_t))
+
+        return wrap
+
+    cdef _swap_data(Job dst, Job src):
+        # Exchange the underlying records of two Jobs (only if both exist).
+        cdef slurm_job_info_t *tmp = NULL
+        if dst.ptr and src.ptr:
+            tmp = dst.ptr 
+            dst.ptr = src.ptr
+            src.ptr = tmp
+
+    def as_dict(self):
+        """Job information formatted as a dictionary.
+
+        Returns:
+            (dict): Job information as dict
+        """
+        return instance_to_dict(self)
+
+    def send_signal(self, signal, steps="children", hurry=False):
+        """Send a signal to a running Job.
+
+        Implements the slurm_kill_job RPC.
+
+        Args:
+            signal (Union[str, int]): 
+                Any valid signal which will be sent to the Job. Can be either
+                a str like 'SIGUSR1', or simply an int.
+            steps (str):
+                Selects which steps should be signaled. Valid values for this
+                are: "all", "batch" and "children". The default value is
+                "children", where all steps except the batch-step will be
+                signaled.
+                The value "batch" in contrast means, that only the batch-step
+                will be signaled. With "all" every step is signaled.
+            hurry (bool): 
+                If True, no burst buffer data will be staged out. The default
+                value is False.
+
+        Raises:
+            RPCError: When sending the signal was not successful.
+
+        Examples:
+            Specifying the signal as a string:
+
+            >>> from pyslurm import Job
+            >>> Job(9999).send_signal("SIGUSR1")
+
+            or passing in a numeric signal:
+
+            >>> Job(9999).send_signal(9)
+        """
+        cdef uint16_t flags = 0
+
+        if steps.casefold() == "all":
+            flags |= slurm.KILL_FULL_JOB
+        elif steps.casefold() == "batch":
+            flags |= slurm.KILL_JOB_BATCH
+
+        if hurry:
+            flags |= slurm.KILL_HURRY
+
+        sig = signal_to_num(signal)
+
+        # Only look at the slurm errno if the call itself reported a
+        # failure. The errno is global state, so after a successful call it
+        # may still hold the error of some earlier, unrelated RPC.
+        if slurm_kill_job(self.id, sig, flags) == slurm.SLURM_SUCCESS:
+            return None
+
+        # Ignore errors when the Job is already done or when SIGKILL was
+        # specified and the job id is already purged from slurmctlds memory.
+        errno = slurm_errno()
+        if (errno == slurm.ESLURM_ALREADY_DONE
+                or (errno == slurm.ESLURM_INVALID_JOB_ID and sig == 9)):
+            pass
+        else:
+            verify_rpc(errno)
+
+    def cancel(self):
+        """Cancel a Job.
+
+        Implements the slurm_kill_job RPC.
+
+        Raises:
+            RPCError: When cancelling the Job was not successful.
+
+        Examples:
+            >>> from pyslurm import Job
+            >>> Job(9999).cancel()
+        """
+        self.send_signal(9)
+
+    def suspend(self):
+        """Suspend a running Job.
+
+        Implements the slurm_suspend RPC.
+
+        Raises:
+            RPCError: When suspending the Job was not successful.
+
+        Examples:
+            >>> from pyslurm import Job
+            >>> Job(9999).suspend()
+        """
+        # TODO: Report as a misbehaviour to schedmd that slurm_suspend is not
+        # correctly returning error code when it cannot find the job in
+        # _slurm_rpc_suspend it should return ESLURM_INVALID_JOB_ID, but
+        # returns -1
+        # https://github.com/SchedMD/slurm/blob/master/src/slurmctld/proc_req.c#L4693
+        verify_rpc(slurm_suspend(self.id))
+
+    def unsuspend(self):
+        """Unsuspend a currently suspended Job.
+
+        Implements the slurm_resume RPC.
+
+        Raises:
+            RPCError: When unsuspending the Job was not successful.
+
+        Examples:
+            >>> from pyslurm import Job
+            >>> Job(9999).unsuspend()
+        """
+        # Same problem as described in suspend()
+        verify_rpc(slurm_resume(self.id))
+
+    def modify(self, JobSubmitDescription changes):
+        """Modify a Job.
+
+        Implements the slurm_update_job RPC.
+
+        Args:
+            changes (JobSubmitDescription):
+                A JobSubmitDescription object which contains all the
+                modifications that should be done on the Job.
+
+        Raises:
+            RPCError: When updating the Job was not successful.
+
+        Examples:
+            >>> from pyslurm import Job, JobSubmitDescription
+            >>> 
+            >>> # Setting the new time-limit to 20 days
+            >>> changes = JobSubmitDescription(time_limit="20-00:00:00")
+            >>> Job(9999).modify(changes)
+        """
+        changes._create_job_submit_desc(is_update=True)
+        changes.ptr.job_id = self.id
+        verify_rpc(slurm_update_job(changes.ptr))
+
+    def hold(self, mode=None):
+        """Hold a currently pending Job, preventing it from being scheduled.
+
+        Args:
+            mode (str):
+                Determines in which mode the Job should be held. Possible
+                values are "user" or "admin". By default, the Job is held in
+                "admin" mode, meaning only an Administrator will be able to
+                release the Job again. If you specify the mode as "user", the
+                User will also be able to release the job.
+
+        Note:
+            Uses the modify() function to set the Job's priority to 0.
+
+        Raises:
+            RPCError: When holding the Job was not successful.
+ + Examples: + >>> from pyslurm import Job + >>> + >>> # Holding a Job (in "admin" mode by default) + >>> Job(9999).hold() + >>> + >>> # Holding a Job in "user" mode + >>> Job(9999).hold(mode="user") + """ + cdef JobSubmitDescription job_sub = JobSubmitDescription(priority=0) + + if mode and mode.casefold() == "user": + job_sub.ptr.alloc_sid = slurm.ALLOC_SID_USER_HOLD + + self.modify(job_sub) + + def release(self): + """Release a currently held Job, allowing it to be scheduled again. + + Note: + Uses the modify() function to reset the priority back to + be controlled by the slurmctld's priority calculation routine. + + Raises: + RPCError: When releasing a held Job was not successful. + + Examples: + >>> from pyslurm import Job + >>> Job(9999).release() + """ + self.modify(JobSubmitDescription(priority=slurm.INFINITE)) + + def requeue(self, hold=False): + """Requeue a currently running Job. + + Implements the slurm_requeue RPC. + + Args: + hold (bool): + Controls whether the Job should be put in a held state or not. + Default for this is 'False', so it will not be held. + + Raises: + RPCError: When requeing the Job was not successful. + + Examples: + >>> from pyslurm import Job + >>> + >>> # Requeing a Job while allowing it to be + >>> # scheduled again immediately + >>> Job(9999).requeue() + >>> + >>> # Requeing a Job while putting it in a held state + >>> Job(9999).requeue(hold=True) + """ + cdef uint32_t flags = 0 + + if hold: + flags |= slurm.JOB_REQUEUE_HOLD + + verify_rpc(slurm_requeue(self.id, flags)) + + def notify(self, msg): + """Sends a message to the Jobs stdout. + + Implements the slurm_notify_job RPC. + + Args: + msg (str): + The message that should be sent. + + Raises: + RPCError: When sending the message to the Job was not successful. 
+ + Examples: + >>> from pyslurm import Job + >>> Job(9999).notify("Hello Friends!") + """ + verify_rpc(slurm_notify_job(self.id, msg)) + + def get_batch_script(self): + """Return the content of the script for a Batch-Job. + + Note: + The string returned also includes all the "\n" characters + (new-line). + + Returns: + (str): The content of the batch script. + + Raises: + RPCError: When retrieving the Batch-Script for the Job was not + successful. + + Examples: + >>> from pyslurm import Job + >>> script = Job(9999).get_batch_script() + """ + # The code for this function was taken from here: + # https://github.com/SchedMD/slurm/blob/7162f15af8deaf02c3bbf940d59e818cdeb5c69d/src/api/job_info.c#L1319 + # and therefore reimplements the slurm_job_batch_script API call, with + # slight modifications (e.g. Cython syntax). Otherwise we would have + # to parse the FILE* ptr we get from it back into a char* which + # would be a bit silly. + # + # The copyright notices for the file this function was taken from is + # included below: + # + # Portions Copyright (C) 2010-2017 SchedMD LLC . + # Copyright (C) 2002-2007 The Regents of the University of California. + # Copyright (C) 2008-2010 Lawrence Livermore National Security. + # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + # Written by Morris Jette et. al. + # CODE-OCEC-09-009. All rights reserved. + # + # Slurm is licensed under the GNU General Public License. 
For the full + # text of Slurm's License, please see here: + # pyslurm/slurm/SLURM_LICENSE + # + # Please, as mentioned above, also have a look at Slurm's DISCLAIMER + # under pyslurm/slurm/SLURM_DISCLAIMER + cdef: + job_id_msg_t msg + slurm_msg_t req + slurm_msg_t resp + int rc = slurm.SLURM_SUCCESS + str script = None + + slurm_msg_t_init(&req) + slurm_msg_t_init(&resp) + + memset(&msg, 0, sizeof(msg)) + msg.job_id = self.id + req.msg_type = slurm.REQUEST_BATCH_SCRIPT + req.data = &msg + + rc = slurm_send_recv_controller_msg(&req, &resp, working_cluster_rec) + verify_rpc(rc) + + if resp.msg_type == slurm.RESPONSE_BATCH_SCRIPT: + script = cstr.to_unicode(resp.data) + xfree(resp.data) + elif resp.msg_type == slurm.RESPONSE_SLURM_RC: + rc = ( resp.data).return_code + slurm_free_return_code_msg(resp.data) + verify_rpc(rc) + else: + verify_rpc(slurm.SLURM_ERROR) + + return script + + @property + def name(self): + return cstr.to_unicode(self.ptr.name) + + @property + def id(self): + return self.ptr.job_id + + @property + def association_id(self): + return u32_parse(self.ptr.assoc_id) + + @property + def account(self): + return cstr.to_unicode(self.ptr.account) + + @property + def user_id(self): + return u32_parse(self.ptr.user_id, zero_is_noval=False) + + @property + def user_name(self): + return uid_to_name(self.ptr.user_id, lookup=self.passwd) + + @property + def group_id(self): + return u32_parse(self.ptr.group_id, zero_is_noval=False) + + @property + def group_name(self): + return gid_to_name(self.ptr.group_id, lookup=self.groups) + + @property + def priority(self): + return u32_parse(self.ptr.priority, zero_is_noval=False) + + @property + def nice(self): + if self.ptr.nice == slurm.NO_VAL: + return None + + return self.ptr.nice - slurm.NICE_OFFSET + + @property + def qos(self): + return cstr.to_unicode(self.ptr.qos) + + @property + def min_cpus_per_node(self): + return u32_parse(self.ptr.pn_min_cpus) + + # I don't think this is used anymore - there is no way in 
sbatch to ask + # for a "maximum cpu" count, so it will always be empty. + # @property + # def max_cpus(self): + # """Maximum Amount of CPUs the Job requested.""" + # return u32_parse(self.ptr.max_cpus) + + @property + def state(self): + return cstr.to_unicode(slurm_job_state_string(self.ptr.job_state)) + + @property + def state_reason(self): + if self.ptr.state_desc: + return cstr.to_unicode(self.ptr.state_desc) + + return cstr.to_unicode(slurm_job_reason_string(self.ptr.state_reason)) + + @property + def is_requeueable(self): + return u16_parse_bool(self.ptr.requeue) + + @property + def requeue_count(self): + return u16_parse(self.ptr.restart_cnt, on_noval=0) + + @property + def is_batch_job(self): + return u16_parse_bool(self.ptr.batch_flag) + + @property + def requires_node_reboot(self): + return u8_parse_bool(self.ptr.reboot) + + @property + def dependencies(self): + return dependency_str_to_dict(cstr.to_unicode(self.ptr.dependency)) + + @property + def time_limit(self): + return _raw_time(self.ptr.time_limit) + + @property + def time_limit_min(self): + return _raw_time(self.ptr.time_min) + + @property + def submit_time(self): + return _raw_time(self.ptr.submit_time) + + @property + def eligible_time(self): + return _raw_time(self.ptr.eligible_time) + + @property + def accrue_time(self): + return _raw_time(self.ptr.accrue_time) + + @property + def start_time(self): + return _raw_time(self.ptr.start_time) + + @property + def resize_time(self): + return _raw_time(self.ptr.resize_time) + + @property + def deadline(self): + return _raw_time(self.ptr.deadline) + + @property + def preempt_eligible_time(self): + return _raw_time(self.ptr.preemptable_time) + + @property + def preempt_time(self): + return _raw_time(self.ptr.preempt_time) + + @property + def suspend_time(self): + return _raw_time(self.ptr.suspend_time) + + @property + def last_sched_evaluation_time(self): + return _raw_time(self.ptr.last_sched_eval) + + @property + def pre_suspension_time(self): + 
return _raw_time(self.ptr.pre_sus_time) + + @property + def mcs_label(self): + return cstr.to_unicode(self.ptr.mcs_label) + + @property + def partition(self): + return cstr.to_unicode(self.ptr.partition) + + @property + def submit_host(self): + return cstr.to_unicode(self.ptr.alloc_node) + + @property + def batch_host(self): + return cstr.to_unicode(self.ptr.batch_host) + + @property + def num_nodes(self): + return u32_parse(self.ptr.num_nodes) + + @property + def max_nodes(self): + return u32_parse(self.ptr.max_nodes) + + @property + def allocated_nodes(self): + return cstr.to_unicode(self.ptr.nodes) + + @property + def required_nodes(self): + return cstr.to_unicode(self.ptr.req_nodes) + + @property + def excluded_nodes(self): + return cstr.to_unicode(self.ptr.exc_nodes) + + @property + def scheduled_nodes(self): + return cstr.to_unicode(self.ptr.sched_nodes) + + @property + def derived_exit_code(self): + if (self.ptr.derived_ec == slurm.NO_VAL + or not WIFEXITED(self.ptr.derived_ec)): + return None + + return WEXITSTATUS(self.ptr.derived_ec) + + @property + def derived_exit_code_signal(self): + if (self.ptr.derived_ec == slurm.NO_VAL + or not WIFSIGNALED(self.ptr.derived_ec)): + return None + + return WTERMSIG(self.ptr.derived_ec) + + @property + def exit_code(self): + if (self.ptr.exit_code == slurm.NO_VAL + or not WIFEXITED(self.ptr.exit_code)): + return None + + return WEXITSTATUS(self.ptr.exit_code) + + @property + def exit_code_signal(self): + if (self.ptr.exit_code == slurm.NO_VAL + or not WIFSIGNALED(self.ptr.exit_code)): + return None + + return WTERMSIG(self.ptr.exit_code) + + @property + def batch_constraints(self): + return cstr.to_list(self.ptr.batch_features) + + @property + def federation_origin(self): + return cstr.to_unicode(self.ptr.fed_origin_str) + + @property + def federation_siblings_active(self): + return u64_parse(self.ptr.fed_siblings_active) + + @property + def federation_siblings_viable(self): + return 
u64_parse(self.ptr.fed_siblings_viable) + + @property + def cpus(self): + return u32_parse(self.ptr.num_cpus, on_noval=1) + + @property + def cpus_per_task(self): + if self.ptr.cpus_per_tres: + return None + + return u16_parse(self.ptr.cpus_per_task, on_noval=1) + + @property + def cpus_per_gpu(self): + if (not self.ptr.cpus_per_tres + or self.ptr.cpus_per_task != slurm.NO_VAL16): + return None + + # TODO: Make a function that, given a GRES type, safely extracts its + # value from the string. + val = cstr.to_unicode(self.ptr.cpus_per_tres).split(":")[2] + return u16_parse(val) + + @property + def boards_per_node(self): + return u16_parse(self.ptr.boards_per_node) + + @property + def sockets_per_board(self): + return u16_parse(self.ptr.sockets_per_board) + + @property + def sockets_per_node(self): + return u16_parse(self.ptr.sockets_per_node) + + @property + def cores_per_socket(self): + return u16_parse(self.ptr.cores_per_socket) + + @property + def threads_per_core(self): + return u16_parse(self.ptr.threads_per_core) + + @property + def ntasks(self): + return u32_parse(self.ptr.num_tasks, on_noval=1) + + @property + def ntasks_per_node(self): + return u16_parse(self.ptr.ntasks_per_node) + + @property + def ntasks_per_board(self): + return u16_parse(self.ptr.ntasks_per_board) + + @property + def ntasks_per_socket(self): + return u16_parse(self.ptr.ntasks_per_socket) + + @property + def ntasks_per_core(self): + return u16_parse(self.ptr.ntasks_per_core) + + @property + def ntasks_per_gpu(self): + return u16_parse(self.ptr.ntasks_per_tres) + + @property + def delay_boot_time(self): + return _raw_time(self.ptr.delay_boot) + + @property + def constraints(self): + return cstr.to_list(self.ptr.features) + + @property + def cluster(self): + return cstr.to_unicode(self.ptr.cluster) + + @property + def cluster_constraints(self): + return cstr.to_list(self.ptr.cluster_features) + + @property + def reservation(self): + return cstr.to_unicode(self.ptr.resv_name) + + @property 
+ def resource_sharing(self): + return cstr.to_unicode(slurm_job_share_string(self.ptr.shared)) + + @property + def requires_contiguous_nodes(self): + return u16_parse_bool(self.ptr.contiguous) + + @property + def licenses(self): + return cstr.to_list(self.ptr.licenses) + + @property + def network(self): + return cstr.to_unicode(self.ptr.network) + + @property + def command(self): + return cstr.to_unicode(self.ptr.command) + + @property + def working_directory(self): + return cstr.to_unicode(self.ptr.work_dir) + + @property + def admin_comment(self): + return cstr.to_unicode(self.ptr.admin_comment) + + @property + def system_comment(self): + return cstr.to_unicode(self.ptr.system_comment) + + @property + def container(self): + return cstr.to_unicode(self.ptr.container) + + @property + def comment(self): + return cstr.to_unicode(self.ptr.comment) + + @property + def standard_input(self): + cdef char tmp[1024] + slurm_get_job_stdin(tmp, sizeof(tmp), self.ptr) + return cstr.to_unicode(tmp) + + @property + def standard_output(self): + cdef char tmp[1024] + slurm_get_job_stdout(tmp, sizeof(tmp), self.ptr) + return cstr.to_unicode(tmp) + + @property + def standard_error(self): + cdef char tmp[1024] + slurm_get_job_stderr(tmp, sizeof(tmp), self.ptr) + return cstr.to_unicode(tmp) + + @property + def required_switches(self): + return u32_parse(self.ptr.req_switch) + + @property + def max_wait_time_switches(self): + return _raw_time(self.ptr.wait4switch) + + @property + def burst_buffer(self): + return cstr.to_unicode(self.ptr.burst_buffer) + + @property + def burst_buffer_state(self): + return cstr.to_unicode(self.ptr.burst_buffer_state) + + @property + def cpu_frequency_min(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_min) + + @property + def cpu_frequency_max(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_max) + + @property + def cpu_frequency_governor(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_gov) + + # @property + # def tres_bindings(self): + 
# """str: ?""" + # # TODO: Find out how it works + # return cstr.to_unicode(self.ptr.tres_bind) + + # @property + # def tres_frequency(self): + # """?""" + # # TODO: Find out how it works + # return cstr.to_unicode(self.ptr.tres_freq) + + @property + def wckey(self): + return cstr.to_unicode(self.ptr.wckey) + + @property + def mail_user(self): + return cstr.to_list(self.ptr.mail_user) + + @property + def mail_types(self): + return mail_type_int_to_list(self.ptr.mail_type) + + @property + def heterogeneous_id(self): + return u32_parse(self.ptr.het_job_id, noval=0) + + @property + def heterogeneous_offset(self): + return u32_parse(self.ptr.het_job_offset, noval=0) + + # @property + # def hetjob_component_ids(self): + # """str: ?""" + # # TODO: Find out how to parse it in a more proper way? + # return cstr.to_unicode(self.ptr.het_job_id_set) + + @property + def temporary_disk_per_node(self): + return u32_parse(self.ptr.pn_min_tmp_disk) + + @property + def array_id(self): + return u32_parse(self.ptr.array_job_id) + + @property + def array_tasks_parallel(self): + return u32_parse(self.ptr.array_max_tasks) + + @property + def array_task_id(self): + return u32_parse(self.ptr.array_task_id) + + @property + def array_tasks_waiting(self): + task_str = cstr.to_unicode(self.ptr.array_task_str) + if not task_str: + return None + + if "%" in task_str: + # We don't want this % character and everything after it + # in here, so remove it. 
+ task_str = task_str[:task_str.rindex("%")] + + return task_str + + @property + def end_time(self): + return _raw_time(self.ptr.end_time) + + # https://github.com/SchedMD/slurm/blob/d525b6872a106d32916b33a8738f12510ec7cf04/src/api/job_info.c#L480 + cdef _calc_run_time(self): + cdef time_t rtime + cdef time_t etime + + if slurm.IS_JOB_PENDING(self.ptr) or not self.ptr.start_time: + return 0 + elif slurm.IS_JOB_SUSPENDED(self.ptr): + return self.pre_suspension_time + else: + if slurm.IS_JOB_RUNNING(self.ptr) or self.ptr.end_time == 0: + etime = ctime.time(NULL) + else: + etime = self.ptr.end_time + + if self.ptr.suspend_time: + rtime = ctime.difftime(etime, self.ptr.suspend_time) + rtime += self.ptr.pre_sus_time + else: + rtime = ctime.difftime(etime, self.ptr.start_time) + + return u64_parse(rtime, on_noval=0) + + @property + def run_time(self): + return self._calc_run_time() + + @property + def cores_reserved_for_system(self): + if self.ptr.core_spec != slurm.NO_VAL16: + if not self.ptr.core_spec & slurm.CORE_SPEC_THREAD: + return self.ptr.core_spec + + @property + def threads_reserved_for_system(self): + if self.ptr.core_spec != slurm.NO_VAL16: + if self.ptr.core_spec & slurm.CORE_SPEC_THREAD: + return self.ptr.core_spec & (~slurm.CORE_SPEC_THREAD) + + @property + def memory(self): + mem_cpu = self.memory_per_cpu + if mem_cpu is not None: + total_cpus = self.cpus + if total_cpus is not None: + mem_cpu *= total_cpus + return mem_cpu + + mem_node = self.memory_per_node + if mem_node is not None: + num_nodes = self.min_nodes + if num_nodes is not None: + mem_node *= num_nodes + return mem_cpu + + # TODO + # mem_gpu = self.memory_per_gpu + # if mem_gpu is not None: + # num_nodes = self.min_nodes + # if num_nodes is not None: + # mem_node *= num_nodes + # return mem_cpu + + return None + + @property + def memory_per_cpu(self): + if self.ptr.pn_min_memory != slurm.NO_VAL64: + if self.ptr.pn_min_memory & slurm.MEM_PER_CPU: + mem = self.ptr.pn_min_memory & 
(~slurm.MEM_PER_CPU) + return u64_parse(mem) + else: + return None + + @property + def memory_per_node(self): + if self.ptr.pn_min_memory != slurm.NO_VAL64: + if not self.ptr.pn_min_memory & slurm.MEM_PER_CPU: + return u64_parse(self.ptr.pn_min_memory) + else: + return None + + @property + def memory_per_gpu(self): + if self.ptr.mem_per_tres and self.ptr.pn_min_memory == slurm.NO_VAL64: + # TODO: Make a function that, given a GRES type, safely extracts + # its value from the string. + mem = int(cstr.to_unicode(self.ptr.mem_per_tres).split(":")[2]) + return u64_parse(mem) + else: + return None + + @property + def gres_per_node(self): + return cstr.to_gres_dict(self.ptr.tres_per_node) + + @property + def profile_types(self): + return acctg_profile_int_to_list(self.ptr.profile) + + @property + def gres_binding(self): + if self.ptr.bitflags & slurm.GRES_ENFORCE_BIND: + return "enforce-binding" + elif self.ptr.bitflags & slurm.GRES_DISABLE_BIND: + return "disable-binding" + else: + return None + + @property + def kill_on_invalid_dependency(self): + return u64_parse_bool_flag(self.ptr.bitflags, slurm.KILL_INV_DEP) + + @property + def spreads_over_nodes(self): + return u64_parse_bool_flag(self.ptr.bitflags, slurm.SPREAD_JOB) + + @property + def power_options(self): + return power_type_int_to_list(self.ptr.power_flags) + + @property + def is_cronjob(self): + return u64_parse_bool_flag(self.ptr.bitflags, slurm.CRON_JOB) + + @property + def cronjob_time(self): + return cstr.to_unicode(self.ptr.cronspec) + + @property + def cpu_time(self): + return self.cpus * self.run_time + + @property + def pending_time(self): + # TODO + return None + + @property + def run_time_left(self): + # TODO + return None + + def get_resource_layout_per_node(self): + """Retrieve the resource layout of this Job on each node. 
+ + This contains the following information: + * cpu_ids (str) + * gres (dict) + * memory (int) + + Returns: + (dict): Resource layout + """ + # The code for this function is a modified reimplementation from here: + # https://github.com/SchedMD/slurm/blob/d525b6872a106d32916b33a8738f12510ec7cf04/src/api/job_info.c#L739 + # + # The copyright notices for the file that contains the original code + # is below: + # + # Portions Copyright (C) 2010-2017 SchedMD LLC . + # Copyright (C) 2002-2007 The Regents of the University of California. + # Copyright (C) 2008-2010 Lawrence Livermore National Security. + # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + # Written by Morris Jette et. al. + # CODE-OCEC-09-009. All rights reserved. + # + # Slurm is licensed under the GNU General Public License. For the full + # text of Slurm's License, please see here: + # pyslurm/slurm/SLURM_LICENSE + # + # Please, as mentioned above, also have a look at Slurm's DISCLAIMER + # under pyslurm/slurm/SLURM_DISCLAIMER + # + # TODO: Explain the structure of the return value a bit more. + cdef: + slurm.job_resources *resources = self.ptr.job_resrcs + slurm.hostlist_t hl + uint32_t rel_node_inx + int bit_inx = 0 + int bit_reps = 0 + int sock_inx = 0 + uint32_t sock_reps = 0 + int i = 0, j + uint32_t k = 0 + char *host + char *gres = NULL + slurm.bitstr_t *cpu_bitmap + char cpu_bitmap_str[128] + uint32_t threads + dict output = {} + + if not resources or not resources.core_bitmap: + return output + + hl = slurm.slurm_hostlist_create(resources.nodes) + if not hl: + raise ValueError("Unable to create hostlist.") + + for rel_node_inx in range(resources.nhosts): + # Check how many consecutive nodes have the same cpu allocation + # layout. 
+ if sock_reps >= resources.sock_core_rep_count[sock_inx]: + sock_inx += 1 + sock_reps = 0 + sock_reps += 1 + + # Get the next node from the list of nodenames + host = slurm.slurm_hostlist_shift(hl) + + # How many rounds we have to do in order to calculate the complete + # cpu bitmap. + bit_reps = (resources.sockets_per_node[sock_inx] + * resources.cores_per_socket[sock_inx]) + + # Calculate the amount of threads per core this job has on the + # specific host. + threads = _threads_per_core(host) + + # Allocate a new, big enough cpu bitmap + cpu_bitmap = slurm.slurm_bit_alloc(bit_reps * threads) + + # Calculate the cpu bitmap for this host. + for j in range(bit_reps): + if slurm.slurm_bit_test(resources.core_bitmap, bit_inx): + for k in range(threads): + slurm.slurm_bit_set(cpu_bitmap, (j*threads)+k) + bit_inx += 1 + + # Extract the cpu bitmap into a char *cpu_bitmap_str + slurm.slurm_bit_fmt(cpu_bitmap_str, + sizeof(cpu_bitmap_str), cpu_bitmap) + slurm.slurm_bit_free(&cpu_bitmap) + + nodename = cstr.to_unicode(host) + cpu_ids = cstr.to_unicode(cpu_bitmap_str) + mem = None + + if rel_node_inx < self.ptr.gres_detail_cnt: + gres = self.ptr.gres_detail_str[rel_node_inx] + + if resources.memory_allocated: + mem = u64_parse(resources.memory_allocated[rel_node_inx]) + + if nodename: + output[nodename] = { + "cpu_ids": cpu_ids, + "gres": cstr.to_gres_dict(gres), + "memory": mem, + } + + free(host) + + slurm.slurm_hostlist_destroy(hl) + return output + + +# https://github.com/SchedMD/slurm/blob/d525b6872a106d32916b33a8738f12510ec7cf04/src/api/job_info.c#L99 +cdef _threads_per_core(char *host): + # TODO + return 1 diff --git a/pyslurm/core/job/sbatch_opts.pyx b/pyslurm/core/job/sbatch_opts.pyx new file mode 100644 index 00000000..91724d29 --- /dev/null +++ b/pyslurm/core/job/sbatch_opts.pyx @@ -0,0 +1,204 @@ +######################################################################### +# sbatch_opt.pyx - utilities to parse #SBATCH options 
#########################################################################
# Copyright (C) 2023 Toni Harzendorf
#
# This file is part of PySlurm
#
# PySlurm is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# PySlurm is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with PySlurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# cython: c_string_type=unicode, c_string_encoding=default
# cython: language_level=3

import re
from pathlib import Path

SBATCH_MAGIC = "#SBATCH"

# Separator between an option and its value: "=" (optionally surrounded by
# whitespace) or plain whitespace.
_OPT_SEPARATOR = re.compile(r"\s*=\s*|\s+")
_FIRST_DIGIT = re.compile(r"\d")


class _SbatchOpt():
    """Mapping of one sbatch command line option to a description attribute.

    Attributes:
        short_opt (str): Single-letter form without the dash, or None.
        long_opt (str): Long form without the leading dashes.
        our_attr_name (str): Name of the attribute that is set for this
            option, or None if the option is not supported.
        attr_param: Value assigned to the attribute when the option is given
            without a value (i.e. as a flag).
        is_boolean (bool): Stored as given; not evaluated in this module.
        has_optional_args (bool): Stored as given; not evaluated in this
            module.
    """
    def __init__(self, short_opt, long_opt,
                 our_attr_name, attr_param=None, is_boolean=False,
                 has_optional_args=False):
        self.short_opt = short_opt
        self.long_opt = long_opt
        self.our_attr_name = our_attr_name
        self.attr_param = attr_param
        self.is_boolean = is_boolean
        self.has_optional_args = has_optional_args


# Sorted by occurrence in the sbatch manpage - keep in order.
SBATCH_OPTIONS = [
    _SbatchOpt("A", "account", "account"),
    _SbatchOpt(None, "acctg-freq", "accounting_gather_frequency"),
    _SbatchOpt("a", "array", "array"),
    _SbatchOpt(None, "batch", "batch_constraints"),
    _SbatchOpt(None, "bb", "burst_buffer"),
    _SbatchOpt(None, "bbf", "burst_buffer_file"),
    _SbatchOpt("b", "begin", "begin_time"),
    _SbatchOpt("D", "chdir", "working_directory"),
    _SbatchOpt(None, "cluster-constraint", "cluster_constraints"),
    _SbatchOpt("M", "clusters", "clusters"),
    _SbatchOpt(None, "comment", "comment"),
    _SbatchOpt("C", "constraint", "constraints"),
    _SbatchOpt(None, "container", "container"),
    # Plain flag: needs an explicit value, otherwise None would be assigned.
    _SbatchOpt(None, "contiguous", "requires_contiguous_nodes", True),
    _SbatchOpt("S", "core-spec", "cores_reserved_for_system"),
    _SbatchOpt(None, "cores-per-socket", "cores_per_socket"),
    _SbatchOpt(None, "cpu-freq", "cpu_frequency"),
    _SbatchOpt(None, "cpus-per-gpu", "cpus_per_gpu"),
    _SbatchOpt("c", "cpus-per-task", "cpus_per_task"),
    _SbatchOpt(None, "deadline", "deadline"),
    _SbatchOpt(None, "delay-boot", "delay_boot_time"),
    _SbatchOpt("d", "dependency", "dependencies"),
    _SbatchOpt("m", "distribution", "distribution"),
    _SbatchOpt("e", "error", "standard_error"),
    _SbatchOpt("x", "exclude", "excluded_nodes"),
    _SbatchOpt(None, "exclusive", "resource_sharing", "no"),
    _SbatchOpt(None, "export", "environment"),
    _SbatchOpt(None, "export-file", None),
    _SbatchOpt("B", "extra-node-info", None),
    _SbatchOpt(None, "get-user-env", "get_user_environment"),
    _SbatchOpt(None, "gid", "group_id"),
    _SbatchOpt(None, "gpu-bind", "gpu_binding"),
    _SbatchOpt(None, "gpu-freq", None),
    _SbatchOpt("G", "gpus", "gpus"),
    _SbatchOpt(None, "gpus-per-node", "gpus_per_node"),
    _SbatchOpt(None, "gpus-per-socket", "gpus_per_socket"),
    _SbatchOpt(None, "gpus-per-task", "gpus_per_task"),
    _SbatchOpt(None, "gres", "gres_per_node"),
    _SbatchOpt(None, "gres-flags", "gres_binding"),
    _SbatchOpt(None, "hint", None),
    _SbatchOpt("H", "hold", "priority", 0),
    _SbatchOpt(None, "ignore-pbs", None),
    _SbatchOpt("i", "input", "standard_in"),
    _SbatchOpt("J", "job-name", "name"),
    _SbatchOpt(None, "kill-on-invalid-dep", "kill_on_invalid_dependency"),
    _SbatchOpt("L", "licenses", "licenses"),
    _SbatchOpt(None, "mail-type", "mail_types"),
    _SbatchOpt(None, "mail-user", "mail_user"),
    _SbatchOpt(None, "mcs-label", "mcs_label"),
    _SbatchOpt(None, "mem", "memory_per_node"),
    _SbatchOpt(None, "mem-bind", None),
    _SbatchOpt(None, "mem-per-cpu", "memory_per_cpu"),
    _SbatchOpt(None, "mem-per-gpu", "memory_per_gpu"),
    _SbatchOpt(None, "mincpus", "min_cpus_per_node"),
    _SbatchOpt(None, "network", "network"),
    _SbatchOpt(None, "nice", "nice"),
    _SbatchOpt("k", "no-kill", "kill_on_node_fail", False),
    _SbatchOpt(None, "no-requeue", "is_requeueable", False),
    _SbatchOpt("F", "nodefile", None),
    _SbatchOpt("w", "nodelist", "required_nodes"),
    _SbatchOpt("N", "nodes", "nodes"),
    _SbatchOpt("n", "ntasks", "ntasks"),
    _SbatchOpt(None, "ntasks-per-core", "ntasks_per_core"),
    _SbatchOpt(None, "ntasks-per-gpu", "ntasks_per_gpu"),
    _SbatchOpt(None, "ntasks-per-node", "ntasks_per_node"),
    _SbatchOpt(None, "ntasks-per-socket", "ntasks_per_socket"),
    _SbatchOpt(None, "open-mode", "log_files_open_mode"),
    _SbatchOpt("o", "output", "standard_output"),
    _SbatchOpt("O", "overcommit", "overcommit", True),
    _SbatchOpt("s", "oversubscribe", "resource_sharing", "yes"),
    _SbatchOpt("p", "partition", "partition"),
    _SbatchOpt(None, "power", "power_options"),
    _SbatchOpt(None, "prefer", None),
    _SbatchOpt(None, "priority", "priority"),
    _SbatchOpt(None, "profile", "profile_types"),
    _SbatchOpt(None, "propagate", None),
    _SbatchOpt("q", "qos", "qos"),
    _SbatchOpt(None, "reboot", "requires_node_reboot", True),
    _SbatchOpt(None, "requeue", "is_requeueable", True),
    _SbatchOpt(None, "reservation", "reservations"),
    _SbatchOpt(None, "signal", "signal"),
    _SbatchOpt(None, "sockets-per-node", "sockets_per_node"),
    _SbatchOpt(None, "spread-job", "spreads_over_nodes", True),
    _SbatchOpt(None, "switches", "switches"),
    _SbatchOpt(None, "thread-spec", "threads_reserved_for_system"),
    _SbatchOpt(None, "threads-per-core", "threads_per_core"),
    _SbatchOpt("t", "time", "time_limit"),
    _SbatchOpt(None, "time-min", "time_limit_min"),
    _SbatchOpt(None, "tmp", "temporary_disk_per_node"),
    _SbatchOpt(None, "uid", "user_id"),
    _SbatchOpt(None, "use-min-nodes", "use_min_nodes", True),
    _SbatchOpt(None, "wait-all-nodes", "wait_all_nodes", True),
    _SbatchOpt(None, "wckey", "wckey"),
]


def _parse_line(line):
    """Split one "#SBATCH ..." line into an option name and its value.

    Accepted forms are for example:
        * -t20 or -t 20 or -t=20
        * --time=20 or --time 20 or --time20
        * --exclusive or -O (flags without a value)

    Only the first option on a line is parsed; anything after the
    separator is returned as the value.

    Args:
        line (str):
            A line that starts with the #SBATCH magic.

    Returns:
        (tuple): The option name without leading dashes and its value. The
            value is None for flags. (None, None) is returned if the line
            holds no option at all.
    """
    # Remove the #SBATCH from the start and ignore possible comments after
    # the options.
    opts = line[len(SBATCH_MAGIC):].split("#")[0].strip()

    if opts.startswith("--"):
        is_long, rest = True, opts[2:]
    elif opts.startswith("-"):
        is_long, rest = False, opts[1:]
    else:
        # Nothing that looks like an option, e.g. a bare "#SBATCH".
        return None, None

    sep = _OPT_SEPARATOR.search(rest)
    if sep:
        # --time=20, --time 20, -t 20 or -t=20. Splitting only at the first
        # separator keeps values like "ALL,FOO=bar" intact.
        opt, val = rest[:sep.start()], rest[sep.end():]
    elif is_long:
        # --time20: the value starts at the first digit. Without any digit
        # it is a flag like --exclusive.
        digit = _FIRST_DIGIT.search(rest)
        if digit:
            opt, val = rest[:digit.start()], rest[digit.start():]
        else:
            opt, val = rest, None
    else:
        # -t20 or -O: a short option is a single character, everything
        # after it is the value.
        opt, val = rest[:1], rest[1:]

    if not opt:
        return None, None

    return opt, val or None


def _find_opt(opt):
    """Look up an option name in SBATCH_OPTIONS.

    Args:
        opt (str):
            Short or long option name without leading dashes.

    Returns:
        (_SbatchOpt): The matching entry, or None if there is none.
    """
    if not opt:
        # Many entries have short_opt=None, which must not match a missing
        # option name.
        return None

    for sbopt in SBATCH_OPTIONS:
        # Check if we can find the option in our predefined mapping.
        if opt == sbopt.short_opt or opt == sbopt.long_opt:
            return sbopt

    return None


def _parse_opts_from_batch_script(desc, script, overwrite):
    """Apply the #SBATCH options found in a batch script to a description.

    Args:
        desc:
            Object whose attributes, named by the "our_attr_name" of the
            SBATCH_OPTIONS entries, are read and set.
        script (str):
            Path to the batch script. Nothing happens if this is empty or
            not an existing file.
        overwrite (bool):
            If True, attributes which already have a value are replaced.
            Otherwise only attributes that are None are set.
    """
    if not script or not Path(script).is_file():
        return None

    script = Path(script).read_text()
    for line in script.splitlines():
        line = line.lstrip()

        if line.startswith(SBATCH_MAGIC):
            flag, val = _parse_line(line)
            opt = _find_opt(flag)

            if not opt or opt.our_attr_name is None:
                # Not supported
                continue

            if getattr(desc, opt.our_attr_name) is None or overwrite:
                # Flags carry no value, use the predefined one instead.
                val = opt.attr_param if val is None else val
                setattr(desc, opt.our_attr_name, val)
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from .job cimport Job + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + job_step_info_t, + slurm_get_job_steps, + job_step_info_response_msg_t, + step_update_request_msg_t, + slurm_free_job_step_info_response_msg, + slurm_init_update_step_msg, + slurm_free_update_step_msg, + slurm_free_job_step_info_response_msg, + slurm_free_job_step_info_members, + slurm_update_step, + slurm_signal_job_step, + slurm_kill_job_step, + slurm_job_state_string, + xfree, + try_xmalloc, +) + +cdef class JobSteps(dict): + """A collection of :obj:`JobStep` objects for a given Job. + + Args: + job (Union[Job, int]): + A Job for which the Steps should be loaded. + + Raises: + RPCError: When getting the Job steps from the slurmctld failed. + MemoryError: If malloc fails to allocate memory. + """ + + cdef: + job_step_info_response_msg_t *info + job_step_info_t tmp_info + + @staticmethod + cdef JobSteps _load(Job job) + + cdef dict _get_info(self, uint32_t job_id, int flags) + + +cdef class JobStep: + """A Slurm Jobstep + + Args: + job (Union[Job, int]): + The Job this Step belongs to. + step (Union[int, str]): + Step-ID for this JobStep object. + + Raises: + MemoryError: If malloc fails to allocate memory. + + Attributes: + id (Union[str, int]): + The id for this step. + job_id (int): + The id for the Job this step belongs to. + name (str): + Name of the step. + user_id (int): + User ID who owns this step. + user_name (str): + Name of the User who owns this step. + time_limit (int): + Time limit in Minutes for this step. + network (str): + Network specification for the step. + cpu_frequency_min (Union[str, int]): + Minimum CPU-Frequency requested. + cpu_frequency_max (Union[str, int]): + Maximum CPU-Frequency requested. + cpu_frequency_governor (Union[str, int]): + CPU-Frequency Governor requested. 
+ reserved_ports (str): + Reserved ports for the step. + cluster (str): + Name of the cluster this step runs on. + srun_host (str): + Name of the host srun was executed on. + srun_process_id (int): + Process ID of the srun command. + container (str): + Path to the container OCI. + allocated_nodes (str): + Nodes the Job is using. + start_time (int): + Time this step started, as unix timestamp. + run_time (int): + Seconds this step has been running for. + partition (str): + Name of the partition this step runs in. + state (str): + State the step is in. + allocated_cpus (int): + Number of CPUs this step uses in total. + ntasks (int): + Number of tasks this step uses. + distribution (dict): + Task distribution specification for the step. + command (str): + Command that was specified with srun. + slurm_protocol_version (int): + Slurm protocol version in use. + """ + + cdef: + job_step_info_t *ptr + step_update_request_msg_t *umsg + + @staticmethod + cdef JobStep from_ptr(job_step_info_t *in_ptr) diff --git a/pyslurm/core/job/step.pyx b/pyslurm/core/job/step.pyx new file mode 100644 index 00000000..d84330b1 --- /dev/null +++ b/pyslurm/core/job/step.pyx @@ -0,0 +1,463 @@ +######################################################################### +# job/step.pyx - interface to retrieve slurm job step informations +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from libc.string cimport memcpy, memset +from pyslurm.core.common cimport cstr, ctime +from pyslurm.core.common import cstr, ctime +from pyslurm.core.common.uint cimport * +from pyslurm.core.common.uint import * +from pyslurm.core.common.ctime cimport time_t +from pyslurm.core.error import RPCError, verify_rpc +from pyslurm.core.common import ( + signal_to_num, + instance_to_dict, + uid_to_name, +) +from pyslurm.core.job.util import cpu_freq_int_to_str +from pyslurm.core.job.task_dist cimport TaskDistribution + +from pyslurm.core.common.ctime import ( + secs_to_timestr, + mins_to_timestr, + timestr_to_mins, + timestamp_to_date, + _raw_time, +) + + +cdef class JobSteps(dict): + + def __dealloc__(self): + slurm_free_job_step_info_response_msg(self.info) + + def __cinit__(self): + self.info = NULL + + def __init__(self): + pass + + @staticmethod + def load(job): + cdef Job _job + _job = Job.load(job.id) if isinstance(job, Job) else Job.load(job) + return JobSteps._load(_job) + + @staticmethod + cdef JobSteps _load(Job job): + cdef JobSteps steps = JobSteps.__new__(JobSteps) + + step_info = steps._get_info(job.id, slurm.SHOW_ALL) + if not step_info and not slurm.IS_JOB_PENDING(job.ptr): + msg = f"Failed to load step info for Job {job.id}." + raise RPCError(msg=msg) + + # No super().__init__() needed? Cython probably already initialized + # the dict automatically. 
+ steps.update(step_info.get(job.id, {})) + return steps + + cdef dict _get_info(self, uint32_t job_id, int flags): + cdef: + JobStep step + JobSteps steps + uint32_t cnt = 0 + dict out = {} + + rc = slurm_get_job_steps(0, job_id, slurm.NO_VAL, &self.info, + flags) + verify_rpc(rc) + + # zero-out a dummy job_step_info_t + memset(&self.tmp_info, 0, sizeof(job_step_info_t)) + + # Put each job-step pointer into its own "JobStep" instance. + for cnt in range(self.info.job_step_count): + step = JobStep.from_ptr(&self.info.job_steps[cnt]) + + # Prevent double free if xmalloc fails mid-loop and a MemoryError + # is raised by replacing it with a zeroed-out job_step_info_t. + self.info.job_steps[cnt] = self.tmp_info + + if not step.job_id in out: + steps = JobSteps.__new__(JobSteps) + out[step.job_id] = steps + + out[step.job_id].update({step.id: step}) + + # At this point we memcpy'd all the memory for the Steps. Setting this + # to 0 will prevent the slurm step free function to deallocate the + # memory for the individual steps. This should be fine, because they + # are free'd automatically in __dealloc__ since the lifetime of each + # step-pointer is tied to the lifetime of its corresponding JobStep + # instance. + self.info.job_step_count = 0 + + return out + + @staticmethod + def load_all(): + """Loads all the steps in the system. + + Returns: + (dict): A dict where every JobID (key) is mapped with an instance + of its JobSteps (value).
+ """ + cdef JobSteps steps = JobSteps.__new__(JobSteps) + return steps._get_info(slurm.NO_VAL, slurm.SHOW_ALL) + + +cdef class JobStep: + + def __cinit__(self): + self.ptr = NULL + self.umsg = NULL + + def __init__(self, job_id=0, step_id=0, **kwargs): + self._alloc_impl() + self.job_id = job_id.id if isinstance(job_id, Job) else job_id + self.id = step_id + + # Initialize attributes, if any were provided + for k, v in kwargs.items(): + setattr(self, k, v) + + def _alloc_info(self): + if not self.ptr: + self.ptr = try_xmalloc( + sizeof(job_step_info_t)) + if not self.ptr: + raise MemoryError("xmalloc failed for job_step_info_t") + + def _alloc_umsg(self): + if not self.umsg: + self.umsg = try_xmalloc( + sizeof(step_update_request_msg_t)) + if not self.umsg: + raise MemoryError("xmalloc failed for " + "step_update_request_msg_t") + slurm_init_update_step_msg(self.umsg) + + def _alloc_impl(self): + self._alloc_info() + self._alloc_umsg() + + def __dealloc__(self): + self._dealloc_impl() + + def _dealloc_impl(self): + slurm_free_job_step_info_members(self.ptr) + xfree(self.ptr) + slurm_free_update_step_msg(self.umsg) + self.umsg = NULL + + def __setattr__(self, name, val): + # When a user wants to set attributes on a instance that was created + # by calling JobSteps.load(), the "umsg" pointer is not yet allocated. + # We only allocate memory for it by the time the user actually wants + # to modify something. + self._alloc_umsg() + # Call descriptors __set__ directly + JobStep.__dict__[name].__set__(self, val) + + @staticmethod + def load(job_id, step_id): + """Load information for a specific job step. + + Implements the slurm_get_job_steps RPC. + + Args: + job_id (Union[Job, int]): + ID of the Job the Step belongs to. + step_id (Union[int, str]): + Step-ID for the Step to be loaded. + + Returns: + (pyslurm.JobStep): Returns a new JobStep instance + + Raises: + RPCError: When retrieving Step information from the slurmctld was + not successful.
+ MemoryError: If malloc failed to allocate memory. + + Examples: + >>> import pyslurm + >>> jobstep = pyslurm.JobStep.load(9999, 1) + """ + cdef: + job_step_info_response_msg_t *info = NULL + JobStep wrap = JobStep.__new__(JobStep) + + job_id = job_id.id if isinstance(job_id, Job) else job_id + rc = slurm_get_job_steps(0, job_id, dehumanize_step_id(step_id), + &info, slurm.SHOW_ALL) + verify_rpc(rc) + + if info and info.job_step_count == 1: + # Copy new info + wrap._alloc_impl() + memcpy(wrap.ptr, &info.job_steps[0], sizeof(job_step_info_t)) + info.job_step_count = 0 + slurm_free_job_step_info_response_msg(info) + else: + slurm_free_job_step_info_response_msg(info) + msg = f"Step {step_id} of Job {job_id} not found." + raise RPCError(msg=msg) + + return wrap + + @staticmethod + cdef JobStep from_ptr(job_step_info_t *in_ptr): + cdef JobStep wrap = JobStep.__new__(JobStep) + wrap._alloc_info() + memcpy(wrap.ptr, in_ptr, sizeof(job_step_info_t)) + return wrap + + def send_signal(self, signal): + """Send a signal to a running Job step. + + Implements the slurm_signal_job_step RPC. + + Args: + signal (Union[str, int]): + Any valid signal which will be sent to the Job. Can be either + a str like 'SIGUSR1', or simply an int. + + Raises: + RPCError: When sending the signal was not successful. + + Examples: + Specifying the signal as a string: + + >>> from pyslurm import JobStep + >>> JobStep(9999, 1).send_signal("SIGUSR1") + + or passing in a numeric signal: + + >>> JobStep(9999, 1).send_signal(9) + """ + step_id = self.ptr.step_id.step_id + sig = signal_to_num(signal) + verify_rpc(slurm_signal_job_step(self.job_id, step_id, sig)) + + def cancel(self): + """Cancel a Job step. + + Implements the slurm_kill_job_step RPC. + + Raises: + RPCError: When cancelling the Job was not successful. 
+ + Examples: + >>> from pyslurm import JobStep + >>> JobStep(9999, 1).cancel() + """ + step_id = self.ptr.step_id.step_id + verify_rpc(slurm_kill_job_step(self.job_id, step_id, 9)) + + def modify(self, step=None, **kwargs): + """Modify a job step. + + Implements the slurm_update_step RPC. + + Args: + step (JobStep): + Another JobStep object which contains all the changes that + should be applied to this instance. + **kwargs: + You can also specify all the changes as keyword arguments. + Allowed values are only attributes which can actually be set + on a JobStep instance. If a step is explicitly specified as + parameter, all **kwargs will be ignored. + + Raises: + RPCError: When updating the JobStep was not successful. + + Examples: + >>> from pyslurm import JobStep + >>> + >>> # Setting the new time-limit to 20 days + >>> changes = JobStep(time_limit="20-00:00:00") + >>> JobStep(9999, 1).modify(changes) + >>> + >>> # Or by specifying the changes directly to the modify function + >>> JobStep(9999, 1).modify(time_limit="20-00:00:00") + """ + cdef JobStep js = self + + # Allow the user to both specify changes via object and **kwargs. + if step and isinstance(step, JobStep): + js = step + elif kwargs: + js = JobStep(**kwargs) + + js._alloc_umsg() + js.umsg.step_id = self.ptr.step_id.step_id + js.umsg.job_id = self.ptr.step_id.job_id + verify_rpc(slurm_update_step(js.umsg)) + + + def as_dict(self): + """JobStep information formatted as a dictionary. 
+ + Returns: + (dict): JobStep information as dict + """ + return instance_to_dict(self) + + @property + def id(self): + return humanize_step_id(self.ptr.step_id.step_id) + + @id.setter + def id(self, val): + self.ptr.step_id.step_id = dehumanize_step_id(val) + + @property + def job_id(self): + return self.ptr.step_id.job_id + + @job_id.setter + def job_id(self, val): + self.ptr.step_id.job_id = int(val) + + @property + def name(self): + return cstr.to_unicode(self.ptr.name) + + @property + def user_id(self): + return u32_parse(self.ptr.user_id, zero_is_noval=False) + + @property + def user_name(self): + return uid_to_name(self.ptr.user_id) + + @property + def time_limit(self): + return _raw_time(self.ptr.time_limit) + + @time_limit.setter + def time_limit(self, val): + self.umsg.time_limit=self.ptr.time_limit = timestr_to_mins(val) + + @property + def network(self): + return cstr.to_unicode(self.ptr.network) + + @property + def cpu_frequency_min(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_min) + + @property + def cpu_frequency_max(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_max) + + @property + def cpu_frequency_governor(self): + return cpu_freq_int_to_str(self.ptr.cpu_freq_gov) + + @property + def reserved_ports(self): + return cstr.to_unicode(self.ptr.resv_ports) + + @property + def cluster(self): + return cstr.to_unicode(self.ptr.cluster) + + @property + def srun_host(self): + return cstr.to_unicode(self.ptr.srun_host) + + @property + def srun_process_id(self): + return u32_parse(self.ptr.srun_pid) + + @property + def container(self): + return cstr.to_unicode(self.ptr.container) + + @property + def allocated_nodes(self): + return cstr.to_list(self.ptr.nodes) + + @property + def start_time(self): + return _raw_time(self.ptr.start_time) + + @property + def run_time(self): + return _raw_time(self.ptr.run_time) + + @property + def partition(self): + return cstr.to_unicode(self.ptr.partition) + + @property + def state(self): + return 
cstr.to_unicode(slurm_job_state_string(self.ptr.state)) + + @property + def alloc_cpus(self): + return u32_parse(self.ptr.num_cpus) + + @property + def ntasks(self): + return u32_parse(self.ptr.num_tasks) + + @property + def distribution(self): + return TaskDistribution.from_int(self.ptr.task_dist) + + @property + def command(self): + return cstr.to_unicode(self.ptr.submit_line) + + @property + def slurm_protocol_version(self): + return u32_parse(self.ptr.start_protocol_ver) + + +def humanize_step_id(sid): + if sid == slurm.SLURM_BATCH_SCRIPT: + return "batch" + elif sid == slurm.SLURM_EXTERN_CONT: + return "extern" + elif sid == slurm.SLURM_INTERACTIVE_STEP: + return "interactive" + elif sid == slurm.SLURM_PENDING_STEP: + return "pending" + else: + return sid + +def dehumanize_step_id(sid): + if sid == "batch": + return slurm.SLURM_BATCH_SCRIPT + elif sid == "extern": + return slurm.SLURM_EXTERN_CONT + elif sid == "interactive": + return slurm.SLURM_INTERACTIVE_STEP + elif sid == "pending": + return slurm.SLURM_PENDING_STEP + else: + return int(sid) diff --git a/pyslurm/core/job/submission.pxd b/pyslurm/core/job/submission.pxd new file mode 100644 index 00000000..ebf0b0c5 --- /dev/null +++ b/pyslurm/core/job/submission.pxd @@ -0,0 +1,619 @@ +######################################################################### +# submission.pxd - interface for submitting slurm jobs +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + job_desc_msg_t, + slurm_init_job_desc_msg, + slurm_free_job_desc_msg, + submit_response_msg_t, + slurm_submit_batch_job, + slurm_free_submit_response_response_msg, + slurm_env_array_free, + slurm_env_array_create, + slurm_env_array_merge, + slurm_env_array_overwrite, + slurm_job_share_string, + xfree, + try_xmalloc, +) + + +cdef class JobSubmitDescription: + """Description of a Slurm Job. + + Attributes: + name (str): + Name of the Job, same as -J/--job-name from sbatch. + account (str): + Account of the job, same as -A/--account from sbatch. + user_id (Union[str, int]): + Run the job as a different User, same as --uid from sbatch. + This requires root privileges. + You can both specify the name or numeric uid of the User. + group_id (Union[str, int]): + Run the job as a different Group, same as --gid from sbatch. + This requires root privileges. + You can both specify the name or numeric gid of the User. + priority (int): + Specific priority the Job will receive. + Same as --priority from sbatch. + You can achieve the behaviour of sbatch's --hold option by + specifying a priority of 0. + site_factor (int): + Site Factor of the Job. Only used when updating an existing Job. + wckey (str): + WCKey to use with the Job, same as --wckey from sbatch. + array (str): + Job Array specification, same as -a/--array from sbatch. + batch_constraints (str): + Batch Features of a Job, same as --batch from sbatch. + begin_time (str): + Defer allocation until the specified time, same as --begin from + sbatch. 
+ clusters (Union[list, str]): + Clusters the job may run on, same as -M/--clusters from sbatch. + cluster_constraints (str): + Comma-separated str with cluster constraints for the job. + This is the same as --cluster-constraint from sbatch. + comment (str): + Arbitrary job comment, same as --comment from sbatch. + admin_comment (str): + Arbitrary job admin comment. + Only used when updating an existing job. + requires_contiguous_nodes (bool): + Whether allocated Nodes are required to form a contiguous set. + Same as --contiguous from sbatch. + cores_reserved_for_system (int): + Count of cores reserved for system not usable by the Job. + Same as -S/--core-spec from sbatch. + Mutually exclusive with `threads_reserved_for_system`. + threads_reserved_for_system (int): + Count of threads reserved for system not usable by the Job. + Same as --thread-spec from sbatch. + Mutually exclusive with `cores_reserved_for_system`. + working_directory (str): + Work directory for the Job. Default is current work-dir from where + the job was submitted. + Same as -D/--chdir from sbatch. + cpu_frequency (Union[dict, str]): + CPU Frequency for the Job, same as --cpu-freq from sbatch. + + Examples: + Specifying it as a dict: + + cpu_frequency = { + "min": "Low", + "max": "High", + "governor": "UserSpace" + } + + or like in sbatch with a string. For more info on that, check + out the sbatch documentation for --cpu-freq. + + If you only want to set a Governor without any min or max, you + can simply specify it as a standalone string: + + cpu_frequency = "Performance" + or + cpu_frequency = {"governor": "Performance"} + + If you want to set a specific, fixed frequency, you can do: + + cpu_frequency = + or either + cpu_frequency = {"max": } or cpu_freq = {"min": } + nodes (Union[dict, str, int]): + Amount of nodes needed for the job. + This is the same as -N/--nodes from sbatch. 
+ + Examples: + Providing min/max nodes as a dict: + + nodes = { + "min": 3, + "max": 6 + } + + When no range is needed, you can also simply specify it as + int: + + nodes = 3 + + Other than that, a range can also be specified in a str like + with sbatch: + + nodes = "1-5" + deadline (str): + Deadline specification for the Job, same as --deadline from + sbatch. + delay_boot_time (Union[str, int]): + Delay boot specification for the Job, same as --delay-boot from + sbatch. + dependencies (Union[dict, str]): + Dependencies for the Job, same as -d/--dependency from sbatch. + excluded_nodes (Union[list, str]): + Exclude specific nodes for this Job. + This is the same as -x/--exclude from sbatch. + required_nodes (Union[list, str]): + Specific list of nodes required for the Job. + This is the same as -w/--nodelist from sbatch. + constraints (str): + Required node features for the Job. + This is the same as -C/--constraint from sbatch. + kill_on_node_fail (bool): + Should the job get killed if one of the Nodes fails? + This is the same as -k/--no-kill from sbatch. + licenses (Union[list, str]): + A list of licenses for the Job. + This is the same as -L/--licenses from sbatch. + mail_user (Union[list, str]): + List of email addresses for notifications. + This is the same as --mail-user from sbatch. + mail_types (Union[list, str]): + List of mail flags. + This is the same as --mail-type from sbatch. + mcs_label (str): + An MCS Label for the Job. + This is the same as --mcs-label from sbatch. + memory_per_cpu (Union[str, int]): + Memory required per allocated CPU. + + The default unit is in Mebibytes. You are also able to specify + unit suffixes like K|M|G|T. + This is the same as --mem-per-cpu from sbatch. This is mutually + exclusive with memory_per_node and memory_per_gpu. + + Examples: + # 1024 MiB + memory_per_cpu = 1024 + + # 3 GiB + memory_per_cpu = "3G" + memory_per_node (Union[str, int]): + Memory required per whole node. + + The default unit is in Mebibytes.
You are also able to specify + unit suffixes like K|M|G|T. + This is the same as --mem from sbatch. This is mutually exclusive + with memory_per_cpu and memory_per_gpu. + + Examples: + # 1024 MiB + memory_per_node = 1024 + + # 3 GiB + memory_per_node = "3G" + memory_per_gpu (Union[str, int]): + Memory required per GPU. + + The default unit is in Mebibytes. You are also able to specify + unit suffixes like K|M|G|T. + This is the same as --mem-per-gpu from sbatch. This is mutually + exclusive with memory_per_node and memory_per_cpu. + + Examples: + # 1024 MiB + memory_per_gpu = 1024 + + # 3 GiB + memory_per_gpu = "3G" + network (str): + Network types for the Job. + This is the same as --network from sbatch. + nice (int): + Adjusted scheduling priority for the Job. + This is the same as --nice from sbatch. + log_files_open_mode (str): + Mode in which standard_output and standard_error log files should be opened. + + Valid options are: + * append + * truncate + + This is the same as --open-mode from sbatch. + overcommit (bool): + If the resources should be overcommitted. + This is the same as -O/--overcommit from sbatch. + partitions (Union[list, str]): + A list of partitions the Job may use. + This is the same as -p/--partition from sbatch. + power_options (list): + A list of power management plugin options for the Job. + This is the same as --power from sbatch. + accounting_gather_frequency (Union[dict, str]): + Interval for accounting info to be gathered. + This is the same as --acctg-freq from sbatch. + + Examples: + Specifying it as a dict: + + accounting_gather_frequency = { + "energy": 60, + "network": 20, + } + + or as a single string: + + accounting_gather_frequency = "energy=60,network=20" + qos (str): + Quality of Service for the Job. + This is the same as -q/--qos from sbatch. + requires_node_reboot (bool): + Force the allocated nodes to reboot before the job starts. + This is the same as --reboot from sbatch.
+ is_requeueable (bool): + If the Job is eligible for requeuing. + This is the same as --requeue from sbatch. + reservations (Union[list, str]): + A list of possible reservations the Job can use. + This is the same as --reservation from sbatch. + script (str): + Absolute Path or content of the batch script. + + You can specify either a path to a script which will be loaded, or + you can pass the script as a string. + If the script is passed as a string, providing arguments to it + (see "script_args") is not supported. + script_args (str): + Arguments passed to the batch script. + You can only set arguments if a file path was specified for + "script". + environment (Union[dict, str]): + Environment variables to be set for the Job. + This is the same as --export from sbatch. + resource_sharing (str): + Controls the resource sharing with other Jobs. + + This property combines functionality of --oversubscribe and + --exclusive from sbatch. + + Allowed values are: + + * "oversubscribe" or "yes": + The Job allows resources to be shared with other running Jobs. + + * "user" + Only sharing resources with other Jobs that have the "user" + option set is allowed. + + * "mcs" + Only sharing resources with other Jobs that have the "mcs" + option set is allowed. + + * "no" or "exclusive" + No sharing of resources is allowed. (--exclusive from sbatch) + distribution (Union[dict, str]): + TODO + time_limit (str): + The time limit for the job. + This is the same as -t/--time from sbatch. + time_limit_min (str): + A minimum time limit for the Job. + This is the same as --time-min from sbatch. + container (str): + Path to an OCI container bundle. + This is the same as --container from sbatch. + cpus_per_task (int): + The amount of cpus required for each task. + + This is the same as -c/--cpus-per-task from sbatch. + This is mutually exclusive with cpus_per_gpu. + cpus_per_gpu (int): + The amount of cpus required for each allocated GPU.
+ + This is the same as --cpus-per-gpu from sbatch. + This is mutually exclusive with cpus_per_task. + sockets_per_node (int): + Restrict Job to nodes with at least this many sockets. + This is the same as --sockets-per-node from sbatch. + cores_per_socket (int): + Restrict Job to nodes with at least this many cores per socket. + This is the same as --cores-per-socket from sbatch. + threads_per_core (int): + Restrict Job to nodes with at least this many threads per core. + This is the same as --threads-per-core from sbatch. + gpus (Union[dict, str, int]): + GPUs for the Job to be allocated in total. + + This is the same as -G/--gpus from sbatch. + Specifying the type of the GPU is optional. + + Examples: + Specifying the GPU counts as a dict: + + gpus = { + "tesla": 1, + "volta": 5, + } + + Or, for example, in string format: + + gpus = "tesla:1,volta:5" + + Or, if you don't care about the type of the GPU: + + gpus = 6 + gpus_per_socket (Union[dict, str, int]): + GPUs for the Job to be allocated per socket. + + This is the same as --gpus-per-socket from sbatch. + + Specifying the type of the GPU is optional. Note that setting + gpus_per_socket requires to also specify sockets_per_node. + + Examples: + Specifying it as a dict: + + gpus_per_socket = { + "tesla": 1, + "volta": 5, + } + + Or, for example, in string format: + + gpus_per_socket = "tesla:1,volta:5" + + Or, if you don't care about the type of the GPU: + + gpus_per_socket = 6 + gpus_per_task (Union[dict, str, int]): + GPUs for the Job to be allocated per task. + + This is the same as --gpus-per-task from sbatch. + + Specifying the type of the GPU is optional. Note that setting + "gpus_per_task" requires to also specify either one of "ntasks" or + "gpus".
+ + Examples: + Specifying it as a dict: + + gpus_per_task = { + "tesla": 1, + "volta": 5, + } + + Or, for example, in string format: + + gpus_per_task = "tesla:1,volta:5" + + Or, if you don't care about the type of the GPU: + + gpus_per_task = 6 + gres_per_node (Union[dict, str]): + Generic resources to be allocated per node. + + This is the same as --gres from sbatch. You should also use this + option if you want to specify GPUs per node (--gpus-per-node). + Specifying the type (by separating GRES name and type with a + colon) is optional. + + Examples: + Specifying it as a dict: + + gres_per_node = { + "gpu:tesla": 1, + "gpu:volta": 5, + } + + Or, for example, in string format: + + gres_per_node = "gpu:tesla:1,gpu:volta:5" + + GPU Gres without a specific type: + + gres_per_node = "gpu:6" + gpu_binding (str): + Specify GPU binding for the Job. + This is the same as --gpu-bind from sbatch. + ntasks (int): + Maximum amount of tasks for the Job. + This is the same as -n/--ntasks from sbatch. + ntasks_per_node (int): + Amount of tasks to be invoked on each node. + This is the same as --ntasks-per-node from sbatch. + ntasks_per_socket (int): + Maximum amount of tasks to be invoked on each socket. + This is the same as --ntasks-per-socket from sbatch. + ntasks_per_core (int): + Maximum amount of tasks to be invoked on each core. + This is the same as --ntasks-per-core from sbatch. + ntasks_per_gpu (int): + Amount of tasks to be invoked per GPU. + This is the same as --ntasks-per-gpu from sbatch. + switches (Union[dict, str, int]): + Maximum amount of leaf switches and wait time desired. + + This can also optionally include a maximum waiting time for these + switches. + This is the same as --switches from sbatch. + + Examples: + Specifying it as a dict: + + switches = { "count": 5, "max_wait_time": "00:10:00" } + + Or as a single string (sbatch-style): + + switches = "5@00:10:00" + signal (Union[dict, str]): + Warn signal to be sent to the Job.
+ + This is the same as --signal from sbatch. + The signal can both be specified with its name, e.g. "SIGKILL", or + as a number, e.g. 9 + + Examples: + Specifying it as a dict: + + signal = { + "signal": "SIGKILL", + "time": 120 + } + + The above will send a "SIGKILL" signal 120 seconds before the + Jobs' time limit is reached. + + Or, specifying it as a string (sbatch-style): + + signal = "SIGKILL@120" + standard_in (str): + Path to a File acting as standard_in for the batch-script. + This is the same as -i/--input from sbatch. + standard_error (str): + Path to a File to write the Jobs standard_error. + This is the same as -e/--error from sbatch. + standard_output (str): + Path to a File to write the Jobs standard_output. + This is the same as -o/--output from sbatch. + kill_on_invalid_dependency (bool): + Kill the job if it has an invalid dependency. + This is the same as --kill-on-invalid-dep from sbatch. + spreads_over_nodes (bool): + Spread the Job over as many nodes as possible. + This is the same as --spread-job from sbatch. + use_min_nodes (bool): + Prefer the minimum amount of nodes specified. + This is the same as --use-min-nodes from sbatch. + gres_binding (str): + Generic resource task binding options. + This is the --gres-flags option from sbatch. + + Possible values are: + * "enforce-binding" + * "disable-binding" + temporary_disk_per_node (Union[str, int]): + Amount of temporary disk space needed per node. + + This is the same as --tmp from sbatch. You can specify units like + K|M|G|T (multiples of 1024). + If no unit is specified, the value will be assumed as Mebibytes. + + Examples: + # 2048 MiB + temporary_disk_per_node = "2G" + + # 1024 MiB + temporary_disk_per_node = 1024 + get_user_environment (Union[str, bool, int]): + TODO + min_cpus_per_node (str): + Set the minimum amount of CPUs required per Node. + This is the same as --mincpus from sbatch. + wait_all_nodes (bool): + Controls when the execution of the command begins.
+ + A value of True means that the Job should begin execution only + after all nodes in the allocation are ready. Setting it to False, + the default, means that it is not waited for the nodes to be + ready. (i.e booted) + """ + cdef: + slurm.job_desc_msg_t *ptr + is_update + + cdef public: + name + account + user_id + group_id + priority + site_factor + wckey + array + batch_constraints + begin_time + clusters + cluster_constraints + comment + admin_comment + requires_contiguous_nodes + cores_reserved_for_system + threads_reserved_for_system + working_directory + cpu_frequency + nodes + deadline + delay_boot_time + dependencies + excluded_nodes + required_nodes + constraints + kill_on_node_fail + licenses + mail_user + mail_types + mcs_label + memory_per_cpu + memory_per_node + memory_per_gpu + network + nice + log_files_open_mode + overcommit + partitions + power_options + profile_types + accounting_gather_frequency + qos + requires_node_reboot + is_requeueable + reservations + script + script_args + environment + resource_sharing + distribution + time_limit + time_limit_min + container + cpus_per_task + cpus_per_gpu + sockets_per_node + cores_per_socket + threads_per_core + gpus + gpus_per_socket + gpus_per_task + gres_per_node + gpu_binding + ntasks + ntasks_per_node + ntasks_per_socket + ntasks_per_core + ntasks_per_gpu + switches + signal + standard_in + standard_output + standard_error + kill_on_invalid_dependency + spreads_over_nodes + use_min_nodes + gres_binding + temporary_disk_per_node + get_user_environment + min_cpus_per_node + wait_all_nodes diff --git a/pyslurm/core/job/submission.pyx b/pyslurm/core/job/submission.pyx new file mode 100644 index 00000000..e1f4039d --- /dev/null +++ b/pyslurm/core/job/submission.pyx @@ -0,0 +1,682 @@ +######################################################################### +# submission.pyx - interface for submitting slurm jobs +######################################################################### +# Copyright (C) 
2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from os import getcwd +from os import environ as pyenviron +import re +import typing +import shlex +from pathlib import Path +from pyslurm.core.common cimport cstr, ctime +from pyslurm.core.common import cstr +from pyslurm.core.common.uint cimport * +from pyslurm.core.common.uint import * +from pyslurm.core.common.ctime cimport time_t +from pyslurm.core.job.util import * +from pyslurm.core.error import RPCError, verify_rpc +from pyslurm.core.job.sbatch_opts import _parse_opts_from_batch_script +from pyslurm.core.common.ctime import ( + secs_to_timestr, + timestr_to_secs, + mins_to_timestr, + timestr_to_mins, + timestamp_to_date, + date_to_timestamp, +) +from pyslurm.core.job.task_dist cimport TaskDistribution + +from pyslurm.core.common import ( + humanize, + dehumanize, + signal_to_num, + user_to_uid, + group_to_gid, + uid_to_name, + gid_to_name, +) + + +cdef class JobSubmitDescription: + def __cinit__(self): + self.ptr = NULL + + def __init__(self, **kwargs): + # Initialize explicitly provided attributes, if any. 
+ for k, v in kwargs.items(): + setattr(self, k, v) + + def __dealloc__(self): + slurm_free_job_desc_msg(self.ptr) + + def _alloc_and_init(self): + slurm_free_job_desc_msg(self.ptr) + + self.ptr = try_xmalloc(sizeof(job_desc_msg_t)) + if not self.ptr: + raise MemoryError("xmalloc for job_desc_msg_t failed.") + + slurm_init_job_desc_msg(self.ptr) + + def submit(self): + """Submit a batch job description. + + Returns: + (int): The ID of the submitted Job. + + Raises: + RPCError: When the job submission was not successful. + MemoryError: If malloc failed to allocate enough memory. + + Examples: + >>> desc = JobSubmitDescription( + >>> name="test-job", + >>> cpus_per_task=1, + >>> time_limit="10-00:00:00") + >>> + >>> job_id = desc.submit() + """ + cdef submit_response_msg_t *resp = NULL + + self._create_job_submit_desc() + verify_rpc(slurm_submit_batch_job(self.ptr, &resp)) + + job_id = resp.job_id + slurm_free_submit_response_response_msg(resp) + + return job_id + + def load_environment(self, overwrite=False): + """Load values of attributes provided through the environment. + + Args: + overwrite (bool): + If set to True, the value from an option found in the + environment will override its current value. Default is False + """ + self._parse_env(overwrite) + + def load_sbatch_options(self, overwrite=False): + """Load values from #SBATCH options in the batch script. + + Args: + overwrite (bool): + If set to True, the value from an option found in the in the + batch script will override its current value. Default is False + """ + _parse_opts_from_batch_script(self, self.script, overwrite) + + def _parse_env(self, overwrite=False): + for attr in dir(self): + if attr.startswith("_") or callable(attr): + # Ignore everything starting with "_" and all functions. + # Arguments directly specified upon object creation will + # always have precedence. 
+ continue + + spec = attr.upper() + val = pyenviron.get(f"PYSLURM_JOBDESC_{spec}") + if (val is not None + and (getattr(self, attr) is None or overwrite)): + + # Just convert literal true/false strings to bool. + tmp = val.casefold() + if tmp == "true": + val = True + elif tmp == "false": + val = False + + setattr(self, attr, val) + + def _create_job_submit_desc(self, is_update=False): + self.is_update = is_update + self._alloc_and_init() + cdef slurm.job_desc_msg_t *ptr = self.ptr + + if not self.is_update: + self._validate_options() + self._set_defaults() + + if self.nice: + ptr.nice = slurm.NICE_OFFSET + int(self.nice) + + if self.site_factor: + ptr.site_factor = slurm.NICE_OFFSET + int(self.site_factor) + + if self.user_id is not None: + ptr.user_id = user_to_uid(self.user_id) + if self.group_id is not None: + ptr.group_id = group_to_gid(self.group_id) + + cstr.fmalloc(&ptr.name, self.name) + cstr.fmalloc(&ptr.account, self.account) + cstr.fmalloc(&ptr.wckey, self.wckey) + cstr.fmalloc(&ptr.array_inx, self.array) + cstr.fmalloc(&ptr.batch_features, self.batch_constraints) + cstr.fmalloc(&ptr.cluster_features, self.cluster_constraints) + cstr.fmalloc(&ptr.comment, self.comment) + cstr.fmalloc(&ptr.work_dir, self.working_directory) + cstr.fmalloc(&ptr.features, self.constraints) + cstr.fmalloc(&ptr.mail_user, self.mail_user) + cstr.fmalloc(&ptr.mcs_label, self.mcs_label) + cstr.fmalloc(&ptr.network, self.network) + cstr.fmalloc(&ptr.qos, self.qos) + cstr.fmalloc(&ptr.container, self.container) + cstr.fmalloc(&ptr.std_in, self.standard_in) + cstr.fmalloc(&ptr.std_out, self.standard_output) + cstr.fmalloc(&ptr.std_err, self.standard_error) + cstr.fmalloc(&ptr.tres_per_job, cstr.from_gres_dict(self.gpus, "gpu")) + cstr.fmalloc(&ptr.tres_per_socket, + cstr.from_gres_dict(self.gpus_per_socket, "gpu")) + cstr.fmalloc(&ptr.tres_per_task, + cstr.from_gres_dict(self.gpus_per_task, "gpu")) + cstr.fmalloc(&ptr.tres_per_node, + cstr.from_gres_dict(self.gres_per_node)) + 
cstr.fmalloc(&ptr.cpus_per_tres, + cstr.from_gres_dict(self.cpus_per_gpu, "gpu")) + cstr.fmalloc(&ptr.admin_comment, self.admin_comment) + + cstr.from_list(&ptr.clusters, self.clusters) + cstr.from_list(&ptr.exc_nodes, self.excluded_nodes) + cstr.from_list(&ptr.req_nodes, self.required_nodes) + cstr.from_list(&ptr.licenses, self.licenses) + cstr.from_list(&ptr.partition, self.partitions) + cstr.from_list(&ptr.reservation, self.reservations) + cstr.from_dict(&ptr.acctg_freq, self.accounting_gather_frequency) + + ptr.deadline = date_to_timestamp(self.deadline) + ptr.begin_time = date_to_timestamp(self.begin_time) + ptr.delay_boot = timestr_to_secs(self.delay_boot_time) + ptr.time_limit = timestr_to_mins(self.time_limit) + ptr.time_min = timestr_to_mins(self.time_limit_min) + + ptr.priority = u32(self.priority, zero_is_noval=False) + ptr.num_tasks = u32(self.ntasks) + ptr.pn_min_tmp_disk = u32(dehumanize(self.temporary_disk_per_node)) + ptr.cpus_per_task = u16(self.cpus_per_task) + ptr.sockets_per_node = u16(self.sockets_per_node) + ptr.cores_per_socket = u16(self.cores_per_socket) + ptr.ntasks_per_socket = u16(self.ntasks_per_socket) + ptr.ntasks_per_tres = u16(self.ntasks_per_gpu) + ptr.ntasks_per_node = u16(self.ntasks_per_node) + ptr.threads_per_core = u16(self.threads_per_core) + ptr.ntasks_per_core = u16(self.ntasks_per_core) + u64_set_bool_flag(&ptr.bitflags, self.spreads_over_nodes, + slurm.SPREAD_JOB) + u64_set_bool_flag(&ptr.bitflags, self.kill_on_invalid_dependency, + slurm.KILL_INV_DEP) + u64_set_bool_flag(&ptr.bitflags, self.use_min_nodes, + slurm.USE_MIN_NODES) + ptr.contiguous = u16_bool(self.requires_contiguous_nodes) + ptr.kill_on_node_fail = u16_bool(self.kill_on_node_fail) + ptr.overcommit = u8_bool(self.overcommit) + ptr.reboot = u16_bool(self.requires_node_reboot) + ptr.requeue = u16_bool(self.is_requeueable) + ptr.wait_all_nodes = u16_bool(self.wait_all_nodes) + + ptr.mail_type = mail_type_list_to_int(self.mail_types) + ptr.power_flags = 
power_type_list_to_int(self.power_options) + ptr.profile = acctg_profile_list_to_int(self.profile_types) + ptr.shared = shared_type_str_to_int(self.resource_sharing) + + self._set_cpu_frequency() + self._set_nodes() + self._set_dependencies() + self._set_memory() + self._set_open_mode() + self._set_script() + self._set_script_args() + self._set_environment() + self._set_distribution() + self._set_gpu_binding() + self._set_gres_binding() + self._set_min_cpus() + + # TODO + # burst_buffer + # mem_bind, mem_bind_type? + # gpu_freq + # --hint + # spank_env + # --propagate for rlimits + + def _set_defaults(self): + if not self.ntasks: + self.ntasks = 1 + if not self.cpus_per_task: + self.cpus_per_task = 1 + if not self.working_directory: + self.working_directory = str(getcwd()) + if not self.environment: + # By default, sbatch also exports everything in the users env. + self.environment = "ALL" + + def _validate_options(self): + if not self.script: + raise ValueError("You need to provide a batch script.") + + if (self.memory_per_node and self.memory_per_cpu + or self.memory_per_gpu and self.memory_per_cpu + or self.memory_per_node and self.memory_per_gpu): + raise ValueError("Only one of memory_per_cpu, memory_per_node or " + "memory_per_gpu can be set.") + + if (self.ntasks_per_gpu and + (self.ptr.min_nodes != u32(None) or self.nodes + or self.gpus_per_task or self.gpus_per_socket + or self.ntasks_per_node)): + raise ValueError("ntasks_per_gpu is mutually exclusive with " + "nodes, gpus_per_task, gpus_per_socket and " + "ntasks_per_node.") + + if self.cpus_per_gpu and self.cpus_per_task: + raise ValueError("cpus_per_task and cpus_per_gpu " + "are mutually exclusive.") + + if (self.cores_reserved_for_system + and self.threads_reserved_for_system): + raise ValueError("cores_reserved_for_system is mutually " + " exclusive with threads_reserved_for_system.") + + def _set_core_spec(self): + if self.cores_reserved_for_system: + self.ptr.core_spec = 
u16(self.cores_reserved_for_system) + elif self.threads_reserved_for_system: + self.ptr.core_spec = u16(self.threads_reserved_for_system) + self.ptr.core_spec |= slurm.CORE_SPEC_THREAD + + def _set_cpu_frequency(self): + if not self.cpu_frequency: + return None + + freq = self.cpu_frequency + have_no_range = False + + # Alternatively support sbatch-like --cpu-freq setting. + if not isinstance(freq, dict): + freq_splitted = re.split("[-:]+", str(freq)) + freq_len = len(freq_splitted) + freq = {} + + # Transform cpu-freq string to the individual components. + if freq_splitted[0].isdigit(): + freq["max"] = freq_splitted[0] + else: + if freq_len > 1: + raise ValueError( + f"Invalid cpu_frequency format: {self.cpu_frequency}. " + "Governor must be provided as single element or " + "as last element in the form of min-max:governor. " + ) + freq["governor"] = freq_splitted[0] + + if freq_len >= 2: + freq["min"] = freq["max"] + freq["max"] = freq_splitted[1] + + if freq_len == 3: + freq["governor"] = freq_splitted[2] + + freq_min = cpu_freq_str_to_int(freq.get("min")) + freq_max = cpu_freq_str_to_int(freq.get("max")) + freq_gov = cpu_gov_str_to_int(freq.get("governor")) + + if freq_min != u32(None): + if freq_max == u32(None): + freq_max = freq_min + freq_min = u32(None) + have_no_range = True + elif freq_max < freq_min: + raise ValueError( + f"min cpu-freq ({freq_min}) must be smaller " + f"than max cpu-freq ({freq_max})" + ) + elif freq_max != u32(None) and freq_min == u32(None): + have_no_range = True + + if have_no_range and freq_gov != u32(None): + raise ValueError( + "Setting Governor when specifying only either one " + "of min or max is not allowed." 
+ ) + + self.ptr.cpu_freq_min = freq_min + self.ptr.cpu_freq_max = freq_max + self.ptr.cpu_freq_gov = freq_gov + + def _set_nodes(self): + vals = self.nodes + nmin=nmax = 1 + + if self.is_update: + return None + + # Support input like --nodes from sbatch (min-[max]) + if isinstance(vals, dict): + nmin = u32(vals.get("min", 1), on_noval=1) + nmax = u32(vals.get("max", 1), on_noval=nmin) + elif vals is not None: + v = str(vals).split("-", 1) + nmin = int(v[0]) + if nmin == 0: + nmin = 1 + if "-" in str(vals): + nmax = int(v[1]) + else: + nmax = nmin + + if not nmax: + nmax = nmin + if nmax < nmin: + raise ValueError("Max Nodecount cannot be " + "less than minimum nodecount.") + + self.ptr.min_nodes = nmin + self.ptr.max_nodes = nmax + + def _set_dependencies(self): + val = self.dependencies + final = None + + if isinstance(val, str): + # TODO: Even though everything is checked in the slurmctld, maybe + # still do some sanity checks here on the input when a string + # is provided. + final = val + elif val is not None: + satisfy = val.pop("satisfy", "all").casefold() + + if satisfy == "any": + delim = "?" + else: + delim = "," + + final = [] + for k, v in val.items(): + if k == "singleton" and bool(v): + final.append("singleton") + continue + + if not isinstance(v, list): + raise TypeError(f"Values for {k} must be list, " + f"got {type(v)}.") + # Convert everything to strings and add it to the dependency + # list. 
+ v[:] = [str(s) for s in v] + final.append(f"{k}:{':'.join(v)}") + + final = delim.join(final) + + cstr.fmalloc(&self.ptr.dependency, final) + + def _set_memory(self): + if self.memory_per_cpu: + self.ptr.pn_min_memory = u64(dehumanize(self.memory_per_cpu)) + self.ptr.pn_min_memory |= slurm.MEM_PER_CPU + elif self.memory_per_node: + self.ptr.pn_min_memory = u64(dehumanize(self.memory_per_node)) + elif self.memory_per_gpu: + mem_gpu = u64(dehumanize(self.memory_per_gpu)) + cstr.fmalloc(&self.ptr.mem_per_tres, f"gres:gpu:{mem_gpu}") + + def _set_open_mode(self): + val = self.log_files_open_mode + if val == "append": + self.ptr.open_mode = slurm.OPEN_MODE_APPEND + elif val == "truncate": + self.ptr.open_mode = slurm.OPEN_MODE_TRUNCATE + + def _set_script(self): + sfile = self.script + sbody = None + + if self.is_update: + return None + + if Path(sfile).is_file(): + # First assume the caller is passing a path to a script and we try + # to load it. + sbody = Path(sfile).read_text() + else: + # Otherwise assume that the script content is passed directly. + sbody = sfile + if self.script_args: + raise ValueError("Passing arguments to a script is only allowed " + "if it was loaded from a file.") + + # Validate the script + if not sbody or not len(sbody): + raise ValueError("Batch script is empty or none was provided.") + elif sbody.isspace(): + raise ValueError("Batch script contains only whitespace.") + elif not sbody.startswith("#!"): + msg = "Not a valid Batch script. " + msg += "First line must start with '#!'," + msg += "followed by the path to an interpreter" + raise ValueError(msg) + elif "\0" in sbody: + msg = "The Slurm Controller does not allow scripts that " + msg += "contain a NULL character: '\\0'." + raise ValueError(msg) + elif "\r\n" in sbody: + msg = "Batch script contains DOS line breaks (\\r\\n) " + msg += "instead of expected UNIX line breaks (\\n)." 
+ raise ValueError(msg) + + cstr.fmalloc(&self.ptr.script, sbody) + + def _set_script_args(self): + args = self.script_args + if not args: + return None + + if isinstance(args, str): + sargs = shlex.split(args) + else: + sargs = list(args) + + # Script should always first in argv. + if sargs[0] != self.script: + sargs.insert(0, self.script) + + self.ptr.argc = len(sargs) + self.ptr.argv = try_xmalloc(self.ptr.argc * sizeof(char*)) + if not self.ptr.argv: + raise MemoryError("xmalloc failed for script_args") + + for idx, opt in enumerate(sargs): + cstr.fmalloc(&self.ptr.argv[idx], opt) + + def _set_environment(self): + if self.is_update: + return None + + vals = self.environment + get_user_env = self.get_user_environment + + # Clear any previous environment set for the Job. + slurm_env_array_free(self.ptr.environment) + self.ptr.env_size = 0 + + # Allocate a new environment. + self.ptr.environment = slurm_env_array_create() + + if isinstance(vals, str) or vals is None: + if vals is None or vals.casefold() == "all": + # This is the default. Export all current environment + # variables into the Job. + slurm_env_array_merge(&self.ptr.environment, + slurm.environ) + elif vals.casefold() == "none": + # Only env variables starting with "SLURM_" will be exported. + for var, val in pyenviron.items(): + if var.startswith("SLURM_"): + slurm_env_array_overwrite(&self.ptr.environment, + var, str(val)) + get_user_env = True + else: + # Assume Env-vars were provided sbatch style like a string. + # Setup all 'SLURM' env vars found first. + for var, val in pyenviron.items(): + if var.startswith("SLURM_"): + slurm_env_array_overwrite(&self.ptr.environment, + var, str(val)) + + # Merge the provided environment variables from the string in. 
+ for idx, item in enumerate(vals.split(",")): + if idx == 0 and item.casefold() == "all": + slurm_env_array_merge(&self.ptr.environment, + slurm.environ) + continue + + if not "=" in item: + continue + + var, val = item.split("=", 1) + slurm_env_array_overwrite(&self.ptr.environment, + var, str(val)) + get_user_env = True + else: + # Here, the user provided an actual dictionary as Input. + # Setup all 'SLURM' env vars first. + for var, val in pyenviron.items(): + if var.startswith("SLURM_"): + slurm_env_array_overwrite(&self.ptr.environment, + var, str(val)) + + # Setup all User selected env vars. + for var, val in vals.items(): + slurm_env_array_overwrite(&self.ptr.environment, + var, str(val)) + + if get_user_env: + slurm_env_array_overwrite(&self.ptr.environment, + "SLURM_GET_USER_ENV", "1") + + # Calculate Environment size + while self.ptr.environment and self.ptr.environment[self.ptr.env_size]: + self.ptr.env_size+=1 + + def _set_distribution(self): + dist=plane = None + + if not self.distribution: + self.ptr.task_dist = slurm.SLURM_DIST_UNKNOWN + return None + + if isinstance(self.distribution, int): + # Assume the user meant to specify the plane size only. 
+ plane = u16(self.distribution) + elif isinstance(self.distribution, str): + # Support sbatch style string input + dist = TaskDistribution.from_str(self.distribution) + plane = dist.plane if isinstance(dist.plane, int) else 0 + + if plane: + self.ptr.plane_size = plane + self.ptr.task_dist = slurm.SLURM_DIST_PLANE + elif dist is not None: + self.ptr.task_dist = dist.as_int() + + def _set_gpu_binding(self): + binding = self.gpu_binding + + if not binding: + if self.ptr.ntasks_per_tres != u16(None): + # Set gpu bind implicit to single:ntasks_per_gpu + binding = f"single:{self.ntasks_per_gpu}" + else: + binding = self.gpu_binding.replace("verbose,", "") \ + .replace("gpu:", "") + if "verbose" in self.gpu_binding: + binding = f"verbose,gpu:{binding}" + + cstr.fmalloc(&self.ptr.tres_bind, binding) + + def _set_min_cpus(self): + if self.min_cpus_per_node: + self.ptr.min_cpus = u16(self.min_cpus_per_node) + elif not self.is_update: + if self.overcommit: + self.ptr.min_cpus = max(self.ptr.min_nodes, 1) + else: + self.ptr.min_cpus = self.ptr.cpus_per_task * self.ptr.num_tasks + + def _set_switches(self): + kwargs = self.switches + if isinstance(kwargs, dict): + self.ptr.req_switch = u32(kwargs.get("count")) + self.ptr.wait4switch = timestr_to_secs(kwargs.get("max_wait_time")) + elif kwargs is not None: + vals = str(kwargs).split("@") + if len(vals) > 1: + self.ptr.wait4switch = timestr_to_secs(vals[1]) + self.ptr.req_switch = u32(vals[0]) + + def _set_signal(self): + vals = self.signal + if not vals: + return None + + info = vals + # This supports input like the --signal option from sbatch + if vals and not isinstance(vals, dict): + info = {} + val_list = re.split("[:@]+", str(vals)) + + if len(val_list): + if ":" in str(vals): + flags = val_list.pop(0).casefold() + + if "r" in flags: + info["allow_reservation_overlap"] = True + + if "b" in flags: + info["batch_only"] = True + + if "@" in str(vals): + info["time"] = val_list[1] + + info["signal"] = val_list[0] + + # Parse values 
first to catch bad input + w_signal = u16(signal_to_num(info.get("signal"))) + w_time = u16(info.get("time"), on_noval=60) + batch_only = bool(info.get("batch_only")) + allow_resv_overlap = bool(info.get("allow_reservation_overlap")) + + # Then set it. At this point we can be sure that the input is correct. + self.ptr.warn_signal = w_signal + self.ptr.warn_time = w_time + u16_set_bool_flag(&self.ptr.warn_flags, + batch_only, slurm.KILL_JOB_BATCH) + u16_set_bool_flag(&self.ptr.warn_flags, + allow_resv_overlap, slurm.KILL_JOB_RESV) + + def _set_gres_binding(self): + if not self.gres_binding: + return None + elif self.gres_binding.casefold() == "enforce-binding": + self.ptr.bitflags |= slurm.GRES_ENFORCE_BIND + elif self.gres_binding.casefold() == "disable-binding": + self.ptr.bitflags |= slurm.GRES_DISABLE_BIND diff --git a/pyslurm/core/job/task_dist.pxd b/pyslurm/core/job/task_dist.pxd new file mode 100644 index 00000000..5fe76488 --- /dev/null +++ b/pyslurm/core/job/task_dist.pxd @@ -0,0 +1,41 @@ +######################################################################### +# task_dist.pxd - job task distribution +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.core.common.uint cimport u16 +from pyslurm.slurm cimport ( + task_dist_states_t, +) + + +cdef class TaskDistribution: + + cdef public: + str nodes + str sockets + str cores + plane + pack + + cdef task_dist_states_t state diff --git a/pyslurm/core/job/task_dist.pyx b/pyslurm/core/job/task_dist.pyx new file mode 100644 index 00000000..0c46cbc8 --- /dev/null +++ b/pyslurm/core/job/task_dist.pyx @@ -0,0 +1,352 @@ +######################################################################### +# task_dist.pyx - job task distribution +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + + +cdef class TaskDistribution: + + def __init__(self, nodes="block", sockets="cyclic", + cores=None, pack=None, plane_size=None): + self.nodes = nodes + self.sockets = sockets + self.cores = cores if cores else self.sockets + self.pack = pack + self.plane = plane_size + self.state = self._get_task_dist_state() + + def __eq__(self, other): + if not isinstance(other, TaskDistribution): + return NotImplemented + return self.as_int() == other.as_int() + + @staticmethod + def from_int(dist): + cdef TaskDistribution tdist = None + + if int(dist) <= 0 or dist == slurm.SLURM_DIST_UNKNOWN: + return None + + if (dist & slurm.SLURM_DIST_STATE_BASE) != slurm.SLURM_DIST_UNKNOWN: + tdist = _parse_task_dist_from_int(dist) + + dist_flag = dist & slurm.SLURM_DIST_STATE_FLAGS + tdist = _parse_task_dist_flags_from_int(tdist, dist_flag) + + if tdist: + tdist.state = dist + + return tdist + + def _to_str_no_flags(self): + if self.plane: + return "plane" + + dist_str = "" + nodes = self.nodes + if nodes is not None and nodes != "*": + dist_str = f"{nodes}" + else: + dist_str = "block" + + sockets = self.sockets + if sockets is not None and sockets != "*": + dist_str = f"{dist_str}:{sockets}" + else: + dist_str = f"{dist_str}:cyclic" + + cores = self.cores + if cores is not None and cores != "*": + dist_str = f"{dist_str}:{cores}" + else: + dist_str = f"{dist_str}:{sockets}" + + return dist_str + + def to_str(self): + dist_str = self._to_str_no_flags() + + if self.pack is not None: + dist_str = f"{dist_str},{'Pack' if self.pack else 'NoPack'}" + + return dist_str + + def to_dict(self): + return { + "nodes": self.nodes, + "sockets": self.sockets, + "cores": self.cores, + "plane": self.plane, + "pack": self.pack, + } + + def as_int(self): + return self.state + + def _get_task_dist_state(self): + cdef task_dist_states_t dist_state + + dist_str = self._to_str_no_flags() + if dist_str == "plane": + 
return slurm.SLURM_DIST_PLANE + + dist_state = _parse_str_to_task_dist_int(dist_str) + if dist_state == slurm.SLURM_DIST_UNKNOWN: + raise ValueError(f"Invalid distribution specification: {dist_str}") + + # Check for Pack/NoPack + # Don't do anything if it is None + if self.pack: + dist_state = (dist_state | slurm.SLURM_DIST_PACK_NODES) + elif self.pack is not None and not self.pack: + dist_state = (dist_state | slurm.SLURM_DIST_NO_PACK_NODES) + + return dist_state + + @staticmethod + def from_str(dist_str): + cdef TaskDistribution tdist = TaskDistribution.__new__(TaskDistribution) + + # Plane method - return early because nothing else can be + # specified when this is set. + if "plane" in dist_str: + if "plane=" in dist_str: + plane_size = u16(dist_str.split("=", 1)[1]) + return TaskDistribution(plane_size=plane_size) + else: + return TaskDistribution(plane_size=True) + + # [0] = distribution method for nodes:sockets:cores + # [1] = pack/nopack specification (true or false) + dist_items = dist_str.split(",", 1) + + # Parse the different methods + dist_methods = dist_items[0].split(":") + if len(dist_methods) and dist_methods[0] != "*": + tdist.nodes = dist_methods[0] + + if len(dist_methods) > 2 and dist_methods[1] != "*": + tdist.sockets = dist_methods[1] + + if len(dist_methods) >= 3: + if dist_methods[2] == "*": + tdist.cores = tdist.sockets + else: + tdist.cores = dist_methods[2] + + if len(dist_items) > 1: + if dist_items[1].casefold() == "pack": + tdist.pack = True + elif dist_items[1].casefold() == "nopack": + tdist.pack = False + + tdist.state = tdist._get_task_dist_state() + return tdist + + +# https://github.com/SchedMD/slurm/blob/510ba4f17dfa559b579aa054cb8a415dcc224abc/src/common/proc_args.c#L319 +def _parse_task_dist_from_int(dist): + cdef TaskDistribution out = TaskDistribution.__new__(TaskDistribution) + + state = dist & slurm.SLURM_DIST_STATE_BASE + if state == slurm.SLURM_DIST_BLOCK: + out.nodes = "block" + elif state == slurm.SLURM_DIST_CYCLIC: + 
out.nodes = "cyclic" + elif state == slurm.SLURM_DIST_PLANE: + out.plane = state + elif state == slurm.SLURM_DIST_ARBITRARY: + out.nodes = "arbitrary" + elif state == slurm.SLURM_DIST_CYCLIC_CYCLIC: + out.nodes = "cyclic" + out.sockets = "cyclic" + elif state == slurm.SLURM_DIST_CYCLIC_BLOCK: + out.nodes = "cyclic" + out.sockets = "block" + elif state == slurm.SLURM_DIST_CYCLIC_CFULL: + out.nodes = "cyclic" + out.sockets = "fcyclic" + elif state == slurm.SLURM_DIST_BLOCK_CYCLIC: + out.nodes = "block" + out.sockets = "cyclic" + elif state == slurm.SLURM_DIST_BLOCK_BLOCK: + out.nodes = "block" + out.sockets = "block" + elif state == slurm.SLURM_DIST_BLOCK_CFULL: + out.nodes = "block" + out.sockets = "fcyclic" + elif state == slurm.SLURM_DIST_CYCLIC_CYCLIC_CYCLIC: + out.nodes = "cyclic" + out.sockets = "cyclic" + out.cores = "cyclic" + elif state == slurm.SLURM_DIST_CYCLIC_CYCLIC_BLOCK: + out.nodes = "cyclic" + out.sockets = "cyclic" + out.cores = "block" + elif state == slurm.SLURM_DIST_CYCLIC_CYCLIC_CFULL: + out.nodes = "cyclic" + out.sockets = "cyclic" + out.cores = "fcyclic" + elif state == slurm.SLURM_DIST_CYCLIC_BLOCK_CYCLIC: + out.nodes = "cyclic" + out.sockets = "block" + out.cores = "cyclic" + elif state == slurm.SLURM_DIST_CYCLIC_BLOCK_CYCLIC: + out.nodes = "cyclic" + out.sockets = "block" + out.cores = "cyclic" + elif state == slurm.SLURM_DIST_CYCLIC_BLOCK_BLOCK: + out.nodes = "cyclic" + out.sockets = "block" + out.cores = "block" + elif state == slurm.SLURM_DIST_CYCLIC_BLOCK_CFULL: + out.nodes = "cyclic" + out.sockets = "block" + out.cores = "fcyclic" + elif state == slurm.SLURM_DIST_CYCLIC_CFULL_CYCLIC: + out.nodes = "cyclic" + out.sockets = "fcyclic" + out.cores = "cyclic" + elif state == slurm.SLURM_DIST_CYCLIC_CFULL_BLOCK: + out.nodes = "cyclic" + out.sockets = "fcyclic" + out.cores = "block" + elif state == slurm.SLURM_DIST_CYCLIC_CFULL_CFULL: + out.nodes = "cyclic" + out.sockets = "fcyclic" + out.cores = "fcyclic" + elif state == 
def _parse_task_dist_flags_from_int(TaskDistribution dst, dist_flag):
    """Apply a task distribution pack flag to a TaskDistribution object.

    Args:
        dst (TaskDistribution):
            Object to update. When it is falsy (e.g. None) and a flag is
            given, a fresh TaskDistribution is created instead.
        dist_flag (int):
            Either slurm.SLURM_DIST_PACK_NODES or
            slurm.SLURM_DIST_NO_PACK_NODES. Any other non-zero value leaves
            the "pack" attribute untouched.

    Returns:
        (TaskDistribution): dst unchanged if dist_flag is falsy, otherwise
            the (possibly newly created) object with "pack" applied.
    """
    cdef TaskDistribution target

    # Nothing to apply - hand back whatever we were given.
    if not dist_flag:
        return dst

    target = dst
    if not target:
        target = TaskDistribution.__new__(TaskDistribution)

    if dist_flag == slurm.SLURM_DIST_PACK_NODES:
        target.pack = True
    elif dist_flag == slurm.SLURM_DIST_NO_PACK_NODES:
        target.pack = False

    return target


def _parse_str_to_task_dist_int(dist_str):
    """Translate a distribution string into its SLURM_DIST_* constant.

    Args:
        dist_str (str):
            Distribution in "nodes[:sockets[:cores]]" notation, e.g.
            "block:cyclic:fcyclic". "hostfile" is accepted as a synonym
            for "arbitrary".

    Returns:
        (int): The matching slurm.SLURM_DIST_* value, or
            slurm.SLURM_DIST_UNKNOWN if the string is not recognized.
    """
    # Ordered (spelling, constant) pairs; compared with "==" one by one so
    # the lookup behaves exactly like a chain of equality tests.
    known_distributions = (
        ("cyclic", slurm.SLURM_DIST_CYCLIC),
        ("block", slurm.SLURM_DIST_BLOCK),
        ("arbitrary", slurm.SLURM_DIST_ARBITRARY),
        ("hostfile", slurm.SLURM_DIST_ARBITRARY),
        ("cyclic:cyclic", slurm.SLURM_DIST_CYCLIC_CYCLIC),
        ("cyclic:block", slurm.SLURM_DIST_CYCLIC_BLOCK),
        ("block:block", slurm.SLURM_DIST_BLOCK_BLOCK),
        ("block:cyclic", slurm.SLURM_DIST_BLOCK_CYCLIC),
        ("block:fcyclic", slurm.SLURM_DIST_BLOCK_CFULL),
        ("cyclic:fcyclic", slurm.SLURM_DIST_CYCLIC_CFULL),
        ("cyclic:cyclic:cyclic", slurm.SLURM_DIST_CYCLIC_CYCLIC_CYCLIC),
        ("cyclic:cyclic:block", slurm.SLURM_DIST_CYCLIC_CYCLIC_BLOCK),
        ("cyclic:cyclic:fcyclic", slurm.SLURM_DIST_CYCLIC_CYCLIC_CFULL),
        ("cyclic:block:cyclic", slurm.SLURM_DIST_CYCLIC_BLOCK_CYCLIC),
        ("cyclic:block:block", slurm.SLURM_DIST_CYCLIC_BLOCK_BLOCK),
        ("cyclic:block:fcyclic", slurm.SLURM_DIST_CYCLIC_BLOCK_CFULL),
        ("cyclic:fcyclic:cyclic", slurm.SLURM_DIST_CYCLIC_CFULL_CYCLIC),
        ("cyclic:fcyclic:block", slurm.SLURM_DIST_CYCLIC_CFULL_BLOCK),
        ("cyclic:fcyclic:fcyclic", slurm.SLURM_DIST_CYCLIC_CFULL_CFULL),
        ("block:cyclic:cyclic", slurm.SLURM_DIST_BLOCK_CYCLIC_CYCLIC),
        ("block:cyclic:block", slurm.SLURM_DIST_BLOCK_CYCLIC_BLOCK),
        ("block:cyclic:fcyclic", slurm.SLURM_DIST_BLOCK_CYCLIC_CFULL),
        ("block:block:cyclic", slurm.SLURM_DIST_BLOCK_BLOCK_CYCLIC),
        ("block:block:block", slurm.SLURM_DIST_BLOCK_BLOCK_BLOCK),
        ("block:block:fcyclic", slurm.SLURM_DIST_BLOCK_BLOCK_CFULL),
        ("block:fcyclic:cyclic", slurm.SLURM_DIST_BLOCK_CFULL_CYCLIC),
        ("block:fcyclic:block", slurm.SLURM_DIST_BLOCK_CFULL_BLOCK),
        ("block:fcyclic:fcyclic", slurm.SLURM_DIST_BLOCK_CFULL_CFULL),
    )

    for spelling, dist_value in known_distributions:
        if dist_str == spelling:
            return dist_value

    return slurm.SLURM_DIST_UNKNOWN
def mail_type_list_to_int(mail_types):
    """Convert a str or list of mail types to a uint16_t.

    Args:
        mail_types (Union[str, list]):
            Either a comma-separated string (e.g. "begin,end") or an
            iterable of mail type names. Names are matched
            case-insensitively.

    Returns:
        (int): Bitwise OR of the matching slurm.MAIL_* flags, or
            slurm.NO_VAL16 if mail_types is empty or the string "None".

    Raises:
        ValueError: If one of the names is not a known mail type.
    """
    cdef uint16_t flags = 0

    if not mail_types or "None" == mail_types:
        return slurm.NO_VAL16

    types = mail_types
    if isinstance(types, str):
        types = types.split(",")

    # NOTE: "invalid_depend" (slurm.MAIL_INVALID_DEPEND) is not mapped; it
    # was commented out in the original implementation as well.
    known_types = {
        "array_tasks": slurm.MAIL_ARRAY_TASKS,
        "begin": slurm.MAIL_JOB_BEGIN,
        "end": slurm.MAIL_JOB_END,
        "fail": slurm.MAIL_JOB_FAIL,
        "requeue": slurm.MAIL_JOB_REQUEUE,
        "stage_out": slurm.MAIL_JOB_STAGE_OUT,
        "time_limit": slurm.MAIL_JOB_TIME100,
        # Spelling produced by mail_type_int_to_list; accepted so that the
        # int -> list -> int round trip works.
        "time_limit_100": slurm.MAIL_JOB_TIME100,
        "time_limit_90": slurm.MAIL_JOB_TIME90,
        "time_limit_80": slurm.MAIL_JOB_TIME80,
        "time_limit_50": slurm.MAIL_JOB_TIME50,
        "all": (slurm.MAIL_JOB_BEGIN
                | slurm.MAIL_JOB_END
                | slurm.MAIL_JOB_FAIL
                | slurm.MAIL_JOB_REQUEUE
                | slurm.MAIL_JOB_STAGE_OUT),
    }

    # Iterate over the split list ("types"), not the raw argument: looping
    # over a plain string would walk it character by character.
    for typ in types:
        typ = typ.casefold()
        if typ not in known_types:
            raise ValueError(f"Invalid Mail type: {typ}.")
        flags |= known_types[typ]

    return flags
def acctg_profile_list_to_int(acctg_profiles):
    """Convert a str or list of accounting gather profiles to uint32_t.

    Args:
        acctg_profiles (Union[str, list]):
            Profile names to enable. Names are looked up with the "in"
            operator, so for a plain string this is a substring test.

    Returns:
        (int): slurm.NO_VAL if nothing was given,
            slurm.ACCT_GATHER_PROFILE_NONE / _ALL if "none" / "all" is
            contained (checked in that order), otherwise the bitwise OR
            of the energy, task, lustre and network profile flags found.
    """
    cdef uint32_t profile = 0

    if not acctg_profiles:
        return slurm.NO_VAL

    # "none" and "all" short-circuit everything else.
    if "none" in acctg_profiles:
        return slurm.ACCT_GATHER_PROFILE_NONE
    elif "all" in acctg_profiles:
        return slurm.ACCT_GATHER_PROFILE_ALL

    if "energy" in acctg_profiles:
        profile |= slurm.ACCT_GATHER_PROFILE_ENERGY

    if "task" in acctg_profiles:
        profile |= slurm.ACCT_GATHER_PROFILE_TASK

    if "lustre" in acctg_profiles:
        profile |= slurm.ACCT_GATHER_PROFILE_LUSTRE

    if "network" in acctg_profiles:
        profile |= slurm.ACCT_GATHER_PROFILE_NETWORK

    return profile


def acctg_profile_int_to_list(flags):
    """Convert uint32_t accounting gather profiles to a list of strings.

    Args:
        flags (int):
            Bitmask of slurm.ACCT_GATHER_PROFILE_* values.

    Returns:
        (list): ["all"] if flags equals ACCT_GATHER_PROFILE_ALL, an empty
            list for 0, slurm.NO_VAL or ACCT_GATHER_PROFILE_NONE, otherwise
            the names of the individual profiles that are set.
    """
    profiles = []

    if flags == 0 or flags == slurm.NO_VAL:
        return []

    if flags == slurm.ACCT_GATHER_PROFILE_ALL:
        return ["all"]
    elif flags == slurm.ACCT_GATHER_PROFILE_NONE:
        return []

    if flags & slurm.ACCT_GATHER_PROFILE_ENERGY:
        profiles.append("energy")

    if flags & slurm.ACCT_GATHER_PROFILE_TASK:
        profiles.append("task")

    if flags & slurm.ACCT_GATHER_PROFILE_LUSTRE:
        profiles.append("lustre")

    if flags & slurm.ACCT_GATHER_PROFILE_NETWORK:
        profiles.append("network")

    return profiles


def power_type_list_to_int(power_types):
    """Convert a str or list of str with power types to uint8_t.

    Args:
        power_types (Union[str, list]):
            Power type names. Only "level" is recognized.

    Returns:
        (int): slurm.NO_VAL8 if nothing was given, otherwise the bitmask
            of the recognized power flags (0 if none matched).
    """
    cdef uint8_t flags = 0

    if not power_types:
        return slurm.NO_VAL8

    if "level" in power_types:
        flags |= slurm.SLURM_POWER_FLAGS_LEVEL

    # The computed mask was previously dropped (implicit "return None").
    return flags


def power_type_int_to_list(flags):
    """Convert uint8_t power type flags to a list of strings.

    Args:
        flags (int):
            Bitmask of slurm.SLURM_POWER_FLAGS_* values.

    Returns:
        (list): Names of the power flags that are set in flags.
    """
    types = []

    if flags & slurm.SLURM_POWER_FLAGS_LEVEL:
        types.append("level")

    return types
def cpu_gov_str_to_int(gov):
    """Convert a cpu governor str to its numerical representation.

    Args:
        gov (str):
            Governor name, matched case-insensitively. One of
            "conservative", "ondemand", "performance", "powersave",
            "userspace" or "schedutil".

    Returns:
        (int): The matching slurm.CPU_FREQ_* governor value OR'ed with
            slurm.CPU_FREQ_RANGE_FLAG, or u32(None) if gov is empty.

    Raises:
        ValueError: If the governor name is not recognized.
    """
    if not gov:
        return u32(None)

    gov = gov.casefold()
    governors = {
        "conservative": slurm.CPU_FREQ_CONSERVATIVE,
        "ondemand": slurm.CPU_FREQ_ONDEMAND,
        "performance": slurm.CPU_FREQ_PERFORMANCE,
        "powersave": slurm.CPU_FREQ_POWERSAVE,
        "userspace": slurm.CPU_FREQ_USERSPACE,
        "schedutil": slurm.CPU_FREQ_SCHEDUTIL,
    }

    if gov not in governors:
        raise ValueError(f"Invalid cpu gov type: {gov}")

    return governors[gov] | slurm.CPU_FREQ_RANGE_FLAG


def cpu_freq_str_to_int(freq):
    """Convert a cpu-frequency str to its numerical representation.

    Args:
        freq (Union[str, int]):
            Either one of the symbolic names "low", "medium", "highm1" or
            "high" (case-insensitive), or a numeric frequency given as an
            int or a string of digits.

    Returns:
        (int): The matching slurm.CPU_FREQ_* value or the numeric
            frequency, or u32(None) if freq is empty.

    Raises:
        ValueError: If freq is an unknown name, or a number that maps to
            slurm.NO_VAL.
    """
    if not freq:
        return u32(None)

    if isinstance(freq, str) and not freq.isdigit():
        freq = freq.casefold()
        symbolic = {
            "low": slurm.CPU_FREQ_LOW,
            "highm1": slurm.CPU_FREQ_HIGHM1,
            "high": slurm.CPU_FREQ_HIGH,
            "medium": slurm.CPU_FREQ_MEDIUM,
        }
        if freq in symbolic:
            return symbolic[freq]
    else:
        fr = u32(int(freq))
        if fr != slurm.NO_VAL:
            return fr

    raise ValueError(f"Invalid cpu freq value: {freq}.")


def cpu_freq_int_to_str(freq):
    """Convert a numerical cpufreq value to its string representation.

    Args:
        freq (int):
            A slurm.CPU_FREQ_* constant or a plain frequency value.

    Returns:
        (Union[str, int, None]): The name for a known symbolic frequency or
            governor constant; None if the value carries
            slurm.CPU_FREQ_RANGE_FLAG, equals slurm.NO_VAL or is 0;
            otherwise freq itself.
    """
    # Exact matches must be tested before the range-flag bit test below.
    if freq == slurm.CPU_FREQ_LOW:
        return "Low"
    elif freq == slurm.CPU_FREQ_MEDIUM:
        return "Medium"
    elif freq == slurm.CPU_FREQ_HIGHM1:
        return "Highm1"
    elif freq == slurm.CPU_FREQ_HIGH:
        return "High"
    elif freq == slurm.CPU_FREQ_CONSERVATIVE:
        return "Conservative"
    elif freq == slurm.CPU_FREQ_PERFORMANCE:
        return "Performance"
    elif freq == slurm.CPU_FREQ_POWERSAVE:
        return "PowerSave"
    elif freq == slurm.CPU_FREQ_USERSPACE:
        return "UserSpace"
    elif freq == slurm.CPU_FREQ_ONDEMAND:
        return "OnDemand"
    elif freq == slurm.CPU_FREQ_SCHEDUTIL:
        return "SchedUtil"
    elif freq & slurm.CPU_FREQ_RANGE_FLAG:
        return None
    elif freq == slurm.NO_VAL or freq == 0:
        return None
    else:
        # This is in kHz
        return freq


def dependency_str_to_dict(dep):
    """Parse a job dependency string into a dictionary.

    Args:
        dep (str):
            Dependency specification such as "afterok:1:2,singleton".
            Items are separated by "," (all must be satisfied) or, if a "?"
            is present anywhere in the string, by "?" (any may be
            satisfied).

    Returns:
        (dict): None if dep is empty. Otherwise a dict with one list of
            job ids per dependency type ("after", "afterany",
            "afterburstbuffer", "aftercorr", "afternotok", "afterok"), a
            bool "singleton" and "satisfy" set to "all" or "any". Job ids
            consisting only of digits are converted to int, everything
            else is kept as str. Unknown or malformed items are ignored.
    """
    if not dep:
        return None

    out = {
        "after": [],
        "afterany": [],
        "afterburstbuffer": [],
        "aftercorr": [],
        "afternotok": [],
        "afterok": [],
        "singleton": False,
        "satisfy": "all",
    }

    delim = ","
    if "?" in dep:
        delim = "?"
        out["satisfy"] = "any"

    for item in dep.split(delim):
        if item == "singleton":
            out["singleton"] = True
            continue

        dep_name, sep, job_spec = item.partition(":")
        if not sep:
            continue

        # Only list-valued keys take job ids. Without this check an item
        # like "satisfy:1" or "singleton:2" would call .append() on a
        # str/bool and raise AttributeError.
        jobs = out.get(dep_name)
        if not isinstance(jobs, list):
            continue

        for job in job_spec.split(":"):
            jobs.append(int(job) if job.isdigit() else job)

    return out
cdef class Nodes(dict):
    """A collection of Node objects.

    Args:
        nodes (Union[list, dict, str], optional):
            Nodes to initialize this collection with.

    Attributes:
        free_memory (int):
            Amount of free memory in this node collection. (in Mebibytes)
        real_memory (int):
            Amount of real memory in this node collection. (in Mebibytes)
        allocated_memory (int):
            Amount of allocated memory in this node collection.
            (in Mebibytes)
        total_cpus (int):
            Total amount of CPUs in this node collection.
        idle_cpus (int):
            Total amount of idle CPUs in this node collection.
        allocated_cpus (int):
            Total amount of allocated CPUs in this node collection.
        effective_cpus (int):
            Total amount of effective CPUs in this node collection.
        current_watts (int):
            Total amount of Watts consumed in this node collection.
        avg_watts (int):
            Amount of average watts consumed in this node collection.

    Raises:
        MemoryError: If malloc fails to allocate memory.
    """
    cdef:
        # Node records filled in by slurm_load_node(); released in
        # __dealloc__ via slurm_free_node_info_msg().
        node_info_msg_t *info
        # Partition records filled in by slurm_load_partitions(); released
        # in __dealloc__ via slurm_free_partition_info_msg().
        partition_info_msg_t *part_info
        # Zeroed dummy record that Nodes.load() writes over each array
        # entry once its data has been copied into a Node instance.
        node_info_t tmp_info


cdef class Node:
    """A Slurm node.

    Args:
        name (str):
            Name of a node
        **kwargs:
            Any writable property. Writable attributes include:
                * name
                * configured_gres
                * address
                * hostname
                * extra
                * comment
                * weight
                * available_features
                * active_features
                * cpu_binding
                * state

    Attributes:
        name (str):
            Name of the node.
        architecture (str):
            Architecture of the node (e.g. x86_64)
        configured_gres (dict):
            Generic Resources this Node is configured with.
        owner (str):
            User that owns the Node.
        address (str):
            Address of the node.
        hostname (str):
            Hostname of the node.
        extra (str):
            Arbitrary string attached to the Node.
        reason (str):
            Reason why this node is in its current state.
        reason_user (str):
            Name of the User who set the reason.
        comment (str):
            Arbitrary node comment.
        bcast_address (str):
            Address of the node for sbcast.
        slurm_version (str):
            Version of slurm this node is running on.
        operating_system (str):
            Name of the operating system installed.
        allocated_gres (dict):
            Generic Resources currently in use on the node.
        mcs_label (str):
            MCS label for the node.
        allocated_memory (int):
            Memory in Mebibytes allocated on the node.
        real_memory (int):
            Real Memory in Mebibytes configured for this node.
        free_memory (int):
            Free Memory in Mebibytes on the node.
        memory_reserved_for_system (int):
            Raw Memory in Mebibytes reserved for the System not usable by
            Jobs.
        temporary_disk_space (int):
            Amount of temporary disk space this node has, in Mebibytes.
        weight (int):
            Weight of the node in scheduling.
        effective_cpus (int):
            Number of effective CPUs the node has.
        total_cpus (int):
            Total amount of CPUs the node has.
        sockets (int):
            Number of sockets the node has.
        cores_reserved_for_system (int):
            Number of cores reserved for the System not usable by Jobs.
        boards (int):
            Number of boards the node has.
        cores_per_socket (int):
            Number of cores per socket configured for the node.
        threads_per_core (int):
            Number of threads per core configured for the node.
        available_features (list):
            List of features available on the node.
        active_features (list):
            List of features on the node.
        partitions (list):
            List of partitions this Node is part of.
        boot_time (int):
            Time the node has booted, as unix timestamp.
        slurmd_start_time (int):
            Time the slurmd has started on the Node, as unix timestamp.
        last_busy_time (int):
            Time this node was last busy, as unix timestamp.
        reason_time (int):
            Time the reason was set for the node, as unix timestamp.
        allocated_cpus (int):
            Number of allocated CPUs on the node.
        idle_cpus (int):
            Number of idle CPUs.
        cpu_binding (str):
            Default CPU-Binding on the node.
        cap_watts (int):
            Node cap watts.
        current_watts (int):
            Current amount of watts consumed on the node.
        avg_watts (int):
            Average amount of watts consumed on the node.
        external_sensors (dict):
            External Sensor info for the Node.
            The dict returned contains the following information:
                * joules_total (int)
                * current_watts (int)
                * temperature (int)
        state (str):
            State the node is currently in.
        next_state (str):
            Next state the node will be in.
        cpu_load (float):
            CPU Load on the Node.
        slurmd_port (int):
            Port the slurmd is listening on the node.

    Raises:
        MemoryError: If malloc fails to allocate memory.
    """
    cdef:
        # Node record backing all read properties; owned by this instance
        # and released in __dealloc__.
        node_info_t *info
        # Update message filled by the property setters and sent by the
        # create/modify/delete RPC wrappers.
        update_node_msg_t *umsg
        # Optional preloaded passwd data, passed as "lookup" to
        # uid_to_name().
        dict passwd
        # Optional preloaded groups data, set alongside passwd.
        # NOTE(review): no reader of "groups" is visible in this part of
        # the file - confirm where it is consumed.
        dict groups

    # Exchange the "info" pointers of two Node instances.
    @staticmethod
    cdef _swap_data(Node dst, Node src)

    # Build a Node by copying the node_info_t that in_ptr points to.
    @staticmethod
    cdef Node from_ptr(node_info_t *in_ptr)
cdef class Nodes(dict):

    def __dealloc__(self):
        # Both pointers start out as NULL (see __cinit__) and are only
        # filled in by Nodes.load().
        slurm_free_node_info_msg(self.info)
        slurm_free_partition_info_msg(self.part_info)

    def __cinit__(self):
        self.info = NULL
        self.part_info = NULL

    def __init__(self, nodes=None):
        """Initialize the collection.

        Args:
            nodes (Union[list, dict, str], optional):
                A dict is merged in as-is, a str is expanded with
                nodelist_from_range_str() and each resulting name becomes
                a new Node, and any other iterable may contain node names
                (str) and/or Node objects.
        """
        if isinstance(nodes, dict):
            self.update(nodes)
        elif isinstance(nodes, str):
            nodelist = nodelist_from_range_str(nodes)
            self.update({node: Node(node) for node in nodelist})
        elif nodes is not None:
            for node in nodes:
                if isinstance(node, str):
                    self[node] = Node(node)
                else:
                    self[node.name] = node

    @staticmethod
    def load(preload_passwd_info=False):
        """Load all nodes in the system.

        Args:
            preload_passwd_info (bool):
                Decides whether to query passwd and groups information from
                the system.
                Could potentially speed up access to attributes of the Node
                where a UID/GID is translated to a name.
                If True, the information will be fetched and stored in each
                of the Node instances. The default is False.

        Returns:
            (Nodes): Collection of node objects.

        Raises:
            RPCError: When getting all the Nodes from the slurmctld failed.
            MemoryError: If malloc fails to allocate memory.
        """
        cdef:
            dict passwd = {}
            dict groups = {}
            Nodes nodes = Nodes.__new__(Nodes)
            int flags = slurm.SHOW_ALL
            Node node

        verify_rpc(slurm_load_node(0, &nodes.info, flags))
        verify_rpc(slurm_load_partitions(0, &nodes.part_info, flags))
        slurm_populate_node_partitions(nodes.info, nodes.part_info)

        # If requested, preload the passwd and groups database to potentially
        # speedup lookups for an attribute in a node, e.g "owner".
        if preload_passwd_info:
            passwd = _getpwall_to_dict()
            groups = _getgrall_to_dict()

        # zero-out a dummy node_info_t
        memset(&nodes.tmp_info, 0, sizeof(node_info_t))

        # Put each node pointer into its own "Node" instance.
        for cnt in range(nodes.info.record_count):
            node = Node.from_ptr(&nodes.info.node_array[cnt])

            # Prevent double free if xmalloc fails mid-loop and a MemoryError
            # is raised by replacing it with a zeroed-out node_info_t.
            nodes.info.node_array[cnt] = nodes.tmp_info

            if preload_passwd_info:
                node.passwd = passwd
                node.groups = groups

            nodes[node.name] = node

        # At this point we memcpy'd all the memory for the Nodes. Setting this
        # to 0 will prevent the slurm node free function to deallocate the
        # memory for the individual nodes. This should be fine, because they
        # are free'd automatically in __dealloc__ since the lifetime of each
        # node-pointer is tied to the lifetime of its corresponding "Node"
        # instance.
        nodes.info.record_count = 0

        return nodes

    def reload(self):
        """Reload the information for nodes in a collection.

        Note:
            Only information for nodes which are already in the collection at
            the time of calling this method will be reloaded.

        Returns:
            (Nodes): This collection itself. An empty collection is returned
                unchanged without contacting the slurmctld.

        Raises:
            RPCError: When getting the Nodes from the slurmctld failed.
        """
        cdef Nodes reloaded_nodes

        # Nothing to refresh. Return self (not None) so the return value is
        # the same type in every case; an empty collection is still falsy.
        if not self:
            return self

        reloaded_nodes = Nodes.load()

        # Iterate over a snapshot of the keys, since entries are replaced
        # while looping.
        for name in list(self.keys()):
            if name in reloaded_nodes:
                # Put the new data in.
                self[name] = reloaded_nodes[name]

        return self

    def as_list(self):
        """Format the information as list of Node objects.

        Returns:
            (list): List of Node objects
        """
        return list(self.values())

    # The properties below aggregate the Node property of the same name
    # over the collection via _sum_prop().

    @property
    def free_memory(self):
        return _sum_prop(self, Node.free_memory)

    @property
    def real_memory(self):
        return _sum_prop(self, Node.real_memory)

    @property
    def allocated_memory(self):
        return _sum_prop(self, Node.allocated_memory)

    @property
    def total_cpus(self):
        return _sum_prop(self, Node.total_cpus)

    @property
    def idle_cpus(self):
        return _sum_prop(self, Node.idle_cpus)

    @property
    def allocated_cpus(self):
        return _sum_prop(self, Node.allocated_cpus)

    @property
    def effective_cpus(self):
        return _sum_prop(self, Node.effective_cpus)

    @property
    def current_watts(self):
        return _sum_prop(self, Node.current_watts)

    @property
    def avg_watts(self):
        return _sum_prop(self, Node.avg_watts)
not yet allocated. + # We only allocate memory for it by the time the user actually wants + # to modify something. + self._alloc_umsg() + # Call descriptors __set__ directly + Node.__dict__[name].__set__(self, val) + + def __eq__(self, other): + return isinstance(other, Node) and self.name == other.name + + @staticmethod + cdef Node from_ptr(node_info_t *in_ptr): + cdef Node wrap = Node.__new__(Node) + wrap._alloc_info() + wrap.passwd = {} + wrap.groups = {} + memcpy(wrap.info, in_ptr, sizeof(node_info_t)) + return wrap + + cdef _swap_data(Node dst, Node src): + cdef node_info_t *tmp = NULL + if dst.info and src.info: + tmp = dst.info + dst.info = src.info + src.info = tmp + + @staticmethod + def load(name): + """Load information for a specific node. + + Implements the slurm_load_node_single RPC. + + Returns: + (pyslurm.Node): Returns a new Node instance. + + Raises: + RPCError: If requesting the Node information from the slurmctld + was not successful. + MemoryError: If malloc failed to allocate memory. + + Examples: + >>> import pyslurm + >>> node = pyslurm.Node.load("localhost") + """ + cdef: + node_info_msg_t *node_info = NULL + partition_info_msg_t *part_info = NULL + Node wrap = Node.__new__(Node) + + try: + verify_rpc(slurm_load_node_single(&node_info, + name, slurm.SHOW_ALL)) + verify_rpc(slurm_load_partitions(0, &part_info, slurm.SHOW_ALL)) + slurm_populate_node_partitions(node_info, part_info) + + if node_info and node_info.record_count: + # Copy info + wrap._alloc_impl() + memcpy(wrap.info, &node_info.node_array[0], sizeof(node_info_t)) + node_info.record_count = 0 + else: + raise RPCError(msg=f"Node '{name}' does not exist") + except Exception as e: + raise e + finally: + slurm_free_node_info_msg(node_info) + slurm_free_partition_info_msg(part_info) + + return wrap + + def create(self, state="future"): + """Create a node. + + Implements the slurm_create_node RPC. + + Args: + future (str, optional): + An optional state the created Node should have. 
Allowed values + are "future" and "cloud". "future" is the default. + + Returns: + (Node): This function returns the current Node-instance object + itself. + + Raises: + RPCError: If creating the Node was not successful. + MemoryError: If malloc failed to allocate memory. + + Examples: + >>> from pyslurm import Node + >>> node = Node("testnode").create() + """ + if not self.name: + raise ValueError("You need to set a node name first.") + + self._alloc_umsg() + cstr.fmalloc(&self.umsg.extra, + f"NodeName={self.name} State={state}") + verify_rpc(slurm_create_node(self.umsg)) + + return self + + def modify(self, node=None, **kwargs): + """Modify a node. + + Implements the slurm_update_node RPC. + + Args: + node (pyslurm.Node): + Another Node object which contains all the changes that + should be applied to this instance. + **kwargs: + You can also specify all the changes as keyword arguments. + Allowed values are only attributes which can actually be set + on a Node instance. If a node is explicitly specified as + parameter, all **kwargs will be ignored. + + Raises: + RPCError: When updating the Node was not successful. + + Examples: + >>> from pyslurm import Node + >>> + >>> # Setting a new weight for the Node + >>> changes = Node(weight=100) + >>> Node("localhost").modify(changes) + >>> + >>> # Or by specifying the changes directly to the modify function + >>> Node("localhost").modify(weight=100) + """ + cdef Node n = self + + # Allow the user to both specify changes via a Node instance or + # **kwargs. + if node and isinstance(node, Node): + n = node + elif kwargs: + n = Node(**kwargs) + + n._alloc_umsg() + cstr.fmalloc(&n.umsg.node_names, self.name) + verify_rpc(slurm_update_node(n.umsg)) + + def delete(self): + """Delete a node. + + Implements the slurm_delete_node RPC. + + Raises: + RPCError: If deleting the Node was not successful. + MemoryError: If malloc failed to allocate memory. 
    def as_dict(self):
        """Node information formatted as a dictionary.

        Returns:
            (dict): Node information as dict
        """
        return instance_to_dict(self)

    # Setters below write the value to both the node record ("info") and
    # the pending update message ("umsg").

    @property
    def name(self):
        """str: Name of the node."""
        return cstr.to_unicode(self.info.name)

    @name.setter
    def name(self, val):
        cstr.fmalloc2(&self.info.name, &self.umsg.node_names, val)

    @property
    def architecture(self):
        """str: Architecture of the node (e.g. x86_64)."""
        return cstr.to_unicode(self.info.arch)

    @property
    def configured_gres(self):
        """dict: Generic Resources this Node is configured with."""
        return cstr.to_gres_dict(self.info.gres)

    @configured_gres.setter
    def configured_gres(self, val):
        cstr.fmalloc2(&self.info.gres, &self.umsg.gres,
                      cstr.from_gres_dict(val))

    @property
    def owner(self):
        """str: User that owns the Node."""
        return uid_to_name(self.info.owner, lookup=self.passwd)

    @property
    def address(self):
        """str: Address of the node."""
        return cstr.to_unicode(self.info.node_addr)

    @address.setter
    def address(self, val):
        cstr.fmalloc2(&self.info.node_addr, &self.umsg.node_addr, val)

    @property
    def hostname(self):
        """str: Hostname of the node."""
        return cstr.to_unicode(self.info.node_hostname)

    @hostname.setter
    def hostname(self, val):
        cstr.fmalloc2(&self.info.node_hostname, &self.umsg.node_hostname, val)

    @property
    def extra(self):
        """str: Arbitrary string attached to the Node."""
        return cstr.to_unicode(self.info.extra)

    @extra.setter
    def extra(self, val):
        cstr.fmalloc2(&self.info.extra, &self.umsg.extra, val)

    @property
    def reason(self):
        """str: Reason why this node is in its current state."""
        return cstr.to_unicode(self.info.reason)

    @property
    def reason_user(self):
        """str: Name of the User who set the reason."""
        return uid_to_name(self.info.reason_uid, lookup=self.passwd)

    @property
    def comment(self):
        """str: Arbitrary node comment."""
        return cstr.to_unicode(self.info.comment)

    @comment.setter
    def comment(self, val):
        cstr.fmalloc2(&self.info.comment, &self.umsg.comment, val)

    @property
    def bcast_address(self):
        """str: Address of the node for sbcast."""
        return cstr.to_unicode(self.info.bcast_address)

    @property
    def slurm_version(self):
        """str: Version of slurm this node is running on."""
        return cstr.to_unicode(self.info.version)
@property + def operating_system(self): + return cstr.to_unicode(self.info.os) + + @property + def allocated_gres(self): + return cstr.to_gres_dict(self.info.gres_used) + + @property + def mcs_label(self): + return cstr.to_unicode(self.info.mcs_label) + + @property + def allocated_memory(self): + cdef uint64_t alloc_memory = 0 + if self.info.select_nodeinfo: + slurm_get_select_nodeinfo( + self.info.select_nodeinfo, + slurm.SELECT_NODEDATA_MEM_ALLOC, + slurm.NODE_STATE_ALLOCATED, + &alloc_memory) + return alloc_memory + + @property + def real_memory(self): + return u64_parse(self.info.real_memory) + + @property + def free_memory(self): + return u64_parse(self.info.free_mem) + + @property + def memory_reserved_for_system(self): + return u64_parse(self.info.mem_spec_limit) + + @property + def temporary_disk_space(self): + return u32_parse(self.info.tmp_disk) + + @property + def weight(self): + return u32_parse(self.info.weight) + + @weight.setter + def weight(self, val): + self.info.weight=self.umsg.weight = u32(val) + + @property + def effective_cpus(self): + return u16_parse(self.info.cpus_efctv) + + @property + def total_cpus(self): + return u16_parse(self.info.cpus, on_noval=0) + + @property + def sockets(self): + return u16_parse(self.info.sockets, on_noval=0) + + @property + def cores_reserved_for_system(self): + return u16_parse(self.info.core_spec_cnt) + + @property + def boards(self): + return u16_parse(self.info.boards) + + @property + def cores_per_socket(self): + return u16_parse(self.info.cores) + + @property + def threads_per_core(self): + return u16_parse(self.info.threads) + + @property + def available_features(self): + return cstr.to_list(self.info.features) + + @available_features.setter + def available_features(self, val): + cstr.from_list2(&self.info.features, &self.umsg.features, val) + + @property + def active_features(self): + return cstr.to_list(self.info.features_act) + + @active_features.setter + def active_features(self, val): + 
cstr.from_list2(&self.info.features_act, &self.umsg.features_act, val) + + @property + def partitions(self): + return cstr.to_list(self.info.partitions) + + @property + def boot_time(self): + return _raw_time(self.info.boot_time) + + @property + def slurmd_start_time(self): + return _raw_time(self.info.slurmd_start_time) + + @property + def last_busy_time(self): + return _raw_time(self.info.last_busy) + + @property + def reason_time(self): + return _raw_time(self.info.reason_time) + +# @property +# def tres_configured(self): +# """dict: TRES that are configured on the node.""" +# return cstr.to_dict(self.info.tres_fmt_str) + +# @property +# def tres_alloc(self): +# cdef char *alloc_tres = NULL +# if self.info.select_nodeinfo: +# slurm_get_select_nodeinfo( +# self.info.select_nodeinfo, +# slurm.SELECT_NODEDATA_TRES_ALLOC_FMT_STR, +# slurm.NODE_STATE_ALLOCATED, +# &alloc_tres +# ) +# return cstr.to_gres_dict(alloc_tres) + + @property + def allocated_cpus(self): + cdef uint16_t alloc_cpus = 0 + if self.info.select_nodeinfo: + slurm_get_select_nodeinfo( + self.info.select_nodeinfo, + slurm.SELECT_NODEDATA_SUBCNT, + slurm.NODE_STATE_ALLOCATED, + &alloc_cpus + ) + return alloc_cpus + + @property + def idle_cpus(self): + efctv = self.effective_cpus + if not efctv: + return None + + return efctv - self.allocated_cpus + + @property + def cpu_binding(self): + cdef char cpu_bind[128] + slurm_sprint_cpu_bind_type(cpu_bind, + self.info.cpu_bind) + if cpu_bind == "(null type)": + return None + + return cstr.to_unicode(cpu_bind) + + @cpu_binding.setter + def cpu_binding(self, val): + self.info.cpu_bind=self.umsg.cpu_bind = cpubind_to_num(val) + + @property + def cap_watts(self): + if not self.info.power: + return 0 + return u32_parse(self.info.power.cap_watts, on_noval=0) + + @property + def current_watts(self): + if not self.info.energy: + return 0 + return u32_parse(self.info.energy.current_watts, on_noval=0) + + @property + def avg_watts(self): + if not self.info.energy: + 
return 0 + return u32_parse(self.info.energy.ave_watts, on_noval=0) + + @property + def external_sensors(self): + if not self.info.ext_sensors: + return {} + + return { + "joules_total": u64_parse(self.info.ext_sensors.consumed_energy), + "current_watts": u32_parse(self.info.ext_sensors.current_watts), + "temperature": u32_parse(self.info.ext_sensors.temperature) + } + + @property + def state(self): + cdef char* state = slurm_node_state_string_complete( + self.info.node_state) + state_str = cstr.to_unicode(state) + xfree(state) + return state_str + + @property + def next_state(self): + if ((self.info.next_state != slurm.NO_VAL) + and (self.info.node_state & slurm.NODE_STATE_REBOOT_REQUESTED + or self.info.node_state & slurm.NODE_STATE_REBOOT_ISSUED)): + return cstr.to_unicode( + slurm_node_state_string(self.info.next_state)) + else: + return None + + @state.setter + def state(self, val): + self.umsg.node_state=self.info.node_state = _node_state_from_str(val) + + @property + def cpu_load(self): + load = u32_parse(self.info.cpu_load) + return load / 100.0 if load is not None else 0.0 + + @property + def slurmd_port(self): + return u16_parse(self.info.port) + + +def _node_state_from_str(state, err_on_invalid=True): + if not state: + return slurm.NO_VAL + + for i in range(slurm.NODE_STATE_END): + if state == slurm_node_state_string(i): + return i + + if err_on_invalid: + raise ValueError(f"Invalid Node state: {state}") + else: + return slurm.NO_VAL diff --git a/pyslurm/core/slurmctld.pxd b/pyslurm/core/slurmctld.pxd new file mode 100644 index 00000000..f65655c8 --- /dev/null +++ b/pyslurm/core/slurmctld.pxd @@ -0,0 +1,38 @@ +######################################################################### +# slurmctld.pxd - pyslurm slurmctld api +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the 
terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm cimport slurm +from pyslurm.slurm cimport ( + slurm_conf_t, + slurm_load_ctl_conf, + slurm_free_ctl_conf, + try_xmalloc, +) +from pyslurm.core.common cimport cstr +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t +from pyslurm.core.common.uint cimport * + + +cdef class Config: + cdef slurm_conf_t *ptr diff --git a/pyslurm/core/slurmctld.pyx b/pyslurm/core/slurmctld.pyx new file mode 100644 index 00000000..2b5367c5 --- /dev/null +++ b/pyslurm/core/slurmctld.pyx @@ -0,0 +1,48 @@ +######################################################################### +# slurmctld.pyx - pyslurm slurmctld api +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# cython: c_string_type=unicode, c_string_encoding=default +# cython: language_level=3 + +from pyslurm.core.error import verify_rpc, RPCError + + +cdef class Config: + + def __cinit__(self): + self.ptr = NULL + + def __init__(self, job_id): + raise RuntimeError("Cannot instantiate class directly") + + def __dealloc__(self): + slurm_free_ctl_conf(self.ptr) + self.ptr = NULL + + @staticmethod + def load(): + cdef Config conf = Config.__new__(Config) + verify_rpc(slurm_load_ctl_conf(0, &conf.ptr)) + return conf + + @property + def cluster(self): + return cstr.to_unicode(self.ptr.cluster_name) diff --git a/pyslurm/pyslurm.pyx b/pyslurm/pyslurm.pyx index adbed03e..89b226a2 100644 --- a/pyslurm/pyslurm.pyx +++ b/pyslurm/pyslurm.pyx @@ -373,26 +373,6 @@ def slurm_load_slurmd_status(): return Status -def slurm_init(conf_file=None): - """Initialize the Slurm API internal structures. - - This function MUST be called before any internal API calls to ensure - Slurm's internal configuration structures have been populated. - - Args: - conf_file (str, optional): Absolute path to the configuration file. If - None (default value), libslurm automatically locates its own - configuration. 
- """ - if conf_file: - slurm.slurm_init(conf_file.encode('UTF-8')) - else: - slurm.slurm_init(NULL) - -def slurm_fini(): - """Cleanup Slurm internal configuration structures.""" - slurm.slurm_fini() - # # Slurm Config Class # @@ -6758,6 +6738,3 @@ cdef class licenses: else: apiError = slurm.slurm_get_errno() raise ValueError(slurm.stringOrNone(slurm.slurm_strerror(apiError), ''), apiError) - -# Automatically load Slurm configuration data structure at pyslurm module load -slurm_init() diff --git a/pyslurm/slurm/SLURM_DISCLAIMER b/pyslurm/slurm/SLURM_DISCLAIMER new file mode 100644 index 00000000..5fb615d5 --- /dev/null +++ b/pyslurm/slurm/SLURM_DISCLAIMER @@ -0,0 +1,159 @@ +Slurm was produced at Lawrence Livermore National Laboratory in collaboration +with various organizations. + +Copyright (C) 2012-2013 Los Alamos National Security, LLC. +Copyright (C) 2011 Trinity Centre for High Performance Computing +Copyright (C) 2010-2015 SchedMD LLC +Copyright (C) 2009-2013 CEA/DAM/DIF +Copyright (C) 2009-2011 Centro Svizzero di Calcolo Scientifico (CSCS) +Copyright (C) 2008-2011 Lawrence Livermore National Security +Copyright (C) 2008 Vijay Ramasubramanian +Copyright (C) 2007-2008 Red Hat, Inc. +Copyright (C) 2007-2013 National University of Defense Technology, China +Copyright (C) 2007-2015 Bull +Copyright (C) 2005-2008 Hewlett-Packard Development Company, L.P. +Copyright (C) 2004-2009, Marcus Holland-Moritz +Copyright (C) 2002-2007 The Regents of the University of California +Copyright (C) 2002-2003 Linux NetworX +Copyright (C) 2002 University of Chicago +Copyright (C) 2001, Paul Marquess +Copyright (C) 2000 Markus Friedl +Copyright (C) 1999, Kenneth Albanowski +Copyright (C) 1998 Todd C. Miller +Copyright (C) 1996-2003 Maximum Entropy Data Consultants Ltd, +Copyright (C) 1995 Tatu Ylonen , Espoo, Finland +Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. 
+Many other organizations contributed code and/or documentation without +including a copyright notice. + +Written by: +Amjad Majid Ali (Colorado State University) +Par Andersson (National Supercomputer Centre, Sweden) +Don Albert (Bull) +Ernest Artiaga (Barcelona Supercomputer Center, Spain) +Danny Auble (LLNL, SchedMD LLC) +Susanne Balle (HP) +Anton Blanchard (Samba) +Janne Blomqvist (Aalto University, Finland) +David Bremer (LLNL) +Jon Bringhurst (LANL) +Bill Brophy (Bull) +Hongjia Cao (National University of Defense Techonogy, China) +Daniel Christians (HP) +Gilles Civario (Bull) +Chuck Clouston (Bull) +Joseph Donaghy (LLNL) +Chris Dunlap (LLNL) +Joey Ekstrom (LLNL/Bringham Young University) +Josh England (TGS Management Corporation) +Kent Engstrom (National Supercomputer Centre, Sweden) +Jim Garlick (LLNL) +Didier Gazen (Laboratoire d'Aerologie, France) +Raphael Geissert (Debian) +Yiannis Georgiou (Bull) +Andriy Grytsenko (Massive Solutions Limited, Ukraine) +Mark Grondona (LLNL) +Takao Hatazaki (HP, Japan) +Matthieu Hautreux (CEA, France) +Chris Holmes (HP) +David Hoppner +Nathan Huff (North Dakota State University) +David Jackson (Adaptive Computing) +Morris Jette (LLNL, SchedMD LLC) +Klaus Joas (University Karlsruhe, Germany) +Greg Johnson (LANL) +Jason King (LLNL) +Aaron Knister (Environmental Protection Agency) +Nancy Kritkausky (Bull) +Roman Kurakin (Institute of Natural Science and Ecology, Russia) +Eric Lin (Bull) +Don Lipari (LLNL) +Puenlap Lee (Bull) +Dennis Leepow +Bernard Li (Genome Sciences Centre, Canada) +Donald Lipari (LLNL) +Steven McDougall (SiCortex) +Donna Mecozzi (LLNL) +Bjorn-Helge Mevik (University of Oslo, Norway) +Chris Morrone (LLNL) +Pere Munt (Barcelona Supercomputer Center, Spain) +Michal Novotny (Masaryk University, Czech Republic) +Bryan O'Sullivan (Pathscale) +Gennaro Oliva (Institute of High Performance Computing and Networking, Italy) +Alejandro Lucero Palau (Barcelona Supercomputer Center, Spain) +Daniel Palermo (HP) +Dan 
Phung (LLNL/Columbia University) +Ashley Pittman (Quadrics, UK) +Vijay Ramasubramanian (University of Maryland) +Krishnakumar Ravi[KK] (HP) +Petter Reinholdtsen (University of Oslo, Norway) +Gerrit Renker (Swiss National Computer Centre) +Andy Riebs (HP) +Asier Roa (Barcelona Supercomputer Center, Spain) +Miguel Ros (Barcelona Supercomputer Center, Spain) +Beat Rubischon (DALCO AG, Switzerland) +Dan Rusak (Bull) +Eygene Ryabinkin (Kurchatov Institute, Russia) +Federico Sacerdoti (D.E. Shaw) +Rod Schultz (Bull) +Tyler Strickland (University of Florida) +Jeff Squyres (LAM MPI) +Prashanth Tamraparni (HP, India) +Jimmy Tang (Trinity College, Ireland) +Kevin Tew (LLNL/Bringham Young University) +Adam Todorski (Rensselaer Polytechnic Institute) +Nathan Weeks (Iowa State University) +Tim Wickberg (Rensselaer Polytechnic Institute) +Ramiro Brito Willmersdorf (Universidade Federal de Pemambuco, Brazil) +Jay Windley (Linux NetworX) +Anne-Marie Wunderlin (Bull) + +CODE-OCEC-09-009. All rights reserved. + +This file is part of Slurm, a resource management program. +For details, see <https://slurm.schedmd.com/>. +Please also read the supplied file: DISCLAIMER. + +Slurm is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 of the License, or (at your option) +any later version. + +Slurm is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along +with Slurm; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +OUR NOTICE AND TERMS OF AND CONDITIONS OF THE GNU GENERAL PUBLIC LICENSE + +Our Preamble Notice + +Auspices + +This work performed under the auspices of the U.S.
Department of Energy by +Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344. + +Disclaimer + +This work was sponsored by an agency of the United States government. +Neither the United States Government nor Lawrence Livermore National +Security, LLC, nor any of their employees, makes any warranty, express +or implied, or assumes any liability or responsibility for the accuracy, +completeness, or usefulness of any information, apparatus, product, or +process disclosed, or represents that its use would not infringe privately +owned rights. References herein to any specific commercial products, process, +or services by trade names, trademark, manufacturer or otherwise does not +necessarily constitute or imply its endorsement, recommendation, or +favoring by the United States Government or the Lawrence Livermore National +Security, LLC. The views and opinions of authors expressed herein do not +necessarily state or reflect those of the United States government or +Lawrence Livermore National Security, LLC, and shall not be used for +advertising or product endorsement purposes. + +The precise terms and conditions for copying, distribution and modification +is provided in the file named "COPYING" in this directory. diff --git a/pyslurm/slurm/SLURM_LICENSE b/pyslurm/slurm/SLURM_LICENSE new file mode 100644 index 00000000..0fd4db48 --- /dev/null +++ b/pyslurm/slurm/SLURM_LICENSE @@ -0,0 +1,389 @@ + SLURM LICENSE AGREEMENT + +All Slurm code and documentation is available under the GNU General Public +License. Some tools in the "contribs" directory have other licenses. See +the documentation for individual contributed tools for details. + +In addition, as a special exception, the copyright holders give permission +to link the code of portions of this program with the OpenSSL library under +certain conditions as described in each individual source file, and distribute +linked combinations including the two. 
You must obey the GNU General Public +License in all respects for all of the code used other than OpenSSL. If you +modify file(s) with this exception, you may extend this exception to your +version of the file(s), but you are not obligated to do so. If you do not +wish to do so, delete this exception statement from your version. If you +delete this exception statement from all source files in the program, then +also delete it here. + +NO WARRANTY: Because the program is licensed free of charge, there is no +warranty for the program. See section 11 below for full details. + +============================================================================= + +OUR NOTICE AND TERMS OF AND CONDITIONS OF THE GNU GENERAL PUBLIC LICENSE + +Auspices + +Portions of this work were performed under the auspices of the U.S. Department +of Energy by Lawrence Livermore National Laboratory under Contract +DE-AC52-07NA27344. + +Disclaimer + +This work was sponsored by an agency of the United States government. +Neither the United States Government nor Lawrence Livermore National +Security, LLC, nor any of their employees, makes any warranty, express +or implied, or assumes any liability or responsibility for the accuracy, +completeness, or usefulness of any information, apparatus, product, or +process disclosed, or represents that its use would not infringe privately +owned rights. References herein to any specific commercial products, process, +or services by trade names, trademark, manufacturer or otherwise does not +necessarily constitute or imply its endorsement, recommendation, or +favoring by the United States Government or the Lawrence Livermore National +Security, LLC. The views and opinions of authors expressed herein do not +necessarily state or reflect those of the United States government or +Lawrence Livermore National Security, LLC, and shall not be used for +advertising or product endorsement purposes. 
+ +============================================================================= + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/pyslurm/slurm/__init__.pxd b/pyslurm/slurm/__init__.pxd index f1fbdd6f..f29bfc00 100644 --- a/pyslurm/slurm/__init__.pxd +++ b/pyslurm/slurm/__init__.pxd @@ -61,7 +61,6 @@ cdef extern from '' nogil: cdef extern from *: ctypedef struct slurm_job_credential ctypedef struct switch_jobinfo - ctypedef struct job_resources ctypedef struct select_jobinfo ctypedef struct select_nodeinfo ctypedef struct jobacctinfo diff --git a/pyslurm/slurm/extra.pxi b/pyslurm/slurm/extra.pxi index 50fccb23..0ccb0708 100644 --- a/pyslurm/slurm/extra.pxi +++ b/pyslurm/slurm/extra.pxi @@ -5,7 +5,6 @@ # For example: to communicate with the slurmctld directly in order # to retrieve the actual batch-script as a string. # - # https://github.com/SchedMD/slurm/blob/26abe9188ea8712ba1eab4a8eb6322851f06a108/src/common/slurm_persist_conn.h#L51 ctypedef enum persist_conn_type_t: PERSIST_TYPE_NONE = 0 @@ -23,6 +22,7 @@ ctypedef struct persist_msg_t: uint16_t msg_type ctypedef int (*_slurm_persist_conn_t_callback_proc) (void *arg, persist_msg_t *msg, buf_t **out_buffer, uint32_t *uid) + ctypedef void (*_slurm_persist_conn_t_callback_fini)(void *arg) # https://github.com/SchedMD/slurm/blob/26abe9188ea8712ba1eab4a8eb6322851f06a108/src/common/slurm_persist_conn.h#L66 @@ -116,7 +116,7 @@ ctypedef struct slurm_msg_t: # https://github.com/SchedMD/slurm/blob/fe82218def7b57f5ecda9222e80662ebbb6415f8/src/common/slurm_protocol_defs.c#L865 cdef extern void slurm_free_return_code_msg(return_code_msg_t *msg) -# https://github.com/SchedMD/slurm/blob/2d2e83674b59410a7ed8ab6fc8d8acfcfa8beaf9/src/common/slurm_protocol_api.c#L2401 +# https://github.com/SchedMD/slurm/blob/2d2e83674b59410a7ed8ab6fc8d8acfcfa8beaf9/src/common/slurm_protocol_api.c#L2401 cdef extern int slurm_send_recv_controller_msg(slurm_msg_t *request_msg, slurm_msg_t *response_msg, slurmdb_cluster_rec_t *working_cluster_rec) @@ -124,29 +124,58 @@ cdef extern int slurm_send_recv_controller_msg(slurm_msg_t *request_msg, # 
https://github.com/SchedMD/slurm/blob/fe82218def7b57f5ecda9222e80662ebbb6415f8/src/common/slurm_protocol_defs.c#L168 cdef extern void slurm_msg_t_init(slurm_msg_t *msg) +# https://github.com/SchedMD/slurm/blob/master/src/common/job_resources.h +ctypedef struct job_resources: + bitstr_t *core_bitmap + bitstr_t *core_bitmap_used + uint32_t cpu_array_cnt + uint16_t *cpu_array_value + uint32_t *cpu_array_reps + uint16_t *cpus + uint16_t *cpus_used + uint16_t *cores_per_socket + uint16_t cr_type + uint64_t *memory_allocated + uint64_t *memory_used + uint32_t nhosts + bitstr_t *node_bitmap + uint32_t node_req + char *nodes + uint32_t ncpus + uint32_t *sock_core_rep_count + uint16_t *sockets_per_node + uint16_t *tasks_per_node + uint16_t threads_per_core + uint8_t whole_node -# Global Environment +# +# TRES +# +ctypedef enum tres_types_t: + TRES_CPU = 1 + TRES_MEM + TRES_ENERGY + TRES_NODE + TRES_BILLING + TRES_FS_DISK + TRES_VMEM + TRES_PAGES + TRES_STATIC_CNT +# Global Environment cdef extern char **environ # # Slurm Memory routines +# We simply use the macros from xmalloc.h - more convenient # -cdef extern void slurm_xfree (void **) -cdef extern void *slurm_xcalloc(size_t, size_t, bool, bool, const char *, int, const char *) +cdef extern from "pyslurm/slurm/xmalloc.h" nogil: + void xfree(void *__p) + void *xmalloc(size_t __sz) + void *try_xmalloc(size_t __sz) -cdef inline xfree(void *__p): - slurm_xfree(&__p) - -cdef inline void *xmalloc(size_t __sz): - return slurm_xcalloc(1, __sz, True, False, __FILE__, __LINE__, __FUNCTION__) - -cdef inline void *try_xmalloc(size_t __sz): - return slurm_xcalloc(1, __sz, True, True, __FILE__, __LINE__, __FUNCTION__) - -cdef inline void xfree_ptr(void *__p): - slurm_xfree(&__p) +cdef extern void slurm_xfree_ptr(void *) # # Slurm xstring functions @@ -177,6 +206,16 @@ cdef extern void slurm_free_job_step_info_members(job_step_info_t *msg) cdef extern char *slurm_job_state_string(uint16_t inx) cdef extern char 
*slurm_job_reason_string(int inx) cdef extern char *slurm_job_share_string(uint16_t shared) +cdef extern void slurm_free_update_step_msg(step_update_request_msg_t *msg) + +# +# Slurm Node functions +# + +cdef extern int slurm_get_select_nodeinfo(dynamic_plugin_data_t *nodeinfo, select_nodedata_type data_type, node_states state, void *data) +cdef extern char *slurm_node_state_string_complete(uint32_t inx) +cdef extern void slurm_free_update_node_msg(update_node_msg_t *msg) +cdef extern void slurm_free_node_info_members(node_info_t *node) # # Slurm environment functions @@ -191,6 +230,7 @@ cdef extern void slurm_env_array_free(char **env_array) # cdef extern char *slurm_preempt_mode_string (uint16_t preempt_mode) +cdef extern uint16_t slurm_preempt_mode_num (const char *preempt_mode) cdef extern char *slurm_node_state_string (uint32_t inx) cdef extern char *slurm_step_layout_type_name (task_dist_states_t task_dist) cdef extern char *slurm_reservation_flags_string (reserve_info_t *resv_ptr) @@ -199,3 +239,35 @@ cdef extern int slurm_addto_char_list_with_case(List char_list, char *names, boo cdef extern int slurm_addto_step_list(List step_list, char *names) cdef extern int slurmdb_report_set_start_end_time(time_t *start, time_t *end) cdef extern uint16_t slurm_get_track_wckey() +cdef extern void slurm_sprint_cpu_bind_type(char *str, cpu_bind_type_t cpu_bind_type) + +# Slurm bit functions + +cdef extern bitstr_t *slurm_bit_alloc(bitoff_t nbits) +cdef extern void slurm_bit_set(bitstr_t *b, bitoff_t bit) +cdef extern int slurm_bit_test(bitstr_t *b, bitoff_t bit) +cdef extern char *slurm_bit_fmt(char *str, int32_t len, bitstr_t *b) +cdef extern void slurm_bit_free(bitstr_t **b) + + +cdef extern from *: + """ + #define bit_free(__b) slurm_bit_free((bitstr_t **)&(__b)) + #define FREE_NULL_BITMAP(_X) \ + do { \ + if (_X) \ + bit_free(_X); \ + _X = NULL; \ + } while(0) \ + """ + void bit_free(bitstr_t *_X) + void FREE_NULL_BITMAP(bitstr_t *_X) + +cdef extern char 
*slurm_hostlist_deranged_string_malloc(hostlist_t hl) + +# +# Slurmdbd functions +# + +cdef extern void slurmdb_job_cond_def_start_end(slurmdb_job_cond_t *job_cond) +cdef extern uint64_t slurmdb_find_tres_count_in_string(char *tres_str_in, int id) diff --git a/pyslurm/slurm/xmalloc.h b/pyslurm/slurm/xmalloc.h new file mode 100644 index 00000000..f1db7b5f --- /dev/null +++ b/pyslurm/slurm/xmalloc.h @@ -0,0 +1,117 @@ +/*****************************************************************************\ + * xmalloc.h - enhanced malloc routines for slurm + * - default: never return if errors are encountered. + * - attempt to report file, line, and calling function on assertion failure + * - use configurable slurm log facility for reporting errors + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Jim Garlick and + * Mark Grondona + * CODE-OCEC-09-009. All rights reserved. + * + * This file is part of Slurm, a resource management program. + * For details, see . + * Please also read the included file: DISCLAIMER. + * + * Slurm is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. 
If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Slurm; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ***************************************************************************** + * Description: + * + * void *xmalloc(size_t size); + * void xrealloc(void *p, size_t newsize); + * void xfree(void *p); + * int xsize(void *p); + * + * xmalloc(size) allocates size bytes and returns a pointer to the allocated + * memory. The memory is set to zero. xmalloc() will not return unless + * there are no errors. The memory must be freed using xfree(). + * + * xrealloc(p, newsize) changes the size of the block pointed to by p to the + * value of newsize. Newly allocated memory is zeroed. If p is NULL, + * xrealloc() performs the same function as `p = xmalloc(newsize)'. If p + * is not NULL, it is required to have been initialized with a call to + * [try_]xmalloc() or [try_]xrealloc(). + * + * xfree(p) frees the memory block pointed to by p. The memory must have been + * initialized with a call to [try_]xmalloc() or [try_]xrealloc(). + * + * xsize(p) returns the current size of the memory allocation pointed to by + * p. The memory must have been allocated with [try_]xmalloc() or + * [try_]xrealloc(). 
+ * +\*****************************************************************************/ + +#ifndef _XMALLOC_H +#define _XMALLOC_H + +#include <stdbool.h> +#include <sys/types.h> + +#define xcalloc(__cnt, __sz) \ + slurm_xcalloc(__cnt, __sz, true, false, __FILE__, __LINE__, __func__) + +#define try_xcalloc(__cnt, __sz) \ + slurm_xcalloc(__cnt, __sz, true, true, __FILE__, __LINE__, __func__) + +#define xcalloc_nz(__cnt, __sz) \ + slurm_xcalloc(__cnt, __sz, false, false, __FILE__, __LINE__, __func__) + +#define xmalloc(__sz) \ + slurm_xcalloc(1, __sz, true, false, __FILE__, __LINE__, __func__) + +#define try_xmalloc(__sz) \ + slurm_xcalloc(1, __sz, true, true, __FILE__, __LINE__, __func__) + +#define xmalloc_nz(__sz) \ + slurm_xcalloc(1, __sz, false, false, __FILE__, __LINE__, __func__) + +#define xfree(__p) slurm_xfree((void **)&(__p)) + +#define xfree_array(__p) slurm_xfree_array((void ***)&(__p)) + +#define xrecalloc(__p, __cnt, __sz) \ + slurm_xrecalloc((void **)&(__p), __cnt, __sz, true, false, __FILE__, __LINE__, __func__) + +#define xrealloc(__p, __sz) \ + slurm_xrecalloc((void **)&(__p), 1, __sz, true, false, __FILE__, __LINE__, __func__) + +#define try_xrealloc(__p, __sz) \ + slurm_xrecalloc((void **)&(__p), 1, __sz, true, true, __FILE__, __LINE__, __func__) + +#define xrealloc_nz(__p, __sz) \ + slurm_xrecalloc((void **)&(__p), 1, __sz, false, false, __FILE__, __LINE__, __func__) + +void *slurm_xcalloc(size_t, size_t, bool, bool, const char *, int, const char *); +void slurm_xfree(void **); +void slurm_xfree_array(void ***); +void *slurm_xrecalloc(void **, size_t, size_t, bool, bool, const char *, int, const char *); + +size_t xsize(void *item); + +void xfree_ptr(void *); + +#endif /* !_XMALLOC_H */ diff --git a/setup.cfg b/setup.cfg index 17a6e9f3..78d52108 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,9 @@ +[options] +packages = find: + +[options.packages.find] +include = pyslurm, pyslurm.* + [bdist_rpm] release = 1 packager = Giovanni Torres diff --git a/setup.py b/setup.py index
796faa6a..7b96fdc8 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ url="https://github.com/PySlurm/pyslurm", platforms=["Linux"], keywords=["HPC", "Batch Scheduler", "Resource Manager", "Slurm", "Cython"], - packages=["pyslurm"], classifiers=[ "Development Status :: 5 - Production/Stable", "Environment :: Console", diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..bf70149c --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,44 @@ +######################################################################### +# conftest.py - pytest fixtures +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +import pytest +from pyslurm import ( + Job, + JobSubmitDescription, +) +from util import create_simple_job_desc + + +@pytest.fixture +def submit_job(): + + jobs = [] + def _job(script=None, **kwargs): + job_desc = create_simple_job_desc(script, **kwargs) + job = Job(job_desc.submit()) + + jobs.append(job) + return job + + yield _job + + for j in jobs: + j.cancel() diff --git a/tests/integration/test_db_connection.py b/tests/integration/test_db_connection.py new file mode 100644 index 00000000..876ec63d --- /dev/null +++ b/tests/integration/test_db_connection.py @@ -0,0 +1,56 @@ +######################################################################### +# test_db_connection.py - database connection api integration tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+"""test_db_connection.py - Test database connection api functionalities.""" + +import pytest +import pyslurm + + +def test_create_instance(): + with pytest.raises(RuntimeError): + pyslurm.db.Connection() + + +def test_open(): + conn = pyslurm.db.Connection.open() + assert conn.is_open + + +def test_close(): + conn = pyslurm.db.Connection.open() + assert conn.is_open + + conn.close() + assert not conn.is_open + # no-op + conn.close() + + +def test_commit(): + conn = pyslurm.db.Connection.open() + assert conn.is_open + conn.commit() + + +def test_rollback(): + conn = pyslurm.db.Connection.open() + assert conn.is_open + conn.rollback() diff --git a/tests/integration/test_db_job.py b/tests/integration/test_db_job.py new file mode 100644 index 00000000..2c84ef4f --- /dev/null +++ b/tests/integration/test_db_job.py @@ -0,0 +1,100 @@ +######################################################################### +# test_db_job.py - database job api integration tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""test_db_job.py - Integration test database job api functionalities.""" + +import pytest +import pyslurm +import time +import util + + +# TODO: Instead of submitting new Jobs and waiting to test Database API +# functionality, we could just fill a slurm database with data on a host, then +# dump the slurm_acct_db to a SQL file and import it in the test environment +# before the integration tests are run. +# Just a few Jobs and other stuff is enough to keep it small, so it could also +# be put in the repository and uploaded to github. + + +def test_load_single(submit_job): + job = submit_job() + util.wait() + db_job = pyslurm.db.Job.load(job.id) + + assert db_job.id == job.id + + with pytest.raises(pyslurm.RPCError): + pyslurm.db.Job.load(1000) + + +def test_parse_all(submit_job): + job = submit_job() + util.wait() + db_job = pyslurm.db.Job.load(job.id) + job_dict = db_job.as_dict() + + assert job_dict["stats"] + assert job_dict["steps"] + + +def test_modify(submit_job): + # TODO + pass + + +def test_if_steps_exist(submit_job): + # TODO + pass + + +def test_load_with_filter_node(submit_job): + # TODO + pass + + +def test_load_with_filter_qos(submit_job): + # TODO + pass + + +def test_load_with_filter_cluster(submit_job): + # TODO + pass + + +def test_load_with_filter_multiple(submit_job): + # TODO + pass + + +def test_load_with_script(submit_job): + script = util.create_job_script() + job = submit_job(script=script) + util.wait(5) + db_job = pyslurm.db.Job.load(job.id, with_script=True) + assert db_job.script == script + + +def test_load_with_env(submit_job): + job = submit_job() + util.wait(5) + db_job = pyslurm.db.Job.load(job.id, with_env=True) + assert db_job.environment diff --git a/tests/integration/test_db_qos.py b/tests/integration/test_db_qos.py new file mode 100644 index 00000000..5bbd69e4 --- /dev/null +++ b/tests/integration/test_db_qos.py @@ -0,0 +1,55 @@ +######################################################################### +# test_db_qos.py - database
qos api integration tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""test_db_qos.py - Integration test database qos api functionalities.""" + +import pytest +import pyslurm +import time +import util + + +def test_load_single(): + qos = pyslurm.db.QualityOfService.load("normal") + + assert qos.name == "normal" + assert qos.id == 1 + + with pytest.raises(pyslurm.RPCError): + pyslurm.db.QualityOfService.load("qos_non_existent") + + +def test_parse_all(submit_job): + qos = pyslurm.db.QualityOfService.load("normal") + qos_dict = qos.as_dict() + + assert qos_dict + assert qos_dict["name"] == qos.name + + +def test_load_all(): + qos = pyslurm.db.QualitiesOfService.load() + assert qos + + +def test_load_with_filter_name(): + qfilter = pyslurm.db.QualityOfServiceSearchFilter(names=["non_existent"]) + qos = pyslurm.db.QualitiesOfService.load(qfilter) + assert not qos diff --git a/tests/integration/test_job.py b/tests/integration/test_job.py new file mode 100644 index 00000000..15c4bdef --- /dev/null +++ b/tests/integration/test_job.py @@ -0,0 +1,162 @@ +######################################################################### +# test_job.py - job api integration tests 
+######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +"""test_job.py - Integration test job api functionalities.""" + +import time +import pytest +import pyslurm +import util +from util import create_simple_job_desc +from pyslurm import ( + Job, + Jobs, + JobSubmitDescription, + RPCError, +) + + +def test_parse_all(submit_job): + job = submit_job() + # Use the as_dict() function to test if parsing works for all + # properties on a simple Job without error. + Job.load(job.id).as_dict() + + +def test_load(submit_job): + job = submit_job() + jid = job.id + + # Nothing has been loaded at this point, just make sure everything is + # on default values. 
+ assert job.ntasks == 1 + assert job.cpus_per_task == 1 + assert job.time_limit == None + + # Now load the job info + job = Job.load(jid) + + assert job.id == jid + assert job.ntasks == 2 + assert job.cpus_per_task == 3 + assert job.time_limit == 1440 + + with pytest.raises(RPCError): + Job.load(99999) + + +def test_cancel(submit_job): + job = submit_job() + job.cancel() + # make sure the job is actually cancelled + time.sleep(util.WAIT_SECS_SLURMCTLD) + assert Job.load(job.id).state == "CANCELLED" + + +def test_send_signal(submit_job): + job = submit_job() + + time.sleep(util.WAIT_SECS_SLURMCTLD) + assert Job.load(job.id).state == "RUNNING" + + # Send a SIGKILL (basically cancelling the Job) + job.send_signal(9) + + # make sure the job is actually cancelled + time.sleep(util.WAIT_SECS_SLURMCTLD) + assert Job.load(job.id).state == "CANCELLED" + + +def test_suspend_unsuspend(submit_job): + job = submit_job() + + time.sleep(util.WAIT_SECS_SLURMCTLD) + job.suspend() + assert Job.load(job.id).state == "SUSPENDED" + + job.unsuspend() + # make sure the job is actually running again + time.sleep(util.WAIT_SECS_SLURMCTLD) + assert Job.load(job.id).state == "RUNNING" + + +# Don't need to test hold/resume, since it uses just job.modify() to set +# priority to 0/INFINITE. 
+def test_modify(submit_job): + job = submit_job(priority=0) + job = Job(job.id) + + changes = JobSubmitDescription( + time_limit = "2-00:00:00", + ntasks = 5, + cpus_per_task = 4, + ) + + job.modify(changes) + job = Job.load(job.id) + + assert job.time_limit == 2880 + assert job.ntasks == 5 + assert job.cpus_per_task == 4 + + +def test_requeue(submit_job): + job = submit_job() + job = Job.load(job.id) + + assert job.requeue_count == 0 + + time.sleep(util.WAIT_SECS_SLURMCTLD) + job.requeue() + job = Job.load(job.id) + + assert job.requeue_count == 1 + + +def test_notify(submit_job): + job = submit_job() + time.sleep(util.WAIT_SECS_SLURMCTLD) + + # Could check the logfile, but we just assume for now + # that when this function raises no Exception, everything worked. + job.notify("Hello Friends!") + + +def test_get_batch_script(submit_job): + script_body = create_simple_job_desc().script + job = submit_job() + + assert script_body == job.get_batch_script() + + +def test_get_job_queue(submit_job): + # Submit 10 jobs, gather the job_ids in a list + job_list = [submit_job() for i in range(10)] + + jobs = Jobs.load() + for job in job_list: + # Check to see if all the Jobs we submitted exist + assert job.id in jobs + assert isinstance(jobs[job.id], Job) + + +def test_get_resource_layout_per_node(submit_job): + # TODO + assert True diff --git a/tests/integration/test_job_steps.py b/tests/integration/test_job_steps.py new file mode 100644 index 00000000..4ad2de39 --- /dev/null +++ b/tests/integration/test_job_steps.py @@ -0,0 +1,180 @@ +######################################################################### +# test_job_steps.py - job steps api integration tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software 
"""test_job_steps.py - Test the job steps api functions."""

import pytest
import time
from pyslurm import (
    JobStep,
    JobSteps,
    RPCError,
)
import util


def create_job_script_multi_step(steps=None):
    """Return a batch script that launches job steps and waits for them.

    Args:
        steps: Shell text inserted as the step-launching part of the script.
            When None, two background ``srun`` steps named ``step_zero``
            (2 cpus) and ``step_one`` (3 cpus, 10 minute limit) are used.

    Returns:
        The complete batch script as a string.
    """
    # The trailing backslashes are Python line continuations inside the
    # literal, so each srun command ends up on a single line in the script.
    default = """
    srun -n1 -N1 -c2 \
         -J step_zero --distribution=block:cyclic:block,Pack \
         sleep 300 &
    srun -n1 -N1 -c3 \
         -t 10 -J step_one --distribution=block:cyclic:block,Pack \
         sleep 300 &"""

    job_script = f"""\
#!/bin/bash

echo "Got args: $@"

/usr/bin/env

{default if steps is None else steps}
wait
"""
    return job_script


def test_load(submit_job):
    """Load the batch step and both srun steps and check their attributes."""
    job = submit_job(script=create_job_script_multi_step())

    # Load the step info, waiting a few seconds to make sure the Step
    # actually exists.
    util.wait()
    step = JobStep.load(job.id, "batch")

    assert step.id == "batch"
    assert step.job_id == job.id
    assert step.name == "batch"
    # Job was submitted with ntasks=2, but the batch step always has just 1.
    assert step.ntasks == 1
    # Job was submitted with a time-limit of 1 day, but it seems this doesn't
    # propagate through for the steps if not set explicitly.
    assert step.time_limit is None

    # Now try to load the first and second Step started by srun
    step_zero = JobStep.load(job, 0)
    step_one = JobStep.load(job, 1)

    # It is possible that the srun executed as the second command will
    # become the Step with ID '0' - so we just swap it.
    if step_zero.name == "step_one":
        step_zero, step_one = step_one, step_zero

        assert step_one.id == 0
        assert step_zero.id == 1

    step = step_zero
    assert step.job_id == job.id
    assert step.name == "step_zero"
    assert step.ntasks == 1
    assert step.alloc_cpus == 2
    assert step.time_limit is None

    step = step_one
    assert step.job_id == job.id
    assert step.name == "step_one"
    assert step.ntasks == 1
    assert step.alloc_cpus == 3
    assert step.time_limit == 10


def test_collection(submit_job):
    """Loading all steps of the job yields the batch step plus steps 0 and 1."""
    job = submit_job(script=create_job_script_multi_step())

    util.wait()
    steps = JobSteps.load(job)

    assert steps != {}
    # We have 3 Steps: batch, 0 and 1
    assert len(steps) == 3
    assert ("batch" in steps and
            0 in steps and
            1 in steps)


def test_cancel(submit_job):
    """Cancelling step 0 removes it from the collection loaded afterwards."""
    job = submit_job(script=create_job_script_multi_step())

    util.wait()
    steps = JobSteps.load(job)
    assert len(steps) == 3
    assert ("batch" in steps and
            0 in steps and
            1 in steps)

    steps[0].cancel()

    util.wait()
    steps = JobSteps.load(job)
    assert len(steps) == 2
    assert ("batch" in steps and
            1 in steps)


def test_modify(submit_job):
    """Change a step's time limit via a JobStep object and via keywords."""
    steps = "srun -t 20 sleep 100"
    job = submit_job(script=create_job_script_multi_step(steps))

    util.wait()
    step = JobStep.load(job, 0)
    assert step.time_limit == 20

    step.modify(JobStep(time_limit="00:05:00"))
    assert JobStep.load(job, 0).time_limit == 5

    step.modify(time_limit="00:15:00")
    assert JobStep.load(job, 0).time_limit == 15


def test_send_signal(submit_job):
    """Send SIGTERM to a running step and expect it to be gone afterwards."""
    steps = "srun -t 10 sleep 100"
    job = submit_job(script=create_job_script_multi_step(steps))

    util.wait()
    step = JobStep.load(job, 0)
    assert step.state == "RUNNING"

    # Send a SIGTERM (basically cancelling the Job)
    step.send_signal(15)

    # Make sure the job is actually cancelled.
    # If a RPCError is raised, this means the Step got cancelled.
    util.wait()
    with pytest.raises(RPCError):
        JobStep.load(job, 0)


def test_load_with_wrong_step_id(submit_job):
    """Loading a step id the job does not have raises RPCError."""
    job = submit_job()

    with pytest.raises(RPCError):
        JobStep.load(job, 3)


def test_parse_all(submit_job):
    """Smoke-test that every JobStep property can be parsed."""
    job = submit_job()

    # Use the as_dict() function to test if parsing works for all
    # properties on a simple JobStep without error.
    util.wait()
    JobStep.load(job, "batch").as_dict()
"""test_job_submit.py - Test the job submit api functions."""

import time
import pytest
import pyslurm
from os import environ as pyenviron
from util import create_simple_job_desc, create_job_script
from pyslurm import (
    Job,
    Jobs,
    JobSubmitDescription,
    RPCError,
)


def test_submit_example1():
    """Placeholder for the first job submission example; always passes."""
    # TODO
    assert True


def test_submit_example2():
    """Placeholder for the second job submission example; always passes."""
    # TODO
    assert True
"""test_node.py - Test the node api functions."""

import sys
import time
import pytest
import pyslurm
import os
from pyslurm import Node, Nodes, RPCError


def _first_node_name():
    """Return the name of the first node reported by Nodes.load()."""
    return Nodes.load().as_list()[0].name


def test_load():
    """Load an existing node by name; an unknown name raises RPCError."""
    name = _first_node_name()

    # Now load the node info
    node = Node.load(name)
    assert node.name == name
    assert node.weight is not None
    assert node.slurm_version is not None

    with pytest.raises(RPCError,
                       match="Node 'nonexistent' does not exist"):
        Node.load("nonexistent")


def test_create():
    """Create a node; passing "idle" to create() is rejected with RPCError."""
    Node("testhostpyslurm").create()

    with pytest.raises(RPCError,
                       match="Invalid node state specified"):
        Node("testhostpyslurm2").create("idle")


# def test_delete():
#     node = Node("testhost1").delete()


def test_modify():
    """Modify a node's weight via keyword and via a Node object."""
    node = Node(_first_node_name())

    node.modify(weight=10000)
    assert Node.load(node.name).weight == 10000

    node.modify(Node(weight=20000))
    assert Node.load(node.name).weight == 20000

    node.modify(Node(weight=5000))
    assert Node.load(node.name).weight == 5000


def test_parse_all():
    """Smoke-test that every Node property can be parsed via as_dict()."""
    Node.load(_first_node_name()).as_dict()
import pytest
from pyslurm import (
    Job,
    JobSubmitDescription,
)
import time

# Horrendous, but works for now, because when testing against a real slurmctld
# we need to wait a bit for state changes (i.e. we cancel a job and
# immediately check after if the state is really "CANCELLED", but the state
# hasn't changed yet, so we need to wait a bit)
WAIT_SECS_SLURMCTLD = 3


def wait(secs=WAIT_SECS_SLURMCTLD):
    """Block for `secs` seconds (defaults to WAIT_SECS_SLURMCTLD)."""
    time.sleep(secs)


def create_job_script():
    """Return a minimal bash batch script that sleeps for 500 seconds."""
    # The backslash after "sleep 500" is a line continuation inside the
    # literal, exactly as in the original text.
    return """\
#!/bin/bash

echo "Got args: $@"

/usr/bin/env

sleep 500\

"""


def create_simple_job_desc(script=None, **kwargs):
    """Build a JobSubmitDescription pre-filled with simple test values.

    Args:
        script: Batch script to use. A falsy value selects the script
            returned by create_job_script().
        **kwargs: Passed through to the JobSubmitDescription constructor.

    Returns:
        The populated JobSubmitDescription.
    """
    desc = JobSubmitDescription(**kwargs)

    desc.name = "test_job"
    desc.standard_output = "/tmp/slurm-test-%j.out"
    desc.memory_per_cpu = "1G"
    desc.ntasks = 2
    desc.cpus_per_task = 3
    if script:
        desc.script = script
    else:
        desc.script = create_job_script()
    desc.time_limit = "1-00:00:00"

    return desc
"""test_common.py - Test the most commonly used helper functions."""

import pyslurm
import pytest
import datetime
from pyslurm import Job, JobSubmitDescription, Node
from pyslurm.core.common.ctime import (
    timestr_to_mins,
    timestr_to_secs,
    mins_to_timestr,
    secs_to_timestr,
    date_to_timestamp,
    timestamp_to_date,
)
from pyslurm.core.common.uint import (
    u8,
    u16,
    u32,
    u64,
    u8_parse,
    u16_parse,
    u32_parse,
    u64_parse,
)
from pyslurm.core.common import (
    uid_to_name,
    gid_to_name,
    user_to_uid,
    group_to_gid,
    expand_range_str,
    humanize,
    dehumanize,
    signal_to_num,
    cpubind_to_num,
    nodelist_from_range_str,
    nodelist_to_range_str,
    _sum_prop,
)
from pyslurm.core.common import cstr


class TestTypes:
    """Round-trip tests for string, list, dict and unsigned-int helpers."""

    def test_strings(self):
        """String attributes round-trip; NUL truncates; empty maps to None."""
        n = Node()

        n.name = "Testing fmalloc string routines."
        assert n.name == "Testing fmalloc string routines."

        n.name = None
        assert n.name is None

        # Everything after a \0 will be cut off
        n.name = "test1\0test2"
        assert n.name == "test1"

        n.name = "\0"
        assert n.name is None

    def test_lists(self):
        """List attributes accept a list or comma string; empties read as []."""
        n = Node()
        input_as_list = ["test1", "test2", "test3", "test4"]
        input_as_str = ",".join(input_as_list)

        n.available_features = input_as_list
        assert n.available_features == input_as_list

        n.available_features = input_as_str
        assert n.available_features == input_as_list

        # Every "empty" input is read back as an empty list.
        for empty in ([], "", None):
            n.available_features = empty
            assert n.available_features == []

    def test_str_to_dict(self):
        """cstr.to_dict parses "k=v,k=v"; the empty string gives {}."""
        expected_dict = {"key1": "value1", "key2": "value2"}
        input_str = "key1=value1,key2=value2"
        assert cstr.to_dict(input_str) == expected_dict
        assert cstr.to_dict("") == {}

    def test_dict_to_str(self):
        """cstr.dict_to_str joins pairs, honours delimiters, rejects bad keys."""
        input_dict = {"key1": "value1", "key2": "value2"}
        expected_str = "key1=value1,key2=value2"
        assert cstr.dict_to_str(input_dict) == expected_str

        expected_str = "key1-value1:key2-value2"
        assert cstr.dict_to_str(input_dict, delim1=":", delim2="-") == expected_str

        # A key containing the delimiter must be rejected.
        input_dict = {"key1=": "value1", "key2": "value2"}
        with pytest.raises(ValueError,
                           match=r"Key or Value cannot contain either*"):
            cstr.dict_to_str(input_dict)

        # A string that is already formatted is passed through.
        expected_str = "key1=value1,key2=value2"
        assert cstr.dict_to_str(expected_str) == expected_str

        assert cstr.dict_to_str({}) is None
        assert cstr.dict_to_str("") is None

    def test_dict_to_gres_str(self):
        """cstr.from_gres_dict builds "gres:..." and passes such strings through."""
        input_dict = {"gpu:tesla": 3}
        expected_str = "gres:gpu:tesla:3"
        assert cstr.from_gres_dict(input_dict) == expected_str
        assert cstr.from_gres_dict(expected_str) == expected_str

        input_dict = {"gpu": 3}
        expected_str = "gres:gpu:3"
        assert cstr.from_gres_dict(input_dict) == expected_str
        assert cstr.from_gres_dict(expected_str) == expected_str

    def test_str_to_gres_dict(self):
        """Placeholder; always passes."""
        assert True

    def _uint_impl(self, func_set, func_get, typ):
        """Shared checks for one unsigned integer width.

        Args:
            func_set: Converter under test (u8/u16/u32/u64).
            func_get: Matching parser (u8_parse/.../u64_parse).
            typ: Bit width of the integer type.
        """
        # 2**typ-2 is read back as "no value", both as int and as str.
        val = func_set(2**typ-2)
        assert func_get(val) is None

        val = func_set(None)
        assert func_get(val) is None

        val = func_set(str(2**typ-2))
        assert func_get(val) is None

        val = func_set("unlimited", inf=True)
        assert func_get(val) == "unlimited"

        # Zero counts as "no value" unless zero_is_noval is disabled.
        val = func_set(0)
        assert func_get(val) is None

        val = func_set(0, zero_is_noval=False)
        assert func_get(val, zero_is_noval=False) == 0

        with pytest.raises(TypeError,
                           match="an integer is required"):
            func_set("unlimited")

        with pytest.raises(OverflowError,
                           match=r"can't convert negative value to*"):
            func_set(-1)

        with pytest.raises(OverflowError,
                           match=r"value too large to convert to*|"
                                  "Python int too large*"):
            func_set(2**typ)

    def test_u8(self):
        self._uint_impl(u8, u8_parse, 8)

    def test_u16(self):
        self._uint_impl(u16, u16_parse, 16)

    def test_u32(self):
        self._uint_impl(u32, u32_parse, 32)

    def test_u64(self):
        self._uint_impl(u64, u64_parse, 64)

#   def _uint_bool_impl(self, arg):
#       js = JobSubmitDescription()

#       setattr(js, arg, True)
#       assert getattr(js, arg) == True

#       setattr(js, arg, False)
#       assert getattr(js, arg) == False

#       # Set to true again to make sure toggling actually works.
#       setattr(js, arg, True)
#       assert getattr(js, arg) == True

#       setattr(js, arg, None)
#       assert getattr(js, arg) == False

#   def test_u8_bool(self):
#       self._uint_bool_impl("overcommit")

#   def test_u16_bool(self):
#       self._uint_bool_impl("requires_contiguous_nodes")

#   def test_u64_bool_flag(self):
#       self._uint_bool_impl("kill_on_invalid_dependency")


class TestTime:
    """Tests for the time string / timestamp conversion helpers."""

    def test_parse_minutes(self):
        """Convert between time strings and minutes, incl. special values."""
        mins = 60
        mins_str = "01:00:00"

        assert timestr_to_mins(mins_str) == mins
        assert timestr_to_mins("unlimited") == 2**32-1
        assert timestr_to_mins(None) == 2**32-2

        assert mins_to_timestr(mins) == mins_str
        assert mins_to_timestr(2**32-1) == "unlimited"
        assert mins_to_timestr(2**32-2) is None
        assert mins_to_timestr(0) is None

        with pytest.raises(ValueError,
                           match="Invalid Time Specification: invalid_val."):
            timestr_to_mins("invalid_val")

    def test_parse_seconds(self):
        """Convert between time strings and seconds, incl. special values."""
        secs = 3600
        secs_str = "01:00:00"

        assert timestr_to_secs(secs_str) == secs
        assert timestr_to_secs("unlimited") == 2**32-1
        assert timestr_to_secs(None) == 2**32-2

        assert secs_to_timestr(secs) == secs_str
        assert secs_to_timestr(2**32-1) == "unlimited"
        assert secs_to_timestr(2**32-2) is None
        assert secs_to_timestr(0) is None

        with pytest.raises(ValueError,
                           match="Invalid Time Specification: invalid_val."):
            timestr_to_secs("invalid_val")

    def test_parse_date(self):
        """Convert between date strings/datetimes and unix timestamps."""
        timestamp = 1667941697
        date = "2022-11-08T21:08:17"
        datetime_date = datetime.datetime(2022, 11, 8, 21, 8, 17)

        # Converting date str to timestamp with the slurm API functions may
        # not yield the expected timestamp above due to using local time zone
        assert date_to_timestamp(date) == timestamp
        assert date_to_timestamp(timestamp) == timestamp
        assert date_to_timestamp(datetime_date) == timestamp

        assert timestamp_to_date(timestamp) == date
        assert timestamp_to_date(0) is None
        assert timestamp_to_date(2**32-1) is None
        assert timestamp_to_date(2**32-2) is None

        with pytest.raises(ValueError,
                           match="Invalid Time Specification: 2022-11-08T21"):
            date_to_timestamp("2022-11-08T21")


class TestMiscUtil:
    """Tests for uid/gid lookup, unit conversion and nodelist helpers."""

    def test_parse_uid(self):
        """Map uid 0 <-> "root"; unknown users raise KeyError."""
        name = uid_to_name(0)
        assert name == "root"

        lookup = {0: "root"}
        name = uid_to_name(0, lookup=lookup)
        assert name == "root"

        uid = user_to_uid("root")
        assert uid == 0

        with pytest.raises(KeyError):
            uid_to_name(2**32-5)

        with pytest.raises(KeyError):
            user_to_uid("invalid_user")

    def test_parse_gid(self):
        """Map gid 0 <-> "root"; unknown groups raise KeyError."""
        name = gid_to_name(0)
        assert name == "root"

        lookup = {0: "root"}
        name = gid_to_name(0, lookup=lookup)
        assert name == "root"

        gid = group_to_gid("root")
        assert gid == 0

        with pytest.raises(KeyError):
            gid_to_name(2**32-5)

        with pytest.raises(KeyError):
            group_to_gid("invalid_group")

    def test_expand_range_str(self):
        """Expand a mixed range/list string into a list of ints."""
        r = expand_range_str("1-5,6,7,10-11")
        assert r == [1, 2, 3, 4, 5, 6, 7, 10, 11]

    def test_humanize(self):
        """humanize() formats numbers with a unit suffix."""
        assert humanize(1024) == "1.0G"
        assert humanize(2**20) == "1.0T"
        assert humanize(800) == "800.0M"
        assert humanize("unlimited") == "unlimited"
        assert humanize(None) is None

        with pytest.raises(ValueError):
            humanize("invalid_val")

    def test_dehumanize(self):
        """dehumanize() converts suffixed strings to the target unit."""
        # Note: default target unit for dehumanize is "M".
        assert dehumanize(1024) == 1024
        assert dehumanize("2M") == 2
        assert dehumanize("10G") == 10240
        assert dehumanize("9.6G") == round(1024*9.6)
        assert dehumanize("10T") == 10*(2**20)
        assert dehumanize("10T", target="G") == 10*(2**10)

        with pytest.raises(ValueError,
                           match="Invalid value specified: 10L"):
            dehumanize("10L")

        with pytest.raises(ValueError,
                           match="could not convert string to float: 'invalid_val'"):
            dehumanize("invalid_valM")

    def test_signal_to_num(self):
        """Signal names map to numbers; numbers pass through; junk raises."""
        sig = signal_to_num("SIGKILL")
        assert sig == 9

        sig = signal_to_num(7)
        assert sig == 7

        with pytest.raises(ValueError):
            signal_to_num("invalid_sig")

    def test_nodelist_from_range_str(self):
        """Expand a bracketed node range; a malformed range yields []."""
        nodelist = ["node001", "node007", "node008", "node009"]
        assert nodelist == nodelist_from_range_str("node[001,007-009]")
        assert nodelist_from_range_str("node[001,007:009]") == []

    def test_nodelist_to_range_str(self):
        """Compress a node list (as list or comma string) into a range string."""
        nodelist = ["node001", "node007", "node008", "node009"]
        nodelist_str = ",".join(nodelist)
        assert "node[001,007-009]" == nodelist_to_range_str(nodelist)
        assert "node[001,007-009]" == nodelist_to_range_str(nodelist_str)

    def test_summarize_property(self):
        """_sum_prop sums a property over a dict of objects; None sums to 0."""
        class TestObject:
            @property
            def memory(self):
                return 10240

            @property
            def cpus(self):
                return None

        object_dict = {i: TestObject() for i in range(10)}

        expected = 10240 * 10
        assert _sum_prop(object_dict, TestObject.memory) == expected

        expected = 0
        assert _sum_prop(object_dict, TestObject.cpus) == expected
"""test_db_job.py - Unit test basic database job functionalities."""

import pytest
import pyslurm


def test_search_filter():
    """Build a JobSearchFilter step by step; script plus env is rejected."""
    search = pyslurm.db.JobSearchFilter()

    search.clusters = ["test1"]
    search.partitions = ["partition1", "partition2"]
    search._create()

    search.ids = [1000, 1001]
    search._create()

    search.with_script = True
    search._create()

    # with_script is still True at this point, so enabling with_env as well
    # is expected to make _create() raise.
    search.with_env = True
    with pytest.raises(ValueError):
        search._create()


def test_collection_init():
    """Placeholder; always passes."""
    # TODO
    assert True


def test_create_instance():
    """A db.Job keeps the id it was constructed with."""
    db_job = pyslurm.db.Job(9999)
    assert db_job.id == 9999
"""test_db_qos.py - Unit test basic database qos functionalities."""

import pytest
import pyslurm


def test_search_filter():
    """Build a QualityOfServiceSearchFilter; a bad preempt mode is rejected."""
    search = pyslurm.db.QualityOfServiceSearchFilter()
    search._create()

    search.ids = [1, 2]
    search._create()

    search.preempt_modes = ["cluster"]
    search._create()

    # Either the assignment or the following _create() must raise.
    with pytest.raises(ValueError):
        search.preempt_modes = ["invalid_preempt_mode"]
        search._create()


def test_create_collection_instance():
    """Placeholder; always passes."""
    # TODO
    assert True


def test_create_instance():
    """A QualityOfService keeps the name it was constructed with."""
    quality = pyslurm.db.QualityOfService("test")
    assert quality.name == "test"
"""test_db_slurm_list.py - Unit test basic Slurm list functionalities."""

import pytest
import pyslurm
from pyslurm.core.db.util import SlurmList


def test_create_and_destroy_list():
    """Create empty and pre-filled lists, then deallocate one explicitly."""
    slist = SlurmList()
    assert not slist.is_null

    slist2 = SlurmList(["user1", "user2"])
    # Check the list that was just created, not the first one again.
    assert not slist2.is_null
    assert slist2.cnt == 2
    assert slist2.itr_cnt == 0
    assert slist2.is_itr_null

    slist2._dealloc_itr()
    slist2._dealloc_list()
    assert slist2.is_null


def test_append():
    """append() accepts a list, a str and an int, and ignores None items."""
    slist = SlurmList()
    input_list = ["user1", "user2", "user3"]
    slist.append(input_list)
    assert slist.cnt == len(input_list)

    input_str = "user4"
    slist.append(input_str)
    assert slist.cnt == 4

    input_int = 10
    slist.append(input_int)
    assert slist.cnt == 5

    # Only "user6" is counted; the None entry is skipped.
    input_ignore_none = ["user6", None]
    slist.append(input_ignore_none)
    assert slist.cnt == 6


def test_convert_to_pylist():
    """to_pylist() returns the items the list was built from."""
    input_list = ["user1", "user2", "user3"]
    slist = SlurmList(input_list)
    assert slist.cnt == 3
    assert slist.to_pylist() == input_list


def test_iter():
    """Iterating tracks itr_cnt, resets afterwards, and skips a null list."""
    input_list = ["user1", "user2", "user3"]
    slist = SlurmList(input_list)
    assert slist.itr_cnt == 0
    assert slist.is_itr_null
    assert not slist.is_null
    assert slist.cnt == 3

    for idx, slurm_item in enumerate(slist):
        assert not slist.is_itr_null
        assert slurm_item.has_data
        assert slist.itr_cnt == idx+1

    assert slist.itr_cnt == 0
    assert slist.is_itr_null

    slist._dealloc_list()
    assert slist.is_null
    assert slist.cnt == 0

    for _ in slist:
        # Should not be possible to get here
        pytest.fail("iterating a deallocated SlurmList yielded an item")


def test_iter_and_pop():
    """iter_and_pop() drains the list and can be reused after appending."""
    input_list = ["user1", "user2", "user3"]
    slist = SlurmList(input_list)
    assert slist.itr_cnt == 0
    assert slist.is_itr_null
    assert slist.cnt == 3

    for slurm_item in SlurmList.iter_and_pop(slist):
        assert slist.is_itr_null
        assert slurm_item.has_data

    assert slist.cnt == 0
    assert slist.itr_cnt == 0
    assert slist.is_itr_null

    # Round 2 on existing object
    slist.append(["user10", "user11"])
    assert slist.itr_cnt == 0
    assert slist.cnt == 2

    for slurm_item in SlurmList.iter_and_pop(slist):
        assert slurm_item.has_data

    assert slist.cnt == 0
    assert slist.itr_cnt == 0
    assert slist.is_itr_null


def test_iter_and_pop_on_null_list():
    """iter_and_pop() on a deallocated list yields nothing."""
    input_list = ["user1", "user2", "user3"]
    slist = SlurmList(input_list)
    assert not slist.is_null
    assert slist.cnt == 3

    slist._dealloc_list()
    assert slist.is_null
    assert slist.cnt == 0

    for _ in SlurmList.iter_and_pop(slist):
        # Should not be possible to get here
        pytest.fail("iter_and_pop on a deallocated SlurmList yielded an item")
"""test_job.py - Unit test basic job functionalities."""

import pytest
import pyslurm
from pyslurm import Job
from pyslurm.core.job.util import *


def test_create_instance():
    """A Job keeps the id it was constructed with."""
    instance = Job(9999)
    assert instance.id == 9999


def test_parse_all():
    """Smoke-test that every Job property can be parsed."""
    # Use the as_dict() function to test if parsing works for all
    # properties on a simple Job without error.
    Job(9999).as_dict()


def test_parse_dependencies_to_dict():
    """dependency_str_to_dict handles the empty string and a mixed spec."""
    no_dependencies = None
    assert dependency_str_to_dict("") == no_dependencies

    parsed = dependency_str_to_dict("after:1:2,afterok:3")
    assert parsed == {
        "after": [1, 2],
        "afterany": [],
        "afterburstbuffer": [],
        "aftercorr": [],
        "afternotok": [],
        "afterok": [3],
        "singleton": False,
        "satisfy": "all",
    }


def test_mail_types_int_to_list():
    """A mail type value of 0 converts to an empty list."""
    assert mail_type_int_to_list(0) == []


def test_acctg_profile_int_to_list():
    """An accounting profile value of 0 converts to an empty list."""
    assert acctg_profile_int_to_list(0) == []


def test_power_type_int_to_list():
    """A power type value of 0 converts to an empty list."""
    assert power_type_int_to_list(0) == []


def test_cpu_freq_int_to_str():
    """A cpu frequency value of 0 converts to None."""
    no_frequency = None
    assert cpu_freq_int_to_str(0) == no_frequency
"""test_job_steps.py - Unit test basic job step functionality."""

import pytest
from pyslurm import JobStep, Job
from pyslurm.core.job.step import (
    humanize_step_id,
    dehumanize_step_id,
)


def test_create_instance():
    """A JobStep can be built from a job id or from a Job object."""
    from_id = JobStep(9999, 1)
    assert from_id.id == 1
    assert from_id.job_id == 9999

    parent = Job(10000)
    from_job = JobStep(parent, 2)
    assert from_job.id == 2
    assert from_job.job_id == 10000


def test_parse_all():
    """Smoke-test that every JobStep property can be parsed."""
    # Use the as_dict() function to test if parsing works for all
    # properties on a simple JobStep without error.
    JobStep(9999, 1).as_dict()
"""test_job_submit.py - Test the job submit api functions."""

import sys
import time
import pytest
import pyslurm
import tempfile
import os
from os import environ as pyenviron
from util import create_simple_job_desc, create_job_script
from pyslurm import (
    Job,
    Jobs,
    JobSubmitDescription,
    RPCError,
)


def job_desc(**kwargs):
    """Return a JobSubmitDescription using the default test batch script."""
    return JobSubmitDescription(script=create_job_script(), **kwargs)


def test_environment():
    """The "ALL" and "NONE" environment settings are accepted."""
    job = job_desc()

    # Everything in the current environment will be exported
    job.environment = "ALL"
    job._create_job_submit_desc()

    # Only SLURM_* Vars from the current env will be exported
    job.environment = "NONE"
    job._create_job_submit_desc()

    # TODO: more test cases
    # Test explicitly set vars as dict
#    job.environment = {
#        "PYSLURM_TEST_VAR_1": 2,
#        "PYSLURM_TEST_VAR_2": "test-value",
#    }


def test_cpu_frequency():
    """Valid cpu_frequency forms are accepted; invalid ones raise ValueError."""
    job = job_desc()
    job._create_job_submit_desc()

    job.cpu_frequency = "Performance"
    job._create_job_submit_desc()

    job.cpu_frequency = {"governor": "Performance"}
    job._create_job_submit_desc()

    job.cpu_frequency = 1000000
    job._create_job_submit_desc()

    job.cpu_frequency = {"max": 1000000}
    job._create_job_submit_desc()

    job.cpu_frequency = "1000000-3700000"
    job._create_job_submit_desc()

    job.cpu_frequency = {"min": 1000000, "max": 3700000}
    job._create_job_submit_desc()

    job.cpu_frequency = "1000000-3700000:Performance"
    job._create_job_submit_desc()

    job.cpu_frequency = {"min": 1000000, "max": 3700000,
                         "governor": "Performance"}
    job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Invalid cpu_frequency format*"):
        job.cpu_frequency = "Performance:3700000"
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"min cpu-freq*"):
        job.cpu_frequency = "4000000-3700000"
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Invalid cpu freq value*"):
        job.cpu_frequency = "3700000:Performance"
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Setting Governor when specifying*"):
        job.cpu_frequency = {"max": 3700000, "governor": "Performance"}
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Setting Governor when specifying*"):
        job.cpu_frequency = {"min": 3700000, "governor": "Performance"}
        job._create_job_submit_desc()


def test_nodes():
    """Node counts are accepted as str or dict; max below min is rejected."""
    job = job_desc()
    job._create_job_submit_desc()

    job.nodes = "5"
    job._create_job_submit_desc()

    job.nodes = {"min": 5, "max": 5}
    job._create_job_submit_desc()

    job.nodes = "5-10"
    job._create_job_submit_desc()

    job.nodes = {"min": 5, "max": 10}
    job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Max Nodecount cannot be less than*"):
        job.nodes = {"min": 10, "max": 5}
        job._create_job_submit_desc()


def test_script():
    """Scripts can be given inline or as a path; invalid scripts are rejected."""
    job = job_desc()
    script = create_job_script()
    job._create_job_submit_desc()

    job.script = script
    assert job.script == script
    assert job.script_args is None

    # Try passing in a path to a script.
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(script)

        job.script = path
        job.script_args = "-t 10 input.csv"
        job._create_job_submit_desc()
    finally:
        os.remove(path)

    with pytest.raises(ValueError,
                       match=r"Passing arguments to a script*"):
        job.script = "#!/bin/bash\nsleep 10"
        job.script_args = "-t 10"
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"The Slurm Controller does not allow*"):
        job.script = script + "\0"
        job.script_args = None
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match="You need to provide a batch script."):
        job.script = ""
        job.script_args = None
        job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match=r"Batch script contains DOS line breaks*"):
        job.script = script + "\r\n"
        job.script_args = None
        job._create_job_submit_desc()


def test_dependencies():
    """Dependencies are accepted as a string ("," or "?") or as a dict."""
    job = job_desc()
    job._create_job_submit_desc()

    job.dependencies = "after:70:90:60+30,afterok:80"
    job._create_job_submit_desc()

    job.dependencies = "after:70:90:60?afterok:80"
    job._create_job_submit_desc()

    job.dependencies = {
        "afterany": [40, 30, 20],
        "afternotok": [100],
        "satisfy": "any",
        "singleton": True,
    }
    job._create_job_submit_desc()


def test_cpus():
    """cpus_per_task and cpus_per_gpu cannot both be set."""
    job = job_desc()
    job._create_job_submit_desc()

    job.cpus_per_task = 5
    job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match="cpus_per_task and cpus_per_gpu are mutually exclusive."):
        job.cpus_per_gpu = 5
        job._create_job_submit_desc()

    job.cpus_per_task = None
    job.cpus_per_gpu = 5
    job._create_job_submit_desc()

    with pytest.raises(ValueError,
                       match="cpus_per_task and cpus_per_gpu are mutually exclusive."):
        job.cpus_per_task = 5
        job._create_job_submit_desc()


def test_gres_per_node():
    """gres_per_node is accepted as a string or as a dict."""
    job = job_desc()
    job._create_job_submit_desc()

    job.gres_per_node = "gpu:tesla:1,gpu:volta:5"
    job._create_job_submit_desc()

    job.gres_per_node = {"gpu:tesla": 1, "gpu:volta": 1}
    job._create_job_submit_desc()


def test_signal():
    """signal is accepted as an int, a dict or a formatted string."""
    job = job_desc()
    job._create_job_submit_desc()

    job.signal = 7
    job._create_job_submit_desc()

    job.signal = {"batch_only": True}
    job._create_job_submit_desc()

    job.signal = "7@120"
    job._create_job_submit_desc()

    job.signal = "RB:8@180"
    job._create_job_submit_desc()


def test_setting_attrs_with_env_vars():
    """PYSLURM_JOBDESC_* variables fill unset attributes only.

    working_directory is passed to the constructor and is expected to keep
    that value even though the environment also provides one.
    """
    env_vars = {
        "PYSLURM_JOBDESC_ACCOUNT": "account1",
        "PYSLURM_JOBDESC_NAME": "jobname",
        "PYSLURM_JOBDESC_WCKEY": "wckey",
        "PYSLURM_JOBDESC_CLUSTERS": "cluster1,cluster2",
        "PYSLURM_JOBDESC_COMMENT": "A simple job comment",
        "PYSLURM_JOBDESC_REQUIRES_CONTIGUOUS_NODES": "True",
        "PYSLURM_JOBDESC_WORKING_DIRECTORY": "/work/user1",
    }
    # Remember previous values so the process environment can be restored
    # and these settings do not leak into tests that run afterwards.
    previous = {key: pyenviron.get(key) for key in env_vars}
    pyenviron.update(env_vars)
    try:
        job = job_desc(working_directory="/work/user2")
        job.load_environment()

        assert job.account == "account1"
        assert job.name == "jobname"
        assert job.wckey == "wckey"
        assert job.clusters == "cluster1,cluster2"
        assert job.comment == "A simple job comment"
        assert job.working_directory == "/work/user2"
        assert job.requires_contiguous_nodes == True
        job._create_job_submit_desc()
    finally:
        for key, old_value in previous.items():
            if old_value is None:
                pyenviron.pop(key, None)
            else:
                pyenviron[key] = old_value


def test_parsing_sbatch_options_from_script():
    """#SBATCH lines in a script file are parsed into job attributes."""
    job = job_desc(working_directory="/work/user2")

    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(
                """#!/bin/bash

                #SBATCH --time 20
                #SBATCH --mem-per-cpu =1G
                #SBATCH -G 1
                #SBATCH --exclusive
                #SBATCH --ntasks = 2
                #SBATCH -c=3 # inline-comments should be ignored

                sleep 1000
                """
            )

        job.script = path
        job.load_sbatch_options()
        assert job.time_limit == "20"
        assert job.memory_per_cpu == "1G"
        assert job.gpus == "1"
        assert job.resource_sharing == "no"
        assert job.ntasks == "2"
        assert job.cpus_per_task == "3"
        job._create_job_submit_desc()
    finally:
        os.remove(path)
b/tests/unit/test_node.py new file mode 100644 index 00000000..2caf8d37 --- /dev/null +++ b/tests/unit/test_node.py @@ -0,0 +1,44 @@ +######################################################################### +# test_node.py - node unit tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+"""test_node.py - Unit Test basic functionality of the Node class.""" + +import pytest +import pyslurm +from pyslurm import Node, Nodes + + +def test_create_instance(): + node = Node("localhost") + assert node.name == "localhost" + + +def test_parse_all(): + Node("localhost").as_dict() + + +def test_create_nodes_collection(): + # TODO + assert True + + +def test_setting_attributes(): + # TODO + assert True diff --git a/tests/unit/test_task_dist.py b/tests/unit/test_task_dist.py new file mode 100644 index 00000000..52a3e07c --- /dev/null +++ b/tests/unit/test_task_dist.py @@ -0,0 +1,52 @@ +######################################################################### +# test_task_dist.py - task distribution unit tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+"""test_task_dist.py - Test task distribution functions.""" + +import pyslurm +from pyslurm.core.job.task_dist import TaskDistribution + + +def test_from_int(): + expected = None + assert TaskDistribution.from_int(0) == expected + + +def test_from_str(): + + input_str = "cyclic:cyclic:cyclic" + expected = TaskDistribution("cyclic", "cyclic", "cyclic") + parsed = TaskDistribution.from_str(input_str) + assert parsed == expected + assert parsed.to_str() == input_str + + input_str = "*:*:fcyclic,NoPack" + expected = TaskDistribution("*", "*", "fcyclic", False) + parsed = TaskDistribution.from_str(input_str) + assert parsed == expected + assert parsed.to_str() == "block:cyclic:fcyclic,NoPack" + + input_plane_size = 10 + expected = TaskDistribution(plane_size=input_plane_size) + parsed = TaskDistribution.from_str(f"plane={input_plane_size}") + assert parsed == expected + assert parsed.to_str() == "plane" + assert parsed.plane == 10 +# assert parsed.as_int() == pyslurm.SLURM_DIST_PLANE diff --git a/tests/unit/util.py b/tests/unit/util.py new file mode 100644 index 00000000..d142a3a4 --- /dev/null +++ b/tests/unit/util.py @@ -0,0 +1,56 @@ +######################################################################### +# util.py - utility functions for tests +######################################################################### +# Copyright (C) 2023 Toni Harzendorf +# +# This file is part of PySlurm +# +# PySlurm is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# PySlurm is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with PySlurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +import pytest +from pyslurm import ( + Job, + JobSubmitDescription, +) + +# TODO: Figure out how to share this properly between the unit and integration +# folders + +def create_job_script(): + job_script = """\ +#!/bin/bash + +echo "Got args: $@" + +/usr/bin/env + +sleep 500\ + +""" + return job_script + + +def create_simple_job_desc(script=None, **kwargs): + job = JobSubmitDescription(**kwargs) + + job.name = "test_job" + job.standard_output = "/tmp/slurm-test-%j.out" + job.memory_per_cpu = "1G" + job.ntasks = 2 + job.cpus_per_task = 3 + job.script = create_job_script() if not script else script + job.time_limit = "1-00:00:00" + + return job diff --git a/valgrind-pyslurm.supp b/valgrind-pyslurm.supp new file mode 100644 index 00000000..d7243f44 --- /dev/null +++ b/valgrind-pyslurm.supp @@ -0,0 +1,544 @@ +# Initial suppression file taken from here: +# https://github.com/python/cpython/blob/77a3196b7cc17d90a8aae5629aa71ff183b9266a/Misc/valgrind-python.supp +# Extended with Slurm specific suppressions + +{ + Python _PyFunction_Vectorcall + Memcheck:Leak + match-leak-kinds: possible + fun:malloc + fun:_PyObject_GC_NewVar + obj:/usr/bin/python3.10 + fun:_PyFunction_Vectorcall + fun:_PyEval_EvalFrameDefault + fun:_PyFunction_Vectorcall + fun:_PyEval_EvalFrameDefault + fun:_PyFunction_Vectorcall + fun:_PyEval_EvalFrameDefault + fun:_PyFunction_Vectorcall + fun:_PyEval_EvalFrameDefault + fun:_PyFunction_Vectorcall +} + +### +### IGNORE POSSIBLE LEAKS CAUSED BY SOME INIT FUNCTIONS IN libslurm +### + +{ + Slurm select_g_init + Memcheck:Leak + match-leak-kinds: possible + ... + fun:select_g_init + ... +} + +{ + Slurm slurm_auth_init + Memcheck:Leak + match-leak-kinds: possible + ... + fun:slurm_auth_init + ... 
+} + +{ + Slurm slurm_conf_init/slurm_init + Memcheck:Leak + match-leak-kinds: possible + ... + fun:slurm_conf_init + fun:slurm_init + ... +} + +{ + Slurm hash_g_init + Memcheck:Leak + match-leak-kinds: possible + ... + fun:hash_g_init + ... +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Addr4 + fun:address_in_range +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Value4 + fun:address_in_range +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 8 (x86_64 aka amd64) + Memcheck:Value8 + fun:address_in_range +} + +{ + ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value + Memcheck:Cond + fun:address_in_range +} + +# +# Leaks (including possible leaks) +# Hmmm, I wonder if this masks some real leaks. I think it does. +# Will need to fix that. +# + +{ + Suppress leaking the GIL. Happens once per process, see comment in ceval.c. + Memcheck:Leak + fun:malloc + fun:PyThread_allocate_lock + fun:PyEval_InitThreads +} + +{ + Suppress leaking the GIL after a fork. + Memcheck:Leak + fun:malloc + fun:PyThread_allocate_lock + fun:PyEval_ReInitThreads +} + +{ + Suppress leaking the autoTLSkey. This looks like it shouldn't leak though. + Memcheck:Leak + fun:malloc + fun:PyThread_create_key + fun:_PyGILState_Init + fun:Py_InitializeEx + fun:Py_Main +} + +{ + Hmmm, is this a real leak or like the GIL? 
+ Memcheck:Leak + fun:malloc + fun:PyThread_ReInitTLS +} + +{ + Handle PyMalloc confusing valgrind (possibly leaked) + Memcheck:Leak + fun:realloc + fun:_PyObject_GC_Resize + fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING +} + +{ + Handle PyMalloc confusing valgrind (possibly leaked) + Memcheck:Leak + fun:malloc + fun:_PyObject_GC_New + fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING +} + +{ + Handle PyMalloc confusing valgrind (possibly leaked) + Memcheck:Leak + fun:malloc + fun:_PyObject_GC_NewVar + fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING +} + +# +# Non-python specific leaks +# + +{ + Handle pthread issue (possibly leaked) + Memcheck:Leak + fun:calloc + fun:allocate_dtv + fun:_dl_allocate_tls_storage + fun:_dl_allocate_tls +} + +{ + Handle pthread issue (possibly leaked) + Memcheck:Leak + fun:memalign + fun:_dl_allocate_tls_storage + fun:_dl_allocate_tls +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Addr4 + fun:_PyObject_Free +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Value4 + fun:_PyObject_Free +} + +{ + ADDRESS_IN_RANGE/Use of uninitialised value of size 8 + Memcheck:Addr8 + fun:_PyObject_Free +} + +{ + ADDRESS_IN_RANGE/Use of uninitialised value of size 8 + Memcheck:Value8 + fun:_PyObject_Free +} + +{ + ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value + Memcheck:Cond + fun:_PyObject_Free +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Addr4 + fun:_PyObject_Realloc +} + +{ + ADDRESS_IN_RANGE/Invalid read of size 4 + Memcheck:Value4 + fun:_PyObject_Realloc +} + +{ + ADDRESS_IN_RANGE/Use of uninitialised value of size 8 + Memcheck:Addr8 + fun:_PyObject_Realloc +} + +{ + ADDRESS_IN_RANGE/Use of uninitialised value of size 8 + Memcheck:Value8 + fun:_PyObject_Realloc +} + +{ + ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value + Memcheck:Cond + fun:_PyObject_Realloc +} + +### +### All the suppressions below are for errors that occur within libraries +### that Python uses. 
The problems do not appear to be related to Python's +### use of the libraries. +### + +{ + Generic ubuntu ld problems + Memcheck:Addr8 + obj:/lib/ld-2.4.so + obj:/lib/ld-2.4.so + obj:/lib/ld-2.4.so + obj:/lib/ld-2.4.so +} + +{ + Generic gentoo ld problems + Memcheck:Cond + obj:/lib/ld-2.3.4.so + obj:/lib/ld-2.3.4.so + obj:/lib/ld-2.3.4.so + obj:/lib/ld-2.3.4.so +} + +{ + DBM problems, see test_dbm + Memcheck:Param + write(buf) + fun:write + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + fun:dbm_close +} + +{ + DBM problems, see test_dbm + Memcheck:Value8 + fun:memmove + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + fun:dbm_store + fun:dbm_ass_sub +} + +{ + DBM problems, see test_dbm + Memcheck:Cond + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + fun:dbm_store + fun:dbm_ass_sub +} + +{ + DBM problems, see test_dbm + Memcheck:Cond + fun:memmove + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + obj:/usr/lib/libdb1.so.2 + fun:dbm_store + fun:dbm_ass_sub +} + +{ + GDBM problems, see test_gdbm + Memcheck:Param + write(buf) + fun:write + fun:gdbm_open + +} + +{ + Uninitialised byte(s) false alarm, see bpo-35561 + Memcheck:Param + epoll_ctl(event) + fun:epoll_ctl + fun:pyepoll_internal_ctl +} + +{ + ZLIB problems, see test_gzip + Memcheck:Cond + obj:/lib/libz.so.1.2.3 + obj:/lib/libz.so.1.2.3 + fun:deflate +} + +{ + Avoid problems w/readline doing a putenv and leaking on exit + Memcheck:Leak + fun:malloc + fun:xmalloc + fun:sh_set_lines_and_columns + fun:_rl_get_screen_size + fun:_rl_init_terminal_io + obj:/lib/libreadline.so.4.3 + fun:rl_initialize +} + +# Valgrind emits "Conditional jump or move depends on uninitialised value(s)" +# false alarms on GCC builtin strcmp() function. The GCC code is correct.
+# +# Valgrind bug: https://bugs.kde.org/show_bug.cgi?id=264936 +{ + bpo-38118: Valgrind emits false alarm on GCC builtin strcmp() + Memcheck:Cond + fun:PyUnicode_Decode +} + + +### +### These occur from somewhere within the SSL, when running +### test_socket_ssl. They are too general to leave on by default. +### +###{ +### somewhere in SSL stuff +### Memcheck:Cond +### fun:memset +###} +###{ +### somewhere in SSL stuff +### Memcheck:Value4 +### fun:memset +###} +### +###{ +### somewhere in SSL stuff +### Memcheck:Cond +### fun:MD5_Update +###} +### +###{ +### somewhere in SSL stuff +### Memcheck:Value4 +### fun:MD5_Update +###} + +# Fedora's package "openssl-1.0.1-0.1.beta2.fc17.x86_64" on x86_64 +# See http://bugs.python.org/issue14171 +{ + openssl 1.0.1 prng 1 + Memcheck:Cond + fun:bcmp + fun:fips_get_entropy + fun:FIPS_drbg_instantiate + fun:RAND_init_fips + fun:OPENSSL_init_library + fun:SSL_library_init + fun:init_hashlib +} + +{ + openssl 1.0.1 prng 2 + Memcheck:Cond + fun:fips_get_entropy + fun:FIPS_drbg_instantiate + fun:RAND_init_fips + fun:OPENSSL_init_library + fun:SSL_library_init + fun:init_hashlib +} + +{ + openssl 1.0.1 prng 3 + Memcheck:Value8 + fun:_x86_64_AES_encrypt_compact + fun:AES_encrypt +} + +# +# All of these problems come from using test_socket_ssl +# +{ + from test_socket_ssl + Memcheck:Cond + fun:BN_bin2bn +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:BN_num_bits_word +} + +{ + from test_socket_ssl + Memcheck:Value4 + fun:BN_num_bits_word +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:BN_mod_exp_mont_word +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:BN_mod_exp_mont +} + +{ + from test_socket_ssl + Memcheck:Param + write(buf) + fun:write + obj:/usr/lib/libcrypto.so.0.9.7 +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:RSA_verify +} + +{ + from test_socket_ssl + Memcheck:Value4 + fun:RSA_verify +} + +{ + from test_socket_ssl + Memcheck:Value4 + fun:DES_set_key_unchecked +} + +{ + from test_socket_ssl +
Memcheck:Value4 + fun:DES_encrypt2 +} + +{ + from test_socket_ssl + Memcheck:Cond + obj:/usr/lib/libssl.so.0.9.7 +} + +{ + from test_socket_ssl + Memcheck:Value4 + obj:/usr/lib/libssl.so.0.9.7 +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:BUF_MEM_grow_clean +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:memcpy + fun:ssl3_read_bytes +} + +{ + from test_socket_ssl + Memcheck:Cond + fun:SHA1_Update +} + +{ + from test_socket_ssl + Memcheck:Value4 + fun:SHA1_Update +} + +{ + test_buffer_non_debug + Memcheck:Addr4 + fun:PyUnicodeUCS2_FSConverter +} + +{ + test_buffer_non_debug + Memcheck:Addr4 + fun:PyUnicode_FSConverter +} + +{ + wcscmp_false_positive + Memcheck:Addr8 + fun:wcscmp + fun:_PyOS_GetOpt + fun:Py_Main + fun:main +} + +# Additional suppressions for the unified decimal tests: +{ + test_decimal + Memcheck:Addr4 + fun:PyUnicodeUCS2_FSConverter +} + +{ + test_decimal2 + Memcheck:Addr4 + fun:PyUnicode_FSConverter +} +