From 14d0a9254e2194c9a1e00641aae4a824f94f42d5 Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 28 Mar 2024 16:16:40 -0400 Subject: [PATCH 01/10] add getter methods output_file, error_file, and exitcode to retrieve output/error file and exitcode that can be used by subclasses add logic for extracting output and error file in slurm using 'scontrol show job' at job submission time which will be useful for extracting the output. Add method get_output_and_error_files in base class Job that will be used by subclass to implement how each scheduler will extract output and error files --- buildtest/executors/slurm.py | 10 ++--- buildtest/scheduler/cobalt.py | 15 ------- buildtest/scheduler/job.py | 20 +++++++++ buildtest/scheduler/lsf.py | 81 ++++++++++++++--------------------- buildtest/scheduler/pbs.py | 16 +------ buildtest/scheduler/slurm.py | 69 +++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+), 84 deletions(-) diff --git a/buildtest/executors/slurm.py b/buildtest/executors/slurm.py index 8a2272a66..8df379be5 100644 --- a/buildtest/executors/slurm.py +++ b/buildtest/executors/slurm.py @@ -107,6 +107,8 @@ def run(self, builder): msg = f"[blue]{builder}[/blue]: JobID {builder.metadata['jobid']} dispatched to scheduler" console.print(msg) + + builder.job.get_output_and_error_files() self.logger.debug(msg) return builder @@ -128,12 +130,8 @@ def gather(self, builder): f"[{builder.name}] returncode: {builder.metadata['result']['returncode']}" ) - builder.metadata["outfile"] = os.path.join( - builder.job.workdir(), builder.name + ".out" - ) - builder.metadata["errfile"] = os.path.join( - builder.job.workdir(), builder.name + ".err" - ) + builder.metadata["outfile"] = buildtest.job.output_file() + builder.metadata["errfile"] = buildtest.job.error_file() console.print(f"[blue]{builder}[/]: Job {builder.job.get()} is complete! ") builder.post_run_steps() diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py index 7ca4abc6d..c2bdfc47c 100644 --- a/buildtest/scheduler/cobalt.py +++ b/buildtest/scheduler/cobalt.py @@ -55,21 +55,6 @@ def cobalt_log(self): return self._cobaltlog - def output_file(self): - """Return job output file""" - - return self._outfile - - def error_file(self): - """Return job error file""" - - return self._errfile - - def exitcode(self): - """Return job exit code""" - - return self._exitcode - def poll(self): """Poll job by running ``qstat -l --header State `` which retrieves job state.""" diff --git a/buildtest/scheduler/job.py b/buildtest/scheduler/job.py index fd0295904..d61360b9b 100644 --- a/buildtest/scheduler/job.py +++ b/buildtest/scheduler/job.py @@ -8,6 +8,9 @@ class Job: def __init__(self, jobID): self.jobid = jobID self._state = None + self._outfile = None + self._errfile = None + self._exitcode = None # used to store the job elapsed time self.elapsedtime = 0 @@ -47,3 +50,20 @@ def cancel(self): def poll(self): """Poll job and update job state.""" raise NotImplementedError + + def get_output_and_error_files(self): + """Get output and error of job""" + raise NotImplementedError + + + def output_file(self): + """Return output file of job""" + return self._outfile + + def error_file(self): + """Return error file of job""" + return self._errfile + + def exitcode(self): + """Return exit code of job""" + return self._exitcode diff --git a/buildtest/scheduler/lsf.py b/buildtest/scheduler/lsf.py index 54ece37b7..a4941fa91 100644 --- a/buildtest/scheduler/lsf.py +++ b/buildtest/scheduler/lsf.py @@ -93,41 +93,42 @@ def poll(self): # if job is running and the start time is not recorded then we record the start time if self.is_running() and not self.starttime: self.starttime = time.time() + def get_output_and_error_files(self): + """This method will extract output and error file for a given jobID by running the following commands: + ``bjobs -noheader -o 'output_file' `` and ``bjobs -noheader -o 'error_file' `` - def gather(self): - """This method will retrieve the output and error file for a given jobID using the following commands. + .. code-block:: console - .. code-block:: console + $ bjobs -noheader -o 'output_file' 70910 + hold_job.out - $ bjobs -noheader -o 'output_file' 70910 - hold_job.out + .. code-block:: console - .. code-block:: console + $ bjobs -noheader -o 'error_file' 70910 + hold_job.err + """ + # get path to output file + query = f"bjobs -noheader -o 'output_file' {self.jobid} " + logger.debug( + f"Extracting OUTPUT FILE for job: {self.jobid} by running '{query}'" + ) + cmd = BuildTestCommand(query) + cmd.execute() + self._outfile = "".join(cmd.get_output()).rstrip() + logger.debug(f"Output File: {self._outfile}") - $ bjobs -noheader -o 'error_file' 70910 - hold_job.err - - We will gather job record at onset of job completion by running ``bjobs -o ' ' -json``. The format - fields extracted from job are the following: - - - "job_name" - - "stat" - - "user" - - "user_group" - - "queue" - - "proj_name" - - "pids" - - "exit_code" - - "from_host" - - "exec_host" - - "submit_time" - - "start_time" - - "finish_time" - - "nthreads" - - "exec_home" - - "exec_cwd" - - "output_file" - - "error_file" + # get path to error file + query = f"bjobs -noheader -o 'error_file' {self.jobid} " + logger.debug( + f"Extracting ERROR FILE for job: {self.jobid} by running '{query}'" + ) + cmd = BuildTestCommand(query) + cmd.execute() + self._errfile = "".join(cmd.get_output()).rstrip() + logger.debug(f"Error File: {self._errfile}") + + def gather(self): + """We will gather job record at onset of job completion by running ``bjobs -o ' ' -json``. T Shown below is the output format and we retrieve the job records defined in **RECORDS** property @@ -162,25 +163,7 @@ def gather(self): } """ - # get path to output file - query = f"bjobs -noheader -o 'output_file' {self.jobid} " - logger.debug( - f"Extracting OUTPUT FILE for job: {self.jobid} by running '{query}'" - ) - cmd = BuildTestCommand(query) - cmd.execute() - self._outfile = "".join(cmd.get_output()).rstrip() - logger.debug(f"Output File: {self._outfile}") - - # get path to error file - query = f"bjobs -noheader -o 'error_file' {self.jobid} " - logger.debug( - f"Extracting ERROR FILE for job: {self.jobid} by running '{query}'" - ) - cmd = BuildTestCommand(query) - cmd.execute() - self._errfile = "".join(cmd.get_output()).rstrip() - logger.debug(f"Error File: {self._errfile}") + self.get_output_and_error_files() format_fields = [ "job_name", diff --git a/buildtest/scheduler/pbs.py b/buildtest/scheduler/pbs.py index 37105e5ff..6ea3a2064 100644 --- a/buildtest/scheduler/pbs.py +++ b/buildtest/scheduler/pbs.py @@ -36,18 +36,6 @@ def is_suspended(self): """Return ``True`` if job is suspended which would be in one of these states ``H``, ``U``, ``S``.""" return self._state in ["H", "U", "S"] - def output_file(self): - """Return output file of job""" - return self._outfile - - def error_file(self): - """Return error file of job""" - return self._errfile - - def exitcode(self): - """Return exit code of job""" - return self._exitcode - def success(self): """This method determines if job was completed successfully and returns ``True`` if exit code is 0. @@ -64,7 +52,7 @@ def fail(self): """Return ``True`` if their is a job failure which would be if exit code is not 0""" return not self.success() - def fetch_output_error_files(self): + def get_output_error_files(self): """Fetch output and error files right after job submission.""" query = f"qstat -f {self.jobid}" cmd = BuildTestCommand(query) @@ -96,7 +84,7 @@ def fetch_output_error_files(self): def is_output_ready(self): """Check if the output and error file exists.""" if not self._outfile or not self._errfile: - self.fetch_output_error_files() + self.get_output_error_files() return os.path.exists(self._outfile) and os.path.exists(self._errfile) def poll(self): diff --git a/buildtest/scheduler/slurm.py b/buildtest/scheduler/slurm.py index aefe0232d..b33c53e69 100644 --- a/buildtest/scheduler/slurm.py +++ b/buildtest/scheduler/slurm.py @@ -148,6 +148,75 @@ def poll(self): if self.is_running() and not self.starttime: self.starttime = time.time() + def get_output_and_error_files(self): + """This method will extract file paths to StdOut and StdErr using ``scontrol show job `` command that will + be used to set output and error file. + + .. code-block:: console + + siddiq90@login07> scontrol show job 23608796 + JobId=23608796 JobName=perlmutter-gpu.slurm + UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A + Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug + JobState=PENDING Reason=Priority Dependency=(null) + Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 + RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A + SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05 + AccrueTime=2024-03-28T12:36:05 + StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A + SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:* + Partition=gpu_ss11 AllocNode:Sid=login07:1529462 + ReqNodeList=(null) ExcNodeList=(null) + NodeList= + NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:* + ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1 + AllocTRES=(null) + Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=* + MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0 + Features=gpu&a100 DelayBoot=00:00:00 + OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null) + Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm + WorkDir=/global/u1/s/siddiq90/jobs + StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out + StdIn=/dev/null + StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out + Power= + TresPerJob=gres:gpu:1 + + + """ + + query = f"scontrol show job {self.jobid}" + if self.cluster: + query += f" --clusters={self.cluster}" + + cmd = BuildTestCommand(query) + cmd.execute() + logger.debug(f"Querying JobID: '{self.jobid}' by running: '{query}'") + content = " ".join(cmd.get_output()) + + logger.debug(f"Output of scontrol show job {self.jobid}:\n{content}") + + pattern=r"StdOut=(?P.+)" + match = re.search(pattern, content) + logger.debug(f"Extracting StdOut file by applying regular expression: {pattern}") + if match: + self._outfile = match.group("stdout") + else: + logger.error(f"Unable to extract StdOut file from output: {content}") + + pattern=r"StdErr=(?P.+)" + match = re.search(pattern, output) + logger.debug(f"Extracting StdOut file by applying regular expression: {pattern}") + if match: + self._errfile = match.group("stderr") + else: + logger.error(f"Unable to extract StdErr file from error: {content}") + + + self.logger.debug(f"Output File: {self._outfile}") + self.logger.debug(f"Error File: {self._errfile}") + def gather(self): """Gather job record which is called after job completion. We use `sacct` to gather job record and return the job record as a dictionary. The command we run is From decaced09ec702447abf87c63230ba0ed5c334fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:21:38 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- buildtest/scheduler/job.py | 1 - buildtest/scheduler/lsf.py | 15 +++---- buildtest/scheduler/slurm.py | 77 +++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/buildtest/scheduler/job.py b/buildtest/scheduler/job.py index d61360b9b..80ff4212d 100644 --- a/buildtest/scheduler/job.py +++ b/buildtest/scheduler/job.py @@ -55,7 +55,6 @@ def get_output_and_error_files(self): """Get output and error of job""" raise NotImplementedError - def output_file(self): """Return output file of job""" return self._outfile diff --git a/buildtest/scheduler/lsf.py b/buildtest/scheduler/lsf.py index a4941fa91..6ae29a12f 100644 --- a/buildtest/scheduler/lsf.py +++ b/buildtest/scheduler/lsf.py @@ -93,19 +93,20 @@ def poll(self): # if job is running and the start time is not recorded then we record the start time if self.is_running() and not self.starttime: self.starttime = time.time() + def get_output_and_error_files(self): """This method will extract output and error file for a given jobID by running the following commands: - ``bjobs -noheader -o 'output_file' `` and ``bjobs -noheader -o 'error_file' `` + ``bjobs -noheader -o 'output_file' `` and ``bjobs -noheader -o 'error_file' `` - .. code-block:: console + .. code-block:: console - $ bjobs -noheader -o 'output_file' 70910 - hold_job.out + $ bjobs -noheader -o 'output_file' 70910 + hold_job.out - .. code-block:: console + .. code-block:: console - $ bjobs -noheader -o 'error_file' 70910 - hold_job.err + $ bjobs -noheader -o 'error_file' 70910 + hold_job.err """ # get path to output file query = f"bjobs -noheader -o 'output_file' {self.jobid} " diff --git a/buildtest/scheduler/slurm.py b/buildtest/scheduler/slurm.py index b33c53e69..e2119348b 100644 --- a/buildtest/scheduler/slurm.py +++ b/buildtest/scheduler/slurm.py @@ -150,38 +150,38 @@ def poll(self): def get_output_and_error_files(self): """This method will extract file paths to StdOut and StdErr using ``scontrol show job `` command that will - be used to set output and error file. - - .. code-block:: console - - siddiq90@login07> scontrol show job 23608796 - JobId=23608796 JobName=perlmutter-gpu.slurm - UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A - Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug - JobState=PENDING Reason=Priority Dependency=(null) - Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A - SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05 - AccrueTime=2024-03-28T12:36:05 - StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A - SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:* - Partition=gpu_ss11 AllocNode:Sid=login07:1529462 - ReqNodeList=(null) ExcNodeList=(null) - NodeList= - NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1 - AllocTRES=(null) - Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=* - MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0 - Features=gpu&a100 DelayBoot=00:00:00 - OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null) - Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm - WorkDir=/global/u1/s/siddiq90/jobs - StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out - StdIn=/dev/null - StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out - Power= - TresPerJob=gres:gpu:1 + be used to set output and error file. + + .. code-block:: console + + siddiq90@login07> scontrol show job 23608796 + JobId=23608796 JobName=perlmutter-gpu.slurm + UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A + Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug + JobState=PENDING Reason=Priority Dependency=(null) + Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 + RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A + SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05 + AccrueTime=2024-03-28T12:36:05 + StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A + SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:* + Partition=gpu_ss11 AllocNode:Sid=login07:1529462 + ReqNodeList=(null) ExcNodeList=(null) + NodeList= + NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:* + ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1 + AllocTRES=(null) + Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=* + MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0 + Features=gpu&a100 DelayBoot=00:00:00 + OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null) + Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm + WorkDir=/global/u1/s/siddiq90/jobs + StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out + StdIn=/dev/null + StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out + Power= + TresPerJob=gres:gpu:1 """ @@ -197,23 +197,26 @@ def get_output_and_error_files(self): logger.debug(f"Output of scontrol show job {self.jobid}:\n{content}") - pattern=r"StdOut=(?P.+)" + pattern = r"StdOut=(?P.+)" match = re.search(pattern, content) - logger.debug(f"Extracting StdOut file by applying regular expression: {pattern}") + logger.debug( + f"Extracting StdOut file by applying regular expression: {pattern}" + ) if match: self._outfile = match.group("stdout") else: logger.error(f"Unable to extract StdOut file from output: {content}") - pattern=r"StdErr=(?P.+)" + pattern = r"StdErr=(?P.+)" match = re.search(pattern, output) - logger.debug(f"Extracting StdOut file by applying regular expression: {pattern}") + logger.debug( + f"Extracting StdOut file by applying regular expression: {pattern}" + ) if match: self._errfile = match.group("stderr") else: logger.error(f"Unable to extract StdErr file from error: {content}") - self.logger.debug(f"Output File: {self._outfile}") self.logger.debug(f"Error File: {self._errfile}") From ca54ac83862f8fbae972d5721ec0fa061db8a550 Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 28 Mar 2024 13:31:52 -0700 Subject: [PATCH 03/10] fix issues after testing slurm job submission on NERSC system. There was some typos in variable names --- buildtest/executors/slurm.py | 4 ++-- buildtest/scheduler/slurm.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/buildtest/executors/slurm.py b/buildtest/executors/slurm.py index 8df379be5..a47cdf7db 100644 --- a/buildtest/executors/slurm.py +++ b/buildtest/executors/slurm.py @@ -130,8 +130,8 @@ def gather(self, builder): f"[{builder.name}] returncode: {builder.metadata['result']['returncode']}" ) - builder.metadata["outfile"] = buildtest.job.output_file() - builder.metadata["errfile"] = buildtest.job.error_file() + builder.metadata["outfile"] = builder.job.output_file() + builder.metadata["errfile"] = builder.job.error_file() console.print(f"[blue]{builder}[/]: Job {builder.job.get()} is complete! ") builder.post_run_steps() diff --git a/buildtest/scheduler/slurm.py b/buildtest/scheduler/slurm.py index e2119348b..30defeba6 100644 --- a/buildtest/scheduler/slurm.py +++ b/buildtest/scheduler/slurm.py @@ -1,4 +1,5 @@ import logging +import re import time from buildtest.scheduler.job import Job @@ -208,7 +209,7 @@ def get_output_and_error_files(self): logger.error(f"Unable to extract StdOut file from output: {content}") pattern = r"StdErr=(?P.+)" - match = re.search(pattern, output) + match = re.search(pattern, content) logger.debug( f"Extracting StdOut file by applying regular expression: {pattern}" ) @@ -217,8 +218,8 @@ def get_output_and_error_files(self): else: logger.error(f"Unable to extract StdErr file from error: {content}") - self.logger.debug(f"Output File: {self._outfile}") - self.logger.debug(f"Error File: {self._errfile}") + logger.debug(f"Output File: {self._outfile}") + logger.debug(f"Error File: {self._errfile}") def gather(self): """Gather job record which is called after job completion. We use `sacct` to gather From 5129d6f09de3970217935fb6fbeec97948bce45f Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 29 Mar 2024 07:49:38 -0400 Subject: [PATCH 04/10] add method retrieve_jobdata in Job class that will be used to retrieve job data that is implemented by subclass used for retrieving job record upon completion. The method jobdata will return job record which will be stored in internal variable self._jobdata. We renamed the methods in each subclass to use retrieve_jobdata to get the job records. Finally in each executor class we will invoke builder.job.jobdata() method to get the job data instead of having a return value from method which helps clean up code --- buildtest/executors/base.py | 2 +- buildtest/executors/slurm.py | 2 +- buildtest/scheduler/job.py | 7 +++++++ buildtest/scheduler/lsf.py | 19 ++----------------- buildtest/scheduler/pbs.py | 2 +- buildtest/scheduler/slurm.py | 11 +++-------- 6 files changed, 15 insertions(+), 28 deletions(-) diff --git a/buildtest/executors/base.py b/buildtest/executors/base.py index 1033206a5..1bb20320a 100644 --- a/buildtest/executors/base.py +++ b/buildtest/executors/base.py @@ -126,7 +126,7 @@ def gather(self, builder): builder.record_endtime() - builder.metadata["job"] = builder.job.gather() + builder.metadata["job"] = builder.job.jobdata() builder.metadata["result"]["returncode"] = builder.job.exitcode() self.logger.debug( diff --git a/buildtest/executors/slurm.py b/buildtest/executors/slurm.py index a47cdf7db..bd1a7a4c1 100644 --- a/buildtest/executors/slurm.py +++ b/buildtest/executors/slurm.py @@ -122,7 +122,7 @@ def gather(self, builder): """ builder.record_endtime() - builder.metadata["job"] = builder.job.gather() + builder.metadata["job"] = builder.job.jobdata() builder.metadata["result"]["returncode"] = builder.job.exitcode() diff --git a/buildtest/scheduler/job.py b/buildtest/scheduler/job.py index 80ff4212d..ce0e86014 100644 --- a/buildtest/scheduler/job.py +++ b/buildtest/scheduler/job.py @@ -11,6 +11,7 @@ def __init__(self, jobID): self._outfile = None self._errfile = None self._exitcode = None + self._jobdata = None # used to store the job elapsed time self.elapsedtime = 0 @@ -66,3 +67,9 @@ def error_file(self): def exitcode(self): """Return exit code of job""" return self._exitcode + + def retrieve_jobdata(self): + raise NotImplementedError + + def jobdata(self): + return self._jobdata diff --git a/buildtest/scheduler/lsf.py b/buildtest/scheduler/lsf.py index 6ae29a12f..f56678a03 100644 --- a/buildtest/scheduler/lsf.py +++ b/buildtest/scheduler/lsf.py @@ -39,21 +39,6 @@ def is_failed(self): return self._state == "EXIT" - def output_file(self): - """Return job output file""" - - return self._outfile - - def error_file(self): - """Return job error file""" - - return self._errfile - - def exitcode(self): - """Return job exit code""" - - return self._exitcode - def poll(self): """Given a job id we poll the LSF Job by retrieving its job state, output file, error file and exit code. We run the following commands to retrieve following states @@ -128,7 +113,7 @@ def get_output_and_error_files(self): self._errfile = "".join(cmd.get_output()).rstrip() logger.debug(f"Error File: {self._errfile}") - def gather(self): + def retrieve_jobdata(self): """We will gather job record at onset of job completion by running ``bjobs -o ' ' -json``. T Shown below is the output format and we retrieve the job records defined in **RECORDS** property @@ -203,7 +188,7 @@ def gather(self): for field, value in records.items(): job_data[field] = value - return job_data + self._jobdata = job_data def cancel(self): """Cancel LSF Job by running ``bkill ``. This method is called if job pending time exceeds diff --git a/buildtest/scheduler/pbs.py b/buildtest/scheduler/pbs.py index 6ea3a2064..8f8533a91 100644 --- a/buildtest/scheduler/pbs.py +++ b/buildtest/scheduler/pbs.py @@ -247,7 +247,7 @@ def poll(self): if self.is_running() and not self.starttime: self.starttime = time.time() - def gather(self): + def retrieve_jobdata(self): """This method is called once job is complete. We will gather record of job by running ``qstat -x -f -F json `` and return the json object as a dict. This method is responsible for getting output file, error file and exit status of job. diff --git a/buildtest/scheduler/slurm.py b/buildtest/scheduler/slurm.py index 30defeba6..b536f4138 100644 --- a/buildtest/scheduler/slurm.py +++ b/buildtest/scheduler/slurm.py @@ -90,11 +90,6 @@ def workdir(self): return self._workdir - def exitcode(self): - """Return job exit code""" - - return self._exitcode - def cancel(self): """Cancel job by running ``scancel ``. If job is specified to a slurm cluster we cancel job using ``scancel --clusters=``. This method @@ -221,8 +216,8 @@ def get_output_and_error_files(self): logger.debug(f"Output File: {self._outfile}") logger.debug(f"Error File: {self._errfile}") - def gather(self): - """Gather job record which is called after job completion. We use `sacct` to gather + def retrieve_jobdata(self): + """This method will get job record which is called after job completion. We use `sacct` to gather job record and return the job record as a dictionary. The command we run is ``sacct -j -X -n -P -o ,,...,``. We retrieve the following format fields from job record: @@ -326,4 +321,4 @@ def gather(self): for field, value in zip(sacct_fields, out): job_data[field] = value - return job_data + self._jobdata = job_data From 0d7a48f894705a5b156aa5f806a07831d315ae1d Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 29 Mar 2024 08:08:06 -0400 Subject: [PATCH 05/10] make changes necessary for cobalt scheduler with changes to method names --- buildtest/executors/cobalt.py | 5 ++++- buildtest/scheduler/cobalt.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/buildtest/executors/cobalt.py b/buildtest/executors/cobalt.py index 2c7726651..6a25d7040 100644 --- a/buildtest/executors/cobalt.py +++ b/buildtest/executors/cobalt.py @@ -106,11 +106,14 @@ def run(self, builder): logger.debug(f"Output file will be written to: {builder.metadata['outfile']}") logger.debug(f"Error file will be written to: {builder.metadata['errfile']}") - builder.metadata["job"] = builder.job.gather() + # gather job record + builder.job.retrieve_jobdata() + builder.metadata["job"] = builder.job.jobdata() logger.debug(json.dumps(builder.metadata["job"], indent=2)) return builder + def poll(self, builder): """This method is responsible for polling Cobalt job by invoking the builder method ``builder.job.poll()``. We check the job state and existence of output file. If file diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py index c2bdfc47c..947ea383e 100644 --- a/buildtest/scheduler/cobalt.py +++ b/buildtest/scheduler/cobalt.py @@ -75,7 +75,7 @@ def poll(self): logger.debug(f"Job ID: '{self.job}' Job State: {self._state}") - def gather(self): + def retrieve_jobdata(self): """Gather Job state by running **qstat -lf ** which retrieves all fields. The output is in text format which is parsed into key/value pair and stored in a dictionary. This method will return a dict containing the job record @@ -108,7 +108,8 @@ def gather(self): value = value.strip() job_record[key] = value - return job_record + self._jobdata = job_record + def cancel(self): """Cancel job by running ``qdel ``. This method is called if job timer exceeds From 8da708dc17f9b6ca32a770d334fff15955e9d41f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 12:13:33 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- buildtest/executors/cobalt.py | 1 - buildtest/scheduler/cobalt.py | 1 - 2 files changed, 2 deletions(-) diff --git a/buildtest/executors/cobalt.py b/buildtest/executors/cobalt.py index 6a25d7040..324c889d0 100644 --- a/buildtest/executors/cobalt.py +++ b/buildtest/executors/cobalt.py @@ -113,7 +113,6 @@ def run(self, builder): return builder - def poll(self, builder): """This method is responsible for polling Cobalt job by invoking the builder method ``builder.job.poll()``. We check the job state and existence of output file. If file diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py index 947ea383e..8f01dce92 100644 --- a/buildtest/scheduler/cobalt.py +++ b/buildtest/scheduler/cobalt.py @@ -110,7 +110,6 @@ def retrieve_jobdata(self): self._jobdata = job_record - def cancel(self): """Cancel job by running ``qdel ``. This method is called if job timer exceeds ``maxpendtime`` if job is pending. From b21b0178e05f818ff21230d9487457a0d1663aba Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 29 Mar 2024 13:02:39 +0000 Subject: [PATCH 07/10] update example buildspec to use queue named 'iris' and update the executor names in buildspec --- tests/examples/jlse/hold_job.yml | 4 ++-- tests/examples/jlse/hostname.yml | 4 ++-- tests/settings/jlse.yml | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/examples/jlse/hold_job.yml b/tests/examples/jlse/hold_job.yml index 311a07df3..fc5c5d8f2 100644 --- a/tests/examples/jlse/hold_job.yml +++ b/tests/examples/jlse/hold_job.yml @@ -1,8 +1,8 @@ buildspecs: hold_job: - executor: jlse.cobalt.testing + executor: jlse.cobalt.iris type: script tags: [jobs] - description: Hold Job on testing queue + description: Hold Job in queue cobalt: ["-n 1", "-t 10", "-h"] run: hostname diff --git a/tests/examples/jlse/hostname.yml b/tests/examples/jlse/hostname.yml index 72207373b..77903c088 100644 --- a/tests/examples/jlse/hostname.yml +++ b/tests/examples/jlse/hostname.yml @@ -1,8 +1,8 @@ buildspecs: hostname_test: - executor: jlse.cobalt.testing + executor: jlse.cobalt.iris type: script tags: [jobs] - description: Run hostname on testing queue + description: Run hostname as batch job cobalt: ["-n 1", "-t 10"] run: hostname diff --git a/tests/settings/jlse.yml b/tests/settings/jlse.yml index ca1326812..ee0a919be 100644 --- a/tests/settings/jlse.yml +++ b/tests/settings/jlse.yml @@ -1,7 +1,7 @@ system: jlse: # hostnames on JLSE where jobs are run are jlsebatch[1-2] - hostnames: ['^jlsebatch/d{1}$'] + hostnames: ['^jlsebatch\d{1}$'] moduletool: environment-modules poolsize: 8 max_jobs: 10 @@ -33,8 +33,8 @@ system: description: submit jobs on local machine using python shell shell: python cobalt: - testing: - queue: testing + iris: + queue: iris compilers: find: gcc: "^(gcc)" From 58e2e5a4842d693e07a46f695e410c95b698e834 Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 29 Mar 2024 13:03:13 +0000 Subject: [PATCH 08/10] fix issues relating to cobalt job submission --- buildtest/executors/cobalt.py | 25 +++++++++++-------------- buildtest/scheduler/cobalt.py | 7 +++++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/buildtest/executors/cobalt.py b/buildtest/executors/cobalt.py index 324c889d0..401afbee1 100644 --- a/buildtest/executors/cobalt.py +++ b/buildtest/executors/cobalt.py @@ -130,23 +130,20 @@ def poll(self, builder): self.gather(builder) return + builder.stop() - # if job is pending or suspended check if builder timer duration exceeds maxpendtime if so cancel job - if builder.job.is_pending() or builder.job.is_suspended(): - logger.debug(f"Time Duration: {builder.duration}") - logger.debug(f"Max Pend Time: {self.maxpendtime}") - - # if timer time is more than requested pend time then cancel job - if int(builder.timer.duration()) > self.maxpendtime: - builder.job.cancel() - builder.failed() - console.print( - f"[blue]{builder}[/]: [red]Cancelling Job {builder.job.get()} because job exceeds max pend time of {self.maxpendtime} sec with current pend time of {builder.timer.duration()} sec[/red] " - ) - return - builder.start() + if builder.job.is_running(): + builder.job.elapsedtime = time.time() - builder.job.starttime + builder.job.elapsedtime = round(builder.job.elapsedtime, 2) + if self._cancel_job_if_elapsedtime_exceeds_timeout(builder): + return + if builder.job.is_suspended() or builder.job.is_pending(): + if self._cancel_job_if_pendtime_exceeds_maxpendtime(builder): + return + builder.start() + def gather(self, builder): """This method is responsible for moving output and error file in the run directory. We need to read ``.cobaltlog`` file which contains diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py index 8f01dce92..fe1335957 100644 --- a/buildtest/scheduler/cobalt.py +++ b/buildtest/scheduler/cobalt.py @@ -1,5 +1,5 @@ import logging - +import time from buildtest.scheduler.job import Job from buildtest.utils.command import BuildTestCommand @@ -73,7 +73,10 @@ def poll(self): if job_state: self._state = job_state - logger.debug(f"Job ID: '{self.job}' Job State: {self._state}") + logger.debug(f"Job ID: '{self.jobid}' Job State: {self._state}") + + if self.is_running() and not self.starttime: + self.starttime = time.time() def retrieve_jobdata(self): """Gather Job state by running **qstat -lf ** which retrieves all fields. From 4fbfa56188be46e91bbc8d52abbbebb326c3ad9d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 13:03:57 +0000 Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- buildtest/executors/cobalt.py | 3 +-- buildtest/scheduler/cobalt.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/buildtest/executors/cobalt.py b/buildtest/executors/cobalt.py index 401afbee1..2d9dc7a17 100644 --- a/buildtest/executors/cobalt.py +++ b/buildtest/executors/cobalt.py @@ -130,7 +130,6 @@ def poll(self, builder): self.gather(builder) return - builder.stop() if builder.job.is_running(): @@ -143,7 +142,7 @@ def poll(self, builder): if self._cancel_job_if_pendtime_exceeds_maxpendtime(builder): return builder.start() - + def gather(self, builder): """This method is responsible for moving output and error file in the run directory. We need to read ``.cobaltlog`` file which contains diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py index fe1335957..a63e0a321 100644 --- a/buildtest/scheduler/cobalt.py +++ b/buildtest/scheduler/cobalt.py @@ -1,5 +1,6 @@ import logging import time + from buildtest.scheduler.job import Job from buildtest.utils.command import BuildTestCommand From 17465e067ebc7b28defe77efbbbe0ead0d90303d Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 29 Mar 2024 11:06:53 -0400 Subject: [PATCH 10/10] fix api generation issue reported by readthedocs build due to indentation issues --- buildtest/scheduler/pbs.py | 3 +- buildtest/scheduler/slurm.py | 56 ++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/buildtest/scheduler/pbs.py b/buildtest/scheduler/pbs.py index 8f8533a91..644211ca5 100644 --- a/buildtest/scheduler/pbs.py +++ b/buildtest/scheduler/pbs.py @@ -94,8 +94,7 @@ def poll(self): .. code-block:: console - - (buildtest) adaptive50@e4spro-cluster:~/Documents/buildtest/aws_oddc$ qstat -f 40680075.e4spro-cluster + (buildtest) adaptive50@e4spro-cluster:~/Documents/buildtest/aws_oddc$ qstat -f 40680075.e4spro-cluster Job Id: 40680075.e4spro-cluster Job_Name = hostname_test Job_Owner = adaptive50@server.nodus.com diff --git a/buildtest/scheduler/slurm.py b/buildtest/scheduler/slurm.py index b536f4138..d81abd7e0 100644 --- a/buildtest/scheduler/slurm.py +++ b/buildtest/scheduler/slurm.py @@ -150,34 +150,34 @@ def get_output_and_error_files(self): .. code-block:: console - siddiq90@login07> scontrol show job 23608796 - JobId=23608796 JobName=perlmutter-gpu.slurm - UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A - Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug - JobState=PENDING Reason=Priority Dependency=(null) - Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 - RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A - SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05 - AccrueTime=2024-03-28T12:36:05 - StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A - SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:* - Partition=gpu_ss11 AllocNode:Sid=login07:1529462 - ReqNodeList=(null) ExcNodeList=(null) - NodeList= - NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:* - ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1 - AllocTRES=(null) - Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=* - MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0 - Features=gpu&a100 DelayBoot=00:00:00 - OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null) - Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm - WorkDir=/global/u1/s/siddiq90/jobs - StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out - StdIn=/dev/null - StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out - Power= - TresPerJob=gres:gpu:1 + siddiq90@login07> scontrol show job 23608796 + JobId=23608796 JobName=perlmutter-gpu.slurm + UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A + Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug + JobState=PENDING Reason=Priority Dependency=(null) + Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 + RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A + SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05 + AccrueTime=2024-03-28T12:36:05 + StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A + SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:* + Partition=gpu_ss11 AllocNode:Sid=login07:1529462 + ReqNodeList=(null) ExcNodeList=(null) + NodeList= + NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:* + ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1 + AllocTRES=(null) + Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=* + MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0 + Features=gpu&a100 DelayBoot=00:00:00 + OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null) + Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm + WorkDir=/global/u1/s/siddiq90/jobs + StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out + StdIn=/dev/null + StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out + Power= + TresPerJob=gres:gpu:1 """