From 0d07511f014b9f33e973dff9cca3bfe6400b7f93 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Wed, 12 May 2021 12:25:08 +0200
Subject: [PATCH 01/96] New version 2.11.3.1

---
 PILOTVERSION            | 2 +-
 pilot/util/constants.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 00d0494d..243cbdb7 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-2.11.2.22
\ No newline at end of file
+2.11.3.1
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index a9054620..a40ec8fd 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -13,8 +13,8 @@
 # Pilot version
 RELEASE = '2'  # released number should be fixed at 2 for Pilot 2
 VERSION = '11'  # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates
-REVISION = '2'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '22'  # build number should be reset to '1' for every new development cycle
+REVISION = '3'  # revision number should be reset to '0' for every new version release, increased for small updates
+BUILD = '1'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1

From 830ab0aeb422282b083574ad585cda7ba75a5212 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 10:52:21 +0200
Subject: [PATCH 02/96] Added diagnostics to failed remote file open verification

---
 pilot/user/atlas/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index 13d0546e..a8657296 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -318,7 +318,7 @@ def get_payload_command(job):
 
         # fail the job if the remote files could not be verified
         if ec != 0:
-            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec)
+            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=diagnostics)
             raise PilotException(diagnostics, code=ec)
     else:
         logger.debug('no remote file open verification')

From 2f8660723e548ad4330a6976c5c9b96488ada09f Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:55:28 +0200
Subject: [PATCH 03/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index efb144bd..d99a7e85 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -41,7 +41,7 @@ testtransfertype: NULL
 pandaserver: https://pandaserver.cern.ch:25443
 # pandaserver: https://aipanda007.cern.ch:25443
 
-# The URL for the iDDS server (update actual URL later)
+# The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
 # The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in ddebug mode)

From ef73e5bbbcb5f8ddf8a6b29b0be9a7c3fc58a065 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:56:19 +0200
Subject: [PATCH 04/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index d99a7e85..3a50a0d0 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -44,7 +44,7 @@ pandaserver: https://pandaserver.cern.ch:25443
 # The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
-# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in ddebug mode)
+# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in debug mode)
 heartbeat: 1800
 debug_heartbeat: 300

From 0b928df490df09477d58ed2089242302f6e6d7a4 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:59:09 +0200
Subject: [PATCH 05/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index 3a50a0d0..1ab5b762 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -44,7 +44,7 @@ pandaserver: https://pandaserver.cern.ch:25443
 # The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
-# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in debug mode)
+# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode)
 heartbeat: 1800
 debug_heartbeat: 300

From 521374269d68e1be3b4a446dc8525a1775d509bd Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 19:35:04 +0200
Subject: [PATCH 06/96] Updated log message

---
 pilot/control/job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/control/job.py b/pilot/control/job.py
index 62ab7238..25396851 100644
--- a/pilot/control/job.py
+++ b/pilot/control/job.py
@@ -882,7 +882,7 @@ def validate(queues, traces, args):
         # run the delayed space check now
         proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False
         if proceed_with_local_space_check:
-            logger.debug('pilot will not perform delayed space check')
+            logger.debug('pilot will now perform delayed space check')
             ec, diagnostics = check_local_space()
             if ec != 0:
                 traces.pilot['error_code'] = errors.NOLOCALSPACE

From d67cab8ede3b27b3904136c0b35da1feaa422b35 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 17 May 2021 10:51:48 +0200
Subject: [PATCH 07/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index 1ab5b762..f8de374e 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -69,7 +69,7 @@ maximum_input_file_sizes: 14336 MB
 # Size limit of payload stdout size during running. unit is in kB (value = 2 * 1024 ** 2)
 local_size_limit_stdout: 2097152
 
-# Looping job time limits; if job does not write anything in N hours, it is considered a looping job
+# Looping job time limits; if job does not write anything in N minutes, it is considered to be a looping
 looping_verification_time: 900
 # for both production and user analysis jobs, 2*3600
 looping_limit_default: 7200

From f3bbf969bed436274b07e4af0caca673f605ffc9 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 17 May 2021 11:47:21 +0200
Subject: [PATCH 08/96] Now adding resimevents to job metrics also when it is zero

---
 PILOTVERSION                   | 2 +-
 pilot/user/atlas/common.py     | 4 ++--
 pilot/user/atlas/jobmetrics.py | 2 +-
 pilot/util/constants.py        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 243cbdb7..88b6eae4 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-2.11.3.1
\ No newline at end of file
+2.11.3.2
\ No newline at end of file
diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index a8657296..6a7d448c 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -1408,10 +1408,10 @@ def get_resimevents(jobreport_dictionary):
     This information is reported with the jobMetrics.
 
     :param jobreport_dictionary: job report dictionary.
-    :return: resimevents (int)
+    :return: resimevents (int or None)
     """
 
-    resimevents = 0
+    resimevents = None
     executor_dictionary = get_executor_dictionary(jobreport_dictionary)
     if executor_dictionary != {}:
diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py
index 503542e9..20b5d31a 100644
--- a/pilot/user/atlas/jobmetrics.py
+++ b/pilot/user/atlas/jobmetrics.py
@@ -54,7 +54,7 @@ def get_job_metrics_string(job):
         job_metrics += get_job_metrics_entry("dbTime", job.dbtime)
     if job.dbdata and job.dbdata != "":
         job_metrics += get_job_metrics_entry("dbData", job.dbdata)
-    if job.resimevents:
+    if job.resimevents is not None:
         job_metrics += get_job_metrics_entry("resimevents", job.resimevents)
 
     # get the max disk space used by the payload (at the end of a job)
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index a40ec8fd..161089a2 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -14,7 +14,7 @@
 RELEASE = '2'  # released number should be fixed at 2 for Pilot 2
 VERSION = '11'  # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates
 REVISION = '3'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '1'  # build number should be reset to '1' for every new development cycle
+BUILD = '2'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1

From 9787898c020c9ecf75f1b3289975c3cde2e801ae Mon Sep 17 00:00:00 2001
From: Shuwei Ye
Date: Tue, 18 May 2021 14:58:26 -0400
Subject: [PATCH 09/96] Changed gs.py for GCS buckets with automatic bucket name extraction

---
 pilot/copytool/gs.py | 68 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py
index af9cef30..fb5ed595 100644
--- a/pilot/copytool/gs.py
+++ b/pilot/copytool/gs.py
@@ -23,6 +23,7 @@ from .common import resolve_common_transfer_errors
 from pilot.common.errorcodes import ErrorCodes
 from pilot.common.exception import PilotException
 from pilot.util.ruciopath import get_rucio_path
+from pilot.util.config import config
 
 logger = logging.getLogger(__name__)
 errors = ErrorCodes()
@@ -52,17 +53,32 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs):
     :param fspec: file spec data
     :return: dictionary {'surl': surl}
     """
+
     ddm = ddmconf.get(fspec.ddmendpoint)
     if not ddm:
         raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)
 
-    if ddm.is_deterministic:
-        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn))
-    elif ddm.type in ['OS_ES', 'OS_LOGS']:
-        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn)
-        fspec.protocol_id = protocol.get('id')
+    dataset = fspec.dataset
+    if dataset:
+        dataset = dataset.replace("#{pandaid}",os.environ['PANDAID'])
     else:
-        raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED', fspec.ddmendpoint)
+        dataset = ""
+
+    remotePath = os.path.join(protocol.get('path', ''), dataset)
+
+    # pilot ID is passed by the envvar GTAG
+    # try:
+    #     rprotocols = ddm.rprotocols
+    #     logger.debug('ddm.rprotocols=%s' % rprotocols)
+    #     if "http_access" in rprotocols:
+    #         http_access = rprotocols["http_access"]
+    #         os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog)
+    #         logger.debug('http_access=%s' % http_access)
+    # except Exception as e:
+    #     logger.warning("Failed in get 'http_access' in ddm.rprotocols")
+ + surl = protocol.get('endpoint', '') + remotePath + logger.info('For GCS bucket, set surl=%s' % surl) # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -72,7 +88,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): def copy_in(files, **kwargs): """ - Download given files from an S3 bucket. + Download given files from a GCS bucket. :param files: list of `FileSpec` objects :raise: PilotException in case of controlled error @@ -103,7 +119,7 @@ def download_file(path, surl, object_name=None): :param path: Path to local file after download (string). :param surl: remote path (string). - :param object_name: S3 object name. If not specified then file_name from path is used. + :param object_name: GCS object name. If not specified then file_name from path is used. :return: True if file was uploaded (else False), diagnostics (string). """ @@ -136,11 +152,22 @@ def copy_out(files, **kwargs): for fspec in files: - path = os.path.join(workdir, fspec.lfn) + logger.info('Going to process fspec.turl=%s' % fspec.turl) + import re + # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) + reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remotePath) = reObj.groups() + + + # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: + for logFile in os.listdir(workdir): + if logFile.endswith("gz"): + continue + path = os.path.join(workdir, logFile) if os.path.exists(path): - bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, fspec.lfn)) - status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn) + objectName = os.path.join(remotePath, logFile) + logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, objectName)) + status, diagnostics = upload_file(path, bucket, object_name=objectName) if not status: ## an error occurred # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() @@ -163,25 +190,32 @@ def copy_out(files, **kwargs): def upload_file(file_name, bucket, object_name=None): """ - Upload a file to an S3 bucket. + Upload a file to a GCS bucket. :param file_name: File to upload. :param bucket: Bucket to upload to (string). - :param object_name: S3 object name. If not specified then file_name is used. + :param object_name: GCS object name. If not specified then file_name is used. :return: True if file was uploaded (else False), diagnostics (string). 
""" - # if S3 object_name was not specified, use file_name + # if GCS object_name was not specified, use file_name if object_name is None: object_name = file_name + # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) + # logger.debug('http_access=%s' % http_access) + # upload the file try: client = storage.Client() gs_bucket = client.get_bucket(bucket) - remote_path = file_name # update me - blob = gs_bucket.blob(remote_path) + logger.info('uploading a file to bucket=%s in full path=%s' % (bucket, object_name)) + blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) + if file_name.endswith(config.Pilot.pilotlog): + url_pilotLog = blob.public_url + os.environ['GTAG'] = url_pilotLog + logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From c54abb2f9a4ecb732e7a708e0dfbf77f4c280765 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 11:27:20 +0200 Subject: [PATCH 10/96] Removed default protocol value from trace - previously set to copy tool name (this is probably irrelevant since the proper value is set later when it is known) --- PILOTVERSION | 2 +- pilot/api/data.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 88b6eae4..6ee41598 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.2 \ No newline at end of file +2.11.3.3 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index ea1ba48f..bf5d73be 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -491,7 +491,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 module = self.copytool_modules[name]['module_name'] self.logger.info('trying to use copytool=%s for activity=%s' % (name, activity)) copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) # Python 2/3 - self.trace_report.update(protocol=name) + #self.trace_report.update(protocol=name) except PilotException as e: caught_errors.append(e) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 161089a2..06625198 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '3' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8273d4719852afd52712c9a8c47ae990efe9167f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 15:18:23 +0200 Subject: [PATCH 11/96] Added new pilot option for turning on/off rucio traces --- pilot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pilot.py b/pilot.py index 1b8ed093..9bcff83a 100755 --- a/pilot.py +++ b/pilot.py @@ -379,6 +379,11 @@ def get_args(): dest='jobtype', default='', help='Job type (managed, user)') + arg_parser.add_argument('--use-rucio-traces', + dest='use_rucio_traces', + type=str2bool, + default=True, + help='Use rucio traces') # HPC options arg_parser.add_argument('--hpc-resource', @@ -467,6 +472,9 @@ def set_environment_variables(args, mainworkdir): # set the (HPC) 
resource name (if set in options) environ['PILOT_RESOURCE_NAME'] = args.hpc_resource + # allow for the possibility of turning off rucio traces + environ['PILOT_USE_RUCIO_TRACES'] = str(args.use_rucio_traces) + # event service executor type environ['PILOT_ES_EXECUTOR_TYPE'] = args.executor_type From 77589e807a2fb65e805eb5380f416b0214c057f0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 15:59:14 +0200 Subject: [PATCH 12/96] Pilot now only sends rucio traces if required --- pilot/util/tracereport.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py index b00e7067..9f2f8c09 100644 --- a/pilot/util/tracereport.py +++ b/pilot/util/tracereport.py @@ -133,6 +133,11 @@ def send(self): :return: Boolean. """ + # only send trace if it is actually required (can be turned off with pilot option) + if environ.get('PILOT_USE_RUCIO_TRACES', 'True') == 'False': + logger.debug('rucio trace does not need to be sent') + return True + url = config.Rucio.url logger.info("tracing server: %s" % url) logger.info("sending tracing report: %s" % str(self)) From 396c66d6b6d56e3b8049d815690c01e202514f0e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 21 May 2021 17:13:47 +0200 Subject: [PATCH 13/96] Removed -p all option for xcache kill. Added debug info for xcache message log. Added preliminary functions for advanced debug mode. --- PILOTVERSION | 2 +- pilot/control/data.py | 3 ++ pilot/control/job.py | 70 +++++++++++++++++++++++++------------- pilot/info/jobdata.py | 5 +-- pilot/user/atlas/common.py | 10 ++++-- pilot/util/constants.py | 2 +- 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6ee41598..39d30056 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.3 \ No newline at end of file +2.11.3.7 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index be4fc8a2..d0c710ec 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -445,6 +445,9 @@ def set_xcache_log(line): result = re.findall(pattern, line) if result: os.environ['ALRB_XCACHE_LOG'] = result[0] + logger.debug('extracted xcache log path: ALRB_XCACHE_LOG=\'%s\'' % result[0]) + else: + logger.warning('failed to extract log path for ALRB_XCACHE_LOG from line: \'%s\'' % line) def copytool_in(queues, traces, args): diff --git a/pilot/control/job.py b/pilot/control/job.py index 25396851..95221dd5 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -477,6 +477,37 @@ def get_panda_server(url, port): return pandaserver +def get_debug_command(cmd): + """ + Identify and filter the given debug command. + + Note: only a single command will be allowed from a predefined list: tail, ls, gdb, ps, du. + + :param cmd: raw debug command from job definition (string). + :return: debug_mode (Boolean, True if command is deemed ok), debug_command (string). 
+ """ + + debug_mode = False + debug_command = "" + + allowed_commands = ['tail', 'ls', 'ps', 'gdb', 'du'] + forbidden_commands = ['rm'] + try: + tmp = cmd.split(' ') + com = tmp[0] + opts = tmp[1] + except Exception as e: + logger.warning('failed to identify debug command: %s' % e) + else: + if com not in allowed_commands: + logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) + elif ';' in opts or ';' in opts: + logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) + elif com in forbidden_commands: + logger.warning('command=%s is not allowed' % com) + return debug_mode, debug_command + + def handle_backchannel_command(res, job, args, test_tobekilled=False): """ Does the server update contain any backchannel information? if so, update the job object. @@ -493,9 +524,15 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): res['command'] = 'tobekilled' if 'command' in res and res.get('command') != 'NULL': - # look for 'tobekilled', 'softkill', 'debug', 'debugoff' # warning: server might return comma-separated string, 'debug,tobekilled' - if 'tobekilled' in res.get('command'): + cmd = res.get('command') + # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. + if ' ' in cmd: + try: + job.debug, job.debug_command = get_debug_command(cmd) + except Exception as e: + logger.debug('exception caught in get_debug_command(): %s' % e) + elif 'tobekilled' in cmd: logger.info('pilot received a panda server signal to kill job %s at %s' % (job.jobid, time_stamp())) set_pilot_state(job=job, state="failed") @@ -506,18 +543,18 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): else: logger.debug('no pid to kill') args.abort_job.set() - elif 'softkill' in res.get('command'): + elif 'softkill' in cmd: logger.info('pilot received a panda server signal to softkill job %s at %s' % (job.jobid, time_stamp())) # event service kill instruction - elif 'debug' in res.get('command'): - logger.info('pilot received a command to turn on debug mode from the server') + elif 'debug' in cmd: + logger.info('pilot received a command to turn on standard debug mode from the server') job.debug = True - elif 'debugoff' in res.get('command'): + elif 'debugoff' in cmd: logger.info('pilot received a command to turn off debug mode from the server') job.debug = False else: - logger.warning('received unknown server command via backchannel: %s' % res.get('command')) + logger.warning('received unknown server command via backchannel: %s' % cmd) def add_data_structure_ids(data, version_tag): @@ -723,7 +760,6 @@ def add_memory_info(data, workdir, name=""): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: - #for key in job.utilities utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) except Exception as e: @@ -731,20 +767,6 @@ def add_memory_info(data, workdir, name=""): pass -#def get_list_of_log_files(): -# """ -# Return a list of log files produced by the payload. -# -# :return: list of log files. -# """ -# -# list_of_files = get_files() -# if not list_of_files: # some TRFs produce logs with different naming scheme -# list_of_files = get_files(pattern="log.*") -# -# return list_of_files - - def remove_pilot_logs_from_list(list_of_files): """ Remove any pilot logs from the list of last updated files. 
@@ -753,6 +775,8 @@ def remove_pilot_logs_from_list(list_of_files): :return: list of files (list). """ + # note: better to move experiment specific files to user area + # ignore the pilot log files try: to_be_removed = [config.Pilot.pilotlog, config.Pilot.stageinlog, config.Pilot.stageoutlog, @@ -760,7 +784,7 @@ def remove_pilot_logs_from_list(list_of_files): config.Pilot.remotefileverification_log, config.Pilot.base_trace_report, config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, - 'eventLoopHeartBeat.txt'] + 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot'] except Exception as e: logger.warning('exception caught: %s' % e) to_be_removed = [] diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index fd959d61..f3ec0cb7 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -118,7 +118,8 @@ class JobData(BaseData): attemptnr = 0 # job attempt number destinationdblock = "" ## to be moved to FileSpec (job.outdata) datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) - debug = False # + debug = False # debug mode, when True, pilot will send debug info back to the server + debug_command = 'tail' # debug command (can be defined on the task side) produserid = "" # the user DN (added to trace report) jobdefinitionid = "" # the job definition id (added to trace report) infilesguids = "" # @@ -610,7 +611,7 @@ def clean__jobparams(self, raw, value): :return: updated job parameters (string). """ - #value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' + # value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' logger.info('cleaning jobparams: %s' % value) # user specific pre-filtering diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 6a7d448c..2a1a4502 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1897,7 +1897,7 @@ def get_utility_commands(order=None, job=None): elif order == UTILITY_AFTER_PAYLOAD_FINISHED: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) - elif 'pilotXcache' in job.infosys.queuedata.catchall: + if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_deactivation_command(job.workdir) elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: @@ -1944,11 +1944,15 @@ def xcache_deactivation_command(workdir): copy(path, dest) except Exception as e: logger.warning('exception caught copying xcache log: %s' % e) - + else: + if not path: + logger.warning('ALRB_XCACHE_LOG is not set') + if path and not os.path.exists(path): + logger.warning('path does not exist: %s' % path) command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 - return {'command': command, 'args': '-p all'} + return {'command': command, 'args': ''} def get_utility_command_setup(name, job, setup=None): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 06625198..9a459253 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' 
for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From fca62f243a33a75a93f1fd8492810a6ec03c0f79 Mon Sep 17 00:00:00 2001 From: Shuwei Ye Date: Fri, 21 May 2021 13:55:36 -0400 Subject: [PATCH 14/96] Adjusted gs.py text format to pass the flake8 check --- pilot/copytool/gs.py | 81 +++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index fb5ed595..03be7e77 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -22,7 +22,6 @@ from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException -from pilot.util.ruciopath import get_rucio_path from pilot.util.config import config logger = logging.getLogger(__name__) @@ -60,11 +59,11 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): dataset = fspec.dataset if dataset: - dataset = dataset.replace("#{pandaid}",os.environ['PANDAID']) + dataset = dataset.replace("#{pandaid}", os.environ['PANDAID']) else: - dataset = "" + dataset = "" - remotePath = os.path.join(protocol.get('path', ''), dataset) + remote_path = os.path.join(protocol.get('path', ''), dataset) # pilot ID is passed by the envvar GTAG # try: @@ -72,12 +71,12 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # logger.debug('ddm.rprotocols=%s' % rprotocols) # if "http_access" in rprotocols: # http_access = rprotocols["http_access"] - # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) + # os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog) # logger.debug('http_access=%s' % http_access) # except Exception as e: # logger.warning("Failed in get 'http_access' in ddm.rprotocols") - surl = protocol.get('endpoint', '') + remotePath + surl = protocol.get('endpoint', '') + remote_path logger.info('For GCS bucket, set surl=%s' % surl) # example: @@ -151,39 +150,38 @@ def copy_out(files, **kwargs): workdir = kwargs.pop('workdir') for fspec in files: - - logger.info('Going to process fspec.turl=%s' % fspec.turl) - import re - # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) - reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) - (bucket, remotePath) = reObj.groups() - - - # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: - for logFile in os.listdir(workdir): - if logFile.endswith("gz"): - continue - path = os.path.join(workdir, logFile) - if os.path.exists(path): - objectName = os.path.join(remotePath, logFile) - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, objectName)) - status, diagnostics = upload_file(path, bucket, object_name=objectName) - - if not status: ## an error occurred - # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() - error = resolve_common_transfer_errors(diagnostics, is_stagein=False) + logger.info('Going to process fspec.turl=%s' % fspec.turl) + + import re + # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) + reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remote_path) = reObj.groups() + + # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: + for logfile in os.listdir(workdir): + if logfile.endswith("gz"): + continue + path = os.path.join(workdir, logfile) + if os.path.exists(path): + object_name = os.path.join(remote_path, logfile) + logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, 
object_name)) + status, diagnostics = upload_file(path, bucket, object_name=object_name) + + if not status: ## an error occurred + # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() + error = resolve_common_transfer_errors(diagnostics, is_stagein=False) + fspec.status = 'failed' + fspec.status_code = error.get('rcode') + raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) + else: + diagnostics = 'local output file does not exist: %s' % path + logger.warning(diagnostics) fspec.status = 'failed' - fspec.status_code = error.get('rcode') - raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) - else: - diagnostics = 'local output file does not exist: %s' % path - logger.warning(diagnostics) - fspec.status = 'failed' - fspec.status_code = errors.STAGEOUTFAILED - raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status) + fspec.status_code = errors.STAGEOUTFAILED + raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status) - fspec.status = 'transferred' - fspec.status_code = 0 + fspec.status = 'transferred' + fspec.status_code = 0 return files @@ -202,9 +200,6 @@ def upload_file(file_name, bucket, object_name=None): if object_name is None: object_name = file_name - # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) - # logger.debug('http_access=%s' % http_access) - # upload the file try: client = storage.Client() @@ -213,9 +208,9 @@ def upload_file(file_name, bucket, object_name=None): blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): - url_pilotLog = blob.public_url - os.environ['GTAG'] = url_pilotLog - logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) + url_pilotLog = blob.public_url + os.environ['GTAG'] = url_pilotLog + logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From 37b26eb604ee634a5fbcbfaf7d553880eede97cd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 09:39:21 +0200 Subject: [PATCH 15/96] Corrected xcache kill. Refactored env var support functions. Initial commit for dask class --- PILOTVERSION | 2 +- pilot/api/dask.py | 40 +++++++++++++++++++++++++++++++++++++ pilot/control/data.py | 41 +++++++++++++++++++------------------- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- 5 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 pilot/api/dask.py diff --git a/PILOTVERSION b/PILOTVERSION index 39d30056..bd49a880 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.7 \ No newline at end of file +2.11.3.11 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py new file mode 100644 index 00000000..2fe123a9 --- /dev/null +++ b/pilot/api/dask.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 + +#from pilot.common.exception import NotDefined, NotSameLength, UnknownException +#from pilot.util.filehandling import get_table_from_file +#from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string + +import logging +logger = logging.getLogger(__name__) + + +class Dask(object): + """ + Dask interface class. + """ + + status = None + loadbalancerip = None + + def __init__(self, **kwargs): + """ + Init function. + + :param kwargs: + """ + + pass + + def install(self): + """ + + """ + + pass diff --git a/pilot/control/data.py b/pilot/control/data.py index d0c710ec..314caa52 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -416,38 +416,37 @@ def stage_out_auto(site, files): def xcache_proxy(output): + """ + + """ for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: - set_xcache_proxy(line, remote='REMOTE' in line) - if 'Messages logged in' in line: - set_xcache_log(line) - + remote = 'REMOTE' in line + name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' + pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: + set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + elif 'Messages logged in' in line: + set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') -def set_xcache_proxy(line, remote=None): - - import re - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - if remote: - os.environ['ALRB_XCACHE_PROXY_REMOTE'] = result[0] - else: - os.environ['ALRB_XCACHE_PROXY'] = result[0] +def set_xcache_var(line, name='', pattern=''): + """ + Extract the value of a given environmental variable from a given stdout line. -def set_xcache_log(line): + :param line: line from stdout to be investigated (string). + :param name: name of env var (string). + :param pattern: regex pattern (string). 
+ :return: + """ import re - pattern = r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' pattern = re.compile(pattern) result = re.findall(pattern, line) if result: - os.environ['ALRB_XCACHE_LOG'] = result[0] - logger.debug('extracted xcache log path: ALRB_XCACHE_LOG=\'%s\'' % result[0]) - else: - logger.warning('failed to extract log path for ALRB_XCACHE_LOG from line: \'%s\'' % line) + os.environ[name] = result[0] def copytool_in(queues, traces, args): diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 2a1a4502..1c156b71 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1952,7 +1952,7 @@ def xcache_deactivation_command(workdir): command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 - return {'command': command, 'args': ''} + return {'command': command, 'args': '-p $ALRB_XCACHE_MYPROCESS'} def get_utility_command_setup(name, job, setup=None): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9a459253..e64ab6e2 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '7' # build number should be reset to '1' for every new development cycle +BUILD = '11' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d95d55923da39e02f5b81ca8b73cf361354fc531 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 09:44:40 +0200 Subject: [PATCH 16/96] Added function comments --- pilot/control/data.py | 4 ++++ pilot/util/constants.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 314caa52..53f671c7 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -417,9 +417,13 @@ def stage_out_auto(site, files): def xcache_proxy(output): """ + Extract env vars from xcache stdout and set them. + :param output: command output (string). + :return: """ + # loop over each line in the xcache stdout and identify the needed environmental variables for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: remote = 'REMOTE' in line diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e64ab6e2..6837fa95 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '11' # build number should be reset to '1' for every new development cycle +BUILD = '12' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 3f1df918096d059f1a0a322c54d5f7b1da4a77b1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:08:33 +0200 Subject: [PATCH 17/96] Moved xcache code to atlas area. Corrected xcache output handling. 
Lots of debugging info for xcache [to be removed again] --- PILOTVERSION | 2 +- pilot/control/data.py | 72 ++++++++++++++++-------------- pilot/control/payloads/generic.py | 41 ++++++++++++++--- pilot/user/atlas/common.py | 73 ++++++++++++++++++++++++++++++- pilot/user/generic/common.py | 15 +++++++ pilot/util/constants.py | 2 +- 6 files changed, 162 insertions(+), 43 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bd49a880..c51eef03 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.11 \ No newline at end of file +2.11.3.13 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index 53f671c7..f8e80c94 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -32,7 +32,7 @@ from pilot.util.constants import PILOT_PRE_STAGEIN, PILOT_POST_STAGEIN, PILOT_PRE_STAGEOUT, PILOT_POST_STAGEOUT, LOG_TRANSFER_IN_PROGRESS,\ LOG_TRANSFER_DONE, LOG_TRANSFER_NOT_DONE, LOG_TRANSFER_FAILED, SERVER_UPDATE_RUNNING, MAX_KILL_WAIT_TIME, UTILITY_BEFORE_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove +from pilot.util.filehandling import remove, write_file from pilot.util.processes import threads_aborted from pilot.util.queuehandling import declare_failed_by_kill, put_in_queue from pilot.util.timing import add_to_pilot_timing @@ -415,42 +415,39 @@ def stage_out_auto(site, files): return files -def xcache_proxy(output): +def write_output(filename, output): """ - Extract env vars from xcache stdout and set them. + Write command output to file. - :param output: command output (string). + :param filename: file name (string). + :param output: command stdout/stderr (string). :return: """ - # loop over each line in the xcache stdout and identify the needed environmental variables - for line in output.split('\n'): - if 'ALRB_XCACHE_PROXY' in line: - remote = 'REMOTE' in line - name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' - set_xcache_var(line, name=name, pattern=pattern) - elif 'ALRB_XCACHE_MYPROCESS' in line: - set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') - elif 'Messages logged in' in line: - set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + try: + write_file(filename, output, unique=True) + except PilotException as e: + logger.warning('failed to write utility output to file: %s, %s' % (e, output)) + else: + logger.debug('wrote %s' % filename) -def set_xcache_var(line, name='', pattern=''): +def write_utility_output(workdir, step, stdout, stderr): """ - Extract the value of a given environmental variable from a given stdout line. - - :param line: line from stdout to be investigated (string). - :param name: name of env var (string). - :param pattern: regex pattern (string). + Write the utility command output to stdout, stderr files to the job.workdir for the current step. + -> _stdout.txt, _stderr.txt + Example of step: xcache. + + :param workdir: job workdir (string). + :param step: utility step (string). + :param stdout: command stdout (string). + :param stderr: command stderr (string). 
:return: """ - import re - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - os.environ[name] = result[0] + # dump to files + write_output(os.path.join(workdir, step + '_stdout.txt'), stdout) + write_output(os.path.join(workdir, step + '_stderr.txt'), stderr) def copytool_in(queues, traces, args): @@ -480,15 +477,26 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache start] stdout=%s' % stdout) + logger.debug('[before xcache start] stderr=%s' % stderr) + exit_code, stdout, stderr = execute(cmd.get('command')) - logger.debug('exit_code=%d' % exit_code) - logger.debug('stderr=%s' % stderr) logger.debug('stdout=%s' % stdout) - # move code to user area - xcache_proxy(stdout) + logger.debug('stderr=%s' % stderr) + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache start] stdout=%s' % stdout) + logger.debug('[after xcache start] stderr=%s' % stderr) + + # perform any action necessary after command execution (e.g. stdout processing) + kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} + user.post_prestagein_utility_command(**kwargs) - logger.debug('ALRB_XCACHE_PROXY=%s' % os.environ.get('ALRB_XCACHE_PROXY', '')) - logger.debug('ALRB_XCACHE_PROXY_REMOTE=%s' % os.environ.get('ALRB_XCACHE_PROXY_REMOTE', '')) + # write output to log files + write_utility_output(job.workdir, cmd.get('label', 'utility'), stdout, stderr) # place it in the current stage-in queue (used by the jobs' queue monitoring) put_in_queue(job, queues.current_data_in) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 1d699a0f..3c674fd6 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -92,7 +92,7 @@ def utility_before_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed before the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed before the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -114,7 +114,7 @@ def utility_with_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed with the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed with the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -138,7 +138,7 @@ def get_utility_command(self, order=None): cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -231,7 +231,7 @@ def 
utility_after_payload_finished(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_FINISHED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed after the payload has finished: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -285,13 +285,20 @@ def write_utility_output(self, workdir, step, stdout, stderr): elif step == 'postprocess': self.__postprocess_stdout_name = name_stdout self.__postprocess_stderr_name = name_stderr - write_file(os.path.join(workdir, step + '_stdout.txt'), stdout, unique=True) + name = os.path.join(workdir, step + '_stdout.txt') + write_file(name, stdout, unique=True) except PilotException as e: logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) + else: + logger.debug('wrote %s' % name) + try: - write_file(os.path.join(workdir, step + '_stderr.txt'), stderr, unique=True) + name = os.path.join(workdir, step + '_stderr.txt') + write_file(name, stderr, unique=True) except PilotException as e: logger.warning('failed to write utility stderr to file: %s, %s' % (e, stderr)) + else: + logger.debug('wrote %s' % name) def pre_payload(self, job): """ @@ -581,6 +588,11 @@ def run(self): # noqa: C901 # now run the main payload, when it finishes, run the postprocess (if necessary) # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': + + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before payload start] stdout=%s' % stdout) + logger.debug('[before payload start] stderr=%s' % stderr) + proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: proc = None @@ -627,6 +639,10 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after payload finish] stdout=%s' % stdout) + logger.debug('[after payload finish] stderr=%s' % stderr) + # stop the utility command (e.g. 
a coprocess if necessary if proc_co: logger.debug('stopping utility command: %s' % utility_cmd) @@ -673,7 +689,18 @@ def run_utility_after_payload_finished(self): exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') elif cmd_after_payload: logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache') + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % stdout) + logger.debug('[before xcache kill] stderr=%s' % stderr) + + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % stdout) + logger.debug('[after xcache kill] stderr=%s' % stderr) return exit_code diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1c156b71..ee0ecd90 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1882,30 +1882,99 @@ def get_utility_commands(order=None, job=None): if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: if job.preprocess.get('command', ''): com = download_command(job.preprocess, job.workdir) + com['label'] = 'preprocess' elif order == UTILITY_WITH_PAYLOAD: - com = {'command': 'NetworkMonitor', 'args': ''} + com = {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} elif order == UTILITY_AFTER_PAYLOAD_STARTED: cmd = config.Pilot.utility_after_payload_started if cmd: - com = {'command': cmd, 'args': ''} + com = {'command': cmd, 'args': '', 'label': cmd.lower()} elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: if job.coprocess.get('command', ''): com = download_command(job.coprocess, job.workdir) + com['label'] = 'coprocess' elif order == UTILITY_AFTER_PAYLOAD and job.postprocess: if job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) + com['label'] = 'postprocess' elif order == UTILITY_AFTER_PAYLOAD_FINISHED: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) + com['label'] = 'postprocess' if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_deactivation_command(job.workdir) + com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_activation_command(job.jobid) + com['label'] = 'xcache' return com +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + label = kwargs.get('label', 'unknown_label') + stdout = kwargs.get('output', None) + + if stdout: + logger.debug('processing stdout for label=%s' % label) + xcache_proxy(stdout) + else: + logger.warning('no output for label=%s' % label) + + alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') + if alrb_xcache_files: + cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' + exit_code, _stdout, _stderr = execute(cmd, usecontainer=False) + logger.debug('cmd=%s:\n\n%s\n\n' % _stdout) + + +def xcache_proxy(output): + """ + Extract env vars from xcache stdout and set them. + + :param output: command output (string). 
+ :return: + """ + + # loop over each line in the xcache stdout and identify the needed environmental variables + for line in output.split('\n'): + if 'ALRB_XCACHE_PROXY' in line: + remote = 'REMOTE' in line + name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' + pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: + set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + elif 'Messages logged in' in line: + set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + elif 'ALRB_XCACHE_FILES' in line: + set_xcache_var(line, name='ALRB_XCACHE_FILES', pattern=r'\ ALRB_XCACHE_FILES\=(.+)') + + +def set_xcache_var(line, name='', pattern=''): + """ + Extract the value of a given environmental variable from a given stdout line. + + :param line: line from stdout to be investigated (string). + :param name: name of env var (string). + :param pattern: regex pattern (string). + :return: + """ + + pattern = re.compile(pattern) + result = re.findall(pattern, line) + if result: + os.environ[name] = result[0] + + def xcache_activation_command(jobid): """ Return the xcache service activation command. diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 7df04fc8..b21442e1 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -256,3 +256,18 @@ def update_server(job): """ pass + + +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + # label = kwargs.get('label', 'unknown_label') + # stdout = kwargs.get('output', None) + + pass + diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6837fa95..c05c9593 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '12' # build number should be reset to '1' for every new development cycle +BUILD = '13' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 515f623dd7dc44201cb8ed2a8f3326c31008f2be Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:49:39 +0200 Subject: [PATCH 18/96] Dask validation --- pilot/api/dask.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2fe123a9..e611a645 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -10,6 +10,7 @@ #from pilot.common.exception import NotDefined, NotSameLength, UnknownException #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string +from pilot.util.container import execute import logging logger = logging.getLogger(__name__) @@ -32,9 +33,46 @@ def __init__(self, **kwargs): pass - def install(self): + def install(self, block=True): """ """ - pass + # can dask be installed? 
+ if not self._validate(): + logger.warning('validation failed') + self.status = 'failed' + else: + logger.info('dask has been validated') + self.status = 'validated' + + def _validate(self): + """ + Make sure that pre-conditions are met before any installation can be attempted. + + Pre-conditions: required libraries and commands + 1. library: dask + 2. library: dask_kubernetes + 3. command: helm + 4. command: kubectl + """ + + try: + import dask + import dask_kubernetes + except Exception as error: + logger.warning('module not available: %s' % error) + return False + + commands = ['helm', 'kubectl'] + found = False + for cmd in commands: + exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) + found = True if not 'not found' in stdout else False + if not found in stdout: + logger.warning(stdout) + break + if not found: + return False + + return True From d6e8c26966ffb8039aef27790d2d91a053209b60 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:58:57 +0200 Subject: [PATCH 19/96] Removed args object from establish_logging() --- pilot.py | 2 +- pilot/control/job.py | 2 +- pilot/scripts/open_remote_file.py | 2 +- pilot/scripts/stagein.py | 2 +- pilot/scripts/stageout.py | 2 +- pilot/util/filehandling.py | 9 +++++---- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pilot.py b/pilot.py index 9bcff83a..ac40b52b 100755 --- a/pilot.py +++ b/pilot.py @@ -587,7 +587,7 @@ def get_pilot_source_dir(): set_environment_variables(args, mainworkdir) # setup and establish standard logging - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) # execute main function trace = main() diff --git a/pilot/control/job.py b/pilot/control/job.py index 95221dd5..d5d6b41d 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1676,7 +1676,7 @@ def retrieve(queues, traces, args): # noqa: C901 logging.info('pilot has finished for previous job - re-establishing logging') logging.handlers = [] logging.shutdown() - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) pilot_version_banner() getjob_requests = 0 add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 96ab6805..92dec03b 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -99,7 +99,7 @@ def try_open_file(turl): print("remote file open verification not desired") exit(0) - establish_logging(args, filename=logname) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=logname) logger = logging.getLogger(__name__) # get the file info diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index 851ca307..00ea77b7 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -356,7 +356,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageinlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=config.Pilot.stageinlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index f60219d1..2872801b 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -289,7 +289,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageoutlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, 
filename=config.Pilot.stageoutlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 7858b4b6..9ebeac46 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -930,11 +930,12 @@ def dump(path, cmd="cat"): logger.info("path %s does not exist" % path) -def establish_logging(args, filename=config.Pilot.pilotlog): +def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): """ Setup and establish logging. - :param args: pilot arguments object. + :param debug: debug mode (Boolean), + :param nopilotlog: True when pilot log is not known (Boolean). :param filename: name of log file. :return: """ @@ -944,7 +945,7 @@ def establish_logging(args, filename=config.Pilot.pilotlog): _logger.propagate = False console = logging.StreamHandler(sys.stdout) - if args.debug: + if debug: format_str = '%(asctime)s | %(levelname)-8s | %(threadName)-19s | %(name)-32s | %(funcName)-25s | %(message)s' level = logging.DEBUG else: @@ -953,7 +954,7 @@ def establish_logging(args, filename=config.Pilot.pilotlog): #rank, maxrank = get_ranks_info() #if rank is not None: # format_str = 'Rank {0} |'.format(rank) + format_str - if args.nopilotlog: + if nopilotlog: logging.basicConfig(level=level, format=format_str, filemode='w') else: logging.basicConfig(filename=filename, level=level, format=format_str, filemode='w') From 14a86fdbc94d6d9c3efa4af408e6e2dab44d6b7a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:00:41 +0200 Subject: [PATCH 20/96] Added logging to dask --- pilot/api/dask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index e611a645..756596c9 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -11,6 +11,7 @@ #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute +from pilot.util.filehandling import establish_logging import logging logger = logging.getLogger(__name__) @@ -57,6 +58,8 @@ def _validate(self): 4. 
command: kubectl """ + establish_logging(debug=True) + try: import dask import dask_kubernetes From 8669f8d93bcfada2f707256feb3c1c08c7bc5cf0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:06:15 +0200 Subject: [PATCH 21/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 756596c9..1c5f32eb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -72,7 +72,7 @@ def _validate(self): for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) found = True if not 'not found' in stdout else False - if not found in stdout: + if found not in stdout: logger.warning(stdout) break if not found: From 92b1571f2e903571ace6e0b9316fddda5cd8872e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:06:53 +0200 Subject: [PATCH 22/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 1c5f32eb..d517c31f 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -72,7 +72,7 @@ def _validate(self): for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) found = True if not 'not found' in stdout else False - if found not in stdout: + if not found: logger.warning(stdout) break if not found: From df9f0a002ef87e51293b9b8943be322f01e1a533 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:17:39 +0200 Subject: [PATCH 23/96] Added override values --- pilot/api/dask.py | 56 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index d517c31f..4c5196c2 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -11,7 +11,9 @@ #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute -from pilot.util.filehandling import establish_logging +from pilot.util.filehandling import establish_logging, write_file + +import os import logging logger = logging.getLogger(__name__) @@ -24,6 +26,10 @@ class Dask(object): status = None loadbalancerip = None + servicetype = "LoadBalancer" + jupyter = False + overrides = "override_values.yaml" + _workdir = os.getcwd() def __init__(self, **kwargs): """ @@ -32,7 +38,15 @@ def __init__(self, **kwargs): :param kwargs: """ - pass + _servicetype = kwargs.get('servicetype', None) + if _servicetype: + self.servicetype = _servicetype + _jupyter = kwargs.get('jupyter', None) + if _jupyter: + self.jupyter = _jupyter + _overrides = kwargs.get('overrides', None) + if _overrides: + self.overrides = _overrides def install(self, block=True): """ @@ -44,9 +58,17 @@ def install(self, block=True): logger.warning('validation failed') self.status = 'failed' else: - logger.info('dask has been validated') + logger.debug('dask has been validated') self.status = 'validated' + # is the single-dask cluster already running? + cmd = 'kubectl get services' + exit_code, stdout, stderr = execute(cmd, mute=True) + if exit_code: + logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + self.status = 'failed' + else: + def _validate(self): """ Make sure that pre-conditions are met before any installation can be attempted. @@ -56,17 +78,22 @@ def _validate(self): 2. library: dask_kubernetes 3. command: helm 4. command: kubectl + 5. 
copy relevant yaml file(s) """ establish_logging(debug=True) + # import relevant modules try: import dask + logger.debug('dask imported') import dask_kubernetes + logger.debug('dask_kubernetes imported') except Exception as error: logger.warning('module not available: %s' % error) return False + # verify relevant commands commands = ['helm', 'kubectl'] found = False for cmd in commands: @@ -75,7 +102,30 @@ def _validate(self): if not found: logger.warning(stdout) break + else: + logger.debug('%s verified' % cmd) if not found: return False + # create yaml file(s) + self._generate_override_script() + return True + + def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): + """ + + """ + + filename = os.path.join(self._workdir, self.overrides) + if os.path.exists(filename): + logger.info('file \'%s\' already exists - will not override') + return + + script = "" + if not jupyter: + script += 'jupyter:\n\tenabled: false\n\n' + if servicetype: + script += 'scheduler:\n\tserviceType: \"%s\"\n' % servicetype + + write_file(filename, script) From 0d505f3d04e8e563f5411bcc51a55d7f3423a4c9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:18:50 +0200 Subject: [PATCH 24/96] Update --- pilot/api/dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 4c5196c2..2cbeaee1 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -68,6 +68,7 @@ def install(self, block=True): logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) self.status = 'failed' else: + pass def _validate(self): """ From 72bc46db8d13db0253a0d3f74d444fed8f5fe805 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:24:54 +0200 Subject: [PATCH 25/96] Removed tabs --- pilot/api/dask.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2cbeaee1..1565a839 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -68,6 +68,7 @@ def install(self, block=True): logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) self.status = 'failed' else: + # parse output pass def _validate(self): @@ -115,7 +116,11 @@ def _validate(self): def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): """ + Generate a values yaml script, unless it already exists. + :param jupyter: False if jupyter notebook server should be disabled (Boolean). + :param servicetype: name of service type (string). 
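+
+        The generated file is plain YAML, roughly:
+            jupyter:
+              enabled: false
+            scheduler:
+              serviceType: "LoadBalancer"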
+ :return: """ filename = os.path.join(self._workdir, self.overrides) @@ -125,8 +130,10 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): script = "" if not jupyter: - script += 'jupyter:\n\tenabled: false\n\n' + script += 'jupyter:\n enabled: false\n\n' if servicetype: - script += 'scheduler:\n\tserviceType: \"%s\"\n' % servicetype + script += 'scheduler:\n serviceType: \"%s\"\n' % servicetype - write_file(filename, script) + status = write_file(filename, script) + if status: + logger.debug('generated script: %s' % filename) From d13735ef37443030b48d8ae57f8c153973ed732a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:40:53 +0200 Subject: [PATCH 26/96] Parsing of kubectl output --- pilot/api/dask.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 1565a839..0442815a 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -12,6 +12,7 @@ #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file +from pilot.util.parameters import convert_to_int import os @@ -24,6 +25,7 @@ class Dask(object): Dask interface class. """ + servicename = 'single-dask' status = None loadbalancerip = None servicetype = "LoadBalancer" @@ -38,6 +40,9 @@ def __init__(self, **kwargs): :param kwargs: """ + _servicename = kwargs.get('servicename', None) + if _servicename: + self.servicename = _servicename _servicetype = kwargs.get('servicetype', None) if _servicetype: self.servicetype = _servicetype @@ -69,7 +74,8 @@ def install(self, block=True): self.status = 'failed' else: # parse output - pass + dictionary = self._convert_to_dict(stdout) + logger.debug('d=%s' % str(dictionary)) def _validate(self): """ @@ -137,3 +143,35 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): status = write_file(filename, script) if status: logger.debug('generated script: %s' % filename) + + def _convert_to_dict(self, output): + """ + + """ + + dictionary = {} + summary_keys = [] # to keep track of content + header_locked = False + + for line in output.split('\n'): + try: + # Remove empty entries from list (caused by multiple \t) + _l = line.replace('\n', '') + _l = [_f for _f in _l.split('\t') if _f] # Python 3 + + # define dictionary keys + if type(_l[0]) == str and not header_locked: + summary_keys = _l + for key in _l: + dictionary[key] = [] + header_locked = True + else: # sort the memory measurements in the correct columns + for i, key in enumerate(_l): + # for key in _l: + key_entry = summary_keys[i] # e.g. 
Time + value = convert_to_int(key) + dictionary[key_entry].append(value) + except Exception: + logger.warning("unexpected format of utility output: %s" % line) + + return dictionary From c0d7da570d3f9d7ac60655d6ab30cde7de630e79 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:32:44 +0200 Subject: [PATCH 27/96] Checking if service is running --- pilot/api/dask.py | 81 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0442815a..d7b2c01c 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -15,6 +15,7 @@ from pilot.util.parameters import convert_to_int import os +import re import logging logger = logging.getLogger(__name__) @@ -67,15 +68,50 @@ def install(self, block=True): self.status = 'validated' # is the single-dask cluster already running? - cmd = 'kubectl get services' - exit_code, stdout, stderr = execute(cmd, mute=True) - if exit_code: - logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) - self.status = 'failed' - else: - # parse output - dictionary = self._convert_to_dict(stdout) - logger.debug('d=%s' % str(dictionary)) + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is running') + + def is_running(self, name='single-dask-scheduler'): + """ + + """ + + status = False + dictionary = self._get_dictionary(cmd='kubectl get services') + for key in dictionary: + if key == name: + status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False + break + + return status + + def _is_valid_ip(self, ip): + """ + + """ + + regex = "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" + return True if re.search(regex, ip) else False + + def _get_dictionary(self, cmd=None): + """ + + """ + + dictionary = {} + if not cmd: + return dictionary + + exit_code, stdout, stderr = execute(cmd, mute=True) + if exit_code: + logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + self.status = 'failed' + else: + # parse output + dictionary = self._convert_to_dict(stdout) + + return dictionary def _validate(self): """ @@ -153,24 +189,21 @@ def _convert_to_dict(self, output): summary_keys = [] # to keep track of content header_locked = False + dictionary = {} + first_line = [] for line in output.split('\n'): try: # Remove empty entries from list (caused by multiple \t) - _l = line.replace('\n', '') - _l = [_f for _f in _l.split('\t') if _f] # Python 3 - - # define dictionary keys - if type(_l[0]) == str and not header_locked: - summary_keys = _l - for key in _l: - dictionary[key] = [] - header_locked = True - else: # sort the memory measurements in the correct columns - for i, key in enumerate(_l): - # for key in _l: - key_entry = summary_keys[i] # e.g. 
Time - value = convert_to_int(key) - dictionary[key_entry].append(value) + _l = line + _l = [_f for _f in _l.split('\t') if _f] + + if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + first_line = _l[1:] + else: + dictionary[_l[0]] = {} + for i in range(len(_l[1:])): + dictionary[_l[0]][first_line[i]] = _l[1:][i] + except Exception: logger.warning("unexpected format of utility output: %s" % line) From 7480a83dca7eb8f4e7a171ee5bf4f4be223ae3bd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:34:43 +0200 Subject: [PATCH 28/96] Update --- pilot/api/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index d7b2c01c..b17c032e 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -70,7 +70,9 @@ def install(self, block=True): # is the single-dask cluster already running? name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running') + logger.info('service %s is running' % name) + else: + logger.info('service %s is not yet running' % name) def is_running(self, name='single-dask-scheduler'): """ @@ -167,7 +169,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): filename = os.path.join(self._workdir, self.overrides) if os.path.exists(filename): - logger.info('file \'%s\' already exists - will not override') + logger.info('file \'%s\' already exists - will not override' % filename) return script = "" From 02f40e07ac1c3634263afc4d96276cedee2697fa Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:40:16 +0200 Subject: [PATCH 29/96] Update --- pilot/api/dask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index b17c032e..98bde617 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -81,9 +81,12 @@ def is_running(self, name='single-dask-scheduler'): status = False dictionary = self._get_dictionary(cmd='kubectl get services') + logger.debug('d=%s' % str(dictionary)) for key in dictionary: if key == name: + logger.debug('ip:%s' % dictionary[key]['EXTERNAL-IP']) status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False + logger.debug('status=%s' % str(status)) break return status From 172cceb521f01888b4fb777f932c61d42df04358 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:44:25 +0200 Subject: [PATCH 30/96] Update --- pilot/api/dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 98bde617..847e35d7 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -190,13 +190,10 @@ def _convert_to_dict(self, output): """ - dictionary = {} - summary_keys = [] # to keep track of content - header_locked = False - dictionary = {} first_line = [] for line in output.split('\n'): + logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) _l = line @@ -212,4 +209,5 @@ def _convert_to_dict(self, output): except Exception: logger.warning("unexpected format of utility output: %s" % line) + logger.debug('dictionary=%s' % str(dictionary)) return dictionary From f91c677b50f95952bd2764d9fce2388c2e3e197b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:47:12 +0200 Subject: [PATCH 31/96] Update --- pilot/api/dask.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 847e35d7..372a5fdf 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -198,9 
+198,10 @@ def _convert_to_dict(self, output): # Remove empty entries from list (caused by multiple \t) _l = line _l = [_f for _f in _l.split('\t') if _f] - + logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] + logger.debug('first line=%s' % first_line) else: dictionary[_l[0]] = {} for i in range(len(_l[1:])): From 6b92b2191a501eeaa3b091c6f5bcec0dce9215b5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:51:18 +0200 Subject: [PATCH 32/96] Update --- pilot/api/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 372a5fdf..0f544a5c 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -196,8 +196,8 @@ def _convert_to_dict(self, output): logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) - _l = line - _l = [_f for _f in _l.split('\t') if _f] + _l = re.sub(' +', ' ', line) + _l = [_f for _f in _l.split(' ') if _f] logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] From 60a9bef3c78ca57df20b309bdfe098ad58f0de49 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:05:41 +0200 Subject: [PATCH 33/96] Installation and uninstallation of dask --- pilot/api/dask.py | 51 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0f544a5c..3f81af00 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -16,6 +16,7 @@ import os import re +from time import sleep import logging logger = logging.getLogger(__name__) @@ -54,6 +55,16 @@ def __init__(self, **kwargs): if _overrides: self.overrides = _overrides + def uninstall(self): + """ + + """ + + cmd = 'helm uninstall %s' % self.servicename + exit_code, stdout, stderr = execute(cmd, mute=True) + if not exit_code: + logger.info('service %s has been uninstalled' % self.servicename) + def install(self, block=True): """ @@ -70,9 +81,29 @@ def install(self, block=True): # is the single-dask cluster already running? 
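         # the dask/dask Helm chart exposes a '<servicename>-scheduler' service; it is
         # considered running once kubectl reports a valid EXTERNAL-IP for it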
name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running' % name) + logger.info('service %s is already running - nothing to install' % name) else: - logger.info('service %s is not yet running' % name) + logger.info('service %s is not yet running - proceed with installation' % name) + + # + override_option = "-f %s" % self.overrides if self.overrides else "" + cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) + #exit_code, stdout, stderr = execute(cmd, mute=True) + exit_code = 0 + if not exit_code: + logger.info('installation of service %s is in progress' % self.servicename) + + if block: + while True: + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is running' % name) + self.status = 'running' + break + else: + self.status = 'pending' + sleep(2) + # note: in non-blocking mode, status is not getting updated def is_running(self, name='single-dask-scheduler'): """ @@ -81,12 +112,9 @@ def is_running(self, name='single-dask-scheduler'): status = False dictionary = self._get_dictionary(cmd='kubectl get services') - logger.debug('d=%s' % str(dictionary)) for key in dictionary: if key == name: - logger.debug('ip:%s' % dictionary[key]['EXTERNAL-IP']) status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False - logger.debug('status=%s' % str(status)) break return status @@ -181,9 +209,12 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): if servicetype: script += 'scheduler:\n serviceType: \"%s\"\n' % servicetype - status = write_file(filename, script) - if status: - logger.debug('generated script: %s' % filename) + if script: + status = write_file(filename, script) + if status: + logger.debug('generated script: %s' % filename) + else: + self.overrides = None def _convert_to_dict(self, output): """ @@ -193,15 +224,12 @@ def _convert_to_dict(self, output): dictionary = {} first_line = [] for line in output.split('\n'): - logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) _l = re.sub(' +', ' ', line) _l = [_f for _f in _l.split(' ') if _f] - logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] - logger.debug('first line=%s' % first_line) else: dictionary[_l[0]] = {} for i in range(len(_l[1:])): @@ -210,5 +238,4 @@ def _convert_to_dict(self, output): except Exception: logger.warning("unexpected format of utility output: %s" % line) - logger.debug('dictionary=%s' % str(dictionary)) return dictionary From 49e5c0cef2645a18d39dc4022b8a144751cc05a8 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:10:03 +0200 Subject: [PATCH 34/96] Update --- pilot/api/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3f81af00..c3b95c39 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -63,6 +63,7 @@ def uninstall(self): cmd = 'helm uninstall %s' % self.servicename exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: + self.status = 'uninstalled' logger.info('service %s has been uninstalled' % self.servicename) def install(self, block=True): @@ -88,8 +89,7 @@ def install(self, block=True): # override_option = "-f %s" % self.overrides if self.overrides else "" cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) - #exit_code, stdout, stderr = execute(cmd, mute=True) - exit_code = 0 + exit_code, stdout, 
stderr = execute(cmd, mute=True) if not exit_code: logger.info('installation of service %s is in progress' % self.servicename) From 5826863b20f1be6145a766e35aa791ceb9ff8972 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:26:06 +0200 Subject: [PATCH 35/96] Connected cluster --- pilot/api/dask.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index c3b95c39..6d488afe 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -34,6 +34,7 @@ class Dask(object): jupyter = False overrides = "override_values.yaml" _workdir = os.getcwd() + cluster = None def __init__(self, **kwargs): """ @@ -55,16 +56,20 @@ def __init__(self, **kwargs): if _overrides: self.overrides = _overrides - def uninstall(self): + def uninstall(self, block=True): """ """ + logger.info('uninstalling service %s' % self.servicename) + if block: + logger.warning('blocking mode not yet implemented') + cmd = 'helm uninstall %s' % self.servicename exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: self.status = 'uninstalled' - logger.info('service %s has been uninstalled' % self.servicename) + logger.info('uninstall of service %s has been requested' % self.servicename) def install(self, block=True): """ @@ -239,3 +244,29 @@ def _convert_to_dict(self, output): logger.warning("unexpected format of utility output: %s" % line) return dictionary + + def connect_cluster(self): + """ + + """ + + logger.info('connecting to HelmCluster') + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + + def scale(self, number): + """ + + """ + + if number > 2: + logger.warning('too large scale: %d (please use <= 2 for now)' % number) + return + if not self.cluster: + self.connect_cluster() + if not self.cluster: + logger.warning('cluster not connected - cannot proceed') + self.status = 'failed' + return + + logger.info('setting scale to: %d' % number) + self.cluster.scale = number From c8cc5d55a1527035326b9e2a58c0f969879704b7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:32:14 +0200 Subject: [PATCH 36/96] Update --- pilot/api/dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 6d488afe..0709e965 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -251,6 +251,7 @@ def connect_cluster(self): """ logger.info('connecting to HelmCluster') + import dask_kubernetes self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) def scale(self, number): From 8126ef221b8c3ef4e91f096748dd38379e3f5a42 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:33:32 +0200 Subject: [PATCH 37/96] Update --- pilot/api/dask.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0709e965..3174574d 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -251,8 +251,12 @@ def connect_cluster(self): """ logger.info('connecting to HelmCluster') - import dask_kubernetes - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + try: + import dask_kubernetes + except Exception as error: + logger.warning('failed to import dask_kubernetes') + else: + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) def scale(self, number): """ From 6e2b1514061172f9265d520ce73f388a1985c3a1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:34:09 +0200 Subject: [PATCH 38/96] Update --- pilot/api/dask.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3174574d..491af4c4 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -250,13 +250,13 @@ def connect_cluster(self): """ - logger.info('connecting to HelmCluster') try: import dask_kubernetes except Exception as error: logger.warning('failed to import dask_kubernetes') else: self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + logger.info('connected to HelmCluster') def scale(self, number): """ From beed675a6398752445b7185d71f71bf2f740ea05 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:37:38 +0200 Subject: [PATCH 39/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 491af4c4..c62c3a6f 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -274,4 +274,4 @@ def scale(self, number): return logger.info('setting scale to: %d' % number) - self.cluster.scale = number + self.cluster.scale(number) From 7a177e4fe326cf9440c0a83a9c03689dd762f65f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 18:56:04 +0200 Subject: [PATCH 40/96] Fixed bad xcache debugging --- PILOTVERSION | 2 +- pilot/api/dask.py | 3 --- pilot/control/data.py | 12 ++++++------ pilot/control/payloads/generic.py | 24 ++++++++++++------------ pilot/util/constants.py | 2 +- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c51eef03..0bab6e7d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.13 \ No newline at end of file +2.11.3.15 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py index c62c3a6f..3d027847 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -8,11 +8,8 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2021 #from pilot.common.exception import NotDefined, NotSameLength, UnknownException -#from pilot.util.filehandling import get_table_from_file -#from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file -from pilot.util.parameters import convert_to_int import os import re diff --git a/pilot/control/data.py b/pilot/control/data.py index f8e80c94..a754b20d 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -478,18 +478,18 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s' % stdout) - logger.debug('[before xcache start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache start] stdout=%s' % _stdout) + logger.debug('[before xcache start] stderr=%s' % _stderr) exit_code, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s' % stdout) logger.debug('stderr=%s' % stderr) # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s' % stdout) - logger.debug('[after xcache start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args 
--no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache start] stdout=%s' % _stdout) + logger.debug('[after xcache start] stderr=%s' % _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 3c674fd6..c6bf1bf6 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -589,9 +589,9 @@ def run(self): # noqa: C901 # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s' % stdout) - logger.debug('[before payload start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before payload start] stdout=%s' % _stdout) + logger.debug('[before payload start] stderr=%s' % _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -639,9 +639,9 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s' % stdout) - logger.debug('[after payload finish] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after payload finish] stdout=%s' % _stdout) + logger.debug('[after payload finish] stderr=%s' % _stderr) # stop the utility command (e.g. 
a coprocess if necessary if proc_co: @@ -691,16 +691,16 @@ def run_utility_after_payload_finished(self): logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % stdout) - logger.debug('[before xcache kill] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % _stdout) + logger.debug('[before xcache kill] stderr=%s' % _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % stdout) - logger.debug('[after xcache kill] stderr=%s' % stderr) + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % _stdout) + logger.debug('[after xcache kill] stderr=%s' % _stderr) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c05c9593..50ebe53d 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +BUILD = '15' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From cac5620e0a45a8ec6980dd80c6f80d8b08ac90d0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 19:02:26 +0200 Subject: [PATCH 41/96] Part 1 of 2; fix for postprocesses and xcache --- pilot/control/payloads/generic.py | 8 ++++---- pilot/user/atlas/common.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c6bf1bf6..c50fa634 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -652,8 +652,7 @@ def run(self): # noqa: C901 logger.warning('detected unset exit_code from wait_graceful - reset to -1') exit_code = -1 - if state != 'failed': - exit_code = self.run_utility_after_payload_finished() + exit_code = self.run_utility_after_payload_finished(state) self.post_payload(self.__job) @@ -670,10 +669,11 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self): + def run_utility_after_payload_finished(self, state): """ Run utility command after the main payload has finished. + :param state: payload state; finished/failed (string). :return: exit code (int). 
""" @@ -683,7 +683,7 @@ def run_utility_after_payload_finished(self): except Exception as e: logger.error(e) else: - if cmd_after_payload and self.__job.postprocess: + if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index ee0ecd90..7358c5ba 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1901,7 +1901,7 @@ def get_utility_commands(order=None, job=None): if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' - if 'pilotXcache' in job.infosys.queuedata.catchall: + if 'pilotXcache' in job.infosys.queuedata.catchall: # should be UTILITY_AFTER_PAYLOAD_FINISHED2 com = xcache_deactivation_command(job.workdir) com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: From 4964894a14ec4a203057f1e51030e1991ae31161 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:30:34 +0200 Subject: [PATCH 42/96] Update --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0bab6e7d..d6ebd519 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.15 \ No newline at end of file +2.11.3.17 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c50fa634..92db9b85 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -602,7 +602,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished() + exit_code = self.run_utility_after_payload_finished(True) self.post_payload(self.__job) else: break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 50ebe53d..b276f475 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '15' # build number should be reset to '1' for every new development cycle +BUILD = '17' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 3eb851a87d5261c1dd0fa45c17a415d480d920a7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:36:03 +0200 Subject: [PATCH 43/96] Flake8 corrections --- PILOTVERSION | 2 +- pilot/api/dask.py | 24 ++++++++++++------------ pilot/user/generic/common.py | 1 - pilot/util/constants.py | 2 +- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d6ebd519..492fd442 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.17 \ No newline at end of file +2.11.3.18 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3d027847..2f737ac3 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -7,6 +7,12 @@ # 
Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021 +try: + # import dask + import dask_kubernetes +except Exception: + pass + #from pilot.common.exception import NotDefined, NotSameLength, UnknownException from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file @@ -126,7 +132,7 @@ def _is_valid_ip(self, ip): """ - regex = "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" + regex = r"^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" return True if re.search(regex, ip) else False def _get_dictionary(self, cmd=None): @@ -162,22 +168,16 @@ def _validate(self): establish_logging(debug=True) - # import relevant modules - try: - import dask - logger.debug('dask imported') - import dask_kubernetes - logger.debug('dask_kubernetes imported') - except Exception as error: - logger.warning('module not available: %s' % error) - return False + # check imported modules + # dask + # dask_kubernetes # verify relevant commands commands = ['helm', 'kubectl'] found = False for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) - found = True if not 'not found' in stdout else False + found = True if 'not found' not in stdout else False if not found: logger.warning(stdout) break @@ -250,7 +250,7 @@ def connect_cluster(self): try: import dask_kubernetes except Exception as error: - logger.warning('failed to import dask_kubernetes') + logger.warning('failed to import dask_kubernetes: %s' % error) else: self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) logger.info('connected to HelmCluster') diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index b21442e1..069494a8 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -270,4 +270,3 @@ def post_prestagein_utility_command(**kwargs): # stdout = kwargs.get('output', None) pass - diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b276f475..92968e7f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '17' # build number should be reset to '1' for every new development cycle +BUILD = '18' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e0c80b56125e78883b64a090dfcd0b5f273057be Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:37:24 +0200 Subject: [PATCH 44/96] Flake8 corrections --- pilot/api/dask.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2f737ac3..ce3f40fb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -247,13 +247,8 @@ def connect_cluster(self): """ - try: - import dask_kubernetes - except Exception as error: - logger.warning('failed to import dask_kubernetes: %s' % error) - else: - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) - logger.info('connected to HelmCluster') + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + logger.info('connected to HelmCluster') def scale(self, number): """ From 574bdaa6dad5defcc3047cdfe871368df41f84c2 Mon Sep 17 00:00:00 
2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:42:01 +0200 Subject: [PATCH 45/96] Update --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 4 ++-- pilot/util/constants.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 492fd442..678fd135 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.18 \ No newline at end of file +2.11.3.19 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 7358c5ba..d3c06380 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1932,8 +1932,8 @@ def post_prestagein_utility_command(**kwargs): alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') if alrb_xcache_files: cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' - exit_code, _stdout, _stderr = execute(cmd, usecontainer=False) - logger.debug('cmd=%s:\n\n%s\n\n' % _stdout) + exit_code, _stdout, _stderr = execute(cmd) + logger.debug('cmd=%s:\n\n%s\n\n' % (cmd, _stdout)) def xcache_proxy(output): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 92968e7f..be972231 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '18' # build number should be reset to '1' for every new development cycle +BUILD = '19' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From ab1406bbe3a8e88929c41170bc644116a3400a76 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 May 2021 17:28:45 +0200 Subject: [PATCH 46/96] Now allowing for different dask_kubernetes manager --- pilot/api/dask.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index ce3f40fb..2f7a4066 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -94,6 +94,8 @@ def install(self, block=True): else: logger.info('service %s is not yet running - proceed with installation' % name) + # perform helm updates before actual instqllation + cmd = '' # override_option = "-f %s" % self.overrides if self.overrides else "" cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) @@ -242,13 +244,13 @@ def _convert_to_dict(self, output): return dictionary - def connect_cluster(self): + def connect_cluster(self, release_name=self.servicename, manager=dask_kubernetes.HelmCluster): """ """ - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) - logger.info('connected to HelmCluster') + self.cluster = manager(release_name=self.servicename) + logger.info('connected to %s' % manager.__name__) def scale(self, number): """ @@ -267,3 +269,12 @@ def scale(self, number): logger.info('setting scale to: %d' % number) self.cluster.scale(number) + + def shutdown(self): + """ + Shutdown logging. 
+ + """ + + logging.handlers = [] + logging.shutdown() From 49b0775669533214d339ee8156a1260d48623f4c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 May 2021 17:30:39 +0200 Subject: [PATCH 47/96] Update --- pilot/api/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2f7a4066..ad051c00 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -244,12 +244,14 @@ def _convert_to_dict(self, output): return dictionary - def connect_cluster(self, release_name=self.servicename, manager=dask_kubernetes.HelmCluster): + def connect_cluster(self, release_name=None, manager=dask_kubernetes.HelmCluster): """ """ - self.cluster = manager(release_name=self.servicename) + if not release_name: + release_name = self.servicename + self.cluster = manager(release_name=release_name) logger.info('connected to %s' % manager.__name__) def scale(self, number): From 0b2462c2a32f7f61f250164a2911a885ec7570a0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 10:43:58 +0200 Subject: [PATCH 48/96] Updated post-process handling to support multiple post-processes (like xcache + HPO). Added '-b 4' option to xcache start. Now expanding env vars in xml, needed for xcache. --- PILOTVERSION | 2 +- pilot/control/data.py | 2 ++ pilot/control/payloads/generic.py | 18 ++++++++++++------ pilot/user/atlas/common.py | 11 ++++++----- pilot/user/atlas/metadata.py | 1 + pilot/util/constants.py | 9 +++++---- 6 files changed, 27 insertions(+), 16 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 678fd135..e265e54b 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.19 \ No newline at end of file +2.11.3.21 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index a754b20d..5df19f89 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -683,6 +683,7 @@ def get_input_file_dictionary(indata): Return an input file dictionary. Format: {'guid': 'pfn', ..} Normally use_turl would be set to True if direct access is used. + Note: any environment variables in the turls will be expanded :param indata: list of FileSpec objects. :return: file dictionary. 
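For reference, the turl expansion added in the following hunk relies on os.path.expandvars(); a minimal, hypothetical sketch of its effect on an xcache-style turl (the proxy value below is made up for illustration):

    import os

    # hypothetical prefix; in production ALRB_XCACHE_PROXY is exported by 'xcache start'
    os.environ['ALRB_XCACHE_PROXY'] = 'root://localhost:12345//'
    turl = '${ALRB_XCACHE_PROXY}root://some.rse.org:1094//path/DAOD.pool.root.1'
    print(os.path.expandvars(turl))
    # -> root://localhost:12345//root://some.rse.org:1094//path/DAOD.pool.root.1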
@@ -692,6 +693,7 @@ def get_input_file_dictionary(indata): for fspec in indata: ret[fspec.guid] = fspec.turl if fspec.status == 'remote_io' else fspec.lfn + ret[fspec.guid] = os.path.expandvars(ret[fspec.guid]) # correction for ND and mv # in any case use the lfn instead of pfn since there are trf's that have problems with pfn's diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 92db9b85..630639d3 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -23,7 +23,7 @@ from pilot.util.container import execute from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED, \ UTILITY_AFTER_PAYLOAD_FINISHED, PILOT_PRE_SETUP, PILOT_POST_SETUP, PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD, \ - UTILITY_AFTER_PAYLOAD_STARTED2 + UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.filehandling import write_file from pilot.util.processes import kill_processes from pilot.util.timing import add_to_pilot_timing @@ -210,7 +210,7 @@ def utility_after_payload_started_new(self, job): # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job): + def utility_after_payload_finished(self, job, horovod_mode): """ Prepare commands/utilities to run after payload has finished. @@ -219,6 +219,8 @@ def utility_after_payload_finished(self, job): REFACTOR :param job: job object. + :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). + :return: """ cmd = "" @@ -227,8 +229,10 @@ def utility_after_payload_finished(self, job): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + order = UTILITY_AFTER_PAYLOAD_FINISHED if not horovod_mode else UTILITY_AFTER_PAYLOAD_FINISHED2 + # should any additional commands be prepended to the payload execution string? - cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_FINISHED, job=job) + cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) @@ -602,7 +606,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(True) + exit_code = self.run_utility_after_payload_finished(True, horovod_mode=True) self.post_payload(self.__job) else: break @@ -669,17 +673,19 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self, state): + def run_utility_after_payload_finished(self, state, horovod_mode=False): """ Run utility command after the main payload has finished. + In horovod mode, select the corresponding post-process. Otherwise, select different post-process (e.g. Xcache). :param state: payload state; finished/failed (string). + :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). :return: exit code (int). 
""" exit_code = 0 try: - cmd_after_payload = self.utility_after_payload_finished(self.__job) + cmd_after_payload = self.utility_after_payload_finished(self.__job, horovod_mode=horovod_mode) except Exception as e: logger.error(e) else: diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d3c06380..10f11f5d 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -35,7 +35,7 @@ from pilot.util.config import config from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2,\ - UTILITY_BEFORE_STAGEIN + UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum @@ -1898,12 +1898,13 @@ def get_utility_commands(order=None, job=None): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' elif order == UTILITY_AFTER_PAYLOAD_FINISHED: + if 'pilotXcache' in job.infosys.queuedata.catchall: + com = xcache_deactivation_command(job.workdir) + com['label'] = 'xcache_kill' + elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' - if 'pilotXcache' in job.infosys.queuedata.catchall: # should be UTILITY_AFTER_PAYLOAD_FINISHED2 - com = xcache_deactivation_command(job.workdir) - com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_activation_command(job.jobid) @@ -1990,7 +1991,7 @@ def xcache_activation_command(jobid): # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 command = "%s " % get_asetup(asetup=False) # add 'xcache list' which will also kill any orphaned processes lingering in the system - command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g" % jobid + command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid return {'command': command, 'args': ''} diff --git a/pilot/user/atlas/metadata.py b/pilot/user/atlas/metadata.py index e5d45f3b..25f18d66 100644 --- a/pilot/user/atlas/metadata.py +++ b/pilot/user/atlas/metadata.py @@ -21,6 +21,7 @@ def create_input_file_metadata(file_dictionary, workdir, filename="PoolFileCatal """ Create a Pool File Catalog for the files listed in the input dictionary. The function creates properly formatted XML (pretty printed) and writes the XML to file. + Note: any environment variables in the pfn tags will be expanded (see pilot/control/data::get_input_file_dictionary()). 
Format: dictionary = {'guid': 'pfn', ..} diff --git a/pilot/util/constants.py b/pilot/util/constants.py index be972231..f7104532 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 -VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates +VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '19' # build number should be reset to '1' for every new development cycle +BUILD = '21' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -28,8 +28,9 @@ UTILITY_AFTER_PAYLOAD_STARTED2 = 4 UTILITY_AFTER_PAYLOAD = 5 UTILITY_AFTER_PAYLOAD_FINISHED = 6 -UTILITY_BEFORE_STAGEIN = 7 -UTILITY_WITH_STAGEIN = 8 +UTILITY_AFTER_PAYLOAD_FINISHED2 = 7 +UTILITY_BEFORE_STAGEIN = 8 +UTILITY_WITH_STAGEIN = 9 # Timing constants that allow for additional constants to be defined for values before the pilot is started, ie for # wrapper timing purposes. From dd8e5aa0932a14faf6b18436706b7771505954fc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 11:03:24 +0200 Subject: [PATCH 49/96] Updated post-process handling (added label) to support multiple post-processes (like xcache + HPO). --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 43 +++++++++++++++++-------------- pilot/util/constants.py | 2 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e265e54b..f1cab8c7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.21 \ No newline at end of file +2.11.3.22 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 630639d3..8236f24b 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -210,17 +210,17 @@ def utility_after_payload_started_new(self, job): # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job, horovod_mode): + def utility_after_payload_finished(self, job, order): """ Prepare commands/utilities to run after payload has finished. This command will be executed later. - REFACTOR + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 :param job: job object. - :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). - :return: + :param order: constant used for utility selection (constant). + :return: command (string), label (string). """ cmd = "" @@ -229,15 +229,13 @@ def utility_after_payload_finished(self, job, horovod_mode): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - order = UTILITY_AFTER_PAYLOAD_FINISHED if not horovod_mode else UTILITY_AFTER_PAYLOAD_FINISHED2 - # should any additional commands be prepended to the payload execution string? 
cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) - return cmd + return cmd, cmd_dictionary.get('label') def execute_utility_command(self, cmd, job, label): """ @@ -606,7 +604,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(True, horovod_mode=True) + exit_code = self.run_utility_after_payload_finished(True, UTILITY_AFTER_PAYLOAD_FINISHED2) self.post_payload(self.__job) else: break @@ -656,7 +654,8 @@ def run(self): # noqa: C901 logger.warning('detected unset exit_code from wait_graceful - reset to -1') exit_code = -1 - exit_code = self.run_utility_after_payload_finished(state) + for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]: + exit_code = self.run_utility_after_payload_finished(state, order) self.post_payload(self.__job) @@ -673,40 +672,44 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self, state, horovod_mode=False): + def run_utility_after_payload_finished(self, state, order): """ Run utility command after the main payload has finished. In horovod mode, select the corresponding post-process. Otherwise, select different post-process (e.g. Xcache). + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 + :param state: payload state; finished/failed (string). - :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). + :param order: constant used for utility selection (constant). :return: exit code (int). 
""" exit_code = 0 try: - cmd_after_payload = self.utility_after_payload_finished(self.__job, horovod_mode=horovod_mode) + cmd_after_payload, label = self.utility_after_payload_finished(self.__job, order) except Exception as e: logger.error(e) else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) elif cmd_after_payload: logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % _stdout) - logger.debug('[before xcache kill] stderr=%s' % _stderr) + if 'xcache' in cmd_after_payload: + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % _stdout) + logger.debug('[before xcache kill] stderr=%s' % _stderr) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % _stdout) - logger.debug('[after xcache kill] stderr=%s' % _stderr) + if 'xcache' in cmd_after_payload: + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % _stdout) + logger.debug('[after xcache kill] stderr=%s' % _stderr) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f7104532..bdf4f12b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '21' # build number should be reset to '1' for every new development cycle +BUILD = '22' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e8d2fef985e194d02c3c689a6268ed1237bdad44 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:10:03 +0200 Subject: [PATCH 50/96] Refactored get_utility_commands() --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 108 +++++++++++++++++++++++++---------- pilot/user/generic/common.py | 8 +-- pilot/util/constants.py | 11 ++-- 4 files changed, 88 insertions(+), 41 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f1cab8c7..259ef8c5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.22 \ No newline at end of file +2.11.3.23 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 10f11f5d..b8053c41 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -34,8 +34,7 @@ from pilot.util.auxiliary import is_python3 from pilot.util.config import config from 
pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ - UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2,\ - UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 + UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum @@ -1837,6 +1836,8 @@ def download_command(process, workdir): """ Download the pre/postprocess commands if necessary. + Process FORMAT: {'command': , 'args': , 'label': } + :param process: pre/postprocess dictionary. :param workdir: job workdir (string). :return: updated pre/postprocess dictionary. @@ -1870,46 +1871,87 @@ def get_utility_commands(order=None, job=None): should be returned. If order=UTILITY_WITH_STAGEIN, the commands that should be executed parallel with stage-in will be returned. - FORMAT: {'command': , 'args': } + FORMAT: {'command': , 'args': , 'label': } :param order: optional sorting order (see pilot.util.constants). :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ - com = {} - if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: - if job.preprocess.get('command', ''): - com = download_command(job.preprocess, job.workdir) - com['label'] = 'preprocess' + return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') elif order == UTILITY_WITH_PAYLOAD: - com = {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} + return {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} elif order == UTILITY_AFTER_PAYLOAD_STARTED: - cmd = config.Pilot.utility_after_payload_started - if cmd: - com = {'command': cmd, 'args': '', 'label': cmd.lower()} + return get_utility_after_payload_started() elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: - if job.coprocess.get('command', ''): - com = download_command(job.coprocess, job.workdir) - com['label'] = 'coprocess' - elif order == UTILITY_AFTER_PAYLOAD and job.postprocess: - if job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - com['label'] = 'postprocess' + return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') elif order == UTILITY_AFTER_PAYLOAD_FINISHED: - if 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_deactivation_command(job.workdir) - com['label'] = 'xcache_kill' + return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: - if job.postprocess and job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - com['label'] = 'postprocess' + return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') elif order == UTILITY_BEFORE_STAGEIN: - if 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_activation_command(job.jobid) - com['label'] = 'xcache' + return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) + +def get_precopostprocess_command(process, workdir, label): + """ + Return the pre/co/post-process command dictionary. 
+ + Command FORMAT: {'command': , 'args': , 'label': } + + The returned command has the structure: { 'command': , } + :param process: pre/co/post-process (dictionary). + :param workdir: working directory (string). + :param label: label (string). + :return: command (dictionary). + """ + + com = {} + if process.get('command', ''): + com = download_command(process, workdir) + com['label'] = label + return com + + +def get_utility_after_payload_started(): + """ + Return the command dictionary for the utility after the payload has started. + + Command FORMAT: {'command': , 'args': , 'label': } + + :return: command (dictionary). + """ + + com = {} + try: + cmd = config.Pilot.utility_after_payload_started + except Exception: + pass + else: + if cmd: + com = {'command': cmd, 'args': '', 'label': cmd.lower()} + return com + + +def get_xcache_command(catchall, workdir, jobid, label, xcache_function): + """ + Return the proper xcache command for either activation or deactivation. + + Command FORMAT: {'command': , 'args': , 'label': } + + :param catchall: queuedata catchall field (string). + :param workdir: job working directory (string). + :param jobid: PanDA job id (string). + :param label: label (string). + :param xcache_function: activation/deactivation function name (function). + :return: command (dictionary). + """ + + com = {} + if 'pilotXcache' in catchall: + com = xcache_function(jobid=jobid, workdir=workdir) + com['label'] = label return com @@ -1976,10 +2018,13 @@ def set_xcache_var(line, name='', pattern=''): os.environ[name] = result[0] -def xcache_activation_command(jobid): +def xcache_activation_command(workdir='', jobid=''): """ Return the xcache service activation command. + Note: the workdir is not used here, but the function prototype needs it in the caller (xcache_deactivation_command needs it). + + :param workdir: unused work directory - do not remove (string). :param jobid: PanDA job id to guarantee that xcache process is unique (int). :return: xcache command (string). """ @@ -1996,13 +2041,16 @@ def xcache_activation_command(jobid): return {'command': command, 'args': ''} -def xcache_deactivation_command(workdir): +def xcache_deactivation_command(workdir='', jobid=''): """ Return the xcache service deactivation command. This service should be stopped after the payload has finished. Copy the messages log before shutting down. + Note: the job id is not used here, but the function prototype needs it in the caller (xcache_activation_command needs it). + :param workdir: payload work directory (string). + :param jobid: unused job id - do not remove (string). :return: xcache command (string). """ diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 069494a8..51f05632 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -12,7 +12,7 @@ from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD +from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED from pilot.util.filehandling import read_file from .setup import get_analysis_trf @@ -130,7 +130,7 @@ def get_utility_commands(order=None, job=None): If the optional order parameter is set, the function should return the list of corresponding commands. E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the payload.
If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. If order=UTILITY_AFTER_PAYLOAD, the commands that should be executed after the payload has been started + string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started should be returned. FORMAT: {'command': , 'args': } @@ -160,14 +160,14 @@ def get_utility_command_execution_order(name): Should the given utility command be executed before or after the payload? :param name: utility name (string). - :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD) + :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) """ # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD else: - return UTILITY_AFTER_PAYLOAD + return UTILITY_AFTER_PAYLOAD_STARTED def post_utility_command_action(name, job): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index bdf4f12b..8fdf5847 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '22' # build number should be reset to '1' for every new development cycle +BUILD = '23' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -26,11 +26,10 @@ UTILITY_WITH_PAYLOAD = 2 UTILITY_AFTER_PAYLOAD_STARTED = 3 UTILITY_AFTER_PAYLOAD_STARTED2 = 4 -UTILITY_AFTER_PAYLOAD = 5 -UTILITY_AFTER_PAYLOAD_FINISHED = 6 -UTILITY_AFTER_PAYLOAD_FINISHED2 = 7 -UTILITY_BEFORE_STAGEIN = 8 -UTILITY_WITH_STAGEIN = 9 +UTILITY_AFTER_PAYLOAD_FINISHED = 5 +UTILITY_AFTER_PAYLOAD_FINISHED2 = 6 +UTILITY_BEFORE_STAGEIN = 7 +UTILITY_WITH_STAGEIN = 8 # Timing constants that allow for additional constants to be defined for values before the pilot is started, ie for # wrapper timing purposes. 
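The helpers introduced in PATCH 50 above (get_precopostprocess_command(), get_utility_after_payload_started() and get_xcache_command()) all return a dictionary of the form {'command': ..., 'args': ..., 'label': ...}, or an empty dictionary when nothing applies for the given order constant. As a rough illustration of how such a dictionary can be consumed, and not code from this patch series, a caller could look like the sketch below; the run_utility() name and the use of the subprocess module are assumptions made for the example (the pilot itself runs commands through pilot.util.container.execute):

    import subprocess

    def run_utility(com):
        # com is a utility dictionary of the form {'command': .., 'args': .., 'label': ..} (illustrative only)
        if not com:
            return 0  # nothing to run for this order constant
        cmdline = '%s %s' % (com.get('command'), com.get('args', ''))
        print('executing %s utility: %s' % (com.get('label', 'utility'), cmdline))
        return subprocess.call(cmdline, shell=True)  # exit code of the utility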
From 9cc2fac9b56fd642d406ac8b5907fff371e6657c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:18:50 +0200 Subject: [PATCH 51/96] Update --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 259ef8c5..b24b5952 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.23 \ No newline at end of file +2.11.3.24 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index b8053c41..75d75f0f 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1888,7 +1888,7 @@ def get_utility_commands(order=None, job=None): return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') elif order == UTILITY_AFTER_PAYLOAD_FINISHED: return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) - elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: + elif order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') elif order == UTILITY_BEFORE_STAGEIN: return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8fdf5847..76949f59 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '23' # build number should be reset to '1' for every new development cycle +BUILD = '24' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 18a30b7b1f9a0cf7283d02c4cf7a3c57f92199ad Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:25:52 +0200 Subject: [PATCH 52/96] Refactored validate() --- PILOTVERSION | 2 +- pilot/control/job.py | 51 ++++++++++++++++++++++++++--------------- pilot/util/constants.py | 2 +- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index b24b5952..72a0e522 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.24 \ No newline at end of file +2.11.3.25 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index d5d6b41d..aac2d848 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -833,11 +833,11 @@ def get_payload_log_tail(job): def validate(queues, traces, args): """ - (add description) + Perform validation of job. - :param queues: - :param traces: - :param args: + :param queues: queues object. + :param traces: traces object. + :param args: args object. 
:return: """ @@ -904,21 +904,7 @@ def validate(queues, traces, args): store_jobid(job.jobid, args.sourcedir) # run the delayed space check now - proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False - if proceed_with_local_space_check: - logger.debug('pilot will now perform delayed space check') - ec, diagnostics = check_local_space() - if ec != 0: - traces.pilot['error_code'] = errors.NOLOCALSPACE - # set the corresponding error code - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) - put_in_queue(job, queues.failed_jobs) - else: - put_in_queue(job, queues.validated_jobs) - else: - put_in_queue(job, queues.validated_jobs) - + delayed_space_check(queues, traces, args, job) else: logger.debug('Failed to validate job=%s' % job.jobid) put_in_queue(job, queues.failed_jobs) @@ -933,6 +919,33 @@ def validate(queues, traces, args): logger.debug('[job] validate thread has finished') +def delayed_space_check(queues, traces, args, job): + """ + Run the delayed space check if necessary. + + :param queues: queues object. + :param traces: traces object. + :param args: args object. + :param job: job object. + :return: + """ + + proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False + if proceed_with_local_space_check: + logger.debug('pilot will now perform delayed space check') + ec, diagnostics = check_local_space() + if ec != 0: + traces.pilot['error_code'] = errors.NOLOCALSPACE + # set the corresponding error code + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) + logger.debug('Failed to validate job=%s' % job.jobid) + put_in_queue(job, queues.failed_jobs) + else: + put_in_queue(job, queues.validated_jobs) + else: + put_in_queue(job, queues.validated_jobs) + + def create_k8_link(job_dir): """ Create a soft link to the payload workdir on Kubernetes if SHARED_DIR exists. 
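The delayed_space_check() helper extracted above hands the actual check to check_local_space() from pilot.util.monitoring, which returns an (exit code, diagnostics) pair; a non-zero exit code sends the job to the failed_jobs queue with the NOLOCALSPACE error, otherwise the job proceeds to validated_jobs. The stand-alone sketch below only illustrates that return contract; the shutil-based implementation and the 2 GB threshold are assumptions for the example and not the pilot's actual space check:

    import shutil

    def minimal_local_space_check(path='.', limit_mb=2048):
        # illustrative stand-in for check_local_space(): (0, '') if enough space, else (1, diagnostics)
        free_mb = shutil.disk_usage(path).free // (1024 * 1024)
        if free_mb < limit_mb:
            return 1, 'too little local space in %s: %d MB free (limit %d MB)' % (path, free_mb, limit_mb)
        return 0, ''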
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 76949f59..6199ffca 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '24' # build number should be reset to '1' for every new development cycle +BUILD = '25' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From c7bcc4ccd67427f6a871bff86e780eecece45c1a Mon Sep 17 00:00:00 2001 From: Shuwei Ye Date: Thu, 27 May 2021 07:41:32 -0400 Subject: [PATCH 53/96] Lowercased some variable names in gs.py to comply with flake8 --- pilot/copytool/gs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 03be7e77..68e50b5c 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -154,8 +154,8 @@ def copy_out(files, **kwargs): import re # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) - reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) - (bucket, remote_path) = reObj.groups() + reobj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remote_path) = reobj.groups() # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: for logfile in os.listdir(workdir): @@ -208,9 +208,9 @@ def upload_file(file_name, bucket, object_name=None): blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): - url_pilotLog = blob.public_url - os.environ['GTAG'] = url_pilotLog - logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) + url_pilotlog = blob.public_url + os.environ['GTAG'] = url_pilotlog + logger.debug("Set envvar GTAG with the pilot log URL=%s" % url_pilotlog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From cb8d6fbdf328996ea86b3e7acfe868e373140326 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 14:43:36 +0200 Subject: [PATCH 54/96] Added error code for no ctypes.
Now using ctypes to guarantee orphans actually having parent processes --- PILOTVERSION | 2 +- pilot/common/errorcodes.py | 4 +++- pilot/control/job.py | 35 +++++++++++++++++++++++++++++++++-- pilot/util/constants.py | 2 +- pilot/util/processes.py | 3 ++- 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 72a0e522..7963461e 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.25 \ No newline at end of file +2.11.3.26 \ No newline at end of file diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 08aa02b7..49233a76 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -147,6 +147,7 @@ class ErrorCodes: XRDCPERROR = 1362 KILLPAYLOAD = 1363 # note, not a failure but a kill instruction from Raythena MISSINGCREDENTIALS = 1364 + NOCTYPES = 1365 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -272,7 +273,8 @@ class ErrorCodes: REMOTEFILECOULDNOTBEOPENED: "Remote file could not be opened", XRDCPERROR: "Xrdcp was unable to open file", KILLPAYLOAD: "Raythena has decided to kill payload", - MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer" + MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer", + NOCTYPES: "Python module ctypes not available on worker node" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] diff --git a/pilot/control/job.py b/pilot/control/job.py index aac2d848..820292df 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -893,8 +893,7 @@ def validate(queues, traces, args): # pre-cleanup pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], - 0) # Python 2/3 + utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: utilities.precleanup() except Exception as e: @@ -905,6 +904,9 @@ def validate(queues, traces, args): # run the delayed space check now delayed_space_check(queues, traces, args, job) + + # make sure that ctypes is available (needed at the end by orphan killer) + verify_ctypes(queues, job) else: logger.debug('Failed to validate job=%s' % job.jobid) put_in_queue(job, queues.failed_jobs) @@ -919,6 +921,35 @@ def validate(queues, traces, args): logger.debug('[job] validate thread has finished') +def verify_ctypes(queues, job): + """ + Verify ctypes and make sure all subprocess are parented. + + :param queues: queues object. + :param job: job object. + :return: + """ + + try: + import ctypes + except Exception as e: + diagnostics = 'ctypes python module could not be imported: %s' % e + logger.warning(diagnostics) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) + logger.debug('Failed to validate job=%s' % job.jobid) + put_in_queue(job, queues.failed_jobs) + else: + logger.debug('ctypes python module imported') + + # make sure all children are parented by the pilot + # specifically, this will include any 'orphans', i.e. if the pilot kills all subprocesses at the end, + # 'orphans' will be included (orphans seem like the wrong name) + libc = ctypes.CDLL('libc.so.6') + PR_SET_CHILD_SUBREAPER = 36 + libc.prctl(PR_SET_CHILD_SUBREAPER, 1) + logger.debug('all child subprocesses will be parented') + + def delayed_space_check(queues, traces, args, job): """ Run the delayed space check if necessary. 
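The verify_ctypes() function added above uses prctl(PR_SET_CHILD_SUBREAPER) via ctypes so that any processes orphaned by the payload are reparented to the pilot process rather than to init, which is what later allows the pilot to find and kill them. A slightly more defensive stand-alone sketch of the same call is given below; the errno handling is an addition for illustration only and not part of the patch (the call is Linux-specific, kernel 3.4 or later):

    import ctypes

    PR_SET_CHILD_SUBREAPER = 36  # value taken from linux/prctl.h

    def become_child_subreaper():
        # mark the current process as a child subreaper so orphaned descendants are reparented to it
        libc = ctypes.CDLL('libc.so.6', use_errno=True)
        if libc.prctl(PR_SET_CHILD_SUBREAPER, 1) != 0:
            raise OSError(ctypes.get_errno(), 'prctl(PR_SET_CHILD_SUBREAPER) failed')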
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6199ffca..7339925b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '25' # build number should be reset to '1' for every new development cycle +BUILD = '26' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8abdbb64..8f88b3fa 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -177,7 +177,8 @@ def kill_processes(pid): kill_process(i) # kill any remaining orphan processes - kill_orphans() + # note: this should no longer be necessary since ctypes has made sure all subprocesses are parented + # kill_orphans() def kill_child_processes(pid): From 2cf2b5a710996836908057465bab9934a15778bd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 20:45:05 +0200 Subject: [PATCH 55/96] Implemented tail and ls debug commands --- PILOTVERSION | 2 +- pilot/control/job.py | 122 ++++++++++++++++++++++++++++++++++++---- pilot/util/constants.py | 2 +- pilot/util/processes.py | 3 +- 4 files changed, 114 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7963461e..6d74a3a7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.26 \ No newline at end of file +2.11.3.27c \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 820292df..ec76fbc8 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -25,6 +25,7 @@ from json import dumps #, loads from re import findall +from glob import glob from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ExcThread, PilotException #, JobAlreadyRunning @@ -495,16 +496,18 @@ def get_debug_command(cmd): try: tmp = cmd.split(' ') com = tmp[0] - opts = tmp[1] except Exception as e: logger.warning('failed to identify debug command: %s' % e) else: if com not in allowed_commands: logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) - elif ';' in opts or ';' in opts: + elif ';' in cmd or ';' in cmd: logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) elif com in forbidden_commands: logger.warning('command=%s is not allowed' % com) + else: + debug_mode = True + debug_command = cmd return debug_mode, debug_command @@ -527,7 +530,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): # warning: server might return comma-separated string, 'debug,tobekilled' cmd = res.get('command') # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. 
- if ' ' in cmd: + if ' ' in cmd and 'tobekilled' not in cmd: try: job.debug, job.debug_command = get_debug_command(cmd) except Exception as e: @@ -547,16 +550,25 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.info('pilot received a panda server signal to softkill job %s at %s' % (job.jobid, time_stamp())) # event service kill instruction + job.debug_command = 'softkill' elif 'debug' in cmd: logger.info('pilot received a command to turn on standard debug mode from the server') job.debug = True + job.debug_command = 'debug' elif 'debugoff' in cmd: logger.info('pilot received a command to turn off debug mode from the server') job.debug = False + job.debug_command = 'debugoff' else: logger.warning('received unknown server command via backchannel: %s' % cmd) + job.debug = True + # job.debug_command = 'tail payload.stdout' # OK + # job.debug_command = 'ls -ltr workDir' # test with user job + job.debug_command = 'ls -ltr %s' % job.workdir + + def add_data_structure_ids(data, version_tag): """ Add pilot, batch and scheduler ids to the data structure for getJob, updateJob. @@ -622,9 +634,9 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: - stdout_tail = get_payload_log_tail(job) - if stdout_tail: - data['stdout'] = stdout_tail + stdout = get_debug_stdout(job.debug_command, job.workdir) + if stdout: + data['stdout'] = stdout # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': @@ -665,6 +677,82 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data +def get_debug_stdout(debug_command, workdir): + """ + Return the requested output from a given debug command. + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). + """ + + if debug_command == 'debug': + return get_payload_log_tail(workdir) + elif 'tail' in debug_command: + return get_requested_log_tail(debug_command, workdir) + elif 'ls ' in debug_command: + return get_ls(debug_command, workdir) + else: + logger.warning('command not handled yet: %s' % debug_command) + return '' + + +def get_ls(debug_command, workdir): + """ + + """ + + items = debug_command.split(' ') + # cmd = items[0] + options = ' '.join(items[1:]) + path = options.split(' ')[-1] if ' ' in options else options + finalpath = os.path.join(workdir, path) + debug_command = debug_command.replace(path, finalpath) + + ec, stdout, stderr = execute(debug_command) + logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + + return stdout + + +def get_requested_log_tail(debug_command, workdir): + """ + Return the tail of the requested log. + + Examples + tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path + tail log.RAWtoALL <- pilot finds the requested log file + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). 
+ """ + + _tail = "" + items = debug_command.split(' ') + cmd = items[0] + options = ' '.join(items[1:]) + logger.debug('debug command: %s' % cmd) + logger.debug('debug options: %s' % options) + + # assume that the path is the last of the options; + path = options.split(' ')[-1] if ' ' in options else options + fullpath = os.path.join(workdir, path) + + # find all files with the given pattern and pick the latest updated file (if several) + files = glob(fullpath) + if files: + logger.info('files found: %s' % str(files)) + _tail = get_latest_log_tail(files) + else: + logger.warning('did not find \'%s\' in path %s' % (path, fullpath)) + + if _tail: + logger.debug('tail =\n\n%s\n\n' % _tail) + + return _tail + + def add_error_codes(data, job): """ Add error codes to data structure. @@ -798,16 +886,14 @@ def remove_pilot_logs_from_list(list_of_files): return new_list_of_files -def get_payload_log_tail(job): +def get_payload_log_tail(workdir): """ Return the tail of the payload stdout or its latest updated log file. - :param job: job object. + :param workdir: job work directory (string). :return: tail of stdout (string). """ - stdout_tail = "" - # find the latest updated log file # list_of_files = get_list_of_log_files() # find the latest updated text file @@ -816,10 +902,22 @@ def get_payload_log_tail(job): if not list_of_files: logger.info('no log files were found (will use default %s)' % config.Payload.payloadstdout) - list_of_files = [os.path.join(job.workdir, config.Payload.payloadstdout)] + list_of_files = [os.path.join(workdir, config.Payload.payloadstdout)] + + return get_latest_log_tail(list_of_files) + + +def get_latest_log_tail(files): + """ + Get the tail of the latest updated file from the given file list. + + :param files: files (list). + """ + + stdout_tail = "" try: - latest_file = max(list_of_files, key=os.path.getmtime) + latest_file = max(files, key=os.path.getmtime) logger.info('tail of file %s will be added to heartbeat' % latest_file) # now get the tail of the found log file and protect against potentially large tails diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 7339925b..3242c710 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '26' # build number should be reset to '1' for every new development cycle +BUILD = '27c' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8f88b3fa..213ac5d6 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -178,7 +178,8 @@ def kill_processes(pid): # kill any remaining orphan processes # note: this should no longer be necessary since ctypes has made sure all subprocesses are parented - # kill_orphans() + # if orphan process killing is not desired, set env var PILOT_NOKILL + kill_orphans() def kill_child_processes(pid): From 405fe95ad2151137ef45693b3734e2dd82a2bb56 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 28 May 2021 17:37:52 +0200 Subject: [PATCH 56/96] Skipping xrootd when finding pid for prmon --- PILOTVERSION | 2 +- pilot/control/job.py | 28 +++++++++++++++++++++++++--- pilot/user/atlas/common.py | 2 +- pilot/user/atlas/utilities.py | 2 +- 
pilot/util/constants.py | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6d74a3a7..98c67543 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.27c \ No newline at end of file +2.11.3.27g \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index ec76fbc8..42c2e8b0 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -565,8 +565,10 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): job.debug = True # job.debug_command = 'tail payload.stdout' # OK - # job.debug_command = 'ls -ltr workDir' # test with user job - job.debug_command = 'ls -ltr %s' % job.workdir + # job.debug_command = 'ls -ltr workDir' # test with user jo + # job.debug_command = 'ls -ltr %s' % job.workdir + # 'ps -ef' + job.debug_command = 'ps axo pgid,ppid,comm,args' def add_data_structure_ids(data, version_tag): @@ -692,14 +694,34 @@ def get_debug_stdout(debug_command, workdir): return get_requested_log_tail(debug_command, workdir) elif 'ls ' in debug_command: return get_ls(debug_command, workdir) + elif 'ps ' in debug_command or 'gdb ' in debug_command: + return get_general_command_stdout(debug_command) else: logger.warning('command not handled yet: %s' % debug_command) return '' +def get_general_command_stdout(debug_command): + """ + Return the output from the requested debug command. + + :param debug_command: full debug command (string). + :return: output (string). + """ + + ec, stdout, stderr = execute(debug_command) + logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + + return stdout + + def get_ls(debug_command, workdir): """ + Return the requested ls debug command. + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). """ items = debug_command.split(' ') @@ -717,7 +739,7 @@ def get_ls(debug_command, workdir): def get_requested_log_tail(debug_command, workdir): """ - Return the tail of the requested log. + Return the tail of the requested debug log. 
Examples tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 75d75f0f..634f0305 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -191,7 +191,7 @@ def open_remote_files(indata, workdir): not_opened += turl if not not_opened else ",%s" % turl if not_opened: ec = errors.REMOTEFILECOULDNOTBEOPENED - diagnostics = "turl not opened:%s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened + diagnostics = "Remote file could not be opened: %s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index a8264668..f04a50e8 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -286,7 +286,7 @@ def get_pid_for_jobid(ps, jobid): pid = None for line in ps.split('\n'): - if jobid in line: + if jobid in line and 'xrootd' not in line: # extract pid _pid = search(r'(\d+) ', line) try: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3242c710..1891c015 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27c' # build number should be reset to '1' for every new development cycle +BUILD = '27g' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From fa2debc10efe22c4ecd9cfe3709a74edc51cbed4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 4 Jun 2021 08:25:31 +0200 Subject: [PATCH 57/96] Many fixes for debug mode, including full containerisation of gdb command (not finished). Fixed resimevents. --- PILOTVERSION | 2 +- pilot.py | 59 +------------ pilot/control/data.py | 10 ++- pilot/control/job.py | 129 +++++++++++++++++++++------- pilot/info/jobdata.py | 2 +- pilot/user/atlas/common.py | 92 ++++++++++++++++++-- pilot/user/atlas/container.py | 27 ++++-- pilot/user/generic/common.py | 14 ++++ pilot/util/constants.py | 2 +- pilot/util/default.cfg | 2 +- pilot/util/filehandling.py | 21 +++++ pilot/util/middleware.py | 47 ++++++++++- pilot/util/processes.py | 153 ++++++++++++++++++++++++++++++++++ 13 files changed, 451 insertions(+), 109 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 98c67543..bceaa76a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.27g \ No newline at end of file +2.11.3.28f \ No newline at end of file diff --git a/pilot.py b/pilot.py index ac40b52b..eee04af1 100755 --- a/pilot.py +++ b/pilot.py @@ -101,62 +101,6 @@ class Args: pass -# rename module to pilot2 to avoid conflict in import with pilot directory -def import_module(**kwargs): - """ - This function allows for importing the pilot code. - - :param kwargs: pilot options (dictionary). - :return: pilot error code (integer). 
- """ - - argument_dictionary = {'-a': kwargs.get('workdir', ''), - '-d': kwargs.get('debug', None), - '-w': kwargs.get('workflow', 'generic'), - '-l': kwargs.get('lifetime', '3600'), - '-q': kwargs.get('queue'), # required - '-r': kwargs.get('resource'), # required - '-s': kwargs.get('site'), # required - '-j': kwargs.get('job_label', 'ptest'), # change default later to 'managed' - '-i': kwargs.get('version_tag', 'PR'), - '-t': kwargs.get('verify_proxy', True), - '-z': kwargs.get('update_server', True), - '--cacert': kwargs.get('cacert', None), - '--capath': kwargs.get('capath'), - '--url': kwargs.get('url', ''), - '-p': kwargs.get('port', '25443'), - '--country-group': kwargs.get('country_group', ''), - '--working-group': kwargs.get('working_group', ''), - '--allow-other-country': kwargs.get('allow_other_country', 'False'), - '--allow-same-user': kwargs.get('allow_same_user', 'True'), - '--pilot-user': kwargs.get('pilot_user', 'generic'), - '--input-dir': kwargs.get('input_dir', ''), - '--output-dir': kwargs.get('output_dir', ''), - '--hpc-resource': kwargs.get('hpc_resource', ''), - '--harvester-workdir': kwargs.get('harvester_workdir', ''), - '--harvester-datadir': kwargs.get('harvester_datadir', ''), - '--harvester-eventstatusdump': kwargs.get('harvester_eventstatusdump', ''), - '--harvester-workerattributes': kwargs.get('harvester_workerattributes', ''), - '--harvester-submitmode': kwargs.get('harvester_submitmode', ''), - '--resource-type': kwargs.get('resource_type', '') - } - - args = Args() - parser = argparse.ArgumentParser() - try: - _items = list(argument_dictionary.items()) # Python 3 - except Exception: - _items = argument_dictionary.iteritems() # Python 2 - for key, value in _items: - print(key, value) - parser.add_argument(key) - parser.parse_args(args=[key, value], namespace=args) # convert back int and bool strings to int and bool?? - - # call main pilot function - - return 0 - - def str2bool(v): """ Helper function to convert string to bool """ @@ -478,6 +422,9 @@ def set_environment_variables(args, mainworkdir): # event service executor type environ['PILOT_ES_EXECUTOR_TYPE'] = args.executor_type + if args.output_dir: + environ['PILOT_OUTPUT_DIR'] = args.output_dir + # keep track of the server urls _port = ":%s" % args.port url = args.url if _port in args.url else args.url + _port diff --git a/pilot/control/data.py b/pilot/control/data.py index 5df19f89..83c731ee 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -721,7 +721,7 @@ def filter_files_for_log(directory): return filtered_files -def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False): +def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False, debugmode=False): """ Create the tarball for the job. @@ -732,11 +732,13 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out :param input_files: list of input files to remove (list). :param output_files: list of output files to remove (list). :param is_looping: True for looping jobs, False by default (Boolean). + :param debugmode: True if debug mode has been switched on (Boolean). :raises LogFileCreationFailure: in case of log file creation problem. 
:return: """ - logger.debug('preparing to create log file') + logger.debug('preparing to create log file (debug mode=%s)' % str(debugmode)) + # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) current_dir = os.getcwd() @@ -747,7 +749,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out if cleanup: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - user.remove_redundant_files(workdir, islooping=is_looping) + user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode) # remove any present input/output files before tarring up workdir for f in input_files + output_files: @@ -894,7 +896,7 @@ def _stage_out_new(job, args): output_files = [fspec.lfn for fspec in job.outdata] create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup, input_files=input_files, output_files=output_files, - is_looping=errors.LOOPINGJOB in job.piloterrorcodes) + is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug) except LogFileCreationFailure as e: logger.warning('failed to create tar file: %s' % e) set_pilot_state(job=job, state="failed") diff --git a/pilot/control/job.py b/pilot/control/job.py index 42c2e8b0..c311d3c4 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -41,15 +41,16 @@ SERVER_UPDATE_UPDATING, SERVER_UPDATE_NOT_DONE from pilot.util.container import execute from pilot.util.filehandling import find_text_files, tail, is_json, copy, remove, write_json, establish_logging, write_file, \ - create_symlink + create_symlink, locate_file from pilot.util.harvester import request_new_jobs, remove_job_request_file, parse_job_definition_file, \ is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, get_event_status_file, \ publish_stageout_files from pilot.util.jobmetrics import get_job_metrics from pilot.util.math import mean +from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process +from pilot.util.processes import cleanup, threads_aborted, kill_process, get_pid_from_command, kill_processes from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp @@ -562,13 +563,14 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): else: logger.warning('received unknown server command via backchannel: %s' % cmd) - - job.debug = True - # job.debug_command = 'tail payload.stdout' # OK - # job.debug_command = 'ls -ltr workDir' # test with user jo + # for testing debug mode + #job.debug = True + # job.debug_command = 'tail payload.stdout' + # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir - # 'ps -ef' - job.debug_command = 'ps axo pgid,ppid,comm,args' + # job.debug_command = 'ps -ef' + # job.debug_command = 'ps axo pid,ppid,pgid,args' + #job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' def add_data_structure_ids(data, version_tag): @@ -636,10 +638,23 @@ def get_data_structure(job, state, args, 
xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: - stdout = get_debug_stdout(job.debug_command, job.workdir) + # for gdb commands, use the proper gdb version (the system one may be too old) + #if 'gdb ' in job.debug_command: + # pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + # user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + # user.preprocess_debug_command(job) + + stdout = get_debug_stdout(job) if stdout: data['stdout'] = stdout + # in case gdb was successfully used, the payload can now be killed + if 'gdb ' in job.debug_command and job.pid: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, + msg='payload was killed after gdb produced requested core file') + logger.debug('will proceed to kill payload processes') + kill_processes(job.pid) + # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': data['coreCount'] = job.corecount @@ -679,42 +694,91 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data -def get_debug_stdout(debug_command, workdir): +def get_debug_stdout(job): """ Return the requested output from a given debug command. - :param debug_command: full debug command (string). - :param workdir: job work directory (string). + :param job: job object. :return: output (string). """ - if debug_command == 'debug': - return get_payload_log_tail(workdir) - elif 'tail' in debug_command: - return get_requested_log_tail(debug_command, workdir) - elif 'ls ' in debug_command: - return get_ls(debug_command, workdir) - elif 'ps ' in debug_command or 'gdb ' in debug_command: - return get_general_command_stdout(debug_command) + if job.debug_command == 'debug': + return get_payload_log_tail(job.workdir) + elif 'tail' in job.debug_command: + return get_requested_log_tail(job.debug_command, job.workdir) + elif 'ls ' in job.debug_command: + return get_ls(job.debug_command, job.workdir) + elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: + return get_general_command_stdout(job) else: - logger.warning('command not handled yet: %s' % debug_command) + logger.warning('command not handled yet: %s' % job.debug_command) return '' -def get_general_command_stdout(debug_command): +def get_general_command_stdout(job): """ Return the output from the requested debug command. - :param debug_command: full debug command (string). + :param job: job object. :return: output (string). """ - ec, stdout, stderr = execute(debug_command) - logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + stdout = '' + + # for gdb, we might have to process the debug command (e.g. 
to identify the proper pid to debug) + if 'gdb ' in job.debug_command and '--pid %' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + job.debug_command = user.process_debug_command(job.debug_command, job.jobid) + + if job.debug_command: + if 'gdb ' in job.debug_command: + logger.info('gdb execution will be done by a script') + try: + containerise_general_command(job, job.infosys.queuedata.container_options, + label='general', + container_type='container') + except PilotException as e: + logger.warning('general containerisation threw a pilot exception: %s' % e) + except Exception as e: + logger.warning('general containerisation threw an exception: %s' % e) + + # in case a core file was produced, locate it + path = locate_core_file(job.debug_command) + if path: + # copy it to the working directory (so it will be saved in the log) + try: + copy(path, job.workdir) + except Exception: + pass + else: + ec, stdout, stderr = execute(job.debug_command) + logger.debug("%s:\n\n%s\n\n" % (job.debug_command, stdout)) return stdout +def locate_core_file(debug_command): + """ + + """ + + path = None + pid = get_pid_from_command(debug_command) + if pid: + filename = 'core.%d' % pid + path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) + if os.path.exists(path): + logger.debug('found core file at: %s' % path) + + else: + logger.debug('did not find %s in %s' % (filename, path)) + else: + logger.warning('cannot locate core file since pid could not be extracted from debug command') + + return path + + def get_ls(debug_command, workdir): """ Return the requested ls debug command. @@ -728,6 +792,8 @@ def get_ls(debug_command, workdir): # cmd = items[0] options = ' '.join(items[1:]) path = options.split(' ')[-1] if ' ' in options else options + if path.startswith('-'): + path = '.' finalpath = os.path.join(workdir, path) debug_command = debug_command.replace(path, finalpath) @@ -2385,10 +2451,15 @@ def job_monitor(queues, traces, args): # noqa: C901 update_time = send_heartbeat_if_time(jobs[i], args, update_time) # note: when sending a state change to the server, the server might respond with 'tobekilled' - if jobs[i].state == 'failed': - logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') - jobs[i].stageout = 'log' # only stage-out log file - put_in_queue(jobs[i], queues.data_out) + try: + jobs[i] + except Exception as e: + logger.warning('detected stale jobs[i] object in job_monitor: %s' % e) + else: + if jobs[i].state == 'failed': + logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') + jobs[i].stageout = 'log' # only stage-out log file + put_in_queue(jobs[i], queues.data_out) # sleep for a while if stage-in has not completed time.sleep(1) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index f3ec0cb7..a39ba3ab 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -89,7 +89,7 @@ class JobData(BaseData): neventsw = 0 # number of events written dbtime = None # dbdata = None # - resimevents = 0 # ReSim events from job report (ATLAS) + resimevents = None # ReSim events from job report (ATLAS) payload = "" # payload name utilities = {} # utility processes { : [, number of launches, command string], .. 
} pid = None # payload pid diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 634f0305..dd77a154 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -38,6 +38,7 @@ from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum +from pilot.util.processes import convert_ps_to_dict, find_cmd_pids, get_trimmed_dictionary, find_pid, is_child from pilot.util.tracereport import TraceReport import logging @@ -1553,22 +1554,24 @@ def cleanup_looping_payload(workdir): remove(path) -def cleanup_payload(workdir, outputfiles=[]): +def cleanup_payload(workdir, outputfiles=[], removecores=True): """ Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. Also remove core dumps. - :param workdir: working directory (string) - :param outputfiles: list of output files + :param workdir: working directory (string). + :param outputfiles: list of output files. + :param removecores: remove core files if True (Boolean). :return: """ - remove_core_dumps(workdir) + if removecores: + remove_core_dumps(workdir) for ampdir in glob('%s/athenaMP-workers-*' % workdir): for (p, d, f) in os.walk(ampdir): for filename in f: - if 'core' in filename or 'pool.root' in filename or 'tmp.' in filename: + if ('core' in filename and removecores) or 'pool.root' in filename or 'tmp.' in filename: path = os.path.join(p, filename) path = os.path.abspath(path) remove(path) @@ -1775,13 +1778,16 @@ def remove_special_files(workdir, dir_list, outputfiles): remove_dir_tree(f) -def remove_redundant_files(workdir, outputfiles=[], islooping=False): +def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=False): """ Remove redundant files and directories prior to creating the log file. + Note: in debug mode, any core files should not be removed before creating the log. + :param workdir: working directory (string). :param outputfiles: list of protected output files (list). :param islooping: looping job variable to make sure workDir is not removed in case of looping (boolean). + :param debugmode: True if debug mode has been switched on (Boolean). :return: """ @@ -1796,7 +1802,7 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False): # remove core and pool.root files from AthenaMP sub directories try: logger.debug('cleaning up payload') - cleanup_payload(workdir, outputfiles) + cleanup_payload(workdir, outputfiles, removecores=not debugmode) except Exception as e: logger.warning("failed to execute cleanup_payload(): %s" % e) @@ -2350,3 +2356,75 @@ def update_server(job): logger.warning('path does not exist: %s' % path) else: logger.debug('no need to update logstash for this job') + + +def preprocess_debug_command(job): + """ + + """ + + return + + + # Should the pilot do the setup or does jobPars already contain the information? 
+ preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + # get the general setup command and then verify it if required + resource_name = get_resource_name() # 'grid' if no hpc_resource is set + resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + cmd = resource.get_setup_command(job, preparesetup) + if not cmd.endswith(';'): + cmd += '; ' + if cmd not in job.debug_command: + job.debug_command = cmd + job.debug_command + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + For gdb, the server might send a command with gdb option --pid %. The pilot need to replace the % with the proper + pid. The default (hardcoded) process will be that of athena.py. The pilot will find the corresponding pid. + + :param debug_command: debug command (string). + :param pandaid: PanDA id (string). + :return: updated debug command (string). + """ + + pandaid_pid = None + if '--pid %' in debug_command: + # replace the % with the pid for athena.py + # note: if athena.py is not yet running, the --pid % will remain. Otherwise the % will be replaced by the pid + # first find the pid (if athena.py is running) + cmd = 'ps axo pid,ppid,pgid,args' + exit_code, stdout, stderr = execute(cmd) + if stdout: + logger.debug('ps=\n\n%s\n' % stdout) + # convert the ps output to a dictionary + dictionary = convert_ps_to_dict(stdout) + # trim this dictionary to reduce the size (only keep the PID and PPID lists) + trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) + # what is the pid of the trf? + pandaid_pid = find_pid(pandaid, dictionary) + # find all athena processes + pids = find_cmd_pids('athena.py', dictionary) + # which of the found pids are children of the trf? (which has an export PandaID=.. attached to it) + for pid in pids: + try: + child = is_child(pid, pandaid_pid, trimmed_dictionary) + except RuntimeError as e: + logger.warning('too many recursions: %s (cannot identify athena process)' % e) + else: + if child: + logger.info('pid=%d is a child process of the trf of this job' % pid) + debug_command = debug_command.replace('--pid %', '--pid %d' % pid) + logger.info('updated debug command: %s' % debug_command) + break + else: + logger.info('pid=%d is not a child process of the trf of this job' % pid) + if not pids: + logger.debug('athena is not yet running (no corresponding pid)') + debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) + + return debug_command diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index c518f249..84c1cfff 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -899,7 +899,7 @@ def create_root_container_command(workdir, cmd): return command -def create_middleware_container_command(workdir, cmd, container_options, label='stagein'): +def create_middleware_container_command(workdir, cmd, container_options, label='stagein', proxy=True): """ Create the stage-in/out container command. 
@@ -924,10 +924,16 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' command = 'cd %s;' % workdir # add bits and pieces for the containerisation - middleware_container = get_middleware_container() - content = get_middleware_container_script(middleware_container, cmd) + middleware_container = get_middleware_container(label=label) + content = get_middleware_container_script(middleware_container, cmd, label=label) # store it in setup.sh - script_name = 'stagein.sh' if label == 'stage-in' else 'stageout.sh' + if label == 'stage-in': + script_name = 'stagein.sh' + elif label == 'stage-out': + script_name = 'stageout.sh' + else: + script_name = 'general.sh' + try: status = write_file(os.path.join(workdir, script_name), content) except PilotException as e: @@ -935,9 +941,10 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' else: if status: # generate the final container command - x509 = os.environ.get('X509_USER_PROXY', '') - if x509: - command += 'export X509_USER_PROXY=%s;' % x509 + if proxy: + x509 = os.environ.get('X509_USER_PROXY', '') + if x509: + command += 'export X509_USER_PROXY=%s;' % x509 command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container @@ -995,13 +1002,17 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): return content -def get_middleware_container(): +def get_middleware_container(label=None): """ Return the middleware container. + :param label: label (string). :return: path (string). """ + if label and label == 'general': + return 'CentOS7' + path = config.Container.middleware_container if path.startswith('/') and not os.path.exists(path): logger.warning('requested middleware container path does not exist: %s (switching to default value)' % path) diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 51f05632..c747446e 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -270,3 +270,17 @@ def post_prestagein_utility_command(**kwargs): # stdout = kwargs.get('output', None) pass + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + :param debug_command: debug command (string), payload pid (int). + :param pandaid: PanDA id (string). 
+ :return: updated debug command (string) + """ + + return debug_command diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1891c015..88e0b391 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27g' # build number should be reset to '1' for every new development cycle +BUILD = '28g' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index f8de374e..a1c4a07d 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -46,7 +46,7 @@ iddsserver: https://pandaserver.cern.ch:25443 # The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode) heartbeat: 1800 -debug_heartbeat: 300 +debug_heartbeat: 60 # Heartbeat message file (only used when Pilot is not sending heartbeats to server) heartbeat_message: heartbeat.json diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 9ebeac46..72c4c3f3 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1077,3 +1077,24 @@ def create_symlink(from_path='', to_path=''): logger.warning('failed to create symlink from %s to %s: %s' % (from_path, to_path, e)) else: logger.debug('created symlink from %s to %s' % (from_path, to_path)) + + +def locate_file(pattern): + """ + Locate a file defined by the pattern. + + Example: + pattern = os.path.join(os.getcwd(), '**/core.123') + -> /Users/Paul/Development/python/tt/core.123 + + :param pattern: pattern name (string). + :return: path (string). + """ + + path = None + for fname in glob(pattern): + if os.path.isfile(fname): + path = fname + + return path + diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index d88d8ddd..a9fe101b 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -20,13 +20,56 @@ errors = ErrorCodes() +def containerise_general_command(job, container_options, label='command', container_type='container'): + """ + Containerise a general command by execution in a script that can be run in a container. + + :param job: job object. + :param label: label (string). + :param container_options: container options from queuedata (string). + :param container_type: optional 'container/bash' + :raises PilotException: for general failures. 
+ :return: + """ + + cwd = getcwd() + + if container_type == 'container': + # add bits and pieces needed to run the cmd in a container + pilot_user = environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + try: + cmd = user.create_middleware_container_command(job.workdir, job.debug_command, container_options, label=label, proxy=False) + except PilotException as e: + raise e + else: + logger.warning('not yet implemented') + raise PilotException + + try: + logger.info('*** executing %s (logging will be redirected) ***' % label) + exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) + except Exception as e: + logger.info('*** %s has failed ***' % label) + logger.warning('exception caught: %s' % e) + else: + if exit_code == 0: + logger.info('*** %s has finished ***' % label) + else: + logger.info('*** %s has failed ***' % label) + logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + + def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, external_dir, label='stage-in', container_type='container'): """ Containerise the middleware by performing stage-in/out steps in a script that in turn can be run in a container. + Note: a container will only be used for option container_type='container'. If this is 'bash', then stage-in/out will still be done by a script, but not containerised. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). @@ -37,9 +80,9 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, :param external_dir: input or output files directory (string). :param label: optional 'stage-in/out' (String). :param container_type: optional 'container/bash' - :return: :raises StageInFailure: for stage-in failures :raises StageOutFailure: for stage-out failures + :return: """ cwd = getcwd() @@ -123,6 +166,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext """ Get the middleware container execution command. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 213ac5d6..e5b94ae8 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -630,3 +630,156 @@ def threads_aborted(abort_at=2): aborted = True return aborted + + +def convert_ps_to_dict(output, pattern=r'(\d+) (\d+) (\d+) (.+)'): + """ + Convert output from a ps command to a dictionary. + + Example: ps axo pid,ppid,pgid,cmd + PID PPID PGID COMMAND + 22091 6672 22091 bash + 32581 22091 32581 ps something;sdfsdfds/athena.py ddfg + -> dictionary = { 'PID': [22091, 32581], 'PPID': [22091, 6672], .. , 'COMMAND': ['ps ..', 'bash']} + + :param output: ps stdout (string). + :param pattern: regex pattern matching the ps output (raw string). + :return: dictionary. + """ + + dictionary = {} + first_line = [] # e.g. PID PPID PGID COMMAND + + for line in output.split('\n'): + try: + # remove leading and trailing spaces + line = line.strip() + # remove multiple spaces inside the line + _l = re.sub(' +', ' ', line) + + if first_line == []: + _l = [_f for _f in _l.split(' ') if _f] + first_line = _l + for i in range(len(_l)): + dictionary[_l[i]] = [] + else: # e.g. 
22091 6672 22091 bash + match = re.search(pattern, _l) + if match: + for i in range(len(first_line)): + try: + var = int(match.group(i + 1)) + except Exception: + var = match.group(i + 1) + dictionary[first_line[i]].append(var) + + except Exception as e: + print("unexpected format of utility output: %s" % e) + + return dictionary + + +def get_trimmed_dictionary(keys, dictionary): + """ + Return a sub-dictionary with only the given keys. + + :param keys: keys to keep (list). + :param dictionary: full dictionary. + :return: trimmed dictionary. + """ + + subdictionary = {} + for key in keys: + if key in dictionary: + subdictionary[key] = dictionary[key] + + return subdictionary + + +def find_cmd_pids(cmd, ps_dictionary): + """ + Find all pids for the given command. + Example. cmd = 'athena.py' -> pids = [1234, 2267] (in case there are two pilots running on the WN). + + :param cmd: command (string). + :param ps_dictionary: converted ps output (dictionary). + """ + + pids = [] + i = -1 + for _cmd in ps_dictionary.get('COMMAND'): + i += 1 + if cmd in _cmd: + pids.append(ps_dictionary.get('PID')[i]) + return pids + + +def find_pid(pandaid, ps_dictionary): + """ + Find the process id for the command that contains 'export PandaID=%d'. + + :param pandaid: PanDA ID (string). + :param ps_dictionaryL ps output dictionary. + :return: pid (int). + """ + + pid = -1 + i = -1 + pandaid_cmd = 'export PandaID=%s' % pandaid + for _cmd in ps_dictionary.get('COMMAND'): + i += 1 + if pandaid_cmd in _cmd: + pid = ps_dictionary.get('PID')[i] + break + + return pid + + +def is_child(pid, pandaid_pid, dictionary): + """ + Is the given pid a child process of the pandaid_pid? + Proceed recursively until the parent pandaid_pid has been found, or return False if it fails to find it. + """ + + try: + # where are we at in the PID list? + index = dictionary.get('PID').index(pid) + except ValueError: + # not in the list + return False + else: + # get the corresponding ppid + ppid = dictionary.get('PPID')[index] + + print(index, pid, ppid, pandaid_pid) + # is the current parent the same as the pandaid_pid? if yes, we are done + if ppid == pandaid_pid: + return True + else: + # try another pid + return is_child(ppid, pandaid_pid, dictionary) + + +def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): + """ + Identify an explicit process id in the given command. + + Example: + cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' + -> pid = 19114 + + :param cmd: command containing a pid (string). + :param pattern: regex pattern (raw string). + :return: pid (int). 
+ """ + + pid = None + match = re.search(pattern, cmd) + if match: + try: + pid = int(match.group(1)) + except Exception: + pid = None + else: + print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) + + return pid From 523442db5a32c6b6e7676fad09223e9604f149ec Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 4 Jun 2021 15:58:37 +0200 Subject: [PATCH 58/96] Now moving raythena/AthenaMP output to shared directory (if --output-dir was used) --- .../workexecutor/plugins/raythenaexecutor.py | 21 +++++++++++++++++- pilot/util/filehandling.py | 22 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py index 202f10ae..dc0140da 100644 --- a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py @@ -16,7 +16,7 @@ from pilot.common.errorcodes import ErrorCodes from pilot.eventservice.esprocess.esprocess import ESProcess from pilot.info.filespec import FileSpec -from pilot.util.filehandling import calculate_checksum +from pilot.util.filehandling import calculate_checksum, move from .baseexecutor import BaseExecutor @@ -62,6 +62,21 @@ def create_file_spec(self, pfn): file_spec = FileSpec(filetype='output', **file_data) return file_spec + def move_output(self, pfn): + """ + Move output file from given PFN path to PILOT_OUTPUT_DIR if set. + + :param pfn: physical file name (string). + :return: + """ + + outputdir = os.environ.get('PILOT_OUTPUT_DIR', None) + if outputdir: + try: + move(pfn, outputdir) + except Exception as e: + logger.warning('failed to move output: %s' % e) + def update_finished_event_ranges(self, out_messagess): """ Update finished event ranges @@ -81,6 +96,10 @@ def update_finished_event_ranges(self, out_messagess): for checksum_key in fspec.checksum: event_range_status[checksum_key] = fspec.checksum[checksum_key] event_ranges.append(event_range_status) + + # move the output to a common area if necessary + self.move_output(out_msg['output']) + event_ranges_status = {"esOutput": {"numEvents": len(event_ranges)}, "eventRanges": event_ranges} event_range_message = {'version': 1, 'eventRanges': json.dumps([event_ranges_status])} self.update_events(event_range_message) diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 72c4c3f3..2c76fb52 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -524,6 +524,28 @@ def tar_files(wkdir, excludedfiles, logfile_name, attempt=0): return 0 +def move(path1, path2): + """ + Move a file from path1 to path2. + + :param path1: source path (string). + :param path2: destination path2 (string). + """ + + if not os.path.exists(path1): + logger.warning('file copy failure: path does not exist: %s' % path1) + raise NoSuchFile("File does not exist: %s" % path1) + + try: + import shutil + shutil.move(path1, path2) + except IOError as e: + logger.warning("exception caught during file move: %s" % e) + raise FileHandlingFailure(e) + else: + logger.info("moved %s to %s" % (path1, path2)) + + def copy(path1, path2): """ Copy path1 to path2. 
From e82677099f9e2e0ed9a6af1df8d4cd752b02ea30 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 15:48:05 +0200 Subject: [PATCH 59/96] Added some debug info for direct access --- PILOTVERSION | 2 +- pilot/api/data.py | 5 +++-- pilot/info/jobdata.py | 2 +- pilot/util/constants.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bceaa76a..ceb642a6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.28f \ No newline at end of file +2.11.3.28 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index bf5d73be..87c4373a 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -896,9 +896,10 @@ def set_status_for_direct_access(self, files, workdir): if not direct_lan and not direct_wan: self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s' % (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, - str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas))) + str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), + fspec.domain)) if direct_lan or direct_wan: fspec.status_code = 0 diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index a39ba3ab..0b53dd63 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -274,7 +274,7 @@ def prepare_infiles(self, data): idat[key] = getattr(self.infosys.queuedata, key) finfo = FileSpec(filetype='input', **idat) - logger.info('added file %s' % lfn) + logger.info('added file \'%s\' with accessmode \'%s\'' % (lfn, accessmode)) ret.append(finfo) return ret diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 88e0b391..c4aa5050 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '28g' # build number should be reset to '1' for every new development cycle +BUILD = '28' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From be5e9ffbbcbecc18ed2e7b7b2184490c69fbe9a7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 15:55:58 +0200 Subject: [PATCH 60/96] Flake8 --- pilot/control/job.py | 14 +++++++------- pilot/user/atlas/common.py | 20 +++++++++----------- pilot/util/filehandling.py | 1 - 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index c311d3c4..09e6252c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -41,7 +41,7 @@ SERVER_UPDATE_UPDATING, SERVER_UPDATE_NOT_DONE from pilot.util.container import execute from pilot.util.filehandling import find_text_files, tail, is_json, copy, remove, write_json, establish_logging, write_file, \ - create_symlink, locate_file + create_symlink from pilot.util.harvester import request_new_jobs, remove_job_request_file, parse_job_definition_file, \ is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, 
get_event_status_file, \ publish_stageout_files @@ -639,10 +639,10 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: # for gdb commands, use the proper gdb version (the system one may be too old) - #if 'gdb ' in job.debug_command: - # pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - # user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - # user.preprocess_debug_command(job) + if 'gdb ' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user.preprocess_debug_command(job) stdout = get_debug_stdout(job) if stdout: @@ -1131,8 +1131,8 @@ def verify_ctypes(queues, job): # specifically, this will include any 'orphans', i.e. if the pilot kills all subprocesses at the end, # 'orphans' will be included (orphans seem like the wrong name) libc = ctypes.CDLL('libc.so.6') - PR_SET_CHILD_SUBREAPER = 36 - libc.prctl(PR_SET_CHILD_SUBREAPER, 1) + pr_set_child_subreaper = 36 + libc.prctl(pr_set_child_subreaper, 1) logger.debug('all child subprocesses will be parented') diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index dd77a154..d1db47ef 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -2360,22 +2360,20 @@ def update_server(job): def preprocess_debug_command(job): """ - + (Currently not used - not needed if e.g. gdb will be run in a container) """ return - - # Should the pilot do the setup or does jobPars already contain the information? - preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + #preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required - resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 - cmd = resource.get_setup_command(job, preparesetup) - if not cmd.endswith(';'): - cmd += '; ' - if cmd not in job.debug_command: - job.debug_command = cmd + job.debug_command + #resource_name = get_resource_name() # 'grid' if no hpc_resource is set + #resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + #cmd = resource.get_setup_command(job, preparesetup) + #if not cmd.endswith(';'): + # cmd += '; ' + #if cmd not in job.debug_command: + # job.debug_command = cmd + job.debug_command def process_debug_command(debug_command, pandaid): diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 2c76fb52..53972c30 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1119,4 +1119,3 @@ def locate_file(pattern): path = fname return path - From de4e150ee8ccfad149b8b2e199313d45cc2c71e4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 16:05:53 +0200 Subject: [PATCH 61/96] Refactoring --- pilot/control/job.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 09e6252c..d9c77078 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -638,22 +638,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail 
of the latest log file touched by the payload if job.debug: - # for gdb commands, use the proper gdb version (the system one may be too old) - if 'gdb ' in job.debug_command: - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - user.preprocess_debug_command(job) - - stdout = get_debug_stdout(job) - if stdout: - data['stdout'] = stdout - - # in case gdb was successfully used, the payload can now be killed - if 'gdb ' in job.debug_command and job.pid: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, - msg='payload was killed after gdb produced requested core file') - logger.debug('will proceed to kill payload processes') - kill_processes(job.pid) + data['stdout'] = process_debug_mode(job) # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': @@ -694,6 +679,32 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data +def process_debug_mode(job): + """ + Handle debug mode - preprocess debug command, get the output and kill the payload in case of gdb. + + :param job: job object. + :return: stdout from debug command (string). + """ + + # for gdb commands, use the proper gdb version (the system one may be too old) + if 'gdb ' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user.preprocess_debug_command(job) + + stdout = get_debug_stdout(job) + if stdout: + # in case gdb was successfully used, the payload can now be killed + if 'gdb ' in job.debug_command and job.pid: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, + msg='payload was killed after gdb produced requested core file') + logger.debug('will proceed to kill payload processes') + kill_processes(job.pid) + + return stdout + + def get_debug_stdout(job): """ Return the requested output from a given debug command. 
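Note on the debug-mode handling refactored in PATCH 61 above: the server-supplied debug command may contain a '--pid %' placeholder; the pilot parses 'ps axo pid,ppid,pgid,args' with the helpers added to pilot/util/processes.py earlier in this series, finds the athena.py child of the trf and substitutes its pid. A rough sketch of that substitution, assuming the pilot source tree is on PYTHONPATH; the ps snippet and pid values are invented for illustration:

from pilot.util.processes import convert_ps_to_dict, find_cmd_pids

# canned output of 'ps axo pid,ppid,pgid,args' (invented values)
ps_output = """  PID  PPID  PGID COMMAND
22091  6672 22091 bash
32581 22091 32581 export PandaID=1234567890; athena.py myJobOptions.py"""

dictionary = convert_ps_to_dict(ps_output)
pids = find_cmd_pids('athena.py', dictionary)            # -> [32581]

debug_command = "gdb --pid % -ex 'generate-core-file'"
if pids:
    debug_command = debug_command.replace('--pid %', '--pid %d' % pids[0])
print(debug_command)                                      # gdb --pid 32581 -ex 'generate-core-file'
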
From 667b4fbd6d349632af4f3be1aa6e59f4a1591eee Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 8 Jun 2021 14:05:25 +0200 Subject: [PATCH 62/96] Will not fail jobs on sites that fail to import ctypes --- PILOTVERSION | 2 +- pilot/control/job.py | 6 +++--- pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ceb642a6..38b6ed73 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.28 \ No newline at end of file +2.11.3.29 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index d9c77078..05e72c5b 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1132,9 +1132,9 @@ def verify_ctypes(queues, job): except Exception as e: diagnostics = 'ctypes python module could not be imported: %s' % e logger.warning(diagnostics) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) - put_in_queue(job, queues.failed_jobs) + #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) + #logger.debug('Failed to validate job=%s' % job.jobid) + #put_in_queue(job, queues.failed_jobs) else: logger.debug('ctypes python module imported') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c4aa5050..8127c5ae 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '28' # build number should be reset to '1' for every new development cycle +BUILD = '30' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b95210f95ab4a537b08572e23841f55fac9b784f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 8 Jun 2021 14:44:21 +0200 Subject: [PATCH 63/96] Fixed case where job object size calculation fails with exception due to object changing size --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 8 +++++++- pilot/util/constants.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 38b6ed73..f2dbf9d1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.29 \ No newline at end of file +2.11.3.31 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 0b53dd63..f3269a24 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -95,6 +95,7 @@ class JobData(BaseData): pid = None # payload pid pgrp = None # payload process group sizes = {} # job object sizes { timestamp: size, .. } + currentsize = 0 # current job object size command = "" # full payload command (set for container jobs) setup = "" # full payload setup (needed by postprocess command) zombies = [] # list of zombie process ids @@ -986,7 +987,12 @@ def get_size(self): :return: size (int). 
""" - return get_object_size(self) + # protect against the case where the object changes size during calculation (rare) + try: + self.currentsize = get_object_size(self) + except Exception: + pass + return self.currentsize def collect_zombies(self, tn=None): """ diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8127c5ae..560bfaf9 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '30' # build number should be reset to '1' for every new development cycle +BUILD = '32' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From a9cac88e98706ba19d50755517b10749888bb898 Mon Sep 17 00:00:00 2001 From: Alexey Anisenkov Date: Wed, 9 Jun 2021 15:27:32 +0700 Subject: [PATCH 64/96] make free space check (check_availablespace) being optional for specific copytools;ignore the check for mv copytool --- pilot/api/data.py | 10 ++++++---- pilot/copytool/mv.py | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index ea1ba48f..7d9246c5 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -857,10 +857,12 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 kwargs['activity'] = activity # verify file sizes and available space for stage-in - if self.infosys.queuedata.maxinputsize != -1: - self.check_availablespace(remain_files) - else: - self.logger.info('skipping input file size check since maxinputsize=-1') + if getattr(copytool, 'check_availablespace', True): + if self.infosys.queuedata.maxinputsize != -1: + self.check_availablespace(remain_files) + else: + self.logger.info('skipping input file size check since maxinputsize=-1') + show_memory_usage() # add the trace report diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 539c6457..73093a92 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -19,6 +19,7 @@ logger = logging.getLogger(__name__) require_replicas = False # indicate if given copytool requires input replicas to be resolved +check_availablespace = False # indicate whether space check should be applied before stage-in transfers using given copytool def create_output_list(files, init_dir, ddmconf): From 196b79d98c9a6c5a2400b335ac10d9231b1e749a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 9 Jun 2021 11:35:05 +0200 Subject: [PATCH 65/96] gdb updates. Now avoiding containerisation of gdb command since core file can't be located. 
--- PILOTVERSION | 2 +- pilot/control/job.py | 25 +++++++++++++------------ pilot/info/jobdata.py | 2 +- pilot/user/atlas/common.py | 23 +++++++++++------------ pilot/user/atlas/container.py | 10 ++++++---- pilot/util/constants.py | 6 +++--- 6 files changed, 35 insertions(+), 33 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f2dbf9d1..1d38d273 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.31 \ No newline at end of file +2.12.1.39 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 05e72c5b..a080b8b2 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -743,8 +743,8 @@ def get_general_command_stdout(job): job.debug_command = user.process_debug_command(job.debug_command, job.jobid) if job.debug_command: - if 'gdb ' in job.debug_command: - logger.info('gdb execution will be done by a script') + _containerisation = False # set this with some logic instead - not used for now + if _containerisation: try: containerise_general_command(job, job.infosys.queuedata.container_options, label='general', @@ -753,18 +753,19 @@ def get_general_command_stdout(job): logger.warning('general containerisation threw a pilot exception: %s' % e) except Exception as e: logger.warning('general containerisation threw an exception: %s' % e) - - # in case a core file was produced, locate it - path = locate_core_file(job.debug_command) - if path: - # copy it to the working directory (so it will be saved in the log) - try: - copy(path, job.workdir) - except Exception: - pass else: ec, stdout, stderr = execute(job.debug_command) - logger.debug("%s:\n\n%s\n\n" % (job.debug_command, stdout)) + logger.debug("%s (stdout):\n\n%s\n\n" % (job.debug_command, stdout)) + logger.debug("%s (stderr):\n\n%s\n\n" % (job.debug_command, stderr)) + + # in case a core file was produced, locate it + path = locate_core_file(job.debug_command) if 'gdb ' in job.debug_command else '' + if path: + # copy it to the working directory (so it will be saved in the log) + try: + copy(path, job.workdir) + except Exception: + pass return stdout diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index f3269a24..4e6ecbfe 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -120,7 +120,7 @@ class JobData(BaseData): destinationdblock = "" ## to be moved to FileSpec (job.outdata) datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) debug = False # debug mode, when True, pilot will send debug info back to the server - debug_command = 'tail' # debug command (can be defined on the task side) + debug_command = '' # debug command (can be defined on the task side) produserid = "" # the user DN (added to trace report) jobdefinitionid = "" # the job definition id (added to trace report) infilesguids = "" # diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d1db47ef..1a1344de 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -2360,20 +2360,19 @@ def update_server(job): def preprocess_debug_command(job): """ - (Currently not used - not needed if e.g. gdb will be run in a container) + """ - return # Should the pilot do the setup or does jobPars already contain the information? 
- #preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required - #resource_name = get_resource_name() # 'grid' if no hpc_resource is set - #resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 - #cmd = resource.get_setup_command(job, preparesetup) - #if not cmd.endswith(';'): - # cmd += '; ' - #if cmd not in job.debug_command: - # job.debug_command = cmd + job.debug_command + resource_name = get_resource_name() # 'grid' if no hpc_resource is set + resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + cmd = resource.get_setup_command(job, preparesetup) + if not cmd.endswith(';'): + cmd += '; ' + if cmd not in job.debug_command: + job.debug_command = cmd + job.debug_command def process_debug_command(debug_command, pandaid): @@ -2398,7 +2397,7 @@ def process_debug_command(debug_command, pandaid): cmd = 'ps axo pid,ppid,pgid,args' exit_code, stdout, stderr = execute(cmd) if stdout: - logger.debug('ps=\n\n%s\n' % stdout) + #logger.debug('ps=\n\n%s\n' % stdout) # convert the ps output to a dictionary dictionary = convert_ps_to_dict(stdout) # trim this dictionary to reduce the size (only keep the PID and PPID lists) @@ -2421,7 +2420,7 @@ def process_debug_command(debug_command, pandaid): break else: logger.info('pid=%d is not a child process of the trf of this job' % pid) - if not pids: + if not pids or '--pid %' in debug_command: logger.debug('athena is not yet running (no corresponding pid)') debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 84c1cfff..2a81807a 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -971,7 +971,7 @@ def get_root_container_script(cmd): return content -def get_middleware_container_script(middleware_container, cmd, asetup=False): +def get_middleware_container_script(middleware_container, cmd, asetup=False, label=''): """ Return the content of the middleware container script. If asetup is True, atlasLocalSetup will be added to the command. @@ -991,9 +991,11 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): content += 'export ALRB_LOCAL_PY3=YES; ' if asetup: # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/..;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet; content += get_asetup(asetup=False) - content += sitename + 'lsetup rucio davix xrootd; ' - content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd - + if label == 'stagein' or label == 'stageout': + content += sitename + 'lsetup rucio davix xrootd; ' + content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd + else: + content += cmd if not asetup: content += '\nexit $?' 
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 560bfaf9..b24d9de6 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 -VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '32' # build number should be reset to '1' for every new development cycle +VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates +REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '39' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8bb688d33f5fa97a80b7d2e839c6da2e1766e2de Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 11 Jun 2021 19:41:31 +0200 Subject: [PATCH 66/96] Testing du. Lazy logging updates. Pylint fixes --- PILOTVERSION | 2 +- pilot.py | 19 +-- pilot/api/analytics.py | 17 ++- pilot/api/dask.py | 28 ++-- pilot/api/data.py | 152 ++++++++------------ pilot/api/es_data.py | 4 +- pilot/common/pluginfactory.py | 7 +- pilot/control/data.py | 131 ++++++++--------- pilot/control/interceptor.py | 9 +- pilot/control/job.py | 30 ++-- pilot/control/payloads/eventservice.py | 8 +- pilot/control/payloads/eventservicemerge.py | 6 +- pilot/control/payloads/generic.py | 106 +++++++------- pilot/user/atlas/dbrelease.py | 2 +- pilot/user/atlas/setup.py | 4 +- pilot/util/constants.py | 2 +- 16 files changed, 240 insertions(+), 287 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1d38d273..6fca09e1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.39 \ No newline at end of file +2.12.1.40b \ No newline at end of file diff --git a/pilot.py b/pilot.py index eee04af1..82fc4d99 100755 --- a/pilot.py +++ b/pilot.py @@ -10,6 +10,7 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 from __future__ import print_function # Python 2 (2to3 complains about this) +from __future__ import absolute_import import argparse import logging @@ -68,7 +69,7 @@ def main(): infosys.init(args.queue) # check if queue is ACTIVE if infosys.queuedata.state != 'ACTIVE': - logger.critical('specified queue is NOT ACTIVE: %s -- aborting' % infosys.queuedata.name) + logger.critical('specified queue is NOT ACTIVE: %s -- aborting', infosys.queuedata.name) return errors.PANDAQUEUENOTACTIVE except PilotException as error: logger.fatal(error) @@ -81,14 +82,14 @@ def main(): environ['PILOT_SITENAME'] = infosys.queuedata.resource #args.site # TODO: replace with singleton # set requested workflow - logger.info('pilot arguments: %s' % str(args)) + logger.info('pilot arguments: %s', str(args)) workflow = __import__('pilot.workflow.%s' % args.workflow, globals(), locals(), [args.workflow], 0) # Python 3, -1 -> 0 # execute workflow try: exit_code = workflow.run(args) except Exception as e: - logger.fatal('main pilot function caught exception: %s' % e) + logger.fatal('main pilot function caught exception: %s', e) exit_code = None return exit_code @@ -450,9 +451,9 @@ def wrap_up(initdir, mainworkdir, args): try: rmtree(mainworkdir) except Exception as e: - logging.warning("failed to remove %s: %s" % (mainworkdir, e)) + logging.warning("failed to remove %s: %s", mainworkdir, e) else: - logging.info("removed %s" % mainworkdir) + 
logging.info("removed %s", mainworkdir) # in Harvester mode, create a kill_worker file that will instruct Harvester that the pilot has finished if args.harvester: @@ -464,15 +465,15 @@ def wrap_up(initdir, mainworkdir, args): except Exception: exit_code = trace else: - logging.info('traces error code: %d' % exit_code) + logging.info('traces error code: %d', exit_code) if trace.pilot['nr_jobs'] <= 1: if exit_code != 0: - logging.info('an exit code was already set: %d (will be converted to a standard shell code)' % exit_code) + logging.info('an exit code was already set: %d (will be converted to a standard shell code)', exit_code) elif trace.pilot['nr_jobs'] > 0: if trace.pilot['nr_jobs'] == 1: - logging.getLogger(__name__).info('pilot has finished (%d job was processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d job was processed)', trace.pilot['nr_jobs']) else: - logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)', trace.pilot['nr_jobs']) exit_code = SUCCESS elif trace.pilot['state'] == FAILURE: logging.critical('pilot workflow failure -- aborting') diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 3b509b57..aa7e047b 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from .services import Services from pilot.common.exception import NotDefined, NotSameLength, UnknownException @@ -146,21 +146,20 @@ def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision= y = y[:-2] if (len(x) > 7 and len(y) > 7) and len(x) == len(y): - logger.info('fitting %s vs %s' % (y_name, x_name)) + logger.info('fitting %s vs %s', y_name, x_name) try: fit = self.fit(x, y) _slope = self.slope() except Exception as e: - logger.warning('failed to fit data, x=%s, y=%s: %s' % (str(x), str(y), e)) + logger.warning('failed to fit data, x=%s, y=%s: %s', str(x), str(y), e) else: if _slope: slope = float_to_rounded_string(fit.slope(), precision=precision) chi2 = float_to_rounded_string(fit.chi2(), precision=0) # decimals are not needed for chi2 if slope != "": - logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)' % - (slope, len(x), chi2)) + logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)', slope, len(x), chi2) else: - logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)' % (str(x), str(y))) + logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)', str(x), str(y)) return {"slope": slope, "chi2": chi2} @@ -182,8 +181,8 @@ def extract_from_table(self, table, x_name, y_name): y2_name = y_name.split('+')[1] y1_value = table.get(y1_name, []) y2_value = table.get(y2_name, []) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) x = [] y = [] else: @@ -238,7 +237,7 @@ def __init__(self, **kwargs): self.set_intersect() self.set_chi2() else: - logger.warning("\'%s\' model is not implemented" % self._model) + logger.warning("\'%s\' model is not implemented", self._model) raise NotImplementedError() def fit(self): diff --git a/pilot/api/dask.py b/pilot/api/dask.py index ad051c00..ab5ff3eb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ 
-64,7 +64,7 @@ def uninstall(self, block=True): """ - logger.info('uninstalling service %s' % self.servicename) + logger.info('uninstalling service %s', self.servicename) if block: logger.warning('blocking mode not yet implemented') @@ -72,7 +72,7 @@ def uninstall(self, block=True): exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: self.status = 'uninstalled' - logger.info('uninstall of service %s has been requested' % self.servicename) + logger.info('uninstall of service %s has been requested', self.servicename) def install(self, block=True): """ @@ -90,9 +90,9 @@ def install(self, block=True): # is the single-dask cluster already running? name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is already running - nothing to install' % name) + logger.info('service %s is already running - nothing to install', name) else: - logger.info('service %s is not yet running - proceed with installation' % name) + logger.info('service %s is not yet running - proceed with installation', name) # perform helm updates before actual instqllation cmd = '' @@ -101,13 +101,13 @@ def install(self, block=True): cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: - logger.info('installation of service %s is in progress' % self.servicename) + logger.info('installation of service %s is in progress', self.servicename) if block: while True: name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running' % name) + logger.info('service %s is running', name) self.status = 'running' break else: @@ -148,7 +148,7 @@ def _get_dictionary(self, cmd=None): exit_code, stdout, stderr = execute(cmd, mute=True) if exit_code: - logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + logger.warning('failed to execute \'%s\': %s', cmd, stdout) self.status = 'failed' else: # parse output @@ -184,7 +184,7 @@ def _validate(self): logger.warning(stdout) break else: - logger.debug('%s verified' % cmd) + logger.debug('%s verified', cmd) if not found: return False @@ -204,7 +204,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): filename = os.path.join(self._workdir, self.overrides) if os.path.exists(filename): - logger.info('file \'%s\' already exists - will not override' % filename) + logger.info('file \'%s\' already exists - will not override', filename) return script = "" @@ -216,7 +216,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): if script: status = write_file(filename, script) if status: - logger.debug('generated script: %s' % filename) + logger.debug('generated script: %s', filename) else: self.overrides = None @@ -240,7 +240,7 @@ def _convert_to_dict(self, output): dictionary[_l[0]][first_line[i]] = _l[1:][i] except Exception: - logger.warning("unexpected format of utility output: %s" % line) + logger.warning("unexpected format of utility output: %s", line) return dictionary @@ -252,7 +252,7 @@ def connect_cluster(self, release_name=None, manager=dask_kubernetes.HelmCluster if not release_name: release_name = self.servicename self.cluster = manager(release_name=release_name) - logger.info('connected to %s' % manager.__name__) + logger.info('connected to %s', manager.__name__) def scale(self, number): """ @@ -260,7 +260,7 @@ def scale(self, number): """ if number > 2: - logger.warning('too large scale: %d (please use <= 2 for now)' % number) + 
logger.warning('too large scale: %d (please use <= 2 for now)', number) return if not self.cluster: self.connect_cluster() @@ -269,7 +269,7 @@ def scale(self, number): self.status = 'failed' return - logger.info('setting scale to: %d' % number) + logger.info('setting scale to: %d', number) self.cluster.scale(number) def shutdown(self): diff --git a/pilot/api/data.py b/pilot/api/data.py index cbfabad6..7f104b54 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 @@ -69,7 +69,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ super(StagingClient, self).__init__() if not logger: - logger = logging.getLogger('%s.%s' % (__name__, 'null')) + logger = logging.getLogger('%s.%s', __name__, 'null') logger.disabled = True self.logger = logger @@ -99,7 +99,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg) self.trace_report.send() raise PilotException("failed to resolve acopytools settings") - logger.info('configured copytools per activity: acopytools=%s' % self.acopytools) + logger.info('configured copytools per activity: acopytools=%s', self.acopytools) # get an initialized trace report (has to be updated for get/put if not defined before) self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) @@ -268,7 +268,7 @@ def resolve_replicas(self, files, use_vp=False): # add signature lifetime for signed URL storages query.update(signature_lifetime=24 * 3600) # note: default is otherwise 1h - logger.info('calling rucio.list_replicas() with query=%s' % query) + logger.info('calling rucio.list_replicas() with query=%s', query) try: replicas = c.list_replicas(**query) @@ -278,7 +278,7 @@ def resolve_replicas(self, files, use_vp=False): show_memory_usage() replicas = list(replicas) - logger.debug("replicas received from Rucio: %s" % replicas) + logger.debug("replicas received from Rucio: %s", replicas) files_lfn = dict(((e.scope, e.lfn), e) for e in xfiles) for replica in replicas: @@ -294,18 +294,18 @@ def resolve_replicas(self, files, use_vp=False): self.trace_report.update(validateStart=time.time()) status = True if fdat.filesize != replica['bytes']: - logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s" - % (fdat.lfn, fdat.filesize, replica['bytes'], fdat)) + logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s", + fdat.lfn, fdat.filesize, replica['bytes'], fdat) status = False if not fdat.filesize: fdat.filesize = replica['bytes'] - logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s" % (fdat.lfn, replica['bytes'])) + logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s", fdat.lfn, replica['bytes']) for ctype in ['adler32', 'md5']: if fdat.checksum.get(ctype) != replica[ctype] and replica[ctype]: - logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s" - % (fdat.lfn, fdat.checksum, replica[ctype], 
fdat)) + logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s", + fdat.lfn, fdat.checksum, replica[ctype], fdat) status = False if not fdat.checksum.get(ctype) and replica[ctype]: @@ -489,33 +489,32 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] - self.logger.info('trying to use copytool=%s for activity=%s' % (name, activity)) + self.logger.info('trying to use copytool=%s for activity=%s', name, activity) copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) # Python 2/3 #self.trace_report.update(protocol=name) - except PilotException as e: - caught_errors.append(e) - self.logger.debug('error: %s' % e) + except PilotException as error: + caught_errors.append(error) + self.logger.debug('error: %s', error) continue - except Exception as e: - self.logger.warning('failed to import copytool module=%s, error=%s' % (module, e)) + except Exception as error: + self.logger.warning('failed to import copytool module=%s, error=%s', module, error) continue try: - #self.logger.debug('kwargs=%s' % str(kwargs)) result = self.transfer_files(copytool, remain_files, activity, **kwargs) - self.logger.debug('transfer_files() using copytool=%s completed with result=%s' % (copytool, str(result))) + self.logger.debug('transfer_files() using copytool=%s completed with result=%s', copytool, str(result)) show_memory_usage() break - except PilotException as e: - self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) - except TimeoutException as e: - self.logger.warning('function timed out: %s' % e) - caught_errors.append(e) - except Exception as e: - self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) + except PilotException as error: + self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s', copytool, error) + caught_errors.append(error) + except TimeoutException as error: + self.logger.warning('function timed out: %s', error) + caught_errors.append(error) + except Exception as error: + self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s', copytool, error) + caught_errors.append(error) import traceback self.logger.error(traceback.format_exc()) @@ -537,7 +536,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? - self.logger.warning('caught time-out exception: %s' % caught_errors[0]) + self.logger.warning('caught time-out exception: %s', caught_errors[0]) else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? 
details = str(caught_errors) + ":" + 'failed to transfer files using copytools=%s' % copytools @@ -575,13 +574,13 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found error = 'Failed to resolve protocol for file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, allowed_schemas, fspec) - self.logger.error("resolve_protocol: %s" % error) + self.logger.error("resolve_protocol: %s", error) raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?) protocol = protocols[0] - self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'" % (protocol, fspec.lfn)) + self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'", protocol, fspec.lfn) resolve_surl = getattr(copytool, 'resolve_surl', None) if not callable(resolve_surl): @@ -608,7 +607,7 @@ def resolve_protocols(self, files): ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: error = 'Failed to resolve output ddmendpoint by name=%s (from PanDA), please check configuration.' % fdat.ddmendpoint - self.logger.error("resolve_protocols: %s, fspec=%s" % (error, fdat)) + self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) raise PilotException(error, code=ErrorCodes.NOSTORAGE) protocols = [] @@ -689,13 +688,13 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom pschemas = 'any' if primary_schemas and not primary_schemas[0] else ','.join(primary_schemas or []) error = 'Failed to find replica for file=%s, domain=%s, allowed_schemas=%s, pschemas=%s, fspec=%s' % (fspec.lfn, domain, schemas, pschemas, fspec) - self.logger.info("resolve_replica: %s" % error) + self.logger.info("resolve_replica: %s", error) return # prefer SRM protocol for surl -- to be verified, can it be deprecated? rse_replicas = replicas.get(replica['ddmendpoint'], []) surl = self.get_preferred_replica(rse_replicas, ['srm']) or rse_replicas[0] - self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s" % (surl['pfn'], surl['ddmendpoint'])) + self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s", surl['pfn'], surl['ddmendpoint']) return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} @@ -719,42 +718,10 @@ def get_direct_access_variables(self, job): if job and not job.is_analysis() and job.transfertype != 'direct': # task forbids direct access allow_direct_access = False - self.logger.info('switched off direct access mode for production job since transfertype=%s' % job.transfertype) + self.logger.info('switched off direct access mode for production job since transfertype=%s', job.transfertype) return allow_direct_access, direct_access_type - #def set_accessmodes_for_direct_access(self, files, direct_access_type): ## TO BE DEPRECATED (anisyonk) - # """ - # Update the FileSpec accessmodes for direct access and sort the files to get candidates for remote_io coming - # first in order to exclude them from checking of available space for stage-in. - # - # :param files: FileSpec objects. - # :param direct_access_type: type of direct access (LAN or WAN) (string). 
- # :return: - # """ - # - # # sort the files - # files = sorted(files, key=lambda x: x.is_directaccess(ensure_replica=False), reverse=True) - # - # # populate allowremoteinputs for each FileSpec object - # for fdata in files: - # is_directaccess = fdata.is_directaccess(ensure_replica=False) - # if is_directaccess and direct_access_type == 'WAN': ## is it the same for ES workflow ?? -- test and verify/FIXME LATER - # fdata.allowremoteinputs = True - # self.logger.info("check direct access for lfn=%s: allow_direct_access=true, fdata.is_directaccess()=%s =>" - # " is_directaccess=%s, allowremoteinputs=%s" % (fdata.lfn, - # fdata.is_directaccess(ensure_replica=False), - # is_directaccess, fdata.allowremoteinputs)) - # # must update accessmode for user jobs (it is only set already for production jobs) - # if fdata.accessmode != 'direct' and is_directaccess and fdata.accessmode != 'copy': - # fdata.accessmode = 'direct' - # - # # reset accessmode if direct access is not to be used - # if fdata.accessmode == 'direct' and not is_directaccess: - # fdata.accessmode = '' - # - # self.logger.info('accessmode for LFN=%s: %s (is_directaccess=%s)' % (fdata.lfn, fdata.accessmode, is_directaccess)) - def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C901 """ Automatically stage in files using the selected copy tool module. @@ -780,7 +747,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # overwrite allowed_schemas for VP jobs if kwargs['use_vp']: allowed_schemas = ['root'] - self.logger.debug('overwrote allowed_schemas for VP job: %s' % str(allowed_schemas)) + self.logger.debug('overwrote allowed_schemas for VP job: %s', str(allowed_schemas)) for fspec in files: resolve_replica = getattr(copytool, 'resolve_replica', None) @@ -796,11 +763,11 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 fspec.is_directaccess(ensure_replica=False) else None) replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='lan') else: - self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)" % (fspec.lfn, fspec.allow_lan)) + self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)", fspec.lfn, fspec.allow_lan) if not replica and fspec.allow_lan: - self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) # check remote replicas if not replica and fspec.allow_wan: @@ -812,8 +779,8 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='wan') if not replica and fspec.allow_wan: - self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) if not replica: raise ReplicasNotFound('No replica found for lfn=%s (allow_lan=%s, allow_wan=%s)' % (fspec.lfn, fspec.allow_lan, fspec.allow_wan)) @@ -826,8 +793,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if replica.get('domain'): fspec.domain = replica['domain'] - self.logger.info("[stage-in] found replica 
to be used for lfn=%s: ddmendpoint=%s, pfn=%s" % - (fspec.lfn, fspec.ddmendpoint, fspec.turl)) + self.logger.info("[stage-in] found replica to be used for lfn=%s: ddmendpoint=%s, pfn=%s", fspec.lfn, fspec.ddmendpoint, fspec.turl) # prepare files (resolve protocol/transfer url) if getattr(copytool, 'require_input_protocols', False) and files: @@ -845,7 +811,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if not copytool.is_valid_for_copy_in(remain_files): msg = 'input is not valid for transfers using copytool=%s' % copytool self.logger.warning(msg) - self.logger.debug('input: %s' % remain_files) + self.logger.debug('input: %s', remain_files) self.trace_report.update(clientState='NO_REPLICA', stateReason=msg) self.trace_report.send() raise PilotException('invalid input data for transfer operation') @@ -867,7 +833,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # add the trace report kwargs['trace_report'] = self.trace_report - self.logger.info('ready to transfer (stage-in) files: %s' % remain_files) + self.logger.info('ready to transfer (stage-in) files: %s', remain_files) # use bulk downloads if necessary # if kwargs['use_bulk_transfer'] @@ -896,12 +862,11 @@ def set_status_for_direct_access(self, files, workdir): # direct_lan = True if not direct_lan and not direct_wan: - self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) + self.logger.debug('direct lan/wan transfer will not be used for lfn=%s', fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, - str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), - fspec.domain)) + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s', + fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, + str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), fspec.domain) if direct_lan or direct_wan: fspec.status_code = 0 @@ -911,8 +876,8 @@ def set_status_for_direct_access(self, files, workdir): if alrb_xcache_proxy and direct_lan: #fspec.is_directaccess(ensure_replica=False): fspec.turl = '${ALRB_XCACHE_PROXY}' + fspec.turl - self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.turl)) + self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s', + fspec.lfn, direct_lan, direct_wan, fspec.turl) # send trace localsite = os.environ.get('RUCIO_LOCAL_SITE_ID') @@ -934,7 +899,7 @@ def set_status_for_direct_access(self, files, workdir): if not os.path.exists(_workdir): path = os.path.join('/srv', config.Pilot.base_trace_report) if not os.path.exists(path): - self.logger.debug('writing base trace report to: %s' % path) + self.logger.debug('writing base trace report to: %s', path) write_json(path, self.trace_report) else: self.trace_report.send() @@ -948,7 +913,7 @@ def check_availablespace(self, files): """ for f in files: - self.logger.debug('lfn=%s filesize=%d accessmode=%s' % (f.lfn, f.filesize, f.accessmode)) + self.logger.debug('lfn=%s filesize=%d accessmode=%s', f.lfn, f.filesize, f.accessmode) maxinputsize = 
convert_mb_to_b(get_maximum_input_sizes()) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) @@ -959,12 +924,11 @@ def check_availablespace(self, files): (len(files), totalsize, maxinputsize) raise SizeTooLarge(error) - self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)" % - (totalsize, maxinputsize)) + self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)", totalsize, maxinputsize) # get available space available_space = convert_mb_to_b(get_local_disk_space(os.getcwd())) - self.logger.info("locally available space: %d B" % available_space) + self.logger.info("locally available space: %d B", available_space) # are we within the limit? if totalsize > available_space: @@ -1019,17 +983,17 @@ def prepare_destinations(self, files, activities): # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s" % (activity, storages)) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s" % (activity, ddm)) + self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) + self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) for e in files: if not e.ddmendpoint: # no preferences => use default destination self.logger.info("[prepare_destinations][%s]: fspec.ddmendpoint is not set for lfn=%s" - " .. will use default ddm=%s as (local) destination" % (activity, e.lfn, ddm)) + " .. will use default ddm=%s as (local) destination", activity, e.lfn, ddm) e.ddmendpoint = ddm elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => assume it as final (non local) alternative destination self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations" - " .. will consider default ddm=%s for transfer and tag %s as alt. location" % (activity, e.ddmendpoint, ddm, e.ddmendpoint)) + " .. will consider default ddm=%s for transfer and tag %s as alt. 
location", activity, e.ddmendpoint, ddm, e.ddmendpoint) e.ddmendpoint = ddm e.ddmendpoint_alt = e.ddmendpoint # consider me later @@ -1135,10 +1099,10 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not copytool.is_valid_for_copy_out(files): self.logger.warning('Input is not valid for transfers using copytool=%s' % copytool) - self.logger.debug('Input: %s' % files) + self.logger.debug('Input: %s', files) raise PilotException('Invalid input for transfer operation') - self.logger.info('ready to transfer (stage-out) files: %s' % files) + self.logger.info('ready to transfer (stage-out) files: %s', files) if self.infosys: kwargs['copytools'] = self.infosys.queuedata.copytools diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 708e6de7..e246cbd9 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern,ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -46,7 +46,7 @@ def prepare_sources(self, files, activities=None): fspec.scope = 'transient' if storage_id: fspec.ddmendpoint = self.infosys.get_ddmendpoint(storage_id) - logger.info("Processed file with storage id: %s" % fspec) + logger.info("Processed file with storage id: %s", fspec) class StageOutESClient(StageOutClient): diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index cf2b5f27..27925299 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -6,6 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -30,11 +31,11 @@ def get_plugin(self, confs): class_name = confs['class'] if class_name is None: - logger.error("[class] is not defined in confs: %s" % confs) + logger.error("[class] is not defined in confs: %s", confs) return None if class_name not in self.classMap: - logger.info("Trying to import %s" % class_name) + logger.info("Trying to import %s", class_name) components = class_name.split('.') mod = __import__('.'.join(components[:-1])) for comp in components[1:]: @@ -48,7 +49,7 @@ def get_plugin(self, confs): args[key] = confs[key] cls = self.classMap[class_name] - logger.info("Importing %s with args: %s" % (cls, args)) + logger.info("Importing %s with args: %s", cls, args) impl = cls(**args) return impl diff --git a/pilot/control/data.py b/pilot/control/data.py index 83c731ee..6f820d33 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 @@ -63,7 +63,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. 
@@ -107,8 +107,8 @@ def skip_special_files(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: user.update_stagein(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) def update_indata(job): @@ -124,7 +124,7 @@ def update_indata(job): if fspec.status == 'no_transfer': toberemoved.append(fspec) for fspec in toberemoved: - logger.info('removing fspec object (lfn=%s) from list of input files' % fspec.lfn) + logger.info('removing fspec object (lfn=%s) from list of input files', fspec.lfn) job.indata.remove(fspec) @@ -193,11 +193,11 @@ def _stage_in(args, job): pilot.util.middleware.containerise_middleware(job, job.indata, args.queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, args.input_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-in containerisation threw a pilot exception: %s' % e) - except Exception as e: + except PilotException as error: + logger.warning('stage-in containerisation threw a pilot exception: %s', error) + except Exception as error: import traceback - logger.warning('stage-in containerisation threw an exception: %s' % e) + logger.warning('stage-in containerisation threw an exception: %s', error) logger.error(traceback.format_exc()) else: try: @@ -224,17 +224,17 @@ def _stage_in(args, job): msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception as error: - logger.error('failed to stage-in: error=%s' % error) + logger.error('failed to stage-in: error=%s', error) logger.info('summary of transferred files:') - for e in job.indata: - status = e.status if e.status else "(not transferred)" - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + for infile in job.indata: + status = infile.status if infile.status else "(not transferred)" + logger.info(" -- lfn=%s, status_code=%s, status=%s", infile.lfn, infile.status_code, status) # write time stamps to pilot timing file add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) - remain_files = [e for e in job.indata if e.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']] logger.info("stage-in finished") if not remain_files else logger.info("stage-in failed") return not remain_files @@ -255,8 +255,8 @@ def get_rse(data, lfn=""): if lfn == "": try: return data[0].ddmendpoint - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as error: + logger.warning("exception caught: %s", error) logger.warning("end point is currently unknown") return "unknown" @@ -426,10 +426,10 @@ def write_output(filename, output): try: write_file(filename, output, unique=True) - except PilotException as e: - logger.warning('failed to write utility output to file: %s, %s' % (e, output)) + except PilotException as error: + logger.warning('failed to write utility output to file: %s, %s', error, output) else: - logger.debug('wrote %s' % filename) + logger.debug('wrote %s', filename) def write_utility_output(workdir, step, stdout, stderr): @@ -479,17 +479,17 @@ def copytool_in(queues, traces, args): if cmd: # xcache debug exit_code, _stdout, 
_stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s' % _stdout) - logger.debug('[before xcache start] stderr=%s' % _stderr) + logger.debug('[before xcache start] stdout=%s', _stdout) + logger.debug('[before xcache start] stderr=%s', _stderr) exit_code, stdout, stderr = execute(cmd.get('command')) - logger.debug('stdout=%s' % stdout) - logger.debug('stderr=%s' % stderr) + logger.debug('stdout=%s', stdout) + logger.debug('stderr=%s', stderr) # xcache debug exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s' % _stdout) - logger.debug('[after xcache start] stderr=%s' % _stderr) + logger.debug('[after xcache start] stdout=%s', _stdout) + logger.debug('[after xcache start] stderr=%s', _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} @@ -530,7 +530,7 @@ def copytool_in(queues, traces, args): # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) # now create input file metadata if required by the payload if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic': @@ -538,12 +538,12 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 file_dictionary = get_input_file_dictionary(job.indata) xml = user.create_input_file_metadata(file_dictionary, job.workdir) - logger.info('created input file metadata:\n%s' % xml) + logger.info('created input file metadata:\n%s', xml) else: # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) logger.warning('stage-in failed, adding job object to failed_data_in queue') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED) set_pilot_state(job=job, state="failed") @@ -607,7 +607,7 @@ def copytool_out(queues, traces, args): if is_already_processed(queues, processed_jobs): continue - logger.info('will perform stage-out for job id=%s' % job.jobid) + logger.info('will perform stage-out for job id=%s', job.jobid) if args.abort_job.is_set(): traces.pilot['command'] = 'abort' @@ -669,7 +669,7 @@ def is_already_processed(queues, processed_jobs): for jobid in processed_jobs: if jobid in jobids: - logger.warning('output from job %s has already been staged out' % jobid) + logger.warning('output from job %s has already been staged out', jobid) found = True break if found: @@ -737,7 +737,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out :return: """ - logger.debug('preparing to create log file (debug mode=%s)' % str(debugmode)) + logger.debug('preparing to create log file (debug mode=%s)', str(debugmode)) # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) @@ -755,7 +755,7 @@ def create_log(workdir, logfile_name, 
tarball_name, cleanup, input_files=[], out for f in input_files + output_files: path = os.path.join(workdir, f) if os.path.exists(path): - logger.info('removing file: %s' % path) + logger.info('removing file: %s', path) remove(path) # rename the workdir for the tarball creation @@ -765,7 +765,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out workdir = newworkdir fullpath = os.path.join(workdir, logfile_name) # /some/path/to/dirname/log.tgz - logger.info('will create archive %s' % fullpath) + logger.info('will create archive %s', fullpath) try: cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % (fullpath, tarball_name) exit_code, stdout, stderr = execute(cmd) @@ -774,11 +774,11 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out else: if pilot_home != current_dir: os.chdir(pilot_home) - logger.debug('stdout = %s' % stdout) + logger.debug('stdout = %s', stdout) try: os.rename(workdir, orgworkdir) - except Exception as e: - logger.debug('exception caught: %s' % e) + except Exception as error: + logger.debug('exception caught: %s', error) def _do_stageout(job, xdata, activity, queue, title, output_dir=''): @@ -793,7 +793,7 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): :return: True in case of success transfers """ - logger.info('prepare to stage-out %d %s file(s)' % (len(xdata), title)) + logger.info('prepare to stage-out %d %s file(s)', len(xdata), title) label = 'stage-out' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? @@ -805,10 +805,10 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): pilot.util.middleware.containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, output_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-out containerisation threw a pilot exception: %s' % e) - except Exception as e: - logger.warning('stage-out containerisation threw an exception: %s' % e) + except PilotException as error: + logger.warning('stage-out containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('stage-out containerisation threw an exception: %s', error) else: try: logger.info('stage-out will not be done in a container') @@ -838,16 +838,14 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): logger.debug('stage-out client completed') logger.info('summary of transferred files:') - for e in xdata: - if not e.status: + for iofile in xdata: + if not iofile.status: status = "(not transferred)" else: - status = e.status - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + status = iofile.status + logger.info(" -- lfn=%s, status_code=%s, status=%s", iofile.lfn, iofile.status_code, status) - remain_files = [e for e in xdata if e.status not in ['transferred']] - logger.debug('remain_files=%s' % str(remain_files)) - logger.debug('xdata=%s' % str(xdata)) + remain_files = [iofile for iofile in xdata if iofile.status not in ['transferred']] return not remain_files @@ -897,8 +895,8 @@ def _stage_out_new(job, args): create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup, input_files=input_files, output_files=output_files, is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug) - except LogFileCreationFailure as e: - logger.warning('failed to create 
tar file: %s' % e) + except LogFileCreationFailure as error: + logger.warning('failed to create tar file: %s', error) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE) return False @@ -918,32 +916,27 @@ def _stage_out_new(job, args): # generate fileinfo details to be send to Panda fileinfo = {} - for e in job.outdata + job.logdata: - if e.status in ['transferred']: - logger.debug('got surl=%s' % e.surl) - logger.debug('got turl=%s' % e.turl) - fileinfo[e.lfn] = {'guid': e.guid, 'fsize': e.filesize, - 'adler32': e.checksum.get('adler32'), - 'surl': e.turl} + for iofile in job.outdata + job.logdata: + if iofile.status in ['transferred']: + fileinfo[iofile.lfn] = {'guid': iofile.guid, + 'fsize': iofile.filesize, + 'adler32': iofile.checksum.get('adler32'), + 'surl': iofile.turl} job.fileinfo = fileinfo - logger.info('prepared job.fileinfo=%s' % job.fileinfo) # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS if not is_success: # set error code + message (a more precise error code might have been set already) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED) set_pilot_state(job=job, state="failed") - logger.warning('stage-out failed') # with error: %d, %s (setting job state to failed)' % - # logger.warning('stage-out failed with error: %d, %s (setting job state to failed)' % - # (job['pilotErrorCode'], job['pilotErrorDiag'])) - # send_state(job, args, 'failed') + logger.warning('stage-out failed') return False logger.info('stage-out finished correctly') if not job.state or (job.state and job.state == 'stageout'): # is the job state already set? if so, don't change the state (unless it's the stageout state) - logger.debug('changing job state from %s to finished' % job.state) + logger.debug('changing job state from %s to finished', job.state) set_pilot_state(job=job, state="finished") # send final server update since all transfers have finished correctly @@ -984,13 +977,10 @@ def queue_monitoring(queues, traces, args): # TODO: put in data_out queue instead? 
if not _stage_out_new(job, args): - logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs " - "queue" % job.jobid) - #queues.failed_data_out.put(job) + logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue", job.jobid) put_in_queue(job, queues.failed_data_out) else: - logger.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid) - #queues.failed_jobs.put(job) + logger.info("job %s failed during stage-in, adding job object to failed_jobs queue", job.jobid) put_in_queue(job, queues.failed_jobs) # monitor the finished_data_out queue @@ -1020,12 +1010,11 @@ def queue_monitoring(queues, traces, args): set_pilot_state(job=job, state="failed") if not _stage_out_new(job, args): logger.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, " - "adding job object to failed_jobs queue" % job.jobid) + "adding job object to failed_jobs queue", job.jobid) else: logger.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job " - "object to failed_jobs queue" % job.jobid) + "object to failed_jobs queue", job.jobid) - #queues.failed_jobs.put(job) put_in_queue(job, queues.failed_jobs) if abort: diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index e7987a3a..31f4c395 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -5,7 +5,9 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 + +# Note: leave this module for now - the code might be useful for reuse import time @@ -29,9 +31,6 @@ def run(args): :returns: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'receive': receive, 'send': send} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 @@ -48,7 +47,7 @@ def run(args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. 
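
The control() functions touched in data.py and interceptor.py above (and in job.py below) all rely on the same thread/bucket pattern for surfacing worker exceptions. A minimal sketch of that pattern, assuming the pilot's ExcThread simply catches exceptions in run() and puts sys.exc_info() into its bucket; the monitoring helper below is illustrative:

import queue
import sys
import threading

class ExcThread(threading.Thread):
    """Sketch: thread wrapper that reports exceptions via a queue ('bucket')."""

    def __init__(self, bucket, target, kwargs, name):
        super().__init__(target=target, kwargs=kwargs, name=name)
        self.bucket = bucket

    def run(self):
        try:
            super().run()
        except Exception:
            # hand sys.exc_info() to the monitoring loop instead of dying silently
            self.bucket.put(sys.exc_info())

    def get_bucket(self):
        return self.bucket

def monitor_buckets(threads):
    """Sketch of the polling loop used by the control() functions."""
    while any(thread.is_alive() for thread in threads):
        for thread in threads:
            try:
                exc = thread.get_bucket().get(block=True, timeout=1)
            except queue.Empty:
                continue
            exc_type, exc_obj, exc_trace = exc
            print("thread '%s' received an exception from bucket: %s" % (thread.name, exc_obj))
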
diff --git a/pilot/control/job.py b/pilot/control/job.py index a080b8b2..94b1c94c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 from __future__ import print_function # Python 2 @@ -72,9 +72,6 @@ def control(queues, traces, args): :return: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'validate': validate, 'retrieve': retrieve, 'create_data_payload': create_data_payload, 'queue_monitor': queue_monitor, 'job_monitor': job_monitor} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, @@ -92,7 +89,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. @@ -142,8 +139,8 @@ def _validate_job(job): try: kwargs = {'job': job} job.usecontainer = container.do_use_container(**kwargs) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) return True if user.verify_job(job) else False @@ -564,8 +561,12 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.warning('received unknown server command via backchannel: %s' % cmd) # for testing debug mode - #job.debug = True - # job.debug_command = 'tail payload.stdout' + + + + job.debug = True + job.debug_command = 'du -dk' + # job.debug_command = 'tail -30 payload.stdout' # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir # job.debug_command = 'ps -ef' @@ -688,7 +689,7 @@ def process_debug_mode(job): """ # for gdb commands, use the proper gdb version (the system one may be too old) - if 'gdb ' in job.debug_command: + if job.debug_command.startswith('gdb '): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 user.preprocess_debug_command(job) @@ -696,7 +697,7 @@ def process_debug_mode(job): stdout = get_debug_stdout(job) if stdout: # in case gdb was successfully used, the payload can now be killed - if 'gdb ' in job.debug_command and job.pid: + if job.debug_command.startswith('gdb ') and job.pid: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, msg='payload was killed after gdb produced requested core file') logger.debug('will proceed to kill payload processes') @@ -715,15 +716,16 @@ def get_debug_stdout(job): if job.debug_command == 'debug': return get_payload_log_tail(job.workdir) - elif 'tail' in job.debug_command: + elif 'tail ' in job.debug_command: return get_requested_log_tail(job.debug_command, job.workdir) elif 'ls ' in job.debug_command: return get_ls(job.debug_command, job.workdir) elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: return get_general_command_stdout(job) else: - logger.warning('command not handled yet: %s' % job.debug_command) - return '' + # general command, execute and return output + exit_code, stdout, stderr = execute(job.debug_command) + 
return stdout def get_general_command_stdout(job): diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 1d601739..07829a73 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -53,11 +53,11 @@ def run_payload(self, job, cmd, out, err): logger.fatal('could not define payload command') return None - logger.info("payload execution command: %s" % executable) + logger.info("payload execution command: %s", executable) try: payload = {'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job} - logger.debug("payload: %s" % payload) + logger.debug("payload: %s", payload) logger.info("Starting EventService WorkExecutor") executor_type = self.get_executor_type() @@ -66,14 +66,14 @@ def run_payload(self, job, cmd, out, err): executor.start() logger.info("EventService WorkExecutor started") - logger.info("ESProcess started with pid: %s" % executor.get_pid()) + logger.info("ESProcess started with pid: %s", executor.get_pid()) job.pid = executor.get_pid() if job.pid: job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) except Exception as e: - logger.error('could not execute: %s' % str(e)) + logger.error('could not execute: %s', str(e)) return None return executor diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index a23c00b2..5c3d454d 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -6,7 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import os @@ -25,9 +25,9 @@ def untar_file(self, lfn, job): pfn = os.path.join(job.workdir, lfn) command = "tar -xf %s -C %s" % (pfn, job.workdir) - logger.info("Untar file: %s" % command) + logger.info("untar file: %s", command) exit_code, stdout, stderr = execute(command) - logger.info("exit_code: %s, stdout: %s, stderr: %s\n" % (exit_code, stdout, stderr)) + logger.info("exit_code: %s, stdout: %s, stderr: %s\n", exit_code, stdout, stderr) def utility_before_payload(self, job): """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8236f24b..94fba2af 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -92,7 +92,7 @@ def utility_before_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed before the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed before the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -114,7 +114,7 @@ def utility_with_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed with the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed with the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -138,7 +138,7 @@ def get_utility_command(self, order=None): cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = 
'%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed after the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed after the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -156,7 +156,7 @@ def utility_after_payload_started(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) # how should this command be executed? utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) @@ -166,8 +166,8 @@ def utility_after_payload_started(self, job): try: proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) @@ -191,7 +191,7 @@ def utility_after_payload_started_new(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) return cmd @@ -203,8 +203,8 @@ def utility_after_payload_started_new(self, job): # try: # proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, # stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) -# except Exception as e: -# logger.error('could not execute: %s' % e) +# except Exception as error: +# logger.error('could not execute: %s', error) # else: # # store process handle in job object, and keep track on how many times the command has been launched # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) @@ -233,7 +233,7 @@ def utility_after_payload_finished(self, job, order): cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed after the payload has finished: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd, cmd_dictionary.get('label') @@ -249,7 +249,7 @@ def execute_utility_command(self, cmd, job, label): exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: - logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details' % (cmd, exit_code)) + logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code) if label == 'preprocess': err = errors.PREPROCESSFAILURE elif label == 'postprocess': @@ -289,18 +289,18 @@ def write_utility_output(self, workdir, 
step, stdout, stderr): self.__postprocess_stderr_name = name_stderr name = os.path.join(workdir, step + '_stdout.txt') write_file(name, stdout, unique=True) - except PilotException as e: - logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) + except PilotException as error: + logger.warning('failed to write utility stdout to file: %s, %s', error, stdout) else: - logger.debug('wrote %s' % name) + logger.debug('wrote %s', name) try: name = os.path.join(workdir, step + '_stderr.txt') write_file(name, stderr, unique=True) - except PilotException as e: - logger.warning('failed to write utility stderr to file: %s, %s' % (e, stderr)) + except PilotException as error: + logger.warning('failed to write utility stderr to file: %s, %s', error, stderr) else: - logger.debug('wrote %s' % name) + logger.debug('wrote %s', name) def pre_payload(self, job): """ @@ -331,13 +331,13 @@ def run_command(self, cmd, label=None): """ if label: - logger.info('\n\n%s:\n\n%s\n' % (label, cmd)) + logger.info('\n\n%s:\n\n%s\n', label, cmd) if label == 'coprocess': try: out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb') err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb') - except Exception as e: - logger.warning('failed to open coprocess stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open coprocess stdout/err: %s', error) out = None err = None else: @@ -346,14 +346,14 @@ def run_command(self, cmd, label=None): try: proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err, usecontainer=False, cwd=self.__job.workdir, job=self.__job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None if type(proc) == tuple and not proc[0]: logger.error('failed to execute command') return None - logger.info('started %s -- pid=%s executable=%s' % (label, proc.pid, cmd)) + logger.info('started %s -- pid=%s executable=%s', label, proc.pid, cmd) return proc @@ -374,25 +374,23 @@ def run_payload(self, job, cmd, out, err): # add time for PILOT_PRE_PAYLOAD self.pre_payload(job) - logger.info("\n\npayload execution command:\n\n%s\n" % cmd) + logger.info("\n\npayload execution command:\n\n%s\n", cmd) try: proc = execute(cmd, workdir=job.workdir, returnproc=True, usecontainer=True, stdout=out, stderr=err, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None if type(proc) == tuple and not proc[0]: logger.error('failed to execute payload') return None - logger.info('started -- pid=%s executable=%s' % (proc.pid, cmd)) + logger.info('started -- pid=%s executable=%s', proc.pid, cmd) job.pid = proc.pid job.pgrp = os.getpgid(job.pid) set_pilot_state(job=job, state="running") #_cmd = self.utility_with_payload(job) - #if _cmd: - # logger.info('could have executed: %s' % _cmd) self.utility_after_payload_started(job) @@ -457,13 +455,13 @@ def wait_graceful(self, args, proc): for i in range(60): # Python 2/3 if args.graceful_stop.is_set(): breaker = True - logger.info('breaking -- sending SIGTERM pid=%s' % proc.pid) + logger.info('breaking -- sending SIGTERM pid=%s', proc.pid) os.killpg(os.getpgid(proc.pid), signal.SIGTERM) # proc.terminate() break time.sleep(1) if breaker: - logger.info('breaking -- sleep 3s before sending SIGKILL pid=%s' % proc.pid) + logger.info('breaking 
-- sleep 3s before sending SIGKILL pid=%s', proc.pid) time.sleep(3) proc.kill() break @@ -471,7 +469,7 @@ def wait_graceful(self, args, proc): exit_code = proc.poll() if iteration % 10 == 0: - logger.info('running: iteration=%d pid=%s exit_code=%s' % (iteration, proc.pid, exit_code)) + logger.info('running: iteration=%d pid=%s exit_code=%s', iteration, proc.pid, exit_code) if exit_code is not None: break else: @@ -504,7 +502,7 @@ def get_payload_command(self, job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) self.__traces.pilot['error_code'] = job.piloterrorcodes[0] logger.fatal( - 'could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code']) + 'could not define payload command (traces error set to: %d)', self.__traces.pilot['error_code']) return cmd @@ -527,19 +525,19 @@ def run_preprocess(self, job): if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload - logger.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) + logger.info("\n\npreprocess execution command:\n\n%s\n", cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: logger.fatal('no more HP points - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) - logger.fatal('cannot continue since preprocess failed: exit_code=%d' % exit_code) + logger.fatal('cannot continue since preprocess failed: exit_code=%d', exit_code) else: # in case the preprocess produced a command, chmod it path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) if os.path.exists(path): - logger.debug('chmod 0o755: %s' % path) + logger.debug('chmod 0o755: %s', path) os.chmod(path, 0o755) return exit_code @@ -566,7 +564,7 @@ def run(self): # noqa: C901 # abort when nothing more to run, or when the preprocess returns a special exit code iteration = 0 while True: - logger.info('payload iteration loop #%d' % (iteration + 1)) + logger.info('payload iteration loop #%d', iteration + 1) os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration show_memory_usage() @@ -592,8 +590,8 @@ def run(self): # noqa: C901 if os.environ.get('HARVESTER_HOROVOD', '') == '': exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s' % _stdout) - logger.debug('[before payload start] stderr=%s' % _stderr) + logger.debug('[before payload start] stdout=%s', _stdout) + logger.debug('[before payload start] stderr=%s', _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -622,7 +620,7 @@ def run(self): # noqa: C901 # allow for a secondary command to be started after the payload (e.g. 
a coprocess) utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2) if utility_cmd: - logger.debug('starting utility command: %s' % utility_cmd) + logger.debug('starting utility command: %s', utility_cmd) label = 'coprocess' if 'coprocess' in utility_cmd else None proc_co = self.run_command(utility_cmd, label=label) @@ -639,15 +637,15 @@ def run(self): # noqa: C901 else: state = 'finished' if exit_code == 0 else 'failed' set_pilot_state(job=self.__job, state=state) - logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) + logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state) exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s' % _stdout) - logger.debug('[after payload finish] stderr=%s' % _stderr) + logger.debug('[after payload finish] stdout=%s', _stdout) + logger.debug('[after payload finish] stderr=%s', _stderr) # stop the utility command (e.g. a coprocess if necessary if proc_co: - logger.debug('stopping utility command: %s' % utility_cmd) + logger.debug('stopping utility command: %s', utility_cmd) kill_processes(proc_co.pid) if exit_code is None: @@ -692,24 +690,24 @@ def run_utility_after_payload_finished(self, state, order): else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) elif cmd_after_payload: - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) # xcache debug if 'xcache' in cmd_after_payload: _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % _stdout) - logger.debug('[before xcache kill] stderr=%s' % _stderr) + logger.debug('[before xcache kill] stdout=%s', _stdout) + logger.debug('[before xcache kill] stderr=%s', _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug if 'xcache' in cmd_after_payload: _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % _stdout) - logger.debug('[after xcache kill] stderr=%s' % _stderr) + logger.debug('[after xcache kill] stdout=%s', _stdout) + logger.debug('[after xcache kill] stderr=%s', _stderr) return exit_code @@ -727,11 +725,11 @@ def stop_utilities(self): if utproc: user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 sig = user.get_utility_command_kill_signal(utcmd) - logger.info("stopping process \'%s\' with signal %d" % (utcmd, sig)) + logger.info("stopping process \'%s\' with signal %d", utcmd, sig) try: os.killpg(os.getpgid(utproc.pid), sig) - except Exception as e: - logger.warning('exception caught: %s (ignoring)' % e) + except Exception as error: + logger.warning('exception caught: %s (ignoring)', error) user.post_utility_command_action(utcmd, self.__job) @@ -748,4 +746,4 @@ def rename_log_files(self, iteration): if 
os.path.exists(name): os.rename(name, name + '%d' % iteration) else: - logger.warning('cannot rename %s since it does not exist' % name) + logger.warning('cannot rename %s since it does not exist', name) diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index cbbec484..5f090b5d 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -58,7 +58,7 @@ def get_dbrelease_dir(): :return: path to DBRelease (string). """ - path = os.path.expandvars('$VO_ATLAS_SW_DIR/database/DBRelease') if 'VO_ATLAS_SW_DIR' in os.environ else os.path.expandvars('$OSG_APP/database/DBRelease') + path = os.path.join(os.environ.get('VO_ATLAS_SW_DIR', 'OSG_APP'), 'database/DBRelease') if path == "" or path.startswith('OSG_APP'): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index 87d77faf..f21dfe27 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -413,8 +413,8 @@ def get_payload_environment_variables(cmd, job_id, task_id, attempt_nr, processi def get_writetoinput_filenames(writetofile): """ Extract the writeToFile file name(s). - writeToFile='tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00:AOD.15760866._000002.pool.root.1' - -> return 'tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00' + writeToFile='tmpin_mc16_13TeV.blah:AOD.15760866._000002.pool.root.1' + -> return 'tmpin_mc16_13TeV.blah' :param writetofile: string containing file name information. :return: list of file names diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b24d9de6..55163017 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '39' # build number should be reset to '1' for every new development cycle +BUILD = '40b' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 552b54885dbe2d6d25e2a4efedc26ef0136f3e61 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 14 Jun 2021 14:17:05 +0200 Subject: [PATCH 67/96] General debug commands now supported. Added event number to job metrics (to be tested). Now creating core dump when looping is detected, added to log. 
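
The "general debug commands" referred to here are server-supplied commands that the pilot now validates and executes directly, returning their stdout with the heartbeat (see the get_debug_command()/get_debug_stdout() changes below). A minimal sketch of that flow; the allowed and forbidden lists are illustrative only, and subprocess stands in for the pilot's own execute() wrapper:

import shlex
import subprocess

ALLOWED_COMMANDS = ['du', 'ls', 'ps', 'tail', 'gdb']   # illustrative, not the pilot's actual list
FORBIDDEN_COMMANDS = ['rm', 'mv', 'cp']                # illustrative, not the pilot's actual list

def run_debug_command(cmd):
    """Sketch: validate a server-supplied debug command, run it and return its stdout."""
    base = cmd.split(' ')[0]
    if ';' in cmd or base in FORBIDDEN_COMMANDS or base not in ALLOWED_COMMANDS:
        return ''  # rejected, nothing to report back
    completed = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    return completed.stdout
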
--- PILOTVERSION | 2 +- pilot/control/job.py | 124 +++++++++++++-------------------- pilot/control/payload.py | 2 +- pilot/user/atlas/jobmetrics.py | 82 +++++++++++++++++++--- pilot/util/auxiliary.py | 55 ++++++++++++++- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 17 +++++ pilot/util/loopingjob.py | 32 ++++++++- pilot/util/processes.py | 26 ------- 9 files changed, 226 insertions(+), 116 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6fca09e1..abaf199a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.40b \ No newline at end of file +2.12.1.40 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 94b1c94c..58b6c84e 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -33,7 +33,7 @@ from pilot.util import https from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, get_pilot_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - is_python3, show_memory_usage, has_instruction_sets + is_python3, show_memory_usage, has_instruction_sets, locate_core_file from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -50,7 +50,7 @@ from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process, get_pid_from_command, kill_processes +from pilot.util.processes import cleanup, threads_aborted, kill_process, kill_processes from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp @@ -158,14 +158,14 @@ def verify_error_code(job): """ if job.piloterrorcode == 0 and len(job.piloterrorcodes) > 0: - logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s' % str(job.piloterrorcodes)) + logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s', str(job.piloterrorcodes)) job.piloterrorcode = job.piloterrorcodes[0] if job.piloterrorcode != 0 and job.is_analysis(): if errors.is_recoverable(code=job.piloterrorcode): job.piloterrorcode = -abs(job.piloterrorcode) job.state = 'failed' - logger.info('failed user job is recoverable (error code=%s)' % job.piloterrorcode) + logger.info('failed user job is recoverable (error code=%s)', job.piloterrorcode) else: logger.info('failed user job is not recoverable') else: @@ -184,8 +184,6 @@ def get_proper_state(job, state): :return: valid server state (string). 
""" - logger.debug('state=%s' % state) - logger.debug('serverstate=%s' % job.serverstate) if job.serverstate == "finished" or job.serverstate == "failed": pass elif job.serverstate == "" and state != "finished" and state != "failed": @@ -194,7 +192,6 @@ def get_proper_state(job, state): job.serverstate = state else: job.serverstate = 'running' - logger.debug('serverstate=%s' % job.serverstate) return job.serverstate @@ -219,7 +216,7 @@ def publish_harvester_reports(state, args, data, job, final): # publish work report if not publish_work_report(data, path): - logger.debug('failed to write to workerAttributesFile %s' % path) + logger.debug('failed to write to workerAttributesFile %s', path) return False # check if we are in final state then write out information for output files @@ -227,9 +224,9 @@ def publish_harvester_reports(state, args, data, job, final): # Use the job information to write Harvester event_status.dump file event_status_file = get_event_status_file(args) if publish_stageout_files(job, event_status_file): - logger.debug('wrote log and output files to file %s' % event_status_file) + logger.debug('wrote log and output files to file %s', event_status_file) else: - logger.warning('could not write log and output files to file %s' % event_status_file) + logger.warning('could not write log and output files to file %s', event_status_file) return False # publish job report @@ -258,8 +255,8 @@ def write_heartbeat_to_file(data): path = os.path.join(os.environ.get('PILOT_HOME'), config.Pilot.heartbeat_message) if write_json(path, data): - logger.debug('heartbeat dictionary: %s' % data) - logger.debug('wrote heartbeat to file %s' % path) + logger.debug('heartbeat dictionary: %s', data) + logger.debug('wrote heartbeat to file %s', path) return True else: return False @@ -289,7 +286,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if state == 'finished' or state == 'failed' or state == 'holding': final = True os.environ['SERVER_UPDATE'] = SERVER_UPDATE_UPDATING - logger.info('job %s has %s - %s final server update' % (job.jobid, state, tag)) + logger.info('job %s has %s - %s final server update', job.jobid, state, tag) # make sure that job.state is 'failed' if there's a set error code if job.piloterrorcode or job.piloterrorcodes: @@ -301,7 +298,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) verify_error_code(job) else: final = False - logger.info('job %s has state \'%s\' - %s heartbeat' % (job.jobid, state, tag)) + logger.info('job %s has state \'%s\' - %s heartbeat', job.jobid, state, tag) # build the data structure needed for getJob, updateJob data = get_data_structure(job, state, args, xml=xml, metadata=metadata) @@ -323,7 +320,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt = 0 done = False while attempt < max_attempts and not done: - logger.info('job update attempt %d/%d' % (attempt + 1, max_attempts)) + logger.info('job update attempt %d/%d', attempt + 1, max_attempts) # get the URL for the PanDA server from pilot options or from config pandaserver = get_panda_server(args.url, args.port) @@ -334,8 +331,8 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt += 1 time_after = int(time.time()) - logger.info('server updateJob request completed in %ds for job %s' % (time_after - time_before, job.jobid)) - logger.info("server responded with: res = %s" % str(res)) + logger.info('server updateJob request completed in %ds for job 
%s', time_after - time_before, job.jobid) + logger.info("server responded with: res = %s", str(res)) show_memory_usage() @@ -351,9 +348,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) logger.info('skipping job update for fake test job') return True - except Exception as e: - logger.warning('exception caught while sending https request: %s' % e) - logger.warning('possibly offending data: %s' % data) + except Exception as error: + logger.warning('exception caught while sending https request: %s', error) + logger.warning('possibly offending data: %s', data) pass if final: @@ -400,7 +397,7 @@ def get_job_status_from_server(job_id, url, port): # open connection ret = https.request('{pandaserver}/server/panda/getStatus'.format(pandaserver=pandaserver), data=data) response = ret[1] - logger.info("response: %s" % str(response)) + logger.info("response: %s", str(response)) if response: try: # decode the response @@ -410,21 +407,21 @@ def get_job_status_from_server(job_id, url, port): status = response['status'] # e.g. 'holding' attempt_nr = int(response['attemptNr']) # e.g. '0' status_code = int(response['StatusCode']) # e.g. '0' - except Exception as e: + except Exception as error: logger.warning( - "exception: dispatcher did not return allowed values: %s, %s" % (str(ret), e)) + "exception: dispatcher did not return allowed values: %s, %s", str(ret), error) status = "unknown" attempt_nr = -1 status_code = 20 else: - logger.debug('server job status=%s, attempt_nr=%d, status_code=%d' % (status, attempt_nr, status_code)) + logger.debug('server job status=%s, attempt_nr=%d, status_code=%d', status, attempt_nr, status_code) else: - logger.warning("dispatcher did not return allowed values: %s" % str(ret)) + logger.warning("dispatcher did not return allowed values: %s", str(ret)) status = "unknown" attempt_nr = -1 status_code = 20 - except Exception as e: - logger.warning("could not interpret job status from dispatcher: %s" % e) + except Exception as error: + logger.warning("could not interpret job status from dispatcher: %s", error) status = 'unknown' attempt_nr = -1 status_code = -1 @@ -471,7 +468,7 @@ def get_panda_server(url, port): if default in pandaserver: rnd = random.choice([socket.getfqdn(vv) for vv in set([v[-1][0] for v in socket.getaddrinfo(default, 25443, socket.AF_INET)])]) pandaserver = pandaserver.replace(default, rnd) - logger.debug('updated %s to %s' % (default, pandaserver)) + logger.debug('updated %s to %s', default, pandaserver) return pandaserver @@ -494,15 +491,15 @@ def get_debug_command(cmd): try: tmp = cmd.split(' ') com = tmp[0] - except Exception as e: - logger.warning('failed to identify debug command: %s' % e) + except Exception as error: + logger.warning('failed to identify debug command: %s', error) else: if com not in allowed_commands: - logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) + logger.warning('command=%s is not in the list of allowed commands: %s', com, str(allowed_commands)) elif ';' in cmd or ';' in cmd: - logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) + logger.warning('debug command cannot contain \';\': \'%s\'', cmd) elif com in forbidden_commands: - logger.warning('command=%s is not allowed' % com) + logger.warning('command=%s is not allowed', com) else: debug_mode = True debug_command = cmd @@ -531,11 +528,10 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): if ' ' in cmd and 'tobekilled' not in cmd: try: job.debug, 
job.debug_command = get_debug_command(cmd) - except Exception as e: - logger.debug('exception caught in get_debug_command(): %s' % e) + except Exception as error: + logger.debug('exception caught in get_debug_command(): %s', error) elif 'tobekilled' in cmd: - logger.info('pilot received a panda server signal to kill job %s at %s' % - (job.jobid, time_stamp())) + logger.info('pilot received a panda server signal to kill job %s at %s', job.jobid, time_stamp()) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL) if job.pid: @@ -545,8 +541,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.debug('no pid to kill') args.abort_job.set() elif 'softkill' in cmd: - logger.info('pilot received a panda server signal to softkill job %s at %s' % - (job.jobid, time_stamp())) + logger.info('pilot received a panda server signal to softkill job %s at %s', job.jobid, time_stamp()) # event service kill instruction job.debug_command = 'softkill' elif 'debug' in cmd: @@ -558,20 +553,17 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): job.debug = False job.debug_command = 'debugoff' else: - logger.warning('received unknown server command via backchannel: %s' % cmd) + logger.warning('received unknown server command via backchannel: %s', cmd) # for testing debug mode - - - - job.debug = True - job.debug_command = 'du -dk' + # job.debug = True + # job.debug_command = 'du -sk' # job.debug_command = 'tail -30 payload.stdout' # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir # job.debug_command = 'ps -ef' # job.debug_command = 'ps axo pid,ppid,pgid,args' - #job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' + # job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' def add_data_structure_ids(data, version_tag): @@ -647,13 +639,13 @@ def get_data_structure(job, state, args, xml=None, metadata=None): #data['coreCount'] = mean(job.corecounts) if job.corecounts else job.corecount if job.corecounts: _mean = mean(job.corecounts) - logger.info('mean actualcorecount: %f' % _mean) + logger.info('mean actualcorecount: %f', _mean) data['meanCoreCount'] = _mean # get the number of events, should report in heartbeat in case of preempted. 
if job.nevents != 0: data['nEvents'] = job.nevents - logger.info("total number of processed events: %d (read)" % job.nevents) + logger.info("total number of processed events: %d (read)", job.nevents) else: logger.info("payload/TRF did not report the number of read events") @@ -725,6 +717,7 @@ def get_debug_stdout(job): else: # general command, execute and return output exit_code, stdout, stderr = execute(job.debug_command) + logger.info('debug_command: %s:\n\n%s\n', job.debug_command, stdout) return stdout @@ -751,17 +744,17 @@ def get_general_command_stdout(job): containerise_general_command(job, job.infosys.queuedata.container_options, label='general', container_type='container') - except PilotException as e: - logger.warning('general containerisation threw a pilot exception: %s' % e) - except Exception as e: - logger.warning('general containerisation threw an exception: %s' % e) + except PilotException as error: + logger.warning('general containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('general containerisation threw an exception: %s', error) else: ec, stdout, stderr = execute(job.debug_command) - logger.debug("%s (stdout):\n\n%s\n\n" % (job.debug_command, stdout)) - logger.debug("%s (stderr):\n\n%s\n\n" % (job.debug_command, stderr)) + logger.debug("%s (stdout):\n\n%s\n\n", job.debug_command, stdout) + logger.debug("%s (stderr):\n\n%s\n\n", job.debug_command, stderr) # in case a core file was produced, locate it - path = locate_core_file(job.debug_command) if 'gdb ' in job.debug_command else '' + path = locate_core_file(cmd=job.debug_command) if 'gdb ' in job.debug_command else '' if path: # copy it to the working directory (so it will be saved in the log) try: @@ -772,27 +765,6 @@ def get_general_command_stdout(job): return stdout -def locate_core_file(debug_command): - """ - - """ - - path = None - pid = get_pid_from_command(debug_command) - if pid: - filename = 'core.%d' % pid - path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) - if os.path.exists(path): - logger.debug('found core file at: %s' % path) - - else: - logger.debug('did not find %s in %s' % (filename, path)) - else: - logger.warning('cannot locate core file since pid could not be extracted from debug command') - - return path - - def get_ls(debug_command, workdir): """ Return the requested ls debug command. 
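
The bulk of the hunks above convert eager %-style string interpolation inside logging calls into lazy logging arguments. A minimal sketch of the difference, written outside any pilot module purely for illustration:

    import logging
    logger = logging.getLogger(__name__)

    state = 'running'
    # eager: the message is rendered even when DEBUG logging is disabled
    logger.debug('serverstate=%s' % state)
    # lazy (the form used throughout these hunks): formatting is deferred
    # until the record is actually emitted
    logger.debug('serverstate=%s', state)
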
diff --git a/pilot/control/payload.py b/pilot/control/payload.py index f6bef60e..61cb033c 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -371,7 +371,7 @@ def perform_initial_payload_error_analysis(job, exit_code): logger.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) else: # check if core dumps exist, if so remove them and return True - if remove_core_dumps(job.workdir): + if remove_core_dumps(job.workdir) and not job.debug: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) else: logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 20b5d31a..d0802040 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -5,16 +5,18 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.api import analytics from pilot.util.jobmetrics import get_job_metrics_entry +from pilot.util.filehandling import find_last_line from .cpu import get_core_count from .common import get_db_info, get_resimevents from .utilities import get_memory_monitor_output_filename import os +import re import logging logger = logging.getLogger(__name__) @@ -31,7 +33,7 @@ def get_job_metrics_string(job): # report core count (will also set corecount in job object) corecount = get_core_count(job) - logger.debug('job definition core count: %d' % corecount) + logger.debug('job definition core count: %d', corecount) #if corecount is not None and corecount != "NULL" and corecount != 'null': # job_metrics += get_job_metrics_entry("coreCount", corecount) @@ -69,14 +71,32 @@ def get_job_metrics_string(job): if max_space > zero: job_metrics += get_job_metrics_entry("workDirSize", max_space) else: - logger.info("will not add max space = %d B to job metrics" % max_space) + logger.info("will not add max space = %d B to job metrics", max_space) # get analytics data - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + job_metrics = add_analytics_data(job_metrics, job.workdir, job.state) + + # extract event number from file and add to job metrics if it exists + job_metrics = add_event_number(job_metrics, job.workdir) + + return job_metrics + + +def add_analytics_data(job_metrics, workdir, state): + """ + Add the memory leak+chi2 analytics data to the job metrics. + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :param state: job state (string). + :return: updated job metrics (string). + """ + + path = os.path.join(workdir, get_memory_monitor_output_filename()) if os.path.exists(path): client = analytics.Analytics() # do not include tails on final update - tails = False if (job.state == "finished" or job.state == "failed" or job.state == "holding") else True + tails = False if (state == "finished" or state == "failed" or state == "holding") else True data = client.get_fitted_data(path, tails=tails) slope = data.get("slope", "") chi2 = data.get("chi2", "") @@ -88,6 +108,28 @@ def get_job_metrics_string(job): return job_metrics +def add_event_number(job_metrics, workdir): + """ + Extract event number from file and add to job metrics if it exists + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :return: updated job metrics (string). 
+ """ + + path = os.path.join(workdir, 'eventLoopHeartBeat.txt') + if os.path.exists(path): + last_line = find_last_line(path) + if last_line: + event_number = get_number_in_string(last_line) + if event_number: + job_metrics += get_job_metrics_entry("eventnumber", event_number) + else: + logger.debug('file %s does not exist (skip for now)', path) + + return job_metrics + + def get_job_metrics(job): """ Return a properly formatted job metrics string. @@ -109,17 +151,41 @@ def get_job_metrics(job): job_metrics = job_metrics.lstrip().rstrip() if job_metrics != "": - logger.debug('job metrics=\"%s\"' % (job_metrics)) + logger.debug('job metrics=\"%s\"', job_metrics) else: logger.debug("no job metrics (all values are zero)") # is job_metrics within allowed size? if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)" % (len(job_metrics))) + logger.warning("job_metrics out of size (%d)",len(job_metrics)) # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] job_metrics = " ".join(job_metrics.split(" ")[:-1]) - logger.warning("job_metrics has been reduced to: %s" % (job_metrics)) + logger.warning("job_metrics has been reduced to: %s", job_metrics) return job_metrics + + +def get_number_in_string(line, pattern=r'\ done\ processing\ event\ \#(\d+)\,'): + """ + Extract a number from the given string. + + E.g. file eventLoopHeartBeat.txt contains + done processing event #20166959, run #276689 22807 events read so far <<<=== + This function will return 20166959 as in int. + + :param line: line from a file (string). + :param pattern: reg ex pattern (raw string). + :return: extracted number (int). + """ + + event_number = None + match = re.search(pattern, line) + if match: + try: + event_number = int(match.group(1)) + except Exception: + pass + + return event_number diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index bd938c7b..c1a33cec 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -5,9 +5,10 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os +import re import sys from collections import Set, Mapping, deque, OrderedDict @@ -539,3 +540,55 @@ def has_instruction_sets(instruction_sets): ret += '|%s' % i.upper() if ret else i.upper() return ret + + +def locate_core_file(cmd=None, pid=None): + """ + Locate the core file produced by gdb. + + :param cmd: optional command containing pid corresponding to core file (string). + :param pid: optional pid to use with core file (core.pid) (int). + :return: path to core file (string). + """ + + path = None + if not pid and cmd: + pid = get_pid_from_command(cmd) + if pid: + filename = 'core.%d' % pid + path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) + if os.path.exists(path): + logger.debug('found core file at: %s', path) + + else: + logger.debug('did not find %s in %s', filename, path) + else: + logger.warning('cannot locate core file since pid could not be extracted from command') + + return path + + +def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): + """ + Identify an explicit process id in the given command. + + Example: + cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' + -> pid = 19114 + + :param cmd: command containing a pid (string). + :param pattern: regex pattern (raw string). + :return: pid (int). 
+ """ + + pid = None + match = re.search(pattern, cmd) + if match: + try: + pid = int(match.group(1)) + except Exception: + pid = None + else: + print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) + + return pid diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 55163017..ef399020 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '40b' # build number should be reset to '1' for every new development cycle +BUILD = '40' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 53972c30..79ccb710 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1119,3 +1119,20 @@ def locate_file(pattern): path = fname return path + + +def find_last_line(filename): + """ + Find the last line in a (not too large) file. + + :param filename: file name, full path (string). + :return: last line (string). + """ + + last_line = "" + with open(filename) as f: + for line in f: + pass + last_line = line + + return last_line diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index e2c451a0..e45056de 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -8,10 +8,10 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 from pilot.common.errorcodes import ErrorCodes -from pilot.util.auxiliary import whoami, set_pilot_state, cut_output +from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file from pilot.util.config import config from pilot.util.container import execute -from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list +from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list, copy from pilot.util.parameters import convert_to_int from pilot.util.processes import kill_processes from pilot.util.timing import time_stamp @@ -64,6 +64,10 @@ def looping_job(job, mt): logger.info('looping limit: %d s' % looping_limit) if ct - time_last_touched > looping_limit: try: + # first produce core dump and copy it + create_core_dump(pid=job.pid, workdir=job.workdir) + # set debug mode to prevent core file from being removed before log creation + job.debug = True kill_looping_job(job) except Exception as e: logger.warning('exception caught: %s' % e) @@ -73,6 +77,30 @@ def looping_job(job, mt): return exit_code, diagnostics +def create_core_dump(pid=None, workdir=None): + """ + Create core dump and copy it to work directory + """ + + if not pid or not workdir: + logger.warning('cannot create core file since pid or workdir is unknown') + return + + cmd = 'gdb --pid %d -ex \'generate-core-file\'' % pid + exit_code, stdout, stderr = execute(cmd) + if not exit_code: + path = locate_core_file(pid=pid) + if path: + try: + copy(path, workdir) + except Exception as error: + logger.warning('failed to copy core file: %s', error) + else: + logger.debug('copied core dump to workdir') + + else: + logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr) + def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. 
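
The hunks above add locate_core_file() and get_pid_from_command() to pilot/util/auxiliary.py (the duplicate copy of get_pid_from_command() in pilot/util/processes.py is dropped in the following hunk), and the looping-job killer now calls create_core_dump(pid=job.pid, workdir=job.workdir) and sets job.debug = True so that the copied core file survives until the log is created. A rough usage sketch of the relocated helpers, with an invented pid and path for illustration only:

    from pilot.util.auxiliary import get_pid_from_command, locate_core_file

    cmd = "gdb --pid 19114 -ex 'generate-core-file'"
    pid = get_pid_from_command(cmd)    # extracts 19114 from the gdb command
    path = locate_core_file(pid=pid)   # looks for $PILOT_HOME/core.19114
    if path:
        print('found core file: %s' % path)
    else:
        print('no core file was produced')
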
diff --git a/pilot/util/processes.py b/pilot/util/processes.py index e5b94ae8..6ee9b84d 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -757,29 +757,3 @@ def is_child(pid, pandaid_pid, dictionary): else: # try another pid return is_child(ppid, pandaid_pid, dictionary) - - -def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): - """ - Identify an explicit process id in the given command. - - Example: - cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' - -> pid = 19114 - - :param cmd: command containing a pid (string). - :param pattern: regex pattern (raw string). - :return: pid (int). - """ - - pid = None - match = re.search(pattern, cmd) - if match: - try: - pid = int(match.group(1)) - except Exception: - pid = None - else: - print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) - - return pid From 29a819adafaf33db8855c1a71830f8917d65f82e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 14 Jun 2021 14:31:35 +0200 Subject: [PATCH 68/96] Added prmon to list with unwanted files in looping job killer --- pilot/user/generic/loopingjob_definitions.py | 1 + pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/user/generic/loopingjob_definitions.py b/pilot/user/generic/loopingjob_definitions.py index ad392257..9f64b65c 100644 --- a/pilot/user/generic/loopingjob_definitions.py +++ b/pilot/user/generic/loopingjob_definitions.py @@ -34,6 +34,7 @@ def remove_unwanted_files(workdir, files): _files = [] for _file in files: if not (workdir == _file or + "prmon" in _file or "pilotlog" in _file or ".lib.tgz" in _file or ".py" in _file or diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ef399020..273dcf6c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '40' # build number should be reset to '1' for every new development cycle +BUILD = '41' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From f53b7d5bf3edb87778fa6aa5f24d1b0ae20a0f18 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:31:54 +0200 Subject: [PATCH 69/96] Pylint updates. 
Fixed debug mode to allow 'debug,' --- PILOTVERSION | 2 +- pilot.py | 6 +- pilot/api/data.py | 6 +- pilot/control/job.py | 243 +++++++++++++++++----------------- pilot/control/payload.py | 36 ++--- pilot/user/atlas/dbrelease.py | 48 +++---- pilot/user/atlas/setup.py | 36 +++-- pilot/util/auxiliary.py | 8 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 114 ++++++++-------- pilot/util/harvester.py | 4 +- pilot/util/loopingjob.py | 30 ++--- pilot/util/monitoring.py | 83 ++++++------ pilot/util/processes.py | 101 +++++++------- 14 files changed, 358 insertions(+), 361 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index abaf199a..fb45e883 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.40 \ No newline at end of file +2.12.1.44 \ No newline at end of file diff --git a/pilot.py b/pilot.py index 82fc4d99..c3fc0e7c 100755 --- a/pilot.py +++ b/pilot.py @@ -363,10 +363,10 @@ def create_main_work_dir(args): try: # create the main PanDA Pilot work directory mkdirs(mainworkdir) - except Exception as e: + except PilotException as error: # print to stderr since logging has not been established yet - print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, e), file=sys.stderr) - exit_code = shell_exit_code(e._errorCode) + print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, error), file=sys.stderr) + exit_code = shell_exit_code(error._errorCode) else: mainworkdir = getcwd() diff --git a/pilot/api/data.py b/pilot/api/data.py index 7f104b54..ccdc874d 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -272,8 +272,8 @@ def resolve_replicas(self, files, use_vp=False): try: replicas = c.list_replicas(**query) - except Exception as e: - raise PilotException("Failed to get replicas from Rucio: %s" % e, code=ErrorCodes.RUCIOLISTREPLICASFAILED) + except Exception as error: + raise PilotException("Failed to get replicas from Rucio: %s" % error, code=ErrorCodes.RUCIOLISTREPLICASFAILED) show_memory_usage() @@ -775,7 +775,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 primary_schemas = (self.direct_remoteinput_allowed_schemas if fspec.direct_access_wan and fspec.is_directaccess(ensure_replica=False) else None) xschemas = self.remoteinput_allowed_schemas - allowed_schemas = [e for e in allowed_schemas if e in xschemas] if allowed_schemas else xschemas + allowed_schemas = [schema for schema in allowed_schemas if schema in xschemas] if allowed_schemas else xschemas replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='wan') if not replica and fspec.allow_wan: diff --git a/pilot/control/job.py b/pilot/control/job.py index 58b6c84e..11a3de6a 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -488,6 +488,10 @@ def get_debug_command(cmd): allowed_commands = ['tail', 'ls', 'ps', 'gdb', 'du'] forbidden_commands = ['rm'] + + # remove any 'debug,' command that the server might send redundantly + if ',' in cmd and 'debug' in cmd: + cmd = cmd.replace('debug,', '').replace(',debug', '') try: tmp = cmd.split(' ') com = tmp[0] @@ -784,7 +788,7 @@ def get_ls(debug_command, workdir): debug_command = debug_command.replace(path, finalpath) ec, stdout, stderr = execute(debug_command) - logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + logger.debug("%s:\n\n%s\n\n", debug_command, stdout) return stdout @@ -806,8 +810,8 @@ def get_requested_log_tail(debug_command, workdir): items = debug_command.split(' ') cmd = items[0] options = ' '.join(items[1:]) - logger.debug('debug command: %s' % 
cmd) - logger.debug('debug options: %s' % options) + logger.debug('debug command: %s', cmd) + logger.debug('debug options: %s', options) # assume that the path is the last of the options; path = options.split(' ')[-1] if ' ' in options else options @@ -816,13 +820,13 @@ def get_requested_log_tail(debug_command, workdir): # find all files with the given pattern and pick the latest updated file (if several) files = glob(fullpath) if files: - logger.info('files found: %s' % str(files)) + logger.info('files found: %s', str(files)) _tail = get_latest_log_tail(files) else: - logger.warning('did not find \'%s\' in path %s' % (path, fullpath)) + logger.warning('did not find \'%s\' in path %s', path, fullpath) if _tail: - logger.debug('tail =\n\n%s\n\n' % _tail) + logger.debug('tail =\n\n%s\n\n', _tail) return _tail @@ -840,7 +844,7 @@ def add_error_codes(data, job): pilot_error_code = job.piloterrorcode pilot_error_codes = job.piloterrorcodes if pilot_error_codes != []: - logger.warning('pilotErrorCodes = %s (will report primary/first error code)' % str(pilot_error_codes)) + logger.warning('pilotErrorCodes = %s (will report primary/first error code)', str(pilot_error_codes)) data['pilotErrorCode'] = pilot_error_codes[0] else: data['pilotErrorCode'] = pilot_error_code @@ -849,7 +853,7 @@ def add_error_codes(data, job): pilot_error_diag = job.piloterrordiag pilot_error_diags = job.piloterrordiags if pilot_error_diags != []: - logger.warning('pilotErrorDiags = %s (will report primary/first error diag)' % str(pilot_error_diags)) + logger.warning('pilotErrorDiags = %s (will report primary/first error diag)', str(pilot_error_diags)) data['pilotErrorDiag'] = pilot_error_diags[0] else: data['pilotErrorDiag'] = pilot_error_diag @@ -874,7 +878,7 @@ def get_cpu_consumption_time(cpuconsumptiontime): except Exception: constime = None if constime and constime > 10 ** 9: - logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)" % constime) + logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)", constime) constime = -1 return constime @@ -903,7 +907,7 @@ def add_timing_and_extracts(data, job, state, args): user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 extracts = user.get_log_extracts(job, state) if extracts != "": - logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]' % extracts) + logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]', extracts) data['pilotLog'] = extracts[:1024] data['endTime'] = time.time() @@ -924,8 +928,8 @@ def add_memory_info(data, workdir, name=""): try: utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) - except Exception as e: - logger.info('memory information not available: %s' % e) + except Exception as error: + logger.info('memory information not available: %s', error) pass @@ -947,16 +951,15 @@ def remove_pilot_logs_from_list(list_of_files): config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot'] - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) to_be_removed = [] new_list_of_files = [] for filename in list_of_files: - if os.path.basename(filename) not in 
to_be_removed and '/pilot/' not in filename: + if os.path.basename(filename) not in to_be_removed and '/pilot/' not in filename and 'prmon' not in filename: new_list_of_files.append(filename) - #logger.debug('list_of_files=%s' % str(new_list_of_files)) return new_list_of_files @@ -975,7 +978,7 @@ def get_payload_log_tail(workdir): list_of_files = remove_pilot_logs_from_list(list_of_files) if not list_of_files: - logger.info('no log files were found (will use default %s)' % config.Payload.payloadstdout) + logger.info('no log files were found (will use default %s)', config.Payload.payloadstdout) list_of_files = [os.path.join(workdir, config.Payload.payloadstdout)] return get_latest_log_tail(list_of_files) @@ -992,13 +995,13 @@ def get_latest_log_tail(files): try: latest_file = max(files, key=os.path.getmtime) - logger.info('tail of file %s will be added to heartbeat' % latest_file) + logger.info('tail of file %s will be added to heartbeat', latest_file) # now get the tail of the found log file and protect against potentially large tails stdout_tail = latest_file + "\n" + tail(latest_file) stdout_tail = stdout_tail[-2048:] - except Exception as e: - logger.warning('failed to get payload stdout tail: %s' % e) + except Exception as error: + logger.warning('failed to get payload stdout tail: %s', error) return stdout_tail @@ -1024,7 +1027,7 @@ def validate(queues, traces, args): # set the environmental variable for the task id os.environ['PanDA_TaskID'] = str(job.taskid) - logger.info('processing PanDA job %s from task %s' % (job.jobid, job.taskid)) + logger.info('processing PanDA job %s from task %s', job.jobid, job.taskid) if _validate_job(job): @@ -1032,16 +1035,16 @@ def validate(queues, traces, args): os.setpgrp() job_dir = os.path.join(args.mainworkdir, 'PanDA_Pilot-%s' % job.jobid) - logger.debug('creating job working directory: %s' % job_dir) + logger.debug('creating job working directory: %s', job_dir) try: os.mkdir(job_dir) os.chmod(job_dir, 0o770) job.workdir = job_dir - except Exception as e: - logger.debug('cannot create working directory: %s' % str(e)) + except Exception as error: + logger.debug('cannot create working directory: %s', error) traces.pilot['error_code'] = errors.MKDIR job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(traces.pilot['error_code']) - job.piloterrordiag = e + job.piloterrordiag = error put_in_queue(job, queues.failed_jobs) break else: @@ -1051,15 +1054,15 @@ def validate(queues, traces, args): # # stream the job object to file # job_dict = job.to_json() # write_json(os.path.join(job.workdir, 'job.json'), job_dict) -# except Exception as e: -# logger.debug('exception caught: %s' % e) +# except Exception as error: +# logger.debug('exception caught: %s', error) # else: # try: # _job_dict = read_json(os.path.join(job.workdir, 'job.json')) # job_dict = loads(_job_dict) # _job = JobData(job_dict, use_kmap=False) -# except Exception as e: -# logger.warning('exception caught: %s' % e) +# except Exception as error: +# logger.warning('exception caught: %s', error) create_symlink(from_path='../%s' % config.Pilot.pilotlog, to_path=os.path.join(job_dir, config.Pilot.pilotlog)) @@ -1068,8 +1071,8 @@ def validate(queues, traces, args): utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: utilities.precleanup() - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) # store the PanDA job id for the 
wrapper to pick up store_jobid(job.jobid, args.sourcedir) @@ -1080,7 +1083,7 @@ def validate(queues, traces, args): # make sure that ctypes is available (needed at the end by orphan killer) verify_ctypes(queues, job) else: - logger.debug('Failed to validate job=%s' % job.jobid) + logger.debug('Failed to validate job=%s', job.jobid) put_in_queue(job, queues.failed_jobs) # proceed to set the job_aborted flag? @@ -1104,11 +1107,11 @@ def verify_ctypes(queues, job): try: import ctypes - except Exception as e: - diagnostics = 'ctypes python module could not be imported: %s' % e + except Exception as error: + diagnostics = 'ctypes python module could not be imported: %s' % error logger.warning(diagnostics) #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) - #logger.debug('Failed to validate job=%s' % job.jobid) + #logger.debug('Failed to validate job=%s', job.jobid) #put_in_queue(job, queues.failed_jobs) else: logger.debug('ctypes python module imported') @@ -1141,7 +1144,7 @@ def delayed_space_check(queues, traces, args, job): traces.pilot['error_code'] = errors.NOLOCALSPACE # set the corresponding error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) + logger.debug('Failed to validate job=%s', job.jobid) put_in_queue(job, queues.failed_jobs) else: put_in_queue(job, queues.validated_jobs) @@ -1177,10 +1180,10 @@ def store_jobid(jobid, init_dir): path = os.path.join(os.path.join(init_dir, 'pilot2'), config.Pilot.jobid_file) path = path.replace('pilot2/pilot2', 'pilot2') # dirty fix for bad paths mode = 'a' if os.path.exists(path) else 'w' - logger.debug('path=%s mode=%s' % (path, mode)) + logger.debug('path=%s mode=%s', path, mode) write_file(path, "%s\n" % str(jobid), mode=mode, mute=False) - except Exception as e: - logger.warning('exception caught while trying to store job id: %s' % e) + except Exception as error: + logger.warning('exception caught while trying to store job id: %s', error) def create_data_payload(queues, traces, args): @@ -1331,7 +1334,7 @@ def get_dispatcher_dictionary(args): taskid = get_task_id() if taskid != "" and args.allow_same_user: data['taskID'] = taskid - logger.info("will download a new job belonging to task id: %s" % (data['taskID'])) + logger.info("will download a new job belonging to task id: %s", data['taskID']) if args.resource_type != "": data['resourceType'] = args.resource_type @@ -1401,7 +1404,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge maximum_getjob_requests = 60 if harvester else max_getjob_requests # 1 s apart (if harvester) if getjob_requests > int(maximum_getjob_requests): - logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot' % maximum_getjob_requests) + logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot', maximum_getjob_requests) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ -1415,7 +1418,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge return False if (currenttime - starttime > timefloor) and jobnumber > 0: - logger.warning("the pilot has run out of time (timefloor=%d has been passed)" % timefloor) + logger.warning("the pilot has run out of time (timefloor=%d has been passed)", timefloor) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ 
-1423,8 +1426,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # timefloor not relevant for the first job if jobnumber > 0: - logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job' % - (timefloor, currenttime - starttime)) + logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job',timefloor, currenttime - starttime) if harvester and jobnumber > 0: # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job @@ -1454,7 +1456,7 @@ def getjob_server_command(url, port): if not findall(port_pattern, url): url = url + ':%s' % port else: - logger.debug('URL already contains port: %s' % url) + logger.debug('URL already contains port: %s', url) else: url = config.Pilot.pandaserver if url == "": @@ -1482,7 +1484,7 @@ def get_job_definition_from_file(path, harvester): if is_json(path): job_definition_list = parse_job_definition_file(path) if not job_definition_list: - logger.warning('no jobs were found in Harvester job definitions file: %s' % path) + logger.warning('no jobs were found in Harvester job definitions file: %s', path) return {} else: # remove the job definition file from the original location, place a renamed copy in the pilot dir @@ -1498,11 +1500,11 @@ def get_job_definition_from_file(path, harvester): with open(path, 'r') as jobdatafile: response = jobdatafile.read() if len(response) == 0: - logger.fatal('encountered empty job definition file: %s' % path) + logger.fatal('encountered empty job definition file: %s', path) res = None # this is a fatal error, no point in continuing as the file will not be replaced else: # parse response message - # logger.debug('%s:\n\n%s\n\n' % (path, response)) + # logger.debug('%s:\n\n%s\n\n', path, response) try: from urlparse import parse_qsl # Python 2 except Exception: @@ -1534,7 +1536,7 @@ def get_job_definition_from_server(args): cmd = getjob_server_command(args.url, args.port) if cmd != "": - logger.info('executing server command: %s' % cmd) + logger.info('executing server command: %s', cmd) res = https.request(cmd, data=data) return res @@ -1585,7 +1587,7 @@ def get_job_definition(args): logger.info('will use a fake PanDA job') res = get_fake_job() elif os.path.exists(path): - logger.info('will read job definition from file %s' % path) + logger.info('will read job definition from file %s', path) res = get_job_definition_from_file(path, args.harvester) else: if args.harvester and args.harvester_submitmode.lower() == 'push': @@ -1733,7 +1735,7 @@ def get_fake_job(input=True): 'taskID': 'NULL', 'logFile': '%s.job.log.tgz' % job_name} else: - logger.warning('unknown test job type: %s' % config.Pilot.testjobtype) + logger.warning('unknown test job type: %s', config.Pilot.testjobtype) if res: if not input: @@ -1747,7 +1749,7 @@ def get_fake_job(input=True): if config.Pilot.testtransfertype == "NULL" or config.Pilot.testtransfertype == 'direct': res['transferType'] = config.Pilot.testtransfertype else: - logger.warning('unknown test transfer type: %s (ignored)' % config.Pilot.testtransfertype) + logger.warning('unknown test transfer type: %s (ignored)', config.Pilot.testtransfertype) if config.Pilot.testjobcommand == 'sleep': res['transformation'] = 'sleep' @@ -1826,7 +1828,7 @@ def retrieve(queues, traces, args): # noqa: C901 # get a job definition from a source (file or server) res = get_job_definition(args) - logger.info('job definition = %s' % str(res)) + logger.info('job 
definition = %s', str(res)) if res is None: logger.fatal('fatal error in job download loop - cannot continue') @@ -1839,7 +1841,7 @@ def retrieve(queues, traces, args): # noqa: C901 if not res: delay = get_job_retrieval_delay(args.harvester) if not args.harvester: - logger.warning('did not get a job -- sleep %d s and repeat' % delay) + logger.warning('did not get a job -- sleep %d s and repeat', delay) for i in range(delay): if args.graceful_stop.is_set(): break @@ -1848,7 +1850,7 @@ def retrieve(queues, traces, args): # noqa: C901 # it seems the PanDA server returns StatusCode as an int, but the aCT returns it as a string # note: StatusCode keyword is not available in job definition files from Harvester (not needed) if 'StatusCode' in res and res['StatusCode'] != '0' and res['StatusCode'] != 0: - logger.warning('did not get a job -- sleep 60s and repeat -- status: %s' % res['StatusCode']) + logger.warning('did not get a job -- sleep 60s and repeat -- status: %s', res['StatusCode']) for i in range(60): if args.graceful_stop.is_set(): break @@ -1864,11 +1866,11 @@ def retrieve(queues, traces, args): # noqa: C901 #try: # job_status, job_attempt_nr, job_status_code = get_job_status_from_server(job.jobid, args.url, args.port) # if job_status == "running": - # pilot_error_diag = "job %s is already running elsewhere - aborting" % (job.jobid) + # pilot_error_diag = "job %s is already running elsewhere - aborting" % job.jobid # logger.warning(pilot_error_diag) # raise JobAlreadyRunning(pilot_error_diag) - #except Exception as e: - # logger.warning("%s" % e) + #except Exception as error: + # logger.warning("%s", error) # write time stamps to pilot timing file # note: PILOT_POST_GETJOB corresponds to START_TIME in Pilot 1 add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) @@ -1941,8 +1943,8 @@ def create_job(dispatcher_response, queue): #job.workdir = os.getcwd() - logger.info('received job: %s (sleep until the job has finished)' % job.jobid) - logger.info('job details: \n%s' % job) + logger.info('received job: %s (sleep until the job has finished)', job.jobid) + logger.info('job details: \n%s', job) # payload environment wants the PANDAID to be set, also used below os.environ['PANDAID'] = job.jobid @@ -1968,13 +1970,13 @@ def has_job_completed(queues, args): else: make_job_report(job) cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') - logger.debug('%s:\n' % cmd) + logger.debug('%s:\n', cmd) ec, stdout, stderr = execute(cmd) logger.debug(stdout) queue_report(queues) job.reset_errors() - logger.info("job %s has completed (purged errors)" % job.jobid) + logger.info("job %s has completed (purged errors)", job.jobid) # cleanup of any remaining processes if job.pid: @@ -1987,14 +1989,14 @@ def has_job_completed(queues, args): #finished_queue_snapshot = list(queues.finished_jobs.queue) #peek = [obj for obj in finished_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (finished)" % jobid) + # logger.info("job %s has completed (finished)", jobid) # return True # is there anything in the failed_jobs queue? 
#failed_queue_snapshot = list(queues.failed_jobs.queue) #peek = [obj for obj in failed_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (failed)" % jobid) + # logger.info("job %s has completed (failed)", jobid) # return True return False @@ -2021,31 +2023,31 @@ def get_job_from_queue(queues, state): else: # make sure that state=failed set_pilot_state(job=job, state=state) - logger.info("job %s has state=%s" % (job.jobid, job.state)) + logger.info("job %s has state=%s", job.jobid, job.state) return job -def is_queue_empty(queues, q): +def is_queue_empty(queues, queue): """ Check if the given queue is empty (without pulling). :param queues: pilot queues object. - :param q: queue name (string). + :param queue: queue name (string). :return: True if queue is empty, False otherwise """ status = False - if q in queues._fields: - _q = getattr(queues, q) - jobs = list(_q.queue) + if queue in queues._fields: + _queue = getattr(queues, queue) + jobs = list(_queue.queue) if len(jobs) > 0: - logger.info('queue %s not empty: found %d job(s)' % (q, len(jobs))) + logger.info('queue %s not empty: found %d job(s)', queue, len(jobs)) else: - logger.info('queue %s is empty' % q) + logger.info('queue %s is empty', queue) status = True else: - logger.warning('queue %s not present in %s' % (q, queues._fields)) + logger.warning('queue %s not present in %s', queue, queues._fields) return status @@ -2072,7 +2074,7 @@ def order_log_transfer(queues, job): while n < nmax: # refresh the log_transfer since it might have changed log_transfer = job.get_status('LOG_TRANSFER') - logger.info('waiting for log transfer to finish (#%d/#%d): %s' % (n + 1, nmax, log_transfer)) + logger.info('waiting for log transfer to finish (#%d/#%d): %s', n + 1, nmax, log_transfer) if is_queue_empty(queues, 'data_out') and \ (log_transfer == LOG_TRANSFER_DONE or log_transfer == LOG_TRANSFER_FAILED): # set in data component logger.info('stage-out of log has completed') @@ -2083,7 +2085,7 @@ def order_log_transfer(queues, job): time.sleep(2) n += 1 - logger.info('proceeding with server update (n=%d)' % n) + logger.info('proceeding with server update (n=%d)', n) def wait_for_aborted_job_stageout(args, queues, job): @@ -2101,9 +2103,9 @@ def wait_for_aborted_job_stageout(args, queues, job): time_since_kill = get_time_since('1', PILOT_KILL_SIGNAL, args) was_killed = was_pilot_killed(args.timing) if was_killed: - logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished' % time_since_kill) - except Exception as e: - logger.warning('exception caught: %s' % e) + logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished', time_since_kill) + except Exception as error: + logger.warning('exception caught: %s', error) time_since_kill = 60 else: if time_since_kill > 60 or time_since_kill < 0: # fail-safe @@ -2113,7 +2115,7 @@ def wait_for_aborted_job_stageout(args, queues, job): # if stage-out has not finished, we need to wait (less than two minutes or the batch system will issue # a hard SIGKILL) max_wait_time = 2 * 60 - time_since_kill - 5 - logger.debug('using max_wait_time = %d s' % max_wait_time) + logger.debug('using max_wait_time = %d s', max_wait_time) t0 = time.time() while time.time() - t0 < max_wait_time: if job in queues.finished_data_out.queue or job in queues.failed_data_out.queue: @@ -2180,14 +2182,14 @@ def queue_monitor(queues, traces, args): # noqa: C901 while i < imax and os.environ.get('PILOT_WRAP_UP', '') == 'NORMAL': 
job = get_finished_or_failed_job(args, queues) if job: - logger.debug('returned job has state=%s' % job.state) + logger.debug('returned job has state=%s', job.state) #if job.state == 'failed': # logger.warning('will abort failed job (should prepare for final server update)') break i += 1 state = get_pilot_state() # the job object is not available, but the state is also kept in PILOT_JOB_STATE if state != 'stage-out': - # logger.info("no need to wait since job state=\'%s\'" % state) + # logger.info("no need to wait since job state=\'%s\'", state) break pause_queue_monitor(1) if not abort_thread else pause_queue_monitor(10) @@ -2197,7 +2199,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 completed_jobids = queues.completed_jobids.queue if queues.completed_jobids else [] if job and job.jobid not in completed_jobids: - logger.info("preparing for final server update for job %s in state=\'%s\'" % (job.jobid, job.state)) + logger.info("preparing for final server update for job %s in state=\'%s\'", job.jobid, job.state) if args.job_aborted.is_set(): # wait for stage-out to finish for aborted job @@ -2214,7 +2216,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 logger.warning('failed to dequeue job: queue is empty (did job fail before job monitor started?)') make_job_report(job) else: - logger.debug('job %s was dequeued from the monitored payloads queue' % _job.jobid) + logger.debug('job %s was dequeued from the monitored payloads queue', _job.jobid) # now ready for the next job (or quit) put_in_queue(job.jobid, queues.completed_jobids) @@ -2250,8 +2252,8 @@ def update_server(job, args): metadata = user.get_metadata(job.workdir) try: user.update_server(job) - except Exception as e: - logger.warning('exception caught in update_server(): %s' % e) + except Exception as error: + logger.warning('exception caught in update_server(): %s', error) if job.fileinfo: send_state(job, args, job.state, xml=dumps(job.fileinfo), metadata=metadata) else: @@ -2266,7 +2268,7 @@ def pause_queue_monitor(delay): :return: """ - logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s' % delay) + logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s', delay) time.sleep(delay) @@ -2323,8 +2325,8 @@ def get_heartbeat_period(debug=False): try: return int(config.Pilot.heartbeat if not debug else config.Pilot.debug_heartbeat) - except Exception as e: - logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)' % e) + except Exception as error: + logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)', error) return 1800 @@ -2338,7 +2340,7 @@ def check_for_abort_job(args, caller=''): """ abort_job = False if args.abort_job.is_set(): - logger.warning('%s detected an abort_job request (signal=%s)' % (caller, args.signal)) + logger.warning('%s detected an abort_job request (signal=%s)', caller, args.signal) logger.warning('in case pilot is running more than one job, all jobs will be aborted') abort_job = True @@ -2371,8 +2373,7 @@ def interceptor(queues, traces, args): jobs = queues.monitored_payloads.queue if jobs: for i in range(len(jobs)): - - logger.info('interceptor loop %d: looking for communication file' % n) + logger.info('interceptor loop %d: looking for communication file', n) time.sleep(30) n += 1 @@ -2439,8 +2440,8 @@ def job_monitor(queues, traces, args): # noqa: C901 # note: when sending a state change to the server, the server might respond with 'tobekilled' 
try: jobs[i] - except Exception as e: - logger.warning('detected stale jobs[i] object in job_monitor: %s' % e) + except Exception as error: + logger.warning('detected stale jobs[i] object in job_monitor: %s', error) else: if jobs[i].state == 'failed': logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') @@ -2464,9 +2465,9 @@ def job_monitor(queues, traces, args): # noqa: C901 peeking_time = int(time.time()) for i in range(len(jobs)): current_id = jobs[i].jobid - logger.info('monitor loop #%d: job %d:%s is in state \'%s\'' % (n, i, current_id, jobs[i].state)) + logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': - logger.info('will abort job monitoring soon since job state=%s (job is still in queue)' % jobs[i].state) + logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) break # perform the monitoring tasks @@ -2482,8 +2483,8 @@ def job_monitor(queues, traces, args): # noqa: C901 else: try: fail_monitored_job(jobs[i], exit_code, diagnostics, queues, traces) - except Exception as e: - logger.warning('(1) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(1) exception caught: %s (job id=%s)', error, current_id) break # run this check again in case job_monitor_tasks() takes a long time to finish (and the job object @@ -2491,15 +2492,15 @@ def job_monitor(queues, traces, args): # noqa: C901 try: _job = jobs[i] except Exception: - logger.info('aborting job monitoring since job object (job id=%s) has expired' % current_id) + logger.info('aborting job monitoring since job object (job id=%s) has expired', current_id) break # send heartbeat if it is time (note that the heartbeat function might update the job object, e.g. 
# by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(_job, args, update_time) - except Exception as e: - logger.warning('(2) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(2) exception caught: %s (job id=%s)', error, current_id) break else: # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -2594,7 +2595,7 @@ def fail_monitored_job(job, exit_code, diagnostics, queues, traces): job.piloterrordiag = diagnostics traces.pilot['error_code'] = exit_code put_in_queue(job, queues.failed_payloads) - logger.info('aborting job monitoring since job state=%s' % job.state) + logger.info('aborting job monitoring since job state=%s', job.state) def make_job_report(job): @@ -2609,37 +2610,37 @@ def make_job_report(job): logger.info('') logger.info('job summary report') logger.info('--------------------------------------------------') - logger.info('PanDA job id: %s' % job.jobid) - logger.info('task id: %s' % job.taskid) + logger.info('PanDA job id: %s', job.jobid) + logger.info('task id: %s', job.taskid) n = len(job.piloterrorcodes) if n > 0: for i in range(n): - logger.info('error %d/%d: %s: %s' % (i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i])) + logger.info('error %d/%d: %s: %s', i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i]) else: logger.info('errors: (none)') if job.piloterrorcode != 0: - logger.info('pilot error code: %d' % job.piloterrorcode) - logger.info('pilot error diag: %s' % job.piloterrordiag) + logger.info('pilot error code: %d', job.piloterrorcode) + logger.info('pilot error diag: %s', job.piloterrordiag) info = "" for key in job.status: info += key + " = " + job.status[key] + " " - logger.info('status: %s' % info) + logger.info('status: %s', info) s = "" if job.is_analysis() and job.state != 'finished': s = '(user job is recoverable)' if errors.is_recoverable(code=job.piloterrorcode) else '(user job is not recoverable)' - logger.info('pilot state: %s %s' % (job.state, s)) - logger.info('transexitcode: %d' % job.transexitcode) - logger.info('exeerrorcode: %d' % job.exeerrorcode) - logger.info('exeerrordiag: %s' % job.exeerrordiag) - logger.info('exitcode: %d' % job.exitcode) - logger.info('exitmsg: %s' % job.exitmsg) - logger.info('cpuconsumptiontime: %d %s' % (job.cpuconsumptiontime, job.cpuconsumptionunit)) - logger.info('nevents: %d' % job.nevents) - logger.info('neventsw: %d' % job.neventsw) - logger.info('pid: %s' % job.pid) - logger.info('pgrp: %s' % str(job.pgrp)) - logger.info('corecount: %d' % job.corecount) - logger.info('event service: %s' % str(job.is_eventservice)) - logger.info('sizes: %s' % str(job.sizes)) + logger.info('pilot state: %s %s', job.state, s) + logger.info('transexitcode: %d', job.transexitcode) + logger.info('exeerrorcode: %d', job.exeerrorcode) + logger.info('exeerrordiag: %s', job.exeerrordiag) + logger.info('exitcode: %d', job.exitcode) + logger.info('exitmsg: %s', job.exitmsg) + logger.info('cpuconsumptiontime: %d %s', job.cpuconsumptiontime, job.cpuconsumptionunit) + logger.info('nevents: %d', job.nevents) + logger.info('neventsw: %d', job.neventsw) + logger.info('pid: %s', job.pid) + logger.info('pgrp: %s', str(job.pgrp)) + logger.info('corecount: %d', job.corecount) + logger.info('event service: %s', str(job.is_eventservice)) + logger.info('sizes: %s', str(job.sizes)) logger.info('--------------------------------------------------') logger.info('') diff 
--git a/pilot/control/payload.py b/pilot/control/payload.py index 61cb033c..33029f0c 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -8,7 +8,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2017-2018 import os @@ -64,7 +64,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. @@ -146,8 +146,8 @@ def _validate_payload(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: status = user.validate(job) - except Exception as e: - logger.fatal('failed to execute user validate() function: %s' % e) + except Exception as error: + logger.fatal('failed to execute user validate() function: %s', error) status = False return status @@ -213,13 +213,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 #queues.monitored_payloads.put(job) put_in_queue(job, queues.monitored_payloads) - logger.info('job %s added to monitored payloads queue' % job.jobid) + logger.info('job %s added to monitored payloads queue', job.jobid) try: out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb') err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb') - except Exception as e: - logger.warning('failed to open payload stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open payload stdout/err: %s', error) out = None err = None send_state(job, args, 'starting') @@ -230,7 +230,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 break payload_executor = get_payload_executor(args, job, out, err, traces) - logger.info("Got payload executor: %s" % payload_executor) + logger.info("Got payload executor: %s", payload_executor) show_memory_usage() @@ -252,13 +252,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 0) # Python 2/3 try: user.update_output_for_hpo(job) - except Exception as e: - logger.warning('exception caught by update_output_for_hpo(): %s' % e) + except Exception as error: + logger.warning('exception caught by update_output_for_hpo(): %s', error) else: for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning('guid not set: generated guid=%s for lfn=%s', dat.guid, dat.lfn) #if traces.pilot['nr_jobs'] == 1: # logger.debug('faking job failure in first multi-job') @@ -275,8 +275,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: exit_code_interpret = user.interpret(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) #exit_code_interpret = -1 job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.INTERNALPILOTPROBLEM) @@ -298,8 +298,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 except queue.Empty: continue - except Exception as e: - logger.fatal('execute payloads caught an exception (cannot recover): %s, %s' % 
(e, traceback.format_exc())) + except Exception as error: + logger.fatal('execute payloads caught an exception (cannot recover): %s, %s', error, traceback.format_exc()) if job: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONEXCEPTION) #queues.failed_payloads.put(job) @@ -346,7 +346,7 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" ec = 0 - logger.warning('main payload execution returned non-zero exit code: %d' % exit_code) + logger.warning('main payload execution returned non-zero exit code: %d', exit_code) stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": msg = errors.extract_stderr_error(stderr) @@ -357,7 +357,7 @@ def perform_initial_payload_error_analysis(job, exit_code): else: fatal = True if msg != "": - logger.warning("extracted message from stderr:\n%s" % msg) + logger.warning("extracted message from stderr:\n%s", msg) ec = set_error_code_from_stderr(msg, fatal) if not ec: @@ -368,7 +368,7 @@ def perform_initial_payload_error_analysis(job, exit_code): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=msg) else: if job.piloterrorcodes: - logger.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) + logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) else: # check if core dumps exist, if so remove them and return True if remove_core_dumps(job.workdir) and not job.debug: diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index 5f090b5d..c3cf9ee4 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import re @@ -63,9 +63,9 @@ def get_dbrelease_dir(): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: if os.path.exists(path): - logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)" % path) + logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)", path) else: - logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)" % path) + logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)", path) return path @@ -95,14 +95,14 @@ def is_dbrelease_available(version): # is the required DBRelease version available? 
if dir_list: if version in dir_list: - logger.info("found version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.info("found version %s in path %s (%d releases found)", version, path, len(dir_list)) status = True else: - logger.warning("did not find version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.warning("did not find version %s in path %s (%d releases found)", version, path, len(dir_list)) else: - logger.warning("empty DBRelease directory list: %s" % path) + logger.warning("empty DBRelease directory list: %s", path) else: - logger.warning('no such DBRelease path: %s' % path) + logger.warning('no such DBRelease path: %s', path) return status @@ -131,13 +131,13 @@ def create_setup_file(version, path): try: status = write_file(path, txt) - except FileHandlingFailure as e: - logger.warning('failed to create DBRelease setup file: %s' % e) + except FileHandlingFailure as error: + logger.warning('failed to create DBRelease setup file: %s', error) else: - logger.info("Created setup file with the following content:.................................\n%s" % txt) + logger.info("Created setup file with the following content:.................................\n%s", txt) logger.info("...............................................................................") else: - logger.warning('failed to create %s for DBRelease version=%s and directory=%s' % (path, version, d)) + logger.warning('failed to create %s for DBRelease version=%s and directory=%s', path, version, d) return status @@ -158,25 +158,25 @@ def create_dbrelease(version, path): _path = os.path.join(dbrelease_path, version) try: mkdirs(_path, chmod=None) - except PilotException as e: - logger.warning('failed to create directories for DBRelease: %s' % e) + except PilotException as error: + logger.warning('failed to create directories for DBRelease: %s', error) else: - logger.debug('created directories: %s' % _path) + logger.debug('created directories: %s', _path) # create the setup file in the DBRelease directory version_path = os.path.join(dbrelease_path, version) setup_filename = "setup.py" _path = os.path.join(version_path, setup_filename) if create_setup_file(version, _path): - logger.info("created DBRelease setup file: %s" % _path) + logger.info("created DBRelease setup file: %s", _path) # now create a new DBRelease tarball filename = os.path.join(path, "DBRelease-%s.tar.gz" % version) - logger.info("creating file: %s" % filename) + logger.info("creating file: %s", filename) try: tar = tarfile.open(filename, "w:gz") - except Exception as e: - logger.warning("could not create DBRelease tar file: %s" % e) + except Exception as error: + logger.warning("could not create DBRelease tar file: %s", error) else: if tar: # add the setup file to the tar file @@ -186,10 +186,10 @@ def create_dbrelease(version, path): try: _link = os.path.join(path, "DBRelease/current") os.symlink(version, _link) - except Exception as e: - logger.warning("failed to create symbolic link %s: %s" % (_link, e)) + except Exception as error: + logger.warning("failed to create symbolic link %s: %s", _link, error) else: - logger.warning("created symbolic link: %s" % _link) + logger.warning("created symbolic link: %s", _link) # add the symbolic link to the tar file tar.add(_link) @@ -197,17 +197,17 @@ def create_dbrelease(version, path): # done with the tar archive tar.close() - logger.info("created new DBRelease tar file: %s" % filename) + logger.info("created new DBRelease tar file: %s", filename) status = 
True else: logger.warning("failed to open DBRelease tar file") # clean up if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) else: logger.warning("failed to create DBRelease setup file") if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) return status diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index f21dfe27..e01aeece 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -196,7 +196,7 @@ def set_inds(dataset): inds = ds break if inds != "": - logger.info("setting INDS environmental variable to: %s" % (inds)) + logger.info("setting INDS environmental variable to: %s", inds) os.environ['INDS'] = inds else: logger.warning("INDS unknown") @@ -219,24 +219,24 @@ def get_analysis_trf(transform, workdir): harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + logger.debug("search_pattern - %s", search_pattern) jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug("jobopt_file = %s workdir = %s", jobopt_file, workdir) try: copy(jobopt_file, workdir) - except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + except Exception as error: + logger.error("could not copy file %s to %s : %s", jobopt_file, workdir, error) if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning('did not detect any / in %s (using full transform name)', transform) transform_name = transform # is the command already available? (e.g. 
if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info('script %s is already available - no need to download again', transform_name) return ec, diagnostics, transform_name original_base_url = "" @@ -255,7 +255,7 @@ def get_analysis_trf(transform, workdir): status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug("attempting to download script: %s", trf) status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -265,11 +265,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug("changing permission of %s to 0o755", path) try: os.chmod(path, 0o755) # Python 2/3 - except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + except Exception as error: + diagnostics = "failed to chmod %s: %s" % (transform_name, error) return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -307,7 +307,7 @@ def download_transform(url, transform_name, workdir): # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info("executing command [trial %d/%d]: %s", trial, max_trials, cmd) exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: @@ -317,14 +317,14 @@ def download_transform(url, transform_name, workdir): diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + logger.fatal('could not download transform: %s', stdout) status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info("curl command returned: %s", stdout) status = True break trial += 1 @@ -456,12 +456,11 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): # if turl.startswith('root://') and turl not in cmd: if turl not in cmd: cmd = cmd.replace(inputfile, turl) - logger.info("replaced '%s' with '%s' in the run command" % (inputfile, turl)) + logger.info("replaced '%s' with '%s' in the run command", inputfile, turl) # replace the LFNs with TURLs in the writetofile input file list (if it exists) if writetofile and turl_dictionary: filenames = get_writetoinput_filenames(writetofile) - logger.info("filenames=%s" % filenames) for fname in filenames: new_lines = [] path = os.path.join(workdir, fname) @@ -479,10 +478,9 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): lines = '\n'.join(new_lines) if lines: write_file(path, lines) - logger.info("lines=%s" % lines) else: - logger.warning("file does not exist: %s" % path) + logger.warning("file does not exist: %s", path) else: - logger.warning("could not find file: %s (cannot locate TURLs for direct access)" % filename) + logger.warning("could not find file: %s (cannot locate TURLs for direct access)", filename) return cmd diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index c1a33cec..e9820e8b 100644 --- a/pilot/util/auxiliary.py +++ 
b/pilot/util/auxiliary.py @@ -98,7 +98,7 @@ def display_architecture_info(): dump("/etc/issue") dump("$MACHTYPE", cmd="echo") else: - logger.info("\n%s" % stdout) + logger.info("\n%s", stdout) def get_batchsystem_jobid(): @@ -309,7 +309,7 @@ def inner(obj): pass # : unbound method iteritems() must be called # with OrderedDict instance as first argument (got nothing instead) - #logger.debug('exception caught for obj=%s: %s' % (str(obj), e)) + #logger.debug('exception caught for obj=%s: %s', (str(obj), e)) # Check for custom object instances - may subclass above too if hasattr(obj, '__dict__'): @@ -376,7 +376,7 @@ def check_for_final_server_update(update_server): if server_update == SERVER_UPDATE_FINAL or server_update == SERVER_UPDATE_TROUBLE: logger.info('server update done, finishing') break - logger.info('server update not finished (#%d/#%d)' % (i + 1, max_i)) + logger.info('server update not finished (#%d/#%d)', i + 1, max_i) sleep(30) i += 1 @@ -444,7 +444,7 @@ def show_memory_usage(): _value = extract_memory_usage_value(_stdout) except Exception: _value = "(unknown)" - logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n' % (_stdout, _value)) + logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n', _stdout, _value) def get_memory_usage(pid): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 273dcf6c..9eaf93ef 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '41' # build number should be reset to '1' for every new development cycle +BUILD = '44' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 79ccb710..c3f80c37 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -58,8 +58,8 @@ def mkdirs(workdir, chmod=0o770): # Python 2/3 os.makedirs(workdir) if chmod: os.chmod(workdir, chmod) - except Exception as e: - raise MKDirFailure(e) + except Exception as error: + raise MKDirFailure(error) def rmdirs(path): @@ -74,8 +74,8 @@ def rmdirs(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directories %s: %s" % (path, e)) + except OSError as error: + logger.warning("failed to remove directories %s: %s", path, error) else: status = True @@ -122,17 +122,17 @@ def write_file(path, contents, mute=True, mode='w', unique=False): if f: try: f.write(contents) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True f.close() if not mute: if 'w' in mode: - logger.info('created file: %s' % path) + logger.info('created file: %s', path) if 'a' in mode: - logger.info('appended file: %s' % path) + logger.info('appended file: %s', path) return status @@ -151,8 +151,8 @@ def open_file(filename, mode): f = None try: f = open(filename, mode) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) return f @@ -329,8 +329,8 @@ def read_list(filename): try: with open(filename, 'r') as filehandle: _list = load(filehandle) - except IOError as e: - logger.warning('failed to read %s: %s' % (filename, e)) + except IOError as error: + 
logger.warning('failed to read %s: %s', filename, error) return convert(_list) @@ -349,9 +349,9 @@ def read_json(filename): if f: try: dictionary = load(f) - except Exception as e: - logger.warning('exception caught: %s' % e) - #raise FileHandlingFailure(str(e)) + except Exception as error: + logger.warning('exception caught: %s', error) + #raise FileHandlingFailure(str(error)) else: f.close() @@ -359,8 +359,8 @@ def read_json(filename): if dictionary != {}: try: dictionary = convert(dictionary) - except Exception as e: - raise ConversionFailure(e) + except Exception as error: + raise ConversionFailure(error) return dictionary @@ -383,8 +383,8 @@ def write_json(filename, data, sort_keys=True, indent=4, separators=(',', ': ')) try: with open(filename, 'w') as fh: dumpjson(data, fh, sort_keys=sort_keys, indent=indent, separators=separators) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True @@ -434,8 +434,8 @@ def remove(path): try: os.remove(path) - except OSError as e: - logger.warning("failed to remove file: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove file: %s (%s, %s)", path, error.errno, error.strerror) return -1 return 0 @@ -449,8 +449,8 @@ def remove_dir_tree(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directory: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove directory: %s (%s, %s)", path, error.errno, error.strerror) return -1 return 0 @@ -466,7 +466,7 @@ def remove_files(workdir, files): ec = 0 if type(files) != list: - logger.warning('files parameter not a list: %s' % str(type(list))) + logger.warning('files parameter not a list: %s', str(type(list))) ec = -1 else: for f in files: @@ -533,17 +533,17 @@ def move(path1, path2): """ if not os.path.exists(path1): - logger.warning('file copy failure: path does not exist: %s' % path1) + logger.warning('file copy failure: path does not exist: %s', path1) raise NoSuchFile("File does not exist: %s" % path1) try: import shutil shutil.move(path1, path2) - except IOError as e: - logger.warning("exception caught during file move: %s" % e) - raise FileHandlingFailure(e) + except IOError as error: + logger.warning("exception caught during file move: %s", error) + raise FileHandlingFailure(error) else: - logger.info("moved %s to %s" % (path1, path2)) + logger.info("moved %s to %s", path1, path2) def copy(path1, path2): @@ -557,16 +557,16 @@ def copy(path1, path2): """ if not os.path.exists(path1): - logger.warning('file copy failure: path does not exist: %s' % path1) + logger.warning('file copy failure: path does not exist: %s', path1) raise NoSuchFile("File does not exist: %s" % path1) try: copy2(path1, path2) - except IOError as e: - logger.warning("exception caught during file copy: %s" % e) - raise FileHandlingFailure(e) + except IOError as error: + logger.warning("exception caught during file copy: %s", error) + raise FileHandlingFailure(error) else: - logger.info("copied %s to %s" % (path1, path2)) + logger.info("copied %s to %s", path1, path2) def find_executable(name): @@ -596,8 +596,8 @@ def get_directory_size(directory="."): try: # convert to int and B size = int(stdout.split()[0]) * 1024 - except Exception as e: - logger.warning('exception caught while trying convert dirsize: %s' % e) + except Exception as error: + logger.warning('exception caught while trying convert dirsize: %s', error) return 
size @@ -615,13 +615,13 @@ def add_to_total_size(path, total_size): # Get the file size fsize = get_local_file_size(path) if fsize: - logger.info("size of file %s: %d B" % (path, fsize)) + logger.info("size of file %s: %d B", path, fsize) try: total_size += long(fsize) # Python 2 # noqa: F821 except Exception: total_size += int(fsize) # Python 3 (note order in try statement) else: - logger.warning("skipping file %s since it is not present" % path) + logger.warning("skipping file %s since it is not present", path) return total_size @@ -639,10 +639,10 @@ def get_local_file_size(filename): if os.path.exists(filename): try: file_size = os.path.getsize(filename) - except Exception as e: - logger.warning("failed to get file size: %s" % e) + except Exception as error: + logger.warning("failed to get file size: %s", error) else: - logger.warning("local file does not exist: %s" % filename) + logger.warning("local file does not exist: %s", filename) return file_size @@ -683,8 +683,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= try: f = open_file(filename, 'r') - except Exception as e: - logger.warning("failed to open file: %s, %s" % (filename, e)) + except Exception as error: + logger.warning("failed to open file: %s, %s", filename, error) else: firstline = True for line in f: @@ -704,8 +704,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= if convert_to_float: try: field = float(field) - except Exception as e: - logger.warning("failed to convert %s to float: %s (aborting)" % (field, e)) + except Exception as error: + logger.warning("failed to convert %s to float: %s (aborting)", field, error) return None tabledict[key].append(field) i += 1 @@ -906,7 +906,7 @@ def verify_file_list(list_of_files): diff = diff_lists(list_of_files, filtered_list) if diff: - logger.debug('found %d file(s) that do not exist (e.g. %s)' % (len(diff), diff[0])) + logger.debug('found %d file(s) that do not exist (e.g. 
%s)', len(diff), diff[0]) return filtered_list @@ -927,8 +927,8 @@ def find_latest_modified_file(list_of_files): try: latest_file = max(list_of_files, key=os.path.getmtime) mtime = int(os.path.getmtime(latest_file)) - except Exception as e: - logger.warning("int conversion failed for mod time: %s" % e) + except Exception as error: + logger.warning("int conversion failed for mod time: %s", error) latest_file = "" mtime = None @@ -947,9 +947,9 @@ def dump(path, cmd="cat"): if os.path.exists(path) or cmd == "echo": _cmd = "%s %s" % (cmd, path) exit_code, stdout, stderr = execute(_cmd) - logger.info("%s:\n%s" % (_cmd, stdout + stderr)) + logger.info("%s:\n%s", _cmd, stdout + stderr) else: - logger.info("path %s does not exist" % path) + logger.info("path %s does not exist", path) def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): @@ -1001,7 +1001,7 @@ def remove_core_dumps(workdir): coredumps = coredumps1 + coredumps2 if coredumps: for coredump in coredumps: - logger.info("removing core dump: %s" % str(coredump)) + logger.info("removing core dump: %s", str(coredump)) remove(coredump) found = True @@ -1072,14 +1072,14 @@ def copy_pilot_source(workdir): diagnostics = "" srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot2') try: - logger.debug('copy %s to %s' % (srcdir, workdir)) + logger.debug('copy %s to %s', srcdir, workdir) cmd = 'cp -r %s/* %s' % (srcdir, workdir) exit_code, stdout, stderr = execute(cmd) if exit_code != 0: diagnostics = 'file copy failed: %d, %s' % (exit_code, stdout) logger.warning(diagnostics) - except Exception as e: - diagnostics = 'exception caught when copying pilot2 source: %s' % e + except Exception as error: + diagnostics = 'exception caught when copying pilot2 source: %s' % error logger.warning(diagnostics) return diagnostics @@ -1095,10 +1095,10 @@ def create_symlink(from_path='', to_path=''): try: os.symlink(from_path, to_path) - except Exception as e: - logger.warning('failed to create symlink from %s to %s: %s' % (from_path, to_path, e)) + except Exception as error: + logger.warning('failed to create symlink from %s to %s: %s', from_path, to_path, error) else: - logger.debug('created symlink from %s to %s' % (from_path, to_path)) + logger.debug('created symlink from %s to %s', from_path, to_path) def locate_file(pattern): diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index 643253ef..91891549 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import os.path @@ -68,7 +68,7 @@ def remove_job_request_file(): path = get_job_request_file_name() if os.path.exists(path): if remove(path) == 0: - logger.info('removed %s' % path) + logger.info('removed %s', path) else: logger.debug('there is no job request file') diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index e45056de..bcb1876b 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.common.errorcodes import ErrorCodes from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file @@ -59,9 +59,9 @@ def looping_job(job, mt): # the payload process is considered to be looping if it's files have not been touched within 
looping_limit time if time_last_touched: ct = int(time.time()) - logger.info('current time: %d' % ct) - logger.info('last time files were touched: %d' % time_last_touched) - logger.info('looping limit: %d s' % looping_limit) + logger.info('current time: %d', ct) + logger.info('last time files were touched: %d', time_last_touched) + logger.info('looping limit: %d s', looping_limit) if ct - time_last_touched > looping_limit: try: # first produce core dump and copy it @@ -69,8 +69,8 @@ def looping_job(job, mt): # set debug mode to prevent core file from being removed before log creation job.debug = True kill_looping_job(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) else: logger.info('no files were touched') @@ -126,14 +126,14 @@ def get_time_for_last_touch(job, mt, looping_limit): # remove unwanted list items (*.py, *.pyc, workdir, ...) files = loopingjob_definitions.remove_unwanted_files(job.workdir, files) if files: - logger.info('found %d files that were recently updated' % len(files)) - logger.debug('recent files:\n%s' % files) + logger.info('found %d files that were recently updated', len(files)) + logger.debug('recent files:\n%s', files) updated_files = verify_file_list(files) # now get the mod times for these file, and identify the most recently update file latest_modified_file, mtime = find_latest_modified_file(updated_files) if latest_modified_file: - logger.info("file %s is the most recently updated file (at time=%d)" % (latest_modified_file, mtime)) + logger.info("file %s is the most recently updated file (at time=%d)", latest_modified_file, mtime) else: logger.warning('looping job algorithm failed to identify latest updated file') return mt.ct_looping_last_touched @@ -148,7 +148,7 @@ def get_time_for_last_touch(job, mt, looping_limit): # cut the output if too long stdout = cut_output(stdout) stderr = cut_output(stderr) - logger.warning('find command failed: %d, %s, %s' % (exit_code, stdout, stderr)) + logger.warning('find command failed: %d, %s, %s', exit_code, stdout, stderr) return mt.ct_looping_last_touched @@ -168,19 +168,19 @@ def kill_looping_job(job): cmd = 'ps -fwu %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ls -ltr %s' % (job.workdir) exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'pstree -g -a' exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # set the relevant error code if job.state == 'stagein': @@ -212,6 +212,6 @@ def get_looping_job_limit(): looping_limit = convert_to_int(config.Pilot.looping_limit_default, default=2 * 3600) looping_limit_min_default = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600) looping_limit = max(looping_limit, looping_limit_min_default) - logger.info("using looping job limit: %d s" % looping_limit) + logger.info("using looping job limit: %d s", looping_limit) return looping_limit diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 27252ed9..c1f442d6 100644 --- 
a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -54,15 +54,15 @@ def job_monitor_tasks(job, mt, args): check_hz() try: cpuconsumptiontime = get_current_cpu_consumption_time(job.pid) - except Exception as e: - diagnostics = "Exception caught: %s" % e + except Exception as error: + diagnostics = "Exception caught: %s" % error logger.warning(diagnostics) exit_code = get_exception_error_code(diagnostics) return exit_code, diagnostics else: job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time for pid=%d: %f (rounded to %d)' % (job.pid, cpuconsumptiontime, job.cpuconsumptiontime)) + logger.info('CPU consumption time for pid=%d: %f (rounded to %d)', job.pid, cpuconsumptiontime, job.cpuconsumptiontime) # check how many cores the payload is using set_number_used_cores(job) @@ -123,7 +123,7 @@ def display_oom_info(payload_pid): payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) - logger.info('oom_score(pilot) = %s, oom_score(payload) = %s' % (pilot_score, payload_score)) + logger.info('oom_score(pilot) = %s, oom_score(payload) = %s', pilot_score, payload_score) def get_score(pid): @@ -136,8 +136,8 @@ def get_score(pid): try: score = '%s' % read_file('/proc/%d/oom_score' % pid) - except Exception as e: - logger.warning('caught exception reading oom_score: %s' % e) + except Exception as error: + logger.warning('caught exception reading oom_score: %s', error) score = 'UNKNOWN' else: if score.endswith('\n'): @@ -207,8 +207,8 @@ def verify_memory_usage(current_time, mt, job): # is the used memory within the allowed limit? try: exit_code, diagnostics = memory.memory_usage(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) exit_code = -1 if exit_code != 0: logger.warning('ignoring failure to parse memory monitor output') @@ -291,8 +291,8 @@ def verify_looping_job(current_time, mt, job): # is the job looping? 
try: exit_code, diagnostics = looping_job(job, mt) - except Exception as e: - diagnostics = 'exception caught in looping job algorithm: %s' % e + except Exception as error: + diagnostics = 'exception caught in looping job algorithm: %s' % error logger.warning(diagnostics) if "No module named" in diagnostics: exit_code = errors.BLACKHOLE @@ -371,15 +371,15 @@ def verify_running_processes(current_time, mt, pid): nproc = get_number_of_child_processes(pid) try: nproc_env = int(os.environ.get('PILOT_MAXNPROC', 0)) - except Exception as e: - logger.warning('failed to convert PILOT_MAXNPROC to int: %s' % e) + except Exception as error: + logger.warning('failed to convert PILOT_MAXNPROC to int: %s', error) else: if nproc > nproc_env: # set the maximum number of found processes os.environ['PILOT_MAXNPROC'] = str(nproc) if nproc_env > 0: - logger.info('maximum number of monitored processes: %d' % nproc_env) + logger.info('maximum number of monitored processes: %d', nproc_env) return 0, "" @@ -417,19 +417,19 @@ def utility_monitor(job): try: proc1 = execute(utility_command, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, queuedata=job.infosys.queuedata) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the # command has been launched job.utilities[utcmd] = [proc1, utility_subprocess_launches + 1, utility_command] else: - logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again' % utcmd) + logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again', utcmd) else: # check the utility output (the selector option adds a substring to the output file name) filename = usercommon.get_utility_command_output_filename(utcmd, selector=True) path = os.path.join(job.workdir, filename) if not os.path.exists(path): - logger.warning('file: %s does not exist' % path) + logger.warning('file: %s does not exist', path) time.sleep(10) @@ -444,10 +444,9 @@ def get_local_size_limit_stdout(bytes=True): try: localsizelimit_stdout = int(config.Pilot.local_size_limit_stdout) - except Exception as e: + except Exception as error: localsizelimit_stdout = 2097152 - logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)' % - (e, localsizelimit_stdout)) + logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)', error, localsizelimit_stdout) # convert from kB to B if bytes: @@ -484,17 +483,17 @@ def check_payload_stdout(job): # now loop over all files and check each individually (any large enough file will fail the job) for filename in file_list: - logger.debug('check_payload_stdout: filename=%s' % filename) + logger.debug('check_payload_stdout: filename=%s', filename) if "job.log.tgz" in filename: - logger.info("skipping file size check of file (%s) since it is a special log file" % (filename)) + logger.info("skipping file size check of file (%s) since it is a special log file", filename) continue if os.path.exists(filename): try: # get file size in bytes fsize = os.path.getsize(filename) - except Exception as e: - logger.warning("could not read file size of %s: %s" % (filename, e)) + except Exception as error: + logger.warning("could not read file size of %s: %s", filename, error) else: # is the file too big? 
localsizelimit_stdout = get_local_size_limit_stdout() @@ -517,9 +516,9 @@ def check_payload_stdout(job): # remove any lingering input files from the work dir exit_code = remove_files(job.workdir, lfns) else: - logger.info("payload log (%s) within allowed size limit (%d B): %d B" % (os.path.basename(filename), localsizelimit_stdout, fsize)) + logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize) else: - logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet" % filename) + logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename) return exit_code, diagnostics @@ -539,7 +538,7 @@ def check_local_space(initial=True): # is there enough local space to run a job? cwd = os.getcwd() - logger.debug('checking local space on %s' % cwd) + logger.debug('checking local space on %s', cwd) spaceleft = convert_mb_to_b(get_local_disk_space(cwd)) # B (diskspace is in MB) free_space_limit = human2bytes(config.Pilot.free_space_limit) if initial else human2bytes(config.Pilot.free_space_limit_running) @@ -549,7 +548,7 @@ def check_local_space(initial=True): ec = errors.NOLOCALSPACE logger.warning(diagnostics) else: - logger.info('sufficient remaining disk space (%d B)' % spaceleft) + logger.info('sufficient remaining disk space (%d B)', spaceleft) return ec, diagnostics @@ -578,11 +577,11 @@ def check_work_dir(job): exit_code = errors.USERDIRTOOLARGE diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \ (job.workdir, workdirsize, maxwdirsize) - logger.fatal("%s" % diagnostics) + logger.fatal("%s", diagnostics) cmd = 'ls -altrR %s' % job.workdir _ec, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # kill the job # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED") @@ -598,13 +597,13 @@ def check_work_dir(job): # remeasure the size of the workdir at this point since the value is stored below workdirsize = get_directory_size(directory=job.workdir) else: - logger.info("size of work directory %s: %d B (within %d B limit)" % (job.workdir, workdirsize, maxwdirsize)) + logger.info("size of work directory %s: %d B (within %d B limit)", job.workdir, workdirsize, maxwdirsize) # Store the measured disk space (the max value will later be sent with the job metrics) if workdirsize > 0: job.add_workdir_size(workdirsize) else: - logger.warning('job work dir does not exist: %s' % job.workdir) + logger.warning('job work dir does not exist: %s', job.workdir) else: logger.warning('skipping size check of workdir since it has not been created yet') @@ -621,17 +620,17 @@ def get_max_allowed_work_dir_size(queuedata): try: maxwdirsize = convert_mb_to_b(get_maximum_input_sizes()) # from MB to B, e.g. 
16336 MB -> 17,129,537,536 B - except Exception as e: + except Exception as error: max_input_size = get_max_input_size() maxwdirsize = max_input_size + config.Pilot.local_size_limit_stdout * 1024 logger.info("work directory size check will use %d B as a max limit (maxinputsize [%d B] + local size limit for" - " stdout [%d B])" % (maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024)) - logger.warning('conversion caught exception: %s' % e) + " stdout [%d B])", maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024) + logger.warning('conversion caught exception: %s', error) else: # grace margin, as discussed in https://its.cern.ch/jira/browse/ATLASPANDA-482 margin = 10.0 # percent, read later from somewhere maxwdirsize = int(maxwdirsize * (1 + margin / 100.0)) - logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)" % maxwdirsize) + logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)", maxwdirsize) return maxwdirsize @@ -654,8 +653,8 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = int(_maxinputsize) # MB else: # convert to B int _maxinputsize = int(_maxinputsize) * 1024 * 1024 # MB -> B - except Exception as e: - logger.warning("schedconfig.maxinputsize: %s" % e) + except Exception as error: + logger.warning("schedconfig.maxinputsize: %s", error) if megabyte: _maxinputsize = max_input_file_sizes_mb else: @@ -667,9 +666,9 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = max_input_file_sizes if megabyte: - logger.info("max input size = %d MB (pilot default)" % _maxinputsize) + logger.info("max input size = %d MB (pilot default)", _maxinputsize) else: - logger.info("Max input size = %d B (pilot default)" % _maxinputsize) + logger.info("Max input size = %d B (pilot default)", _maxinputsize) return _maxinputsize @@ -693,12 +692,12 @@ def check_output_file_sizes(job): fsize = get_local_file_size(path) max_fsize = human2bytes(config.Pilot.maximum_output_file_size) if fsize and fsize < max_fsize: - logger.info('output file %s is within allowed size limit (%d B < %d B)' % (path, fsize, max_fsize)) + logger.info('output file %s is within allowed size limit (%d B < %d B)', path, fsize, max_fsize) else: exit_code = errors.OUTPUTFILETOOLARGE diagnostics = 'output file %s is not within allowed size limit (%d B > %d B)' % (path, fsize, max_fsize) logger.warning(diagnostics) else: - logger.info('output file size check: skipping output file %s since it does not exist' % path) + logger.info('output file size check: skipping output file %s since it does not exist', path) return exit_code, diagnostics diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 6ee9b84d..c51b717e 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import time @@ -47,8 +47,8 @@ def find_processes_in_group(cpids, pid): try: thispid = int(lines[i].split()[0]) thisppid = int(lines[i].split()[1]) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) if thisppid == pid: find_processes_in_group(cpids, thispid) @@ -84,7 +84,7 @@ def get_process_commands(euid, pids): exit_code, stdout, stderr = execute(cmd, mute=True) if exit_code != 0 or stdout == '': - logger.warning('ps command 
failed: %d, \"%s\", \"%s\"' % (exit_code, stdout, stderr)) + logger.warning('ps command failed: %d, \"%s\", \"%s\"', exit_code, stdout, stderr) else: # extract the relevant processes p_commands = stdout.split('\n') @@ -153,13 +153,13 @@ def kill_processes(pid): return children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -195,13 +195,13 @@ def kill_child_processes(pid): # reverse the process order so that the athena process is killed first (otherwise the stdout will be truncated) children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -231,26 +231,26 @@ def kill_process_group(pgrp): _sleep = True # kill the process gracefully - logger.info("killing group process %d" % pgrp) + logger.info("killing group process %d", pgrp) try: os.killpg(pgrp, signal.SIGTERM) - except Exception as e: - logger.warning("exception thrown when killing child group process under SIGTERM: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) _sleep = False else: - logger.info("SIGTERM sent to process group %d" % pgrp) + logger.info("SIGTERM sent to process group %d", pgrp) if _sleep: _t = 30 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) try: os.killpg(pgrp, signal.SIGKILL) - except Exception as e: - logger.warning("exception thrown when killing child group process with SIGKILL: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) else: - logger.info("SIGKILL sent to process group %d" % pgrp) + logger.info("SIGKILL sent to process group %d", pgrp) status = True return status @@ -270,7 +270,7 @@ def kill_process(pid): kill(pid, signal.SIGTERM) _t = 10 - logger.info("sleeping %d s to allow process to exit" % _t) + logger.info("sleeping %d s to allow process to exit", _t) time.sleep(_t) # now do a hard kill just in case some processes haven't gone away @@ -291,10 +291,10 @@ def kill(pid, sig): status = False try: os.kill(pid, sig) - except Exception as e: - logger.warning("exception thrown when killing process %d with signal=%d: %s" % (pid, sig, e)) + except Exception as error: + logger.warning("exception thrown when killing process %d with signal=%d: %s", pid, sig, error) else: - logger.info("killed process %d with signal=%d" % (pid, sig)) + logger.info("killed process %d with signal=%d", pid, sig) status = True return status @@ -313,12 +313,12 @@ def 
get_number_of_child_processes(pid): n = 0 try: find_processes_in_group(children, pid) - except Exception as e: - logger.warning("exception caught in find_processes_in_group: %s" % e) + except Exception as error: + logger.warning("exception caught in find_processes_in_group: %s", error) else: if pid: n = len(children) - logger.info("number of running child processes to parent process %d: %d" % (pid, n)) + logger.info("number of running child processes to parent process %d: %d", pid, n) else: logger.debug("pid not yet set") return n @@ -335,16 +335,16 @@ def killpg(pid, sig, args): try: os.killpg(int(pid), sig) - except Exception as e: - logger.warning("failed to execute killpg(): %s" % e) + except Exception as error: + logger.warning("failed to execute killpg(): %s", error) cmd = 'kill -%d %s' % (sig, pid) exit_code, rs, stderr = execute(cmd) if exit_code != 0: logger.warning(rs) else: - logger.info("killed orphaned process %s (%s)" % (pid, args)) + logger.info("killed orphaned process %s (%s)", pid, args) else: - logger.info("killed orphaned process group %s (%s)" % (pid, args)) + logger.info("killed orphaned process group %s (%s)", pid, args) def get_pilot_pid_from_processes(_processes, pattern): @@ -364,8 +364,8 @@ def get_pilot_pid_from_processes(_processes, pattern): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'pilot.py' in args and 'python' in args: pilot_pid = pid @@ -405,30 +405,29 @@ def kill_orphans(): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'cvmfs2' in args: - logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'" % - (pid, ppid, args)) + logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'", pid, ppid, args) elif 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args: - logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid, ppid, args) elif ppid == '1': count += 1 - logger.info("found orphan process: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("found orphan process: pid=%s, ppid=%s, args='%s'", pid, ppid, args) if 'bash' in args or ('python' in args and 'pilot.py' in args): logger.info("will not kill bash process") else: killpg(pid, signal.SIGTERM, args) _t = 10 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) killpg(pid, signal.SIGKILL, args) if count == 0: logger.info("did not find any orphan processes") else: - logger.info("found %d orphan process(es)" % count) + logger.info("found %d orphan process(es)", count) def get_max_memory_usage_from_cgroups(): @@ -453,19 +452,19 @@ def get_max_memory_usage_from_cgroups(): if ":memory:" in out: pos = out.find('/') path = out[pos:] - logger.info("extracted path = %s" % path) + logger.info("extracted path = %s", path) pre = get_cgroups_base_path() if pre != "": path = pre + os.path.join(path, "memory.max_usage_in_bytes") - logger.info("path to CGROUPS memory info: %s" % path) + logger.info("path to CGROUPS memory info: %s", path) max_memory = read_file(path) else: logger.info("CGROUPS base 
path could not be extracted - not a CGROUPS site") else: - logger.warning("invalid format: %s (expected ..:memory:[path])" % out) + logger.warning("invalid format: %s (expected ..:memory:[path])", out) else: - logger.info("path %s does not exist (not a CGROUPS site)" % path) + logger.info("path %s does not exist (not a CGROUPS site)", path) return max_memory @@ -518,7 +517,7 @@ def get_instant_cpu_consumption_time(pid): hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) if type(hz) != int: - logger.warning('unknown SC_CLK_TCK: %s' % str(hz)) + logger.warning('unknown SC_CLK_TCK: %s', str(hz)) return 0.0 if pid and hz and hz > 0: @@ -586,21 +585,21 @@ def cleanup(job, args): # make sure the workdir is deleted if args.cleanup: if remove_dir_tree(job.workdir): - logger.info('removed %s' % job.workdir) + logger.info('removed %s', job.workdir) if os.path.exists(job.workdir): - logger.warning('work directory still exists: %s' % job.workdir) + logger.warning('work directory still exists: %s', job.workdir) else: - logger.debug('work directory was removed: %s' % job.workdir) + logger.debug('work directory was removed: %s', job.workdir) else: - logger.info('workdir not removed %s' % job.workdir) + logger.info('workdir not removed %s', job.workdir) # collect any zombie processes job.collect_zombies(tn=10) logger.info("collected zombie processes") if job.pid: - logger.info("will now attempt to kill all subprocesses of pid=%d" % job.pid) + logger.info("will now attempt to kill all subprocesses of pid=%d", job.pid) kill_processes(job.pid) else: logger.warning('cannot kill any subprocesses since job.pid is not set') @@ -672,8 +671,8 @@ def convert_ps_to_dict(output, pattern=r'(\d+) (\d+) (\d+) (.+)'): var = match.group(i + 1) dictionary[first_line[i]].append(var) - except Exception as e: - print("unexpected format of utility output: %s" % e) + except Exception as error: + print("unexpected format of utility output: %s", error) return dictionary From 0dd9750c6292fb4b764708bcf524def728a52d3c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:38:29 +0200 Subject: [PATCH 70/96] Pylint updates --- pilot/control/monitor.py | 47 +++++++++++++------------- pilot/control/payloads/eventservice.py | 4 +-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 8770f82b..390776ac 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -6,7 +6,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. @@ -56,7 +56,7 @@ def control(queues, traces, args): try: # overall loop counter (ignoring the fact that more than one job may be running) - n = 0 + niter = 0 while not args.graceful_stop.is_set(): # every seconds, run the monitoring checks @@ -84,8 +84,8 @@ def control(queues, traces, args): args.graceful_stop.set() break else: - if n % 60 == 0: - logger.info('%d s have passed since pilot start' % time_since_start) + if niter % 60 == 0: + logger.info('%d s have passed since pilot start', time_since_start) time.sleep(1) # time to check the CPU? 
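
Nearly every hunk in this commit and the preceding ones applies the same two pylint-driven conventions: exception variables are renamed from `e` to `error`, and logger calls pass their arguments lazily instead of pre-formatting the message with the % operator, which silences the logging-not-lazy warning and defers string interpolation until the record is actually emitted. A minimal sketch of the pattern follows; the helper and its names are illustrative only and are not taken from the pilot code:

    import logging
    import os

    logger = logging.getLogger(__name__)

    def remove_path(path):
        """Hypothetical helper showing the logging style adopted in these patches."""
        try:
            os.remove(path)
        except OSError as error:
            # eager style flagged by pylint (logging-not-lazy):
            #     logger.warning('failed to remove %s: %s' % (path, error))
            # lazy style used throughout these commits; the message is only
            # interpolated if the WARNING record is actually emitted
            logger.warning('failed to remove %s: %s', path, error)
            return -1
        return 0
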
@@ -93,12 +93,12 @@ def control(queues, traces, args): processes = get_process_info('python pilot2/pilot.py', pid=getpid()) if processes: logger.info('-' * 100) - logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s' % (getpid(), processes[0], processes[1], processes[2])) - n = processes[3] - if n > 1: - logger.info('there are %d such processes running' % n) + logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s', getpid(), processes[0], processes[1], processes[2]) + nproc = processes[3] + if nproc > 1: + logger.info('there are %d such processes running', nproc) else: - logger.info('there is %d such process running' % n) + logger.info('there is %d such process running', nproc) logger.info('-' * 100) tcpu = time.time() @@ -111,20 +111,19 @@ def control(queues, traces, args): for thread in threading.enumerate(): # logger.info('thread name: %s' % thread.name) if not thread.is_alive(): - logger.fatal('thread \'%s\' is not alive' % thread.name) + logger.fatal('thread \'%s\' is not alive', thread.name) # args.graceful_stop.set() - n += 1 + niter += 1 - except Exception as e: - print(("monitor: exception caught: %s" % e)) - raise PilotException(e) + except Exception as error: + print(("monitor: exception caught: %s" % error)) + raise PilotException(error) logger.info('[monitor] control thread has ended') #def log_lifetime(sig, frame, traces): -# logger.info('lifetime: %i used, %i maximum' % (int(time.time() - traces.pilot['lifetime_start']), -# traces.pilot['lifetime_max'])) +# logger.info('lifetime: %i used, %i maximum', int(time.time() - traces.pilot['lifetime_start']), traces.pilot['lifetime_max']) def get_process_info(cmd, user=None, args='aufx', pid=None): @@ -194,7 +193,7 @@ def run_checks(queues, args): t_max = 2 * 60 logger.warning('pilot monitor received instruction that abort_job has been requested') - logger.warning('will wait for a maximum of %d seconds for threads to finish' % t_max) + logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) t0 = time.time() while time.time() - t0 < t_max: if args.job_aborted.is_set(): @@ -210,7 +209,7 @@ def run_checks(queues, args): args.graceful_stop.set() if not args.job_aborted.is_set(): - logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect' % t_max) + logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) t_max = 10 t0 = time.time() while time.time() - t0 < t_max: @@ -241,20 +240,20 @@ def get_max_running_time(lifetime, queuedata): # use the schedconfig value if set, otherwise use the pilot option lifetime value if not queuedata: logger.warning('queuedata could not be extracted from queues, will use default for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if queuedata.maxtime: try: max_running_time = int(queuedata.maxtime) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) logger.warning('failed to convert maxtime from queuedata, will use default value for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if max_running_time == 0: max_running_time = lifetime # fallback to default value - logger.info('will use default value for max running time: %d s' % max_running_time) + logger.info('will use default value for max running time: %d s', max_running_time) else: - logger.info('will use queuedata.maxtime value for max running time: %d s' % max_running_time) + 
logger.info('will use queuedata.maxtime value for max running time: %d s', max_running_time) return max_running_time diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 07829a73..3e0390d3 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -72,8 +72,8 @@ def run_payload(self, job, cmd, out, err): job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) - except Exception as e: - logger.error('could not execute: %s', str(e)) + except Exception as error: + logger.error('could not execute: %s', str(error)) return None return executor From f26dfdef150c9a57485492c2dba37fb16ac2466b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:48:53 +0200 Subject: [PATCH 71/96] Flake8 corrections --- pilot/control/job.py | 2 +- pilot/user/atlas/jobmetrics.py | 2 +- pilot/util/loopingjob.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 11a3de6a..a980120f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1426,7 +1426,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # timefloor not relevant for the first job if jobnumber > 0: - logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job',timefloor, currenttime - starttime) + logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job', timefloor, currenttime - starttime) if harvester and jobnumber > 0: # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index d0802040..0d974175 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -157,7 +157,7 @@ def get_job_metrics(job): # is job_metrics within allowed size? if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)",len(job_metrics)) + logger.warning("job_metrics out of size (%d)", len(job_metrics)) # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index bcb1876b..b3b97a0a 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -101,6 +101,7 @@ def create_core_dump(pid=None, workdir=None): else: logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr) + def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. From 54b5a0a8393cdc99e5df0de7430bf429c30e8f7a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 11:24:30 +0200 Subject: [PATCH 72/96] Pylint corrections. 
Fixes for localSite problem in traces --- PILOTVERSION | 2 +- pilot/control/job.py | 5 ++- pilot/copytool/common.py | 6 ++-- pilot/copytool/gfal.py | 3 +- pilot/copytool/gs.py | 23 +++++++------- pilot/copytool/lsm.py | 27 ++++++++-------- pilot/copytool/mv.py | 12 +++---- pilot/copytool/objectstore.py | 10 +++--- pilot/copytool/rucio.py | 59 ++++++++++++++++++----------------- pilot/copytool/s3.py | 20 ++++++------ pilot/copytool/xrdcp.py | 31 +++++++++--------- pilot/util/auxiliary.py | 44 ++++++++++++++++++++++++++ pilot/util/constants.py | 2 +- pilot/util/tracereport.py | 7 +++++ pilot/util/workernode.py | 30 +++++++++--------- pilot/workflow/generic_hpc.py | 12 +++---- 16 files changed, 175 insertions(+), 118 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index fb45e883..a14d3a59 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.44 \ No newline at end of file +2.12.1.46 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index a980120f..449af861 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -33,7 +33,7 @@ from pilot.util import https from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, get_pilot_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - is_python3, show_memory_usage, has_instruction_sets, locate_core_file + is_python3, show_memory_usage, has_instruction_sets, locate_core_file, get_display_info from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -661,11 +661,14 @@ def get_data_structure(job, state, args, xml=None, metadata=None): data['cpuConsumptionUnit'] = job.cpuconsumptionunit + "+" + get_cpu_model() instruction_sets = has_instruction_sets(['AVX2']) + product, vendor = get_display_info() if instruction_sets: if 'cpuConsumptionUnit' in data: data['cpuConsumptionUnit'] += '+' + instruction_sets else: data['cpuConsumptionUnit'] = instruction_sets + if product and vendor: + logger.debug('cpuConsumptionUnit: could have added: product=%s, vendor=%s', product, vendor) # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) diff --git a/pilot/copytool/common.py b/pilot/copytool/common.py index ce5f0df3..12381b3d 100644 --- a/pilot/copytool/common.py +++ b/pilot/copytool/common.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Mario Lassnig, mario.lassnig@cern.ch, 2020 import logging @@ -61,8 +61,8 @@ def verify_catalog_checksum(fspec, path): checksum_local = calculate_checksum(path, algorithm=checksum_type) if checksum_type == 'ad32': checksum_type = 'adler32' - logger.info('checksum (catalog): %s (type: %s)' % (checksum_catalog, checksum_type)) - logger.info('checksum (local): %s' % checksum_local) + logger.info('checksum (catalog): %s (type: %s)', checksum_catalog, checksum_type) + logger.info('checksum (local): %s', checksum_local) if checksum_local and checksum_local != '' and checksum_local != checksum_catalog: diagnostics = 'checksum verification failed for LFN=%s: checksum (catalog)=%s != checksum (local)=%s' % \ (fspec.lfn, checksum_catalog, checksum_local) diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index 2c184c89..54034e7f 100644 --- 
a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -57,7 +57,8 @@ def copy_in(files, **kwargs): if not check_for_gfal(): raise StageInFailure("No GFAL2 tools found") - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 68e50b5c..ddaa68d6 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -6,6 +6,7 @@ # # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021 +# - Shuwei import os import logging @@ -73,11 +74,11 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # http_access = rprotocols["http_access"] # os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog) # logger.debug('http_access=%s' % http_access) - # except Exception as e: + # except Exception: # logger.warning("Failed in get 'http_access' in ddm.rprotocols") surl = protocol.get('endpoint', '') + remote_path - logger.info('For GCS bucket, set surl=%s' % surl) + logger.info('For GCS bucket, set surl=%s', surl) # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -97,7 +98,7 @@ def copy_in(files, **kwargs): dst = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(dst, fspec.lfn) - logger.info('downloading surl=%s to local file %s' % (fspec.surl, path)) + logger.info('downloading surl=%s to local file %s', fspec.surl, path) status, diagnostics = download_file(path, fspec.surl, object_name=fspec.lfn) if not status: ## an error occurred @@ -131,8 +132,8 @@ def download_file(path, surl, object_name=None): target = pathlib.Path(object_name) with target.open(mode="wb") as downloaded_file: client.download_blob_to_file(surl, downloaded_file) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -150,7 +151,7 @@ def copy_out(files, **kwargs): workdir = kwargs.pop('workdir') for fspec in files: - logger.info('Going to process fspec.turl=%s' % fspec.turl) + logger.info('Going to process fspec.turl=%s', fspec.turl) import re # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) @@ -164,7 +165,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, logfile) if os.path.exists(path): object_name = os.path.join(remote_path, logfile) - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, object_name)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, object_name) status, diagnostics = upload_file(path, bucket, object_name=object_name) if not status: ## an error occurred @@ -204,15 +205,15 @@ def upload_file(file_name, bucket, object_name=None): try: client = storage.Client() gs_bucket = client.get_bucket(bucket) - logger.info('uploading a file to bucket=%s in full path=%s' % (bucket, object_name)) + logger.info('uploading a file to bucket=%s in full path=%s', bucket, object_name) blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): url_pilotlog = blob.public_url os.environ['GTAG'] = url_pilotlog - logger.debug("Set envvar GTAG 
with the pilotLot URL=%s" % url_pilotlog) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + logger.debug("Set envvar GTAG with the pilotLot URL=%s", url_pilotlog) + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 8f63cd46..67d8b791 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -7,7 +7,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import logging @@ -75,7 +75,9 @@ def copy_in(files, **kwargs): copysetup = get_copysetup(copytools, 'lsm') trace_report = kwargs.get('trace_report') #allow_direct_access = kwargs.get('allow_direct_access') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report @@ -99,17 +101,16 @@ def copy_in(files, **kwargs): source = fspec.turl destination = os.path.join(dst, fspec.lfn) - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) exit_code, stdout, stderr = move(source, destination, dst_in=True, copysetup=copysetup) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) error = resolve_common_transfer_errors(stderr, is_stagein=True) fspec.status = 'failed' fspec.status_code = error.get('rcode') - logger.warning('error=%d' % error.get('rcode')) trace_report.update(clientState=error.get('state') or 'STAGEIN_ATTEMPT_FAILED', stateReason=error.get('error'), timeEnd=time()) trace_report.send() @@ -186,7 +187,7 @@ def copy_out(files, **kwargs): except Exception: opts = " ".join(["%s %s" % (k, v) for (k, v) in list(opts.items())]) # Python 3 - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) nretries = 1 # input parameter to function? 
for retry in range(nretries): @@ -246,7 +247,7 @@ def move_all_files_in(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) source = entry['source'] + '/' + entry['name'] destination = os.path.join(entry['destination'], entry['name']) @@ -255,7 +256,7 @@ def move_all_files_in(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -276,7 +277,7 @@ def move_all_files_out(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) destination = entry['destination'] + '/' + entry['name'] source = os.path.join(entry['source'], entry['name']) @@ -285,7 +286,7 @@ def move_all_files_out(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -321,16 +322,16 @@ def move(source, destination, dst_in=True, copysetup="", options=None): try: exit_code, stdout, stderr = execute(cmd, usecontainer=False, copytool=True) #, timeout=get_timeout(fspec.filesize)) - except Exception as e: + except Exception as error: if dst_in: exit_code = ErrorCodes.STAGEINFAILED else: exit_code = ErrorCodes.STAGEOUTFAILED - stdout = 'exception caught: e' % e + stdout = 'exception caught: e' % error stderr = '' logger.warning(stdout) - logger.info('exit_code=%d, stdout=%s, stderr=%s' % (exit_code, stdout, stderr)) + logger.info('exit_code=%d, stdout=%s, stderr=%s', exit_code, stdout, stderr) return exit_code, stdout, stderr diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 73093a92..3ff42143 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 # - David Cameron, david.cameron@cern.ch, 2018-2019 @@ -48,12 +48,12 @@ def create_output_list(files, init_dir, ddmconf): # resolve token value from fspec.ddmendpoint token = ddmconf.get(fspec.ddmendpoint).token if not token: - logger.info('No space token info for %s' % fspec.ddmendpoint) + logger.info('No space token info for %s', fspec.ddmendpoint) else: arcturl = re.sub(r'((:\d+)/)', r'\2;autodir=no;spacetoken=%s/' % token, arcturl) arcturl += ':checksumtype=%s:checksumvalue=%s' % (checksumtype, checksum) - logger.info('Adding to output.list: %s %s' % (fspec.lfn, arcturl)) + logger.info('Adding to output.list: %s %s', fspec.lfn, arcturl) # 
Write output.list with open(os.path.join(init_dir, 'output.list'), 'a') as f: f.write('%s %s\n' % (fspec.lfn, arcturl)) @@ -124,7 +124,7 @@ def copy_out(files, copy_type="mv", **kwargs): raise StageOutFailure(stdout) # Create output list for ARC CE if necessary - logger.debug('init_dir for output.list=%s' % os.path.dirname(kwargs.get('workdir'))) + logger.debug('init_dir for output.list=%s', os.path.dirname(kwargs.get('workdir'))) output_dir = kwargs.get('output_dir', '') if not output_dir: create_output_list(files, os.path.dirname(kwargs.get('workdir')), kwargs.get('ddmconf', None)) @@ -168,11 +168,11 @@ def move_all_files(files, copy_type, workdir): # resolve canonical path source = os.path.realpath(source) - logger.info("transferring file %s from %s to %s" % (name, source, destination)) + logger.info("transferring file %s from %s to %s", name, source, destination) exit_code, stdout, stderr = copy_method(source, destination) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) fspec.status = 'failed' if fspec.filetype == 'input': fspec.status_code = ErrorCodes.STAGEINFAILED diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index a8ccb38d..e13c20e4 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import json @@ -73,7 +73,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # :return: protocol as dictionary # """ # -# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)" % (fspec.lfn, fspec.ddmendpoint, activity)) +# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)", fspec.lfn, fspec.ddmendpoint, activity) # # activity = get_ddm_activity(activity) # protocols = ddm.arprotocols.get(activity) @@ -87,7 +87,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # logger.error(err) # raise PilotException(err) # protocol = protocols_allow[0] -# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s" % (fspec.lfn, fspec.ddmendpoint, activity, protocol)) +# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s", fspec.lfn, fspec.ddmendpoint, activity, protocol) # return protocol @@ -109,7 +109,7 @@ def copy_in(files, **kwargs): for fspec in files: cmd = [] - logger.info("To transfer file: %s" % fspec) + logger.info("To transfer file: %s", fspec) if fspec.protocol_id: ddm = ddmconf.get(fspec.ddmendpoint) if ddm: @@ -212,7 +212,7 @@ def copy_out(files, **kwargs): cwd = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(cwd, 'rucio_upload.json') if not os.path.exists(path): - logger.error('Failed to resolve Rucio summary JSON, wrong path? file=%s' % path) + logger.error('Failed to resolve Rucio summary JSON, wrong path? 
file=%s', path) else: with open(path, 'rb') as f: summary = json.load(f) diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 626821eb..675688e5 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -48,7 +48,7 @@ def verify_stage_out(fspec): from rucio.rse import rsemanager as rsemgr rse_settings = rsemgr.get_rse_info(fspec.ddmendpoint) uploaded_file = {'name': fspec.lfn, 'scope': fspec.scope} - logger.info('Checking file: %s' % str(fspec.lfn)) + logger.info('Checking file: %s', str(fspec.lfn)) return rsemgr.exists(rse_settings, [uploaded_file]) @@ -66,15 +66,16 @@ def copy_in(files, **kwargs): trace_report = kwargs.get('trace_report') use_pcache = kwargs.get('use_pcache') #job = kwargs.get('job') - #use_pcache = job.infosys.queuedata.use_pcache if job else False - logger.debug('use_pcache=%s' % use_pcache) # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + logger.debug('RUCIO_LOCAL_SITE_ID=%s', os.environ.get('RUCIO_LOCAL_SITE_ID', '')) + logger.debug('trace_report[localSite]=%s', trace_report.get_value('localSite')) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: - logger.info('rucio copytool, downloading file with scope:%s lfn:%s' % (str(fspec.scope), str(fspec.lfn))) + logger.info('rucio copytool, downloading file with scope:%s lfn:%s', str(fspec.scope), str(fspec.lfn)) # update the trace report localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) @@ -152,7 +153,7 @@ def copy_in(files, **kwargs): def get_protocol(trace_report_out): """ - Extract the protocol used for the transdfer from the dictionary returned by rucio. + Extract the protocol used for the transfer from the dictionary returned by rucio. :param trace_report_out: returned rucio transfer dictionary (dictionary). :return: protocol (string). 
@@ -160,8 +161,8 @@ def get_protocol(trace_report_out): try: p = trace_report_out[0].get('protocol') - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s' % error) p = '' return p @@ -481,21 +482,21 @@ def _stage_in_api(dst, fspec, trace_report, trace_report_out, transfer_timeout, result = download_client.download_pfns([f], 1, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) else: result = download_client.download_dids([f], trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) return ec, trace_report_out @@ -552,18 +553,18 @@ def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None): logger.info('*** rucio API downloading files (taking over logging) ***') try: result = download_client.download_pfns(file_list, num_threads, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout): @@ -607,31 +608,31 @@ def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, tra logger.debug('summary_file_path=%s' % summary_file_path) logger.debug('trace_report_out=%s' % trace_report_out) result = upload_client.upload([f], summary_file_path=summary_file_path, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API upload client failed ***') - logger.warning('caught exception: %s' % e) + logger.warning('caught exception: %s', error) import traceback logger.error(traceback.format_exc()) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 except UnboundLocalError: logger.warning('*** rucio API upload client failed ***') logger.warning('rucio still needs a bug fix of the summary in the uploadclient') else: logger.warning('*** rucio API upload client finished ***') - logger.debug('client 
returned %s' % result) + logger.debug('client returned %s', result) try: file_exists = verify_stage_out(fspec) logger.info('file exists at the storage: %s' % str(file_exists)) if not file_exists: raise StageOutFailure('physical check after upload failed') - except Exception as e: - msg = 'file existence verification failed with: %s' % e + except Exception as error: + msg = 'file existence verification failed with: %s' % error logger.info(msg) raise StageOutFailure(msg) diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 365f49cb..a0a480bc 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -81,7 +81,7 @@ def copy_in(files, **kwargs): bucket = 'bucket' # UPDATE ME path = os.path.join(dst, fspec.lfn) - logger.info('downloading object %s from bucket=%s to local file %s' % (fspec.lfn, bucket, path)) + logger.info('downloading object %s from bucket=%s to local file %s', fspec.lfn, bucket, path) status, diagnostics = download_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -113,12 +113,12 @@ def download_file(path, bucket, object_name=None): try: s3 = boto3.client('s3') s3.download_file(bucket, object_name, path) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -140,7 +140,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, fspec.lfn) if os.path.exists(path): bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, fspec.lfn)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, fspec.lfn) status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -181,12 +181,12 @@ def upload_file(file_name, bucket, object_name=None): s3_client = boto3.client('s3') #response = s3_client.upload_file(file_name, bucket, object_name) s3_client.upload_file(file_name, bucket, object_name) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index 9eafbfc5..bfcd2f75 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # Reimplemented by Alexey Anisenkov @@ -42,28 +42,28 @@ def _resolve_checksum_option(setup, **kwargs): if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to check xrdcp client version" % cmd) + logger.info("Execute command (%s) to check xrdcp client version", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info("return code: %s" % rcode) - logger.info("return output: %s" % (stdout + stderr)) + logger.info("return code: %s", rcode) + logger.info("return output: %s", stdout + stderr) 
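    # A note on the logging conversions in this hunk and throughout the pylint patch: the
    # preferred "lazy" form hands the arguments to the logger,
    #     logger.info("return code: %s", rcode)
    # instead of pre-formatting the message with %,
    #     logger.info("return code: %s" % rcode)
    # so the string is only interpolated when the record is actually emitted (this is the
    # behaviour pylint's logging-not-lazy check enforces; the example lines are taken from
    # the surrounding diff).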
cmd = "%s -h" % copy_command if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum.." % cmd) + logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum..", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) output = stdout + stderr - logger.info("return code: %s" % rcode) - logger.debug("return output: %s" % output) + logger.info("return code: %s", rcode) + logger.debug("return output: %s", output) coption = "" checksum_type = 'adler32' ## consider only adler32 for now if rcode: - logger.error('FAILED to execute command=%s: %s' % (cmd, output)) + logger.error('FAILED to execute command=%s: %s', cmd, output) else: if "--cksum" in output: coption = "--cksum %s:print" % checksum_type @@ -73,7 +73,7 @@ def _resolve_checksum_option(setup, **kwargs): coption = "-md5" if coption: - logger.info("Use %s option to get the checksum for %s command" % (coption, copy_command)) + logger.info("Use %s option to get the checksum for %s command", coption, copy_command) return coption @@ -96,7 +96,7 @@ def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, * #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout)) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info('rcode=%d, stdout=%s, stderr=%s' % (rcode, stdout, stderr)) + logger.info('rcode=%d, stdout=%s, stderr=%s', rcode, stdout, stderr) if rcode: ## error occurred error = resolve_common_transfer_errors(stdout + stderr, is_stagein=is_stagein) @@ -138,7 +138,8 @@ def copy_in(files, **kwargs): coption = _resolve_checksum_option(setup, **kwargs) trace_report = kwargs.get('trace_report') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint @@ -243,7 +244,7 @@ def get_file_info_from_output(output): return None, None, None if not ("xrootd" in output or "XRootD" in output or "adler32" in output): - logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s" % output) + logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s", output) return None, None, None pattern = r"(?Pmd5|adler32):\ (?P[a-zA-Z0-9]+)\ \S+\ (?P[0-9]+)" # Python 3 (added r) @@ -258,10 +259,10 @@ def get_file_info_from_output(output): if filesize: try: filesize = int(filesize) - except ValueError as e: - logger.warning('failed to convert filesize to int: %s' % e) + except ValueError as error: + logger.warning('failed to convert filesize to int: %s', error) filesize = None else: - logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s" % (pattern, output)) + logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s", pattern, output) return filesize, checksum, checksum_type diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index e9820e8b..23314ae6 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -592,3 +592,47 @@ def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) return pid + + +def list_hardware(): + """ + Execute lshw to list local hardware. 
+ + :return: lshw output (string). + """ + + exit_code, stdout, stderr = execute('lshw -numeric -C display', mute=True) + if 'Command not found' in stdout or 'Command not found' in stderr: + stdout = '' + return stdout + + +def get_display_info(): + """ + Extract the product and vendor from the lshw command. + E.g. + product: GD 5446 [1013:B8] + vendor: Cirrus Logic [1013] + -> GD 5446, Cirrus Logic + + :return: product (string), vendor (string). + """ + + vendor = '' + product = '' + stdout = list_hardware() + if stdout: + vendor_pattern = re.compile(r'vendor\:\ (.+)\ .') + product_pattern = re.compile(r'product\:\ (.+)\ .') + + for line in stdout.split('\n'): + if 'vendor' in line: + result = re.findall(vendor_pattern, line) + if result: + vendor = result[0] + elif 'product' in line: + result = re.findall(product_pattern, line) + if result: + product = result[0] + + return product, vendor diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9eaf93ef..667d188b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '44' # build number should be reset to '1' for every new development cycle +BUILD = '46' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py index 9f2f8c09..717d17f5 100644 --- a/pilot/util/tracereport.py +++ b/pilot/util/tracereport.py @@ -102,6 +102,13 @@ def init(self, job): exit_code, stdout, stderr = execute(cmd) self['uuid'] = stdout.replace('-', '') + def get_value(self, key): + """ + + """ + + return self.get(key, None) + def verify_trace(self): """ Verify the trace consistency. 
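The new get_value() accessor added to the trace report above is what the copytool changes in this patch series rely on: RUCIO_LOCAL_SITE_ID may be undefined inside middleware containers, so copy_in() falls back to the localSite value already carried by the trace report. A minimal sketch of the lookup pattern used in gfal.py, lsm.py, rucio.py and xrdcp.py (a fragment only: trace_report and fspec are the objects already passed to copy_in, and it assumes localSite was stored in the trace report earlier in the workflow):

    import os

    # the environment variable wins when defined, otherwise reuse the value from the trace report
    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite'))
    # per-file fallback to the ddm endpoint if neither source provided a value
    localsite = localsite if localsite else fspec.ddmendpoint
    trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize)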
diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index f23bb86f..e15ade4f 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -5,13 +5,13 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os import re -from pilot.util.disk import disk_usage from pilot.info import infosys +from pilot.util.disk import disk_usage import logging logger = logging.getLogger(__name__) @@ -36,8 +36,8 @@ def get_local_disk_space(path): if not diskpipe.close(): try: disk = float(disks.splitlines()[1].split()[3]) - except ValueError as e: - logger.warning('exception caught while trying to convert disk info: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert disk info: %s', error) return disk @@ -56,8 +56,8 @@ def get_meminfo(): if mems.upper().find("MEMTOTAL") != -1: try: mem = float(mems.split()[1]) / 1024 # value listed by command as kB, convert to MB - except ValueError as e: - logger.warning('exception caught while trying to convert meminfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert meminfo: %s', error) break mems = fd.readline() @@ -78,8 +78,8 @@ def get_cpuinfo(): if line.find("cpu MHz") != -1: # Python 2/3 try: cpu = float(line.split(":")[1]) - except ValueError as e: - logger.warning('exception caught while trying to convert cpuinfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert cpuinfo: %s', error) break # command info is the same for all cores, so break here return cpu @@ -114,21 +114,21 @@ def get_disk_space(queuedata): # --- non Job related queue data # jobinfo provider is required to consider overwriteAGIS data coming from Job _maxinputsize = infosys.queuedata.maxwdir - logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B", _maxinputsize) _maxinputsize = queuedata.maxwdir - logger.debug("resolved value: queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value: queuedata.maxwdir=%s B", _maxinputsize) try: du = disk_usage(os.path.abspath(".")) _diskspace = int(du[2] / (1024 * 1024)) # need to convert from B to MB - except ValueError as e: - logger.warning("failed to extract disk space: %s (will use schedconfig default)" % e) + except ValueError as error: + logger.warning("failed to extract disk space: %s (will use schedconfig default)", error) _diskspace = _maxinputsize else: - logger.info("available WN disk space: %d MB" % (_diskspace)) + logger.info("available WN disk space: %d MB", _diskspace) _diskspace = min(_diskspace, _maxinputsize) - logger.info("sending disk space %d MB to dispatcher" % (_diskspace)) + logger.info("sending disk space %d MB to dispatcher", _diskspace) return _diskspace @@ -226,5 +226,3 @@ def check_hz(): import traceback logger.fatal('failed to read SC_CLK_TCK - will not be able to perform CPU consumption calculation') logger.warning(traceback.format_exc()) - else: - logger.debug('SC_CLK_TCK=%s' % str(hz)) diff --git a/pilot/workflow/generic_hpc.py b/pilot/workflow/generic_hpc.py index 8b567599..caf3309f 100644 --- a/pilot/workflow/generic_hpc.py +++ b/pilot/workflow/generic_hpc.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016 -# - Paul Nilsson, paul.nilsson@cern.ch, 
2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 # - Danila Oleynik danila.oleynik@cern.ch, 2018 import functools @@ -48,9 +48,9 @@ def interrupt(args, signum, frame): """ try: - logger.info('caught signal: %s' % [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 + logger.info('caught signal: %s', [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 except Exception: - logger.info('caught signal: %s' % [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 + logger.info('caught signal: %s', [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 args.graceful_stop.set() @@ -212,11 +212,11 @@ def run(args): logger.debug("Final report: {0}".format(work_report)) add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args) - except Exception as e: + except Exception as error: work_report["jobStatus"] = "failed" - work_report["exitMsg"] = str(e) + work_report["exitMsg"] = str(error) publish_work_report(work_report, worker_attributes_file) - logging.exception('exception caught:') + logging.exception('exception caught: %s', error) traces.pilot['state'] = FAILURE return traces From a77cd07f1dc162c0062a494cc1b041d54d916974 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 11:36:59 +0200 Subject: [PATCH 73/96] Pylint corrections. Fixes for localSite problem in traces --- pilot/control/job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 449af861..77ea47ab 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1974,7 +1974,7 @@ def has_job_completed(queues, args): make_job_report(job) cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') logger.debug('%s:\n', cmd) - ec, stdout, stderr = execute(cmd) + _, stdout, _ = execute(cmd) logger.debug(stdout) queue_report(queues) @@ -2375,7 +2375,7 @@ def interceptor(queues, traces, args): # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = queues.monitored_payloads.queue if jobs: - for i in range(len(jobs)): + for _ in range(len(jobs)): logger.info('interceptor loop %d: looking for communication file', n) time.sleep(30) From cb0adac8757468d0e5a6983f4931acf1058485d9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 13:25:05 +0200 Subject: [PATCH 74/96] Pylint corrections --- pilot/control/data.py | 133 ++++++++++++------------- pilot/control/job.py | 35 +++---- pilot/control/monitor.py | 29 +++--- pilot/control/payload.py | 37 ++++--- pilot/control/payloads/eventservice.py | 6 +- pilot/control/payloads/generic.py | 30 +++--- 6 files changed, 133 insertions(+), 137 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 6f820d33..20151b92 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -271,7 +271,7 @@ def get_rse(data, lfn=""): return rse -def stage_in_auto(site, files): +def stage_in_auto(files): """ Separate dummy implementation for automatic stage-in outside of pilot workflows. 
Should be merged with regular stage-in functionality later, but we need to have @@ -289,47 +289,47 @@ def stage_in_auto(site, files): '--no-subdir'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['destination']): - f['status'] = 'failed' - f['errmsg'] = 'Destination directory does not exist: %s' % f['destination'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['destination']): + _file['status'] = 'failed' + _file['errmsg'] = 'Destination directory does not exist: %s' % _file['destination'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully downloaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully downloaded.' + _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--dir', f['destination']] - tmp_executable.append('%s:%s' % (f['scope'], - f['name'])) + tmp_executable += ['--dir', _file['destination']] + tmp_executable.append('%s:%s' % (_file['scope'], + _file['name'])) process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - f['errno'] = 2 + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully downloaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully downloaded.' else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -337,7 +337,7 @@ def stage_in_auto(site, files): return files -def stage_out_auto(site, files): +def stage_out_auto(files): """ Separate dummy implementation for automatic stage-out outside of pilot workflows. Should be merged with regular stage-out functionality later, but we need to have @@ -351,63 +351,60 @@ def stage_out_auto(site, files): 'rucio', '-v', 'upload'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['file']): - f['status'] = 'failed' - f['errmsg'] = 'Source file does not exist: %s' % f['file'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['file']): + _file['status'] = 'failed' + _file['errmsg'] = 'Source file does not exist: %s' % _file['file'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully uploaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully uploaded.' 
+ _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--rse', f['rse']] + tmp_executable += ['--rse', _file['rse']] - if 'no_register' in list(f.keys()) and f['no_register']: # Python 2/3 + if 'no_register' in list(_file.keys()) and _file['no_register']: # Python 2/3 tmp_executable += ['--no-register'] - if 'summary' in list(f.keys()) and f['summary']: # Python 2/3 + if 'summary' in list(_file.keys()) and _file['summary']: # Python 2/3 tmp_executable += ['--summary'] - if 'lifetime' in list(f.keys()): # Python 2/3 - tmp_executable += ['--lifetime', str(f['lifetime'])] + if 'lifetime' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--lifetime', str(_file['lifetime'])] - if 'guid' in list(f.keys()): # Python 2/3 - tmp_executable += ['--guid', f['guid']] + if 'guid' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--guid', _file['guid']] - if 'attach' in list(f.keys()): # Python 2/3 - tmp_executable += ['--scope', f['scope'], '%s:%s' % (f['attach']['scope'], f['attach']['name']), f['file']] + if 'attach' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--scope', _file['scope'], '%s:%s' % (_file['attach']['scope'], _file['attach']['name']), _file['file']] else: - tmp_executable += ['--scope', f['scope'], f['file']] + tmp_executable += ['--scope', _file['scope'], _file['file']] - process = subprocess.Popen(tmp_executable, - bufsize=-1, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - f['errno'] = 2 + process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully uploaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully uploaded.' 
else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -478,16 +475,16 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') logger.debug('[before xcache start] stdout=%s', _stdout) logger.debug('[before xcache start] stderr=%s', _stderr) - exit_code, stdout, stderr = execute(cmd.get('command')) + _, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s', stdout) logger.debug('stderr=%s', stderr) # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') logger.debug('[after xcache start] stdout=%s', _stdout) logger.debug('[after xcache start] stderr=%s', _stderr) @@ -711,7 +708,7 @@ def filter_files_for_log(directory): """ filtered_files = [] maxfilesize = 10 - for root, dirnames, filenames in os.walk(directory): + for root, _, filenames in os.walk(directory): for filename in filenames: location = os.path.join(root, filename) if os.path.exists(location): # do not include broken links @@ -752,8 +749,8 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode) # remove any present input/output files before tarring up workdir - for f in input_files + output_files: - path = os.path.join(workdir, f) + for fname in input_files + output_files: + path = os.path.join(workdir, fname) if os.path.exists(path): logger.info('removing file: %s', path) remove(path) @@ -768,9 +765,9 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out logger.info('will create archive %s', fullpath) try: cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" 
% (fullpath, tarball_name) - exit_code, stdout, stderr = execute(cmd) - except Exception as e: - raise LogFileCreationFailure(e) + _, stdout, _ = execute(cmd) + except Exception as error: + raise LogFileCreationFailure(error) else: if pilot_home != current_dir: os.chdir(pilot_home) diff --git a/pilot/control/job.py b/pilot/control/job.py index 77ea47ab..cf6b8394 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -17,6 +17,7 @@ import hashlib import random import socket +import logging try: import Queue as queue # noqa: N813 @@ -56,9 +57,7 @@ from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() @@ -351,7 +350,6 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) except Exception as error: logger.warning('exception caught while sending https request: %s', error) logger.warning('possibly offending data: %s', data) - pass if final: os.environ['SERVER_UPDATE'] = SERVER_UPDATE_TROUBLE @@ -723,7 +721,7 @@ def get_debug_stdout(job): return get_general_command_stdout(job) else: # general command, execute and return output - exit_code, stdout, stderr = execute(job.debug_command) + _, stdout, _ = execute(job.debug_command) logger.info('debug_command: %s:\n\n%s\n', job.debug_command, stdout) return stdout @@ -756,7 +754,7 @@ def get_general_command_stdout(job): except Exception as error: logger.warning('general containerisation threw an exception: %s', error) else: - ec, stdout, stderr = execute(job.debug_command) + _, stdout, stderr = execute(job.debug_command) logger.debug("%s (stdout):\n\n%s\n\n", job.debug_command, stdout) logger.debug("%s (stderr):\n\n%s\n\n", job.debug_command, stderr) @@ -790,7 +788,7 @@ def get_ls(debug_command, workdir): finalpath = os.path.join(workdir, path) debug_command = debug_command.replace(path, finalpath) - ec, stdout, stderr = execute(debug_command) + _, stdout, _ = execute(debug_command) logger.debug("%s:\n\n%s\n\n", debug_command, stdout) return stdout @@ -933,7 +931,6 @@ def add_memory_info(data, workdir, name=""): data.update(utility_node) except Exception as error: logger.info('memory information not available: %s', error) - pass def remove_pilot_logs_from_list(list_of_files): @@ -1142,8 +1139,8 @@ def delayed_space_check(queues, traces, args, job): proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False if proceed_with_local_space_check: logger.debug('pilot will now perform delayed space check') - ec, diagnostics = check_local_space() - if ec != 0: + exit_code, diagnostics = check_local_space() + if exit_code != 0: traces.pilot['error_code'] = errors.NOLOCALSPACE # set the corresponding error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) @@ -1398,8 +1395,8 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # pilot can report the error with a server update) proceed_with_local_space_check = False if (submitmode.lower() == 'push' and update_server) else True if proceed_with_local_space_check: - ec, diagnostics = check_local_space() - if ec != 0: + exit_code, diagnostics = check_local_space() + if exit_code != 0: traces.pilot['error_code'] = errors.NOLOCALSPACE return False else: @@ -1515,8 +1512,8 @@ def 
get_job_definition_from_file(path, harvester): datalist = parse_qsl(response, keep_blank_values=True) # convert to dictionary - for d in datalist: - res[d[0]] = d[1] + for data in datalist: + res[data[0]] = data[1] if os.path.exists(path): remove(path) @@ -1716,11 +1713,11 @@ def get_fake_job(input=True): 'destinationDblock': job_name, 'dispatchDBlockToken': 'NULL', 'jobPars': '-a sources.20115461.derivation.tgz -r ./ -j "Reco_tf.py ' - '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' - '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' - ' --useLocalIO --accessmode=copy -o ' - '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' - '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), + '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' + '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' + ' --useLocalIO --accessmode=copy -o ' + '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' + '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), 'attemptNr': '0', 'swRelease': 'Atlas-20.7.6', 'nucleus': 'NULL', @@ -1845,7 +1842,7 @@ def retrieve(queues, traces, args): # noqa: C901 delay = get_job_retrieval_delay(args.harvester) if not args.harvester: logger.warning('did not get a job -- sleep %d s and repeat', delay) - for i in range(delay): + for _ in range(delay): if args.graceful_stop.is_set(): break time.sleep(1) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 390776ac..99ba180b 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -41,15 +41,15 @@ def control(queues, traces, args): :return: """ - t0 = time.time() - traces.pilot['lifetime_start'] = t0 # ie referring to when pilot monitoring began - traces.pilot['lifetime_max'] = t0 + t_0 = time.time() + traces.pilot['lifetime_start'] = t_0 # ie referring to when pilot monitoring began + traces.pilot['lifetime_max'] = t_0 threadchecktime = int(config.Pilot.thread_check) # for CPU usage debugging cpuchecktime = int(config.Pilot.cpu_check) - tcpu = t0 + tcpu = t_0 queuedata = get_queuedata_from_job(queues) max_running_time = get_max_running_time(args.lifetime, queuedata) @@ -74,8 +74,7 @@ def control(queues, traces, args): time_since_start = get_time_since_start(args) grace_time = 10 * 60 if time_since_start - grace_time > max_running_time: - logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' % - (max_running_time, grace_time)) + logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot', max_running_time, grace_time) logger.info('setting REACHED_MAXTIME and graceful stop') environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME' # TODO: use singleton instead # do not set graceful stop if pilot has not finished sending the final job update @@ -109,7 +108,7 @@ def control(queues, traces, args): if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0: # get all threads for thread in threading.enumerate(): - # logger.info('thread name: %s' % thread.name) + # logger.info('thread name: %s', thread.name) if not thread.is_alive(): logger.fatal('thread \'%s\' is not alive', thread.name) # args.graceful_stop.set() @@ -150,14 +149,14 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): """ processes = [] - n = 0 + num = 0 if not user: user = getuid() pattern = re.compile(r"\S+|[-+]?\d*\.\d+|\d+") arguments = ['ps', '-u', 
user, args, '--no-headers'] process = Popen(arguments, stdout=PIPE, stderr=PIPE) - stdout, notused = process.communicate() + stdout, _ = process.communicate() for line in stdout.splitlines(): found = re.findall(pattern, line) if found is not None: @@ -166,12 +165,12 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): mem = found[3] command = ' '.join(found[10:]) if cmd in command: - n += 1 + num += 1 if processid == str(pid): processes = [cpu, mem, command] if processes: - processes.append(n) + processes.append(num) return processes @@ -194,8 +193,8 @@ def run_checks(queues, args): t_max = 2 * 60 logger.warning('pilot monitor received instruction that abort_job has been requested') logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) - t0 = time.time() - while time.time() - t0 < t_max: + t_0 = time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() @@ -211,8 +210,8 @@ def run_checks(queues, args): if not args.job_aborted.is_set(): logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) t_max = 10 - t0 = time.time() - while time.time() - t0 < t_max: + t_0 = time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 33029f0c..b51063df 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -203,7 +203,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid] if len(peek) == 0: put_in_queue(job, queues.validated_payloads) - for i in range(10): # Python 3 + for _ in range(10): # Python 3 if args.graceful_stop.is_set(): break time.sleep(1) @@ -329,8 +329,7 @@ def set_cpu_consumption_time(job): job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconsumptionunit = "s" job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time: %f %s (rounded to %d %s)' % - (cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit)) + logger.info('CPU consumption time: %f %s (rounded to %d %s)', cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit) def perform_initial_payload_error_analysis(job, exit_code): @@ -345,7 +344,7 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" - ec = 0 + exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": @@ -358,14 +357,14 @@ def perform_initial_payload_error_analysis(job, exit_code): fatal = True if msg != "": logger.warning("extracted message from stderr:\n%s", msg) - ec = set_error_code_from_stderr(msg, fatal) + exit_code = set_error_code_from_stderr(msg, fatal) - if not ec: - ec = errors.resolve_transform_error(exit_code, stderr) - if ec != 0: + if not exit_code: + exit_code = errors.resolve_transform_error(exit_code, stderr) + if exit_code != 0: if msg: - msg = errors.format_diagnostics(ec, msg) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=msg) + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) else: if job.piloterrorcodes: 
logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) @@ -390,23 +389,23 @@ def set_error_code_from_stderr(msg, fatal): """ if "Failed invoking the NEWUSER namespace runtime" in msg: - ec = errors.SINGULARITYNEWUSERNAMESPACE + exit_code = errors.SINGULARITYNEWUSERNAMESPACE elif "Failed to create user namespace" in msg: - ec = errors.SINGULARITYFAILEDUSERNAMESPACE + exit_code = errors.SINGULARITYFAILEDUSERNAMESPACE elif "command not found" in msg: - ec = errors.TRANSFORMNOTFOUND + exit_code = errors.TRANSFORMNOTFOUND elif "SL5 is unsupported" in msg: - ec = errors.UNSUPPORTEDSL5OS + exit_code = errors.UNSUPPORTEDSL5OS elif "resource temporarily unavailable" in msg: - ec = errors.SINGULARITYRESOURCEUNAVAILABLE + exit_code = errors.SINGULARITYRESOURCEUNAVAILABLE elif "unrecognized arguments" in msg: - ec = errors.UNRECOGNIZEDTRFARGUMENTS + exit_code = errors.UNRECOGNIZEDTRFARGUMENTS elif fatal: - ec = errors.UNRECOGNIZEDTRFSTDERR + exit_code = errors.UNRECOGNIZEDTRFSTDERR else: - ec = 0 + exit_code = 0 - return ec + return exit_code def validate_post(queues, traces, args): diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 3e0390d3..dc36ec72 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -100,15 +100,15 @@ def wait_graceful(self, args, proc): :return: """ - t1 = time.time() + t_1 = time.time() while proc.is_alive(): if args.graceful_stop.is_set(): logger.debug("Graceful stop is set, stopping work executor") proc.stop() break - if time.time() > t1 + 300: # 5 minutes + if time.time() > t_1 + 300: # 5 minutes logger.info("Process is still running") - t1 = time.time() + t_1 = time.time() time.sleep(2) while proc.is_alive(): diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 94fba2af..747e6acb 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -349,7 +349,7 @@ def run_command(self, cmd, label=None): except Exception as error: logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute command') return None @@ -381,7 +381,7 @@ def run_payload(self, job, cmd, out, err): except Exception as error: logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute payload') return None @@ -405,13 +405,17 @@ def extract_setup(self, cmd): :return: updated secondary command (string). 
""" - def cut_str_from(_cmd, s): - # cut the string from the position of the given _cmd - return _cmd[:_cmd.find(s)] + def cut_str_from(_cmd, _str): + """ + Cut the string from the position of the given _cmd + """ + return _cmd[:_cmd.find(_str)] def cut_str_from_last_semicolon(_cmd): - # cut the string from the last semicolon - # NOTE: this will not work if jobParams also contain ; + """ + Cut the string from the last semicolon + NOTE: this will not work if jobParams also contain ; + """ # remove any trailing spaces and ;-signs _cmd = _cmd.strip() _cmd = _cmd[:-1] if _cmd.endswith(';') else _cmd @@ -452,7 +456,7 @@ def wait_graceful(self, args, proc): time.sleep(0.1) iteration += 1 - for i in range(60): # Python 2/3 + for _ in range(60): # Python 2/3 if args.graceful_stop.is_set(): breaker = True logger.info('breaking -- sending SIGTERM pid=%s', proc.pid) @@ -519,9 +523,9 @@ def run_preprocess(self, job): try: # note: this might update the jobparams cmd_before_payload = self.utility_before_payload(job) - except Exception as e: - logger.error(e) - raise e + except Exception as error: + logger.error(error) + raise error if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload @@ -685,8 +689,8 @@ def run_utility_after_payload_finished(self, state, order): exit_code = 0 try: cmd_after_payload, label = self.utility_after_payload_finished(self.__job, order) - except Exception as e: - logger.error(e) + except Exception as error: + logger.error(error) else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload From 5c09bb5e4b9381bd81dad4ebddb810494ad3b197 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 15:03:40 +0200 Subject: [PATCH 75/96] Flake8 correction. UTF-8 fix for Popen --- pilot/util/container.py | 11 +++++------ pilot/util/workernode.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pilot/util/container.py b/pilot/util/container.py index 3ab76b66..0a6c20b0 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -80,12 +80,11 @@ def execute(executable, **kwargs): exe = ['/bin/bash', '-c', executable] # try: intercept exception such as OSError -> report e.g. 
error.RESOURCEUNAVAILABLE: "Resource temporarily unavailable" - process = subprocess.Popen(exe, - bufsize=-1, - stdout=stdout, - stderr=stderr, - cwd=cwd, - preexec_fn=setpgrp) #setsid) + if is_python3(): + process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp, encoding='utf-8') # Python 3 + else: + process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp) # Python 2 + if returnproc: return process else: diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index e15ade4f..e96d3ba9 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -221,7 +221,7 @@ def check_hz(): """ try: - hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + _ = os.sysconf(os.sysconf_names['SC_CLK_TCK']) except Exception: import traceback logger.fatal('failed to read SC_CLK_TCK - will not be able to perform CPU consumption calculation') From c70d879fb21cce147ed0233e8c58c400ddd833ee Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 10:56:05 +0200 Subject: [PATCH 76/96] Fix pylint issues --- pilot/user/atlas/common.py | 1413 ++++++++++++++++++++++-------------- 1 file changed, 857 insertions(+), 556 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1a1344de..d02faaf8 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -8,40 +8,74 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 -import os -import re -import fnmatch from collections import defaultdict +import fnmatch from glob import glob +import logging +import os +import re +from random import randint from signal import SIGTERM, SIGUSR1 +from typing import Type +# from tarfile import ExFileObject try: from functools import reduce # Python 3 -except Exception: +except ImportError: pass from .container import create_root_container_command from .dbrelease import get_dbrelease_version, create_dbrelease -from .setup import should_pilot_prepare_setup, is_standard_atlas_job, get_asetup,\ - set_inds, get_analysis_trf, get_payload_environment_variables, replace_lfns_with_turls -from .utilities import get_memory_monitor_setup, get_network_monitor_setup, post_memory_monitor_action,\ - get_memory_monitor_summary_filename, get_prefetcher_setup, get_benchmark_setup, get_memory_monitor_output_filename,\ - get_metadata_dict_from_txt - -from pilot.util.auxiliary import get_resource_name, show_memory_usage +from .setup import ( + should_pilot_prepare_setup, + is_standard_atlas_job, + get_asetup, + set_inds, + get_analysis_trf, + get_payload_environment_variables, + replace_lfns_with_turls, +) +from .utilities import ( + get_memory_monitor_setup, + get_network_monitor_setup, + post_memory_monitor_action, + get_memory_monitor_summary_filename, + get_prefetcher_setup, + get_benchmark_setup, + get_memory_monitor_output_filename, + get_metadata_dict_from_txt, +) + +from pilot.util.auxiliary import get_resource_name, show_memory_usage, is_python3 from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException -from pilot.util.auxiliary import is_python3 from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ - UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 +from pilot.util.constants import ( + UTILITY_BEFORE_PAYLOAD, + UTILITY_WITH_PAYLOAD, 
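The execute() change above opens the subprocess pipes in text mode on Python 3 by passing encoding='utf-8', so communicate() returns str rather than bytes and downstream string handling keeps working unchanged. A minimal illustration of the difference (Python 3.6+):

    import subprocess

    # without an encoding, Python 3 pipes return bytes
    raw = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE).communicate()[0]
    print(type(raw))   # <class 'bytes'>

    # with encoding='utf-8' the pipes are text streams and return str
    txt = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE,
                           encoding='utf-8').communicate()[0]
    print(type(txt))   # <class 'str'>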
+ UTILITY_AFTER_PAYLOAD_STARTED, + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_STARTED2, + UTILITY_BEFORE_STAGEIN, + UTILITY_AFTER_PAYLOAD_FINISHED2 +) from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ - copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum -from pilot.util.processes import convert_ps_to_dict, find_cmd_pids, get_trimmed_dictionary, find_pid, is_child +from pilot.util.filehandling import ( + copy, copy_pilot_source, calculate_checksum, + get_guid, get_local_file_size, + remove, remove_dir_tree, remove_core_dumps, read_file, read_json, + update_extension, + write_file, + # read_list +) +from pilot.util.processes import ( + convert_ps_to_dict, + find_pid, find_cmd_pids, + get_trimmed_dictionary, + is_child +) from pilot.util.tracereport import TraceReport -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -49,8 +83,9 @@ def sanity_check(): """ - Perform an initial sanity check before doing anything else in a given workflow. - This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort + Perform an initial sanity check before doing anything else in a + given workflow. This function can be used to verify importing of + modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. :return: exit code (0 if all is ok, otherwise non-zero exit code). @@ -61,7 +96,8 @@ def sanity_check(): #try: # from rucio.client.downloadclient import DownloadClient # from rucio.client.uploadclient import UploadClient - # # note: must do something with Download/UploadClients or flake8 will complain - but do not instantiate + # # note: must do something with Download/UploadClients or flake8 + # will complain - but do not instantiate #except Exception as e: # logger.warning('sanity check failed: %s' % e) # exit_code = errors.MIDDLEWAREIMPORTFAILURE @@ -81,7 +117,9 @@ def validate(job): status = True if 'DBRelease' in job.jobparams: - logger.debug('encountered DBRelease info in job parameters - will attempt to create a local DBRelease file') + logger.debug(( + 'encountered DBRelease info in job parameters - ' + 'will attempt to create a local DBRelease file')) version = get_dbrelease_version(job.jobparams) if version: status = create_dbrelease(version, job.workdir) @@ -94,20 +132,22 @@ def validate(job): if status: if job.imagename and job.imagename.startswith('/'): if os.path.exists(job.imagename): - logger.info('verified that image exists: %s' % job.imagename) + logger.info('verified that image exists: %s', job.imagename) else: status = False - logger.warning('image does not exist: %s' % job.imagename) + logger.warning('image does not exist: %s', job.imagename) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.IMAGENOTFOUND) # cleanup job parameters if only copy-to-scratch #if job.only_copy_to_scratch(): # logger.debug('job.params=%s' % job.jobparams) # if ' --usePFCTurl' in job.jobparams: - # logger.debug('cleaning up --usePFCTurl from job parameters since all input is copy-to-scratch') + # logger.debug('cleaning up --usePFCTurl from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --usePFCTurl', '') # if ' --directIn' in job.jobparams: - # logger.debug('cleaning up --directIn from job parameters since all input is 
copy-to-scratch') + # logger.debug('cleaning up --directIn from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --directIn', '') return status @@ -122,7 +162,7 @@ def open_remote_files(indata, workdir): :return: exit code (int), diagnostics (string). """ - ec = 0 + exitcode = 0 diagnostics = "" not_opened = "" @@ -140,22 +180,25 @@ def open_remote_files(indata, workdir): final_script_path = os.path.join(workdir, script) os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) - d1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) - d2 = os.path.join(workdir, script_path) - full_script_path = d1 if os.path.exists(d1) else d2 + dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) + dir2 = os.path.join(workdir, script_path) + full_script_path = dir1 if os.path.exists(dir1) else dir2 if not os.path.exists(full_script_path): # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - script path does not exist: %s' % full_script_path + diagnostics = ( + 'cannot perform file open test - script path does ' + 'not exist: %s' % full_script_path + ) logger.warning(diagnostics) - logger.warning('tested both path=%s and path=%s (none exists)' % (d1, d2)) - return ec, diagnostics, not_opened + logger.warning('tested both path=%s and path=%s (none exists)', dir1, dir2) + return exitcode, diagnostics, not_opened try: copy(full_script_path, final_script_path) - except Exception as e: + except PilotException as exc: # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % e + diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % exc logger.warning(diagnostics) - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened else: # correct the path when containers have been used final_script_path = os.path.join('.', script) @@ -165,38 +208,45 @@ def open_remote_files(indata, workdir): show_memory_usage() - logger.info('*** executing file open verification script:\n\n\'%s\'\n\n' % cmd) + logger.info('*** executing file open verification script:\n\n\'%s\'\n\n', cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) if config.Pilot.remotefileverification_log: - write_file(os.path.join(workdir, config.Pilot.remotefileverification_log), stdout + stderr, mute=False) + fpath = os.path.join(workdir, config.Pilot.remotefileverification_log) + write_file(fpath, stdout + stderr, mute=False) show_memory_usage() # error handling if exit_code: - logger.warning('script %s finished with ec=%d' % (script, exit_code)) + logger.warning('script %s finished with ec=%d', script, exit_code) else: - dictionary_path = os.path.join(workdir, config.Pilot.remotefileverification_dictionary) + dictionary_path = os.path.join( + workdir, + config.Pilot.remotefileverification_dictionary + ) if not dictionary_path: - logger.warning('file does not exist: %s' % dictionary_path) + logger.warning('file does not exist: %s', dictionary_path) else: file_dictionary = read_json(dictionary_path) if not file_dictionary: - logger.warning('could not read dictionary from %s' % dictionary_path) + logger.warning('could not read dictionary from %s', dictionary_path) else: not_opened = "" for turl in file_dictionary: opened = file_dictionary[turl] - logger.info('turl could be 
opened: %s' % turl) if opened else logger.info('turl could not be opened: %s' % turl) if not opened: + logger.info('turl could not be opened: %s', turl) not_opened += turl if not not_opened else ",%s" % turl + else: + logger.info('turl could be opened: %s', turl) + if not_opened: - ec = errors.REMOTEFILECOULDNOTBEOPENED + exitcode = errors.REMOTEFILECOULDNOTBEOPENED diagnostics = "Remote file could not be opened: %s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened def get_file_open_command(script_path, turls): @@ -217,19 +267,22 @@ def extract_turls(indata): :return: comma-separated list of turls (string). """ - turls = "" - for f in indata: - if f.status == 'remote_io': - turls += f.turl if not turls else ",%s" % f.turl + # turls = "" + # for filespc in indata: + # if filespc.status == 'remote_io': + # turls += filespc.turl if not turls else ",%s" % filespc.turl + # return turls - return turls + return ",".join( + fspec.turl for fspec in indata if fspec.status == 'remote_io' + ) def process_remote_file_traces(path, job, not_opened_turls): """ Report traces for remote files. - The function reads back the base trace report (common part of all traces) and updates it per file before reporting - it to the Rucio server. + The function reads back the base trace report (common part of all traces) + and updates it per file before reporting it to the Rucio server. :param path: path to base trace report (string). :param job: job object. @@ -239,8 +292,8 @@ def process_remote_file_traces(path, job, not_opened_turls): try: base_trace_report = read_json(path) - except PilotException as e: - logger.warning('failed to open base trace report (cannot send trace reports): %s' % e) + except PilotException as exc: + logger.warning('failed to open base trace report (cannot send trace reports): %s', exc) else: if not base_trace_report: logger.warning('failed to read back base trace report (cannot send trace reports)') @@ -262,13 +315,13 @@ def process_remote_file_traces(path, job, not_opened_turls): if trace_report: trace_report.send() else: - logger.warning('failed to create trace report for turl=%s' % fspec.turl) + logger.warning('failed to create trace report for turl=%s', fspec.turl) def get_payload_command(job): """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload, including the + sourcing of all setup files and setting of environment variables. :param job: job object. :raises PilotException: TrfDownloadFailure. @@ -285,52 +338,55 @@ def get_payload_command(job): # Is it a user job or not? 
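The rewritten extract_turls() above replaces incremental string concatenation with a single join over a generator expression. A small standalone check that the two forms agree, using a namedtuple as a stand-in for FileSpec (an assumption made only for this sketch):

    from collections import namedtuple

    FSpec = namedtuple('FSpec', ['turl', 'status'])
    indata = [FSpec('root://a//f1.root', 'remote_io'),
              FSpec('root://a//f2.root', 'transferred'),
              FSpec('root://a//f3.root', 'remote_io')]

    # old form: incremental concatenation
    turls = ""
    for fspec in indata:
        if fspec.status == 'remote_io':
            turls += fspec.turl if not turls else ",%s" % fspec.turl

    # new form: one join over a generator expression
    joined = ",".join(fspec.turl for fspec in indata if fspec.status == 'remote_io')
    assert turls == joined  # 'root://a//f1.root,root://a//f3.root'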
userjob = job.is_analysis() - logger.info('pilot is running a user analysis job') if userjob else logger.info('pilot is running a production job') + logger.info('pilot is running a %s job', 'user analysis' if userjob else 'production') resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + + # Python 3, level -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) # get the general setup command and then verify it if required cmd = resource.get_setup_command(job, preparesetup) if cmd: - ec, diagnostics = resource.verify_setup_command(cmd) - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) - raise PilotException(diagnostics, code=ec) + exitcode, diagnostics = resource.verify_setup_command(cmd) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) + raise PilotException(diagnostics, code=exitcode) # make sure that remote file can be opened before executing payload catchall = job.infosys.queuedata.catchall.lower() if job.infosys.queuedata.catchall else '' if config.Pilot.remotefileverification_log and 'remoteio_test=false' not in catchall: - ec = 0 + exitcode = 0 diagnostics = "" not_opened_turls = "" try: - ec, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) - except Exception as e: - logger.warning('caught exception: %s' % e) + exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) + except PilotException as exc: + logger.warning('caught exception: %s', exc) else: # read back the base trace report path = os.path.join(job.workdir, config.Pilot.base_trace_report) if not os.path.exists(path): - logger.warning('base trace report does not exist (%s) - input file traces should already have been sent' % path) + logger.warning(( + 'base trace report does not exist (%s) - input file ' + 'traces should already have been sent', path)) else: process_remote_file_traces(path, job, not_opened_turls) # fail the job if the remote files could not be verified - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=diagnostics) - raise PilotException(diagnostics, code=ec) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode, msg=diagnostics) + raise PilotException(diagnostics, code=exitcode) else: logger.debug('no remote file open verification') if is_standard_atlas_job(job.swrelease): - # Normal setup (production and user jobs) logger.info("preparing normal production/analysis job setup command") cmd = get_normal_payload_command(cmd, job, preparesetup, userjob) - - else: # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease - + else: + # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease logger.info("generic job (non-ATLAS specific or with undefined swRelease)") cmd = get_generic_payload_command(cmd, job, preparesetup, userjob) @@ -341,7 +397,8 @@ def get_payload_command(job): # only if not using a user container if not job.imagename: site = os.environ.get('PILOT_SITENAME', '') - variables = get_payload_environment_variables(cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, userjob) + variables = get_payload_environment_variables( + cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, 
userjob) cmd = ''.join(variables) + cmd # prepend PanDA job id in case it is not there already (e.g. runcontainer jobs) @@ -350,24 +407,36 @@ def get_payload_command(job): cmd = cmd.replace(';;', ';') - # For direct access in prod jobs, we need to substitute the input file names with the corresponding TURLs + # For direct access in prod jobs, we need to substitute the input file names + # with the corresponding TURLs # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) #if not userjob and use_direct_access and job.transfertype == 'direct': - if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic + + ## ported from old logic + if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic but still it looks strange (anisyonk) - ## the "PoolFileCatalog.xml" should already contains proper TURLs values as it created by create_input_file_metadata() - ## if the case is just to patch `writetofile` file, than logic should be cleaned and decoupled - ## anyway, instead of parsing the file, it's much more easy to generate properly `writetofile` content from the beginning with TURL data + ## the "PoolFileCatalog.xml" should already contains proper TURLs + ## values as it created by create_input_file_metadata() if the case + ## is just to patch `writetofile` file, than logic should be cleaned + ## and decoupled anyway, instead of parsing the file, it's much easier + ## to generate properly `writetofile` content from the beginning + ## with TURL data lfns = job.get_lfns_and_guids()[0] - cmd = replace_lfns_with_turls(cmd, job.workdir, "PoolFileCatalog.xml", lfns, writetofile=job.writetofile) + cmd = replace_lfns_with_turls( + cmd, + job.workdir, + "PoolFileCatalog.xml", + lfns, + writetofile=job.writetofile + ) # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) show_memory_usage() - logger.info('payload run command: %s' % cmd) + logger.info('payload run command: %s', cmd) return cmd @@ -379,27 +448,30 @@ def get_normal_payload_command(cmd, job, preparesetup, userjob): :param cmd: any preliminary command setup (string). :param job: job object. :param userjob: True for user analysis jobs, False otherwise (bool). - :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters. + :param preparesetup: True if the pilot should prepare the setup, + False if already in the job parameters. :return: normal payload command (string). 
""" - # set the INDS env variable (used by runAthena but also for EventIndex production jobs) + # set the INDS env variable + # (used by runAthena but also for EventIndex production jobs) set_inds(job.datasetin) # realDatasetsIn if userjob: # Try to download the trf (skip when user container is to be used) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) + # Correct for multi-core if necessary (especially important in + # case coreCount=1 to limit parallel make) cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: # Add Database commands if they are set by the local site @@ -437,19 +509,19 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): #if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") # logger.warning('overwrote job.transformation, now set to: %s' % job.transformation) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) - # only if not using a user container + # correct for multi-core if necessary (especially important in case + # coreCount=1 to limit parallel make), only if not using a user container if not job.imagename: cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: @@ -471,7 +543,8 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): def add_athena_proc_number(cmd): """ - Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to the payload command if necessary. + Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to + the payload command if necessary. :param cmd: payload execution command (string). :return: updated payload execution command (string). 
@@ -480,13 +553,13 @@ def add_athena_proc_number(cmd): # get the values if they exist try: value1 = int(os.environ['ATHENA_PROC_NUMBER_JOB']) - except Exception as e: - logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s', exc) value1 = None try: value2 = int(os.environ['ATHENA_CORE_NUMBER']) - except Exception as e: - logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s', exc) value2 = None if "ATHENA_PROC_NUMBER" not in cmd: @@ -496,9 +569,13 @@ def add_athena_proc_number(cmd): if value1 > 1: cmd = 'export ATHENA_PROC_NUMBER=%d;' % value1 + cmd else: - logger.info("will not add ATHENA_PROC_NUMBER to cmd since the value is %s" % str(value1)) + logger.info(( + "will not add ATHENA_PROC_NUMBER to cmd " + "since the value is %s", str(value1))) else: - logger.warning("don't know how to set ATHENA_PROC_NUMBER (could not find it in os.environ)") + logger.warning(( + "don't know how to set ATHENA_PROC_NUMBER " + "(could not find it in os.environ)")) else: logger.info("ATHENA_PROC_NUMBER already in job command") @@ -506,9 +583,13 @@ def add_athena_proc_number(cmd): if value2 > 1: cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd else: - logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s" % str(value2)) + logger.info(( + "will not add ATHENA_CORE_NUMBER to cmd since the " + "value is %s", str(value2))) else: - logger.warning('there is no ATHENA_CORE_NUMBER in os.environ (cannot add it to payload command)') + logger.warning(( + 'there is no ATHENA_CORE_NUMBER in os.environ ' + '(cannot add it to payload command)')) return cmd @@ -534,7 +615,8 @@ def verify_release_string(release): def add_makeflags(job_core_count, cmd): """ - Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make). + Correct for multi-core if necessary (especially important in + case coreCount=1 to limit parallel make). :param job_core_count: core count from the job definition (int). :param cmd: payload execution command (string). @@ -544,16 +626,18 @@ def add_makeflags(job_core_count, cmd): # ATHENA_PROC_NUMBER is set in Node.py using the schedconfig value try: core_count = int(os.environ.get('ATHENA_PROC_NUMBER')) - except Exception: + except (TypeError, ValueError): core_count = -1 + if core_count == -1: try: core_count = int(job_core_count) - except Exception: + except (TypeError, ValueError): pass else: if core_count >= 1: - # Note: the original request (AF) was to use j%d and not -j%d, now using the latter + # Note: the original request (AF) was to use j%d + # and not -j%d, now using the latter cmd += "export MAKEFLAGS=\'-j%d QUICK=1 -l1\';" % (core_count) # make sure that MAKEFLAGS is always set @@ -567,10 +651,12 @@ def get_analysis_run_command(job, trf_name): """ Return the proper run command for the user job. - Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn + Example output: + export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. + :param trf_name: name of the transform that will run the job (string). + Used when containers are not used. :return: command (string). 
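add_athena_proc_number() and add_makeflags() above now catch only (TypeError, ValueError) around the int() conversions instead of a bare Exception. A minimal sketch of the same pattern as a small helper (the helper name is invented for illustration):

    import os

    def env_int(name, default=-1):
        """Return the environment variable as an int, or default if unset or non-numeric."""
        try:
            return int(os.environ.get(name))
        except (TypeError, ValueError):   # None -> TypeError, 'abc' -> ValueError
            return default

    core_count = env_int('ATHENA_PROC_NUMBER')
    if core_count >= 1:
        print("export MAKEFLAGS='-j%d QUICK=1 -l1';" % core_count)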
""" @@ -579,7 +665,8 @@ def get_analysis_run_command(job, trf_name): # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) # check if the input files are to be accessed locally (ie if prodDBlockToken is set to local) - #if job.is_local(): ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + #if job.is_local(): # logger.debug('switched off direct access for local prodDBlockToken') # use_direct_access = False # use_pfc_turl = False @@ -601,12 +688,12 @@ def get_analysis_run_command(job, trf_name): # check if image is on disk as defined by envar PAYLOAD_CONTAINER_LOCATION payload_container_location = os.environ.get('PAYLOAD_CONTAINER_LOCATION') if payload_container_location is not None: - logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s" % payload_container_location) + logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s", payload_container_location) # get container name containername = imagename.rsplit('/')[-1] image_location = os.path.join(payload_container_location, containername) if os.path.exists(image_location): - logger.debug("image exists at %s" % image_location) + logger.debug("image exists at %s", image_location) imagename = image_location # restore the image name if necessary @@ -621,15 +708,19 @@ def get_analysis_run_command(job, trf_name): # cmd += ' --directIn' if job.has_remoteio(): - logger.debug('direct access (remoteio) is used to access some input files: --usePFCTurl and --directIn will be added to payload command') + logger.debug(( + 'direct access (remoteio) is used to access some input files: ' + '--usePFCTurl and --directIn will be added to payload command')) if '--usePFCTurl' not in cmd: cmd += ' --usePFCTurl' if '--directIn' not in cmd: cmd += ' --directIn' # update the payload command for forced accessmode - ## -- REDUNDANT logic, since it should be done from the beginning at the step of FileSpec initialization (anisyonk) - #cmd = update_forced_accessmode(log, cmd, job.transfertype, job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) + ## -- REDUNDANT logic, since it should be done from the beginning at + ## the step of FileSpec initialization (anisyonk) + #cmd = update_forced_accessmode(log, cmd, job.transfertype, + # job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) # add guids when needed # get the correct guids list (with only the direct access files) @@ -644,16 +735,19 @@ def get_analysis_run_command(job, trf_name): return cmd -## SHOULD NOT BE USED since payload cmd should be properly generated from the beginning (consider final directio settings) (anisyonk) -def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## DEPRECATE ME (anisyonk) +## SHOULD NOT BE USED since payload cmd should be properly generated +## from the beginning (consider final directio settings) (anisyonk) +## DEPRECATE ME (anisyonk) +def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): """ Update the payload command for forced accessmode. - accessmode is an option that comes from HammerCloud and is used to force a certain input file access mode; i.e. - copy-to-scratch or direct access. + accessmode is an option that comes from HammerCloud and is used to + force a certain input file access mode; i.e. copy-to-scratch or direct access. :param log: logging object. :param cmd: payload command. 
- :param transfertype: transfer type (.e.g 'direct') from the job definition with priority over accessmode (string). + :param transfertype: transfer type (.e.g 'direct') from the job + definition with priority over accessmode (string). :param jobparams: job parameters (string). :param trf_name: transformation name (string). :return: updated payload command string. @@ -669,7 +763,7 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D for _mode in list(_accessmode_dic.keys()): # Python 2/3 if _mode in jobparams: # any accessmode set in jobPars should overrule schedconfig - logger.info("enforcing %s" % _accessmode_dic[_mode][0]) + logger.info("enforcing %s", _accessmode_dic[_mode][0]) if _mode == "--accessmode=copy": # make sure direct access is turned off accessmode_usect = True @@ -709,7 +803,8 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D cmd = cmd.replace("./%s" % trf_name, "export X509_USER_PROXY=%s;./%s" % (os.environ.get('X509_USER_PROXY'), trf_name)) - # if both direct access and the accessmode loop added a directIn switch, remove the first one from the string + # if both direct access and the accessmode loop added a + # directIn switch, remove the first one from the string if cmd.count("directIn") > 1: cmd = cmd.replace(' --directIn', ' ', 1) @@ -721,8 +816,10 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): Extract the correct guid from the input file list. The guids list is used for direct reading. 1. extract input file list for direct reading from job parameters - 2. for each input file in this list, find the corresponding guid from the input file guid list - Since the job parameters string is entered by a human, the order of the input files might not be the same. + 2. for each input file in this list, find the corresponding guid from + the input file guid list. + Since the job parameters string is entered by a human, the order of + the input files might not be the same. :param jobparams: job parameters. :param infiles: input file list. @@ -750,22 +847,23 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): tail = match.group(3) body = match.group(2).split(',') attr = match.group(4).split(',') - for idx in range(len(body)): - lfn = '%s%s%s%s' % (head, body[idx], tail, attr[idx]) + + for idx, item in enumerate(body): + lfn = '%s%s%s%s' % (head, item, tail, attr[idx]) infiles.append(lfn) else: infiles = [compactinfiles] - if _infiles != []: - for infile in _infiles: - # get the corresponding index from the inputFiles list, which has the same order as infilesguids - try: - index = infiles.index(infile) - except Exception as e: - logger.warning("exception caught: %s (direct reading will fail)" % e) - else: - # add the corresponding guid to the list - guidlist.append(infilesguids[index]) + for infile in _infiles: + # get the corresponding index from the inputFiles list, + # which has the same order as infilesguids + try: + index = infiles.index(infile) + except ValueError as exc: + logger.warning("exception caught: %s (direct reading will fail)", exc) + else: + # add the corresponding guid to the list + guidlist.append(infilesguids[index]) return guidlist @@ -775,7 +873,8 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) Return information about desired file transfer. :param job: job object - :return: use copy tool (boolean), use direct access (boolean), use PFC Turl (boolean). 
+ :return: use copy tool (boolean), use direct access (boolean), + use PFC Turl (boolean). """ use_copy_tool = True @@ -783,10 +882,14 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) use_pfc_turl = False # check with schedconfig - if (job.infosys.queuedata.direct_access_lan or job.infosys.queuedata.direct_access_wan or job.transfertype == 'direct') and not job.is_build_job(): + is_lan = job.infosys.queuedata.direct_access_lan + is_wan = job.infosys.queuedata.direct_access_wan + if not job.is_build_job() and (is_lan or is_wan or job.transfertype == 'direct'): # override if all input files are copy-to-scratch if job.only_copy_to_scratch(): - logger.info('all input files are copy-to-scratch (--usePFCTurl and --directIn will not be set)') + logger.info(( + 'all input files are copy-to-scratch ' + '(--usePFCTurl and --directIn will not be set)')) else: logger.debug('--usePFCTurl and --directIn will be set') use_copy_tool = False @@ -799,17 +902,19 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) def update_job_data(job): """ This function can be used to update/add data to the job object. - E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information - is extracted from the metadata field and added to other job object fields. + E.g. user specific information can be extracted from other job object fields. + In the case of ATLAS, information is extracted from the metadata field and + added to other job object fields. :param job: job object :return: """ ## comment from Alexey: - ## it would be better to reallocate this logic (as well as parse metadata values)directly to Job object - ## since in general it's Job related part - ## later on once we introduce VO specific Job class (inherited from JobData) this can be easily customized + ## it would be better to reallocate this logic (as well as parse + ## metadata values)directly to Job object since in general it's Job + ## related part. 
Later on once we introduce VO specific Job class + ## (inherited from JobData) this can be easily customized # get label "all" or "log" stageout = get_stageout_label(job) @@ -817,7 +922,7 @@ def update_job_data(job): if 'exeErrorDiag' in job.metadata: job.exeerrordiag = job.metadata['exeErrorDiag'] if job.exeerrordiag: - logger.warning('payload failed: exeErrorDiag=%s' % job.exeerrordiag) + logger.warning('payload failed: exeErrorDiag=%s', job.exeerrordiag) # determine what should be staged out job.stageout = stageout # output and log file or only log file @@ -825,37 +930,47 @@ def update_job_data(job): work_attributes = None try: work_attributes = parse_jobreport_data(job.metadata) - except Exception as e: - logger.warning('failed to parse job report (cannot set job.nevents): %s' % e) + except Exception as exc: + logger.warning('failed to parse job report (cannot set job.nevents): %s', exc) else: - # note: the number of events can be set already at this point if the value was extracted from the job report - # (a more thorough search for this value is done later unless it was set here) + # note: the number of events can be set already at this point + # if the value was extracted from the job report (a more thorough + # search for this value is done later unless it was set here) nevents = work_attributes.get('nEvents', 0) if nevents: job.nevents = nevents - # extract output files from the job report if required, in case the trf has created additional (overflow) files - # also make sure all guids are assigned (use job report value if present, otherwise generate the guid) + # extract output files from the job report if required, in case the trf + # has created additional (overflow) files. Also make sure all guids are + # assigned (use job report value if present, otherwise generate the guid) if job.metadata and not job.is_eventservice: - extract_output_file_guids(job) # keep this for now, complicated to merge with verify_output_files? + # keep this for now, complicated to merge with verify_output_files? + extract_output_file_guids(job) try: verify_output_files(job) - except Exception as e: - logger.warning('exception caught while trying verify output files: %s' % e) + except Exception as exc: + logger.warning('exception caught while trying verify output files: %s', exc) else: if not job.allownooutput: # i.e. 
if it's an empty list/string, do nothing - logger.debug("will not try to extract output files from jobReport for user job (and allowNoOut list is empty)") + logger.debug(( + "will not try to extract output files from jobReport " + "for user job (and allowNoOut list is empty)")) else: # remove the files listed in allowNoOutput if they don't exist remove_no_output_files(job) ## validate output data (to be moved into the JobData) - ## warning: do no execute this code unless guid lookup in job report has failed - pilot should only generate guids + ## warning: do no execute this code unless guid lookup in job report + # has failed - pilot should only generate guids ## if they are not present in job report for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning( + 'guid not set: generated guid=%s for lfn=%s', + dat.guid, + dat.lfn + ) def get_stageout_label(job): @@ -878,7 +993,7 @@ def get_stageout_label(job): if job.exeerrorcode == 0: stageout = "all" else: - logger.info('payload failed: exeErrorCode=%d' % job.exeerrorcode) + logger.info('payload failed: exeErrorCode=%d', job.exeerrorcode) stageout = "log" return stageout @@ -894,11 +1009,13 @@ def update_output_for_hpo(job): try: new_outdata = discover_new_outdata(job) - except Exception as e: - logger.warning('exception caught while discovering new outdata: %s' % e) + except Exception as exc: + logger.warning('exception caught while discovering new outdata: %s', exc) else: if new_outdata: - logger.info('replacing job outdata with discovered output (%d file(s))' % len(new_outdata)) + logger.info(( + 'replacing job outdata with discovered output ' + '(%d file(s))', len(new_outdata))) job.outdata = new_outdata @@ -918,12 +1035,22 @@ def discover_new_outdata(job): if new_output: # create new FileSpec objects out of the new output for outfile in new_output: - # note: guid will be taken from job report after this function has been called - files = [{'scope': outdata_file.scope, 'lfn': outfile, 'workdir': job.workdir, - 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, - 'ddmendpoint_alt': None, 'filesize': new_output[outfile]['filesize'], - 'checksum': new_output[outfile]['checksum'], 'guid': ''}] - # do not abbreviate the following two lines as otherwise the content of xfiles will be a list of generator objects + # note: guid will be taken from job report + # after this function has been called + files = [{ + 'scope': outdata_file.scope, + 'lfn': outfile, + 'workdir': job.workdir, + 'dataset': outdata_file.dataset, + 'ddmendpoint': outdata_file.ddmendpoint, + 'ddmendpoint_alt': None, + 'filesize': new_output[outfile]['filesize'], + 'checksum': new_output[outfile]['checksum'], + 'guid': '' + }] + + # do not abbreviate the following two lines as otherwise + # the content of xfiles will be a list of generator objects _xfiles = [FileSpec(type='output', **f) for f in files] new_outdata += _xfiles @@ -958,29 +1085,43 @@ def discover_new_output(name_pattern, workdir): if filesize and checksum: new_output[lfn] = {'path': path, 'filesize': filesize, 'checksum': checksum} else: - logger.warning('failed to create file info (filesize=%d, checksum=%s) for lfn=%s' % - (filesize, checksum, lfn)) + logger.warning( + 'failed to create file info (filesize=%d, checksum=%s) for lfn=%s', + filesize, + checksum, + lfn + ) + return new_output def extract_output_file_guids(job): """ - Extract output file info from the job report and make 
sure all guids are assigned (use job report value if present, - otherwise generate the guid - note: guid generation is done later, not in this function since this function - might not be called if metadata info is not found prior to the call). + Extract output file info from the job report and make sure all guids\ + are assigned (use job report value if present, otherwise generate the guid.\ + Note: guid generation is done later, not in this function since + this function might not be called if metadata info is not found prior + to the call). :param job: job object. :return: """ - # make sure there is a defined output file list in the job report - unless it is allowed by task parameter allowNoOutput + # make sure there is a defined output file list in the job report - + # unless it is allowed by task parameter allowNoOutput if not job.allownooutput: output = job.metadata.get('files', {}).get('output', []) if output: - logger.info('verified that job report contains metadata for %d file(s)' % len(output)) + logger.info(( + 'verified that job report contains metadata ' + 'for %d file(s)', len(output))) else: - logger.warning('job report contains no output files and allowNoOutput is not set') #- will fail job since allowNoOutput is not set') - #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOOUTPUTINJOBREPORT) + #- will fail job since allowNoOutput is not set') + logger.warning(( + 'job report contains no output ' + 'files and allowNoOutput is not set')) + #job.piloterrorcodes, job.piloterrordiags = + # errors.add_error_code(errors.NOOUTPUTINJOBREPORT) return # extract info from metadata (job report JSON) @@ -991,20 +1132,27 @@ def extract_output_file_guids(job): lfn = fdat['name'] # verify the guid if the lfn is known - # only extra guid if the file is known by the job definition (March 18 change, v 2.5.2) + # only extra guid if the file is known by the + # job definition (March 18 change, v 2.5.2) if lfn in data: data[lfn].guid = fdat['file_guid'] - logger.info('set guid=%s for lfn=%s (value taken from job report)' % (data[lfn].guid, lfn)) + logger.info(( + 'set guid=%s for lfn=%s ' + '(value taken from job report)', data[lfn].guid, lfn)) else: # found new entry - logger.warning('pilot no longer considers output files not mentioned in job definition (lfn=%s)' % lfn) + logger.warning(( + 'pilot no longer considers output files not mentioned ' + 'in job definition (lfn=%s)', lfn)) continue #if job.outdata: # kw = {'lfn': lfn, - # 'scope': job.outdata[0].scope, ## take value from 1st output file? + # . # take value from 1st output file? + # 'scope': job.outdata[0].scope, # 'guid': fdat['file_guid'], # 'filesize': fdat['file_size'], - # 'dataset': dat.get('dataset') or job.outdata[0].dataset ## take value from 1st output file? + # # take value from 1st output file? 
+ # 'dataset': dat.get('dataset') or job.outdata[0].dataset # } # spec = FileSpec(filetype='output', **kw) # extra.append(spec) @@ -1013,25 +1161,28 @@ def extract_output_file_guids(job): for fspec in job.outdata: if fspec.guid != data[fspec.lfn].guid: fspec.guid = data[fspec.lfn].guid - logger.debug('reset guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('reset guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: if fspec.guid: - logger.debug('verified guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('verified guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: - logger.warning('guid not set for lfn=%s' % fspec.lfn) + logger.warning('guid not set for lfn=%s', fspec.lfn) #if extra: - #logger.info('found extra output files in job report, will overwrite output file list: extra=%s' % extra) + #logger.info('found extra output files in job report, + # will overwrite output file list: extra=%s' % extra) #job.outdata = extra def verify_output_files(job): """ - Make sure that the known output files from the job definition are listed in the job report and number of processed events - is greater than zero. If the output file is not listed in the job report, then if the file is listed in allowNoOutput - remove it from stage-out, otherwise fail the job. + Make sure that the known output files from the job definition are listed + in the job report and number of processed events is greater than zero. + If the output file is not listed in the job report, then if the file is + listed in allowNoOutput remove it from stage-out, otherwise fail the job. - Note from Rod: fail scenario: The output file is not in output:[] or is there with zero events. Then if allownooutput is not - set - fail the job. If it is set, then do not store the output, and finish ok. + Note from Rod: fail scenario: The output file is not in output:[] or is + there with zero events. Then if allownooutput is not set - fail the job. + If it is set, then do not store the output, and finish ok. :param job: job object. :return: Boolean (and potentially updated job.outdata list) @@ -1048,38 +1199,50 @@ def verify_output_files(job): return True # get list of output files from job report - # (if None is returned, it means the job report is from an old release and does not contain an output list) + # (if None is returned, it means the job report is from an old release + # and does not contain an output list) output = job.metadata.get('files', {}).get('output', None) if not output and output is not None: # ie empty list, output=[] - are all known output files in allowNoOutput? 
- logger.warning('encountered an empty output file list in job report, consulting allowNoOutput list') + logger.warning(( + 'encountered an empty output file list in job report, ' + 'consulting allowNoOutput list')) failed = False for lfn in lfns_jobdef: if lfn not in job.allownooutput: if job.is_analysis(): - logger.warning('lfn %s is not in allowNoOutput list - ignore for user job' % lfn) + logger.warning(( + 'lfn %s is not in allowNoOutput list - ' + 'ignore for user job', + lfn + )) else: failed = True - logger.warning('lfn %s is not in allowNoOutput list - job will fail' % lfn) + logger.warning( + 'lfn %s is not in allowNoOutput list - job will fail', + lfn + ) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) break else: - logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out' % lfn) + logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out', lfn) remove_from_stageout(lfn, job) elif output is None: # ie job report is ancient / output could not be extracted - logger.warning('output file list could not be extracted from job report (nothing to verify)') + logger.warning(( + 'output file list could not be extracted from job report ' + '(nothing to verify)')) else: verified, nevents = verify_extracted_output_files(output, lfns_jobdef, job) - failed = True if not verified else False + failed = (not verified) if nevents > 0 and not failed and job.nevents == 0: job.nevents = nevents - logger.info('number of events from summed up output files: %d' % nevents) + logger.info('number of events from summed up output files: %d', nevents) else: - logger.info('number of events previously set to %d' % job.nevents) + logger.info('number of events previously set to %d', job.nevents) - status = True if not failed else False + status = (not failed) if status: logger.info('output file verification succeeded') @@ -1103,7 +1266,9 @@ def verify_extracted_output_files(output, lfns_jobdef, job): failed = False nevents = 0 output_jobrep = {} # {lfn: nentries, ..} - logger.debug('extracted output file list from job report - make sure all known output files are listed') + logger.debug(( + 'extracted output file list from job report - ' + 'make sure all known output files are listed')) # first collect the output files from the job report for dat in output: @@ -1118,45 +1283,68 @@ def verify_extracted_output_files(output, lfns_jobdef, job): for lfn in lfns_jobdef: if lfn not in output_jobrep and lfn not in job.allownooutput: if job.is_analysis(): - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput', lfn)) else: - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput - job will fail' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput - ' + 'job will fail', lfn)) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break + if lfn not in output_jobrep and lfn in job.allownooutput: - logger.warning( - 'output file %s from job definition is not present in job report but is listed in allowNoOutput - remove from stage-out' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report but is listed in 
allowNoOutput - ' + 'remove from stage-out', lfn)) remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": - logger.warning('encountered file with nentries=UNDEFINED - will ignore %s' % lfn) - continue - elif nentries is None and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, but has no events and is not listed in allowNoOutput - will ignore' % lfn) - continue - elif nentries is None and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, nentries is None and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) - elif type(nentries) is int and nentries == 0 and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is not listed in allowNoOutput - will ignore' % lfn) - elif type(nentries) is int and nentries == 0 and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) + logger.warning(( + 'encountered file with nentries=UNDEFINED - ' + 'will ignore %s', lfn)) + + elif nentries is None: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'but has no events and is not listed in ' + 'allowNoOutput - will ignore', lfn)) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'nentries is None and is listed in allowNoOutput - ' + 'remove from stage-out', lfn)) + remove_from_stageout(lfn, job) + + elif nentries == 0: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is not listed in ' + 'allowNoOutput - will ignore', lfn)) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is listed in allowNoOutput - ' + 'remove from stage-out', lfn)) + remove_from_stageout(lfn, job) + elif type(nentries) is int and nentries: - logger.info('output file %s has %d event(s)' % (lfn, nentries)) + logger.info('output file %s has %d event(s)', lfn, nentries) nevents += nentries else: # should not reach this step - logger.warning('case not handled for output file %s with %s event(s) (ignore)' % (lfn, str(nentries))) + logger.warning(( + 'case not handled for output file %s with %s event(s) ' + '(ignore)', lfn, str(nentries))) - status = False if failed else True + status = (not failed) return status, nevents @@ -1172,7 +1360,7 @@ def remove_from_stageout(lfn, job): outdata = [] for fspec in job.outdata: if fspec.lfn == lfn: - logger.info('removing %s from stage-out list' % lfn) + logger.info('removing %s from stage-out list', lfn) else: outdata.append(fspec) job.outdata = outdata @@ -1180,7 +1368,8 @@ def remove_from_stageout(lfn, job): def remove_no_output_files(job): """ - Remove files from output file list if they are listed in allowNoOutput and do not exist. + Remove files from output file list if they are listed in + allowNoOutput and do not exist. :param job: job object. 
:return: @@ -1194,15 +1383,22 @@ def remove_no_output_files(job): if filename in job.allownooutput: if os.path.exists(path): - logger.info("file %s is listed in allowNoOutput but exists (will not be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput but exists " + "(will not be removed from list of files to be " + "staged-out)", filename)) _outfiles.append(filename) else: - logger.info("file %s is listed in allowNoOutput and does not exist (will be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput and does not exist " + "(will be removed from list of files to be staged-out)", filename)) else: if os.path.exists(path): - logger.info("file %s is not listed in allowNoOutput (will be staged-out)" % filename) + logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) else: - logger.warning("file %s is not listed in allowNoOutput and does not exist (job will fail)" % filename) + logger.warning(( + "file %s is not listed in allowNoOutput and " + "does not exist (job will fail)", filename)) _outfiles.append(filename) # now remove the unwanted fspecs @@ -1223,12 +1419,15 @@ def get_outfiles_records(subfiles): """ res = {} - for f in subfiles: - res[f['name']] = {'guid': f['file_guid'], - 'size': f['file_size']} - nentries = f.get('nentries', 'UNDEFINED') + for subfile in subfiles: + res[subfile['name']] = { + 'guid': subfile['file_guid'], + 'size': subfile['file_size'] + } + + nentries = subfile.get('nentries', 'UNDEFINED') if type(nentries) == int: - res[f['name']]['nentries'] = nentries + res[subfile['name']]['nentries'] = nentries else: logger.warning("nentries is undefined in job report") @@ -1241,14 +1440,15 @@ def get(self, path, dst_dict, dst_key): if len(keys) == 0: return last_key = keys.pop() - v = self + me_ = self for key in keys: - if key in v and isinstance(v[key], dict): - v = v[key] - else: + if not (key in me_ and isinstance(me_[key], dict)): return - if last_key in v: - dst_dict[dst_key] = v[last_key] + + me_ = me_[key] + + if last_key in me_: + dst_dict[dst_key] = me_[last_key] def parse_jobreport_data(job_report): @@ -1271,25 +1471,25 @@ def parse_jobreport_data(job_report): work_attributes["outputfiles"] = [] if "ATHENA_PROC_NUMBER" in os.environ: - logger.debug("ATHENA_PROC_NUMBER: {0}".format(os.environ["ATHENA_PROC_NUMBER"])) + logger.debug("ATHENA_PROC_NUMBER: %s", os.environ["ATHENA_PROC_NUMBER"]) work_attributes['core_count'] = int(os.environ["ATHENA_PROC_NUMBER"]) core_count = int(os.environ["ATHENA_PROC_NUMBER"]) - dq = DictQuery(job_report) - dq.get("resource/transform/processedEvents", work_attributes, "nEvents") - dq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") - dq.get("resource/machine/node", work_attributes, "node") - dq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") - dq.get("resource/dbTimeTotal", work_attributes, "dbTime") - dq.get("resource/dbDataTotal", work_attributes, "dbData") - dq.get("exitCode", work_attributes, "transExitCode") - dq.get("exitMsg", work_attributes, "exeErrorDiag") - dq.get("files/input", work_attributes, "inputfiles") - dq.get("files/output", work_attributes, "outputfiles") + dictq = DictQuery(job_report) + dictq.get("resource/transform/processedEvents", work_attributes, "nEvents") + dictq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") + dictq.get("resource/machine/node", work_attributes, "node") + 
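parse_jobreport_data() pulls values out of the nested job report with the DictQuery helper shown above, addressing them by a slash-separated path. A condensed, runnable version of that lookup on a toy report (the report content is invented):

    class DictQuery(dict):
        """Slash-path lookup that copies a value into a destination dict if present."""
        def get(self, path, dst_dict, dst_key):
            keys = path.split("/")
            last_key = keys.pop()
            node = self
            for key in keys:
                if not (key in node and isinstance(node[key], dict)):
                    return
                node = node[key]
            if last_key in node:
                dst_dict[dst_key] = node[last_key]

    report = DictQuery({'resource': {'transform': {'processedEvents': 1000}}})
    work_attributes = {}
    report.get("resource/transform/processedEvents", work_attributes, "nEvents")
    print(work_attributes)  # {'nEvents': 1000}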
dictq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") + dictq.get("resource/dbTimeTotal", work_attributes, "dbTime") + dictq.get("resource/dbDataTotal", work_attributes, "dbData") + dictq.get("exitCode", work_attributes, "transExitCode") + dictq.get("exitMsg", work_attributes, "exeErrorDiag") + dictq.get("files/input", work_attributes, "inputfiles") + dictq.get("files/output", work_attributes, "outputfiles") outputfiles_dict = {} - for of in work_attributes['outputfiles']: - outputfiles_dict.update(get_outfiles_records(of['subFiles'])) + for opf in work_attributes['outputfiles']: + outputfiles_dict.update(get_outfiles_records(opf['subFiles'])) work_attributes['outputfiles'] = outputfiles_dict if work_attributes['inputfiles']: @@ -1302,20 +1502,36 @@ def parse_jobreport_data(job_report): if 'resource' in job_report and 'executor' in job_report['resource']: j = job_report['resource']['executor'] + + # Original version exc_report = [] fin_report = defaultdict(int) + try: _tmplist = filter(lambda d: 'memory' in d and ('Max' or 'Avg' in d['memory']), j.itervalues()) # Python 2 except Exception: _tmplist = [d for d in iter(list(j.values())) if 'memory' in d and ('Max' or 'Avg' in d['memory'])] # Python 3 - for v in _tmplist: - if 'Avg' in v['memory']: - exc_report.extend(list(v['memory']['Avg'].items())) # Python 2/3 - if 'Max' in v['memory']: - exc_report.extend(list(v['memory']['Max'].items())) # Python 2/3 - for x in exc_report: - fin_report[x[0]] += x[1] + + for item in _tmplist: + if 'Avg' in item['memory']: + exc_report.extend(list(item['memory']['Avg'].items())) # Python 2/3 + if 'Max' in item['memory']: + exc_report.extend(list(item['memory']['Max'].items())) # Python 2/3 + + for item in exc_report: + fin_report[item[0]] += item[1] + + # Proposed version + # fin_report_brinick = defaultdict(int) + # for value in j.values(): + # mem = value.get('memory') + # for key in ('Avg', 'Max'): + # for subk, subv in mem.get(key, {}).items(): + # fin_report_brinick[subk] += subv + # logger.debug("Original code yields fin_report: %s", fin_report) + # logger.debug("Proposed code yields fin_report: %s", fin_report_brinick) + work_attributes.update(fin_report) workdir_size = get_workdir_size() @@ -1325,8 +1541,8 @@ def parse_jobreport_data(job_report): work_attributes["dbTime"], work_attributes["dbData"], workdir_size) - del(work_attributes["dbData"]) - del(work_attributes["dbTime"]) + del work_attributes["dbData"] + del work_attributes["dbTime"] return work_attributes @@ -1337,9 +1553,9 @@ def get_workdir_size(): :return: """ - c, o, e = execute('du -s', shell=True) - if o is not None: - return o.split()[0] + _, stdout, _ = execute('du -s', shell=True) + if stdout is not None: + return stdout.split()[0] return None @@ -1366,42 +1582,6 @@ def get_executor_dictionary(jobreport_dictionary): return executor_dictionary -def get_number_of_events_deprecated(jobreport_dictionary): # TODO: remove this function - """ - Extract the number of events from the job report. - - :param jobreport_dictionary: - :return: - """ - - nevents = {} # FORMAT: { format : total_events, .. 
} - nmax = 0 - - executor_dictionary = get_executor_dictionary(jobreport_dictionary) - if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'nevents' in executor_dictionary[format]: - if format in nevents: - nevents[format] += executor_dictionary[format]['nevents'] - else: - nevents[format] = executor_dictionary[format]['nevents'] - else: - logger.warning("format %s has no such key: nevents" % (format)) - - # Now find the largest number of events among the different formats - if nevents != {}: - try: - nmax = max(nevents.values()) - except Exception as e: - logger.warning("exception caught: %s" % (e)) - nmax = 0 - else: - logger.warning("did not find the number of events in the job report") - nmax = 0 - - return nmax - - def get_resimevents(jobreport_dictionary): """ Extract and add up the resimevents from the job report. @@ -1415,11 +1595,11 @@ def get_resimevents(jobreport_dictionary): executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "ReSim", Python 2/3 - if 'resimevents' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "ReSim", Python 2/3 + if 'resimevents' in executor_dictionary[fmt]: try: - resimevents = int(executor_dictionary[format]['resimevents']) - except Exception: + resimevents = int(executor_dictionary[fmt]['resimevents']) + except (KeyError, ValueError, TypeError): pass else: break @@ -1431,8 +1611,9 @@ def get_db_info(jobreport_dictionary): """ Extract and add up the DB info from the job report. This information is reported with the jobMetrics. - Note: this function adds up the different dbData and dbTime's in the different executor steps. In modern job - reports this might have been done already by the transform and stored in dbDataTotal and dbTimeTotal. + Note: this function adds up the different dbData and dbTime's in + the different executor steps. In modern job reports this might have + been done already by the transform and stored in dbDataTotal and dbTimeTotal. :param jobreport_dictionary: job report dictionary. 
:return: db_time (int), db_data (long) @@ -1441,26 +1622,26 @@ def get_db_info(jobreport_dictionary): db_time = 0 try: db_data = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: db_data = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'dbData' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + if 'dbData' in executor_dictionary[fmt]: try: - db_data += executor_dictionary[format]['dbData'] + db_data += executor_dictionary[fmt]['dbData'] except Exception: pass else: - logger.warning("format %s has no such key: dbData" % format) - if 'dbTime' in executor_dictionary[format]: + logger.warning("format %s has no such key: dbData", fmt) + if 'dbTime' in executor_dictionary[fmt]: try: - db_time += executor_dictionary[format]['dbTime'] + db_time += executor_dictionary[fmt]['dbTime'] except Exception: pass else: - logger.warning("format %s has no such key: dbTime" % format) + logger.warning("format %s has no such key: dbTime", fmt) return db_time, db_data @@ -1477,17 +1658,16 @@ def get_db_info_str(db_time, db_data): try: zero = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: zero = 0 # Python 3 + db_data_s = "" if db_data != zero: db_data_s = "%s" % (db_data) - else: - db_data_s = "" + + db_time_s = "" if db_time != 0: db_time_s = "%.2f" % (db_time) - else: - db_time_s = "" return db_time_s, db_data_s @@ -1500,24 +1680,24 @@ def get_cpu_times(jobreport_dictionary): Note: this function is used with Event Service jobs :param jobreport_dictionary: - :return: cpu_conversion_unit (unit), total_cpu_time, conversion_factor (output consistent with set_time_consumed()) + :return: cpu_conversion_unit (unit), total_cpu_time, + conversion_factor (output consistent with set_time_consumed()) """ try: total_cpu_time = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: total_cpu_time = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'cpuTime' in executor_dictionary[format]: - try: - total_cpu_time += executor_dictionary[format]['cpuTime'] - except Exception: - pass - else: - logger.warning("format %s has no such key: cpuTime" % (format)) + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + try: + total_cpu_time += executor_dictionary[fmt]['cpuTime'] + except KeyError: + logger.warning("format %s has no such key: cpuTime", fmt) + except Exception: + pass conversion_factor = 1.0 cpu_conversion_unit = "s" @@ -1546,15 +1726,15 @@ def cleanup_looping_payload(workdir): :return: """ - for (p, d, f) in os.walk(workdir): - for filename in f: + for (root, _, files) in os.walk(workdir): + for filename in files: if 'pool.root' in filename: - path = os.path.join(p, filename) + path = os.path.join(root, filename) path = os.path.abspath(path) remove(path) -def cleanup_payload(workdir, outputfiles=[], removecores=True): +def cleanup_payload(workdir, outputfiles=None, removecores=True): """ Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. Also remove core dumps. 
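The outputfiles=[] to outputfiles=None signature change in cleanup_payload() above (and later in remove_redundant_files()) is the usual Python guard against mutable default arguments, paired with the "if outputfiles is None: outputfiles = []" check inside the function. A minimal standalone sketch of the pitfall being avoided — not Pilot code, just the generic behaviour:

    def risky(files=[]):        # the same list object is reused on every call
        files.append('x')
        return files

    def safe(files=None):       # a fresh list is created per call instead
        if files is None:
            files = []
        files.append('x')
        return files

    print(risky())   # ['x']
    print(risky())   # ['x', 'x']   state leaks between calls
    print(safe())    # ['x']
    print(safe())    # ['x']        independent lists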
@@ -1565,26 +1745,31 @@ def cleanup_payload(workdir, outputfiles=[], removecores=True): :return: """ + if outputfiles is None: + outputfiles = [] + if removecores: remove_core_dumps(workdir) for ampdir in glob('%s/athenaMP-workers-*' % workdir): - for (p, d, f) in os.walk(ampdir): - for filename in f: - if ('core' in filename and removecores) or 'pool.root' in filename or 'tmp.' in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) + for (root, _, files) in os.walk(ampdir): + for filename in files: + path = os.path.abspath(os.path.join(root, filename)) + + if ('core' in filename and removecores) or \ + 'pool.root' in filename or \ + 'tmp.' in filename: remove(path) + for outfile in outputfiles: if outfile in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) remove(path) def get_redundant_path(): """ - Return the path to the file containing the redundant files and directories to be removed prior to log file creation. + Return the path to the file containing the redundant files + and directories to be removed prior to log file creation. :return: file path (string). """ @@ -1601,20 +1786,26 @@ def get_redundant_path(): def get_redundants(): """ Get list of redundant files and directories (to be removed). - The function will return the content of an external file. It that can't be read, then a list defined in this - function will be returned instead. Any updates to the external file must be propagated to this function. + The function will return the content of an external file. It that + can't be read, then a list defined in this function will be returned instead. + Any updates to the external file must be propagated to this function. :return: files and directories list """ # try to read the list from the external file filename = get_redundant_path() - if os.path.exists(filename) and False: # do not use the cvmfs file since it is not being updated - dir_list = read_list(filename) - if dir_list: - return dir_list - logger.debug('list of redundant files could not be read from external file: %s (will use internal list)' % filename) + # do not use the cvmfs file since it is not being updated + # If you uncomment this block, need to also uncomment the read_list import + # if os.path.exists(filename) and False: + # dir_list = read_list(filename) + # if dir_list: + # return dir_list + + logger.debug(( + 'list of redundant files could not be read from external file: %s ' + '(will use internal list)', filename)) # else return the following dir_list = ["AtlasProduction*", @@ -1683,7 +1874,8 @@ def get_redundants(): def remove_archives(workdir): """ - Explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command + Explicitly remove any soft linked archives (.a files) since + they will be dereferenced by the tar command (--dereference option). 
:param workdir: working directory (string) @@ -1691,15 +1883,15 @@ def remove_archives(workdir): """ matches = [] - for root, dirnames, filenames in os.walk(workdir): + for root, _, filenames in os.walk(workdir): for filename in fnmatch.filter(filenames, '*.a'): matches.append(os.path.join(root, filename)) - for root, dirnames, filenames in os.walk(os.path.dirname(workdir)): + for root, _, filenames in os.walk(os.path.dirname(workdir)): for filename in fnmatch.filter(filenames, 'EventService_premerge_*.tar'): matches.append(os.path.join(root, filename)) - if matches != []: - for f in matches: - remove(f) + + for match in matches: + remove(match) def cleanup_broken_links(workdir): @@ -1711,28 +1903,26 @@ def cleanup_broken_links(workdir): """ broken = [] - for root, dirs, files in os.walk(workdir): + for root, _, files in os.walk(workdir): for filename in files: path = os.path.join(root, filename) - if os.path.islink(path): - target_path = os.readlink(path) - # Resolve relative symlinks - if not os.path.isabs(target_path): - target_path = os.path.join(os.path.dirname(path), target_path) - if not os.path.exists(target_path): - broken.append(path) - else: - # If it's not a symlink we're not interested. + if not os.path.islink(path): continue - if broken: - for p in broken: - remove(p) + target_path = os.readlink(path) + # Resolve relative symlinks + if not os.path.isabs(target_path): + target_path = os.path.join(os.path.dirname(path), target_path) + if not os.path.exists(target_path): + broken.append(path) + + for brok in broken: + remove(brok) -def ls(workdir): +def list_work_dir(workdir): cmd = 'ls -lF %s' % workdir - ec, stdout, stderr = execute(cmd) + _, stdout, stderr = execute(cmd) logger.debug('%s:\n' % stdout + stderr) @@ -1752,33 +1942,32 @@ def remove_special_files(workdir, dir_list, outputfiles): to_delete = [] for _dir in dir_list: files = glob(os.path.join(workdir, _dir)) + if not files: + continue + exclude = [] + for exc in exceptions_list: + for item in files: + if exc in item: + exclude.append(os.path.abspath(item)) - if files: - for exc in exceptions_list: - for f in files: - if exc in f: - exclude.append(os.path.abspath(f)) - _files = [] - for f in files: - if f not in exclude: - _files.append(os.path.abspath(f)) - to_delete += _files + _files = [os.path.abspath(item) for item in files if item not in exclude] + to_delete += _files exclude_files = [] - for of in outputfiles: - exclude_files.append(os.path.join(workdir, of)) - - for f in to_delete: - if f not in exclude_files: - logger.debug('removing %s' % f) - if os.path.isfile(f): - remove(f) + for opf in outputfiles: + exclude_files.append(os.path.join(workdir, opf)) + + for item in to_delete: + if item not in exclude_files: + logger.debug('removing %s', item) + if os.path.isfile(item): + remove(item) else: - remove_dir_tree(f) + remove_dir_tree(item) -def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=False): +def remove_redundant_files(workdir, outputfiles=None, islooping=False, debugmode=False): """ Remove redundant files and directories prior to creating the log file. @@ -1786,28 +1975,32 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F :param workdir: working directory (string). :param outputfiles: list of protected output files (list). - :param islooping: looping job variable to make sure workDir is not removed in case of looping (boolean). + :param islooping: looping job variable to make sure workDir + is not removed in case of looping (boolean). 
:param debugmode: True if debug mode has been switched on (Boolean). :return: """ + if outputfiles is None: + outputfiles = [] + logger.debug("removing redundant files prior to log creation") workdir = os.path.abspath(workdir) - ls(workdir) + list_work_dir(workdir) # get list of redundant files and directories (to be removed) dir_list = get_redundants() # remove core and pool.root files from AthenaMP sub directories + logger.debug('cleaning up payload') try: - logger.debug('cleaning up payload') cleanup_payload(workdir, outputfiles, removecores=not debugmode) - except Exception as e: - logger.warning("failed to execute cleanup_payload(): %s" % e) + except OSError as exc: + logger.warning("failed to execute cleanup_payload(): %s", exc) - # explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command - # (--dereference option) + # explicitly remove any soft linked archives (.a files) + # since they will be dereferenced by the tar command (--dereference option) logger.debug('removing archives') remove_archives(workdir) @@ -1824,7 +2017,7 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F # remove at least root files from workDir (ie also in the case of looping job) cleanup_looping_payload(path) if not islooping: - logger.debug('removing \'workDir\' from workdir=%s' % workdir) + logger.debug('removing \'workDir\' from workdir=%s', workdir) remove_dir_tree(path) # remove additional dirs @@ -1832,10 +2025,10 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F for additional in additionals: path = os.path.join(workdir, additional) if os.path.exists(path): - logger.debug('removing \'%s\' from workdir=%s' % (additional, workdir)) + logger.debug('removing \'%s\' from workdir=%s', additional, workdir) remove_dir_tree(path) - ls(workdir) + list_work_dir(workdir) def download_command(process, workdir): @@ -1854,9 +2047,9 @@ def download_command(process, workdir): # download the command if necessary if cmd.startswith('http'): # Try to download the trf (skip when user container is to be used) - ec, diagnostics, cmd = get_analysis_trf(cmd, workdir) - if ec != 0: - logger.warning('cannot execute command due to previous error: %s' % cmd) + exitcode, _, cmd = get_analysis_trf(cmd, workdir) + if exitcode != 0: + logger.warning('cannot execute command due to previous error: %s', cmd) return {} # update the preprocess command (the URL should be stripped) @@ -1867,15 +2060,26 @@ def download_command(process, workdir): def get_utility_commands(order=None, job=None): """ - Return a dictionary of utility commands and arguments to be executed in parallel with the payload. - This could e.g. be memory and network monitor commands. A separate function can be used to determine the - corresponding command setups using the utility command name. - If the optional order parameter is set, the function should return the list of corresponding commands. - E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the - payload. If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started - should be returned. If order=UTILITY_WITH_STAGEIN, the commands that should be executed parallel with stage-in will - be returned. 
+ Return a dictionary of utility commands and arguments to be executed + in parallel with the payload. This could e.g. be memory and network + monitor commands. A separate function can be used to determine the + corresponding command setups using the utility command name. If the + optional order parameter is set, the function should return the list + of corresponding commands. + + For example: + + If order=UTILITY_BEFORE_PAYLOAD, the function should return all + commands that are to be executed before the payload. + + If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be + prepended to the payload execution string. + + If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be + executed after the payload has been started should be returned. + + If order=UTILITY_WITH_STAGEIN, the commands that should be executed + parallel with stage-in will be returned. FORMAT: {'command': , 'args': , 'label': } @@ -1886,18 +2090,38 @@ def get_utility_commands(order=None, job=None): if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') - elif order == UTILITY_WITH_PAYLOAD: + + if order == UTILITY_WITH_PAYLOAD: return {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} - elif order == UTILITY_AFTER_PAYLOAD_STARTED: + + if order == UTILITY_AFTER_PAYLOAD_STARTED: return get_utility_after_payload_started() - elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: + + if order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') - elif order == UTILITY_AFTER_PAYLOAD_FINISHED: - return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) - elif order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: + + if order == UTILITY_AFTER_PAYLOAD_FINISHED: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_kill', + xcache_deactivation_command, + ) + + if order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') - elif order == UTILITY_BEFORE_STAGEIN: - return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) + + if order == UTILITY_BEFORE_STAGEIN: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_start', + xcache_activation_command, + ) + + return None def get_precopostprocess_command(process, workdir, label): @@ -1973,16 +2197,16 @@ def post_prestagein_utility_command(**kwargs): stdout = kwargs.get('output', None) if stdout: - logger.debug('processing stdout for label=%s' % label) + logger.debug('processing stdout for label=%s', label) xcache_proxy(stdout) else: - logger.warning('no output for label=%s' % label) + logger.warning('no output for label=%s', label) alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') if alrb_xcache_files: cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' - exit_code, _stdout, _stderr = execute(cmd) - logger.debug('cmd=%s:\n\n%s\n\n' % (cmd, _stdout)) + _, _stdout, _ = execute(cmd) + logger.debug('cmd=%s:\n\n%s\n\n', cmd, _stdout) def xcache_proxy(output): @@ -1996,16 +2220,31 @@ def xcache_proxy(output): # loop over each line in the xcache stdout and identify the needed environmental variables for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: - remote = 
'REMOTE' in line - name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + suffix = '_REMOTE' if 'REMOTE' in line else '' + name = 'ALRB_XCACHE_PROXY%s' % suffix + pattern = r'\ export\ ALRB_XCACHE_PROXY%s\=\"(.+)\"' % suffix set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: - set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_MYPROCESS', + pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)' + ) + elif 'Messages logged in' in line: - set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_LOG', + pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' + ) + elif 'ALRB_XCACHE_FILES' in line: - set_xcache_var(line, name='ALRB_XCACHE_FILES', pattern=r'\ ALRB_XCACHE_FILES\=(.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_FILES', + pattern=r'\ ALRB_XCACHE_FILES\=(.+)' + ) def set_xcache_var(line, name='', pattern=''): @@ -2028,7 +2267,8 @@ def xcache_activation_command(workdir='', jobid=''): """ Return the xcache service activation command. - Note: the workdir is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). + Note: the workdir is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). :param workdir: unused work directory - do not remove (string). :param jobid: PanDA job id to guarantee that xcache process is unique (int). @@ -2036,13 +2276,17 @@ def xcache_activation_command(workdir='', jobid=''): """ # a successful startup will set ALRB_XCACHE_PROXY and ALRB_XCACHE_PROXY_REMOTE - # so any file access with root://... should be replaced with one of the above - # (depending on whether you are on the same machine or not) + # so any file access with root://... should be replaced with one of + # the above (depending on whether you are on the same machine or not) # example: # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 command = "%s " % get_asetup(asetup=False) - # add 'xcache list' which will also kill any orphaned processes lingering in the system - command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid + + # add 'xcache list' which will also kill any + # orphaned processes lingering in the system + command += ( + "lsetup xcache; xcache list; " + "xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid) return {'command': command, 'args': ''} @@ -2053,7 +2297,8 @@ def xcache_deactivation_command(workdir='', jobid=''): This service should be stopped after the payload has finished. Copy the messages log before shutting down. - Note: the job id is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). + Note: the job id is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). :param workdir: payload work directory (string). :param jobid: unused job id - do not remove (string). 
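The refactored xcache_proxy() above only builds the regex patterns and names; the actual extraction is delegated to set_xcache_var(), whose body is not part of these hunks. Assuming that helper simply stores the first capture group in the environment, the mechanism can be sketched as follows (illustrative only, the real helper may differ in detail):

    import os
    import re

    def set_xcache_var_sketch(line, name='', pattern=''):
        # store the first regex match from the xcache stdout line in the environment
        found = re.findall(pattern, line)
        if found:
            os.environ[name] = found[0]

    line = ' export ALRB_XCACHE_PROXY="http://localhost:48123//"'
    set_xcache_var_sketch(line, name='ALRB_XCACHE_PROXY',
                          pattern=r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"')
    print(os.environ.get('ALRB_XCACHE_PROXY'))  # http://localhost:48123//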
@@ -2062,17 +2307,17 @@ def xcache_deactivation_command(workdir='', jobid=''): path = os.environ.get('ALRB_XCACHE_LOG', None) if path and os.path.exists(path): - logger.debug('copying xcache messages log file (%s) to work dir (%s)' % (path, workdir)) + logger.debug('copying xcache messages log file (%s) to work dir (%s)', path, workdir) dest = os.path.join(workdir, 'xcache-messages.log') try: copy(path, dest) - except Exception as e: - logger.warning('exception caught copying xcache log: %s' % e) + except Exception as exc: + logger.warning('exception caught copying xcache log: %s', exc) else: if not path: logger.warning('ALRB_XCACHE_LOG is not set') if path and not os.path.exists(path): - logger.warning('path does not exist: %s' % path) + logger.warning('path does not exist: %s', path) command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 @@ -2091,11 +2336,23 @@ def get_utility_command_setup(name, job, setup=None): """ if name == 'MemoryMonitor': - # must know if payload is running in a container or not (enables search for pid in ps output) + # must know if payload is running in a container or not + # (enables search for pid in ps output) use_container = job.usecontainer or 'runcontainer' in job.transformation - dump_ps = True if "PRMON_DEBUG" in job.infosys.queuedata.catchall else False - setup, pid = get_memory_monitor_setup(job.pid, job.pgrp, job.jobid, job.workdir, job.command, use_container=use_container, - transformation=job.transformation, outdata=job.outdata, dump_ps=dump_ps) + dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) + + setup, pid = get_memory_monitor_setup( + job.pid, + job.pgrp, + job.jobid, + job.workdir, + job.command, + use_container=use_container, + transformation=job.transformation, + outdata=job.outdata, + dump_ps=dump_ps + ) + _pattern = r"([\S]+)\ ." pattern = re.compile(_pattern) _name = re.findall(pattern, setup.split(';')[-1]) @@ -2105,21 +2362,24 @@ def get_utility_command_setup(name, job, setup=None): logger.warning('trf name could not be identified in setup string') # update the pgrp if the pid changed - if job.pid != pid and pid != --1: - logger.debug('updating pgrp=%d for pid=%d' % (job.pgrp, pid)) + if pid not in (job.pid, -1): + logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) try: job.pgrp = os.getpgid(pid) - except Exception as e: - logger.warning('os.getpgid(%d) failed with: %s' % (pid, e)) + except Exception as exc: + logger.warning('os.getpgid(%d) failed with: %s', pid, exc) return setup - elif name == 'NetworkMonitor' and setup: + + if name == 'NetworkMonitor' and setup: return get_network_monitor_setup(setup, job) - elif name == 'Prefetcher': + + if name == 'Prefetcher': return get_prefetcher_setup(job) - elif name == 'Benchmark': + + if name == 'Benchmark': return get_benchmark_setup(job) - else: - return "" + + return "" def get_utility_command_execution_order(name): @@ -2133,12 +2393,13 @@ def get_utility_command_execution_order(name): # example implementation if name == 'NetworkMonitor': return UTILITY_WITH_PAYLOAD - elif name == 'MemoryMonitor': - return UTILITY_AFTER_PAYLOAD_STARTED - else: - logger.warning('unknown utility name: %s' % name) + + if name == 'MemoryMonitor': return UTILITY_AFTER_PAYLOAD_STARTED + logger.warning('unknown utility name: %s', name) + return UTILITY_AFTER_PAYLOAD_STARTED + def post_utility_command_action(name, job): """ @@ -2193,7 +2454,7 @@ def verify_lfn_length(outdata): :return: error code (int), diagnostics (string). 
""" - ec = 0 + exitcode = 0 diagnostics = "" max_length = 255 @@ -2202,10 +2463,10 @@ def verify_lfn_length(outdata): if len(fspec.lfn) > max_length: diagnostics = "LFN too long (length: %d, must be less than %d characters): %s" % \ (len(fspec.lfn), max_length, fspec.lfn) - ec = errors.LFNTOOLONG + exitcode = errors.LFNTOOLONG break - return ec, diagnostics + return exitcode, diagnostics def verify_ncores(corecount): @@ -2227,25 +2488,30 @@ def verify_ncores(corecount): except Exception: athena_proc_number = None - # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not overwrite it - # Otherwise, set it to the value of job.coreCount - # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, otherwise use ATHENA_PROC_NUMBER directly; - # ATHENA_PROC_NUMBER_JOB will always be the value from the job definition) + # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not + # overwrite it. Otherwise, set it to the value of job.coreCount + # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, + # otherwise use ATHENA_PROC_NUMBER directly; ATHENA_PROC_NUMBER_JOB + # will always be the value from the job definition) if athena_proc_number: - logger.info("encountered a set ATHENA_PROC_NUMBER (%d), will not overwrite it" % athena_proc_number) + logger.info(( + "encountered a set ATHENA_PROC_NUMBER (%d), " + "will not overwrite it", athena_proc_number)) logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') - os.environ['ATHENA_CORE_NUMBER'] = "%s" % athena_proc_number + os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: - os.environ['ATHENA_PROC_NUMBER_JOB'] = "%s" % corecount - os.environ['ATHENA_CORE_NUMBER'] = "%s" % corecount - logger.info("set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s (ATHENA_PROC_NUMBER will not be overwritten)" % corecount) + os.environ['ATHENA_PROC_NUMBER_JOB'] = str(corecount) + os.environ['ATHENA_CORE_NUMBER'] = str(corecount) + logger.info(( + "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " + "(ATHENA_PROC_NUMBER will not be overwritten)", corecount)) def verify_job(job): """ Verify job parameters for specific errors. Note: - in case of problem, the function should set the corresponding pilot error code using + in case of problem, the function should set the corresponding pilot error code using: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) :param job: job object @@ -2255,11 +2521,11 @@ def verify_job(job): status = False # are LFNs of correct lengths? - ec, diagnostics = verify_lfn_length(job.outdata) - if ec != 0: + exitcode, diagnostics = verify_lfn_length(job.outdata) + if exitcode != 0: logger.fatal(diagnostics) job.piloterrordiag = diagnostics - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) else: status = True @@ -2292,7 +2558,7 @@ def get_metadata(workdir): path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None - logger.debug('metadata=%s' % str(metadata)) + logger.debug('metadata=%s', str(metadata)) return metadata @@ -2304,12 +2570,7 @@ def should_update_logstash(frequency=10): :param frequency: :return: return True once per 'frequency' times. 
""" - - from random import randint - if randint(0, frequency - 1) == 0: - return True - else: - return False + return randint(0, frequency - 1) == 0 def update_server(job): @@ -2323,39 +2584,57 @@ def update_server(job): """ # attempt to read memory_monitor_output.txt and convert it to json - if should_update_logstash(): - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) - if os.path.exists(path): - # convert memory monitor text output to json and return the selection (don't store it, log has already been created) - metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) - if metadata_dictionary: - # the output was previously written to file, update the path and tell curl to send it - new_path = update_extension(path=path, extension='json') - #out = read_json(new_path) - #logger.debug('prmon json=\n%s' % out) - # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) - url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' - #cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ - # (str(metadata_dictionary).replace("'", '"'), url) - # curl --connect-timeout 20 --max-time 120 -H "Content-Type: application/json" -X POST --upload-file test.json - # https://pilot.atlas-ml.org - cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST --upload-file %s %s" % (new_path, url) - #cmd = "curl --connect-timeout 20 --max-time 120 -F 'data=@%s' %s" % (new_path, url) - # send metadata to logstash - try: - exit_code, stdout, stderr = execute(cmd, usecontainer=False) - except Exception as e: - logger.warning('exception caught: %s' % e) - else: - logger.debug('sent prmon JSON dictionary to logstash server') - logger.debug('stdout: %s' % stdout) - logger.debug('stderr: %s' % stderr) - else: - logger.warning('no prmon json available - cannot send anything to logstash server') + if not should_update_logstash(): + logger.debug('no need to update logstash for this job') + return + + path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + if not os.path.exists(path): + logger.warning('path does not exist: %s', path) + return + + # convert memory monitor text output to json and return the selection + # (don't store it, log has already been created) + metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) + if metadata_dictionary: + # the output was previously written to file, + # update the path and tell curl to send it + new_path = update_extension(path=path, extension='json') + + #out = read_json(new_path) + #logger.debug('prmon json=\n%s' % out) + # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) + url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' + + # cmd = ( + # "curl --connect-timeout 20 --max-time 120 " + # "-H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ + # (str(metadata_dictionary).replace("'", '"'), url) + #) + + # curl --connect-timeout 20 --max-time 120 -H + # "Content-Type: application/json" -X POST --upload-file test.json + # https://pilot.atlas-ml.org + cmd = ( + "curl --connect-timeout 20 --max-time 120 " + "-H \"Content-Type: application/json\" " + "-X POST " + "--upload-file %s %s" % (new_path, url) + ) + #cmd = "curl --connect-timeout 20 --max-time 120 -F + # 'data=@%s' %s" % (new_path, url) + # send metadata to logstash + try: + _, stdout, stderr = execute(cmd, usecontainer=False) + 
except Exception as exc: + logger.warning('exception caught: %s', exc) else: - logger.warning('path does not exist: %s' % path) + logger.debug('sent prmon JSON dictionary to logstash server') + logger.debug('stdout: %s', stdout) + logger.debug('stderr: %s', stderr) else: - logger.debug('no need to update logstash for this job') + msg = 'no prmon json available - cannot send anything to logstash server' + logger.warning(msg) def preprocess_debug_command(job): @@ -2367,7 +2646,11 @@ def preprocess_debug_command(job): preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + + # Python 3, level: -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) + cmd = resource.get_setup_command(job, preparesetup) if not cmd.endswith(';'): cmd += '; ' @@ -2377,51 +2660,69 @@ def preprocess_debug_command(job): def process_debug_command(debug_command, pandaid): """ - In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. - This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + In debug mode, the server can send a special debug command to the piloti + via the updateJob backchannel. This function can be used to process that + command, i.e. to identify a proper pid to debug (which is unknown to the server). - For gdb, the server might send a command with gdb option --pid %. The pilot need to replace the % with the proper - pid. The default (hardcoded) process will be that of athena.py. The pilot will find the corresponding pid. + For gdb, the server might send a command with gdb option --pid %. + The pilot need to replace the % with the proper pid. The default + (hardcoded) process will be that of athena.py. The pilot will find the + corresponding pid. :param debug_command: debug command (string). :param pandaid: PanDA id (string). :return: updated debug command (string). """ + if '--pid %' not in debug_command: + return debug_command + pandaid_pid = None - if '--pid %' in debug_command: - # replace the % with the pid for athena.py - # note: if athena.py is not yet running, the --pid % will remain. Otherwise the % will be replaced by the pid - # first find the pid (if athena.py is running) - cmd = 'ps axo pid,ppid,pgid,args' - exit_code, stdout, stderr = execute(cmd) - if stdout: - #logger.debug('ps=\n\n%s\n' % stdout) - # convert the ps output to a dictionary - dictionary = convert_ps_to_dict(stdout) - # trim this dictionary to reduce the size (only keep the PID and PPID lists) - trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) - # what is the pid of the trf? - pandaid_pid = find_pid(pandaid, dictionary) - # find all athena processes - pids = find_cmd_pids('athena.py', dictionary) - # which of the found pids are children of the trf? (which has an export PandaID=.. 
attached to it) - for pid in pids: - try: - child = is_child(pid, pandaid_pid, trimmed_dictionary) - except RuntimeError as e: - logger.warning('too many recursions: %s (cannot identify athena process)' % e) - else: - if child: - logger.info('pid=%d is a child process of the trf of this job' % pid) - debug_command = debug_command.replace('--pid %', '--pid %d' % pid) - logger.info('updated debug command: %s' % debug_command) - break - else: - logger.info('pid=%d is not a child process of the trf of this job' % pid) - if not pids or '--pid %' in debug_command: - logger.debug('athena is not yet running (no corresponding pid)') - debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) + + # replace the % with the pid for athena.py + # note: if athena.py is not yet running, the --pid % will remain. + # Otherwise the % will be replaced by the pid first find the pid + # (if athena.py is running) + cmd = 'ps axo pid,ppid,pgid,args' + _, stdout, _ = execute(cmd) + if stdout: + #logger.debug('ps=\n\n%s\n' % stdout) + # convert the ps output to a dictionary + dictionary = convert_ps_to_dict(stdout) + + # trim this dictionary to reduce the size + # (only keep the PID and PPID lists) + trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) + + # what is the pid of the trf? + pandaid_pid = find_pid(pandaid, dictionary) + + # find all athena processes + pids = find_cmd_pids('athena.py', dictionary) + + # which of the found pids are children of the trf? + # (which has an export PandaID=.. attached to it) + for pid in pids: + try: + child = is_child(pid, pandaid_pid, trimmed_dictionary) + except RuntimeError as rte: + logger.warning(( + 'too many recursions: %s ' + '(cannot identify athena process)', rte)) + else: + if child: + logger.info('pid=%d is a child process of the trf of this job', pid) + debug_command = debug_command.replace('--pid %', '--pid %d' % pid) + logger.info('updated debug command: %s', debug_command) + break + logger.info('pid=%d is not a child process of the trf of this job', pid) + + if not pids or '--pid %' in debug_command: + logger.debug('athena is not yet running (no corresponding pid)') + + # reset the command to prevent the payload from being killed + # (will be killed when gdb has run) + debug_command = '' return debug_command From d40f349e46655b1115f3a0a4dcff9a560f4b6724 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 11:04:49 +0200 Subject: [PATCH 77/96] Remove trailing whitespace --- pilot/user/atlas/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d02faaf8..3e26f401 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -16,7 +16,6 @@ import re from random import randint from signal import SIGTERM, SIGUSR1 -from typing import Type # from tarfile import ExFileObject try: @@ -1043,7 +1042,7 @@ def discover_new_outdata(job): 'workdir': job.workdir, 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, - 'ddmendpoint_alt': None, + 'ddmendpoint_alt': None, 'filesize': new_output[outfile]['filesize'], 'checksum': new_output[outfile]['checksum'], 'guid': '' From 11c1a4e4f6b448ced3ee127b6c9538b8e896558f Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 11:07:57 +0200 Subject: [PATCH 78/96] Fix flake8 issues --- pilot/user/atlas/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pilot/user/atlas/common.py 
b/pilot/user/atlas/common.py index 3e26f401..40c6a511 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1755,9 +1755,11 @@ def cleanup_payload(workdir, outputfiles=None, removecores=True): for filename in files: path = os.path.abspath(os.path.join(root, filename)) - if ('core' in filename and removecores) or \ - 'pool.root' in filename or \ - 'tmp.' in filename: + core_file = ('core' in filename and removecores) + pool_root_file = 'pool.root' in filename + tmp_file = 'tmp.' in filename + + if core_file or pool_root_file or tmp_file: remove(path) for outfile in outputfiles: From 401dbfbc357d477d877019e04afe956eb98fa08e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 21 Jun 2021 13:04:57 +0200 Subject: [PATCH 79/96] Avoiding the decode problem with strings. Added protection against UTF-8 failures while parsing stdout --- PILOTVERSION | 2 +- pilot/user/atlas/diagnose.py | 5 ++++- pilot/util/constants.py | 2 +- pilot/util/container.py | 13 ++++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index a14d3a59..d2b75bf7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.46 \ No newline at end of file +2.12.1.49 \ No newline at end of file diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index 9f131c93..7b3dbaae 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -70,7 +70,10 @@ def interpret(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) # interpret the exit info from the payload - interpret_payload_exit_info(job) + try: + interpret_payload_exit_info(job) + except Exception as error: + logger.warning('exception caught while interpreting payload exit info: %s', error) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 667d188b..9f9f2120 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '46' # build number should be reset to '1' for every new development cycle +BUILD = '49' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index 0a6c20b0..c81e5f22 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -91,10 +91,17 @@ def execute(executable, **kwargs): stdout, stderr = process.communicate() exit_code = process.poll() + # this should not be necessary since encoding is set above for Py3 (it is necessary if encoding above is removed) # for Python 3, convert from byte-like object to str - if is_python3(): - stdout = stdout.decode('utf-8') - stderr = stderr.decode('utf-8') + #import sys + #if is_python3(): + # logger.debug('Using python version=%s' % str(sys.version_info)) + # try: + # stdout = stdout.decode('utf-8') + # stderr = stderr.decode('utf-8') + # except Exception as error: + # logger.warning('exception caught: %s (can be ignored)', error) + # remove any added \n if stdout and stdout.endswith('\n'): stdout = stdout[:-1] From 7aa6bd8bdfefd03cdf69b862228c1c8f29c9903d Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 13:31:37 +0200 Subject: [PATCH 80/96] Improve code in jobreport parsing function --- 
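Note: the patch below adds a proposed defaultdict-based replacement for the memory aggregation in parse_jobreport_data() and logs its result next to the original for comparison. On a toy executor dictionary (made-up numbers, for illustration only) the proposed loop reduces to:

    from collections import defaultdict

    # made-up executor section of a job report, only for illustration
    executors = {
        'RAWtoESD': {'memory': {'Avg': {'rss': 100, 'vmem': 200},
                                'Max': {'rss': 150, 'vmem': 250}}},
        'ESDtoAOD': {'memory': {'Avg': {'rss': 50},
                                'Max': {'rss': 80}}},
    }

    fin_report = defaultdict(int)
    for value in executors.values():
        mem = value.get('memory', {})
        for key in ('Avg', 'Max'):
            for subk, subv in mem.get(key, {}).items():
                fin_report[subk] += subv

    print(dict(fin_report))  # {'rss': 380, 'vmem': 450}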
pilot/user/atlas/common.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1a1344de..20beb8af 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1251,7 +1251,7 @@ def get(self, path, dst_dict, dst_key): dst_dict[dst_key] = v[last_key] -def parse_jobreport_data(job_report): +def parse_jobreport_data(job_report): # noqa: C901 """ Parse a job report and extract relevant fields. @@ -1316,6 +1316,19 @@ def parse_jobreport_data(job_report): exc_report.extend(list(v['memory']['Max'].items())) # Python 2/3 for x in exc_report: fin_report[x[0]] += x[1] + + # Proposed version + fin_report_brinick = defaultdict(int) + for value in j.values(): + mem = value.get('memory', {}) + for key in ('Avg', 'Max'): + for subk, subv in mem.get(key, {}).items(): + fin_report_brinick[subk] += subv + + # Compare output from the original and the proposed versions + logger.debug("Original code yields fin_report: %s", fin_report) + logger.debug("Proposed code yields fin_report: %s", fin_report_brinick) + work_attributes.update(fin_report) workdir_size = get_workdir_size() From df87ff5f8d0b19487689950ffc97c84df686f154 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 21 Jun 2021 16:07:18 +0200 Subject: [PATCH 81/96] Version update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d2b75bf7..edc0161f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.49 \ No newline at end of file +2.12.1.50 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9f9f2120..e3d9bebc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '49' # build number should be reset to '1' for every new development cycle +BUILD = '50' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 2020e3d5f9c6453a6b31a72ea982ccd8071a0225 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Tue, 22 Jun 2021 11:50:27 +0200 Subject: [PATCH 82/96] Fix logging calls --- pilot/user/atlas/common.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 6f5254ea..09dd695a 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -369,7 +369,7 @@ def get_payload_command(job): if not os.path.exists(path): logger.warning(( 'base trace report does not exist (%s) - input file ' - 'traces should already have been sent', path)) + 'traces should already have been sent'), path) else: process_remote_file_traces(path, job, not_opened_turls) @@ -570,7 +570,7 @@ def add_athena_proc_number(cmd): else: logger.info(( "will not add ATHENA_PROC_NUMBER to cmd " - "since the value is %s", str(value1))) + "since the value is %s"), str(value1)) else: logger.warning(( "don't know how to set ATHENA_PROC_NUMBER " @@ -1014,7 +1014,7 @@ def update_output_for_hpo(job): if new_outdata: logger.info(( 'replacing job outdata with discovered output ' - '(%d file(s))', len(new_outdata))) + '(%d file(s))'), 
len(new_outdata)) job.outdata = new_outdata @@ -1113,7 +1113,7 @@ def extract_output_file_guids(job): if output: logger.info(( 'verified that job report contains metadata ' - 'for %d file(s)', len(output))) + 'for %d file(s)'), len(output)) else: #- will fail job since allowNoOutput is not set') logger.warning(( @@ -1137,11 +1137,11 @@ def extract_output_file_guids(job): data[lfn].guid = fdat['file_guid'] logger.info(( 'set guid=%s for lfn=%s ' - '(value taken from job report)', data[lfn].guid, lfn)) + '(value taken from job report)'), data[lfn].guid, lfn) else: # found new entry logger.warning(( 'pilot no longer considers output files not mentioned ' - 'in job definition (lfn=%s)', lfn)) + 'in job definition (lfn=%s)'), lfn) continue #if job.outdata: @@ -1212,9 +1212,9 @@ def verify_output_files(job): if job.is_analysis(): logger.warning(( 'lfn %s is not in allowNoOutput list - ' - 'ignore for user job', + 'ignore for user job'), lfn - )) + ) else: failed = True logger.warning( @@ -1284,12 +1284,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): if job.is_analysis(): logger.warning(( 'output file %s from job definition is not present ' - 'in job report and is not listed in allowNoOutput', lfn)) + 'in job report and is not listed in allowNoOutput'), lfn) else: logger.warning(( 'output file %s from job definition is not present ' 'in job report and is not listed in allowNoOutput - ' - 'job will fail', lfn)) + 'job will fail'), lfn) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break @@ -1298,14 +1298,14 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s from job definition is not present ' 'in job report but is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": logger.warning(( 'encountered file with nentries=UNDEFINED - ' - 'will ignore %s', lfn)) + 'will ignore %s'), lfn) elif nentries is None: @@ -1313,12 +1313,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s is listed in job report, ' 'but has no events and is not listed in ' - 'allowNoOutput - will ignore', lfn)) + 'allowNoOutput - will ignore'), lfn) else: logger.warning(( 'output file %s is listed in job report, ' 'nentries is None and is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) elif nentries == 0: @@ -1327,12 +1327,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s is listed in job report, ' 'has zero events and is not listed in ' - 'allowNoOutput - will ignore', lfn)) + 'allowNoOutput - will ignore'), lfn) else: logger.warning(( 'output file %s is listed in job report, ' 'has zero events and is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) elif type(nentries) is int and nentries: @@ -1341,7 +1341,7 @@ def verify_extracted_output_files(output, lfns_jobdef, job): else: # should not reach this step logger.warning(( 'case not handled for output file %s with %s event(s) ' - '(ignore)', lfn, str(nentries))) + '(ignore)'), lfn, str(nentries)) status = (not failed) return status, nevents @@ -1385,19 +1385,19 @@ def remove_no_output_files(job): logger.info(( "file %s is listed in allowNoOutput but exists " "(will not be removed 
from list of files to be " - "staged-out)", filename)) + "staged-out)"), filename) _outfiles.append(filename) else: logger.info(( "file %s is listed in allowNoOutput and does not exist " - "(will be removed from list of files to be staged-out)", filename)) + "(will be removed from list of files to be staged-out)"), filename) else: if os.path.exists(path): logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) else: logger.warning(( "file %s is not listed in allowNoOutput and " - "does not exist (job will fail)", filename)) + "does not exist (job will fail)"), filename) _outfiles.append(filename) # now remove the unwanted fspecs @@ -1784,7 +1784,7 @@ def get_redundants(): logger.debug(( 'list of redundant files could not be read from external file: %s ' - '(will use internal list)', filename)) + '(will use internal list)'), filename) # else return the following dir_list = ["AtlasProduction*", @@ -2475,7 +2475,7 @@ def verify_ncores(corecount): if athena_proc_number: logger.info(( "encountered a set ATHENA_PROC_NUMBER (%d), " - "will not overwrite it", athena_proc_number)) + "will not overwrite it"), athena_proc_number) logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: @@ -2483,7 +2483,7 @@ def verify_ncores(corecount): os.environ['ATHENA_CORE_NUMBER'] = str(corecount) logger.info(( "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " - "(ATHENA_PROC_NUMBER will not be overwritten)", corecount)) + "(ATHENA_PROC_NUMBER will not be overwritten)"), corecount) def verify_job(job): @@ -2688,7 +2688,7 @@ def process_debug_command(debug_command, pandaid): except RuntimeError as rte: logger.warning(( 'too many recursions: %s ' - '(cannot identify athena process)', rte)) + '(cannot identify athena process)'), rte) else: if child: logger.info('pid=%d is a child process of the trf of this job', pid) From 8fc2d5f5dd86015f10ec0d4adb271b150c98f0ea Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 10:27:13 +0200 Subject: [PATCH 83/96] Added handling for new preprocess exit codes. Removed thread name info from log messages. No longer dumping stage-in/out in main log. 
Cleanup and pylint corrections --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 14 ++++-- pilot/util/constants.py | 2 +- pilot/util/container.py | 10 ++--- pilot/util/filehandling.py | 12 ++++-- pilot/util/middleware.py | 71 +++++++++++-------------------- 6 files changed, 53 insertions(+), 58 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index edc0161f..24b55189 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.50 \ No newline at end of file +2.12.1.51 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 747e6acb..8968938c 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -249,6 +249,7 @@ def execute_utility_command(self, cmd, job, label): exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: + ignored_exit_codes = [160, 161, 162] logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code) if label == 'preprocess': err = errors.PREPROCESSFAILURE @@ -256,8 +257,10 @@ def execute_utility_command(self, cmd, job, label): err = errors.POSTPROCESSFAILURE else: err = 0 # ie ignore - if err and exit_code != 160: # ignore no-more-data-points exit code + if err and exit_code not in ignored_exit_codes: # ignore no-more-data-points exit codes job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) + if exit_code in ignored_exit_codes: + job.transexitcode = exit_code # write output to log files self.write_utility_output(job.workdir, label, stdout, stderr) @@ -532,7 +535,11 @@ def run_preprocess(self, job): logger.info("\n\npreprocess execution command:\n\n%s\n", cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: - logger.fatal('no more HP points - time to abort processing loop') + logger.warning('no more HP points - time to abort processing loop') + elif exit_code == 161: + logger.warning('no more HP points but at least one point was processed - time to abort processing loop') + elif exit_code == 162: + logger.warning('loop count reached the limit - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) @@ -568,6 +575,7 @@ def run(self): # noqa: C901 # abort when nothing more to run, or when the preprocess returns a special exit code iteration = 0 while True: + logger.info('payload iteration loop #%d', iteration + 1) os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration show_memory_usage() @@ -577,7 +585,7 @@ def run(self): # noqa: C901 exit_code = self.run_preprocess(self.__job) jobparams_post = self.__job.jobparams if exit_code: - if exit_code == 160: + if exit_code >= 160 and exit_code <= 162: exit_code = 0 # wipe the output file list since there won't be any new files # any output files from previous iterations, should have been transferred already diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e3d9bebc..a13e7af7 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '50' # build number should be reset to 
'1' for every new development cycle +BUILD = '51' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index c81e5f22..89a6d0ed 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import subprocess from os import environ, getcwd, setpgrp #, getpgid #setsid @@ -72,7 +72,7 @@ def execute(executable, **kwargs): secret_key = sub_cmd.split('S3_SECRET_KEY=')[1] secret_key = 'S3_SECRET_KEY=' + secret_key executable_readable = executable_readable.replace(secret_key, 'S3_SECRET_KEY=********') - logger.info('executing command: %s' % executable_readable) + logger.info('executing command: %s', executable_readable) if mode == 'python': exe = ['/usr/bin/python'] + executable.split() @@ -95,7 +95,7 @@ def execute(executable, **kwargs): # for Python 3, convert from byte-like object to str #import sys #if is_python3(): - # logger.debug('Using python version=%s' % str(sys.version_info)) + # logger.debug('Using python version=%s', str(sys.version_info)) # try: # stdout = stdout.decode('utf-8') # stderr = stderr.decode('utf-8') @@ -134,8 +134,8 @@ def containerise_executable(executable, **kwargs): diagnostics = "" try: executable = container.wrapper(executable, **kwargs) - except Exception as e: - diagnostics = 'failed to execute wrapper function: %s' % e + except Exception as exc: + diagnostics = 'failed to execute wrapper function: %s' % exc logger.fatal(diagnostics) else: if executable == "": diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index c3f80c37..caf0beeb 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -952,13 +952,19 @@ def dump(path, cmd="cat"): logger.info("path %s does not exist", path) -def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): +def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog, loglevel=0): """ Setup and establish logging. + Option loglevel can be used to decide which (predetermined) logging format to use. + Example: + loglevel=0: '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' + loglevel=1: 'ts=%(asctime)s level=%(levelname)-8s event=%(name)-32s.%(funcName)-25s msg="%(message)s"' + :param debug: debug mode (Boolean), :param nopilotlog: True when pilot log is not known (Boolean). - :param filename: name of log file. + :param filename: name of log file (string). + :param loglevel: selector for logging level (int). 
:return: """ @@ -968,7 +974,7 @@ def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotl console = logging.StreamHandler(sys.stdout) if debug: - format_str = '%(asctime)s | %(levelname)-8s | %(threadName)-19s | %(name)-32s | %(funcName)-25s | %(message)s' + format_str = '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' level = logging.DEBUG else: format_str = '%(asctime)s | %(levelname)-8s | %(message)s' diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index a9fe101b..decfd0c8 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 from os import environ, path, getcwd #, chmod @@ -47,17 +47,17 @@ def containerise_general_command(job, container_options, label='command', contai raise PilotException try: - logger.info('*** executing %s (logging will be redirected) ***' % label) + logger.info('*** executing %s (logging will be redirected) ***', label) exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) - except Exception as e: - logger.info('*** %s has failed ***' % label) - logger.warning('exception caught: %s' % e) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) else: if exit_code == 0: - logger.info('*** %s has finished ***' % label) + logger.info('*** %s has finished ***', label) else: - logger.info('*** %s has failed ***' % label) - logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, external_dir, @@ -104,30 +104,28 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, except PilotException as e: raise e else: - logger.warning('%s will not be done in a container (but it will be done by a script)' % label) + logger.warning('%s will not be done in a container (but it will be done by a script)', label) try: - logger.info('*** executing %s (logging will be redirected) ***' % label) + logger.info('*** executing %s (logging will be redirected) ***', label) exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) - except Exception as e: - logger.info('*** %s has failed ***' % label) - logger.warning('exception caught: %s' % e) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) else: if exit_code == 0: - logger.info('*** %s has finished ***' % label) + logger.info('*** %s has finished ***', label) else: - logger.info('*** %s has failed ***' % label) - logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) # write stdout+stderr to files try: _stdout_name, _stderr_name = get_logfile_names(label) write_file(path.join(job.workdir, _stdout_name), stdout, mute=False) write_file(path.join(job.workdir, _stderr_name), stderr, mute=False) - logger.debug('stage-in/out stdout=\n%s' % stdout) - logger.debug('stage-in/out stderr=\n%s' % stderr) - except PilotException as e: - msg = 'exception caught: %s' % e + except PilotException as exc: + msg = 'exception caught: %s' % exc if label == 'stage-in': raise 
StageInFailure(msg) else: @@ -136,8 +134,8 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, # handle errors, file statuses, etc (the stage-in/out scripts write errors and file status to a json file) try: handle_updated_job_object(job, xdata, label=label) - except PilotException as e: - raise e + except PilotException as exc: + raise exc def get_script_path(script): @@ -149,8 +147,6 @@ def get_script_path(script): """ srcdir = environ.get('PILOT_SOURCE_DIR', '.') - logger.debug('PILOT_SOURCE_DIR=%s' % srcdir) - _path = path.join(srcdir, 'pilot/scripts') if not path.exists(_path): _path = path.join(srcdir, 'pilot2') @@ -190,8 +186,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext # write file data to file try: status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary) - except Exception as e: - diagnostics = 'exception caught in get_command(): %s' % e + except Exception as exc: + diagnostics = 'exception caught in get_command(): %s' % exc logger.warning(diagnostics) raise PilotException(diagnostics) else: @@ -283,8 +279,8 @@ def handle_updated_job_object(job, xdata, label='stage-in'): fspec.turl = file_dictionary[fspec.lfn][3] fspec.checksum['adler32'] = file_dictionary[fspec.lfn][4] fspec.filesize = file_dictionary[fspec.lfn][5] - except Exception as e: - msg = "exception caught while reading file dictionary: %s" % e + except Exception as exc: + msg = "exception caught while reading file dictionary: %s" % exc logger.warning(msg) if label == 'stage-in': raise StageInFailure(msg) @@ -359,8 +355,8 @@ def get_filedata(data): 'istar': fspec.is_tar, 'accessmode': fspec.accessmode, 'storagetoken': fspec.storage_token} - except Exception as e: - logger.warning('exception caught in get_filedata(): %s' % e) + except Exception as exc: + logger.warning('exception caught in get_filedata(): %s', exc) return file_dictionary @@ -421,19 +417,4 @@ def use_middleware_script(container_type): :return: Boolean (True if middleware should be containerised). """ - # see definition in atlas/container.py, but also see useful code below (in case middleware is available locally) - #:param cmd: middleware command, used to determine if the container should be used or not (string). 
- #usecontainer = False - #if not config.Container.middleware_container: - # logger.info('container usage for middleware is not allowed by pilot config') - #else: - # # if the middleware is available locally, do not use container - # if find_executable(cmd) == "": - # usecontainer = True - # logger.info('command %s is not available locally, will attempt to use container' % cmd) - # else: - # logger.info('command %s is available locally, no need to use container' % cmd) - - # FOR TESTING - #return True if config.Container.middleware_container_stagein_script else False return True if container_type == 'container' or container_type == 'bash' else False From e2346903cc7ce59110e377d2589ea77e59dcbc28 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 10:30:54 +0200 Subject: [PATCH 84/96] Updated build number after merge with Brinick's pylint updates --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 24b55189..2430ed91 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.51 \ No newline at end of file +2.12.1.52 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a13e7af7..54a51713 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '51' # build number should be reset to '1' for every new development cycle +BUILD = '52' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 347b22cd4eb74146008ec93c5b6f99259a504a97 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Wed, 23 Jun 2021 11:35:22 +0200 Subject: [PATCH 85/96] Fix initialisation bug and logging bug self.trace_report was being used before being bound to the instance. logging.getLogger was being called with multiple args. 
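The logging half of this fix deserves a short illustration: logging.getLogger() accepts a single dotted name, so the old printf-style call with three arguments raised a TypeError whenever no logger was passed in, while the companion change simply moves the self.trace_report assignment above the acopytools validation so the trace report is bound before any error path can touch it. A minimal standalone sketch of the corrected null-logger fallback follows; get_null_logger is an illustrative helper, not part of the Pilot API:

    import logging

    def get_null_logger(name=__name__):
        """Return a disabled logger, mirroring the corrected fallback in StagingClient.__init__."""
        # The old call, logging.getLogger('%s.%s', name, 'null'), raises TypeError: getLogger() takes one name.
        logger = logging.getLogger(name + '.null')
        logger.disabled = True
        return logger

    log = get_null_logger()
    log.info('this message is swallowed by the disabled logger')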
--- pilot/api/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index ccdc874d..c6ca6a99 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -69,7 +69,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ super(StagingClient, self).__init__() if not logger: - logger = logging.getLogger('%s.%s', __name__, 'null') + logger = logging.getLogger(__name__ + '.null') logger.disabled = True self.logger = logger @@ -93,6 +93,9 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ if not self.acopytools.get('default'): self.acopytools['default'] = self.get_default_copytools(default_copytools) + # get an initialized trace report (has to be updated for get/put if not defined before) + self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) + if not self.acopytools: msg = 'failed to initilize StagingClient: no acopytools options found, acopytools=%s' % self.acopytools logger.error(msg) @@ -101,9 +104,6 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ raise PilotException("failed to resolve acopytools settings") logger.info('configured copytools per activity: acopytools=%s', self.acopytools) - # get an initialized trace report (has to be updated for get/put if not defined before) - self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) - def set_acopytools(self): """ Set the internal acopytools. From 5c5b62b58098d03079cf194dc3207a9763a521e1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 16:55:03 +0200 Subject: [PATCH 86/96] Merge with Brinick's pylint updates. Number of concurrent remote file open attempts can now be set with catchall --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 17 +---- pilot/scripts/open_remote_file.py | 107 ++++++++++++++++++++++++++++-- pilot/user/atlas/common.py | 34 ++++++++-- pilot/util/auxiliary.py | 24 ++++++- pilot/util/constants.py | 2 +- 6 files changed, 155 insertions(+), 31 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2430ed91..2d3b6bc8 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.52 \ No newline at end of file +2.12.1.54 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 4e6ecbfe..52ea3255 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -30,7 +30,7 @@ from .basedata import BaseData from .filespec import FileSpec -from pilot.util.auxiliary import get_object_size +from pilot.util.auxiliary import get_object_size, get_key_value from pilot.util.constants import LOG_TRANSFER_NOT_DONE from pilot.util.filehandling import get_guid, get_valid_path_from_list from pilot.util.timing import get_elapsed_real_time @@ -201,7 +201,7 @@ def init(self, infosys): # prepend IMAGE_BASE to imagename if necessary (for testing purposes) image_base = os.environ.get('IMAGE_BASE', '') if not image_base and 'IMAGE_BASE' in infosys.queuedata.catchall: - image_base = self.get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') + image_base = get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') if image_base: paths = [os.path.join(image_base, os.path.basename(self.imagename)), os.path.join(image_base, self.imagename)] @@ -211,19 +211,6 @@ def init(self, infosys): #if image_base and not os.path.isabs(self.imagename) and not self.imagename.startswith('docker'): # self.imagename = os.path.join(image_base,
self.imagename) - def get_key_value(self, catchall, key='SOMEKEY'): - """ - Return the value corresponding to key in catchall. - :param catchall: catchall free string. - :param key: key name (string). - :return: value (string). - """ - - # ignore any non-key-value pairs that might be present in the catchall string - s = dict(s.split('=', 1) for s in catchall.split() if '=' in s) - - return s.get(key) - def prepare_infiles(self, data): """ Construct FileSpec objects for input files from raw dict `data` diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 92dec03b..cd3d127b 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -4,15 +4,21 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import argparse import os import logging +import threading +import queue import ROOT +from collections import namedtuple from pilot.util.config import config -from pilot.util.filehandling import establish_logging, write_json +from pilot.util.filehandling import ( + establish_logging, + write_json, +) logger = logging.getLogger(__name__) @@ -31,6 +37,12 @@ def get_args(): action='store_true', default=False, help='Enable debug mode for logging messages') + arg_parser.add_argument('-t', + dest='nthreads', + default=1, + required=False, + type=int, + help='Number of concurrent file open threads') arg_parser.add_argument('-w', dest='workdir', required=False, @@ -50,10 +62,26 @@ def get_args(): def message(msg): + """ + Print message to stdout or to log. + Note: not using lazy formatting. + + :param msg: message (string). + :return: + """ + print(msg) if not logger else logger.info(msg) def get_file_lists(turls): + """ + Return a dictionary with the turls. + Format: {'turls': } + + :param turls: comma separated turls (string) + :return: turls dictionary. + """ + _turls = [] try: @@ -64,7 +92,17 @@ def get_file_lists(turls): return {'turls': _turls} -def try_open_file(turl): +def try_open_file(turl, queues): + """ + Attempt to open a remote file. + Successfully opened turls will be put in the queues.opened queue. Unsuccessful turls will be placed in + the queues.unopened queue. + + :param turl: turl (string). + :param queues: queues collection. + :return: + """ + turl_opened = False try: in_file = ROOT.TFile.Open(turl) @@ -75,7 +113,29 @@ def try_open_file(turl): in_file.Close() turl_opened = True - return turl_opened + queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) + + +def spawn_file_open_thread(queues, file_list): + """ + Spawn a thread for the try_open_file(). + + :param queues: queue collection. + :param file_list: files to open (list). + :return: thread. 
+ """ + + thread = None + try: + turl = file_list.pop(0) + except IndexError: + pass + else: + # create and start thread for the current turl + thread = threading.Thread(target=try_open_file, args=(turl, queues)) + thread.start() + + return thread if __name__ == '__main__': @@ -106,10 +166,43 @@ def try_open_file(turl): file_list_dictionary = get_file_lists(args.turls) turls = file_list_dictionary.get('turls') processed_turls_dictionary = {} + + queues = namedtuple('queues', ['result', 'opened', 'unopened']) + queues.result = queue.Queue() + queues.opened = queue.Queue() + queues.unopened = queue.Queue() + threads = [] + if turls: - message('got TURLs: %s' % str(turls)) - for turl in turls: - processed_turls_dictionary[turl] = try_open_file(turl) + # make N calls to begin with + for index in range(args.nthreads): + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + while turls: + + try: + _ = queues.result.get(block=True) + except Exception as error: + message("caught exception: %s" % error) + + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + # wait until all threads have finished + [thread.join() for thread in threads] + + opened_turls = list(queues.opened.queue) + opened_turls.sort() + unopened_turls = list(queues.unopened.queue) + unopened_turls.sort() + + for turl in opened_turls: + processed_turls_dictionary[turl] = True + for turl in unopened_turls: + processed_turls_dictionary[turl] = False # write dictionary to file with results _status = write_json(os.path.join(args.workdir, config.Pilot.remotefileverification_dictionary), processed_turls_dictionary) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 09dd695a..cd05e799 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -45,7 +45,13 @@ get_metadata_dict_from_txt, ) -from pilot.util.auxiliary import get_resource_name, show_memory_usage, is_python3 +from pilot.util.auxiliary import ( + get_resource_name, + show_memory_usage, + is_python3, + get_key_value, +) + from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException from pilot.util.config import config @@ -152,12 +158,13 @@ def validate(job): return status -def open_remote_files(indata, workdir): +def open_remote_files(indata, workdir, nthreads): """ Verify that direct i/o files can be opened. :param indata: list of FileSpec. :param workdir: working directory (string). + :param nthreads: number of concurrent file open threads (int). :return: exit code (int), diagnostics (string). """ @@ -202,7 +209,7 @@ def open_remote_files(indata, workdir): # correct the path when containers have been used final_script_path = os.path.join('.', script) - _cmd = get_file_open_command(final_script_path, turls) + _cmd = get_file_open_command(final_script_path, turls, nthreads) cmd = create_root_container_command(workdir, _cmd) show_memory_usage() @@ -248,14 +255,16 @@ def open_remote_files(indata, workdir): return exitcode, diagnostics, not_opened -def get_file_open_command(script_path, turls): +def get_file_open_command(script_path, turls, nthreads): """ :param script_path: path to script (string). + :param turls: comma-separated turls (string). + :param nthreads: number of concurrent file open threads (int). :return: comma-separated list of turls (string). 
""" - return "%s --turls=%s -w %s" % (script_path, turls, os.path.dirname(script_path)) + return "%s --turls=%s -w %s -t %s" % (script_path, turls, os.path.dirname(script_path), str(nthreads)) def extract_turls(indata): @@ -317,6 +326,19 @@ def process_remote_file_traces(path, job, not_opened_turls): logger.warning('failed to create trace report for turl=%s', fspec.turl) +def get_nthreads(catchall): + """ + Extract number of concurrent file open threads from catchall. + Return nthreads=1 if nopenfiles=.. is not present in catchall. + + :param catchall: queuedata catchall (string). + :return: number of threads (int). + """ + + _nthreads = get_key_value(catchall, key='nopenfiles') + return _nthreads if _nthreads else 1 + + def get_payload_command(job): """ Return the full command for executing the payload, including the @@ -360,7 +382,7 @@ def get_payload_command(job): diagnostics = "" not_opened_turls = "" try: - exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) + exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir, get_nthreads(catchall)) except PilotException as exc: logger.warning('caught exception: %s', exc) else: diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 23314ae6..f0277273 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -22,9 +22,17 @@ zero_depth_bases = (str, bytes, Number, range, bytearray) # Python 3 iteritems = 'items' +from pilot.util.constants import ( + SUCCESS, + FAILURE, + SERVER_UPDATE_FINAL, + SERVER_UPDATE_NOT_DONE, + SERVER_UPDATE_TROUBLE, + get_pilot_version, +) + from pilot.common.errorcodes import ErrorCodes from pilot.util.container import execute -from pilot.util.constants import SUCCESS, FAILURE, SERVER_UPDATE_FINAL, SERVER_UPDATE_NOT_DONE, SERVER_UPDATE_TROUBLE, get_pilot_version from pilot.util.filehandling import dump import logging @@ -636,3 +644,17 @@ def get_display_info(): product = result[0] return product, vendor + + +def get_key_value(catchall, key='SOMEKEY'): + """ + Return the value corresponding to key in catchall. + :param catchall: catchall free string. + :param key: key name (string). + :return: value (string). 
+ """ + + # ignore any non-key-value pairs that might be present in the catchall string + _dic = dict(_str.split('=', 1) for _str in catchall.split() if '=' in _str) + + return _dic.get(key) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 54a51713..210a974f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '52' # build number should be reset to '1' for every new development cycle +BUILD = '54' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 76d399ad129510b314752d6e4e1ab4a98ade1532 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 16:57:12 +0200 Subject: [PATCH 87/96] Flake8 correction --- pilot/scripts/open_remote_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index cd3d127b..ff0de545 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -192,7 +192,7 @@ def spawn_file_open_thread(queues, file_list): threads.append(thread) # wait until all threads have finished - [thread.join() for thread in threads] + [_thread.join() for _thread in threads] opened_turls = list(queues.opened.queue) opened_turls.sort() From 90125ed08a8c3c3f09698811dc703c155461e93f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 25 Jun 2021 17:21:50 +0200 Subject: [PATCH 88/96] Popen corrections for Python 3 and utf-8 --- pilot/util/container.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pilot/util/container.py b/pilot/util/container.py index 89a6d0ed..f220f6c0 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -42,8 +42,8 @@ def execute(executable, **kwargs): mute = kwargs.get('mute', False) mode = kwargs.get('mode', 'bash') cwd = kwargs.get('cwd', getcwd()) - stdout = kwargs.get('stdout', subprocess.PIPE) - stderr = kwargs.get('stderr', subprocess.PIPE) + stdout_name = kwargs.get('stdout', subprocess.PIPE) + stderr_name = kwargs.get('stderr', subprocess.PIPE) usecontainer = kwargs.get('usecontainer', False) returnproc = kwargs.get('returnproc', False) job = kwargs.get('job') @@ -80,33 +80,33 @@ def execute(executable, **kwargs): exe = ['/bin/bash', '-c', executable] # try: intercept exception such as OSError -> report e.g. 
error.RESOURCEUNAVAILABLE: "Resource temporarily unavailable" - if is_python3(): - process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp, encoding='utf-8') # Python 3 + if is_python3(): # Python 3 + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp, + encoding='utf-8', + errors='replace') else: - process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp) # Python 2 - + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp) if returnproc: return process else: stdout, stderr = process.communicate() exit_code = process.poll() - # this should not be necessary since encoding is set above for Py3 (it is necessary if encoding above is removed) - # for Python 3, convert from byte-like object to str - #import sys - #if is_python3(): - # logger.debug('Using python version=%s', str(sys.version_info)) - # try: - # stdout = stdout.decode('utf-8') - # stderr = stderr.decode('utf-8') - # except Exception as error: - # logger.warning('exception caught: %s (can be ignored)', error) - # remove any added \n if stdout and stdout.endswith('\n'): stdout = stdout[:-1] - return exit_code, stdout, stderr + return exit_code, stdout, stderr def containerise_executable(executable, **kwargs): From fe032cf2db59899da8bee4cf00dfd0755cee7932 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 28 Jun 2021 12:47:04 +0200 Subject: [PATCH 89/96] Add nojekyll file to build docs workflow --- .github/workflows/build-docs.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 351e3fee..fef80173 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -30,13 +30,16 @@ jobs: run: | cd ./doc make github - cd .. 
+ + - name: Add nojekyll file to repo root dir + run: | + touch .nojekyll - name: Push docs to repo run: | git config user.name "brinick" git config user.email "brinick@users.noreply.github.com" - git add docs - git commit -m "Adding documentation" + git add docs .nojekyll + git commit -m "Adding Pilot documentation" git push From 395e0fa3a0a76e3518cfc5f898364c244f4b9435 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 15:53:09 +0200 Subject: [PATCH 90/96] Improved check for singularity errors --- PILOTVERSION | 2 +- pilot/api/data.py | 8 ++++---- pilot/common/errorcodes.py | 8 ++++++-- pilot/control/data.py | 6 +----- pilot/control/payload.py | 38 ++++++++++++++++++++------------------ pilot/util/constants.py | 2 +- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2d3b6bc8..22f7df92 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.54 \ No newline at end of file +2.12.1.58 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index c6ca6a99..bd538531 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -522,10 +522,10 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 caught_errors[-1].get_error_code() == ErrorCodes.MISSINGOUTPUTFILE: raise caught_errors[-1] - remain_files = [f for f in files if f.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [fspec for fspec in files if fspec.status not in ['remote_io', 'transferred', 'no_transfer']] if remain_files: # failed or incomplete transfer - # Propagate message from first error back up + # propagate message from first error back up errmsg = str(caught_errors[0]) if caught_errors else '' if caught_errors and "Cannot authenticate" in str(caught_errors): code = ErrorCodes.STAGEINAUTHENTICATIONFAILURE @@ -1065,13 +1065,13 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not fspec.ddmendpoint: # ensure that output destination is properly set if 'mv' not in self.infosys.queuedata.copytools: - msg = 'No output RSE defined for file=%s' % fspec.lfn + msg = 'no output RSE defined for file=%s' % fspec.lfn self.logger.error(msg) raise PilotException(msg, code=ErrorCodes.NOSTORAGE, state='NO_OUTPUTSTORAGE_DEFINED') pfn = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(kwargs.get('workdir', ''), fspec.lfn) if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK): - msg = "Error: output pfn file does not exist: %s" % pfn + msg = "output pfn file does not exist: %s" % pfn self.logger.error(msg) self.trace_report.update(clientState='MISSINGOUTPUTFILE', stateReason=msg) self.trace_report.send() diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 49233a76..01986479 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -403,8 +403,12 @@ def resolve_transform_error(self, exit_code, stderr): elif exit_code == -1: ec = self.UNKNOWNTRFFAILURE else: - # do not assign a pilot error code for unidentified transform error, return 0 - ec = 0 + # singularity errors can appear even with no exit code set + if "Singularity is not installed" in stderr: + ec = self.SINGULARITYNOTINSTALLED + else: + # do not assign a pilot error code for unidentified transform error, return 0 + ec = 0 return ec diff --git a/pilot/control/data.py b/pilot/control/data.py index 20151b92..79bb33f6 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -1006,11 +1006,7 @@ def queue_monitoring(queues, traces, args): job.stageout = "log" 
set_pilot_state(job=job, state="failed") if not _stage_out_new(job, args): - logger.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, " - "adding job object to failed_jobs queue", job.jobid) - else: - logger.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job " - "object to failed_jobs queue", job.jobid) + logger.info("job %s failed during stage-out", job.jobid) put_in_queue(job, queues.failed_jobs) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index b51063df..03cdc0fb 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -342,11 +342,15 @@ def perform_initial_payload_error_analysis(job, exit_code): :return: """ + # look for singularity errors (the exit code can be zero in this case) + stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) + if stderr: + exit_code = errors.resolve_transform_error(exit_code, stderr) + if exit_code != 0: msg = "" exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) - stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": msg = errors.extract_stderr_error(stderr) if msg == "": @@ -359,8 +363,6 @@ def perform_initial_payload_error_analysis(job, exit_code): logger.warning("extracted message from stderr:\n%s", msg) exit_code = set_error_code_from_stderr(msg, fatal) - if not exit_code: - exit_code = errors.resolve_transform_error(exit_code, stderr) if exit_code != 0: if msg: msg = errors.format_diagnostics(exit_code, msg) @@ -388,22 +390,22 @@ def set_error_code_from_stderr(msg, fatal): :return: error code (int). """ - if "Failed invoking the NEWUSER namespace runtime" in msg: - exit_code = errors.SINGULARITYNEWUSERNAMESPACE - elif "Failed to create user namespace" in msg: - exit_code = errors.SINGULARITYFAILEDUSERNAMESPACE - elif "command not found" in msg: - exit_code = errors.TRANSFORMNOTFOUND - elif "SL5 is unsupported" in msg: - exit_code = errors.UNSUPPORTEDSL5OS - elif "resource temporarily unavailable" in msg: - exit_code = errors.SINGULARITYRESOURCEUNAVAILABLE - elif "unrecognized arguments" in msg: - exit_code = errors.UNRECOGNIZEDTRFARGUMENTS - elif fatal: + exit_code = 0 + error_map = {errors.SINGULARITYNEWUSERNAMESPACE: "Failed invoking the NEWUSER namespace runtime", + errors.SINGULARITYFAILEDUSERNAMESPACE: "Failed to create user namespace", + errors.SINGULARITYRESOURCEUNAVAILABLE: "resource temporarily unavailable", + errors.SINGULARITYNOTINSTALLED: "Singularity is not installed", + errors.TRANSFORMNOTFOUND: "command not found", + errors.UNSUPPORTEDSL5OS: "SL5 is unsupported", + errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments",} + + for key, value in error_map.items(): + if value in msg: + exit_code = key + break + + if fatal and not exit_code: exit_code = errors.UNRECOGNIZEDTRFSTDERR - else: - exit_code = 0 return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 210a974f..7c8499b7 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '54' # build number should be reset to '1' for every new development cycle +BUILD = '58' # build number should be reset to '1' for 
every new development cycle SUCCESS = 0 FAILURE = 1 From 74d382962c69a20ace5066b38fc682252c291849 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 16:56:23 +0200 Subject: [PATCH 91/96] Added missing result queue in remote file open script --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 1 + pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 22f7df92..e7a26e32 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.58 \ No newline at end of file +2.12.1.59 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index ff0de545..b6e739d5 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -114,6 +114,7 @@ def try_open_file(turl, queues): turl_opened = True queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) + queues.result.put(turl) def spawn_file_open_thread(queues, file_list): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 7c8499b7..c241977e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '58' # build number should be reset to '1' for every new development cycle +BUILD = '59' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 514f83e949e4a58b401de4f7c0b0e262935e97fc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 17:53:41 +0200 Subject: [PATCH 92/96] Added log messages --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 5 ++++- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e7a26e32..ca156888 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.59 \ No newline at end of file +2.12.1.60 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index b6e739d5..69eb80b3 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -105,6 +105,7 @@ def try_open_file(turl, queues): turl_opened = False try: + message('opening %s' % turl) in_file = ROOT.TFile.Open(turl) except Exception as error: message('caught exception: %s' % error) @@ -112,7 +113,7 @@ def try_open_file(turl, queues): if in_file and in_file.IsOpen(): in_file.Close() turl_opened = True - + message('closed %s' % turl) queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) queues.result.put(turl) @@ -174,6 +175,8 @@ def spawn_file_open_thread(queues, file_list): queues.unopened = queue.Queue() threads = [] + message('will attempt to open %d file(s) using %d thread(s)' % (len(turls), args.nthreads)) + if turls: # make N calls to begin with for index in range(args.nthreads): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c241977e..5209e811 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new 
version release, increased for small updates -BUILD = '59' # build number should be reset to '1' for every new development cycle +BUILD = '60' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8d8c1428197986462b6bfc30888e8bcae8ebcafc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 12:05:36 +0200 Subject: [PATCH 93/96] Flake8 correction --- pilot/control/payload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 03cdc0fb..6da11b4a 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -397,7 +397,7 @@ def set_error_code_from_stderr(msg, fatal): errors.SINGULARITYNOTINSTALLED: "Singularity is not installed", errors.TRANSFORMNOTFOUND: "command not found", errors.UNSUPPORTEDSL5OS: "SL5 is unsupported", - errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments",} + errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments"} for key, value in error_map.items(): if value in msg: From cce6852ed0f6d1c35d7c4ed9295dbc4529f2765e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:13:55 +0200 Subject: [PATCH 94/96] Fixes and cleanup --- PILOTVERSION | 2 +- pilot/common/errorcodes.py | 7 +++++-- pilot/control/data.py | 12 +++++------ pilot/control/payload.py | 25 ++++++++++++++--------- pilot/control/payloads/generic.py | 33 +++++++++++++------------------ pilot/user/atlas/common.py | 4 +--- pilot/util/constants.py | 2 +- 7 files changed, 44 insertions(+), 41 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ca156888..c3a7e952 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.60 \ No newline at end of file +2.12.1.62 \ No newline at end of file diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 01986479..f6e43197 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -388,6 +388,7 @@ def resolve_transform_error(self, exit_code, stderr): :return: pilot error code (int) """ + ec = 0 if exit_code == 251 and "Not mounting requested bind point" in stderr: ec = self.SINGULARITYBINDPOINTFAILURE elif exit_code == 255 and "No more available loop devices" in stderr: @@ -406,10 +407,12 @@ def resolve_transform_error(self, exit_code, stderr): # singularity errors can appear even with no exit code set if "Singularity is not installed" in stderr: ec = self.SINGULARITYNOTINSTALLED - else: + #else: # do not assign a pilot error code for unidentified transform error, return 0 - ec = 0 + # ec = 0 + if not ec: + ec = exit_code return ec def extract_stderr_error(self, stderr): diff --git a/pilot/control/data.py b/pilot/control/data.py index 79bb33f6..596c3a49 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -475,18 +475,18 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s', _stdout) - logger.debug('[before xcache start] stderr=%s', _stderr) + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before xcache start] stdout=%s', _stdout) + #logger.debug('[before xcache start] stderr=%s', _stderr) _, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s', stdout) logger.debug('stderr=%s', stderr) # xcache debug - 
_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s', _stdout) - logger.debug('[after xcache start] stderr=%s', _stderr) + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after xcache start] stdout=%s', _stdout) + #logger.debug('[after xcache start] stderr=%s', _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 6da11b4a..65ff94c4 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -22,7 +22,7 @@ from pilot.control.payloads import generic, eventservice, eventservicemerge from pilot.control.job import send_state -from pilot.util.auxiliary import set_pilot_state, show_memory_usage +from pilot.util.auxiliary import set_pilot_state from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config from pilot.util.filehandling import read_file, remove_core_dumps, get_guid @@ -230,9 +230,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 break payload_executor = get_payload_executor(args, job, out, err, traces) - logger.info("Got payload executor: %s", payload_executor) - - show_memory_usage() + logger.info("will use payload executor: %s", payload_executor) # run the payload and measure the execution time job.t0 = os.times() @@ -349,7 +347,6 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" - exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) if stderr != "": msg = errors.extract_stderr_error(stderr) @@ -359,10 +356,15 @@ def perform_initial_payload_error_analysis(job, exit_code): fatal = False else: fatal = True - if msg != "": - logger.warning("extracted message from stderr:\n%s", msg) - exit_code = set_error_code_from_stderr(msg, fatal) + #if msg != "": # redundant since resolve_transform_error is used above + # logger.warning("extracted message from stderr:\n%s", msg) + # exit_code = set_error_code_from_stderr(msg, fatal) + + if msg: + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) + ''' if exit_code != 0: if msg: msg = errors.format_diagnostics(exit_code, msg) @@ -376,8 +378,13 @@ def perform_initial_payload_error_analysis(job, exit_code): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) else: logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') + ''' else: - logger.info('main payload execution returned zero exit code, but will check it more carefully') + logger.info('main payload execution returned zero exit code') + + # check if core dumps exist, if so remove them and return True + if remove_core_dumps(job.workdir) and not job.debug: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) def set_error_code_from_stderr(msg, fatal): diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8968938c..df5a852f 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -492,15 +492,12 @@ def get_payload_command(self, job): :return: command (string). 
""" - show_memory_usage() - cmd = "" # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' try: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - show_memory_usage() cmd = user.get_payload_command(job) #+ 'sleep 1000' # to test looping jobs except PilotException as error: self.post_setup(job) @@ -564,8 +561,6 @@ def run(self): # noqa: C901 # get the payload command from the user specific code self.pre_setup(self.__job) - show_memory_usage() - cmd = self.get_payload_command(self.__job) # extract the setup in case the preprocess command needs it self.__job.setup = self.extract_setup(cmd) @@ -601,9 +596,9 @@ def run(self): # noqa: C901 # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s', _stdout) - logger.debug('[before payload start] stderr=%s', _stderr) + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before payload start] stdout=%s', _stdout) + #logger.debug('[before payload start] stderr=%s', _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -651,9 +646,9 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state) - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s', _stdout) - logger.debug('[after payload finish] stderr=%s', _stderr) + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after payload finish] stdout=%s', _stdout) + #logger.debug('[after payload finish] stderr=%s', _stderr) # stop the utility command (e.g. 
a coprocess if necessary if proc_co: @@ -708,18 +703,18 @@ def run_utility_after_payload_finished(self, state, order): logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) # xcache debug - if 'xcache' in cmd_after_payload: - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s', _stdout) - logger.debug('[before xcache kill] stderr=%s', _stderr) + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[before xcache kill] stdout=%s', _stdout) + # logger.debug('[before xcache kill] stderr=%s', _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug - if 'xcache' in cmd_after_payload: - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s', _stdout) - logger.debug('[after xcache kill] stderr=%s', _stderr) + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[after xcache kill] stdout=%s', _stdout) + # logger.debug('[after xcache kill] stderr=%s', _stderr) return exit_code diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index cd05e799..1fb3be4a 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -604,9 +604,7 @@ def add_athena_proc_number(cmd): if value2 > 1: cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd else: - logger.info(( - "will not add ATHENA_CORE_NUMBER to cmd since the " - "value is %s", str(value2))) + logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s", str(value2)) else: logger.warning(( 'there is no ATHENA_CORE_NUMBER in os.environ ' diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5209e811..7bb1caf9 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '60' # build number should be reset to '1' for every new development cycle +BUILD = '62' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 02b7d21a738b3ca16a4390f0a7e70ca898311265 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:17:03 +0200 Subject: [PATCH 95/96] Update --- pilot/common/errorcodes.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index f6e43197..68192b14 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -397,19 +397,15 @@ def resolve_transform_error(self, exit_code, stderr): ec = self.SINGULARITYIMAGEMOUNTFAILURE elif exit_code == 255 and "Operation not permitted" in stderr: ec = self.SINGULARITYGENERALFAILURE - elif exit_code == 64 and "Singularity is not installed" in stderr: + elif "Singularity is not installed" in stderr: # exit code should be 64 but not always? 
ec = self.SINGULARITYNOTINSTALLED elif exit_code == 64 and "cannot create directory" in stderr: ec = self.MKDIR elif exit_code == -1: ec = self.UNKNOWNTRFFAILURE - else: - # singularity errors can appear even with no exit code set - if "Singularity is not installed" in stderr: - ec = self.SINGULARITYNOTINSTALLED - #else: - # do not assign a pilot error code for unidentified transform error, return 0 - # ec = 0 + #else: + # do not assign a pilot error code for unidentified transform error, return 0 + # ec = 0 if not ec: ec = exit_code From 90b8a6160490220f9ccbffdfd7ea46d693fe87de Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:19:07 +0200 Subject: [PATCH 96/96] Flake8 correction --- pilot/control/payload.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 65ff94c4..c5641c70 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -353,9 +353,9 @@ def perform_initial_payload_error_analysis(job, exit_code): if msg == "": # look for warning messages instead (might not be fatal so do not set UNRECOGNIZEDTRFSTDERR) msg = errors.extract_stderr_warning(stderr) - fatal = False - else: - fatal = True + # fatal = False + #else: + # fatal = True #if msg != "": # redundant since resolve_transform_error is used above # logger.warning("extracted message from stderr:\n%s", msg) # exit_code = set_error_code_from_stderr(msg, fatal)
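A note on the remote-file-open changes collected above (patches 86, 91 and 92): together they implement a small fixed-width thread pool in open_remote_file.py, where a namedtuple holds the result/opened/unopened queues, nthreads workers are started up front, and a new thread is only spawned once the result queue delivers a completion token. The sketch below reproduces that control flow in isolation; try_open and spawn are illustrative stand-ins for try_open_file and spawn_file_open_thread, and a plain filename check replaces ROOT.TFile.Open so the example runs without ROOT.

    import threading
    import queue
    from collections import namedtuple

    def try_open(turl, queues):
        """Stand-in worker: 'open' the file and report the outcome on the shared queues."""
        opened = turl.endswith('.root')  # placeholder for ROOT.TFile.Open() + IsOpen()
        (queues.opened if opened else queues.unopened).put(turl)
        queues.result.put(turl)  # always signal completion (cf. patch 91)

    def spawn(queues, turls):
        """Pop the next turl and start a worker thread for it; return None when the list is empty."""
        try:
            turl = turls.pop(0)
        except IndexError:
            return None
        thread = threading.Thread(target=try_open, args=(turl, queues))
        thread.start()
        return thread

    turls = ['root://site/a.root', 'root://site/b.txt', 'root://site/c.root']
    queues = namedtuple('queues', ['result', 'opened', 'unopened'])
    queues.result, queues.opened, queues.unopened = queue.Queue(), queue.Queue(), queue.Queue()

    nthreads = 2
    threads = [_t for _t in (spawn(queues, turls) for _ in range(nthreads)) if _t]
    while turls:
        queues.result.get(block=True)  # wait for a free slot before opening the next file
        thread = spawn(queues, turls)
        if thread:
            threads.append(thread)
    [_t.join() for _t in threads]
    print('opened: %s, unopened: %s' % (sorted(queues.opened.queue), sorted(queues.unopened.queue)))

The same back-pressure is what get_nthreads() configures through the nopenfiles=N field in the queuedata catchall: the number of simultaneously open remote files never exceeds the requested thread count.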
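A second pattern worth a standalone sketch is the set_error_code_from_stderr() rewrite from patch 90 in pilot/control/payload.py, which replaces an if/elif chain with a single dictionary keyed on pilot error codes. In the sketch below the integer values are invented for illustration only, since the real constants are attributes of pilot.common.errorcodes.ErrorCodes.

    # Hypothetical numeric values; in the Pilot these come from ErrorCodes.
    SINGULARITYNOTINSTALLED = 1650
    TRANSFORMNOTFOUND = 1651
    UNRECOGNIZEDTRFSTDERR = 1652

    ERROR_MAP = {
        SINGULARITYNOTINSTALLED: "Singularity is not installed",
        TRANSFORMNOTFOUND: "command not found",
    }

    def set_error_code_from_stderr(msg, fatal=False):
        """Return the first error code whose marker string appears in msg; fall back only if fatal."""
        for code, marker in ERROR_MAP.items():
            if marker in msg:
                return code
        return UNRECOGNIZEDTRFSTDERR if fatal else 0

    print(set_error_code_from_stderr('sh: athena.py: command not found'))  # -> 1651
    print(set_error_code_from_stderr('some unknown warning', fatal=True))  # -> 1652

Keeping the markers in one mapping also makes it explicit that the lookup stops at the first match, mirroring the break in the patched loop.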