diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 351e3fee2..fef80173b 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -30,13 +30,16 @@ jobs: run: | cd ./doc make github - cd .. + + - name: Add nojekyll file to repo root dir + run: | + touch .nojekyll - name: Push docs to repo run: | git config user.name "brinick" git config user.email "brinick@users.noreply.github.com" - git add docs - git commit -m "Adding documentation" + git add docs .nojekyll + git commit -m "Adding Pilot documentation" git push diff --git a/PILOTVERSION b/PILOTVERSION index 00d0494d7..c3a7e9520 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.2.22 \ No newline at end of file +2.12.1.62 \ No newline at end of file diff --git a/pilot.py b/pilot.py index 1b8ed0931..c3fc0e7c8 100755 --- a/pilot.py +++ b/pilot.py @@ -10,6 +10,7 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 from __future__ import print_function # Python 2 (2to3 complains about this) +from __future__ import absolute_import import argparse import logging @@ -68,7 +69,7 @@ def main(): infosys.init(args.queue) # check if queue is ACTIVE if infosys.queuedata.state != 'ACTIVE': - logger.critical('specified queue is NOT ACTIVE: %s -- aborting' % infosys.queuedata.name) + logger.critical('specified queue is NOT ACTIVE: %s -- aborting', infosys.queuedata.name) return errors.PANDAQUEUENOTACTIVE except PilotException as error: logger.fatal(error) @@ -81,14 +82,14 @@ def main(): environ['PILOT_SITENAME'] = infosys.queuedata.resource #args.site # TODO: replace with singleton # set requested workflow - logger.info('pilot arguments: %s' % str(args)) + logger.info('pilot arguments: %s', str(args)) workflow = __import__('pilot.workflow.%s' % args.workflow, globals(), locals(), [args.workflow], 0) # Python 3, -1 -> 0 # execute workflow try: exit_code = workflow.run(args) except Exception as e: - logger.fatal('main pilot function caught exception: %s' % e) + logger.fatal('main pilot function caught exception: %s', e) exit_code = None return exit_code @@ -101,62 +102,6 @@ class Args: pass -# rename module to pilot2 to avoid conflict in import with pilot directory -def import_module(**kwargs): - """ - This function allows for importing the pilot code. - - :param kwargs: pilot options (dictionary). - :return: pilot error code (integer). 
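A recurring change in this diff is the switch from eager %-formatting in log calls (logger.critical('... %s' % x)) to argument-style calls (logger.critical('... %s', x)), which defers interpolation to the logging framework so the message is only built when a handler actually emits the record. A minimal sketch of the idiom; the logger and the queue value are illustrative, not taken from the pilot:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

queue_name = "SOME_PANDA_QUEUE"  # illustrative value

# eager: the string is built immediately, even if DEBUG records are filtered out
logger.debug("specified queue is NOT ACTIVE: %s -- aborting" % queue_name)

# lazy: the format string and arguments are handed to the logger, which only
# interpolates them if the record is actually emitted
logger.critical("specified queue is NOT ACTIVE: %s -- aborting", queue_name)
```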
- """ - - argument_dictionary = {'-a': kwargs.get('workdir', ''), - '-d': kwargs.get('debug', None), - '-w': kwargs.get('workflow', 'generic'), - '-l': kwargs.get('lifetime', '3600'), - '-q': kwargs.get('queue'), # required - '-r': kwargs.get('resource'), # required - '-s': kwargs.get('site'), # required - '-j': kwargs.get('job_label', 'ptest'), # change default later to 'managed' - '-i': kwargs.get('version_tag', 'PR'), - '-t': kwargs.get('verify_proxy', True), - '-z': kwargs.get('update_server', True), - '--cacert': kwargs.get('cacert', None), - '--capath': kwargs.get('capath'), - '--url': kwargs.get('url', ''), - '-p': kwargs.get('port', '25443'), - '--country-group': kwargs.get('country_group', ''), - '--working-group': kwargs.get('working_group', ''), - '--allow-other-country': kwargs.get('allow_other_country', 'False'), - '--allow-same-user': kwargs.get('allow_same_user', 'True'), - '--pilot-user': kwargs.get('pilot_user', 'generic'), - '--input-dir': kwargs.get('input_dir', ''), - '--output-dir': kwargs.get('output_dir', ''), - '--hpc-resource': kwargs.get('hpc_resource', ''), - '--harvester-workdir': kwargs.get('harvester_workdir', ''), - '--harvester-datadir': kwargs.get('harvester_datadir', ''), - '--harvester-eventstatusdump': kwargs.get('harvester_eventstatusdump', ''), - '--harvester-workerattributes': kwargs.get('harvester_workerattributes', ''), - '--harvester-submitmode': kwargs.get('harvester_submitmode', ''), - '--resource-type': kwargs.get('resource_type', '') - } - - args = Args() - parser = argparse.ArgumentParser() - try: - _items = list(argument_dictionary.items()) # Python 3 - except Exception: - _items = argument_dictionary.iteritems() # Python 2 - for key, value in _items: - print(key, value) - parser.add_argument(key) - parser.parse_args(args=[key, value], namespace=args) # convert back int and bool strings to int and bool?? 
- - # call main pilot function - - return 0 - - def str2bool(v): """ Helper function to convert string to bool """ @@ -379,6 +324,11 @@ def get_args(): dest='jobtype', default='', help='Job type (managed, user)') + arg_parser.add_argument('--use-rucio-traces', + dest='use_rucio_traces', + type=str2bool, + default=True, + help='Use rucio traces') # HPC options arg_parser.add_argument('--hpc-resource', @@ -413,10 +363,10 @@ def create_main_work_dir(args): try: # create the main PanDA Pilot work directory mkdirs(mainworkdir) - except Exception as e: + except PilotException as error: # print to stderr since logging has not been established yet - print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, e), file=sys.stderr) - exit_code = shell_exit_code(e._errorCode) + print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, error), file=sys.stderr) + exit_code = shell_exit_code(error._errorCode) else: mainworkdir = getcwd() @@ -467,9 +417,15 @@ def set_environment_variables(args, mainworkdir): # set the (HPC) resource name (if set in options) environ['PILOT_RESOURCE_NAME'] = args.hpc_resource + # allow for the possibility of turning off rucio traces + environ['PILOT_USE_RUCIO_TRACES'] = str(args.use_rucio_traces) + # event service executor type environ['PILOT_ES_EXECUTOR_TYPE'] = args.executor_type + if args.output_dir: + environ['PILOT_OUTPUT_DIR'] = args.output_dir + # keep track of the server urls _port = ":%s" % args.port url = args.url if _port in args.url else args.url + _port @@ -495,9 +451,9 @@ def wrap_up(initdir, mainworkdir, args): try: rmtree(mainworkdir) except Exception as e: - logging.warning("failed to remove %s: %s" % (mainworkdir, e)) + logging.warning("failed to remove %s: %s", mainworkdir, e) else: - logging.info("removed %s" % mainworkdir) + logging.info("removed %s", mainworkdir) # in Harvester mode, create a kill_worker file that will instruct Harvester that the pilot has finished if args.harvester: @@ -509,15 +465,15 @@ def wrap_up(initdir, mainworkdir, args): except Exception: exit_code = trace else: - logging.info('traces error code: %d' % exit_code) + logging.info('traces error code: %d', exit_code) if trace.pilot['nr_jobs'] <= 1: if exit_code != 0: - logging.info('an exit code was already set: %d (will be converted to a standard shell code)' % exit_code) + logging.info('an exit code was already set: %d (will be converted to a standard shell code)', exit_code) elif trace.pilot['nr_jobs'] > 0: if trace.pilot['nr_jobs'] == 1: - logging.getLogger(__name__).info('pilot has finished (%d job was processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d job was processed)', trace.pilot['nr_jobs']) else: - logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)', trace.pilot['nr_jobs']) exit_code = SUCCESS elif trace.pilot['state'] == FAILURE: logging.critical('pilot workflow failure -- aborting') @@ -579,7 +535,7 @@ def get_pilot_source_dir(): set_environment_variables(args, mainworkdir) # setup and establish standard logging - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) # execute main function trace = main() diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 3b509b572..aa7e047bf 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - 
Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from .services import Services from pilot.common.exception import NotDefined, NotSameLength, UnknownException @@ -146,21 +146,20 @@ def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision= y = y[:-2] if (len(x) > 7 and len(y) > 7) and len(x) == len(y): - logger.info('fitting %s vs %s' % (y_name, x_name)) + logger.info('fitting %s vs %s', y_name, x_name) try: fit = self.fit(x, y) _slope = self.slope() except Exception as e: - logger.warning('failed to fit data, x=%s, y=%s: %s' % (str(x), str(y), e)) + logger.warning('failed to fit data, x=%s, y=%s: %s', str(x), str(y), e) else: if _slope: slope = float_to_rounded_string(fit.slope(), precision=precision) chi2 = float_to_rounded_string(fit.chi2(), precision=0) # decimals are not needed for chi2 if slope != "": - logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)' % - (slope, len(x), chi2)) + logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)', slope, len(x), chi2) else: - logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)' % (str(x), str(y))) + logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)', str(x), str(y)) return {"slope": slope, "chi2": chi2} @@ -182,8 +181,8 @@ def extract_from_table(self, table, x_name, y_name): y2_name = y_name.split('+')[1] y1_value = table.get(y1_name, []) y2_value = table.get(y2_name, []) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) x = [] y = [] else: @@ -238,7 +237,7 @@ def __init__(self, **kwargs): self.set_intersect() self.set_chi2() else: - logger.warning("\'%s\' model is not implemented" % self._model) + logger.warning("\'%s\' model is not implemented", self._model) raise NotImplementedError() def fit(self): diff --git a/pilot/api/dask.py b/pilot/api/dask.py new file mode 100644 index 000000000..ab5ff3eb5 --- /dev/null +++ b/pilot/api/dask.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 + +try: + # import dask + import dask_kubernetes +except Exception: + pass + +#from pilot.common.exception import NotDefined, NotSameLength, UnknownException +from pilot.util.container import execute +from pilot.util.filehandling import establish_logging, write_file + +import os +import re +from time import sleep + +import logging +logger = logging.getLogger(__name__) + + +class Dask(object): + """ + Dask interface class. + """ + + servicename = 'single-dask' + status = None + loadbalancerip = None + servicetype = "LoadBalancer" + jupyter = False + overrides = "override_values.yaml" + _workdir = os.getcwd() + cluster = None + + def __init__(self, **kwargs): + """ + Init function. 
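In the analytics.py hunk above, get_fitted_data() fits pss+swap against time and logs the slope as a memory-leak rate in B/s together with a chi2 value. As a rough illustration of what that slope corresponds to, here is a hedged sketch using numpy as a stand-in for the pilot's own fitting class; the sample numbers are invented:

```python
import numpy as np

# invented monitoring samples: wall-clock time (s) vs pss+swap memory (B)
t = np.array([0, 60, 120, 180, 240, 300, 360, 420], dtype=float)
mem = np.array([1.00e9, 1.03e9, 1.06e9, 1.10e9, 1.13e9, 1.16e9, 1.20e9, 1.23e9])

slope, intercept = np.polyfit(t, mem, 1)  # least-squares straight-line fit
print("estimated leak rate: %.0f B/s" % slope)  # roughly 5.5e5 B/s for these numbers
```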
+ + :param kwargs: + """ + + _servicename = kwargs.get('servicename', None) + if _servicename: + self.servicename = _servicename + _servicetype = kwargs.get('servicetype', None) + if _servicetype: + self.servicetype = _servicetype + _jupyter = kwargs.get('jupyter', None) + if _jupyter: + self.jupyter = _jupyter + _overrides = kwargs.get('overrides', None) + if _overrides: + self.overrides = _overrides + + def uninstall(self, block=True): + """ + + """ + + logger.info('uninstalling service %s', self.servicename) + if block: + logger.warning('blocking mode not yet implemented') + + cmd = 'helm uninstall %s' % self.servicename + exit_code, stdout, stderr = execute(cmd, mute=True) + if not exit_code: + self.status = 'uninstalled' + logger.info('uninstall of service %s has been requested', self.servicename) + + def install(self, block=True): + """ + + """ + + # can dask be installed? + if not self._validate(): + logger.warning('validation failed') + self.status = 'failed' + else: + logger.debug('dask has been validated') + self.status = 'validated' + + # is the single-dask cluster already running? + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is already running - nothing to install', name) + else: + logger.info('service %s is not yet running - proceed with installation', name) + + # perform helm updates before actual instqllation + cmd = '' + # + override_option = "-f %s" % self.overrides if self.overrides else "" + cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) + exit_code, stdout, stderr = execute(cmd, mute=True) + if not exit_code: + logger.info('installation of service %s is in progress', self.servicename) + + if block: + while True: + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is running', name) + self.status = 'running' + break + else: + self.status = 'pending' + sleep(2) + # note: in non-blocking mode, status is not getting updated + + def is_running(self, name='single-dask-scheduler'): + """ + + """ + + status = False + dictionary = self._get_dictionary(cmd='kubectl get services') + for key in dictionary: + if key == name: + status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False + break + + return status + + def _is_valid_ip(self, ip): + """ + + """ + + regex = r"^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" + return True if re.search(regex, ip) else False + + def _get_dictionary(self, cmd=None): + """ + + """ + + dictionary = {} + if not cmd: + return dictionary + + exit_code, stdout, stderr = execute(cmd, mute=True) + if exit_code: + logger.warning('failed to execute \'%s\': %s', cmd, stdout) + self.status = 'failed' + else: + # parse output + dictionary = self._convert_to_dict(stdout) + + return dictionary + + def _validate(self): + """ + Make sure that pre-conditions are met before any installation can be attempted. + + Pre-conditions: required libraries and commands + 1. library: dask + 2. library: dask_kubernetes + 3. command: helm + 4. command: kubectl + 5. 
copy relevant yaml file(s) + """ + + establish_logging(debug=True) + + # check imported modules + # dask + # dask_kubernetes + + # verify relevant commands + commands = ['helm', 'kubectl'] + found = False + for cmd in commands: + exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) + found = True if 'not found' not in stdout else False + if not found: + logger.warning(stdout) + break + else: + logger.debug('%s verified', cmd) + if not found: + return False + + # create yaml file(s) + self._generate_override_script() + + return True + + def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): + """ + Generate a values yaml script, unless it already exists. + + :param jupyter: False if jupyter notebook server should be disabled (Boolean). + :param servicetype: name of service type (string). + :return: + """ + + filename = os.path.join(self._workdir, self.overrides) + if os.path.exists(filename): + logger.info('file \'%s\' already exists - will not override', filename) + return + + script = "" + if not jupyter: + script += 'jupyter:\n enabled: false\n\n' + if servicetype: + script += 'scheduler:\n serviceType: \"%s\"\n' % servicetype + + if script: + status = write_file(filename, script) + if status: + logger.debug('generated script: %s', filename) + else: + self.overrides = None + + def _convert_to_dict(self, output): + """ + + """ + + dictionary = {} + first_line = [] + for line in output.split('\n'): + try: + # Remove empty entries from list (caused by multiple \t) + _l = re.sub(' +', ' ', line) + _l = [_f for _f in _l.split(' ') if _f] + if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + first_line = _l[1:] + else: + dictionary[_l[0]] = {} + for i in range(len(_l[1:])): + dictionary[_l[0]][first_line[i]] = _l[1:][i] + + except Exception: + logger.warning("unexpected format of utility output: %s", line) + + return dictionary + + def connect_cluster(self, release_name=None, manager=dask_kubernetes.HelmCluster): + """ + + """ + + if not release_name: + release_name = self.servicename + self.cluster = manager(release_name=release_name) + logger.info('connected to %s', manager.__name__) + + def scale(self, number): + """ + + """ + + if number > 2: + logger.warning('too large scale: %d (please use <= 2 for now)', number) + return + if not self.cluster: + self.connect_cluster() + if not self.cluster: + logger.warning('cluster not connected - cannot proceed') + self.status = 'failed' + return + + logger.info('setting scale to: %d', number) + self.cluster.scale(number) + + def shutdown(self): + """ + Shutdown logging. 
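A hypothetical end-to-end use of the new Dask helper added in pilot/api/dask.py: install the helm release, scale the workers, then tear everything down. The keyword values mirror the class defaults above, and helm, kubectl and the dask helm chart are assumed to be available on the node; this is a usage sketch, not part of the change itself:

```python
from pilot.api.dask import Dask

# values are illustrative and match the defaults defined in the class above
dask = Dask(servicename='single-dask', jupyter=False, overrides='override_values.yaml')

dask.install(block=True)      # writes override_values.yaml and runs 'helm install'
if dask.status == 'running':
    dask.scale(2)             # connects via dask_kubernetes.HelmCluster and rescales the workers
    # ... submit work against the <servicename>-scheduler service here ...
    dask.uninstall()          # 'helm uninstall' of the release

dask.shutdown()               # drop the logging handlers when done
```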
+ + """ + + logging.handlers = [] + logging.shutdown() diff --git a/pilot/api/data.py b/pilot/api/data.py index 7d9246c59..bd5385311 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 @@ -69,7 +69,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ super(StagingClient, self).__init__() if not logger: - logger = logging.getLogger('%s.%s' % (__name__, 'null')) + logger = logging.getLogger(__name__ + '.null') logger.disabled = True self.logger = logger @@ -93,16 +93,16 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ if not self.acopytools.get('default'): self.acopytools['default'] = self.get_default_copytools(default_copytools) + # get an initialized trace report (has to be updated for get/put if not defined before) + self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) + if not self.acopytools: msg = 'failed to initilize StagingClient: no acopytools options found, acopytools=%s' % self.acopytools logger.error(msg) self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg) self.trace_report.send() raise PilotException("failed to resolve acopytools settings") - logger.info('configured copytools per activity: acopytools=%s' % self.acopytools) - - # get an initialized trace report (has to be updated for get/put if not defined before) - self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) + logger.info('configured copytools per activity: acopytools=%s', self.acopytools) def set_acopytools(self): """ @@ -268,17 +268,17 @@ def resolve_replicas(self, files, use_vp=False): # add signature lifetime for signed URL storages query.update(signature_lifetime=24 * 3600) # note: default is otherwise 1h - logger.info('calling rucio.list_replicas() with query=%s' % query) + logger.info('calling rucio.list_replicas() with query=%s', query) try: replicas = c.list_replicas(**query) - except Exception as e: - raise PilotException("Failed to get replicas from Rucio: %s" % e, code=ErrorCodes.RUCIOLISTREPLICASFAILED) + except Exception as error: + raise PilotException("Failed to get replicas from Rucio: %s" % error, code=ErrorCodes.RUCIOLISTREPLICASFAILED) show_memory_usage() replicas = list(replicas) - logger.debug("replicas received from Rucio: %s" % replicas) + logger.debug("replicas received from Rucio: %s", replicas) files_lfn = dict(((e.scope, e.lfn), e) for e in xfiles) for replica in replicas: @@ -294,18 +294,18 @@ def resolve_replicas(self, files, use_vp=False): self.trace_report.update(validateStart=time.time()) status = True if fdat.filesize != replica['bytes']: - logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s" - % (fdat.lfn, fdat.filesize, replica['bytes'], fdat)) + logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s", + fdat.lfn, fdat.filesize, replica['bytes'], fdat) status = False if not fdat.filesize: fdat.filesize = replica['bytes'] - logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s" % (fdat.lfn, replica['bytes'])) + 
logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s", fdat.lfn, replica['bytes']) for ctype in ['adler32', 'md5']: if fdat.checksum.get(ctype) != replica[ctype] and replica[ctype]: - logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s" - % (fdat.lfn, fdat.checksum, replica[ctype], fdat)) + logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s", + fdat.lfn, fdat.checksum, replica[ctype], fdat) status = False if not fdat.checksum.get(ctype) and replica[ctype]: @@ -489,33 +489,32 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] - self.logger.info('trying to use copytool=%s for activity=%s' % (name, activity)) + self.logger.info('trying to use copytool=%s for activity=%s', name, activity) copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) # Python 2/3 - self.trace_report.update(protocol=name) + #self.trace_report.update(protocol=name) - except PilotException as e: - caught_errors.append(e) - self.logger.debug('error: %s' % e) + except PilotException as error: + caught_errors.append(error) + self.logger.debug('error: %s', error) continue - except Exception as e: - self.logger.warning('failed to import copytool module=%s, error=%s' % (module, e)) + except Exception as error: + self.logger.warning('failed to import copytool module=%s, error=%s', module, error) continue try: - #self.logger.debug('kwargs=%s' % str(kwargs)) result = self.transfer_files(copytool, remain_files, activity, **kwargs) - self.logger.debug('transfer_files() using copytool=%s completed with result=%s' % (copytool, str(result))) + self.logger.debug('transfer_files() using copytool=%s completed with result=%s', copytool, str(result)) show_memory_usage() break - except PilotException as e: - self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) - except TimeoutException as e: - self.logger.warning('function timed out: %s' % e) - caught_errors.append(e) - except Exception as e: - self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) + except PilotException as error: + self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s', copytool, error) + caught_errors.append(error) + except TimeoutException as error: + self.logger.warning('function timed out: %s', error) + caught_errors.append(error) + except Exception as error: + self.logger.warning('failed to transfer files using copytool=%s .. 
skipped; error=%s', copytool, error) + caught_errors.append(error) import traceback self.logger.error(traceback.format_exc()) @@ -523,10 +522,10 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 caught_errors[-1].get_error_code() == ErrorCodes.MISSINGOUTPUTFILE: raise caught_errors[-1] - remain_files = [f for f in files if f.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [fspec for fspec in files if fspec.status not in ['remote_io', 'transferred', 'no_transfer']] if remain_files: # failed or incomplete transfer - # Propagate message from first error back up + # propagate message from first error back up errmsg = str(caught_errors[0]) if caught_errors else '' if caught_errors and "Cannot authenticate" in str(caught_errors): code = ErrorCodes.STAGEINAUTHENTICATIONFAILURE @@ -537,7 +536,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? - self.logger.warning('caught time-out exception: %s' % caught_errors[0]) + self.logger.warning('caught time-out exception: %s', caught_errors[0]) else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? details = str(caught_errors) + ":" + 'failed to transfer files using copytools=%s' % copytools @@ -575,13 +574,13 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found error = 'Failed to resolve protocol for file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, allowed_schemas, fspec) - self.logger.error("resolve_protocol: %s" % error) + self.logger.error("resolve_protocol: %s", error) raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?) protocol = protocols[0] - self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'" % (protocol, fspec.lfn)) + self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'", protocol, fspec.lfn) resolve_surl = getattr(copytool, 'resolve_surl', None) if not callable(resolve_surl): @@ -608,7 +607,7 @@ def resolve_protocols(self, files): ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: error = 'Failed to resolve output ddmendpoint by name=%s (from PanDA), please check configuration.' % fdat.ddmendpoint - self.logger.error("resolve_protocols: %s, fspec=%s" % (error, fdat)) + self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) raise PilotException(error, code=ErrorCodes.NOSTORAGE) protocols = [] @@ -689,13 +688,13 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom pschemas = 'any' if primary_schemas and not primary_schemas[0] else ','.join(primary_schemas or []) error = 'Failed to find replica for file=%s, domain=%s, allowed_schemas=%s, pschemas=%s, fspec=%s' % (fspec.lfn, domain, schemas, pschemas, fspec) - self.logger.info("resolve_replica: %s" % error) + self.logger.info("resolve_replica: %s", error) return # prefer SRM protocol for surl -- to be verified, can it be deprecated? 
rse_replicas = replicas.get(replica['ddmendpoint'], []) surl = self.get_preferred_replica(rse_replicas, ['srm']) or rse_replicas[0] - self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s" % (surl['pfn'], surl['ddmendpoint'])) + self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s", surl['pfn'], surl['ddmendpoint']) return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} @@ -719,42 +718,10 @@ def get_direct_access_variables(self, job): if job and not job.is_analysis() and job.transfertype != 'direct': # task forbids direct access allow_direct_access = False - self.logger.info('switched off direct access mode for production job since transfertype=%s' % job.transfertype) + self.logger.info('switched off direct access mode for production job since transfertype=%s', job.transfertype) return allow_direct_access, direct_access_type - #def set_accessmodes_for_direct_access(self, files, direct_access_type): ## TO BE DEPRECATED (anisyonk) - # """ - # Update the FileSpec accessmodes for direct access and sort the files to get candidates for remote_io coming - # first in order to exclude them from checking of available space for stage-in. - # - # :param files: FileSpec objects. - # :param direct_access_type: type of direct access (LAN or WAN) (string). - # :return: - # """ - # - # # sort the files - # files = sorted(files, key=lambda x: x.is_directaccess(ensure_replica=False), reverse=True) - # - # # populate allowremoteinputs for each FileSpec object - # for fdata in files: - # is_directaccess = fdata.is_directaccess(ensure_replica=False) - # if is_directaccess and direct_access_type == 'WAN': ## is it the same for ES workflow ?? -- test and verify/FIXME LATER - # fdata.allowremoteinputs = True - # self.logger.info("check direct access for lfn=%s: allow_direct_access=true, fdata.is_directaccess()=%s =>" - # " is_directaccess=%s, allowremoteinputs=%s" % (fdata.lfn, - # fdata.is_directaccess(ensure_replica=False), - # is_directaccess, fdata.allowremoteinputs)) - # # must update accessmode for user jobs (it is only set already for production jobs) - # if fdata.accessmode != 'direct' and is_directaccess and fdata.accessmode != 'copy': - # fdata.accessmode = 'direct' - # - # # reset accessmode if direct access is not to be used - # if fdata.accessmode == 'direct' and not is_directaccess: - # fdata.accessmode = '' - # - # self.logger.info('accessmode for LFN=%s: %s (is_directaccess=%s)' % (fdata.lfn, fdata.accessmode, is_directaccess)) - def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C901 """ Automatically stage in files using the selected copy tool module. 
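The StagingClient code above keeps loading copytool backends dynamically with __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0). A hedged sketch of the same lookup written with importlib, the usual modern spelling of this pattern; the backend name and the commented-out calls are illustrative only:

```python
import importlib

def load_copytool(name):
    """Return the copytool backend module pilot.copytool.<name> (illustrative helper)."""
    return importlib.import_module('pilot.copytool.%s' % name)

# copytool = load_copytool('rucio')            # hypothetical backend name
# copytool.is_valid_for_copy_in(remain_files)  # entry points are defined per backend module
```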
@@ -780,7 +747,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # overwrite allowed_schemas for VP jobs if kwargs['use_vp']: allowed_schemas = ['root'] - self.logger.debug('overwrote allowed_schemas for VP job: %s' % str(allowed_schemas)) + self.logger.debug('overwrote allowed_schemas for VP job: %s', str(allowed_schemas)) for fspec in files: resolve_replica = getattr(copytool, 'resolve_replica', None) @@ -796,11 +763,11 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 fspec.is_directaccess(ensure_replica=False) else None) replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='lan') else: - self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)" % (fspec.lfn, fspec.allow_lan)) + self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)", fspec.lfn, fspec.allow_lan) if not replica and fspec.allow_lan: - self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) # check remote replicas if not replica and fspec.allow_wan: @@ -808,12 +775,12 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 primary_schemas = (self.direct_remoteinput_allowed_schemas if fspec.direct_access_wan and fspec.is_directaccess(ensure_replica=False) else None) xschemas = self.remoteinput_allowed_schemas - allowed_schemas = [e for e in allowed_schemas if e in xschemas] if allowed_schemas else xschemas + allowed_schemas = [schema for schema in allowed_schemas if schema in xschemas] if allowed_schemas else xschemas replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='wan') if not replica and fspec.allow_wan: - self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) if not replica: raise ReplicasNotFound('No replica found for lfn=%s (allow_lan=%s, allow_wan=%s)' % (fspec.lfn, fspec.allow_lan, fspec.allow_wan)) @@ -826,8 +793,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if replica.get('domain'): fspec.domain = replica['domain'] - self.logger.info("[stage-in] found replica to be used for lfn=%s: ddmendpoint=%s, pfn=%s" % - (fspec.lfn, fspec.ddmendpoint, fspec.turl)) + self.logger.info("[stage-in] found replica to be used for lfn=%s: ddmendpoint=%s, pfn=%s", fspec.lfn, fspec.ddmendpoint, fspec.turl) # prepare files (resolve protocol/transfer url) if getattr(copytool, 'require_input_protocols', False) and files: @@ -845,7 +811,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if not copytool.is_valid_for_copy_in(remain_files): msg = 'input is not valid for transfers using copytool=%s' % copytool self.logger.warning(msg) - self.logger.debug('input: %s' % remain_files) + self.logger.debug('input: %s', remain_files) self.trace_report.update(clientState='NO_REPLICA', stateReason=msg) self.trace_report.send() raise PilotException('invalid input data for transfer operation') @@ -867,7 +833,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # add 
the trace report kwargs['trace_report'] = self.trace_report - self.logger.info('ready to transfer (stage-in) files: %s' % remain_files) + self.logger.info('ready to transfer (stage-in) files: %s', remain_files) # use bulk downloads if necessary # if kwargs['use_bulk_transfer'] @@ -896,11 +862,11 @@ def set_status_for_direct_access(self, files, workdir): # direct_lan = True if not direct_lan and not direct_wan: - self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) + self.logger.debug('direct lan/wan transfer will not be used for lfn=%s', fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, - str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas))) + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s', + fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, + str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), fspec.domain) if direct_lan or direct_wan: fspec.status_code = 0 @@ -910,8 +876,8 @@ def set_status_for_direct_access(self, files, workdir): if alrb_xcache_proxy and direct_lan: #fspec.is_directaccess(ensure_replica=False): fspec.turl = '${ALRB_XCACHE_PROXY}' + fspec.turl - self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.turl)) + self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s', + fspec.lfn, direct_lan, direct_wan, fspec.turl) # send trace localsite = os.environ.get('RUCIO_LOCAL_SITE_ID') @@ -933,7 +899,7 @@ def set_status_for_direct_access(self, files, workdir): if not os.path.exists(_workdir): path = os.path.join('/srv', config.Pilot.base_trace_report) if not os.path.exists(path): - self.logger.debug('writing base trace report to: %s' % path) + self.logger.debug('writing base trace report to: %s', path) write_json(path, self.trace_report) else: self.trace_report.send() @@ -947,7 +913,7 @@ def check_availablespace(self, files): """ for f in files: - self.logger.debug('lfn=%s filesize=%d accessmode=%s' % (f.lfn, f.filesize, f.accessmode)) + self.logger.debug('lfn=%s filesize=%d accessmode=%s', f.lfn, f.filesize, f.accessmode) maxinputsize = convert_mb_to_b(get_maximum_input_sizes()) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) @@ -958,12 +924,11 @@ def check_availablespace(self, files): (len(files), totalsize, maxinputsize) raise SizeTooLarge(error) - self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)" % - (totalsize, maxinputsize)) + self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)", totalsize, maxinputsize) # get available space available_space = convert_mb_to_b(get_local_disk_space(os.getcwd())) - self.logger.info("locally available space: %d B" % available_space) + self.logger.info("locally available space: %d B", available_space) # are we within the limit? 
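check_availablespace() above sums the input file sizes and compares the total against the locally available disk space (using the pilot's own convert_mb_to_b and get_local_disk_space helpers) before the limit check that follows. The same guard, sketched with the standard library instead of the pilot helpers; shutil and the byte values are assumptions for illustration:

```python
import os
import shutil

filesizes = [2 * 1024**3, 5 * 1024**3]            # invented input file sizes in bytes
totalsize = sum(filesizes)

available_space = shutil.disk_usage(os.getcwd()).free
if totalsize > available_space:
    raise RuntimeError('not enough local space for input files: %d B > %d B'
                       % (totalsize, available_space))
```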
if totalsize > available_space: @@ -1018,17 +983,17 @@ def prepare_destinations(self, files, activities): # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s" % (activity, storages)) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s" % (activity, ddm)) + self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) + self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) for e in files: if not e.ddmendpoint: # no preferences => use default destination self.logger.info("[prepare_destinations][%s]: fspec.ddmendpoint is not set for lfn=%s" - " .. will use default ddm=%s as (local) destination" % (activity, e.lfn, ddm)) + " .. will use default ddm=%s as (local) destination", activity, e.lfn, ddm) e.ddmendpoint = ddm elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => assume it as final (non local) alternative destination self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations" - " .. will consider default ddm=%s for transfer and tag %s as alt. location" % (activity, e.ddmendpoint, ddm, e.ddmendpoint)) + " .. will consider default ddm=%s for transfer and tag %s as alt. location", activity, e.ddmendpoint, ddm, e.ddmendpoint) e.ddmendpoint = ddm e.ddmendpoint_alt = e.ddmendpoint # consider me later @@ -1100,13 +1065,13 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not fspec.ddmendpoint: # ensure that output destination is properly set if 'mv' not in self.infosys.queuedata.copytools: - msg = 'No output RSE defined for file=%s' % fspec.lfn + msg = 'no output RSE defined for file=%s' % fspec.lfn self.logger.error(msg) raise PilotException(msg, code=ErrorCodes.NOSTORAGE, state='NO_OUTPUTSTORAGE_DEFINED') pfn = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(kwargs.get('workdir', ''), fspec.lfn) if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK): - msg = "Error: output pfn file does not exist: %s" % pfn + msg = "output pfn file does not exist: %s" % pfn self.logger.error(msg) self.trace_report.update(clientState='MISSINGOUTPUTFILE', stateReason=msg) self.trace_report.send() @@ -1134,10 +1099,10 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not copytool.is_valid_for_copy_out(files): self.logger.warning('Input is not valid for transfers using copytool=%s' % copytool) - self.logger.debug('Input: %s' % files) + self.logger.debug('Input: %s', files) raise PilotException('Invalid input for transfer operation') - self.logger.info('ready to transfer (stage-out) files: %s' % files) + self.logger.info('ready to transfer (stage-out) files: %s', files) if self.infosys: kwargs['copytools'] = self.infosys.queuedata.copytools diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 708e6de7a..e246cbd9f 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern,ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -46,7 +46,7 @@ def prepare_sources(self, files, activities=None): fspec.scope = 'transient' if storage_id: fspec.ddmendpoint = self.infosys.get_ddmendpoint(storage_id) - logger.info("Processed file with storage id: %s" % fspec) + 
logger.info("Processed file with storage id: %s", fspec) class StageOutESClient(StageOutClient): diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 08aa02b75..68192b14f 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -147,6 +147,7 @@ class ErrorCodes: XRDCPERROR = 1362 KILLPAYLOAD = 1363 # note, not a failure but a kill instruction from Raythena MISSINGCREDENTIALS = 1364 + NOCTYPES = 1365 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -272,7 +273,8 @@ class ErrorCodes: REMOTEFILECOULDNOTBEOPENED: "Remote file could not be opened", XRDCPERROR: "Xrdcp was unable to open file", KILLPAYLOAD: "Raythena has decided to kill payload", - MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer" + MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer", + NOCTYPES: "Python module ctypes not available on worker node" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] @@ -386,6 +388,7 @@ def resolve_transform_error(self, exit_code, stderr): :return: pilot error code (int) """ + ec = 0 if exit_code == 251 and "Not mounting requested bind point" in stderr: ec = self.SINGULARITYBINDPOINTFAILURE elif exit_code == 255 and "No more available loop devices" in stderr: @@ -394,16 +397,18 @@ def resolve_transform_error(self, exit_code, stderr): ec = self.SINGULARITYIMAGEMOUNTFAILURE elif exit_code == 255 and "Operation not permitted" in stderr: ec = self.SINGULARITYGENERALFAILURE - elif exit_code == 64 and "Singularity is not installed" in stderr: + elif "Singularity is not installed" in stderr: # exit code should be 64 but not always? ec = self.SINGULARITYNOTINSTALLED elif exit_code == 64 and "cannot create directory" in stderr: ec = self.MKDIR elif exit_code == -1: ec = self.UNKNOWNTRFFAILURE - else: + #else: # do not assign a pilot error code for unidentified transform error, return 0 - ec = 0 + # ec = 0 + if not ec: + ec = exit_code return ec def extract_stderr_error(self, stderr): diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index cf2b5f27f..27925299a 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -6,6 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -30,11 +31,11 @@ def get_plugin(self, confs): class_name = confs['class'] if class_name is None: - logger.error("[class] is not defined in confs: %s" % confs) + logger.error("[class] is not defined in confs: %s", confs) return None if class_name not in self.classMap: - logger.info("Trying to import %s" % class_name) + logger.info("Trying to import %s", class_name) components = class_name.split('.') mod = __import__('.'.join(components[:-1])) for comp in components[1:]: @@ -48,7 +49,7 @@ def get_plugin(self, confs): args[key] = confs[key] cls = self.classMap[class_name] - logger.info("Importing %s with args: %s" % (cls, args)) + logger.info("Importing %s with args: %s", cls, args) impl = cls(**args) return impl diff --git a/pilot/control/data.py b/pilot/control/data.py index be4fc8a26..596c3a491 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 @@ -32,7 +32,7 @@ from pilot.util.constants import 
PILOT_PRE_STAGEIN, PILOT_POST_STAGEIN, PILOT_PRE_STAGEOUT, PILOT_POST_STAGEOUT, LOG_TRANSFER_IN_PROGRESS,\ LOG_TRANSFER_DONE, LOG_TRANSFER_NOT_DONE, LOG_TRANSFER_FAILED, SERVER_UPDATE_RUNNING, MAX_KILL_WAIT_TIME, UTILITY_BEFORE_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove +from pilot.util.filehandling import remove, write_file from pilot.util.processes import threads_aborted from pilot.util.queuehandling import declare_failed_by_kill, put_in_queue from pilot.util.timing import add_to_pilot_timing @@ -63,7 +63,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. @@ -107,8 +107,8 @@ def skip_special_files(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: user.update_stagein(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) def update_indata(job): @@ -124,7 +124,7 @@ def update_indata(job): if fspec.status == 'no_transfer': toberemoved.append(fspec) for fspec in toberemoved: - logger.info('removing fspec object (lfn=%s) from list of input files' % fspec.lfn) + logger.info('removing fspec object (lfn=%s) from list of input files', fspec.lfn) job.indata.remove(fspec) @@ -193,11 +193,11 @@ def _stage_in(args, job): pilot.util.middleware.containerise_middleware(job, job.indata, args.queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, args.input_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-in containerisation threw a pilot exception: %s' % e) - except Exception as e: + except PilotException as error: + logger.warning('stage-in containerisation threw a pilot exception: %s', error) + except Exception as error: import traceback - logger.warning('stage-in containerisation threw an exception: %s' % e) + logger.warning('stage-in containerisation threw an exception: %s', error) logger.error(traceback.format_exc()) else: try: @@ -224,17 +224,17 @@ def _stage_in(args, job): msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception as error: - logger.error('failed to stage-in: error=%s' % error) + logger.error('failed to stage-in: error=%s', error) logger.info('summary of transferred files:') - for e in job.indata: - status = e.status if e.status else "(not transferred)" - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + for infile in job.indata: + status = infile.status if infile.status else "(not transferred)" + logger.info(" -- lfn=%s, status_code=%s, status=%s", infile.lfn, infile.status_code, status) # write time stamps to pilot timing file add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) - remain_files = [e for e in job.indata if e.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']] logger.info("stage-in finished") if not remain_files else logger.info("stage-in failed") return not remain_files @@ -255,8 
+255,8 @@ def get_rse(data, lfn=""): if lfn == "": try: return data[0].ddmendpoint - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as error: + logger.warning("exception caught: %s", error) logger.warning("end point is currently unknown") return "unknown" @@ -271,7 +271,7 @@ def get_rse(data, lfn=""): return rse -def stage_in_auto(site, files): +def stage_in_auto(files): """ Separate dummy implementation for automatic stage-in outside of pilot workflows. Should be merged with regular stage-in functionality later, but we need to have @@ -289,47 +289,47 @@ def stage_in_auto(site, files): '--no-subdir'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['destination']): - f['status'] = 'failed' - f['errmsg'] = 'Destination directory does not exist: %s' % f['destination'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['destination']): + _file['status'] = 'failed' + _file['errmsg'] = 'Destination directory does not exist: %s' % _file['destination'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully downloaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully downloaded.' + _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--dir', f['destination']] - tmp_executable.append('%s:%s' % (f['scope'], - f['name'])) + tmp_executable += ['--dir', _file['destination']] + tmp_executable.append('%s:%s' % (_file['scope'], + _file['name'])) process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - f['errno'] = 2 + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully downloaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully downloaded.' else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -337,7 +337,7 @@ def stage_in_auto(site, files): return files -def stage_out_auto(site, files): +def stage_out_auto(files): """ Separate dummy implementation for automatic stage-out outside of pilot workflows. 
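Both stage_in_auto() and stage_out_auto() drive the rucio command line through subprocess.Popen, poll until the process finishes and, on failure, pull the human-readable reason out of the 'Details:' line in stderr. The pattern in isolation, as a hedged sketch; the example command is illustrative and not taken from the pilot:

```python
import subprocess
import time

def run_and_extract_details(cmd):
    """Run a command, wait for it and return (exit_code, details_or_None)."""
    process = subprocess.Popen(cmd, bufsize=-1, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
    while process.poll() is None:
        time.sleep(0.5)
    _, stderr = process.communicate()
    details = None
    if process.returncode != 0:
        # rucio puts the readable reason on a line starting with 'Details:'
        matches = [line for line in stderr.split('\n') if line.startswith('Details:')]
        details = matches[0][9:-1] if matches else None
    return process.returncode, details

# exit_code, details = run_and_extract_details(['rucio', '-v', 'download', '--dir', '/tmp', 'scope:name'])
```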
Should be merged with regular stage-out functionality later, but we need to have @@ -351,63 +351,60 @@ def stage_out_auto(site, files): 'rucio', '-v', 'upload'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['file']): - f['status'] = 'failed' - f['errmsg'] = 'Source file does not exist: %s' % f['file'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['file']): + _file['status'] = 'failed' + _file['errmsg'] = 'Source file does not exist: %s' % _file['file'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully uploaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully uploaded.' + _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--rse', f['rse']] + tmp_executable += ['--rse', _file['rse']] - if 'no_register' in list(f.keys()) and f['no_register']: # Python 2/3 + if 'no_register' in list(_file.keys()) and _file['no_register']: # Python 2/3 tmp_executable += ['--no-register'] - if 'summary' in list(f.keys()) and f['summary']: # Python 2/3 + if 'summary' in list(_file.keys()) and _file['summary']: # Python 2/3 tmp_executable += ['--summary'] - if 'lifetime' in list(f.keys()): # Python 2/3 - tmp_executable += ['--lifetime', str(f['lifetime'])] + if 'lifetime' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--lifetime', str(_file['lifetime'])] - if 'guid' in list(f.keys()): # Python 2/3 - tmp_executable += ['--guid', f['guid']] + if 'guid' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--guid', _file['guid']] - if 'attach' in list(f.keys()): # Python 2/3 - tmp_executable += ['--scope', f['scope'], '%s:%s' % (f['attach']['scope'], f['attach']['name']), f['file']] + if 'attach' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--scope', _file['scope'], '%s:%s' % (_file['attach']['scope'], _file['attach']['name']), _file['file']] else: - tmp_executable += ['--scope', f['scope'], f['file']] + tmp_executable += ['--scope', _file['scope'], _file['file']] - process = subprocess.Popen(tmp_executable, - bufsize=-1, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - f['errno'] = 2 + process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully uploaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully uploaded.' 
else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -415,36 +412,39 @@ def stage_out_auto(site, files): return files -def xcache_proxy(output): +def write_output(filename, output): + """ + Write command output to file. - for line in output.split('\n'): - if 'ALRB_XCACHE_PROXY' in line: - set_xcache_proxy(line, remote='REMOTE' in line) - if 'Messages logged in' in line: - set_xcache_log(line) + :param filename: file name (string). + :param output: command stdout/stderr (string). + :return: + """ + try: + write_file(filename, output, unique=True) + except PilotException as error: + logger.warning('failed to write utility output to file: %s, %s', error, output) + else: + logger.debug('wrote %s', filename) -def set_xcache_proxy(line, remote=None): - - import re - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - if remote: - os.environ['ALRB_XCACHE_PROXY_REMOTE'] = result[0] - else: - os.environ['ALRB_XCACHE_PROXY'] = result[0] +def write_utility_output(workdir, step, stdout, stderr): + """ + Write the utility command output to stdout, stderr files to the job.workdir for the current step. + -> _stdout.txt, _stderr.txt + Example of step: xcache. -def set_xcache_log(line): + :param workdir: job workdir (string). + :param step: utility step (string). + :param stdout: command stdout (string). + :param stderr: command stderr (string). 
+ :return: + """ - import re - pattern = r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - os.environ['ALRB_XCACHE_LOG'] = result[0] + # dump to files + write_output(os.path.join(workdir, step + '_stdout.txt'), stdout) + write_output(os.path.join(workdir, step + '_stderr.txt'), stderr) def copytool_in(queues, traces, args): @@ -474,15 +474,26 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: - exit_code, stdout, stderr = execute(cmd.get('command')) - logger.debug('exit_code=%d' % exit_code) - logger.debug('stderr=%s' % stderr) - logger.debug('stdout=%s' % stdout) - # move code to user area - xcache_proxy(stdout) + # xcache debug + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before xcache start] stdout=%s', _stdout) + #logger.debug('[before xcache start] stderr=%s', _stderr) + + _, stdout, stderr = execute(cmd.get('command')) + logger.debug('stdout=%s', stdout) + logger.debug('stderr=%s', stderr) + + # xcache debug + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after xcache start] stdout=%s', _stdout) + #logger.debug('[after xcache start] stderr=%s', _stderr) - logger.debug('ALRB_XCACHE_PROXY=%s' % os.environ.get('ALRB_XCACHE_PROXY', '')) - logger.debug('ALRB_XCACHE_PROXY_REMOTE=%s' % os.environ.get('ALRB_XCACHE_PROXY_REMOTE', '')) + # perform any action necessary after command execution (e.g. 
stdout processing) + kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} + user.post_prestagein_utility_command(**kwargs) + + # write output to log files + write_utility_output(job.workdir, cmd.get('label', 'utility'), stdout, stderr) # place it in the current stage-in queue (used by the jobs' queue monitoring) put_in_queue(job, queues.current_data_in) @@ -516,7 +527,7 @@ def copytool_in(queues, traces, args): # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) # now create input file metadata if required by the payload if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic': @@ -524,12 +535,12 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 file_dictionary = get_input_file_dictionary(job.indata) xml = user.create_input_file_metadata(file_dictionary, job.workdir) - logger.info('created input file metadata:\n%s' % xml) + logger.info('created input file metadata:\n%s', xml) else: # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) logger.warning('stage-in failed, adding job object to failed_data_in queue') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED) set_pilot_state(job=job, state="failed") @@ -593,7 +604,7 @@ def copytool_out(queues, traces, args): if is_already_processed(queues, processed_jobs): continue - logger.info('will perform stage-out for job id=%s' % job.jobid) + logger.info('will perform stage-out for job id=%s', job.jobid) if args.abort_job.is_set(): traces.pilot['command'] = 'abort' @@ -655,7 +666,7 @@ def is_already_processed(queues, processed_jobs): for jobid in processed_jobs: if jobid in jobids: - logger.warning('output from job %s has already been staged out' % jobid) + logger.warning('output from job %s has already been staged out', jobid) found = True break if found: @@ -669,6 +680,7 @@ def get_input_file_dictionary(indata): Return an input file dictionary. Format: {'guid': 'pfn', ..} Normally use_turl would be set to True if direct access is used. + Note: any environment variables in the turls will be expanded :param indata: list of FileSpec objects. :return: file dictionary. 
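The hunk that follows passes every entry of the input file dictionary through os.path.expandvars(), so that placeholders such as ${ALRB_XCACHE_PROXY}, which the direct-access code above prepends to the turl, are resolved before the dictionary reaches the payload. A small illustration; the proxy value and turl are made up:

```python
import os

os.environ['ALRB_XCACHE_PROXY'] = 'root://xcache.example.org:1094//'   # invented value
turl = '${ALRB_XCACHE_PROXY}root://dcache.example.org//atlas/file.root'

print(os.path.expandvars(turl))
# -> root://xcache.example.org:1094//root://dcache.example.org//atlas/file.root
```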
@@ -678,6 +690,7 @@ def get_input_file_dictionary(indata): for fspec in indata: ret[fspec.guid] = fspec.turl if fspec.status == 'remote_io' else fspec.lfn + ret[fspec.guid] = os.path.expandvars(ret[fspec.guid]) # correction for ND and mv # in any case use the lfn instead of pfn since there are trf's that have problems with pfn's @@ -695,7 +708,7 @@ def filter_files_for_log(directory): """ filtered_files = [] maxfilesize = 10 - for root, dirnames, filenames in os.walk(directory): + for root, _, filenames in os.walk(directory): for filename in filenames: location = os.path.join(root, filename) if os.path.exists(location): # do not include broken links @@ -705,7 +718,7 @@ def filter_files_for_log(directory): return filtered_files -def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False): +def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False, debugmode=False): """ Create the tarball for the job. @@ -716,11 +729,13 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out :param input_files: list of input files to remove (list). :param output_files: list of output files to remove (list). :param is_looping: True for looping jobs, False by default (Boolean). + :param debugmode: True if debug mode has been switched on (Boolean). :raises LogFileCreationFailure: in case of log file creation problem. :return: """ - logger.debug('preparing to create log file') + logger.debug('preparing to create log file (debug mode=%s)', str(debugmode)) + # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) current_dir = os.getcwd() @@ -731,13 +746,13 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out if cleanup: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - user.remove_redundant_files(workdir, islooping=is_looping) + user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode) # remove any present input/output files before tarring up workdir - for f in input_files + output_files: - path = os.path.join(workdir, f) + for fname in input_files + output_files: + path = os.path.join(workdir, fname) if os.path.exists(path): - logger.info('removing file: %s' % path) + logger.info('removing file: %s', path) remove(path) # rename the workdir for the tarball creation @@ -747,20 +762,20 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out workdir = newworkdir fullpath = os.path.join(workdir, logfile_name) # /some/path/to/dirname/log.tgz - logger.info('will create archive %s' % fullpath) + logger.info('will create archive %s', fullpath) try: cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" 
% (fullpath, tarball_name) - exit_code, stdout, stderr = execute(cmd) - except Exception as e: - raise LogFileCreationFailure(e) + _, stdout, _ = execute(cmd) + except Exception as error: + raise LogFileCreationFailure(error) else: if pilot_home != current_dir: os.chdir(pilot_home) - logger.debug('stdout = %s' % stdout) + logger.debug('stdout = %s', stdout) try: os.rename(workdir, orgworkdir) - except Exception as e: - logger.debug('exception caught: %s' % e) + except Exception as error: + logger.debug('exception caught: %s', error) def _do_stageout(job, xdata, activity, queue, title, output_dir=''): @@ -775,7 +790,7 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): :return: True in case of success transfers """ - logger.info('prepare to stage-out %d %s file(s)' % (len(xdata), title)) + logger.info('prepare to stage-out %d %s file(s)', len(xdata), title) label = 'stage-out' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? @@ -787,10 +802,10 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): pilot.util.middleware.containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, output_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-out containerisation threw a pilot exception: %s' % e) - except Exception as e: - logger.warning('stage-out containerisation threw an exception: %s' % e) + except PilotException as error: + logger.warning('stage-out containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('stage-out containerisation threw an exception: %s', error) else: try: logger.info('stage-out will not be done in a container') @@ -820,16 +835,14 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): logger.debug('stage-out client completed') logger.info('summary of transferred files:') - for e in xdata: - if not e.status: + for iofile in xdata: + if not iofile.status: status = "(not transferred)" else: - status = e.status - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + status = iofile.status + logger.info(" -- lfn=%s, status_code=%s, status=%s", iofile.lfn, iofile.status_code, status) - remain_files = [e for e in xdata if e.status not in ['transferred']] - logger.debug('remain_files=%s' % str(remain_files)) - logger.debug('xdata=%s' % str(xdata)) + remain_files = [iofile for iofile in xdata if iofile.status not in ['transferred']] return not remain_files @@ -878,9 +891,9 @@ def _stage_out_new(job, args): output_files = [fspec.lfn for fspec in job.outdata] create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup, input_files=input_files, output_files=output_files, - is_looping=errors.LOOPINGJOB in job.piloterrorcodes) - except LogFileCreationFailure as e: - logger.warning('failed to create tar file: %s' % e) + is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug) + except LogFileCreationFailure as error: + logger.warning('failed to create tar file: %s', error) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE) return False @@ -900,32 +913,27 @@ def _stage_out_new(job, args): # generate fileinfo details to be send to Panda fileinfo = {} - for e in job.outdata + job.logdata: - if e.status in ['transferred']: - logger.debug('got surl=%s' % 
e.surl) - logger.debug('got turl=%s' % e.turl) - fileinfo[e.lfn] = {'guid': e.guid, 'fsize': e.filesize, - 'adler32': e.checksum.get('adler32'), - 'surl': e.turl} + for iofile in job.outdata + job.logdata: + if iofile.status in ['transferred']: + fileinfo[iofile.lfn] = {'guid': iofile.guid, + 'fsize': iofile.filesize, + 'adler32': iofile.checksum.get('adler32'), + 'surl': iofile.turl} job.fileinfo = fileinfo - logger.info('prepared job.fileinfo=%s' % job.fileinfo) # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS if not is_success: # set error code + message (a more precise error code might have been set already) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED) set_pilot_state(job=job, state="failed") - logger.warning('stage-out failed') # with error: %d, %s (setting job state to failed)' % - # logger.warning('stage-out failed with error: %d, %s (setting job state to failed)' % - # (job['pilotErrorCode'], job['pilotErrorDiag'])) - # send_state(job, args, 'failed') + logger.warning('stage-out failed') return False logger.info('stage-out finished correctly') if not job.state or (job.state and job.state == 'stageout'): # is the job state already set? if so, don't change the state (unless it's the stageout state) - logger.debug('changing job state from %s to finished' % job.state) + logger.debug('changing job state from %s to finished', job.state) set_pilot_state(job=job, state="finished") # send final server update since all transfers have finished correctly @@ -966,13 +974,10 @@ def queue_monitoring(queues, traces, args): # TODO: put in data_out queue instead? if not _stage_out_new(job, args): - logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs " - "queue" % job.jobid) - #queues.failed_data_out.put(job) + logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue", job.jobid) put_in_queue(job, queues.failed_data_out) else: - logger.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid) - #queues.failed_jobs.put(job) + logger.info("job %s failed during stage-in, adding job object to failed_jobs queue", job.jobid) put_in_queue(job, queues.failed_jobs) # monitor the finished_data_out queue @@ -1001,13 +1006,8 @@ def queue_monitoring(queues, traces, args): job.stageout = "log" set_pilot_state(job=job, state="failed") if not _stage_out_new(job, args): - logger.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, " - "adding job object to failed_jobs queue" % job.jobid) - else: - logger.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job " - "object to failed_jobs queue" % job.jobid) + logger.info("job %s failed during stage-out", job.jobid) - #queues.failed_jobs.put(job) put_in_queue(job, queues.failed_jobs) if abort: diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index e7987a3a3..31f4c3952 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -5,7 +5,9 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 + +# Note: leave this module for now - the code might be useful for reuse import time @@ -29,9 +31,6 @@ def run(args): :returns: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'receive': receive, 'send': send} 
threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 @@ -48,7 +47,7 @@ def run(args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. diff --git a/pilot/control/job.py b/pilot/control/job.py index 62ab72386..cf6b8394c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 from __future__ import print_function # Python 2 @@ -17,6 +17,7 @@ import hashlib import random import socket +import logging try: import Queue as queue # noqa: N813 @@ -25,6 +26,7 @@ from json import dumps #, loads from re import findall +from glob import glob from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ExcThread, PilotException #, JobAlreadyRunning @@ -32,7 +34,7 @@ from pilot.util import https from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, get_pilot_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - is_python3, show_memory_usage, has_instruction_sets + is_python3, show_memory_usage, has_instruction_sets, locate_core_file, get_display_info from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -46,17 +48,16 @@ publish_stageout_files from pilot.util.jobmetrics import get_job_metrics from pilot.util.math import mean +from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process +from pilot.util.processes import cleanup, threads_aborted, kill_process, kill_processes from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() @@ -70,9 +71,6 @@ def control(queues, traces, args): :return: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'validate': validate, 'retrieve': retrieve, 'create_data_payload': create_data_payload, 'queue_monitor': queue_monitor, 'job_monitor': job_monitor} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, @@ -90,7 +88,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception 
# .. @@ -140,8 +138,8 @@ def _validate_job(job): try: kwargs = {'job': job} job.usecontainer = container.do_use_container(**kwargs) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) return True if user.verify_job(job) else False @@ -159,14 +157,14 @@ def verify_error_code(job): """ if job.piloterrorcode == 0 and len(job.piloterrorcodes) > 0: - logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s' % str(job.piloterrorcodes)) + logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s', str(job.piloterrorcodes)) job.piloterrorcode = job.piloterrorcodes[0] if job.piloterrorcode != 0 and job.is_analysis(): if errors.is_recoverable(code=job.piloterrorcode): job.piloterrorcode = -abs(job.piloterrorcode) job.state = 'failed' - logger.info('failed user job is recoverable (error code=%s)' % job.piloterrorcode) + logger.info('failed user job is recoverable (error code=%s)', job.piloterrorcode) else: logger.info('failed user job is not recoverable') else: @@ -185,8 +183,6 @@ def get_proper_state(job, state): :return: valid server state (string). """ - logger.debug('state=%s' % state) - logger.debug('serverstate=%s' % job.serverstate) if job.serverstate == "finished" or job.serverstate == "failed": pass elif job.serverstate == "" and state != "finished" and state != "failed": @@ -195,7 +191,6 @@ def get_proper_state(job, state): job.serverstate = state else: job.serverstate = 'running' - logger.debug('serverstate=%s' % job.serverstate) return job.serverstate @@ -220,7 +215,7 @@ def publish_harvester_reports(state, args, data, job, final): # publish work report if not publish_work_report(data, path): - logger.debug('failed to write to workerAttributesFile %s' % path) + logger.debug('failed to write to workerAttributesFile %s', path) return False # check if we are in final state then write out information for output files @@ -228,9 +223,9 @@ def publish_harvester_reports(state, args, data, job, final): # Use the job information to write Harvester event_status.dump file event_status_file = get_event_status_file(args) if publish_stageout_files(job, event_status_file): - logger.debug('wrote log and output files to file %s' % event_status_file) + logger.debug('wrote log and output files to file %s', event_status_file) else: - logger.warning('could not write log and output files to file %s' % event_status_file) + logger.warning('could not write log and output files to file %s', event_status_file) return False # publish job report @@ -259,8 +254,8 @@ def write_heartbeat_to_file(data): path = os.path.join(os.environ.get('PILOT_HOME'), config.Pilot.heartbeat_message) if write_json(path, data): - logger.debug('heartbeat dictionary: %s' % data) - logger.debug('wrote heartbeat to file %s' % path) + logger.debug('heartbeat dictionary: %s', data) + logger.debug('wrote heartbeat to file %s', path) return True else: return False @@ -290,7 +285,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if state == 'finished' or state == 'failed' or state == 'holding': final = True os.environ['SERVER_UPDATE'] = SERVER_UPDATE_UPDATING - logger.info('job %s has %s - %s final server update' % (job.jobid, state, tag)) + logger.info('job %s has %s - %s final server update', job.jobid, state, tag) # make sure that job.state is 'failed' if there's a set error code if job.piloterrorcode or job.piloterrorcodes: @@ -302,7 +297,7 @@ def send_state(job, args, state, 
xml=None, metadata=None, test_tobekilled=False) verify_error_code(job) else: final = False - logger.info('job %s has state \'%s\' - %s heartbeat' % (job.jobid, state, tag)) + logger.info('job %s has state \'%s\' - %s heartbeat', job.jobid, state, tag) # build the data structure needed for getJob, updateJob data = get_data_structure(job, state, args, xml=xml, metadata=metadata) @@ -324,7 +319,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt = 0 done = False while attempt < max_attempts and not done: - logger.info('job update attempt %d/%d' % (attempt + 1, max_attempts)) + logger.info('job update attempt %d/%d', attempt + 1, max_attempts) # get the URL for the PanDA server from pilot options or from config pandaserver = get_panda_server(args.url, args.port) @@ -335,8 +330,8 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt += 1 time_after = int(time.time()) - logger.info('server updateJob request completed in %ds for job %s' % (time_after - time_before, job.jobid)) - logger.info("server responded with: res = %s" % str(res)) + logger.info('server updateJob request completed in %ds for job %s', time_after - time_before, job.jobid) + logger.info("server responded with: res = %s", str(res)) show_memory_usage() @@ -352,10 +347,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) logger.info('skipping job update for fake test job') return True - except Exception as e: - logger.warning('exception caught while sending https request: %s' % e) - logger.warning('possibly offending data: %s' % data) - pass + except Exception as error: + logger.warning('exception caught while sending https request: %s', error) + logger.warning('possibly offending data: %s', data) if final: os.environ['SERVER_UPDATE'] = SERVER_UPDATE_TROUBLE @@ -401,7 +395,7 @@ def get_job_status_from_server(job_id, url, port): # open connection ret = https.request('{pandaserver}/server/panda/getStatus'.format(pandaserver=pandaserver), data=data) response = ret[1] - logger.info("response: %s" % str(response)) + logger.info("response: %s", str(response)) if response: try: # decode the response @@ -411,21 +405,21 @@ def get_job_status_from_server(job_id, url, port): status = response['status'] # e.g. 'holding' attempt_nr = int(response['attemptNr']) # e.g. '0' status_code = int(response['StatusCode']) # e.g. 
'0' - except Exception as e: + except Exception as error: logger.warning( - "exception: dispatcher did not return allowed values: %s, %s" % (str(ret), e)) + "exception: dispatcher did not return allowed values: %s, %s", str(ret), error) status = "unknown" attempt_nr = -1 status_code = 20 else: - logger.debug('server job status=%s, attempt_nr=%d, status_code=%d' % (status, attempt_nr, status_code)) + logger.debug('server job status=%s, attempt_nr=%d, status_code=%d', status, attempt_nr, status_code) else: - logger.warning("dispatcher did not return allowed values: %s" % str(ret)) + logger.warning("dispatcher did not return allowed values: %s", str(ret)) status = "unknown" attempt_nr = -1 status_code = 20 - except Exception as e: - logger.warning("could not interpret job status from dispatcher: %s" % e) + except Exception as error: + logger.warning("could not interpret job status from dispatcher: %s", error) status = 'unknown' attempt_nr = -1 status_code = -1 @@ -472,11 +466,48 @@ def get_panda_server(url, port): if default in pandaserver: rnd = random.choice([socket.getfqdn(vv) for vv in set([v[-1][0] for v in socket.getaddrinfo(default, 25443, socket.AF_INET)])]) pandaserver = pandaserver.replace(default, rnd) - logger.debug('updated %s to %s' % (default, pandaserver)) + logger.debug('updated %s to %s', default, pandaserver) return pandaserver +def get_debug_command(cmd): + """ + Identify and filter the given debug command. + + Note: only a single command will be allowed from a predefined list: tail, ls, gdb, ps, du. + + :param cmd: raw debug command from job definition (string). + :return: debug_mode (Boolean, True if command is deemed ok), debug_command (string). + """ + + debug_mode = False + debug_command = "" + + allowed_commands = ['tail', 'ls', 'ps', 'gdb', 'du'] + forbidden_commands = ['rm'] + + # remove any 'debug,' command that the server might send redundantly + if ',' in cmd and 'debug' in cmd: + cmd = cmd.replace('debug,', '').replace(',debug', '') + try: + tmp = cmd.split(' ') + com = tmp[0] + except Exception as error: + logger.warning('failed to identify debug command: %s', error) + else: + if com not in allowed_commands: + logger.warning('command=%s is not in the list of allowed commands: %s', com, str(allowed_commands)) + elif ';' in cmd or ';' in cmd: + logger.warning('debug command cannot contain \';\': \'%s\'', cmd) + elif com in forbidden_commands: + logger.warning('command=%s is not allowed', com) + else: + debug_mode = True + debug_command = cmd + return debug_mode, debug_command + + def handle_backchannel_command(res, job, args, test_tobekilled=False): """ Does the server update contain any backchannel information? if so, update the job object. @@ -493,11 +524,16 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): res['command'] = 'tobekilled' if 'command' in res and res.get('command') != 'NULL': - # look for 'tobekilled', 'softkill', 'debug', 'debugoff' # warning: server might return comma-separated string, 'debug,tobekilled' - if 'tobekilled' in res.get('command'): - logger.info('pilot received a panda server signal to kill job %s at %s' % - (job.jobid, time_stamp())) + cmd = res.get('command') + # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. 
+ if ' ' in cmd and 'tobekilled' not in cmd: + try: + job.debug, job.debug_command = get_debug_command(cmd) + except Exception as error: + logger.debug('exception caught in get_debug_command(): %s', error) + elif 'tobekilled' in cmd: + logger.info('pilot received a panda server signal to kill job %s at %s', job.jobid, time_stamp()) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL) if job.pid: @@ -506,18 +542,30 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): else: logger.debug('no pid to kill') args.abort_job.set() - elif 'softkill' in res.get('command'): - logger.info('pilot received a panda server signal to softkill job %s at %s' % - (job.jobid, time_stamp())) + elif 'softkill' in cmd: + logger.info('pilot received a panda server signal to softkill job %s at %s', job.jobid, time_stamp()) # event service kill instruction - elif 'debug' in res.get('command'): - logger.info('pilot received a command to turn on debug mode from the server') + job.debug_command = 'softkill' + elif 'debug' in cmd: + logger.info('pilot received a command to turn on standard debug mode from the server') job.debug = True - elif 'debugoff' in res.get('command'): + job.debug_command = 'debug' + elif 'debugoff' in cmd: logger.info('pilot received a command to turn off debug mode from the server') job.debug = False + job.debug_command = 'debugoff' else: - logger.warning('received unknown server command via backchannel: %s' % res.get('command')) + logger.warning('received unknown server command via backchannel: %s', cmd) + + # for testing debug mode + # job.debug = True + # job.debug_command = 'du -sk' + # job.debug_command = 'tail -30 payload.stdout' + # job.debug_command = 'ls -ltr workDir' # not really tested + # job.debug_command = 'ls -ltr %s' % job.workdir + # job.debug_command = 'ps -ef' + # job.debug_command = 'ps axo pid,ppid,pgid,args' + # job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' def add_data_structure_ids(data, version_tag): @@ -585,9 +633,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: - stdout_tail = get_payload_log_tail(job) - if stdout_tail: - data['stdout'] = stdout_tail + data['stdout'] = process_debug_mode(job) # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': @@ -595,13 +641,13 @@ def get_data_structure(job, state, args, xml=None, metadata=None): #data['coreCount'] = mean(job.corecounts) if job.corecounts else job.corecount if job.corecounts: _mean = mean(job.corecounts) - logger.info('mean actualcorecount: %f' % _mean) + logger.info('mean actualcorecount: %f', _mean) data['meanCoreCount'] = _mean # get the number of events, should report in heartbeat in case of preempted. 
if job.nevents != 0: data['nEvents'] = job.nevents - logger.info("total number of processed events: %d (read)" % job.nevents) + logger.info("total number of processed events: %d (read)", job.nevents) else: logger.info("payload/TRF did not report the number of read events") @@ -613,11 +659,14 @@ def get_data_structure(job, state, args, xml=None, metadata=None): data['cpuConsumptionUnit'] = job.cpuconsumptionunit + "+" + get_cpu_model() instruction_sets = has_instruction_sets(['AVX2']) + product, vendor = get_display_info() if instruction_sets: if 'cpuConsumptionUnit' in data: data['cpuConsumptionUnit'] += '+' + instruction_sets else: data['cpuConsumptionUnit'] = instruction_sets + if product and vendor: + logger.debug('cpuConsumptionUnit: could have added: product=%s, vendor=%s', product, vendor) # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) @@ -628,6 +677,161 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data +def process_debug_mode(job): + """ + Handle debug mode - preprocess debug command, get the output and kill the payload in case of gdb. + + :param job: job object. + :return: stdout from debug command (string). + """ + + # for gdb commands, use the proper gdb version (the system one may be too old) + if job.debug_command.startswith('gdb '): + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user.preprocess_debug_command(job) + + stdout = get_debug_stdout(job) + if stdout: + # in case gdb was successfully used, the payload can now be killed + if job.debug_command.startswith('gdb ') and job.pid: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, + msg='payload was killed after gdb produced requested core file') + logger.debug('will proceed to kill payload processes') + kill_processes(job.pid) + + return stdout + + +def get_debug_stdout(job): + """ + Return the requested output from a given debug command. + + :param job: job object. + :return: output (string). + """ + + if job.debug_command == 'debug': + return get_payload_log_tail(job.workdir) + elif 'tail ' in job.debug_command: + return get_requested_log_tail(job.debug_command, job.workdir) + elif 'ls ' in job.debug_command: + return get_ls(job.debug_command, job.workdir) + elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: + return get_general_command_stdout(job) + else: + # general command, execute and return output + _, stdout, _ = execute(job.debug_command) + logger.info('debug_command: %s:\n\n%s\n', job.debug_command, stdout) + return stdout + + +def get_general_command_stdout(job): + """ + Return the output from the requested debug command. + + :param job: job object. + :return: output (string). + """ + + stdout = '' + + # for gdb, we might have to process the debug command (e.g. 
to identify the proper pid to debug) + if 'gdb ' in job.debug_command and '--pid %' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + job.debug_command = user.process_debug_command(job.debug_command, job.jobid) + + if job.debug_command: + _containerisation = False # set this with some logic instead - not used for now + if _containerisation: + try: + containerise_general_command(job, job.infosys.queuedata.container_options, + label='general', + container_type='container') + except PilotException as error: + logger.warning('general containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('general containerisation threw an exception: %s', error) + else: + _, stdout, stderr = execute(job.debug_command) + logger.debug("%s (stdout):\n\n%s\n\n", job.debug_command, stdout) + logger.debug("%s (stderr):\n\n%s\n\n", job.debug_command, stderr) + + # in case a core file was produced, locate it + path = locate_core_file(cmd=job.debug_command) if 'gdb ' in job.debug_command else '' + if path: + # copy it to the working directory (so it will be saved in the log) + try: + copy(path, job.workdir) + except Exception: + pass + + return stdout + + +def get_ls(debug_command, workdir): + """ + Return the requested ls debug command. + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). + """ + + items = debug_command.split(' ') + # cmd = items[0] + options = ' '.join(items[1:]) + path = options.split(' ')[-1] if ' ' in options else options + if path.startswith('-'): + path = '.' + finalpath = os.path.join(workdir, path) + debug_command = debug_command.replace(path, finalpath) + + _, stdout, _ = execute(debug_command) + logger.debug("%s:\n\n%s\n\n", debug_command, stdout) + + return stdout + + +def get_requested_log_tail(debug_command, workdir): + """ + Return the tail of the requested debug log. + + Examples + tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path + tail log.RAWtoALL <- pilot finds the requested log file + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). + """ + + _tail = "" + items = debug_command.split(' ') + cmd = items[0] + options = ' '.join(items[1:]) + logger.debug('debug command: %s', cmd) + logger.debug('debug options: %s', options) + + # assume that the path is the last of the options; + path = options.split(' ')[-1] if ' ' in options else options + fullpath = os.path.join(workdir, path) + + # find all files with the given pattern and pick the latest updated file (if several) + files = glob(fullpath) + if files: + logger.info('files found: %s', str(files)) + _tail = get_latest_log_tail(files) + else: + logger.warning('did not find \'%s\' in path %s', path, fullpath) + + if _tail: + logger.debug('tail =\n\n%s\n\n', _tail) + + return _tail + + def add_error_codes(data, job): """ Add error codes to data structure. 
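The tail-style debug commands documented above (e.g. 'tail workdir/tmp.stdout*') rely on globbing the requested pattern inside the job work directory and tailing the most recently modified match. The following is a self-contained sketch of that selection logic, assuming a plain 'tail' subprocess is acceptable; the real code uses the pilot's own tail() helper and keeps only the last 2048 characters before the text is sent in the heartbeat.

import glob
import os
import subprocess

def tail_of_latest(pattern, workdir='.', nlines=30):
    """Tail the most recently modified file matching pattern in workdir."""
    files = glob.glob(os.path.join(workdir, pattern))
    if not files:
        return ''
    latest = max(files, key=os.path.getmtime)  # newest file wins, as in get_latest_log_tail()
    result = subprocess.run(['tail', '-n', str(nlines), latest],
                            capture_output=True, text=True)
    return '%s\n%s' % (latest, result.stdout)

# e.g. tail_of_latest('tmp.stdout*', workdir='/path/to/PanDA_Pilot-1234567890')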
@@ -641,7 +845,7 @@ def add_error_codes(data, job): pilot_error_code = job.piloterrorcode pilot_error_codes = job.piloterrorcodes if pilot_error_codes != []: - logger.warning('pilotErrorCodes = %s (will report primary/first error code)' % str(pilot_error_codes)) + logger.warning('pilotErrorCodes = %s (will report primary/first error code)', str(pilot_error_codes)) data['pilotErrorCode'] = pilot_error_codes[0] else: data['pilotErrorCode'] = pilot_error_code @@ -650,7 +854,7 @@ def add_error_codes(data, job): pilot_error_diag = job.piloterrordiag pilot_error_diags = job.piloterrordiags if pilot_error_diags != []: - logger.warning('pilotErrorDiags = %s (will report primary/first error diag)' % str(pilot_error_diags)) + logger.warning('pilotErrorDiags = %s (will report primary/first error diag)', str(pilot_error_diags)) data['pilotErrorDiag'] = pilot_error_diags[0] else: data['pilotErrorDiag'] = pilot_error_diag @@ -675,7 +879,7 @@ def get_cpu_consumption_time(cpuconsumptiontime): except Exception: constime = None if constime and constime > 10 ** 9: - logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)" % constime) + logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)", constime) constime = -1 return constime @@ -704,7 +908,7 @@ def add_timing_and_extracts(data, job, state, args): user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 extracts = user.get_log_extracts(job, state) if extracts != "": - logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]' % extracts) + logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]', extracts) data['pilotLog'] = extracts[:1024] data['endTime'] = time.time() @@ -723,26 +927,10 @@ def add_memory_info(data, workdir, name=""): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: - #for key in job.utilities utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) - except Exception as e: - logger.info('memory information not available: %s' % e) - pass - - -#def get_list_of_log_files(): -# """ -# Return a list of log files produced by the payload. -# -# :return: list of log files. -# """ -# -# list_of_files = get_files() -# if not list_of_files: # some TRFs produce logs with different naming scheme -# list_of_files = get_files(pattern="log.*") -# -# return list_of_files + except Exception as error: + logger.info('memory information not available: %s', error) def remove_pilot_logs_from_list(list_of_files): @@ -753,6 +941,8 @@ def remove_pilot_logs_from_list(list_of_files): :return: list of files (list). 
""" + # note: better to move experiment specific files to user area + # ignore the pilot log files try: to_be_removed = [config.Pilot.pilotlog, config.Pilot.stageinlog, config.Pilot.stageoutlog, @@ -760,30 +950,27 @@ def remove_pilot_logs_from_list(list_of_files): config.Pilot.remotefileverification_log, config.Pilot.base_trace_report, config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, - 'eventLoopHeartBeat.txt'] - except Exception as e: - logger.warning('exception caught: %s' % e) + 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot'] + except Exception as error: + logger.warning('exception caught: %s', error) to_be_removed = [] new_list_of_files = [] for filename in list_of_files: - if os.path.basename(filename) not in to_be_removed and '/pilot/' not in filename: + if os.path.basename(filename) not in to_be_removed and '/pilot/' not in filename and 'prmon' not in filename: new_list_of_files.append(filename) - #logger.debug('list_of_files=%s' % str(new_list_of_files)) return new_list_of_files -def get_payload_log_tail(job): +def get_payload_log_tail(workdir): """ Return the tail of the payload stdout or its latest updated log file. - :param job: job object. + :param workdir: job work directory (string). :return: tail of stdout (string). """ - stdout_tail = "" - # find the latest updated log file # list_of_files = get_list_of_log_files() # find the latest updated text file @@ -791,29 +978,41 @@ def get_payload_log_tail(job): list_of_files = remove_pilot_logs_from_list(list_of_files) if not list_of_files: - logger.info('no log files were found (will use default %s)' % config.Payload.payloadstdout) - list_of_files = [os.path.join(job.workdir, config.Payload.payloadstdout)] + logger.info('no log files were found (will use default %s)', config.Payload.payloadstdout) + list_of_files = [os.path.join(workdir, config.Payload.payloadstdout)] + + return get_latest_log_tail(list_of_files) + + +def get_latest_log_tail(files): + """ + Get the tail of the latest updated file from the given file list. + + :param files: files (list). + """ + + stdout_tail = "" try: - latest_file = max(list_of_files, key=os.path.getmtime) - logger.info('tail of file %s will be added to heartbeat' % latest_file) + latest_file = max(files, key=os.path.getmtime) + logger.info('tail of file %s will be added to heartbeat', latest_file) # now get the tail of the found log file and protect against potentially large tails stdout_tail = latest_file + "\n" + tail(latest_file) stdout_tail = stdout_tail[-2048:] - except Exception as e: - logger.warning('failed to get payload stdout tail: %s' % e) + except Exception as error: + logger.warning('failed to get payload stdout tail: %s', error) return stdout_tail def validate(queues, traces, args): """ - (add description) + Perform validation of job. - :param queues: - :param traces: - :param args: + :param queues: queues object. + :param traces: traces object. + :param args: args object. 
:return: """ @@ -828,7 +1027,7 @@ def validate(queues, traces, args): # set the environmental variable for the task id os.environ['PanDA_TaskID'] = str(job.taskid) - logger.info('processing PanDA job %s from task %s' % (job.jobid, job.taskid)) + logger.info('processing PanDA job %s from task %s', job.jobid, job.taskid) if _validate_job(job): @@ -836,16 +1035,16 @@ def validate(queues, traces, args): os.setpgrp() job_dir = os.path.join(args.mainworkdir, 'PanDA_Pilot-%s' % job.jobid) - logger.debug('creating job working directory: %s' % job_dir) + logger.debug('creating job working directory: %s', job_dir) try: os.mkdir(job_dir) os.chmod(job_dir, 0o770) job.workdir = job_dir - except Exception as e: - logger.debug('cannot create working directory: %s' % str(e)) + except Exception as error: + logger.debug('cannot create working directory: %s', error) traces.pilot['error_code'] = errors.MKDIR job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(traces.pilot['error_code']) - job.piloterrordiag = e + job.piloterrordiag = error put_in_queue(job, queues.failed_jobs) break else: @@ -855,48 +1054,36 @@ def validate(queues, traces, args): # # stream the job object to file # job_dict = job.to_json() # write_json(os.path.join(job.workdir, 'job.json'), job_dict) -# except Exception as e: -# logger.debug('exception caught: %s' % e) +# except Exception as error: +# logger.debug('exception caught: %s', error) # else: # try: # _job_dict = read_json(os.path.join(job.workdir, 'job.json')) # job_dict = loads(_job_dict) # _job = JobData(job_dict, use_kmap=False) -# except Exception as e: -# logger.warning('exception caught: %s' % e) +# except Exception as error: +# logger.warning('exception caught: %s', error) create_symlink(from_path='../%s' % config.Pilot.pilotlog, to_path=os.path.join(job_dir, config.Pilot.pilotlog)) # pre-cleanup pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], - 0) # Python 2/3 + utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: utilities.precleanup() - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) # store the PanDA job id for the wrapper to pick up store_jobid(job.jobid, args.sourcedir) # run the delayed space check now - proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False - if proceed_with_local_space_check: - logger.debug('pilot will not perform delayed space check') - ec, diagnostics = check_local_space() - if ec != 0: - traces.pilot['error_code'] = errors.NOLOCALSPACE - # set the corresponding error code - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) - put_in_queue(job, queues.failed_jobs) - else: - put_in_queue(job, queues.validated_jobs) - else: - put_in_queue(job, queues.validated_jobs) + delayed_space_check(queues, traces, args, job) + # make sure that ctypes is available (needed at the end by orphan killer) + verify_ctypes(queues, job) else: - logger.debug('Failed to validate job=%s' % job.jobid) + logger.debug('Failed to validate job=%s', job.jobid) put_in_queue(job, queues.failed_jobs) # proceed to set the job_aborted flag? 
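For context, the per-job working directory handling exercised in validate() above follows a simple pattern: create a PanDA_Pilot-<jobid> directory under the pilot's main work directory, restrict its permissions, and symlink the pilot log into it so that it ends up in the job log tarball. The sketch below mirrors that flow; the log file name is an assumption (the real name comes from config.Pilot.pilotlog).

import os

def make_job_dir(mainworkdir, jobid, pilotlog='pilotlog.txt'):  # file name assumed
    job_dir = os.path.join(mainworkdir, 'PanDA_Pilot-%s' % jobid)
    os.mkdir(job_dir)
    os.chmod(job_dir, 0o770)
    # relative link, as in create_symlink(from_path='../<pilotlog>', to_path=...)
    os.symlink(os.path.join('..', pilotlog), os.path.join(job_dir, pilotlog))
    return job_dir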
@@ -909,6 +1096,62 @@ def validate(queues, traces, args): logger.debug('[job] validate thread has finished') +def verify_ctypes(queues, job): + """ + Verify ctypes and make sure all subprocess are parented. + + :param queues: queues object. + :param job: job object. + :return: + """ + + try: + import ctypes + except Exception as error: + diagnostics = 'ctypes python module could not be imported: %s' % error + logger.warning(diagnostics) + #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) + #logger.debug('Failed to validate job=%s', job.jobid) + #put_in_queue(job, queues.failed_jobs) + else: + logger.debug('ctypes python module imported') + + # make sure all children are parented by the pilot + # specifically, this will include any 'orphans', i.e. if the pilot kills all subprocesses at the end, + # 'orphans' will be included (orphans seem like the wrong name) + libc = ctypes.CDLL('libc.so.6') + pr_set_child_subreaper = 36 + libc.prctl(pr_set_child_subreaper, 1) + logger.debug('all child subprocesses will be parented') + + +def delayed_space_check(queues, traces, args, job): + """ + Run the delayed space check if necessary. + + :param queues: queues object. + :param traces: traces object. + :param args: args object. + :param job: job object. + :return: + """ + + proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False + if proceed_with_local_space_check: + logger.debug('pilot will now perform delayed space check') + exit_code, diagnostics = check_local_space() + if exit_code != 0: + traces.pilot['error_code'] = errors.NOLOCALSPACE + # set the corresponding error code + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) + logger.debug('Failed to validate job=%s', job.jobid) + put_in_queue(job, queues.failed_jobs) + else: + put_in_queue(job, queues.validated_jobs) + else: + put_in_queue(job, queues.validated_jobs) + + def create_k8_link(job_dir): """ Create a soft link to the payload workdir on Kubernetes if SHARED_DIR exists. 
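The verify_ctypes() helper above makes the pilot a 'child subreaper', which is what allows it to find and kill orphaned payload processes at the end of the job: once PR_SET_CHILD_SUBREAPER is set, any descendant whose parent exits is re-parented to the pilot instead of to init. A minimal sketch of the same call follows, with basic error reporting added for illustration (the pilot itself only logs that the option was set):

import ctypes

PR_SET_CHILD_SUBREAPER = 36  # from <sys/prctl.h>, available on Linux >= 3.4

libc = ctypes.CDLL('libc.so.6', use_errno=True)
if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
    # older kernels reject the option; treat it as non-fatal, as the pilot does
    print('prctl(PR_SET_CHILD_SUBREAPER) failed, errno=%d' % ctypes.get_errno())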
@@ -937,10 +1180,10 @@ def store_jobid(jobid, init_dir): path = os.path.join(os.path.join(init_dir, 'pilot2'), config.Pilot.jobid_file) path = path.replace('pilot2/pilot2', 'pilot2') # dirty fix for bad paths mode = 'a' if os.path.exists(path) else 'w' - logger.debug('path=%s mode=%s' % (path, mode)) + logger.debug('path=%s mode=%s', path, mode) write_file(path, "%s\n" % str(jobid), mode=mode, mute=False) - except Exception as e: - logger.warning('exception caught while trying to store job id: %s' % e) + except Exception as error: + logger.warning('exception caught while trying to store job id: %s', error) def create_data_payload(queues, traces, args): @@ -1091,7 +1334,7 @@ def get_dispatcher_dictionary(args): taskid = get_task_id() if taskid != "" and args.allow_same_user: data['taskID'] = taskid - logger.info("will download a new job belonging to task id: %s" % (data['taskID'])) + logger.info("will download a new job belonging to task id: %s", data['taskID']) if args.resource_type != "": data['resourceType'] = args.resource_type @@ -1152,8 +1395,8 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # pilot can report the error with a server update) proceed_with_local_space_check = False if (submitmode.lower() == 'push' and update_server) else True if proceed_with_local_space_check: - ec, diagnostics = check_local_space() - if ec != 0: + exit_code, diagnostics = check_local_space() + if exit_code != 0: traces.pilot['error_code'] = errors.NOLOCALSPACE return False else: @@ -1161,7 +1404,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge maximum_getjob_requests = 60 if harvester else max_getjob_requests # 1 s apart (if harvester) if getjob_requests > int(maximum_getjob_requests): - logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot' % maximum_getjob_requests) + logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot', maximum_getjob_requests) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ -1175,7 +1418,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge return False if (currenttime - starttime > timefloor) and jobnumber > 0: - logger.warning("the pilot has run out of time (timefloor=%d has been passed)" % timefloor) + logger.warning("the pilot has run out of time (timefloor=%d has been passed)", timefloor) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ -1183,8 +1426,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # timefloor not relevant for the first job if jobnumber > 0: - logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job' % - (timefloor, currenttime - starttime)) + logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job', timefloor, currenttime - starttime) if harvester and jobnumber > 0: # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job @@ -1214,7 +1456,7 @@ def getjob_server_command(url, port): if not findall(port_pattern, url): url = url + ':%s' % port else: - logger.debug('URL already contains port: %s' % url) + logger.debug('URL already contains port: %s', url) else: url = config.Pilot.pandaserver if url == "": @@ -1242,7 +1484,7 @@ def get_job_definition_from_file(path, harvester): if is_json(path): job_definition_list = 
parse_job_definition_file(path) if not job_definition_list: - logger.warning('no jobs were found in Harvester job definitions file: %s' % path) + logger.warning('no jobs were found in Harvester job definitions file: %s', path) return {} else: # remove the job definition file from the original location, place a renamed copy in the pilot dir @@ -1258,11 +1500,11 @@ def get_job_definition_from_file(path, harvester): with open(path, 'r') as jobdatafile: response = jobdatafile.read() if len(response) == 0: - logger.fatal('encountered empty job definition file: %s' % path) + logger.fatal('encountered empty job definition file: %s', path) res = None # this is a fatal error, no point in continuing as the file will not be replaced else: # parse response message - # logger.debug('%s:\n\n%s\n\n' % (path, response)) + # logger.debug('%s:\n\n%s\n\n', path, response) try: from urlparse import parse_qsl # Python 2 except Exception: @@ -1270,8 +1512,8 @@ def get_job_definition_from_file(path, harvester): datalist = parse_qsl(response, keep_blank_values=True) # convert to dictionary - for d in datalist: - res[d[0]] = d[1] + for data in datalist: + res[data[0]] = data[1] if os.path.exists(path): remove(path) @@ -1294,7 +1536,7 @@ def get_job_definition_from_server(args): cmd = getjob_server_command(args.url, args.port) if cmd != "": - logger.info('executing server command: %s' % cmd) + logger.info('executing server command: %s', cmd) res = https.request(cmd, data=data) return res @@ -1345,7 +1587,7 @@ def get_job_definition(args): logger.info('will use a fake PanDA job') res = get_fake_job() elif os.path.exists(path): - logger.info('will read job definition from file %s' % path) + logger.info('will read job definition from file %s', path) res = get_job_definition_from_file(path, args.harvester) else: if args.harvester and args.harvester_submitmode.lower() == 'push': @@ -1471,11 +1713,11 @@ def get_fake_job(input=True): 'destinationDblock': job_name, 'dispatchDBlockToken': 'NULL', 'jobPars': '-a sources.20115461.derivation.tgz -r ./ -j "Reco_tf.py ' - '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' - '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' - ' --useLocalIO --accessmode=copy -o ' - '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' - '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), + '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' + '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' + ' --useLocalIO --accessmode=copy -o ' + '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' + '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), 'attemptNr': '0', 'swRelease': 'Atlas-20.7.6', 'nucleus': 'NULL', @@ -1493,7 +1735,7 @@ def get_fake_job(input=True): 'taskID': 'NULL', 'logFile': '%s.job.log.tgz' % job_name} else: - logger.warning('unknown test job type: %s' % config.Pilot.testjobtype) + logger.warning('unknown test job type: %s', config.Pilot.testjobtype) if res: if not input: @@ -1507,7 +1749,7 @@ def get_fake_job(input=True): if config.Pilot.testtransfertype == "NULL" or config.Pilot.testtransfertype == 'direct': res['transferType'] = config.Pilot.testtransfertype else: - logger.warning('unknown test transfer type: %s (ignored)' % config.Pilot.testtransfertype) + logger.warning('unknown test transfer type: %s (ignored)', config.Pilot.testtransfertype) if config.Pilot.testjobcommand == 'sleep': 
res['transformation'] = 'sleep' @@ -1586,7 +1828,7 @@ def retrieve(queues, traces, args): # noqa: C901 # get a job definition from a source (file or server) res = get_job_definition(args) - logger.info('job definition = %s' % str(res)) + logger.info('job definition = %s', str(res)) if res is None: logger.fatal('fatal error in job download loop - cannot continue') @@ -1599,8 +1841,8 @@ def retrieve(queues, traces, args): # noqa: C901 if not res: delay = get_job_retrieval_delay(args.harvester) if not args.harvester: - logger.warning('did not get a job -- sleep %d s and repeat' % delay) - for i in range(delay): + logger.warning('did not get a job -- sleep %d s and repeat', delay) + for _ in range(delay): if args.graceful_stop.is_set(): break time.sleep(1) @@ -1608,7 +1850,7 @@ def retrieve(queues, traces, args): # noqa: C901 # it seems the PanDA server returns StatusCode as an int, but the aCT returns it as a string # note: StatusCode keyword is not available in job definition files from Harvester (not needed) if 'StatusCode' in res and res['StatusCode'] != '0' and res['StatusCode'] != 0: - logger.warning('did not get a job -- sleep 60s and repeat -- status: %s' % res['StatusCode']) + logger.warning('did not get a job -- sleep 60s and repeat -- status: %s', res['StatusCode']) for i in range(60): if args.graceful_stop.is_set(): break @@ -1624,11 +1866,11 @@ def retrieve(queues, traces, args): # noqa: C901 #try: # job_status, job_attempt_nr, job_status_code = get_job_status_from_server(job.jobid, args.url, args.port) # if job_status == "running": - # pilot_error_diag = "job %s is already running elsewhere - aborting" % (job.jobid) + # pilot_error_diag = "job %s is already running elsewhere - aborting" % job.jobid # logger.warning(pilot_error_diag) # raise JobAlreadyRunning(pilot_error_diag) - #except Exception as e: - # logger.warning("%s" % e) + #except Exception as error: + # logger.warning("%s", error) # write time stamps to pilot timing file # note: PILOT_POST_GETJOB corresponds to START_TIME in Pilot 1 add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) @@ -1652,7 +1894,7 @@ def retrieve(queues, traces, args): # noqa: C901 logging.info('pilot has finished for previous job - re-establishing logging') logging.handlers = [] logging.shutdown() - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) pilot_version_banner() getjob_requests = 0 add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) @@ -1701,8 +1943,8 @@ def create_job(dispatcher_response, queue): #job.workdir = os.getcwd() - logger.info('received job: %s (sleep until the job has finished)' % job.jobid) - logger.info('job details: \n%s' % job) + logger.info('received job: %s (sleep until the job has finished)', job.jobid) + logger.info('job details: \n%s', job) # payload environment wants the PANDAID to be set, also used below os.environ['PANDAID'] = job.jobid @@ -1728,13 +1970,13 @@ def has_job_completed(queues, args): else: make_job_report(job) cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') - logger.debug('%s:\n' % cmd) - ec, stdout, stderr = execute(cmd) + logger.debug('%s:\n', cmd) + _, stdout, _ = execute(cmd) logger.debug(stdout) queue_report(queues) job.reset_errors() - logger.info("job %s has completed (purged errors)" % job.jobid) + logger.info("job %s has completed (purged errors)", job.jobid) # cleanup of any remaining processes if job.pid: @@ -1747,14 +1989,14 @@ def has_job_completed(queues, args): #finished_queue_snapshot = 
list(queues.finished_jobs.queue) #peek = [obj for obj in finished_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (finished)" % jobid) + # logger.info("job %s has completed (finished)", jobid) # return True # is there anything in the failed_jobs queue? #failed_queue_snapshot = list(queues.failed_jobs.queue) #peek = [obj for obj in failed_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (failed)" % jobid) + # logger.info("job %s has completed (failed)", jobid) # return True return False @@ -1781,31 +2023,31 @@ def get_job_from_queue(queues, state): else: # make sure that state=failed set_pilot_state(job=job, state=state) - logger.info("job %s has state=%s" % (job.jobid, job.state)) + logger.info("job %s has state=%s", job.jobid, job.state) return job -def is_queue_empty(queues, q): +def is_queue_empty(queues, queue): """ Check if the given queue is empty (without pulling). :param queues: pilot queues object. - :param q: queue name (string). + :param queue: queue name (string). :return: True if queue is empty, False otherwise """ status = False - if q in queues._fields: - _q = getattr(queues, q) - jobs = list(_q.queue) + if queue in queues._fields: + _queue = getattr(queues, queue) + jobs = list(_queue.queue) if len(jobs) > 0: - logger.info('queue %s not empty: found %d job(s)' % (q, len(jobs))) + logger.info('queue %s not empty: found %d job(s)', queue, len(jobs)) else: - logger.info('queue %s is empty' % q) + logger.info('queue %s is empty', queue) status = True else: - logger.warning('queue %s not present in %s' % (q, queues._fields)) + logger.warning('queue %s not present in %s', queue, queues._fields) return status @@ -1832,7 +2074,7 @@ def order_log_transfer(queues, job): while n < nmax: # refresh the log_transfer since it might have changed log_transfer = job.get_status('LOG_TRANSFER') - logger.info('waiting for log transfer to finish (#%d/#%d): %s' % (n + 1, nmax, log_transfer)) + logger.info('waiting for log transfer to finish (#%d/#%d): %s', n + 1, nmax, log_transfer) if is_queue_empty(queues, 'data_out') and \ (log_transfer == LOG_TRANSFER_DONE or log_transfer == LOG_TRANSFER_FAILED): # set in data component logger.info('stage-out of log has completed') @@ -1843,7 +2085,7 @@ def order_log_transfer(queues, job): time.sleep(2) n += 1 - logger.info('proceeding with server update (n=%d)' % n) + logger.info('proceeding with server update (n=%d)', n) def wait_for_aborted_job_stageout(args, queues, job): @@ -1861,9 +2103,9 @@ def wait_for_aborted_job_stageout(args, queues, job): time_since_kill = get_time_since('1', PILOT_KILL_SIGNAL, args) was_killed = was_pilot_killed(args.timing) if was_killed: - logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished' % time_since_kill) - except Exception as e: - logger.warning('exception caught: %s' % e) + logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished', time_since_kill) + except Exception as error: + logger.warning('exception caught: %s', error) time_since_kill = 60 else: if time_since_kill > 60 or time_since_kill < 0: # fail-safe @@ -1873,7 +2115,7 @@ def wait_for_aborted_job_stageout(args, queues, job): # if stage-out has not finished, we need to wait (less than two minutes or the batch system will issue # a hard SIGKILL) max_wait_time = 2 * 60 - time_since_kill - 5 - logger.debug('using max_wait_time = %d s' % max_wait_time) + logger.debug('using max_wait_time = %d s', 
max_wait_time) t0 = time.time() while time.time() - t0 < max_wait_time: if job in queues.finished_data_out.queue or job in queues.failed_data_out.queue: @@ -1940,14 +2182,14 @@ def queue_monitor(queues, traces, args): # noqa: C901 while i < imax and os.environ.get('PILOT_WRAP_UP', '') == 'NORMAL': job = get_finished_or_failed_job(args, queues) if job: - logger.debug('returned job has state=%s' % job.state) + logger.debug('returned job has state=%s', job.state) #if job.state == 'failed': # logger.warning('will abort failed job (should prepare for final server update)') break i += 1 state = get_pilot_state() # the job object is not available, but the state is also kept in PILOT_JOB_STATE if state != 'stage-out': - # logger.info("no need to wait since job state=\'%s\'" % state) + # logger.info("no need to wait since job state=\'%s\'", state) break pause_queue_monitor(1) if not abort_thread else pause_queue_monitor(10) @@ -1957,7 +2199,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 completed_jobids = queues.completed_jobids.queue if queues.completed_jobids else [] if job and job.jobid not in completed_jobids: - logger.info("preparing for final server update for job %s in state=\'%s\'" % (job.jobid, job.state)) + logger.info("preparing for final server update for job %s in state=\'%s\'", job.jobid, job.state) if args.job_aborted.is_set(): # wait for stage-out to finish for aborted job @@ -1974,7 +2216,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 logger.warning('failed to dequeue job: queue is empty (did job fail before job monitor started?)') make_job_report(job) else: - logger.debug('job %s was dequeued from the monitored payloads queue' % _job.jobid) + logger.debug('job %s was dequeued from the monitored payloads queue', _job.jobid) # now ready for the next job (or quit) put_in_queue(job.jobid, queues.completed_jobids) @@ -2010,8 +2252,8 @@ def update_server(job, args): metadata = user.get_metadata(job.workdir) try: user.update_server(job) - except Exception as e: - logger.warning('exception caught in update_server(): %s' % e) + except Exception as error: + logger.warning('exception caught in update_server(): %s', error) if job.fileinfo: send_state(job, args, job.state, xml=dumps(job.fileinfo), metadata=metadata) else: @@ -2026,7 +2268,7 @@ def pause_queue_monitor(delay): :return: """ - logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s' % delay) + logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s', delay) time.sleep(delay) @@ -2083,8 +2325,8 @@ def get_heartbeat_period(debug=False): try: return int(config.Pilot.heartbeat if not debug else config.Pilot.debug_heartbeat) - except Exception as e: - logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)' % e) + except Exception as error: + logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)', error) return 1800 @@ -2098,7 +2340,7 @@ def check_for_abort_job(args, caller=''): """ abort_job = False if args.abort_job.is_set(): - logger.warning('%s detected an abort_job request (signal=%s)' % (caller, args.signal)) + logger.warning('%s detected an abort_job request (signal=%s)', caller, args.signal) logger.warning('in case pilot is running more than one job, all jobs will be aborted') abort_job = True @@ -2130,9 +2372,8 @@ def interceptor(queues, traces, args): # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = 
queues.monitored_payloads.queue if jobs: - for i in range(len(jobs)): - - logger.info('interceptor loop %d: looking for communication file' % n) + for _ in range(len(jobs)): + logger.info('interceptor loop %d: looking for communication file', n) time.sleep(30) n += 1 @@ -2197,10 +2438,15 @@ def job_monitor(queues, traces, args): # noqa: C901 update_time = send_heartbeat_if_time(jobs[i], args, update_time) # note: when sending a state change to the server, the server might respond with 'tobekilled' - if jobs[i].state == 'failed': - logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') - jobs[i].stageout = 'log' # only stage-out log file - put_in_queue(jobs[i], queues.data_out) + try: + jobs[i] + except Exception as error: + logger.warning('detected stale jobs[i] object in job_monitor: %s', error) + else: + if jobs[i].state == 'failed': + logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') + jobs[i].stageout = 'log' # only stage-out log file + put_in_queue(jobs[i], queues.data_out) # sleep for a while if stage-in has not completed time.sleep(1) @@ -2219,9 +2465,9 @@ def job_monitor(queues, traces, args): # noqa: C901 peeking_time = int(time.time()) for i in range(len(jobs)): current_id = jobs[i].jobid - logger.info('monitor loop #%d: job %d:%s is in state \'%s\'' % (n, i, current_id, jobs[i].state)) + logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': - logger.info('will abort job monitoring soon since job state=%s (job is still in queue)' % jobs[i].state) + logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) break # perform the monitoring tasks @@ -2237,8 +2483,8 @@ def job_monitor(queues, traces, args): # noqa: C901 else: try: fail_monitored_job(jobs[i], exit_code, diagnostics, queues, traces) - except Exception as e: - logger.warning('(1) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(1) exception caught: %s (job id=%s)', error, current_id) break # run this check again in case job_monitor_tasks() takes a long time to finish (and the job object @@ -2246,15 +2492,15 @@ def job_monitor(queues, traces, args): # noqa: C901 try: _job = jobs[i] except Exception: - logger.info('aborting job monitoring since job object (job id=%s) has expired' % current_id) + logger.info('aborting job monitoring since job object (job id=%s) has expired', current_id) break # send heartbeat if it is time (note that the heartbeat function might update the job object, e.g. 
# by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(_job, args, update_time) - except Exception as e: - logger.warning('(2) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(2) exception caught: %s (job id=%s)', error, current_id) break else: # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -2349,7 +2595,7 @@ def fail_monitored_job(job, exit_code, diagnostics, queues, traces): job.piloterrordiag = diagnostics traces.pilot['error_code'] = exit_code put_in_queue(job, queues.failed_payloads) - logger.info('aborting job monitoring since job state=%s' % job.state) + logger.info('aborting job monitoring since job state=%s', job.state) def make_job_report(job): @@ -2364,37 +2610,37 @@ def make_job_report(job): logger.info('') logger.info('job summary report') logger.info('--------------------------------------------------') - logger.info('PanDA job id: %s' % job.jobid) - logger.info('task id: %s' % job.taskid) + logger.info('PanDA job id: %s', job.jobid) + logger.info('task id: %s', job.taskid) n = len(job.piloterrorcodes) if n > 0: for i in range(n): - logger.info('error %d/%d: %s: %s' % (i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i])) + logger.info('error %d/%d: %s: %s', i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i]) else: logger.info('errors: (none)') if job.piloterrorcode != 0: - logger.info('pilot error code: %d' % job.piloterrorcode) - logger.info('pilot error diag: %s' % job.piloterrordiag) + logger.info('pilot error code: %d', job.piloterrorcode) + logger.info('pilot error diag: %s', job.piloterrordiag) info = "" for key in job.status: info += key + " = " + job.status[key] + " " - logger.info('status: %s' % info) + logger.info('status: %s', info) s = "" if job.is_analysis() and job.state != 'finished': s = '(user job is recoverable)' if errors.is_recoverable(code=job.piloterrorcode) else '(user job is not recoverable)' - logger.info('pilot state: %s %s' % (job.state, s)) - logger.info('transexitcode: %d' % job.transexitcode) - logger.info('exeerrorcode: %d' % job.exeerrorcode) - logger.info('exeerrordiag: %s' % job.exeerrordiag) - logger.info('exitcode: %d' % job.exitcode) - logger.info('exitmsg: %s' % job.exitmsg) - logger.info('cpuconsumptiontime: %d %s' % (job.cpuconsumptiontime, job.cpuconsumptionunit)) - logger.info('nevents: %d' % job.nevents) - logger.info('neventsw: %d' % job.neventsw) - logger.info('pid: %s' % job.pid) - logger.info('pgrp: %s' % str(job.pgrp)) - logger.info('corecount: %d' % job.corecount) - logger.info('event service: %s' % str(job.is_eventservice)) - logger.info('sizes: %s' % str(job.sizes)) + logger.info('pilot state: %s %s', job.state, s) + logger.info('transexitcode: %d', job.transexitcode) + logger.info('exeerrorcode: %d', job.exeerrorcode) + logger.info('exeerrordiag: %s', job.exeerrordiag) + logger.info('exitcode: %d', job.exitcode) + logger.info('exitmsg: %s', job.exitmsg) + logger.info('cpuconsumptiontime: %d %s', job.cpuconsumptiontime, job.cpuconsumptionunit) + logger.info('nevents: %d', job.nevents) + logger.info('neventsw: %d', job.neventsw) + logger.info('pid: %s', job.pid) + logger.info('pgrp: %s', str(job.pgrp)) + logger.info('corecount: %d', job.corecount) + logger.info('event service: %s', str(job.is_eventservice)) + logger.info('sizes: %s', str(job.sizes)) logger.info('--------------------------------------------------') logger.info('') diff 
--git a/pilot/control/monitor.py b/pilot/control/monitor.py index 8770f82b2..99ba180b6 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -6,7 +6,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. @@ -41,22 +41,22 @@ def control(queues, traces, args): :return: """ - t0 = time.time() - traces.pilot['lifetime_start'] = t0 # ie referring to when pilot monitoring began - traces.pilot['lifetime_max'] = t0 + t_0 = time.time() + traces.pilot['lifetime_start'] = t_0 # ie referring to when pilot monitoring began + traces.pilot['lifetime_max'] = t_0 threadchecktime = int(config.Pilot.thread_check) # for CPU usage debugging cpuchecktime = int(config.Pilot.cpu_check) - tcpu = t0 + tcpu = t_0 queuedata = get_queuedata_from_job(queues) max_running_time = get_max_running_time(args.lifetime, queuedata) try: # overall loop counter (ignoring the fact that more than one job may be running) - n = 0 + niter = 0 while not args.graceful_stop.is_set(): # every seconds, run the monitoring checks @@ -74,8 +74,7 @@ def control(queues, traces, args): time_since_start = get_time_since_start(args) grace_time = 10 * 60 if time_since_start - grace_time > max_running_time: - logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' % - (max_running_time, grace_time)) + logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot', max_running_time, grace_time) logger.info('setting REACHED_MAXTIME and graceful stop') environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME' # TODO: use singleton instead # do not set graceful stop if pilot has not finished sending the final job update @@ -84,8 +83,8 @@ def control(queues, traces, args): args.graceful_stop.set() break else: - if n % 60 == 0: - logger.info('%d s have passed since pilot start' % time_since_start) + if niter % 60 == 0: + logger.info('%d s have passed since pilot start', time_since_start) time.sleep(1) # time to check the CPU? 
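The hunks above (and most of this patch) convert eager '%'-style message building into the logging module's deferred formatting, where the format string and its arguments are passed separately and only interpolated if a handler actually emits the record. A minimal stand-alone sketch of that pattern follows; it is illustrative only, and the variable names and values are not taken from the pilot code:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

time_since_start = 7200  # illustrative value
niter = 120              # illustrative value

# eager formatting: the message string is always built, even when the
# DEBUG level is disabled and the record is dropped
logger.debug('%d s have passed since pilot start' % time_since_start)

# deferred formatting: the arguments are interpolated by the logging
# machinery only when the record is actually handled
logger.info('%d s have passed since pilot start', time_since_start)
logger.info('monitor loop %d: %d s have passed', niter, time_since_start)
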
@@ -93,12 +92,12 @@ def control(queues, traces, args): processes = get_process_info('python pilot2/pilot.py', pid=getpid()) if processes: logger.info('-' * 100) - logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s' % (getpid(), processes[0], processes[1], processes[2])) - n = processes[3] - if n > 1: - logger.info('there are %d such processes running' % n) + logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s', getpid(), processes[0], processes[1], processes[2]) + nproc = processes[3] + if nproc > 1: + logger.info('there are %d such processes running', nproc) else: - logger.info('there is %d such process running' % n) + logger.info('there is %d such process running', nproc) logger.info('-' * 100) tcpu = time.time() @@ -109,22 +108,21 @@ def control(queues, traces, args): if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0: # get all threads for thread in threading.enumerate(): - # logger.info('thread name: %s' % thread.name) + # logger.info('thread name: %s', thread.name) if not thread.is_alive(): - logger.fatal('thread \'%s\' is not alive' % thread.name) + logger.fatal('thread \'%s\' is not alive', thread.name) # args.graceful_stop.set() - n += 1 + niter += 1 - except Exception as e: - print(("monitor: exception caught: %s" % e)) - raise PilotException(e) + except Exception as error: + print(("monitor: exception caught: %s" % error)) + raise PilotException(error) logger.info('[monitor] control thread has ended') #def log_lifetime(sig, frame, traces): -# logger.info('lifetime: %i used, %i maximum' % (int(time.time() - traces.pilot['lifetime_start']), -# traces.pilot['lifetime_max'])) +# logger.info('lifetime: %i used, %i maximum', int(time.time() - traces.pilot['lifetime_start']), traces.pilot['lifetime_max']) def get_process_info(cmd, user=None, args='aufx', pid=None): @@ -151,14 +149,14 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): """ processes = [] - n = 0 + num = 0 if not user: user = getuid() pattern = re.compile(r"\S+|[-+]?\d*\.\d+|\d+") arguments = ['ps', '-u', user, args, '--no-headers'] process = Popen(arguments, stdout=PIPE, stderr=PIPE) - stdout, notused = process.communicate() + stdout, _ = process.communicate() for line in stdout.splitlines(): found = re.findall(pattern, line) if found is not None: @@ -167,12 +165,12 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): mem = found[3] command = ' '.join(found[10:]) if cmd in command: - n += 1 + num += 1 if processid == str(pid): processes = [cpu, mem, command] if processes: - processes.append(n) + processes.append(num) return processes @@ -194,9 +192,9 @@ def run_checks(queues, args): t_max = 2 * 60 logger.warning('pilot monitor received instruction that abort_job has been requested') - logger.warning('will wait for a maximum of %d seconds for threads to finish' % t_max) - t0 = time.time() - while time.time() - t0 < t_max: + logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) + t_0 = time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() @@ -210,10 +208,10 @@ def run_checks(queues, args): args.graceful_stop.set() if not args.job_aborted.is_set(): - logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect' % t_max) + logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) t_max = 10 - t0 = time.time() - while time.time() - t0 < t_max: + t_0 = 
time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() @@ -241,20 +239,20 @@ def get_max_running_time(lifetime, queuedata): # use the schedconfig value if set, otherwise use the pilot option lifetime value if not queuedata: logger.warning('queuedata could not be extracted from queues, will use default for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if queuedata.maxtime: try: max_running_time = int(queuedata.maxtime) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) logger.warning('failed to convert maxtime from queuedata, will use default value for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if max_running_time == 0: max_running_time = lifetime # fallback to default value - logger.info('will use default value for max running time: %d s' % max_running_time) + logger.info('will use default value for max running time: %d s', max_running_time) else: - logger.info('will use queuedata.maxtime value for max running time: %d s' % max_running_time) + logger.info('will use queuedata.maxtime value for max running time: %d s', max_running_time) return max_running_time diff --git a/pilot/control/payload.py b/pilot/control/payload.py index f6bef60eb..c5641c70d 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -8,7 +8,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2017-2018 import os @@ -22,7 +22,7 @@ from pilot.control.payloads import generic, eventservice, eventservicemerge from pilot.control.job import send_state -from pilot.util.auxiliary import set_pilot_state, show_memory_usage +from pilot.util.auxiliary import set_pilot_state from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config from pilot.util.filehandling import read_file, remove_core_dumps, get_guid @@ -64,7 +64,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. 
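The payload control() hunk above drains per-thread exception "buckets" so that failures raised inside worker threads surface in the main control loop instead of being lost. A minimal stand-alone sketch of that pattern, assuming a plain queue.Queue as the bucket and a hypothetical worker function (the real pilot threads manage their own buckets):

import queue
import sys
import threading

def worker(bucket):
    # push the exception info into the shared bucket instead of letting
    # the thread die silently
    try:
        raise ValueError('simulated payload thread failure')
    except Exception:
        bucket.put(sys.exc_info())

bucket = queue.Queue()
thread = threading.Thread(target=worker, args=(bucket,), name='payload-worker')
thread.start()
thread.join()

# the control thread periodically checks each bucket, as in control() above
try:
    exc = bucket.get(block=False)
except queue.Empty:
    pass
else:
    exc_type, exc_obj, exc_trace = exc
    print("thread '%s' received an exception from bucket: %s" % (thread.name, exc_obj))
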
@@ -146,8 +146,8 @@ def _validate_payload(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: status = user.validate(job) - except Exception as e: - logger.fatal('failed to execute user validate() function: %s' % e) + except Exception as error: + logger.fatal('failed to execute user validate() function: %s', error) status = False return status @@ -203,7 +203,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid] if len(peek) == 0: put_in_queue(job, queues.validated_payloads) - for i in range(10): # Python 3 + for _ in range(10): # Python 3 if args.graceful_stop.is_set(): break time.sleep(1) @@ -213,13 +213,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 #queues.monitored_payloads.put(job) put_in_queue(job, queues.monitored_payloads) - logger.info('job %s added to monitored payloads queue' % job.jobid) + logger.info('job %s added to monitored payloads queue', job.jobid) try: out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb') err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb') - except Exception as e: - logger.warning('failed to open payload stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open payload stdout/err: %s', error) out = None err = None send_state(job, args, 'starting') @@ -230,9 +230,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 break payload_executor = get_payload_executor(args, job, out, err, traces) - logger.info("Got payload executor: %s" % payload_executor) - - show_memory_usage() + logger.info("will use payload executor: %s", payload_executor) # run the payload and measure the execution time job.t0 = os.times() @@ -252,13 +250,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 0) # Python 2/3 try: user.update_output_for_hpo(job) - except Exception as e: - logger.warning('exception caught by update_output_for_hpo(): %s' % e) + except Exception as error: + logger.warning('exception caught by update_output_for_hpo(): %s', error) else: for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning('guid not set: generated guid=%s for lfn=%s', dat.guid, dat.lfn) #if traces.pilot['nr_jobs'] == 1: # logger.debug('faking job failure in first multi-job') @@ -275,8 +273,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: exit_code_interpret = user.interpret(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) #exit_code_interpret = -1 job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.INTERNALPILOTPROBLEM) @@ -298,8 +296,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 except queue.Empty: continue - except Exception as e: - logger.fatal('execute payloads caught an exception (cannot recover): %s, %s' % (e, traceback.format_exc())) + except Exception as error: + logger.fatal('execute payloads caught an exception (cannot recover): %s, %s', error, traceback.format_exc()) if job: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONEXCEPTION) #queues.failed_payloads.put(job) @@ -329,8 +327,7 @@ def set_cpu_consumption_time(job): job.cpuconsumptiontime 
= int(round(cpuconsumptiontime)) job.cpuconsumptionunit = "s" job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time: %f %s (rounded to %d %s)' % - (cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit)) + logger.info('CPU consumption time: %f %s (rounded to %d %s)', cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit) def perform_initial_payload_error_analysis(job, exit_code): @@ -343,40 +340,51 @@ def perform_initial_payload_error_analysis(job, exit_code): :return: """ + # look for singularity errors (the exit code can be zero in this case) + stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) + if stderr: + exit_code = errors.resolve_transform_error(exit_code, stderr) + if exit_code != 0: msg = "" - ec = 0 - logger.warning('main payload execution returned non-zero exit code: %d' % exit_code) - stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) + logger.warning('main payload execution returned non-zero exit code: %d', exit_code) if stderr != "": msg = errors.extract_stderr_error(stderr) if msg == "": # look for warning messages instead (might not be fatal so do not set UNRECOGNIZEDTRFSTDERR) msg = errors.extract_stderr_warning(stderr) - fatal = False - else: - fatal = True - if msg != "": - logger.warning("extracted message from stderr:\n%s" % msg) - ec = set_error_code_from_stderr(msg, fatal) - - if not ec: - ec = errors.resolve_transform_error(exit_code, stderr) - if ec != 0: + # fatal = False + #else: + # fatal = True + #if msg != "": # redundant since resolve_transform_error is used above + # logger.warning("extracted message from stderr:\n%s", msg) + # exit_code = set_error_code_from_stderr(msg, fatal) + + if msg: + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) + + ''' + if exit_code != 0: if msg: - msg = errors.format_diagnostics(ec, msg) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=msg) + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) else: if job.piloterrorcodes: - logger.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) + logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) else: # check if core dumps exist, if so remove them and return True - if remove_core_dumps(job.workdir): + if remove_core_dumps(job.workdir) and not job.debug: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) else: logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') + ''' else: - logger.info('main payload execution returned zero exit code, but will check it more carefully') + logger.info('main payload execution returned zero exit code') + + # check if core dumps exist, if so remove them and return True + if remove_core_dumps(job.workdir) and not job.debug: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) def set_error_code_from_stderr(msg, fatal): @@ -389,24 +397,24 @@ def set_error_code_from_stderr(msg, fatal): :return: error code (int). 
""" - if "Failed invoking the NEWUSER namespace runtime" in msg: - ec = errors.SINGULARITYNEWUSERNAMESPACE - elif "Failed to create user namespace" in msg: - ec = errors.SINGULARITYFAILEDUSERNAMESPACE - elif "command not found" in msg: - ec = errors.TRANSFORMNOTFOUND - elif "SL5 is unsupported" in msg: - ec = errors.UNSUPPORTEDSL5OS - elif "resource temporarily unavailable" in msg: - ec = errors.SINGULARITYRESOURCEUNAVAILABLE - elif "unrecognized arguments" in msg: - ec = errors.UNRECOGNIZEDTRFARGUMENTS - elif fatal: - ec = errors.UNRECOGNIZEDTRFSTDERR - else: - ec = 0 + exit_code = 0 + error_map = {errors.SINGULARITYNEWUSERNAMESPACE: "Failed invoking the NEWUSER namespace runtime", + errors.SINGULARITYFAILEDUSERNAMESPACE: "Failed to create user namespace", + errors.SINGULARITYRESOURCEUNAVAILABLE: "resource temporarily unavailable", + errors.SINGULARITYNOTINSTALLED: "Singularity is not installed", + errors.TRANSFORMNOTFOUND: "command not found", + errors.UNSUPPORTEDSL5OS: "SL5 is unsupported", + errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments"} + + for key, value in error_map.items(): + if value in msg: + exit_code = key + break + + if fatal and not exit_code: + exit_code = errors.UNRECOGNIZEDTRFSTDERR - return ec + return exit_code def validate_post(queues, traces, args): diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 1d6017390..dc36ec72d 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -53,11 +53,11 @@ def run_payload(self, job, cmd, out, err): logger.fatal('could not define payload command') return None - logger.info("payload execution command: %s" % executable) + logger.info("payload execution command: %s", executable) try: payload = {'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job} - logger.debug("payload: %s" % payload) + logger.debug("payload: %s", payload) logger.info("Starting EventService WorkExecutor") executor_type = self.get_executor_type() @@ -66,14 +66,14 @@ def run_payload(self, job, cmd, out, err): executor.start() logger.info("EventService WorkExecutor started") - logger.info("ESProcess started with pid: %s" % executor.get_pid()) + logger.info("ESProcess started with pid: %s", executor.get_pid()) job.pid = executor.get_pid() if job.pid: job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', str(error)) return None return executor @@ -100,15 +100,15 @@ def wait_graceful(self, args, proc): :return: """ - t1 = time.time() + t_1 = time.time() while proc.is_alive(): if args.graceful_stop.is_set(): logger.debug("Graceful stop is set, stopping work executor") proc.stop() break - if time.time() > t1 + 300: # 5 minutes + if time.time() > t_1 + 300: # 5 minutes logger.info("Process is still running") - t1 = time.time() + t_1 = time.time() time.sleep(2) while proc.is_alive(): diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index a23c00b2e..5c3d454d9 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -6,7 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import os @@ -25,9 +25,9 @@ def untar_file(self, lfn, job): pfn = os.path.join(job.workdir, lfn) 
command = "tar -xf %s -C %s" % (pfn, job.workdir) - logger.info("Untar file: %s" % command) + logger.info("untar file: %s", command) exit_code, stdout, stderr = execute(command) - logger.info("exit_code: %s, stdout: %s, stderr: %s\n" % (exit_code, stdout, stderr)) + logger.info("exit_code: %s, stdout: %s, stderr: %s\n", exit_code, stdout, stderr) def utility_before_payload(self, job): """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 1d699a0f4..df5a852fe 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -23,7 +23,7 @@ from pilot.util.container import execute from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED, \ UTILITY_AFTER_PAYLOAD_FINISHED, PILOT_PRE_SETUP, PILOT_POST_SETUP, PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD, \ - UTILITY_AFTER_PAYLOAD_STARTED2 + UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.filehandling import write_file from pilot.util.processes import kill_processes from pilot.util.timing import add_to_pilot_timing @@ -92,7 +92,7 @@ def utility_before_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed before the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed before the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -114,7 +114,7 @@ def utility_with_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed with the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed with the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -138,7 +138,7 @@ def get_utility_command(self, order=None): cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -156,7 +156,7 @@ def utility_after_payload_started(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) # how should this command be executed? 
utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) @@ -166,8 +166,8 @@ def utility_after_payload_started(self, job): try: proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) @@ -191,7 +191,7 @@ def utility_after_payload_started_new(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) return cmd @@ -203,22 +203,24 @@ def utility_after_payload_started_new(self, job): # try: # proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, # stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) -# except Exception as e: -# logger.error('could not execute: %s' % e) +# except Exception as error: +# logger.error('could not execute: %s', error) # else: # # store process handle in job object, and keep track on how many times the command has been launched # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job): + def utility_after_payload_finished(self, job, order): """ Prepare commands/utilities to run after payload has finished. This command will be executed later. - REFACTOR + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 :param job: job object. + :param order: constant used for utility selection (constant). + :return: command (string), label (string). """ cmd = "" @@ -228,12 +230,12 @@ def utility_after_payload_finished(self, job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 # should any additional commands be prepended to the payload execution string? 
- cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_FINISHED, job=job) + cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed after the payload has finished: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload has finished: %s', cmd_dictionary.get('label', 'utility'), cmd) - return cmd + return cmd, cmd_dictionary.get('label') def execute_utility_command(self, cmd, job, label): """ @@ -247,15 +249,18 @@ def execute_utility_command(self, cmd, job, label): exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: - logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details' % (cmd, exit_code)) + ignored_exit_codes = [160, 161, 162] + logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code) if label == 'preprocess': err = errors.PREPROCESSFAILURE elif label == 'postprocess': err = errors.POSTPROCESSFAILURE else: err = 0 # ie ignore - if err and exit_code != 160: # ignore no-more-data-points exit code + if err and exit_code not in ignored_exit_codes: # ignore no-more-data-points exit codes job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) + if exit_code in ignored_exit_codes: + job.transexitcode = exit_code # write output to log files self.write_utility_output(job.workdir, label, stdout, stderr) @@ -285,13 +290,20 @@ def write_utility_output(self, workdir, step, stdout, stderr): elif step == 'postprocess': self.__postprocess_stdout_name = name_stdout self.__postprocess_stderr_name = name_stderr - write_file(os.path.join(workdir, step + '_stdout.txt'), stdout, unique=True) - except PilotException as e: - logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) + name = os.path.join(workdir, step + '_stdout.txt') + write_file(name, stdout, unique=True) + except PilotException as error: + logger.warning('failed to write utility stdout to file: %s, %s', error, stdout) + else: + logger.debug('wrote %s', name) + try: - write_file(os.path.join(workdir, step + '_stderr.txt'), stderr, unique=True) - except PilotException as e: - logger.warning('failed to write utility stderr to file: %s, %s' % (e, stderr)) + name = os.path.join(workdir, step + '_stderr.txt') + write_file(name, stderr, unique=True) + except PilotException as error: + logger.warning('failed to write utility stderr to file: %s, %s', error, stderr) + else: + logger.debug('wrote %s', name) def pre_payload(self, job): """ @@ -322,13 +334,13 @@ def run_command(self, cmd, label=None): """ if label: - logger.info('\n\n%s:\n\n%s\n' % (label, cmd)) + logger.info('\n\n%s:\n\n%s\n', label, cmd) if label == 'coprocess': try: out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb') err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb') - except Exception as e: - logger.warning('failed to open coprocess stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open coprocess stdout/err: %s', error) out = None err = None else: @@ -337,14 +349,14 @@ def run_command(self, cmd, label=None): try: proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err, usecontainer=False, cwd=self.__job.workdir, job=self.__job) - except Exception as e: - 
logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute command') return None - logger.info('started %s -- pid=%s executable=%s' % (label, proc.pid, cmd)) + logger.info('started %s -- pid=%s executable=%s', label, proc.pid, cmd) return proc @@ -365,25 +377,23 @@ def run_payload(self, job, cmd, out, err): # add time for PILOT_PRE_PAYLOAD self.pre_payload(job) - logger.info("\n\npayload execution command:\n\n%s\n" % cmd) + logger.info("\n\npayload execution command:\n\n%s\n", cmd) try: proc = execute(cmd, workdir=job.workdir, returnproc=True, usecontainer=True, stdout=out, stderr=err, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute payload') return None - logger.info('started -- pid=%s executable=%s' % (proc.pid, cmd)) + logger.info('started -- pid=%s executable=%s', proc.pid, cmd) job.pid = proc.pid job.pgrp = os.getpgid(job.pid) set_pilot_state(job=job, state="running") #_cmd = self.utility_with_payload(job) - #if _cmd: - # logger.info('could have executed: %s' % _cmd) self.utility_after_payload_started(job) @@ -398,13 +408,17 @@ def extract_setup(self, cmd): :return: updated secondary command (string). """ - def cut_str_from(_cmd, s): - # cut the string from the position of the given _cmd - return _cmd[:_cmd.find(s)] + def cut_str_from(_cmd, _str): + """ + Cut the string from the position of the given _cmd + """ + return _cmd[:_cmd.find(_str)] def cut_str_from_last_semicolon(_cmd): - # cut the string from the last semicolon - # NOTE: this will not work if jobParams also contain ; + """ + Cut the string from the last semicolon + NOTE: this will not work if jobParams also contain ; + """ # remove any trailing spaces and ;-signs _cmd = _cmd.strip() _cmd = _cmd[:-1] if _cmd.endswith(';') else _cmd @@ -445,16 +459,16 @@ def wait_graceful(self, args, proc): time.sleep(0.1) iteration += 1 - for i in range(60): # Python 2/3 + for _ in range(60): # Python 2/3 if args.graceful_stop.is_set(): breaker = True - logger.info('breaking -- sending SIGTERM pid=%s' % proc.pid) + logger.info('breaking -- sending SIGTERM pid=%s', proc.pid) os.killpg(os.getpgid(proc.pid), signal.SIGTERM) # proc.terminate() break time.sleep(1) if breaker: - logger.info('breaking -- sleep 3s before sending SIGKILL pid=%s' % proc.pid) + logger.info('breaking -- sleep 3s before sending SIGKILL pid=%s', proc.pid) time.sleep(3) proc.kill() break @@ -462,7 +476,7 @@ def wait_graceful(self, args, proc): exit_code = proc.poll() if iteration % 10 == 0: - logger.info('running: iteration=%d pid=%s exit_code=%s' % (iteration, proc.pid, exit_code)) + logger.info('running: iteration=%d pid=%s exit_code=%s', iteration, proc.pid, exit_code) if exit_code is not None: break else: @@ -478,15 +492,12 @@ def get_payload_command(self, job): :return: command (string). 
""" - show_memory_usage() - cmd = "" # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' try: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - show_memory_usage() cmd = user.get_payload_command(job) #+ 'sleep 1000' # to test looping jobs except PilotException as error: self.post_setup(job) @@ -495,7 +506,7 @@ def get_payload_command(self, job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) self.__traces.pilot['error_code'] = job.piloterrorcodes[0] logger.fatal( - 'could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code']) + 'could not define payload command (traces error set to: %d)', self.__traces.pilot['error_code']) return cmd @@ -512,25 +523,29 @@ def run_preprocess(self, job): try: # note: this might update the jobparams cmd_before_payload = self.utility_before_payload(job) - except Exception as e: - logger.error(e) - raise e + except Exception as error: + logger.error(error) + raise error if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload - logger.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) + logger.info("\n\npreprocess execution command:\n\n%s\n", cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: - logger.fatal('no more HP points - time to abort processing loop') + logger.warning('no more HP points - time to abort processing loop') + elif exit_code == 161: + logger.warning('no more HP points but at least one point was processed - time to abort processing loop') + elif exit_code == 162: + logger.warning('loop count reached the limit - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) - logger.fatal('cannot continue since preprocess failed: exit_code=%d' % exit_code) + logger.fatal('cannot continue since preprocess failed: exit_code=%d', exit_code) else: # in case the preprocess produced a command, chmod it path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) if os.path.exists(path): - logger.debug('chmod 0o755: %s' % path) + logger.debug('chmod 0o755: %s', path) os.chmod(path, 0o755) return exit_code @@ -546,8 +561,6 @@ def run(self): # noqa: C901 # get the payload command from the user specific code self.pre_setup(self.__job) - show_memory_usage() - cmd = self.get_payload_command(self.__job) # extract the setup in case the preprocess command needs it self.__job.setup = self.extract_setup(cmd) @@ -557,7 +570,8 @@ def run(self): # noqa: C901 # abort when nothing more to run, or when the preprocess returns a special exit code iteration = 0 while True: - logger.info('payload iteration loop #%d' % (iteration + 1)) + + logger.info('payload iteration loop #%d', iteration + 1) os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration show_memory_usage() @@ -566,7 +580,7 @@ def run(self): # noqa: C901 exit_code = self.run_preprocess(self.__job) jobparams_post = self.__job.jobparams if exit_code: - if exit_code == 160: + if exit_code >= 160 and exit_code <= 162: exit_code = 0 # wipe the output file list since there won't be any new files # any output files from previous iterations, should have been transferred already @@ -581,6 +595,11 @@ def run(self): # noqa: C901 # now run the main payload, when it 
finishes, run the postprocess (if necessary) # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': + + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before payload start] stdout=%s', _stdout) + #logger.debug('[before payload start] stderr=%s', _stderr) + proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: proc = None @@ -590,7 +609,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished() + exit_code = self.run_utility_after_payload_finished(True, UTILITY_AFTER_PAYLOAD_FINISHED2) self.post_payload(self.__job) else: break @@ -608,7 +627,7 @@ def run(self): # noqa: C901 # allow for a secondary command to be started after the payload (e.g. a coprocess) utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2) if utility_cmd: - logger.debug('starting utility command: %s' % utility_cmd) + logger.debug('starting utility command: %s', utility_cmd) label = 'coprocess' if 'coprocess' in utility_cmd else None proc_co = self.run_command(utility_cmd, label=label) @@ -625,19 +644,23 @@ def run(self): # noqa: C901 else: state = 'finished' if exit_code == 0 else 'failed' set_pilot_state(job=self.__job, state=state) - logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) + logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state) + + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after payload finish] stdout=%s', _stdout) + #logger.debug('[after payload finish] stderr=%s', _stderr) # stop the utility command (e.g. a coprocess if necessary if proc_co: - logger.debug('stopping utility command: %s' % utility_cmd) + logger.debug('stopping utility command: %s', utility_cmd) kill_processes(proc_co.pid) if exit_code is None: logger.warning('detected unset exit_code from wait_graceful - reset to -1') exit_code = -1 - if state != 'failed': - exit_code = self.run_utility_after_payload_finished() + for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]: + exit_code = self.run_utility_after_payload_finished(state, order) self.post_payload(self.__job) @@ -654,26 +677,44 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self): + def run_utility_after_payload_finished(self, state, order): """ Run utility command after the main payload has finished. + In horovod mode, select the corresponding post-process. Otherwise, select different post-process (e.g. Xcache). + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 + + :param state: payload state; finished/failed (string). + :param order: constant used for utility selection (constant). :return: exit code (int). 
""" exit_code = 0 try: - cmd_after_payload = self.utility_after_payload_finished(self.__job) - except Exception as e: - logger.error(e) + cmd_after_payload, label = self.utility_after_payload_finished(self.__job, order) + except Exception as error: + logger.error(error) else: - if cmd_after_payload and self.__job.postprocess: + if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) elif cmd_after_payload: - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache') + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) + + # xcache debug + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[before xcache kill] stdout=%s', _stdout) + # logger.debug('[before xcache kill] stderr=%s', _stderr) + + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) + + # xcache debug + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[after xcache kill] stdout=%s', _stdout) + # logger.debug('[after xcache kill] stderr=%s', _stderr) return exit_code @@ -691,11 +732,11 @@ def stop_utilities(self): if utproc: user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 sig = user.get_utility_command_kill_signal(utcmd) - logger.info("stopping process \'%s\' with signal %d" % (utcmd, sig)) + logger.info("stopping process \'%s\' with signal %d", utcmd, sig) try: os.killpg(os.getpgid(utproc.pid), sig) - except Exception as e: - logger.warning('exception caught: %s (ignoring)' % e) + except Exception as error: + logger.warning('exception caught: %s (ignoring)', error) user.post_utility_command_action(utcmd, self.__job) @@ -712,4 +753,4 @@ def rename_log_files(self, iteration): if os.path.exists(name): os.rename(name, name + '%d' % iteration) else: - logger.warning('cannot rename %s since it does not exist' % name) + logger.warning('cannot rename %s since it does not exist', name) diff --git a/pilot/copytool/common.py b/pilot/copytool/common.py index ce5f0df35..12381b3d2 100644 --- a/pilot/copytool/common.py +++ b/pilot/copytool/common.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Mario Lassnig, mario.lassnig@cern.ch, 2020 import logging @@ -61,8 +61,8 @@ def verify_catalog_checksum(fspec, path): checksum_local = calculate_checksum(path, algorithm=checksum_type) if checksum_type == 'ad32': checksum_type = 'adler32' - logger.info('checksum (catalog): %s (type: %s)' % (checksum_catalog, checksum_type)) - logger.info('checksum (local): %s' % checksum_local) + logger.info('checksum (catalog): %s (type: %s)', checksum_catalog, checksum_type) + logger.info('checksum (local): %s', checksum_local) if checksum_local and 
checksum_local != '' and checksum_local != checksum_catalog: diagnostics = 'checksum verification failed for LFN=%s: checksum (catalog)=%s != checksum (local)=%s' % \ (fspec.lfn, checksum_catalog, checksum_local) diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index 2c184c891..54034e7fc 100644 --- a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -57,7 +57,8 @@ def copy_in(files, **kwargs): if not check_for_gfal(): raise StageInFailure("No GFAL2 tools found") - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 68e50b5c6..ddaa68d68 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -6,6 +6,7 @@ # # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021 +# - Shuwei import os import logging @@ -73,11 +74,11 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # http_access = rprotocols["http_access"] # os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog) # logger.debug('http_access=%s' % http_access) - # except Exception as e: + # except Exception: # logger.warning("Failed in get 'http_access' in ddm.rprotocols") surl = protocol.get('endpoint', '') + remote_path - logger.info('For GCS bucket, set surl=%s' % surl) + logger.info('For GCS bucket, set surl=%s', surl) # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -97,7 +98,7 @@ def copy_in(files, **kwargs): dst = fspec.workdir or kwargs.get('workdir') or '.' 
path = os.path.join(dst, fspec.lfn) - logger.info('downloading surl=%s to local file %s' % (fspec.surl, path)) + logger.info('downloading surl=%s to local file %s', fspec.surl, path) status, diagnostics = download_file(path, fspec.surl, object_name=fspec.lfn) if not status: ## an error occurred @@ -131,8 +132,8 @@ def download_file(path, surl, object_name=None): target = pathlib.Path(object_name) with target.open(mode="wb") as downloaded_file: client.download_blob_to_file(surl, downloaded_file) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -150,7 +151,7 @@ def copy_out(files, **kwargs): workdir = kwargs.pop('workdir') for fspec in files: - logger.info('Going to process fspec.turl=%s' % fspec.turl) + logger.info('Going to process fspec.turl=%s', fspec.turl) import re # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) @@ -164,7 +165,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, logfile) if os.path.exists(path): object_name = os.path.join(remote_path, logfile) - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, object_name)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, object_name) status, diagnostics = upload_file(path, bucket, object_name=object_name) if not status: ## an error occurred @@ -204,15 +205,15 @@ def upload_file(file_name, bucket, object_name=None): try: client = storage.Client() gs_bucket = client.get_bucket(bucket) - logger.info('uploading a file to bucket=%s in full path=%s' % (bucket, object_name)) + logger.info('uploading a file to bucket=%s in full path=%s', bucket, object_name) blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): url_pilotlog = blob.public_url os.environ['GTAG'] = url_pilotlog - logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotlog) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + logger.debug("Set envvar GTAG with the pilotLot URL=%s", url_pilotlog) + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 8f63cd460..67d8b791e 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -7,7 +7,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import logging @@ -75,7 +75,9 @@ def copy_in(files, **kwargs): copysetup = get_copysetup(copytools, 'lsm') trace_report = kwargs.get('trace_report') #allow_direct_access = kwargs.get('allow_direct_access') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report @@ -99,17 +101,16 @@ def copy_in(files, **kwargs): source = fspec.turl destination = os.path.join(dst, fspec.lfn) - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) exit_code, stdout, stderr = 
move(source, destination, dst_in=True, copysetup=copysetup) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) error = resolve_common_transfer_errors(stderr, is_stagein=True) fspec.status = 'failed' fspec.status_code = error.get('rcode') - logger.warning('error=%d' % error.get('rcode')) trace_report.update(clientState=error.get('state') or 'STAGEIN_ATTEMPT_FAILED', stateReason=error.get('error'), timeEnd=time()) trace_report.send() @@ -186,7 +187,7 @@ def copy_out(files, **kwargs): except Exception: opts = " ".join(["%s %s" % (k, v) for (k, v) in list(opts.items())]) # Python 3 - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) nretries = 1 # input parameter to function? for retry in range(nretries): @@ -246,7 +247,7 @@ def move_all_files_in(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) source = entry['source'] + '/' + entry['name'] destination = os.path.join(entry['destination'], entry['name']) @@ -255,7 +256,7 @@ def move_all_files_in(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -276,7 +277,7 @@ def move_all_files_out(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) destination = entry['destination'] + '/' + entry['name'] source = os.path.join(entry['source'], entry['name']) @@ -285,7 +286,7 @@ def move_all_files_out(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -321,16 +322,16 @@ def move(source, destination, dst_in=True, copysetup="", options=None): try: exit_code, stdout, stderr = execute(cmd, usecontainer=False, copytool=True) #, timeout=get_timeout(fspec.filesize)) - except Exception as e: + except Exception as error: if dst_in: exit_code = ErrorCodes.STAGEINFAILED else: exit_code = ErrorCodes.STAGEOUTFAILED - stdout = 'exception caught: e' % e + stdout = 'exception caught: e' % error stderr = '' logger.warning(stdout) - logger.info('exit_code=%d, stdout=%s, stderr=%s' % (exit_code, stdout, stderr)) + logger.info('exit_code=%d, stdout=%s, stderr=%s', exit_code, stdout, stderr) return exit_code, stdout, stderr diff --git a/pilot/copytool/mv.py 
b/pilot/copytool/mv.py index 73093a924..3ff42143a 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 # - David Cameron, david.cameron@cern.ch, 2018-2019 @@ -48,12 +48,12 @@ def create_output_list(files, init_dir, ddmconf): # resolve token value from fspec.ddmendpoint token = ddmconf.get(fspec.ddmendpoint).token if not token: - logger.info('No space token info for %s' % fspec.ddmendpoint) + logger.info('No space token info for %s', fspec.ddmendpoint) else: arcturl = re.sub(r'((:\d+)/)', r'\2;autodir=no;spacetoken=%s/' % token, arcturl) arcturl += ':checksumtype=%s:checksumvalue=%s' % (checksumtype, checksum) - logger.info('Adding to output.list: %s %s' % (fspec.lfn, arcturl)) + logger.info('Adding to output.list: %s %s', fspec.lfn, arcturl) # Write output.list with open(os.path.join(init_dir, 'output.list'), 'a') as f: f.write('%s %s\n' % (fspec.lfn, arcturl)) @@ -124,7 +124,7 @@ def copy_out(files, copy_type="mv", **kwargs): raise StageOutFailure(stdout) # Create output list for ARC CE if necessary - logger.debug('init_dir for output.list=%s' % os.path.dirname(kwargs.get('workdir'))) + logger.debug('init_dir for output.list=%s', os.path.dirname(kwargs.get('workdir'))) output_dir = kwargs.get('output_dir', '') if not output_dir: create_output_list(files, os.path.dirname(kwargs.get('workdir')), kwargs.get('ddmconf', None)) @@ -168,11 +168,11 @@ def move_all_files(files, copy_type, workdir): # resolve canonical path source = os.path.realpath(source) - logger.info("transferring file %s from %s to %s" % (name, source, destination)) + logger.info("transferring file %s from %s to %s", name, source, destination) exit_code, stdout, stderr = copy_method(source, destination) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) fspec.status = 'failed' if fspec.filetype == 'input': fspec.status_code = ErrorCodes.STAGEINFAILED diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index a8ccb38d3..e13c20e4e 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import json @@ -73,7 +73,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # :return: protocol as dictionary # """ # -# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)" % (fspec.lfn, fspec.ddmendpoint, activity)) +# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)", fspec.lfn, fspec.ddmendpoint, activity) # # activity = get_ddm_activity(activity) # protocols = ddm.arprotocols.get(activity) @@ -87,7 +87,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # logger.error(err) # raise PilotException(err) # protocol = protocols_allow[0] -# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s" % (fspec.lfn, fspec.ddmendpoint, activity, protocol)) +# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s", fspec.lfn, fspec.ddmendpoint, 
activity, protocol) # return protocol @@ -109,7 +109,7 @@ def copy_in(files, **kwargs): for fspec in files: cmd = [] - logger.info("To transfer file: %s" % fspec) + logger.info("To transfer file: %s", fspec) if fspec.protocol_id: ddm = ddmconf.get(fspec.ddmendpoint) if ddm: @@ -212,7 +212,7 @@ def copy_out(files, **kwargs): cwd = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(cwd, 'rucio_upload.json') if not os.path.exists(path): - logger.error('Failed to resolve Rucio summary JSON, wrong path? file=%s' % path) + logger.error('Failed to resolve Rucio summary JSON, wrong path? file=%s', path) else: with open(path, 'rb') as f: summary = json.load(f) diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 626821eb8..675688e50 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -48,7 +48,7 @@ def verify_stage_out(fspec): from rucio.rse import rsemanager as rsemgr rse_settings = rsemgr.get_rse_info(fspec.ddmendpoint) uploaded_file = {'name': fspec.lfn, 'scope': fspec.scope} - logger.info('Checking file: %s' % str(fspec.lfn)) + logger.info('Checking file: %s', str(fspec.lfn)) return rsemgr.exists(rse_settings, [uploaded_file]) @@ -66,15 +66,16 @@ def copy_in(files, **kwargs): trace_report = kwargs.get('trace_report') use_pcache = kwargs.get('use_pcache') #job = kwargs.get('job') - #use_pcache = job.infosys.queuedata.use_pcache if job else False - logger.debug('use_pcache=%s' % use_pcache) # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + logger.debug('RUCIO_LOCAL_SITE_ID=%s', os.environ.get('RUCIO_LOCAL_SITE_ID', '')) + logger.debug('trace_report[localSite]=%s', trace_report.get_value('localSite')) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: - logger.info('rucio copytool, downloading file with scope:%s lfn:%s' % (str(fspec.scope), str(fspec.lfn))) + logger.info('rucio copytool, downloading file with scope:%s lfn:%s', str(fspec.scope), str(fspec.lfn)) # update the trace report localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) @@ -152,7 +153,7 @@ def copy_in(files, **kwargs): def get_protocol(trace_report_out): """ - Extract the protocol used for the transdfer from the dictionary returned by rucio. + Extract the protocol used for the transfer from the dictionary returned by rucio. :param trace_report_out: returned rucio transfer dictionary (dictionary). :return: protocol (string). 
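The copy_in() change above introduces a fallback for the local site name, since the RUCIO_LOCAL_SITE_ID environment variable may be undefined inside middleware containers; the same pattern appears in the lsm.py and xrdcp.py copytools in this change set. A minimal sketch of the resolution order, assuming a trace_report object with the get_value() method used in the diff (resolve_localsite is a hypothetical helper name):

import os

def resolve_localsite(trace_report, fspec):
    # prefer the environment variable; it may be unset inside middleware containers,
    # in which case the value already stored in the trace report is used
    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite'))
    # final fallback: the ddmendpoint of the file being transferred
    return localsite if localsite else fspec.ddmendpoint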
@@ -160,8 +161,8 @@ def get_protocol(trace_report_out): try: p = trace_report_out[0].get('protocol') - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s' % error) p = '' return p @@ -481,21 +482,21 @@ def _stage_in_api(dst, fspec, trace_report, trace_report_out, transfer_timeout, result = download_client.download_pfns([f], 1, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) else: result = download_client.download_dids([f], trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) return ec, trace_report_out @@ -552,18 +553,18 @@ def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None): logger.info('*** rucio API downloading files (taking over logging) ***') try: result = download_client.download_pfns(file_list, num_threads, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout): @@ -607,31 +608,31 @@ def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, tra logger.debug('summary_file_path=%s' % summary_file_path) logger.debug('trace_report_out=%s' % trace_report_out) result = upload_client.upload([f], summary_file_path=summary_file_path, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API upload client failed ***') - logger.warning('caught exception: %s' % e) + logger.warning('caught exception: %s', error) import traceback logger.error(traceback.format_exc()) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 except UnboundLocalError: logger.warning('*** rucio API upload client failed ***') logger.warning('rucio still needs a bug fix of the summary in the uploadclient') else: logger.warning('*** rucio API upload client finished ***') - logger.debug('client 
returned %s' % result) + logger.debug('client returned %s', result) try: file_exists = verify_stage_out(fspec) logger.info('file exists at the storage: %s' % str(file_exists)) if not file_exists: raise StageOutFailure('physical check after upload failed') - except Exception as e: - msg = 'file existence verification failed with: %s' % e + except Exception as error: + msg = 'file existence verification failed with: %s' % error logger.info(msg) raise StageOutFailure(msg) diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 365f49cb7..a0a480bc9 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -81,7 +81,7 @@ def copy_in(files, **kwargs): bucket = 'bucket' # UPDATE ME path = os.path.join(dst, fspec.lfn) - logger.info('downloading object %s from bucket=%s to local file %s' % (fspec.lfn, bucket, path)) + logger.info('downloading object %s from bucket=%s to local file %s', fspec.lfn, bucket, path) status, diagnostics = download_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -113,12 +113,12 @@ def download_file(path, bucket, object_name=None): try: s3 = boto3.client('s3') s3.download_file(bucket, object_name, path) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -140,7 +140,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, fspec.lfn) if os.path.exists(path): bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, fspec.lfn)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, fspec.lfn) status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -181,12 +181,12 @@ def upload_file(file_name, bucket, object_name=None): s3_client = boto3.client('s3') #response = s3_client.upload_file(file_name, bucket, object_name) s3_client.upload_file(file_name, bucket, object_name) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index 9eafbfc56..bfcd2f754 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # Reimplemented by Alexey Anisenkov @@ -42,28 +42,28 @@ def _resolve_checksum_option(setup, **kwargs): if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to check xrdcp client version" % cmd) + logger.info("Execute command (%s) to check xrdcp client version", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info("return code: %s" % rcode) - logger.info("return output: %s" % (stdout + stderr)) + logger.info("return code: %s", rcode) + logger.info("return output: %s", stdout + 
stderr) cmd = "%s -h" % copy_command if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum.." % cmd) + logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum..", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) output = stdout + stderr - logger.info("return code: %s" % rcode) - logger.debug("return output: %s" % output) + logger.info("return code: %s", rcode) + logger.debug("return output: %s", output) coption = "" checksum_type = 'adler32' ## consider only adler32 for now if rcode: - logger.error('FAILED to execute command=%s: %s' % (cmd, output)) + logger.error('FAILED to execute command=%s: %s', cmd, output) else: if "--cksum" in output: coption = "--cksum %s:print" % checksum_type @@ -73,7 +73,7 @@ def _resolve_checksum_option(setup, **kwargs): coption = "-md5" if coption: - logger.info("Use %s option to get the checksum for %s command" % (coption, copy_command)) + logger.info("Use %s option to get the checksum for %s command", coption, copy_command) return coption @@ -96,7 +96,7 @@ def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, * #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout)) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info('rcode=%d, stdout=%s, stderr=%s' % (rcode, stdout, stderr)) + logger.info('rcode=%d, stdout=%s, stderr=%s', rcode, stdout, stderr) if rcode: ## error occurred error = resolve_common_transfer_errors(stdout + stderr, is_stagein=is_stagein) @@ -138,7 +138,8 @@ def copy_in(files, **kwargs): coption = _resolve_checksum_option(setup, **kwargs) trace_report = kwargs.get('trace_report') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint @@ -243,7 +244,7 @@ def get_file_info_from_output(output): return None, None, None if not ("xrootd" in output or "XRootD" in output or "adler32" in output): - logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s" % output) + logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s", output) return None, None, None pattern = r"(?Pmd5|adler32):\ (?P[a-zA-Z0-9]+)\ \S+\ (?P[0-9]+)" # Python 3 (added r) @@ -258,10 +259,10 @@ def get_file_info_from_output(output): if filesize: try: filesize = int(filesize) - except ValueError as e: - logger.warning('failed to convert filesize to int: %s' % e) + except ValueError as error: + logger.warning('failed to convert filesize to int: %s', error) filesize = None else: - logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s" % (pattern, output)) + logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s", pattern, output) return filesize, checksum, checksum_type diff --git a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py index 202f10aee..dc0140da4 100644 --- a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py @@ -16,7 +16,7 @@ from pilot.common.errorcodes import ErrorCodes from 
pilot.eventservice.esprocess.esprocess import ESProcess from pilot.info.filespec import FileSpec -from pilot.util.filehandling import calculate_checksum +from pilot.util.filehandling import calculate_checksum, move from .baseexecutor import BaseExecutor @@ -62,6 +62,21 @@ def create_file_spec(self, pfn): file_spec = FileSpec(filetype='output', **file_data) return file_spec + def move_output(self, pfn): + """ + Move output file from given PFN path to PILOT_OUTPUT_DIR if set. + + :param pfn: physical file name (string). + :return: + """ + + outputdir = os.environ.get('PILOT_OUTPUT_DIR', None) + if outputdir: + try: + move(pfn, outputdir) + except Exception as e: + logger.warning('failed to move output: %s' % e) + def update_finished_event_ranges(self, out_messagess): """ Update finished event ranges @@ -81,6 +96,10 @@ def update_finished_event_ranges(self, out_messagess): for checksum_key in fspec.checksum: event_range_status[checksum_key] = fspec.checksum[checksum_key] event_ranges.append(event_range_status) + + # move the output to a common area if necessary + self.move_output(out_msg['output']) + event_ranges_status = {"esOutput": {"numEvents": len(event_ranges)}, "eventRanges": event_ranges} event_range_message = {'version': 1, 'eventRanges': json.dumps([event_ranges_status])} self.update_events(event_range_message) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index fd959d615..52ea3255c 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -30,7 +30,7 @@ from .basedata import BaseData from .filespec import FileSpec -from pilot.util.auxiliary import get_object_size +from pilot.util.auxiliary import get_object_size, get_key_value from pilot.util.constants import LOG_TRANSFER_NOT_DONE from pilot.util.filehandling import get_guid, get_valid_path_from_list from pilot.util.timing import get_elapsed_real_time @@ -89,12 +89,13 @@ class JobData(BaseData): neventsw = 0 # number of events written dbtime = None # dbdata = None # - resimevents = 0 # ReSim events from job report (ATLAS) + resimevents = None # ReSim events from job report (ATLAS) payload = "" # payload name utilities = {} # utility processes { : [, number of launches, command string], .. } pid = None # payload pid pgrp = None # payload process group sizes = {} # job object sizes { timestamp: size, .. 
} + currentsize = 0 # current job object size command = "" # full payload command (set for container jobs) setup = "" # full payload setup (needed by postprocess command) zombies = [] # list of zombie process ids @@ -118,7 +119,8 @@ class JobData(BaseData): attemptnr = 0 # job attempt number destinationdblock = "" ## to be moved to FileSpec (job.outdata) datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) - debug = False # + debug = False # debug mode, when True, pilot will send debug info back to the server + debug_command = '' # debug command (can be defined on the task side) produserid = "" # the user DN (added to trace report) jobdefinitionid = "" # the job definition id (added to trace report) infilesguids = "" # @@ -199,7 +201,7 @@ def init(self, infosys): # prepend IMAGE_BASE to imagename if necessary (for testing purposes) image_base = os.environ.get('IMAGE_BASE', '') if not image_base and 'IMAGE_BASE' in infosys.queuedata.catchall: - image_base = self.get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') + image_base = get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') if image_base: paths = [os.path.join(image_base, os.path.basename(self.imagename)), os.path.join(image_base, self.imagename)] @@ -209,19 +211,6 @@ def init(self, infosys): #if image_base and not os.path.isabs(self.imagename) and not self.imagename.startswith('docker'): # self.imagename = os.path.join(image_base, self.imagename) - def get_key_value(self, catchall, key='SOMEKEY'): - """ - Return the value corresponding to key in catchall. - :param catchall: catchall free string. - :param key: key name (string). - :return: value (string). - """ - - # ignore any non-key-value pairs that might be present in the catchall string - s = dict(s.split('=', 1) for s in catchall.split() if '=' in s) - - return s.get(key) - def prepare_infiles(self, data): """ Construct FileSpec objects for input files from raw dict `data` @@ -273,7 +262,7 @@ def prepare_infiles(self, data): idat[key] = getattr(self.infosys.queuedata, key) finfo = FileSpec(filetype='input', **idat) - logger.info('added file %s' % lfn) + logger.info('added file \'%s\' with accessmode \'%s\'' % (lfn, accessmode)) ret.append(finfo) return ret @@ -610,7 +599,7 @@ def clean__jobparams(self, raw, value): :return: updated job parameters (string). """ - #value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' + # value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' logger.info('cleaning jobparams: %s' % value) # user specific pre-filtering @@ -985,7 +974,12 @@ def get_size(self): :return: size (int). 
""" - return get_object_size(self) + # protect against the case where the object changes size during calculation (rare) + try: + self.currentsize = get_object_size(self) + except Exception: + pass + return self.currentsize def collect_zombies(self, tn=None): """ diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 96ab6805f..69eb80b32 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -4,15 +4,21 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import argparse import os import logging +import threading +import queue import ROOT +from collections import namedtuple from pilot.util.config import config -from pilot.util.filehandling import establish_logging, write_json +from pilot.util.filehandling import ( + establish_logging, + write_json, +) logger = logging.getLogger(__name__) @@ -31,6 +37,12 @@ def get_args(): action='store_true', default=False, help='Enable debug mode for logging messages') + arg_parser.add_argument('-t', + dest='nthreads', + default=1, + required=False, + type=int, + help='Number of concurrent file open threads') arg_parser.add_argument('-w', dest='workdir', required=False, @@ -50,10 +62,26 @@ def get_args(): def message(msg): + """ + Print message to stdout or to log. + Note: not using lazy formatting. + + :param msg: message (string). + :return: + """ + print(msg) if not logger else logger.info(msg) def get_file_lists(turls): + """ + Return a dictionary with the turls. + Format: {'turls': } + + :param turls: comma separated turls (string) + :return: turls dictionary. + """ + _turls = [] try: @@ -64,9 +92,20 @@ def get_file_lists(turls): return {'turls': _turls} -def try_open_file(turl): +def try_open_file(turl, queues): + """ + Attempt to open a remote file. + Successfully opened turls will be put in the queues.opened queue. Unsuccessful turls will be placed in + the queues.unopened queue. + + :param turl: turl (string). + :param queues: queues collection. + :return: + """ + turl_opened = False try: + message('opening %s' % turl) in_file = ROOT.TFile.Open(turl) except Exception as error: message('caught exception: %s' % error) @@ -74,8 +113,31 @@ def try_open_file(turl): if in_file and in_file.IsOpen(): in_file.Close() turl_opened = True + message('closed %s' % turl) + queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) + queues.result.put(turl) + + +def spawn_file_open_thread(queues, file_list): + """ + Spawn a thread for the try_open_file(). + + :param queues: queue collection. + :param file_list: files to open (list). + :return: thread. 
+ """ - return turl_opened + thread = None + try: + turl = file_list.pop(0) + except IndexError: + pass + else: + # create and start thread for the current turl + thread = threading.Thread(target=try_open_file, args=(turl, queues)) + thread.start() + + return thread if __name__ == '__main__': @@ -99,17 +161,52 @@ def try_open_file(turl): print("remote file open verification not desired") exit(0) - establish_logging(args, filename=logname) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=logname) logger = logging.getLogger(__name__) # get the file info file_list_dictionary = get_file_lists(args.turls) turls = file_list_dictionary.get('turls') processed_turls_dictionary = {} + + queues = namedtuple('queues', ['result', 'opened', 'unopened']) + queues.result = queue.Queue() + queues.opened = queue.Queue() + queues.unopened = queue.Queue() + threads = [] + + message('will attempt to open %d file(s) using %d thread(s)' % (len(turls), args.nthreads)) + if turls: - message('got TURLs: %s' % str(turls)) - for turl in turls: - processed_turls_dictionary[turl] = try_open_file(turl) + # make N calls to begin with + for index in range(args.nthreads): + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + while turls: + + try: + _ = queues.result.get(block=True) + except Exception as error: + message("caught exception: %s" % error) + + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + # wait until all threads have finished + [_thread.join() for _thread in threads] + + opened_turls = list(queues.opened.queue) + opened_turls.sort() + unopened_turls = list(queues.unopened.queue) + unopened_turls.sort() + + for turl in opened_turls: + processed_turls_dictionary[turl] = True + for turl in unopened_turls: + processed_turls_dictionary[turl] = False # write dictionary to file with results _status = write_json(os.path.join(args.workdir, config.Pilot.remotefileverification_dictionary), processed_turls_dictionary) diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index 851ca307a..00ea77b7d 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -356,7 +356,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageinlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=config.Pilot.stageinlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index f60219d1c..2872801bb 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -289,7 +289,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageoutlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=config.Pilot.stageoutlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 13d0546e0..1fb3be4a7 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -8,40 +8,79 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 -import os -import re -import fnmatch from collections import defaultdict +import fnmatch from glob import glob +import logging +import os +import re +from random import randint from signal import SIGTERM, SIGUSR1 +# from tarfile import ExFileObject try: from functools import reduce # Python 3 -except 
Exception: +except ImportError: pass from .container import create_root_container_command from .dbrelease import get_dbrelease_version, create_dbrelease -from .setup import should_pilot_prepare_setup, is_standard_atlas_job, get_asetup,\ - set_inds, get_analysis_trf, get_payload_environment_variables, replace_lfns_with_turls -from .utilities import get_memory_monitor_setup, get_network_monitor_setup, post_memory_monitor_action,\ - get_memory_monitor_summary_filename, get_prefetcher_setup, get_benchmark_setup, get_memory_monitor_output_filename,\ - get_metadata_dict_from_txt +from .setup import ( + should_pilot_prepare_setup, + is_standard_atlas_job, + get_asetup, + set_inds, + get_analysis_trf, + get_payload_environment_variables, + replace_lfns_with_turls, +) +from .utilities import ( + get_memory_monitor_setup, + get_network_monitor_setup, + post_memory_monitor_action, + get_memory_monitor_summary_filename, + get_prefetcher_setup, + get_benchmark_setup, + get_memory_monitor_output_filename, + get_metadata_dict_from_txt, +) + +from pilot.util.auxiliary import ( + get_resource_name, + show_memory_usage, + is_python3, + get_key_value, +) -from pilot.util.auxiliary import get_resource_name, show_memory_usage from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException -from pilot.util.auxiliary import is_python3 from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ - UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2,\ - UTILITY_BEFORE_STAGEIN +from pilot.util.constants import ( + UTILITY_BEFORE_PAYLOAD, + UTILITY_WITH_PAYLOAD, + UTILITY_AFTER_PAYLOAD_STARTED, + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_STARTED2, + UTILITY_BEFORE_STAGEIN, + UTILITY_AFTER_PAYLOAD_FINISHED2 +) from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ - copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum +from pilot.util.filehandling import ( + copy, copy_pilot_source, calculate_checksum, + get_guid, get_local_file_size, + remove, remove_dir_tree, remove_core_dumps, read_file, read_json, + update_extension, + write_file, + # read_list +) +from pilot.util.processes import ( + convert_ps_to_dict, + find_pid, find_cmd_pids, + get_trimmed_dictionary, + is_child +) from pilot.util.tracereport import TraceReport -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -49,8 +88,9 @@ def sanity_check(): """ - Perform an initial sanity check before doing anything else in a given workflow. - This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort + Perform an initial sanity check before doing anything else in a + given workflow. This function can be used to verify importing of + modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. :return: exit code (0 if all is ok, otherwise non-zero exit code). 
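The threaded file-open loop added to pilot/scripts/open_remote_file.py above primes a pool of nthreads worker threads and then spawns a new worker each time a result arrives, until every TURL has been attempted. A condensed sketch of that pattern, assuming a try_open_file(turl, queues) worker like the one in the diff (open_all is a hypothetical wrapper name):

import queue
import threading
from collections import namedtuple

def open_all(turls, nthreads, try_open_file):
    # workers report completion on queues.result and sort each turl
    # into queues.opened or queues.unopened
    queues = namedtuple('queues', ['result', 'opened', 'unopened'])
    queues.result = queue.Queue()
    queues.opened = queue.Queue()
    queues.unopened = queue.Queue()

    remaining = list(turls)

    def spawn():
        # start a worker for the next turl, if any are left
        if not remaining:
            return None
        thread = threading.Thread(target=try_open_file, args=(remaining.pop(0), queues))
        thread.start()
        return thread

    # prime the pool, then top it up as results come in
    threads = [thread for thread in (spawn() for _ in range(nthreads)) if thread]
    while remaining:
        queues.result.get(block=True)
        thread = spawn()
        if thread:
            threads.append(thread)

    [thread.join() for thread in threads]
    return sorted(queues.opened.queue), sorted(queues.unopened.queue)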
@@ -61,7 +101,8 @@ def sanity_check(): #try: # from rucio.client.downloadclient import DownloadClient # from rucio.client.uploadclient import UploadClient - # # note: must do something with Download/UploadClients or flake8 will complain - but do not instantiate + # # note: must do something with Download/UploadClients or flake8 + # will complain - but do not instantiate #except Exception as e: # logger.warning('sanity check failed: %s' % e) # exit_code = errors.MIDDLEWAREIMPORTFAILURE @@ -81,7 +122,9 @@ def validate(job): status = True if 'DBRelease' in job.jobparams: - logger.debug('encountered DBRelease info in job parameters - will attempt to create a local DBRelease file') + logger.debug(( + 'encountered DBRelease info in job parameters - ' + 'will attempt to create a local DBRelease file')) version = get_dbrelease_version(job.jobparams) if version: status = create_dbrelease(version, job.workdir) @@ -94,35 +137,38 @@ def validate(job): if status: if job.imagename and job.imagename.startswith('/'): if os.path.exists(job.imagename): - logger.info('verified that image exists: %s' % job.imagename) + logger.info('verified that image exists: %s', job.imagename) else: status = False - logger.warning('image does not exist: %s' % job.imagename) + logger.warning('image does not exist: %s', job.imagename) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.IMAGENOTFOUND) # cleanup job parameters if only copy-to-scratch #if job.only_copy_to_scratch(): # logger.debug('job.params=%s' % job.jobparams) # if ' --usePFCTurl' in job.jobparams: - # logger.debug('cleaning up --usePFCTurl from job parameters since all input is copy-to-scratch') + # logger.debug('cleaning up --usePFCTurl from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --usePFCTurl', '') # if ' --directIn' in job.jobparams: - # logger.debug('cleaning up --directIn from job parameters since all input is copy-to-scratch') + # logger.debug('cleaning up --directIn from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --directIn', '') return status -def open_remote_files(indata, workdir): +def open_remote_files(indata, workdir, nthreads): """ Verify that direct i/o files can be opened. :param indata: list of FileSpec. :param workdir: working directory (string). + :param nthreads: number of concurrent file open threads (int). :return: exit code (int), diagnostics (string). 
""" - ec = 0 + exitcode = 0 diagnostics = "" not_opened = "" @@ -140,73 +186,85 @@ def open_remote_files(indata, workdir): final_script_path = os.path.join(workdir, script) os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) - d1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) - d2 = os.path.join(workdir, script_path) - full_script_path = d1 if os.path.exists(d1) else d2 + dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) + dir2 = os.path.join(workdir, script_path) + full_script_path = dir1 if os.path.exists(dir1) else dir2 if not os.path.exists(full_script_path): # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - script path does not exist: %s' % full_script_path + diagnostics = ( + 'cannot perform file open test - script path does ' + 'not exist: %s' % full_script_path + ) logger.warning(diagnostics) - logger.warning('tested both path=%s and path=%s (none exists)' % (d1, d2)) - return ec, diagnostics, not_opened + logger.warning('tested both path=%s and path=%s (none exists)', dir1, dir2) + return exitcode, diagnostics, not_opened try: copy(full_script_path, final_script_path) - except Exception as e: + except PilotException as exc: # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % e + diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % exc logger.warning(diagnostics) - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened else: # correct the path when containers have been used final_script_path = os.path.join('.', script) - _cmd = get_file_open_command(final_script_path, turls) + _cmd = get_file_open_command(final_script_path, turls, nthreads) cmd = create_root_container_command(workdir, _cmd) show_memory_usage() - logger.info('*** executing file open verification script:\n\n\'%s\'\n\n' % cmd) + logger.info('*** executing file open verification script:\n\n\'%s\'\n\n', cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) if config.Pilot.remotefileverification_log: - write_file(os.path.join(workdir, config.Pilot.remotefileverification_log), stdout + stderr, mute=False) + fpath = os.path.join(workdir, config.Pilot.remotefileverification_log) + write_file(fpath, stdout + stderr, mute=False) show_memory_usage() # error handling if exit_code: - logger.warning('script %s finished with ec=%d' % (script, exit_code)) + logger.warning('script %s finished with ec=%d', script, exit_code) else: - dictionary_path = os.path.join(workdir, config.Pilot.remotefileverification_dictionary) + dictionary_path = os.path.join( + workdir, + config.Pilot.remotefileverification_dictionary + ) if not dictionary_path: - logger.warning('file does not exist: %s' % dictionary_path) + logger.warning('file does not exist: %s', dictionary_path) else: file_dictionary = read_json(dictionary_path) if not file_dictionary: - logger.warning('could not read dictionary from %s' % dictionary_path) + logger.warning('could not read dictionary from %s', dictionary_path) else: not_opened = "" for turl in file_dictionary: opened = file_dictionary[turl] - logger.info('turl could be opened: %s' % turl) if opened else logger.info('turl could not be opened: %s' % turl) if not opened: + logger.info('turl could not be opened: %s', turl) not_opened += turl if not not_opened 
else ",%s" % turl + else: + logger.info('turl could be opened: %s', turl) + if not_opened: - ec = errors.REMOTEFILECOULDNOTBEOPENED - diagnostics = "turl not opened:%s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened + exitcode = errors.REMOTEFILECOULDNOTBEOPENED + diagnostics = "Remote file could not be opened: %s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened -def get_file_open_command(script_path, turls): +def get_file_open_command(script_path, turls, nthreads): """ :param script_path: path to script (string). + :param turls: comma-separated turls (string). + :param nthreads: number of concurrent file open threads (int). :return: comma-separated list of turls (string). """ - return "%s --turls=%s -w %s" % (script_path, turls, os.path.dirname(script_path)) + return "%s --turls=%s -w %s -t %s" % (script_path, turls, os.path.dirname(script_path), str(nthreads)) def extract_turls(indata): @@ -217,19 +275,22 @@ def extract_turls(indata): :return: comma-separated list of turls (string). """ - turls = "" - for f in indata: - if f.status == 'remote_io': - turls += f.turl if not turls else ",%s" % f.turl + # turls = "" + # for filespc in indata: + # if filespc.status == 'remote_io': + # turls += filespc.turl if not turls else ",%s" % filespc.turl + # return turls - return turls + return ",".join( + fspec.turl for fspec in indata if fspec.status == 'remote_io' + ) def process_remote_file_traces(path, job, not_opened_turls): """ Report traces for remote files. - The function reads back the base trace report (common part of all traces) and updates it per file before reporting - it to the Rucio server. + The function reads back the base trace report (common part of all traces) + and updates it per file before reporting it to the Rucio server. :param path: path to base trace report (string). :param job: job object. @@ -239,8 +300,8 @@ def process_remote_file_traces(path, job, not_opened_turls): try: base_trace_report = read_json(path) - except PilotException as e: - logger.warning('failed to open base trace report (cannot send trace reports): %s' % e) + except PilotException as exc: + logger.warning('failed to open base trace report (cannot send trace reports): %s', exc) else: if not base_trace_report: logger.warning('failed to read back base trace report (cannot send trace reports)') @@ -262,13 +323,26 @@ def process_remote_file_traces(path, job, not_opened_turls): if trace_report: trace_report.send() else: - logger.warning('failed to create trace report for turl=%s' % fspec.turl) + logger.warning('failed to create trace report for turl=%s', fspec.turl) + + +def get_nthreads(catchall): + """ + Extract number of concurrent file open threads from catchall. + Return nthreads=1 if nopenfiles=.. is not present in catchall. + + :param catchall: queuedata catchall (string). + :return: number of threads (int). + """ + + _nthreads = get_key_value(catchall, key='nopenfiles') + return _nthreads if _nthreads else 1 def get_payload_command(job): """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload, including the + sourcing of all setup files and setting of environment variables. :param job: job object. :raises PilotException: TrfDownloadFailure. 
@@ -285,52 +359,55 @@ def get_payload_command(job): # Is it a user job or not? userjob = job.is_analysis() - logger.info('pilot is running a user analysis job') if userjob else logger.info('pilot is running a production job') + logger.info('pilot is running a %s job', 'user analysis' if userjob else 'production') resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + + # Python 3, level -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) # get the general setup command and then verify it if required cmd = resource.get_setup_command(job, preparesetup) if cmd: - ec, diagnostics = resource.verify_setup_command(cmd) - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) - raise PilotException(diagnostics, code=ec) + exitcode, diagnostics = resource.verify_setup_command(cmd) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) + raise PilotException(diagnostics, code=exitcode) # make sure that remote file can be opened before executing payload catchall = job.infosys.queuedata.catchall.lower() if job.infosys.queuedata.catchall else '' if config.Pilot.remotefileverification_log and 'remoteio_test=false' not in catchall: - ec = 0 + exitcode = 0 diagnostics = "" not_opened_turls = "" try: - ec, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) - except Exception as e: - logger.warning('caught exception: %s' % e) + exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir, get_nthreads(catchall)) + except PilotException as exc: + logger.warning('caught exception: %s', exc) else: # read back the base trace report path = os.path.join(job.workdir, config.Pilot.base_trace_report) if not os.path.exists(path): - logger.warning('base trace report does not exist (%s) - input file traces should already have been sent' % path) + logger.warning(( + 'base trace report does not exist (%s) - input file ' + 'traces should already have been sent'), path) else: process_remote_file_traces(path, job, not_opened_turls) # fail the job if the remote files could not be verified - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) - raise PilotException(diagnostics, code=ec) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode, msg=diagnostics) + raise PilotException(diagnostics, code=exitcode) else: logger.debug('no remote file open verification') if is_standard_atlas_job(job.swrelease): - # Normal setup (production and user jobs) logger.info("preparing normal production/analysis job setup command") cmd = get_normal_payload_command(cmd, job, preparesetup, userjob) - - else: # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease - + else: + # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease logger.info("generic job (non-ATLAS specific or with undefined swRelease)") cmd = get_generic_payload_command(cmd, job, preparesetup, userjob) @@ -341,7 +418,8 @@ def get_payload_command(job): # only if not using a user container if not job.imagename: site = os.environ.get('PILOT_SITENAME', '') - variables = get_payload_environment_variables(cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, userjob) + variables = 
get_payload_environment_variables( + cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, userjob) cmd = ''.join(variables) + cmd # prepend PanDA job id in case it is not there already (e.g. runcontainer jobs) @@ -350,24 +428,36 @@ def get_payload_command(job): cmd = cmd.replace(';;', ';') - # For direct access in prod jobs, we need to substitute the input file names with the corresponding TURLs + # For direct access in prod jobs, we need to substitute the input file names + # with the corresponding TURLs # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) #if not userjob and use_direct_access and job.transfertype == 'direct': - if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic + + ## ported from old logic + if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic but still it looks strange (anisyonk) - ## the "PoolFileCatalog.xml" should already contains proper TURLs values as it created by create_input_file_metadata() - ## if the case is just to patch `writetofile` file, than logic should be cleaned and decoupled - ## anyway, instead of parsing the file, it's much more easy to generate properly `writetofile` content from the beginning with TURL data + ## the "PoolFileCatalog.xml" should already contains proper TURLs + ## values as it created by create_input_file_metadata() if the case + ## is just to patch `writetofile` file, than logic should be cleaned + ## and decoupled anyway, instead of parsing the file, it's much easier + ## to generate properly `writetofile` content from the beginning + ## with TURL data lfns = job.get_lfns_and_guids()[0] - cmd = replace_lfns_with_turls(cmd, job.workdir, "PoolFileCatalog.xml", lfns, writetofile=job.writetofile) + cmd = replace_lfns_with_turls( + cmd, + job.workdir, + "PoolFileCatalog.xml", + lfns, + writetofile=job.writetofile + ) # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) show_memory_usage() - logger.info('payload run command: %s' % cmd) + logger.info('payload run command: %s', cmd) return cmd @@ -379,27 +469,30 @@ def get_normal_payload_command(cmd, job, preparesetup, userjob): :param cmd: any preliminary command setup (string). :param job: job object. :param userjob: True for user analysis jobs, False otherwise (bool). - :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters. + :param preparesetup: True if the pilot should prepare the setup, + False if already in the job parameters. :return: normal payload command (string). 
""" - # set the INDS env variable (used by runAthena but also for EventIndex production jobs) + # set the INDS env variable + # (used by runAthena but also for EventIndex production jobs) set_inds(job.datasetin) # realDatasetsIn if userjob: # Try to download the trf (skip when user container is to be used) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) + # Correct for multi-core if necessary (especially important in + # case coreCount=1 to limit parallel make) cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: # Add Database commands if they are set by the local site @@ -437,19 +530,19 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): #if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") # logger.warning('overwrote job.transformation, now set to: %s' % job.transformation) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) - # only if not using a user container + # correct for multi-core if necessary (especially important in case + # coreCount=1 to limit parallel make), only if not using a user container if not job.imagename: cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: @@ -471,7 +564,8 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): def add_athena_proc_number(cmd): """ - Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to the payload command if necessary. + Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to + the payload command if necessary. :param cmd: payload execution command (string). :return: updated payload execution command (string). 
@@ -480,13 +574,13 @@ def add_athena_proc_number(cmd): # get the values if they exist try: value1 = int(os.environ['ATHENA_PROC_NUMBER_JOB']) - except Exception as e: - logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s', exc) value1 = None try: value2 = int(os.environ['ATHENA_CORE_NUMBER']) - except Exception as e: - logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s', exc) value2 = None if "ATHENA_PROC_NUMBER" not in cmd: @@ -496,9 +590,13 @@ def add_athena_proc_number(cmd): if value1 > 1: cmd = 'export ATHENA_PROC_NUMBER=%d;' % value1 + cmd else: - logger.info("will not add ATHENA_PROC_NUMBER to cmd since the value is %s" % str(value1)) + logger.info(( + "will not add ATHENA_PROC_NUMBER to cmd " + "since the value is %s"), str(value1)) else: - logger.warning("don't know how to set ATHENA_PROC_NUMBER (could not find it in os.environ)") + logger.warning(( + "don't know how to set ATHENA_PROC_NUMBER " + "(could not find it in os.environ)")) else: logger.info("ATHENA_PROC_NUMBER already in job command") @@ -506,9 +604,11 @@ def add_athena_proc_number(cmd): if value2 > 1: cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd else: - logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s" % str(value2)) + logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s", str(value2)) else: - logger.warning('there is no ATHENA_CORE_NUMBER in os.environ (cannot add it to payload command)') + logger.warning(( + 'there is no ATHENA_CORE_NUMBER in os.environ ' + '(cannot add it to payload command)')) return cmd @@ -534,7 +634,8 @@ def verify_release_string(release): def add_makeflags(job_core_count, cmd): """ - Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make). + Correct for multi-core if necessary (especially important in + case coreCount=1 to limit parallel make). :param job_core_count: core count from the job definition (int). :param cmd: payload execution command (string). @@ -544,16 +645,18 @@ def add_makeflags(job_core_count, cmd): # ATHENA_PROC_NUMBER is set in Node.py using the schedconfig value try: core_count = int(os.environ.get('ATHENA_PROC_NUMBER')) - except Exception: + except (TypeError, ValueError): core_count = -1 + if core_count == -1: try: core_count = int(job_core_count) - except Exception: + except (TypeError, ValueError): pass else: if core_count >= 1: - # Note: the original request (AF) was to use j%d and not -j%d, now using the latter + # Note: the original request (AF) was to use j%d + # and not -j%d, now using the latter cmd += "export MAKEFLAGS=\'-j%d QUICK=1 -l1\';" % (core_count) # make sure that MAKEFLAGS is always set @@ -567,10 +670,12 @@ def get_analysis_run_command(job, trf_name): """ Return the proper run command for the user job. - Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn + Example output: + export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. + :param trf_name: name of the transform that will run the job (string). + Used when containers are not used. :return: command (string). 
""" @@ -579,7 +684,8 @@ def get_analysis_run_command(job, trf_name): # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) # check if the input files are to be accessed locally (ie if prodDBlockToken is set to local) - #if job.is_local(): ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + #if job.is_local(): # logger.debug('switched off direct access for local prodDBlockToken') # use_direct_access = False # use_pfc_turl = False @@ -601,12 +707,12 @@ def get_analysis_run_command(job, trf_name): # check if image is on disk as defined by envar PAYLOAD_CONTAINER_LOCATION payload_container_location = os.environ.get('PAYLOAD_CONTAINER_LOCATION') if payload_container_location is not None: - logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s" % payload_container_location) + logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s", payload_container_location) # get container name containername = imagename.rsplit('/')[-1] image_location = os.path.join(payload_container_location, containername) if os.path.exists(image_location): - logger.debug("image exists at %s" % image_location) + logger.debug("image exists at %s", image_location) imagename = image_location # restore the image name if necessary @@ -621,15 +727,19 @@ def get_analysis_run_command(job, trf_name): # cmd += ' --directIn' if job.has_remoteio(): - logger.debug('direct access (remoteio) is used to access some input files: --usePFCTurl and --directIn will be added to payload command') + logger.debug(( + 'direct access (remoteio) is used to access some input files: ' + '--usePFCTurl and --directIn will be added to payload command')) if '--usePFCTurl' not in cmd: cmd += ' --usePFCTurl' if '--directIn' not in cmd: cmd += ' --directIn' # update the payload command for forced accessmode - ## -- REDUNDANT logic, since it should be done from the beginning at the step of FileSpec initialization (anisyonk) - #cmd = update_forced_accessmode(log, cmd, job.transfertype, job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) + ## -- REDUNDANT logic, since it should be done from the beginning at + ## the step of FileSpec initialization (anisyonk) + #cmd = update_forced_accessmode(log, cmd, job.transfertype, + # job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) # add guids when needed # get the correct guids list (with only the direct access files) @@ -644,16 +754,19 @@ def get_analysis_run_command(job, trf_name): return cmd -## SHOULD NOT BE USED since payload cmd should be properly generated from the beginning (consider final directio settings) (anisyonk) -def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## DEPRECATE ME (anisyonk) +## SHOULD NOT BE USED since payload cmd should be properly generated +## from the beginning (consider final directio settings) (anisyonk) +## DEPRECATE ME (anisyonk) +def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): """ Update the payload command for forced accessmode. - accessmode is an option that comes from HammerCloud and is used to force a certain input file access mode; i.e. - copy-to-scratch or direct access. + accessmode is an option that comes from HammerCloud and is used to + force a certain input file access mode; i.e. copy-to-scratch or direct access. :param log: logging object. :param cmd: payload command. 
- :param transfertype: transfer type (.e.g 'direct') from the job definition with priority over accessmode (string). + :param transfertype: transfer type (.e.g 'direct') from the job + definition with priority over accessmode (string). :param jobparams: job parameters (string). :param trf_name: transformation name (string). :return: updated payload command string. @@ -669,7 +782,7 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D for _mode in list(_accessmode_dic.keys()): # Python 2/3 if _mode in jobparams: # any accessmode set in jobPars should overrule schedconfig - logger.info("enforcing %s" % _accessmode_dic[_mode][0]) + logger.info("enforcing %s", _accessmode_dic[_mode][0]) if _mode == "--accessmode=copy": # make sure direct access is turned off accessmode_usect = True @@ -709,7 +822,8 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D cmd = cmd.replace("./%s" % trf_name, "export X509_USER_PROXY=%s;./%s" % (os.environ.get('X509_USER_PROXY'), trf_name)) - # if both direct access and the accessmode loop added a directIn switch, remove the first one from the string + # if both direct access and the accessmode loop added a + # directIn switch, remove the first one from the string if cmd.count("directIn") > 1: cmd = cmd.replace(' --directIn', ' ', 1) @@ -721,8 +835,10 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): Extract the correct guid from the input file list. The guids list is used for direct reading. 1. extract input file list for direct reading from job parameters - 2. for each input file in this list, find the corresponding guid from the input file guid list - Since the job parameters string is entered by a human, the order of the input files might not be the same. + 2. for each input file in this list, find the corresponding guid from + the input file guid list. + Since the job parameters string is entered by a human, the order of + the input files might not be the same. :param jobparams: job parameters. :param infiles: input file list. @@ -750,22 +866,23 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): tail = match.group(3) body = match.group(2).split(',') attr = match.group(4).split(',') - for idx in range(len(body)): - lfn = '%s%s%s%s' % (head, body[idx], tail, attr[idx]) + + for idx, item in enumerate(body): + lfn = '%s%s%s%s' % (head, item, tail, attr[idx]) infiles.append(lfn) else: infiles = [compactinfiles] - if _infiles != []: - for infile in _infiles: - # get the corresponding index from the inputFiles list, which has the same order as infilesguids - try: - index = infiles.index(infile) - except Exception as e: - logger.warning("exception caught: %s (direct reading will fail)" % e) - else: - # add the corresponding guid to the list - guidlist.append(infilesguids[index]) + for infile in _infiles: + # get the corresponding index from the inputFiles list, + # which has the same order as infilesguids + try: + index = infiles.index(infile) + except ValueError as exc: + logger.warning("exception caught: %s (direct reading will fail)", exc) + else: + # add the corresponding guid to the list + guidlist.append(infilesguids[index]) return guidlist @@ -775,7 +892,8 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) Return information about desired file transfer. :param job: job object - :return: use copy tool (boolean), use direct access (boolean), use PFC Turl (boolean). 
+ :return: use copy tool (boolean), use direct access (boolean), + use PFC Turl (boolean). """ use_copy_tool = True @@ -783,10 +901,14 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) use_pfc_turl = False # check with schedconfig - if (job.infosys.queuedata.direct_access_lan or job.infosys.queuedata.direct_access_wan or job.transfertype == 'direct') and not job.is_build_job(): + is_lan = job.infosys.queuedata.direct_access_lan + is_wan = job.infosys.queuedata.direct_access_wan + if not job.is_build_job() and (is_lan or is_wan or job.transfertype == 'direct'): # override if all input files are copy-to-scratch if job.only_copy_to_scratch(): - logger.info('all input files are copy-to-scratch (--usePFCTurl and --directIn will not be set)') + logger.info(( + 'all input files are copy-to-scratch ' + '(--usePFCTurl and --directIn will not be set)')) else: logger.debug('--usePFCTurl and --directIn will be set') use_copy_tool = False @@ -799,17 +921,19 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) def update_job_data(job): """ This function can be used to update/add data to the job object. - E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information - is extracted from the metadata field and added to other job object fields. + E.g. user specific information can be extracted from other job object fields. + In the case of ATLAS, information is extracted from the metadata field and + added to other job object fields. :param job: job object :return: """ ## comment from Alexey: - ## it would be better to reallocate this logic (as well as parse metadata values)directly to Job object - ## since in general it's Job related part - ## later on once we introduce VO specific Job class (inherited from JobData) this can be easily customized + ## it would be better to reallocate this logic (as well as parse + ## metadata values)directly to Job object since in general it's Job + ## related part. 
Later on once we introduce VO specific Job class + ## (inherited from JobData) this can be easily customized # get label "all" or "log" stageout = get_stageout_label(job) @@ -817,7 +941,7 @@ def update_job_data(job): if 'exeErrorDiag' in job.metadata: job.exeerrordiag = job.metadata['exeErrorDiag'] if job.exeerrordiag: - logger.warning('payload failed: exeErrorDiag=%s' % job.exeerrordiag) + logger.warning('payload failed: exeErrorDiag=%s', job.exeerrordiag) # determine what should be staged out job.stageout = stageout # output and log file or only log file @@ -825,37 +949,47 @@ def update_job_data(job): work_attributes = None try: work_attributes = parse_jobreport_data(job.metadata) - except Exception as e: - logger.warning('failed to parse job report (cannot set job.nevents): %s' % e) + except Exception as exc: + logger.warning('failed to parse job report (cannot set job.nevents): %s', exc) else: - # note: the number of events can be set already at this point if the value was extracted from the job report - # (a more thorough search for this value is done later unless it was set here) + # note: the number of events can be set already at this point + # if the value was extracted from the job report (a more thorough + # search for this value is done later unless it was set here) nevents = work_attributes.get('nEvents', 0) if nevents: job.nevents = nevents - # extract output files from the job report if required, in case the trf has created additional (overflow) files - # also make sure all guids are assigned (use job report value if present, otherwise generate the guid) + # extract output files from the job report if required, in case the trf + # has created additional (overflow) files. Also make sure all guids are + # assigned (use job report value if present, otherwise generate the guid) if job.metadata and not job.is_eventservice: - extract_output_file_guids(job) # keep this for now, complicated to merge with verify_output_files? + # keep this for now, complicated to merge with verify_output_files? + extract_output_file_guids(job) try: verify_output_files(job) - except Exception as e: - logger.warning('exception caught while trying verify output files: %s' % e) + except Exception as exc: + logger.warning('exception caught while trying verify output files: %s', exc) else: if not job.allownooutput: # i.e. 
if it's an empty list/string, do nothing - logger.debug("will not try to extract output files from jobReport for user job (and allowNoOut list is empty)") + logger.debug(( + "will not try to extract output files from jobReport " + "for user job (and allowNoOut list is empty)")) else: # remove the files listed in allowNoOutput if they don't exist remove_no_output_files(job) ## validate output data (to be moved into the JobData) - ## warning: do no execute this code unless guid lookup in job report has failed - pilot should only generate guids + ## warning: do no execute this code unless guid lookup in job report + # has failed - pilot should only generate guids ## if they are not present in job report for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning( + 'guid not set: generated guid=%s for lfn=%s', + dat.guid, + dat.lfn + ) def get_stageout_label(job): @@ -878,7 +1012,7 @@ def get_stageout_label(job): if job.exeerrorcode == 0: stageout = "all" else: - logger.info('payload failed: exeErrorCode=%d' % job.exeerrorcode) + logger.info('payload failed: exeErrorCode=%d', job.exeerrorcode) stageout = "log" return stageout @@ -894,11 +1028,13 @@ def update_output_for_hpo(job): try: new_outdata = discover_new_outdata(job) - except Exception as e: - logger.warning('exception caught while discovering new outdata: %s' % e) + except Exception as exc: + logger.warning('exception caught while discovering new outdata: %s', exc) else: if new_outdata: - logger.info('replacing job outdata with discovered output (%d file(s))' % len(new_outdata)) + logger.info(( + 'replacing job outdata with discovered output ' + '(%d file(s))'), len(new_outdata)) job.outdata = new_outdata @@ -918,12 +1054,22 @@ def discover_new_outdata(job): if new_output: # create new FileSpec objects out of the new output for outfile in new_output: - # note: guid will be taken from job report after this function has been called - files = [{'scope': outdata_file.scope, 'lfn': outfile, 'workdir': job.workdir, - 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, - 'ddmendpoint_alt': None, 'filesize': new_output[outfile]['filesize'], - 'checksum': new_output[outfile]['checksum'], 'guid': ''}] - # do not abbreviate the following two lines as otherwise the content of xfiles will be a list of generator objects + # note: guid will be taken from job report + # after this function has been called + files = [{ + 'scope': outdata_file.scope, + 'lfn': outfile, + 'workdir': job.workdir, + 'dataset': outdata_file.dataset, + 'ddmendpoint': outdata_file.ddmendpoint, + 'ddmendpoint_alt': None, + 'filesize': new_output[outfile]['filesize'], + 'checksum': new_output[outfile]['checksum'], + 'guid': '' + }] + + # do not abbreviate the following two lines as otherwise + # the content of xfiles will be a list of generator objects _xfiles = [FileSpec(type='output', **f) for f in files] new_outdata += _xfiles @@ -958,29 +1104,43 @@ def discover_new_output(name_pattern, workdir): if filesize and checksum: new_output[lfn] = {'path': path, 'filesize': filesize, 'checksum': checksum} else: - logger.warning('failed to create file info (filesize=%d, checksum=%s) for lfn=%s' % - (filesize, checksum, lfn)) + logger.warning( + 'failed to create file info (filesize=%d, checksum=%s) for lfn=%s', + filesize, + checksum, + lfn + ) + return new_output def extract_output_file_guids(job): """ - Extract output file info from the job report and make 
sure all guids are assigned (use job report value if present, - otherwise generate the guid - note: guid generation is done later, not in this function since this function - might not be called if metadata info is not found prior to the call). + Extract output file info from the job report and make sure all guids\ + are assigned (use job report value if present, otherwise generate the guid.\ + Note: guid generation is done later, not in this function since + this function might not be called if metadata info is not found prior + to the call). :param job: job object. :return: """ - # make sure there is a defined output file list in the job report - unless it is allowed by task parameter allowNoOutput + # make sure there is a defined output file list in the job report - + # unless it is allowed by task parameter allowNoOutput if not job.allownooutput: output = job.metadata.get('files', {}).get('output', []) if output: - logger.info('verified that job report contains metadata for %d file(s)' % len(output)) + logger.info(( + 'verified that job report contains metadata ' + 'for %d file(s)'), len(output)) else: - logger.warning('job report contains no output files and allowNoOutput is not set') #- will fail job since allowNoOutput is not set') - #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOOUTPUTINJOBREPORT) + #- will fail job since allowNoOutput is not set') + logger.warning(( + 'job report contains no output ' + 'files and allowNoOutput is not set')) + #job.piloterrorcodes, job.piloterrordiags = + # errors.add_error_code(errors.NOOUTPUTINJOBREPORT) return # extract info from metadata (job report JSON) @@ -991,20 +1151,27 @@ def extract_output_file_guids(job): lfn = fdat['name'] # verify the guid if the lfn is known - # only extra guid if the file is known by the job definition (March 18 change, v 2.5.2) + # only extra guid if the file is known by the + # job definition (March 18 change, v 2.5.2) if lfn in data: data[lfn].guid = fdat['file_guid'] - logger.info('set guid=%s for lfn=%s (value taken from job report)' % (data[lfn].guid, lfn)) + logger.info(( + 'set guid=%s for lfn=%s ' + '(value taken from job report)'), data[lfn].guid, lfn) else: # found new entry - logger.warning('pilot no longer considers output files not mentioned in job definition (lfn=%s)' % lfn) + logger.warning(( + 'pilot no longer considers output files not mentioned ' + 'in job definition (lfn=%s)'), lfn) continue #if job.outdata: # kw = {'lfn': lfn, - # 'scope': job.outdata[0].scope, ## take value from 1st output file? + # . # take value from 1st output file? + # 'scope': job.outdata[0].scope, # 'guid': fdat['file_guid'], # 'filesize': fdat['file_size'], - # 'dataset': dat.get('dataset') or job.outdata[0].dataset ## take value from 1st output file? + # # take value from 1st output file? 
+ # 'dataset': dat.get('dataset') or job.outdata[0].dataset # } # spec = FileSpec(filetype='output', **kw) # extra.append(spec) @@ -1013,25 +1180,28 @@ def extract_output_file_guids(job): for fspec in job.outdata: if fspec.guid != data[fspec.lfn].guid: fspec.guid = data[fspec.lfn].guid - logger.debug('reset guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('reset guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: if fspec.guid: - logger.debug('verified guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('verified guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: - logger.warning('guid not set for lfn=%s' % fspec.lfn) + logger.warning('guid not set for lfn=%s', fspec.lfn) #if extra: - #logger.info('found extra output files in job report, will overwrite output file list: extra=%s' % extra) + #logger.info('found extra output files in job report, + # will overwrite output file list: extra=%s' % extra) #job.outdata = extra def verify_output_files(job): """ - Make sure that the known output files from the job definition are listed in the job report and number of processed events - is greater than zero. If the output file is not listed in the job report, then if the file is listed in allowNoOutput - remove it from stage-out, otherwise fail the job. + Make sure that the known output files from the job definition are listed + in the job report and number of processed events is greater than zero. + If the output file is not listed in the job report, then if the file is + listed in allowNoOutput remove it from stage-out, otherwise fail the job. - Note from Rod: fail scenario: The output file is not in output:[] or is there with zero events. Then if allownooutput is not - set - fail the job. If it is set, then do not store the output, and finish ok. + Note from Rod: fail scenario: The output file is not in output:[] or is + there with zero events. Then if allownooutput is not set - fail the job. + If it is set, then do not store the output, and finish ok. :param job: job object. :return: Boolean (and potentially updated job.outdata list) @@ -1048,38 +1218,50 @@ def verify_output_files(job): return True # get list of output files from job report - # (if None is returned, it means the job report is from an old release and does not contain an output list) + # (if None is returned, it means the job report is from an old release + # and does not contain an output list) output = job.metadata.get('files', {}).get('output', None) if not output and output is not None: # ie empty list, output=[] - are all known output files in allowNoOutput? 
- logger.warning('encountered an empty output file list in job report, consulting allowNoOutput list') + logger.warning(( + 'encountered an empty output file list in job report, ' + 'consulting allowNoOutput list')) failed = False for lfn in lfns_jobdef: if lfn not in job.allownooutput: if job.is_analysis(): - logger.warning('lfn %s is not in allowNoOutput list - ignore for user job' % lfn) + logger.warning(( + 'lfn %s is not in allowNoOutput list - ' + 'ignore for user job'), + lfn + ) else: failed = True - logger.warning('lfn %s is not in allowNoOutput list - job will fail' % lfn) + logger.warning( + 'lfn %s is not in allowNoOutput list - job will fail', + lfn + ) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) break else: - logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out' % lfn) + logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out', lfn) remove_from_stageout(lfn, job) elif output is None: # ie job report is ancient / output could not be extracted - logger.warning('output file list could not be extracted from job report (nothing to verify)') + logger.warning(( + 'output file list could not be extracted from job report ' + '(nothing to verify)')) else: verified, nevents = verify_extracted_output_files(output, lfns_jobdef, job) - failed = True if not verified else False + failed = (not verified) if nevents > 0 and not failed and job.nevents == 0: job.nevents = nevents - logger.info('number of events from summed up output files: %d' % nevents) + logger.info('number of events from summed up output files: %d', nevents) else: - logger.info('number of events previously set to %d' % job.nevents) + logger.info('number of events previously set to %d', job.nevents) - status = True if not failed else False + status = (not failed) if status: logger.info('output file verification succeeded') @@ -1103,7 +1285,9 @@ def verify_extracted_output_files(output, lfns_jobdef, job): failed = False nevents = 0 output_jobrep = {} # {lfn: nentries, ..} - logger.debug('extracted output file list from job report - make sure all known output files are listed') + logger.debug(( + 'extracted output file list from job report - ' + 'make sure all known output files are listed')) # first collect the output files from the job report for dat in output: @@ -1118,45 +1302,68 @@ def verify_extracted_output_files(output, lfns_jobdef, job): for lfn in lfns_jobdef: if lfn not in output_jobrep and lfn not in job.allownooutput: if job.is_analysis(): - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput'), lfn) else: - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput - job will fail' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput - ' + 'job will fail'), lfn) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break + if lfn not in output_jobrep and lfn in job.allownooutput: - logger.warning( - 'output file %s from job definition is not present in job report but is listed in allowNoOutput - remove from stage-out' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report but is listed in 
allowNoOutput - ' + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": - logger.warning('encountered file with nentries=UNDEFINED - will ignore %s' % lfn) - continue - elif nentries is None and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, but has no events and is not listed in allowNoOutput - will ignore' % lfn) - continue - elif nentries is None and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, nentries is None and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) - elif type(nentries) is int and nentries == 0 and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is not listed in allowNoOutput - will ignore' % lfn) - elif type(nentries) is int and nentries == 0 and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) + logger.warning(( + 'encountered file with nentries=UNDEFINED - ' + 'will ignore %s'), lfn) + + elif nentries is None: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'but has no events and is not listed in ' + 'allowNoOutput - will ignore'), lfn) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'nentries is None and is listed in allowNoOutput - ' + 'remove from stage-out'), lfn) + remove_from_stageout(lfn, job) + + elif nentries == 0: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is not listed in ' + 'allowNoOutput - will ignore'), lfn) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is listed in allowNoOutput - ' + 'remove from stage-out'), lfn) + remove_from_stageout(lfn, job) + elif type(nentries) is int and nentries: - logger.info('output file %s has %d event(s)' % (lfn, nentries)) + logger.info('output file %s has %d event(s)', lfn, nentries) nevents += nentries else: # should not reach this step - logger.warning('case not handled for output file %s with %s event(s) (ignore)' % (lfn, str(nentries))) + logger.warning(( + 'case not handled for output file %s with %s event(s) ' + '(ignore)'), lfn, str(nentries)) - status = False if failed else True + status = (not failed) return status, nevents @@ -1172,7 +1379,7 @@ def remove_from_stageout(lfn, job): outdata = [] for fspec in job.outdata: if fspec.lfn == lfn: - logger.info('removing %s from stage-out list' % lfn) + logger.info('removing %s from stage-out list', lfn) else: outdata.append(fspec) job.outdata = outdata @@ -1180,7 +1387,8 @@ def remove_from_stageout(lfn, job): def remove_no_output_files(job): """ - Remove files from output file list if they are listed in allowNoOutput and do not exist. + Remove files from output file list if they are listed in + allowNoOutput and do not exist. :param job: job object. 
:return: @@ -1194,15 +1402,22 @@ def remove_no_output_files(job): if filename in job.allownooutput: if os.path.exists(path): - logger.info("file %s is listed in allowNoOutput but exists (will not be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput but exists " + "(will not be removed from list of files to be " + "staged-out)"), filename) _outfiles.append(filename) else: - logger.info("file %s is listed in allowNoOutput and does not exist (will be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput and does not exist " + "(will be removed from list of files to be staged-out)"), filename) else: if os.path.exists(path): - logger.info("file %s is not listed in allowNoOutput (will be staged-out)" % filename) + logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) else: - logger.warning("file %s is not listed in allowNoOutput and does not exist (job will fail)" % filename) + logger.warning(( + "file %s is not listed in allowNoOutput and " + "does not exist (job will fail)"), filename) _outfiles.append(filename) # now remove the unwanted fspecs @@ -1223,12 +1438,15 @@ def get_outfiles_records(subfiles): """ res = {} - for f in subfiles: - res[f['name']] = {'guid': f['file_guid'], - 'size': f['file_size']} - nentries = f.get('nentries', 'UNDEFINED') + for subfile in subfiles: + res[subfile['name']] = { + 'guid': subfile['file_guid'], + 'size': subfile['file_size'] + } + + nentries = subfile.get('nentries', 'UNDEFINED') if type(nentries) == int: - res[f['name']]['nentries'] = nentries + res[subfile['name']]['nentries'] = nentries else: logger.warning("nentries is undefined in job report") @@ -1241,17 +1459,18 @@ def get(self, path, dst_dict, dst_key): if len(keys) == 0: return last_key = keys.pop() - v = self + me_ = self for key in keys: - if key in v and isinstance(v[key], dict): - v = v[key] - else: + if not (key in me_ and isinstance(me_[key], dict)): return - if last_key in v: - dst_dict[dst_key] = v[last_key] + me_ = me_[key] -def parse_jobreport_data(job_report): + if last_key in me_: + dst_dict[dst_key] = me_[last_key] + + +def parse_jobreport_data(job_report): # noqa: C901 """ Parse a job report and extract relevant fields. 
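# --- Illustrative sketch (assumed, standalone; not taken from the patch) ---
# A small example of the slash-separated lookup that DictQuery.get() performs
# on the job report dictionary: every intermediate key must exist and be a
# dictionary, and the destination key is only written when the full path
# resolves. The report fragment below is a made-up example.
def dict_get(src, path, dst, dst_key):
    keys = path.split('/')
    last = keys.pop()
    node = src
    for key in keys:
        if not (key in node and isinstance(node[key], dict)):
            return
        node = node[key]
    if last in node:
        dst[dst_key] = node[last]

report = {'resource': {'transform': {'processedEvents': 2500}}}
attrs = {}
dict_get(report, 'resource/transform/processedEvents', attrs, 'nEvents')
dict_get(report, 'resource/dbTimeTotal', attrs, 'dbTime')  # path missing, silently skipped
assert attrs == {'nEvents': 2500}
# --- end of sketch ---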
@@ -1271,25 +1490,25 @@ def parse_jobreport_data(job_report): work_attributes["outputfiles"] = [] if "ATHENA_PROC_NUMBER" in os.environ: - logger.debug("ATHENA_PROC_NUMBER: {0}".format(os.environ["ATHENA_PROC_NUMBER"])) + logger.debug("ATHENA_PROC_NUMBER: %s", os.environ["ATHENA_PROC_NUMBER"]) work_attributes['core_count'] = int(os.environ["ATHENA_PROC_NUMBER"]) core_count = int(os.environ["ATHENA_PROC_NUMBER"]) - dq = DictQuery(job_report) - dq.get("resource/transform/processedEvents", work_attributes, "nEvents") - dq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") - dq.get("resource/machine/node", work_attributes, "node") - dq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") - dq.get("resource/dbTimeTotal", work_attributes, "dbTime") - dq.get("resource/dbDataTotal", work_attributes, "dbData") - dq.get("exitCode", work_attributes, "transExitCode") - dq.get("exitMsg", work_attributes, "exeErrorDiag") - dq.get("files/input", work_attributes, "inputfiles") - dq.get("files/output", work_attributes, "outputfiles") + dictq = DictQuery(job_report) + dictq.get("resource/transform/processedEvents", work_attributes, "nEvents") + dictq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") + dictq.get("resource/machine/node", work_attributes, "node") + dictq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") + dictq.get("resource/dbTimeTotal", work_attributes, "dbTime") + dictq.get("resource/dbDataTotal", work_attributes, "dbData") + dictq.get("exitCode", work_attributes, "transExitCode") + dictq.get("exitMsg", work_attributes, "exeErrorDiag") + dictq.get("files/input", work_attributes, "inputfiles") + dictq.get("files/output", work_attributes, "outputfiles") outputfiles_dict = {} - for of in work_attributes['outputfiles']: - outputfiles_dict.update(get_outfiles_records(of['subFiles'])) + for opf in work_attributes['outputfiles']: + outputfiles_dict.update(get_outfiles_records(opf['subFiles'])) work_attributes['outputfiles'] = outputfiles_dict if work_attributes['inputfiles']: @@ -1302,20 +1521,14 @@ def parse_jobreport_data(job_report): if 'resource' in job_report and 'executor' in job_report['resource']: j = job_report['resource']['executor'] - exc_report = [] + fin_report = defaultdict(int) - try: - _tmplist = filter(lambda d: 'memory' in d and ('Max' or 'Avg' in d['memory']), j.itervalues()) # Python 2 - except Exception: - _tmplist = [d for d in iter(list(j.values())) if - 'memory' in d and ('Max' or 'Avg' in d['memory'])] # Python 3 - for v in _tmplist: - if 'Avg' in v['memory']: - exc_report.extend(list(v['memory']['Avg'].items())) # Python 2/3 - if 'Max' in v['memory']: - exc_report.extend(list(v['memory']['Max'].items())) # Python 2/3 - for x in exc_report: - fin_report[x[0]] += x[1] + for value in j.values(): + mem = value.get('memory', {}) + for key in ('Avg', 'Max'): + for subk, subv in mem.get(key, {}).items(): + fin_report[subk] += subv + work_attributes.update(fin_report) workdir_size = get_workdir_size() @@ -1325,8 +1538,8 @@ def parse_jobreport_data(job_report): work_attributes["dbTime"], work_attributes["dbData"], workdir_size) - del(work_attributes["dbData"]) - del(work_attributes["dbTime"]) + del work_attributes["dbData"] + del work_attributes["dbTime"] return work_attributes @@ -1337,9 +1550,9 @@ def get_workdir_size(): :return: """ - c, o, e = execute('du -s', shell=True) - if o is not None: - return o.split()[0] + _, stdout, _ = execute('du -s', shell=True) + if stdout is not 
None: + return stdout.split()[0] return None @@ -1366,60 +1579,24 @@ def get_executor_dictionary(jobreport_dictionary): return executor_dictionary -def get_number_of_events_deprecated(jobreport_dictionary): # TODO: remove this function - """ - Extract the number of events from the job report. - - :param jobreport_dictionary: - :return: - """ - - nevents = {} # FORMAT: { format : total_events, .. } - nmax = 0 - - executor_dictionary = get_executor_dictionary(jobreport_dictionary) - if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'nevents' in executor_dictionary[format]: - if format in nevents: - nevents[format] += executor_dictionary[format]['nevents'] - else: - nevents[format] = executor_dictionary[format]['nevents'] - else: - logger.warning("format %s has no such key: nevents" % (format)) - - # Now find the largest number of events among the different formats - if nevents != {}: - try: - nmax = max(nevents.values()) - except Exception as e: - logger.warning("exception caught: %s" % (e)) - nmax = 0 - else: - logger.warning("did not find the number of events in the job report") - nmax = 0 - - return nmax - - def get_resimevents(jobreport_dictionary): """ Extract and add up the resimevents from the job report. This information is reported with the jobMetrics. :param jobreport_dictionary: job report dictionary. - :return: resimevents (int) + :return: resimevents (int or None) """ - resimevents = 0 + resimevents = None executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "ReSim", Python 2/3 - if 'resimevents' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "ReSim", Python 2/3 + if 'resimevents' in executor_dictionary[fmt]: try: - resimevents = int(executor_dictionary[format]['resimevents']) - except Exception: + resimevents = int(executor_dictionary[fmt]['resimevents']) + except (KeyError, ValueError, TypeError): pass else: break @@ -1431,8 +1608,9 @@ def get_db_info(jobreport_dictionary): """ Extract and add up the DB info from the job report. This information is reported with the jobMetrics. - Note: this function adds up the different dbData and dbTime's in the different executor steps. In modern job - reports this might have been done already by the transform and stored in dbDataTotal and dbTimeTotal. + Note: this function adds up the different dbData and dbTime's in + the different executor steps. In modern job reports this might have + been done already by the transform and stored in dbDataTotal and dbTimeTotal. :param jobreport_dictionary: job report dictionary. 
:return: db_time (int), db_data (long) @@ -1441,26 +1619,26 @@ def get_db_info(jobreport_dictionary): db_time = 0 try: db_data = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: db_data = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'dbData' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + if 'dbData' in executor_dictionary[fmt]: try: - db_data += executor_dictionary[format]['dbData'] + db_data += executor_dictionary[fmt]['dbData'] except Exception: pass else: - logger.warning("format %s has no such key: dbData" % format) - if 'dbTime' in executor_dictionary[format]: + logger.warning("format %s has no such key: dbData", fmt) + if 'dbTime' in executor_dictionary[fmt]: try: - db_time += executor_dictionary[format]['dbTime'] + db_time += executor_dictionary[fmt]['dbTime'] except Exception: pass else: - logger.warning("format %s has no such key: dbTime" % format) + logger.warning("format %s has no such key: dbTime", fmt) return db_time, db_data @@ -1477,17 +1655,16 @@ def get_db_info_str(db_time, db_data): try: zero = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: zero = 0 # Python 3 + db_data_s = "" if db_data != zero: db_data_s = "%s" % (db_data) - else: - db_data_s = "" + + db_time_s = "" if db_time != 0: db_time_s = "%.2f" % (db_time) - else: - db_time_s = "" return db_time_s, db_data_s @@ -1500,24 +1677,24 @@ def get_cpu_times(jobreport_dictionary): Note: this function is used with Event Service jobs :param jobreport_dictionary: - :return: cpu_conversion_unit (unit), total_cpu_time, conversion_factor (output consistent with set_time_consumed()) + :return: cpu_conversion_unit (unit), total_cpu_time, + conversion_factor (output consistent with set_time_consumed()) """ try: total_cpu_time = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: total_cpu_time = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'cpuTime' in executor_dictionary[format]: - try: - total_cpu_time += executor_dictionary[format]['cpuTime'] - except Exception: - pass - else: - logger.warning("format %s has no such key: cpuTime" % (format)) + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + try: + total_cpu_time += executor_dictionary[fmt]['cpuTime'] + except KeyError: + logger.warning("format %s has no such key: cpuTime", fmt) + except Exception: + pass conversion_factor = 1.0 cpu_conversion_unit = "s" @@ -1546,43 +1723,52 @@ def cleanup_looping_payload(workdir): :return: """ - for (p, d, f) in os.walk(workdir): - for filename in f: + for (root, _, files) in os.walk(workdir): + for filename in files: if 'pool.root' in filename: - path = os.path.join(p, filename) + path = os.path.join(root, filename) path = os.path.abspath(path) remove(path) -def cleanup_payload(workdir, outputfiles=[]): +def cleanup_payload(workdir, outputfiles=None, removecores=True): """ Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. Also remove core dumps. - :param workdir: working directory (string) - :param outputfiles: list of output files + :param workdir: working directory (string). + :param outputfiles: list of output files. 
+ :param removecores: remove core files if True (Boolean). :return: """ - remove_core_dumps(workdir) + if outputfiles is None: + outputfiles = [] + + if removecores: + remove_core_dumps(workdir) for ampdir in glob('%s/athenaMP-workers-*' % workdir): - for (p, d, f) in os.walk(ampdir): - for filename in f: - if 'core' in filename or 'pool.root' in filename or 'tmp.' in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) + for (root, _, files) in os.walk(ampdir): + for filename in files: + path = os.path.abspath(os.path.join(root, filename)) + + core_file = ('core' in filename and removecores) + pool_root_file = 'pool.root' in filename + tmp_file = 'tmp.' in filename + + if core_file or pool_root_file or tmp_file: remove(path) + for outfile in outputfiles: if outfile in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) remove(path) def get_redundant_path(): """ - Return the path to the file containing the redundant files and directories to be removed prior to log file creation. + Return the path to the file containing the redundant files + and directories to be removed prior to log file creation. :return: file path (string). """ @@ -1599,20 +1785,26 @@ def get_redundant_path(): def get_redundants(): """ Get list of redundant files and directories (to be removed). - The function will return the content of an external file. It that can't be read, then a list defined in this - function will be returned instead. Any updates to the external file must be propagated to this function. + The function will return the content of an external file. It that + can't be read, then a list defined in this function will be returned instead. + Any updates to the external file must be propagated to this function. :return: files and directories list """ # try to read the list from the external file filename = get_redundant_path() - if os.path.exists(filename) and False: # do not use the cvmfs file since it is not being updated - dir_list = read_list(filename) - if dir_list: - return dir_list - logger.debug('list of redundant files could not be read from external file: %s (will use internal list)' % filename) + # do not use the cvmfs file since it is not being updated + # If you uncomment this block, need to also uncomment the read_list import + # if os.path.exists(filename) and False: + # dir_list = read_list(filename) + # if dir_list: + # return dir_list + + logger.debug(( + 'list of redundant files could not be read from external file: %s ' + '(will use internal list)'), filename) # else return the following dir_list = ["AtlasProduction*", @@ -1681,7 +1873,8 @@ def get_redundants(): def remove_archives(workdir): """ - Explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command + Explicitly remove any soft linked archives (.a files) since + they will be dereferenced by the tar command (--dereference option). 
:param workdir: working directory (string) @@ -1689,15 +1882,15 @@ def remove_archives(workdir): """ matches = [] - for root, dirnames, filenames in os.walk(workdir): + for root, _, filenames in os.walk(workdir): for filename in fnmatch.filter(filenames, '*.a'): matches.append(os.path.join(root, filename)) - for root, dirnames, filenames in os.walk(os.path.dirname(workdir)): + for root, _, filenames in os.walk(os.path.dirname(workdir)): for filename in fnmatch.filter(filenames, 'EventService_premerge_*.tar'): matches.append(os.path.join(root, filename)) - if matches != []: - for f in matches: - remove(f) + + for match in matches: + remove(match) def cleanup_broken_links(workdir): @@ -1709,28 +1902,26 @@ def cleanup_broken_links(workdir): """ broken = [] - for root, dirs, files in os.walk(workdir): + for root, _, files in os.walk(workdir): for filename in files: path = os.path.join(root, filename) - if os.path.islink(path): - target_path = os.readlink(path) - # Resolve relative symlinks - if not os.path.isabs(target_path): - target_path = os.path.join(os.path.dirname(path), target_path) - if not os.path.exists(target_path): - broken.append(path) - else: - # If it's not a symlink we're not interested. + if not os.path.islink(path): continue - if broken: - for p in broken: - remove(p) + target_path = os.readlink(path) + # Resolve relative symlinks + if not os.path.isabs(target_path): + target_path = os.path.join(os.path.dirname(path), target_path) + if not os.path.exists(target_path): + broken.append(path) + + for brok in broken: + remove(brok) -def ls(workdir): +def list_work_dir(workdir): cmd = 'ls -lF %s' % workdir - ec, stdout, stderr = execute(cmd) + _, stdout, stderr = execute(cmd) logger.debug('%s:\n' % stdout + stderr) @@ -1750,59 +1941,65 @@ def remove_special_files(workdir, dir_list, outputfiles): to_delete = [] for _dir in dir_list: files = glob(os.path.join(workdir, _dir)) + if not files: + continue + exclude = [] + for exc in exceptions_list: + for item in files: + if exc in item: + exclude.append(os.path.abspath(item)) - if files: - for exc in exceptions_list: - for f in files: - if exc in f: - exclude.append(os.path.abspath(f)) - _files = [] - for f in files: - if f not in exclude: - _files.append(os.path.abspath(f)) - to_delete += _files + _files = [os.path.abspath(item) for item in files if item not in exclude] + to_delete += _files exclude_files = [] - for of in outputfiles: - exclude_files.append(os.path.join(workdir, of)) - - for f in to_delete: - if f not in exclude_files: - logger.debug('removing %s' % f) - if os.path.isfile(f): - remove(f) + for opf in outputfiles: + exclude_files.append(os.path.join(workdir, opf)) + + for item in to_delete: + if item not in exclude_files: + logger.debug('removing %s', item) + if os.path.isfile(item): + remove(item) else: - remove_dir_tree(f) + remove_dir_tree(item) -def remove_redundant_files(workdir, outputfiles=[], islooping=False): +def remove_redundant_files(workdir, outputfiles=None, islooping=False, debugmode=False): """ Remove redundant files and directories prior to creating the log file. + Note: in debug mode, any core files should not be removed before creating the log. + :param workdir: working directory (string). :param outputfiles: list of protected output files (list). - :param islooping: looping job variable to make sure workDir is not removed in case of looping (boolean). + :param islooping: looping job variable to make sure workDir + is not removed in case of looping (boolean). 
+ :param debugmode: True if debug mode has been switched on (Boolean). :return: """ + if outputfiles is None: + outputfiles = [] + logger.debug("removing redundant files prior to log creation") workdir = os.path.abspath(workdir) - ls(workdir) + list_work_dir(workdir) # get list of redundant files and directories (to be removed) dir_list = get_redundants() # remove core and pool.root files from AthenaMP sub directories + logger.debug('cleaning up payload') try: - logger.debug('cleaning up payload') - cleanup_payload(workdir, outputfiles) - except Exception as e: - logger.warning("failed to execute cleanup_payload(): %s" % e) + cleanup_payload(workdir, outputfiles, removecores=not debugmode) + except OSError as exc: + logger.warning("failed to execute cleanup_payload(): %s", exc) - # explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command - # (--dereference option) + # explicitly remove any soft linked archives (.a files) + # since they will be dereferenced by the tar command (--dereference option) logger.debug('removing archives') remove_archives(workdir) @@ -1819,7 +2016,7 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False): # remove at least root files from workDir (ie also in the case of looping job) cleanup_looping_payload(path) if not islooping: - logger.debug('removing \'workDir\' from workdir=%s' % workdir) + logger.debug('removing \'workDir\' from workdir=%s', workdir) remove_dir_tree(path) # remove additional dirs @@ -1827,16 +2024,18 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False): for additional in additionals: path = os.path.join(workdir, additional) if os.path.exists(path): - logger.debug('removing \'%s\' from workdir=%s' % (additional, workdir)) + logger.debug('removing \'%s\' from workdir=%s', additional, workdir) remove_dir_tree(path) - ls(workdir) + list_work_dir(workdir) def download_command(process, workdir): """ Download the pre/postprocess commands if necessary. + Process FORMAT: {'command': , 'args': , 'label': } + :param process: pre/postprocess dictionary. :param workdir: job workdir (string). :return: updated pre/postprocess dictionary. @@ -1847,9 +2046,9 @@ def download_command(process, workdir): # download the command if necessary if cmd.startswith('http'): # Try to download the trf (skip when user container is to be used) - ec, diagnostics, cmd = get_analysis_trf(cmd, workdir) - if ec != 0: - logger.warning('cannot execute command due to previous error: %s' % cmd) + exitcode, _, cmd = get_analysis_trf(cmd, workdir) + if exitcode != 0: + logger.warning('cannot execute command due to previous error: %s', cmd) return {} # update the preprocess command (the URL should be stripped) @@ -1860,95 +2059,268 @@ def download_command(process, workdir): def get_utility_commands(order=None, job=None): """ - Return a dictionary of utility commands and arguments to be executed in parallel with the payload. - This could e.g. be memory and network monitor commands. A separate function can be used to determine the - corresponding command setups using the utility command name. - If the optional order parameter is set, the function should return the list of corresponding commands. - E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the - payload. If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. 
If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started - should be returned. If order=UTILITY_WITH_STAGEIN, the commands that should be executed parallel with stage-in will - be returned. + Return a dictionary of utility commands and arguments to be executed + in parallel with the payload. This could e.g. be memory and network + monitor commands. A separate function can be used to determine the + corresponding command setups using the utility command name. If the + optional order parameter is set, the function should return the list + of corresponding commands. + + For example: + + If order=UTILITY_BEFORE_PAYLOAD, the function should return all + commands that are to be executed before the payload. + + If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be + prepended to the payload execution string. + + If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be + executed after the payload has been started should be returned. - FORMAT: {'command': , 'args': } + If order=UTILITY_WITH_STAGEIN, the commands that should be executed + parallel with stage-in will be returned. + + FORMAT: {'command': , 'args': , 'label': } :param order: optional sorting order (see pilot.util.constants). :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ + if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: + return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') + + if order == UTILITY_WITH_PAYLOAD: + return {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} + + if order == UTILITY_AFTER_PAYLOAD_STARTED: + return get_utility_after_payload_started() + + if order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: + return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') + + if order == UTILITY_AFTER_PAYLOAD_FINISHED: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_kill', + xcache_deactivation_command, + ) + + if order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: + return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') + + if order == UTILITY_BEFORE_STAGEIN: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_start', + xcache_activation_command, + ) + + return None + + +def get_precopostprocess_command(process, workdir, label): + """ + Return the pre/co/post-process command dictionary. + + Command FORMAT: {'command': , 'args': , 'label': } + + The returned command has the structure: { 'command': , } + :param process: pre/co/post-process (dictionary). + :param workdir: working directory (string). + :param label: label (string). + :return: command (dictionary). + """ + com = {} + if process.get('command', ''): + com = download_command(process, workdir) + com['label'] = label + return com - if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: - if job.preprocess.get('command', ''): - com = download_command(job.preprocess, job.workdir) - elif order == UTILITY_WITH_PAYLOAD: - com = {'command': 'NetworkMonitor', 'args': ''} - elif order == UTILITY_AFTER_PAYLOAD_STARTED: + +def get_utility_after_payload_started(): + """ + Return the command dictionary for the utility after the payload has started. + + Command FORMAT: {'command': , 'args': , 'label': } + + :return: command (dictionary). 
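# --- Illustrative sketch (assumed, standalone; not taken from the patch) ---
# A hedged example of how a caller might consume the command dictionaries
# returned above, i.e. {'command': ..., 'args': ..., 'label': ...}: an empty
# dictionary means there is nothing to run for the requested order. The
# helper name and the sample dictionary values are made up here.
def build_command(utility):
    if not utility:  # nothing to run for this order
        return None
    return ('%s %s' % (utility['command'], utility.get('args', ''))).strip()

sample = {'command': 'prmon', 'args': '--interval 60', 'label': 'memory_monitor'}
assert build_command(sample) == 'prmon --interval 60'
assert build_command({}) is None
# --- end of sketch ---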
+ """ + + com = {} + try: cmd = config.Pilot.utility_after_payload_started + except Exception: + pass + else: if cmd: - com = {'command': cmd, 'args': ''} - elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: - if job.coprocess.get('command', ''): - com = download_command(job.coprocess, job.workdir) - elif order == UTILITY_AFTER_PAYLOAD and job.postprocess: - if job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - elif order == UTILITY_AFTER_PAYLOAD_FINISHED: - if job.postprocess and job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - elif 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_deactivation_command(job.workdir) - elif order == UTILITY_BEFORE_STAGEIN: - if 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_activation_command(job.jobid) + com = {'command': cmd, 'args': '', 'label': cmd.lower()} + return com + + +def get_xcache_command(catchall, workdir, jobid, label, xcache_function): + """ + Return the proper xcache command for either activation or deactivation. + Command FORMAT: {'command': , 'args': , 'label': } + + :param catchall: queuedata catchall field (string). + :param workdir: job working directory (string). + :param jobid: PanDA job id (string). + :param label: label (string). + :param xcache_function: activation/deactivation function name (function). + :return: command (dictionary). + """ + + com = {} + if 'pilotXcache' in catchall: + com = xcache_function(jobid=jobid, workdir=workdir) + com['label'] = label return com -def xcache_activation_command(jobid): +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + label = kwargs.get('label', 'unknown_label') + stdout = kwargs.get('output', None) + + if stdout: + logger.debug('processing stdout for label=%s', label) + xcache_proxy(stdout) + else: + logger.warning('no output for label=%s', label) + + alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') + if alrb_xcache_files: + cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' + _, _stdout, _ = execute(cmd) + logger.debug('cmd=%s:\n\n%s\n\n', cmd, _stdout) + + +def xcache_proxy(output): + """ + Extract env vars from xcache stdout and set them. + + :param output: command output (string). + :return: + """ + + # loop over each line in the xcache stdout and identify the needed environmental variables + for line in output.split('\n'): + if 'ALRB_XCACHE_PROXY' in line: + suffix = '_REMOTE' if 'REMOTE' in line else '' + name = 'ALRB_XCACHE_PROXY%s' % suffix + pattern = r'\ export\ ALRB_XCACHE_PROXY%s\=\"(.+)\"' % suffix + set_xcache_var(line, name=name, pattern=pattern) + + elif 'ALRB_XCACHE_MYPROCESS' in line: + set_xcache_var( + line, + name='ALRB_XCACHE_MYPROCESS', + pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)' + ) + + elif 'Messages logged in' in line: + set_xcache_var( + line, + name='ALRB_XCACHE_LOG', + pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' + ) + + elif 'ALRB_XCACHE_FILES' in line: + set_xcache_var( + line, + name='ALRB_XCACHE_FILES', + pattern=r'\ ALRB_XCACHE_FILES\=(.+)' + ) + + +def set_xcache_var(line, name='', pattern=''): + """ + Extract the value of a given environmental variable from a given stdout line. + + :param line: line from stdout to be investigated (string). + :param name: name of env var (string). + :param pattern: regex pattern (string). 
+ :return: + """ + + pattern = re.compile(pattern) + result = re.findall(pattern, line) + if result: + os.environ[name] = result[0] + + +def xcache_activation_command(workdir='', jobid=''): """ Return the xcache service activation command. + Note: the workdir is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). + + :param workdir: unused work directory - do not remove (string). :param jobid: PanDA job id to guarantee that xcache process is unique (int). :return: xcache command (string). """ # a successful startup will set ALRB_XCACHE_PROXY and ALRB_XCACHE_PROXY_REMOTE - # so any file access with root://... should be replaced with one of the above - # (depending on whether you are on the same machine or not) + # so any file access with root://... should be replaced with one of + # the above (depending on whether you are on the same machine or not) # example: # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 command = "%s " % get_asetup(asetup=False) - # add 'xcache list' which will also kill any orphaned processes lingering in the system - command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g" % jobid + + # add 'xcache list' which will also kill any + # orphaned processes lingering in the system + command += ( + "lsetup xcache; xcache list; " + "xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid) return {'command': command, 'args': ''} -def xcache_deactivation_command(workdir): +def xcache_deactivation_command(workdir='', jobid=''): """ Return the xcache service deactivation command. This service should be stopped after the payload has finished. Copy the messages log before shutting down. + Note: the job id is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). + :param workdir: payload work directory (string). + :param jobid: unused job id - do not remove (string). :return: xcache command (string). 
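# --- Illustrative sketch (assumed, standalone; not taken from the patch) ---
# A rough example of the turl rewrite hinted at in the activation comment
# above: once xcache is running, root:// turls are prefixed with the value
# of ALRB_XCACHE_PROXY (or ALRB_XCACHE_PROXY_REMOTE when not on the same
# node). The helper, the proxy value and the turl below are all made up.
import os

def prefix_with_xcache(turl, same_node=True):
    var = 'ALRB_XCACHE_PROXY' if same_node else 'ALRB_XCACHE_PROXY_REMOTE'
    proxy = os.environ.get(var, '')
    return proxy + turl if turl.startswith('root://') else turl

os.environ['ALRB_XCACHE_PROXY'] = 'root://localhost:43651//'
turl = 'root://some-se.example.org:1094//path/DAOD.pool.root.1'
assert prefix_with_xcache(turl).startswith('root://localhost:43651//root://')
# --- end of sketch ---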
""" path = os.environ.get('ALRB_XCACHE_LOG', None) if path and os.path.exists(path): - logger.debug('copying xcache messages log file (%s) to work dir (%s)' % (path, workdir)) + logger.debug('copying xcache messages log file (%s) to work dir (%s)', path, workdir) dest = os.path.join(workdir, 'xcache-messages.log') try: copy(path, dest) - except Exception as e: - logger.warning('exception caught copying xcache log: %s' % e) - + except Exception as exc: + logger.warning('exception caught copying xcache log: %s', exc) + else: + if not path: + logger.warning('ALRB_XCACHE_LOG is not set') + if path and not os.path.exists(path): + logger.warning('path does not exist: %s', path) command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 - return {'command': command, 'args': '-p all'} + return {'command': command, 'args': '-p $ALRB_XCACHE_MYPROCESS'} def get_utility_command_setup(name, job, setup=None): @@ -1963,11 +2335,23 @@ def get_utility_command_setup(name, job, setup=None): """ if name == 'MemoryMonitor': - # must know if payload is running in a container or not (enables search for pid in ps output) + # must know if payload is running in a container or not + # (enables search for pid in ps output) use_container = job.usecontainer or 'runcontainer' in job.transformation - dump_ps = True if "PRMON_DEBUG" in job.infosys.queuedata.catchall else False - setup, pid = get_memory_monitor_setup(job.pid, job.pgrp, job.jobid, job.workdir, job.command, use_container=use_container, - transformation=job.transformation, outdata=job.outdata, dump_ps=dump_ps) + dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) + + setup, pid = get_memory_monitor_setup( + job.pid, + job.pgrp, + job.jobid, + job.workdir, + job.command, + use_container=use_container, + transformation=job.transformation, + outdata=job.outdata, + dump_ps=dump_ps + ) + _pattern = r"([\S]+)\ ." 
pattern = re.compile(_pattern) _name = re.findall(pattern, setup.split(';')[-1]) @@ -1977,21 +2361,24 @@ def get_utility_command_setup(name, job, setup=None): logger.warning('trf name could not be identified in setup string') # update the pgrp if the pid changed - if job.pid != pid and pid != --1: - logger.debug('updating pgrp=%d for pid=%d' % (job.pgrp, pid)) + if pid not in (job.pid, -1): + logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) try: job.pgrp = os.getpgid(pid) - except Exception as e: - logger.warning('os.getpgid(%d) failed with: %s' % (pid, e)) + except Exception as exc: + logger.warning('os.getpgid(%d) failed with: %s', pid, exc) return setup - elif name == 'NetworkMonitor' and setup: + + if name == 'NetworkMonitor' and setup: return get_network_monitor_setup(setup, job) - elif name == 'Prefetcher': + + if name == 'Prefetcher': return get_prefetcher_setup(job) - elif name == 'Benchmark': + + if name == 'Benchmark': return get_benchmark_setup(job) - else: - return "" + + return "" def get_utility_command_execution_order(name): @@ -2005,12 +2392,13 @@ def get_utility_command_execution_order(name): # example implementation if name == 'NetworkMonitor': return UTILITY_WITH_PAYLOAD - elif name == 'MemoryMonitor': - return UTILITY_AFTER_PAYLOAD_STARTED - else: - logger.warning('unknown utility name: %s' % name) + + if name == 'MemoryMonitor': return UTILITY_AFTER_PAYLOAD_STARTED + logger.warning('unknown utility name: %s', name) + return UTILITY_AFTER_PAYLOAD_STARTED + def post_utility_command_action(name, job): """ @@ -2065,7 +2453,7 @@ def verify_lfn_length(outdata): :return: error code (int), diagnostics (string). """ - ec = 0 + exitcode = 0 diagnostics = "" max_length = 255 @@ -2074,10 +2462,10 @@ def verify_lfn_length(outdata): if len(fspec.lfn) > max_length: diagnostics = "LFN too long (length: %d, must be less than %d characters): %s" % \ (len(fspec.lfn), max_length, fspec.lfn) - ec = errors.LFNTOOLONG + exitcode = errors.LFNTOOLONG break - return ec, diagnostics + return exitcode, diagnostics def verify_ncores(corecount): @@ -2099,25 +2487,30 @@ def verify_ncores(corecount): except Exception: athena_proc_number = None - # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not overwrite it - # Otherwise, set it to the value of job.coreCount - # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, otherwise use ATHENA_PROC_NUMBER directly; - # ATHENA_PROC_NUMBER_JOB will always be the value from the job definition) + # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not + # overwrite it. 
Otherwise, set it to the value of job.coreCount + # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, + # otherwise use ATHENA_PROC_NUMBER directly; ATHENA_PROC_NUMBER_JOB + # will always be the value from the job definition) if athena_proc_number: - logger.info("encountered a set ATHENA_PROC_NUMBER (%d), will not overwrite it" % athena_proc_number) + logger.info(( + "encountered a set ATHENA_PROC_NUMBER (%d), " + "will not overwrite it"), athena_proc_number) logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') - os.environ['ATHENA_CORE_NUMBER'] = "%s" % athena_proc_number + os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: - os.environ['ATHENA_PROC_NUMBER_JOB'] = "%s" % corecount - os.environ['ATHENA_CORE_NUMBER'] = "%s" % corecount - logger.info("set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s (ATHENA_PROC_NUMBER will not be overwritten)" % corecount) + os.environ['ATHENA_PROC_NUMBER_JOB'] = str(corecount) + os.environ['ATHENA_CORE_NUMBER'] = str(corecount) + logger.info(( + "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " + "(ATHENA_PROC_NUMBER will not be overwritten)"), corecount) def verify_job(job): """ Verify job parameters for specific errors. Note: - in case of problem, the function should set the corresponding pilot error code using + in case of problem, the function should set the corresponding pilot error code using: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) :param job: job object @@ -2127,11 +2520,11 @@ def verify_job(job): status = False # are LFNs of correct lengths? - ec, diagnostics = verify_lfn_length(job.outdata) - if ec != 0: + exitcode, diagnostics = verify_lfn_length(job.outdata) + if exitcode != 0: logger.fatal(diagnostics) job.piloterrordiag = diagnostics - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) else: status = True @@ -2164,7 +2557,7 @@ def get_metadata(workdir): path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None - logger.debug('metadata=%s' % str(metadata)) + logger.debug('metadata=%s', str(metadata)) return metadata @@ -2176,12 +2569,7 @@ def should_update_logstash(frequency=10): :param frequency: :return: return True once per 'frequency' times. 
""" - - from random import randint - if randint(0, frequency - 1) == 0: - return True - else: - return False + return randint(0, frequency - 1) == 0 def update_server(job): @@ -2195,36 +2583,145 @@ def update_server(job): """ # attempt to read memory_monitor_output.txt and convert it to json - if should_update_logstash(): - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) - if os.path.exists(path): - # convert memory monitor text output to json and return the selection (don't store it, log has already been created) - metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) - if metadata_dictionary: - # the output was previously written to file, update the path and tell curl to send it - new_path = update_extension(path=path, extension='json') - #out = read_json(new_path) - #logger.debug('prmon json=\n%s' % out) - # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) - url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' - #cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ - # (str(metadata_dictionary).replace("'", '"'), url) - # curl --connect-timeout 20 --max-time 120 -H "Content-Type: application/json" -X POST --upload-file test.json - # https://pilot.atlas-ml.org - cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST --upload-file %s %s" % (new_path, url) - #cmd = "curl --connect-timeout 20 --max-time 120 -F 'data=@%s' %s" % (new_path, url) - # send metadata to logstash - try: - exit_code, stdout, stderr = execute(cmd, usecontainer=False) - except Exception as e: - logger.warning('exception caught: %s' % e) - else: - logger.debug('sent prmon JSON dictionary to logstash server') - logger.debug('stdout: %s' % stdout) - logger.debug('stderr: %s' % stderr) - else: - logger.warning('no prmon json available - cannot send anything to logstash server') + if not should_update_logstash(): + logger.debug('no need to update logstash for this job') + return + + path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + if not os.path.exists(path): + logger.warning('path does not exist: %s', path) + return + + # convert memory monitor text output to json and return the selection + # (don't store it, log has already been created) + metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) + if metadata_dictionary: + # the output was previously written to file, + # update the path and tell curl to send it + new_path = update_extension(path=path, extension='json') + + #out = read_json(new_path) + #logger.debug('prmon json=\n%s' % out) + # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) + url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' + + # cmd = ( + # "curl --connect-timeout 20 --max-time 120 " + # "-H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ + # (str(metadata_dictionary).replace("'", '"'), url) + #) + + # curl --connect-timeout 20 --max-time 120 -H + # "Content-Type: application/json" -X POST --upload-file test.json + # https://pilot.atlas-ml.org + cmd = ( + "curl --connect-timeout 20 --max-time 120 " + "-H \"Content-Type: application/json\" " + "-X POST " + "--upload-file %s %s" % (new_path, url) + ) + #cmd = "curl --connect-timeout 20 --max-time 120 -F + # 'data=@%s' %s" % (new_path, url) + # send metadata to logstash + try: + _, stdout, stderr = execute(cmd, usecontainer=False) + 
except Exception as exc: + logger.warning('exception caught: %s', exc) else: - logger.warning('path does not exist: %s' % path) + logger.debug('sent prmon JSON dictionary to logstash server') + logger.debug('stdout: %s', stdout) + logger.debug('stderr: %s', stderr) else: - logger.debug('no need to update logstash for this job') + msg = 'no prmon json available - cannot send anything to logstash server' + logger.warning(msg) + + +def preprocess_debug_command(job): + """ + + """ + + # Should the pilot do the setup or does jobPars already contain the information? + preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + # get the general setup command and then verify it if required + resource_name = get_resource_name() # 'grid' if no hpc_resource is set + + # Python 3, level: -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) + + cmd = resource.get_setup_command(job, preparesetup) + if not cmd.endswith(';'): + cmd += '; ' + if cmd not in job.debug_command: + job.debug_command = cmd + job.debug_command + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the piloti + via the updateJob backchannel. This function can be used to process that + command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + For gdb, the server might send a command with gdb option --pid %. + The pilot need to replace the % with the proper pid. The default + (hardcoded) process will be that of athena.py. The pilot will find the + corresponding pid. + + :param debug_command: debug command (string). + :param pandaid: PanDA id (string). + :return: updated debug command (string). + """ + + if '--pid %' not in debug_command: + return debug_command + + pandaid_pid = None + + # replace the % with the pid for athena.py + # note: if athena.py is not yet running, the --pid % will remain. + # Otherwise the % will be replaced by the pid first find the pid + # (if athena.py is running) + cmd = 'ps axo pid,ppid,pgid,args' + _, stdout, _ = execute(cmd) + if stdout: + #logger.debug('ps=\n\n%s\n' % stdout) + # convert the ps output to a dictionary + dictionary = convert_ps_to_dict(stdout) + + # trim this dictionary to reduce the size + # (only keep the PID and PPID lists) + trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) + + # what is the pid of the trf? + pandaid_pid = find_pid(pandaid, dictionary) + + # find all athena processes + pids = find_cmd_pids('athena.py', dictionary) + + # which of the found pids are children of the trf? + # (which has an export PandaID=.. 
attached to it) + for pid in pids: + try: + child = is_child(pid, pandaid_pid, trimmed_dictionary) + except RuntimeError as rte: + logger.warning(( + 'too many recursions: %s ' + '(cannot identify athena process)'), rte) + else: + if child: + logger.info('pid=%d is a child process of the trf of this job', pid) + debug_command = debug_command.replace('--pid %', '--pid %d' % pid) + logger.info('updated debug command: %s', debug_command) + break + logger.info('pid=%d is not a child process of the trf of this job', pid) + + if not pids or '--pid %' in debug_command: + logger.debug('athena is not yet running (no corresponding pid)') + + # reset the command to prevent the payload from being killed + # (will be killed when gdb has run) + debug_command = '' + + return debug_command diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index c518f2491..2a81807a4 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -899,7 +899,7 @@ def create_root_container_command(workdir, cmd): return command -def create_middleware_container_command(workdir, cmd, container_options, label='stagein'): +def create_middleware_container_command(workdir, cmd, container_options, label='stagein', proxy=True): """ Create the stage-in/out container command. @@ -924,10 +924,16 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' command = 'cd %s;' % workdir # add bits and pieces for the containerisation - middleware_container = get_middleware_container() - content = get_middleware_container_script(middleware_container, cmd) + middleware_container = get_middleware_container(label=label) + content = get_middleware_container_script(middleware_container, cmd, label=label) # store it in setup.sh - script_name = 'stagein.sh' if label == 'stage-in' else 'stageout.sh' + if label == 'stage-in': + script_name = 'stagein.sh' + elif label == 'stage-out': + script_name = 'stageout.sh' + else: + script_name = 'general.sh' + try: status = write_file(os.path.join(workdir, script_name), content) except PilotException as e: @@ -935,9 +941,10 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' else: if status: # generate the final container command - x509 = os.environ.get('X509_USER_PROXY', '') - if x509: - command += 'export X509_USER_PROXY=%s;' % x509 + if proxy: + x509 = os.environ.get('X509_USER_PROXY', '') + if x509: + command += 'export X509_USER_PROXY=%s;' % x509 command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container @@ -964,7 +971,7 @@ def get_root_container_script(cmd): return content -def get_middleware_container_script(middleware_container, cmd, asetup=False): +def get_middleware_container_script(middleware_container, cmd, asetup=False, label=''): """ Return the content of the middleware container script. If asetup is True, atlasLocalSetup will be added to the command. 
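For context on the child-process check in process_debug_command() above, here is a simplified, self-contained sketch of how a pid can be tested for being a descendant of the transform pid using only ps output. The helpers below are illustrative and are not the pilot's own convert_ps_to_dict/is_child implementations:

import subprocess

def build_parent_map():
    """Sketch: map each pid to its parent pid using 'ps axo pid,ppid'."""
    out = subprocess.check_output(['ps', 'axo', 'pid,ppid'], universal_newlines=True)
    parents = {}
    for line in out.splitlines()[1:]:
        fields = line.split()
        if len(fields) >= 2 and fields[0].isdigit() and fields[1].isdigit():
            parents[int(fields[0])] = int(fields[1])
    return parents

def is_descendant(pid, ancestor, parents, max_depth=100):
    """Sketch: walk the parent chain iteratively (bounded) instead of recursing."""
    for _ in range(max_depth):
        if pid == ancestor:
            return True
        pid = parents.get(pid)
        if pid is None:
            return False
    return False

Once a matching athena.py pid is confirmed to descend from the trf pid, debug_command.replace('--pid %', '--pid %d' % pid) yields the final command, mirroring the logic in the function above.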
@@ -984,9 +991,11 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): content += 'export ALRB_LOCAL_PY3=YES; ' if asetup: # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/..;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet; content += get_asetup(asetup=False) - content += sitename + 'lsetup rucio davix xrootd; ' - content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd - + if label == 'stagein' or label == 'stageout': + content += sitename + 'lsetup rucio davix xrootd; ' + content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd + else: + content += cmd if not asetup: content += '\nexit $?' @@ -995,13 +1004,17 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): return content -def get_middleware_container(): +def get_middleware_container(label=None): """ Return the middleware container. + :param label: label (string). :return: path (string). """ + if label and label == 'general': + return 'CentOS7' + path = config.Container.middleware_container if path.startswith('/') and not os.path.exists(path): logger.warning('requested middleware container path does not exist: %s (switching to default value)' % path) diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index cbbec4842..c3cf9ee49 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import re @@ -58,14 +58,14 @@ def get_dbrelease_dir(): :return: path to DBRelease (string). """ - path = os.path.expandvars('$VO_ATLAS_SW_DIR/database/DBRelease') if 'VO_ATLAS_SW_DIR' in os.environ else os.path.expandvars('$OSG_APP/database/DBRelease') + path = os.path.join(os.environ.get('VO_ATLAS_SW_DIR', 'OSG_APP'), 'database/DBRelease') if path == "" or path.startswith('OSG_APP'): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: if os.path.exists(path): - logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)" % path) + logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)", path) else: - logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)" % path) + logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)", path) return path @@ -95,14 +95,14 @@ def is_dbrelease_available(version): # is the required DBRelease version available? 
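A note on the get_dbrelease_dir() change above: with the new expression, a missing VO_ATLAS_SW_DIR yields the literal string 'OSG_APP/database/DBRelease' (there is no $OSG_APP expansion any more), which is exactly what the subsequent startswith('OSG_APP') guard treats as "no local DBRelease available". A quick illustration of the two outcomes:

import os

def get_dbrelease_dir_sketch():
    # same expression as in the patch
    return os.path.join(os.environ.get('VO_ATLAS_SW_DIR', 'OSG_APP'), 'database/DBRelease')

os.environ['VO_ATLAS_SW_DIR'] = '/cvmfs/atlas.cern.ch/repo/sw'
print(get_dbrelease_dir_sketch())   # /cvmfs/atlas.cern.ch/repo/sw/database/DBRelease

del os.environ['VO_ATLAS_SW_DIR']
print(get_dbrelease_dir_sketch())   # OSG_APP/database/DBRelease -> treated as "not available"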
if dir_list: if version in dir_list: - logger.info("found version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.info("found version %s in path %s (%d releases found)", version, path, len(dir_list)) status = True else: - logger.warning("did not find version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.warning("did not find version %s in path %s (%d releases found)", version, path, len(dir_list)) else: - logger.warning("empty DBRelease directory list: %s" % path) + logger.warning("empty DBRelease directory list: %s", path) else: - logger.warning('no such DBRelease path: %s' % path) + logger.warning('no such DBRelease path: %s', path) return status @@ -131,13 +131,13 @@ def create_setup_file(version, path): try: status = write_file(path, txt) - except FileHandlingFailure as e: - logger.warning('failed to create DBRelease setup file: %s' % e) + except FileHandlingFailure as error: + logger.warning('failed to create DBRelease setup file: %s', error) else: - logger.info("Created setup file with the following content:.................................\n%s" % txt) + logger.info("Created setup file with the following content:.................................\n%s", txt) logger.info("...............................................................................") else: - logger.warning('failed to create %s for DBRelease version=%s and directory=%s' % (path, version, d)) + logger.warning('failed to create %s for DBRelease version=%s and directory=%s', path, version, d) return status @@ -158,25 +158,25 @@ def create_dbrelease(version, path): _path = os.path.join(dbrelease_path, version) try: mkdirs(_path, chmod=None) - except PilotException as e: - logger.warning('failed to create directories for DBRelease: %s' % e) + except PilotException as error: + logger.warning('failed to create directories for DBRelease: %s', error) else: - logger.debug('created directories: %s' % _path) + logger.debug('created directories: %s', _path) # create the setup file in the DBRelease directory version_path = os.path.join(dbrelease_path, version) setup_filename = "setup.py" _path = os.path.join(version_path, setup_filename) if create_setup_file(version, _path): - logger.info("created DBRelease setup file: %s" % _path) + logger.info("created DBRelease setup file: %s", _path) # now create a new DBRelease tarball filename = os.path.join(path, "DBRelease-%s.tar.gz" % version) - logger.info("creating file: %s" % filename) + logger.info("creating file: %s", filename) try: tar = tarfile.open(filename, "w:gz") - except Exception as e: - logger.warning("could not create DBRelease tar file: %s" % e) + except Exception as error: + logger.warning("could not create DBRelease tar file: %s", error) else: if tar: # add the setup file to the tar file @@ -186,10 +186,10 @@ def create_dbrelease(version, path): try: _link = os.path.join(path, "DBRelease/current") os.symlink(version, _link) - except Exception as e: - logger.warning("failed to create symbolic link %s: %s" % (_link, e)) + except Exception as error: + logger.warning("failed to create symbolic link %s: %s", _link, error) else: - logger.warning("created symbolic link: %s" % _link) + logger.warning("created symbolic link: %s", _link) # add the symbolic link to the tar file tar.add(_link) @@ -197,17 +197,17 @@ def create_dbrelease(version, path): # done with the tar archive tar.close() - logger.info("created new DBRelease tar file: %s" % filename) + logger.info("created new DBRelease tar file: %s", filename) status = 
True else: logger.warning("failed to open DBRelease tar file") # clean up if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) else: logger.warning("failed to create DBRelease setup file") if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) return status diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index 9f131c939..7b3dbaaea 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -70,7 +70,10 @@ def interpret(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) # interpret the exit info from the payload - interpret_payload_exit_info(job) + try: + interpret_payload_exit_info(job) + except Exception as error: + logger.warning('exception caught while interpreting payload exit info: %s', error) return exit_code diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 503542e9a..0d9741755 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -5,16 +5,18 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.api import analytics from pilot.util.jobmetrics import get_job_metrics_entry +from pilot.util.filehandling import find_last_line from .cpu import get_core_count from .common import get_db_info, get_resimevents from .utilities import get_memory_monitor_output_filename import os +import re import logging logger = logging.getLogger(__name__) @@ -31,7 +33,7 @@ def get_job_metrics_string(job): # report core count (will also set corecount in job object) corecount = get_core_count(job) - logger.debug('job definition core count: %d' % corecount) + logger.debug('job definition core count: %d', corecount) #if corecount is not None and corecount != "NULL" and corecount != 'null': # job_metrics += get_job_metrics_entry("coreCount", corecount) @@ -54,7 +56,7 @@ def get_job_metrics_string(job): job_metrics += get_job_metrics_entry("dbTime", job.dbtime) if job.dbdata and job.dbdata != "": job_metrics += get_job_metrics_entry("dbData", job.dbdata) - if job.resimevents: + if job.resimevents is not None: job_metrics += get_job_metrics_entry("resimevents", job.resimevents) # get the max disk space used by the payload (at the end of a job) @@ -69,14 +71,32 @@ def get_job_metrics_string(job): if max_space > zero: job_metrics += get_job_metrics_entry("workDirSize", max_space) else: - logger.info("will not add max space = %d B to job metrics" % max_space) + logger.info("will not add max space = %d B to job metrics", max_space) # get analytics data - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + job_metrics = add_analytics_data(job_metrics, job.workdir, job.state) + + # extract event number from file and add to job metrics if it exists + job_metrics = add_event_number(job_metrics, job.workdir) + + return job_metrics + + +def add_analytics_data(job_metrics, workdir, state): + """ + Add the memory leak+chi2 analytics data to the job metrics. + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :param state: job state (string). + :return: updated job metrics (string). 
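As implemented in the body that follows, the tails flag simply means "include tails unless this is the final update (finished/failed/holding)"; an equivalent, purely illustrative formulation:

FINAL_UPDATE_STATES = {'finished', 'failed', 'holding'}

def include_tails(state):
    # tails are dropped on the final heartbeat update
    return state not in FINAL_UPDATE_STATES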
+ """ + + path = os.path.join(workdir, get_memory_monitor_output_filename()) if os.path.exists(path): client = analytics.Analytics() # do not include tails on final update - tails = False if (job.state == "finished" or job.state == "failed" or job.state == "holding") else True + tails = False if (state == "finished" or state == "failed" or state == "holding") else True data = client.get_fitted_data(path, tails=tails) slope = data.get("slope", "") chi2 = data.get("chi2", "") @@ -88,6 +108,28 @@ def get_job_metrics_string(job): return job_metrics +def add_event_number(job_metrics, workdir): + """ + Extract event number from file and add to job metrics if it exists + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :return: updated job metrics (string). + """ + + path = os.path.join(workdir, 'eventLoopHeartBeat.txt') + if os.path.exists(path): + last_line = find_last_line(path) + if last_line: + event_number = get_number_in_string(last_line) + if event_number: + job_metrics += get_job_metrics_entry("eventnumber", event_number) + else: + logger.debug('file %s does not exist (skip for now)', path) + + return job_metrics + + def get_job_metrics(job): """ Return a properly formatted job metrics string. @@ -109,17 +151,41 @@ def get_job_metrics(job): job_metrics = job_metrics.lstrip().rstrip() if job_metrics != "": - logger.debug('job metrics=\"%s\"' % (job_metrics)) + logger.debug('job metrics=\"%s\"', job_metrics) else: logger.debug("no job metrics (all values are zero)") # is job_metrics within allowed size? if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)" % (len(job_metrics))) + logger.warning("job_metrics out of size (%d)", len(job_metrics)) # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] job_metrics = " ".join(job_metrics.split(" ")[:-1]) - logger.warning("job_metrics has been reduced to: %s" % (job_metrics)) + logger.warning("job_metrics has been reduced to: %s", job_metrics) return job_metrics + + +def get_number_in_string(line, pattern=r'\ done\ processing\ event\ \#(\d+)\,'): + """ + Extract a number from the given string. + + E.g. file eventLoopHeartBeat.txt contains + done processing event #20166959, run #276689 22807 events read so far <<<=== + This function will return 20166959 as in int. + + :param line: line from a file (string). + :param pattern: reg ex pattern (raw string). + :return: extracted number (int). + """ + + event_number = None + match = re.search(pattern, line) + if match: + try: + event_number = int(match.group(1)) + except Exception: + pass + + return event_number diff --git a/pilot/user/atlas/metadata.py b/pilot/user/atlas/metadata.py index e5d45f3b7..25f18d666 100644 --- a/pilot/user/atlas/metadata.py +++ b/pilot/user/atlas/metadata.py @@ -21,6 +21,7 @@ def create_input_file_metadata(file_dictionary, workdir, filename="PoolFileCatal """ Create a Pool File Catalog for the files listed in the input dictionary. The function creates properly formatted XML (pretty printed) and writes the XML to file. + Note: any environment variables in the pfn tags will be expanded (see pilot/control/data::get_input_file_dictionary()). 
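A quick usage example for the get_number_in_string() helper added above (the sample line follows its docstring; note the default pattern expects the literal ' done processing event #' fragment followed by a comma):

line = ' done processing event #20166959, run #276689 22807 events read so far <<<==='
event_number = get_number_in_string(line)  # -> 20166959 (int); None if the pattern does not match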
Format: dictionary = {'guid': 'pfn', ..} diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index 87d77faf5..e01aeece0 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -196,7 +196,7 @@ def set_inds(dataset): inds = ds break if inds != "": - logger.info("setting INDS environmental variable to: %s" % (inds)) + logger.info("setting INDS environmental variable to: %s", inds) os.environ['INDS'] = inds else: logger.warning("INDS unknown") @@ -219,24 +219,24 @@ def get_analysis_trf(transform, workdir): harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + logger.debug("search_pattern - %s", search_pattern) jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug("jobopt_file = %s workdir = %s", jobopt_file, workdir) try: copy(jobopt_file, workdir) - except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + except Exception as error: + logger.error("could not copy file %s to %s : %s", jobopt_file, workdir, error) if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning('did not detect any / in %s (using full transform name)', transform) transform_name = transform # is the command already available? (e.g. if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info('script %s is already available - no need to download again', transform_name) return ec, diagnostics, transform_name original_base_url = "" @@ -255,7 +255,7 @@ def get_analysis_trf(transform, workdir): status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug("attempting to download script: %s", trf) status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -265,11 +265,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug("changing permission of %s to 0o755", path) try: os.chmod(path, 0o755) # Python 2/3 - except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + except Exception as error: + diagnostics = "failed to chmod %s: %s" % (transform_name, error) return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -307,7 +307,7 @@ def download_transform(url, transform_name, workdir): # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info("executing command [trial %d/%d]: %s", trial, max_trials, cmd) exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: @@ -317,14 +317,14 @@ def download_transform(url, transform_name, workdir): diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + 
logger.fatal('could not download transform: %s', stdout) status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info("curl command returned: %s", stdout) status = True break trial += 1 @@ -413,8 +413,8 @@ def get_payload_environment_variables(cmd, job_id, task_id, attempt_nr, processi def get_writetoinput_filenames(writetofile): """ Extract the writeToFile file name(s). - writeToFile='tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00:AOD.15760866._000002.pool.root.1' - -> return 'tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00' + writeToFile='tmpin_mc16_13TeV.blah:AOD.15760866._000002.pool.root.1' + -> return 'tmpin_mc16_13TeV.blah' :param writetofile: string containing file name information. :return: list of file names @@ -456,12 +456,11 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): # if turl.startswith('root://') and turl not in cmd: if turl not in cmd: cmd = cmd.replace(inputfile, turl) - logger.info("replaced '%s' with '%s' in the run command" % (inputfile, turl)) + logger.info("replaced '%s' with '%s' in the run command", inputfile, turl) # replace the LFNs with TURLs in the writetofile input file list (if it exists) if writetofile and turl_dictionary: filenames = get_writetoinput_filenames(writetofile) - logger.info("filenames=%s" % filenames) for fname in filenames: new_lines = [] path = os.path.join(workdir, fname) @@ -479,10 +478,9 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): lines = '\n'.join(new_lines) if lines: write_file(path, lines) - logger.info("lines=%s" % lines) else: - logger.warning("file does not exist: %s" % path) + logger.warning("file does not exist: %s", path) else: - logger.warning("could not find file: %s (cannot locate TURLs for direct access)" % filename) + logger.warning("could not find file: %s (cannot locate TURLs for direct access)", filename) return cmd diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index a82646682..f04a50e83 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -286,7 +286,7 @@ def get_pid_for_jobid(ps, jobid): pid = None for line in ps.split('\n'): - if jobid in line: + if jobid in line and 'xrootd' not in line: # extract pid _pid = search(r'(\d+) ', line) try: diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 7df04fc89..c747446eb 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -12,7 +12,7 @@ from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD +from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED from pilot.util.filehandling import read_file from .setup import get_analysis_trf @@ -130,7 +130,7 @@ def get_utility_commands(order=None, job=None): If the optional order parameter is set, the function should return the list of corresponding commands. E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the payload. If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. 
If order=UTILITY_AFTER_PAYLOAD, the commands that should be executed after the payload has been started + string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started should be returned. FORMAT: {'command': , 'args': } @@ -160,14 +160,14 @@ def get_utility_command_execution_order(name): Should the given utility command be executed before or after the payload? :param name: utility name (string). - :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD) + :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) """ # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD else: - return UTILITY_AFTER_PAYLOAD + return UTILITY_AFTER_PAYLOAD_STARTED def post_utility_command_action(name, job): @@ -256,3 +256,31 @@ def update_server(job): """ pass + + +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + # label = kwargs.get('label', 'unknown_label') + # stdout = kwargs.get('output', None) + + pass + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + :param debug_command: debug command (string), payload pid (int). + :param pandaid: PanDA id (string). + :return: updated debug command (string) + """ + + return debug_command diff --git a/pilot/user/generic/loopingjob_definitions.py b/pilot/user/generic/loopingjob_definitions.py index ad3922578..9f64b65c0 100644 --- a/pilot/user/generic/loopingjob_definitions.py +++ b/pilot/user/generic/loopingjob_definitions.py @@ -34,6 +34,7 @@ def remove_unwanted_files(workdir, files): _files = [] for _file in files: if not (workdir == _file or + "prmon" in _file or "pilotlog" in _file or ".lib.tgz" in _file or ".py" in _file or diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index bd938c7b1..f02772739 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -5,9 +5,10 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os +import re import sys from collections import Set, Mapping, deque, OrderedDict @@ -21,9 +22,17 @@ zero_depth_bases = (str, bytes, Number, range, bytearray) # Python 3 iteritems = 'items' +from pilot.util.constants import ( + SUCCESS, + FAILURE, + SERVER_UPDATE_FINAL, + SERVER_UPDATE_NOT_DONE, + SERVER_UPDATE_TROUBLE, + get_pilot_version, +) + from pilot.common.errorcodes import ErrorCodes from pilot.util.container import execute -from pilot.util.constants import SUCCESS, FAILURE, SERVER_UPDATE_FINAL, SERVER_UPDATE_NOT_DONE, SERVER_UPDATE_TROUBLE, get_pilot_version from pilot.util.filehandling import dump import logging @@ -97,7 +106,7 @@ def display_architecture_info(): dump("/etc/issue") dump("$MACHTYPE", cmd="echo") else: - logger.info("\n%s" % stdout) + logger.info("\n%s", stdout) def get_batchsystem_jobid(): @@ -308,7 +317,7 @@ def inner(obj): pass # : unbound method iteritems() must be called # with OrderedDict instance as first argument (got nothing instead) - #logger.debug('exception caught for obj=%s: %s' % (str(obj), e)) + #logger.debug('exception caught for obj=%s: %s', (str(obj), e)) # Check for custom 
object instances - may subclass above too if hasattr(obj, '__dict__'): @@ -375,7 +384,7 @@ def check_for_final_server_update(update_server): if server_update == SERVER_UPDATE_FINAL or server_update == SERVER_UPDATE_TROUBLE: logger.info('server update done, finishing') break - logger.info('server update not finished (#%d/#%d)' % (i + 1, max_i)) + logger.info('server update not finished (#%d/#%d)', i + 1, max_i) sleep(30) i += 1 @@ -443,7 +452,7 @@ def show_memory_usage(): _value = extract_memory_usage_value(_stdout) except Exception: _value = "(unknown)" - logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n' % (_stdout, _value)) + logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n', _stdout, _value) def get_memory_usage(pid): @@ -539,3 +548,113 @@ def has_instruction_sets(instruction_sets): ret += '|%s' % i.upper() if ret else i.upper() return ret + + +def locate_core_file(cmd=None, pid=None): + """ + Locate the core file produced by gdb. + + :param cmd: optional command containing pid corresponding to core file (string). + :param pid: optional pid to use with core file (core.pid) (int). + :return: path to core file (string). + """ + + path = None + if not pid and cmd: + pid = get_pid_from_command(cmd) + if pid: + filename = 'core.%d' % pid + path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) + if os.path.exists(path): + logger.debug('found core file at: %s', path) + + else: + logger.debug('did not find %s in %s', filename, path) + else: + logger.warning('cannot locate core file since pid could not be extracted from command') + + return path + + +def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): + """ + Identify an explicit process id in the given command. + + Example: + cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' + -> pid = 19114 + + :param cmd: command containing a pid (string). + :param pattern: regex pattern (raw string). + :return: pid (int). + """ + + pid = None + match = re.search(pattern, cmd) + if match: + try: + pid = int(match.group(1)) + except Exception: + pid = None + else: + print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) + + return pid + + +def list_hardware(): + """ + Execute lshw to list local hardware. + + :return: lshw output (string). + """ + + exit_code, stdout, stderr = execute('lshw -numeric -C display', mute=True) + if 'Command not found' in stdout or 'Command not found' in stderr: + stdout = '' + return stdout + + +def get_display_info(): + """ + Extract the product and vendor from the lshw command. + E.g. + product: GD 5446 [1013:B8] + vendor: Cirrus Logic [1013] + -> GD 5446, Cirrus Logic + + :return: product (string), vendor (string). + """ + + vendor = '' + product = '' + stdout = list_hardware() + if stdout: + vendor_pattern = re.compile(r'vendor\:\ (.+)\ .') + product_pattern = re.compile(r'product\:\ (.+)\ .') + + for line in stdout.split('\n'): + if 'vendor' in line: + result = re.findall(vendor_pattern, line) + if result: + vendor = result[0] + elif 'product' in line: + result = re.findall(product_pattern, line) + if result: + product = result[0] + + return product, vendor + + +def get_key_value(catchall, key='SOMEKEY'): + """ + Return the value corresponding to key in catchall. + :param catchall: catchall free string. + :param key: key name (string). + :return: value (string). 
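For reference, a usage sketch of the get_key_value() helper introduced above (the catchall content is illustrative):

catchall = 'SOMEKEY=value1 OTHERKEY=value2 not_a_pair'
print(get_key_value(catchall, key='SOMEKEY'))   # value1
print(get_key_value(catchall, key='MISSING'))   # None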
+ """ + + # ignore any non-key-value pairs that might be present in the catchall string + _dic = dict(_str.split('=', 1) for _str in catchall.split() if '=' in _str) + + return _dic.get(key) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a90546207..7bb1caf94 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 -VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates -REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '22' # build number should be reset to '1' for every new development cycle +VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates +REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '62' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -26,8 +26,8 @@ UTILITY_WITH_PAYLOAD = 2 UTILITY_AFTER_PAYLOAD_STARTED = 3 UTILITY_AFTER_PAYLOAD_STARTED2 = 4 -UTILITY_AFTER_PAYLOAD = 5 -UTILITY_AFTER_PAYLOAD_FINISHED = 6 +UTILITY_AFTER_PAYLOAD_FINISHED = 5 +UTILITY_AFTER_PAYLOAD_FINISHED2 = 6 UTILITY_BEFORE_STAGEIN = 7 UTILITY_WITH_STAGEIN = 8 diff --git a/pilot/util/container.py b/pilot/util/container.py index 3ab76b66d..f220f6c02 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import subprocess from os import environ, getcwd, setpgrp #, getpgid #setsid @@ -42,8 +42,8 @@ def execute(executable, **kwargs): mute = kwargs.get('mute', False) mode = kwargs.get('mode', 'bash') cwd = kwargs.get('cwd', getcwd()) - stdout = kwargs.get('stdout', subprocess.PIPE) - stderr = kwargs.get('stderr', subprocess.PIPE) + stdout_name = kwargs.get('stdout', subprocess.PIPE) + stderr_name = kwargs.get('stderr', subprocess.PIPE) usecontainer = kwargs.get('usecontainer', False) returnproc = kwargs.get('returnproc', False) job = kwargs.get('job') @@ -72,7 +72,7 @@ def execute(executable, **kwargs): secret_key = sub_cmd.split('S3_SECRET_KEY=')[1] secret_key = 'S3_SECRET_KEY=' + secret_key executable_readable = executable_readable.replace(secret_key, 'S3_SECRET_KEY=********') - logger.info('executing command: %s' % executable_readable) + logger.info('executing command: %s', executable_readable) if mode == 'python': exe = ['/usr/bin/python'] + executable.split() @@ -80,27 +80,33 @@ def execute(executable, **kwargs): exe = ['/bin/bash', '-c', executable] # try: intercept exception such as OSError -> report e.g. 
error.RESOURCEUNAVAILABLE: "Resource temporarily unavailable" - process = subprocess.Popen(exe, - bufsize=-1, - stdout=stdout, - stderr=stderr, - cwd=cwd, - preexec_fn=setpgrp) #setsid) + if is_python3(): # Python 3 + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp, + encoding='utf-8', + errors='replace') + else: + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp) if returnproc: return process else: stdout, stderr = process.communicate() exit_code = process.poll() - # for Python 3, convert from byte-like object to str - if is_python3(): - stdout = stdout.decode('utf-8') - stderr = stderr.decode('utf-8') # remove any added \n if stdout and stdout.endswith('\n'): stdout = stdout[:-1] - return exit_code, stdout, stderr + return exit_code, stdout, stderr def containerise_executable(executable, **kwargs): @@ -128,8 +134,8 @@ def containerise_executable(executable, **kwargs): diagnostics = "" try: executable = container.wrapper(executable, **kwargs) - except Exception as e: - diagnostics = 'failed to execute wrapper function: %s' % e + except Exception as exc: + diagnostics = 'failed to execute wrapper function: %s' % exc logger.fatal(diagnostics) else: if executable == "": diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index efb144bd2..a1c4a07dc 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -41,12 +41,12 @@ testtransfertype: NULL pandaserver: https://pandaserver.cern.ch:25443 # pandaserver: https://aipanda007.cern.ch:25443 -# The URL for the iDDS server (update actual URL later) +# The URL for the iDDS server iddsserver: https://pandaserver.cern.ch:25443 -# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in ddebug mode) +# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode) heartbeat: 1800 -debug_heartbeat: 300 +debug_heartbeat: 60 # Heartbeat message file (only used when Pilot is not sending heartbeats to server) heartbeat_message: heartbeat.json @@ -69,7 +69,7 @@ maximum_input_file_sizes: 14336 MB # Size limit of payload stdout size during running. 
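The execute() change above relies on the fact that, in Python 3, passing encoding/errors to subprocess.Popen makes communicate() return str directly, so the manual decode step is no longer needed. A minimal standalone illustration:

import subprocess

process = subprocess.Popen(['/bin/bash', '-c', 'echo hello'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           encoding='utf-8',
                           errors='replace')  # Python 3 only
stdout, stderr = process.communicate()
exit_code = process.poll()
print(type(stdout).__name__, exit_code, stdout.strip())  # str 0 hello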
unit is in kB (value = 2 * 1024 ** 2) local_size_limit_stdout: 2097152 -# Looping job time limits; if job does not write anything in N hours, it is considered a looping job +# Looping job time limits; if job does not write anything in N minutes, it is considered to be a looping looping_verification_time: 900 # for both production and user analysis jobs, 2*3600 looping_limit_default: 7200 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 7858b4b65..caf0beebd 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -58,8 +58,8 @@ def mkdirs(workdir, chmod=0o770): # Python 2/3 os.makedirs(workdir) if chmod: os.chmod(workdir, chmod) - except Exception as e: - raise MKDirFailure(e) + except Exception as error: + raise MKDirFailure(error) def rmdirs(path): @@ -74,8 +74,8 @@ def rmdirs(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directories %s: %s" % (path, e)) + except OSError as error: + logger.warning("failed to remove directories %s: %s", path, error) else: status = True @@ -122,17 +122,17 @@ def write_file(path, contents, mute=True, mode='w', unique=False): if f: try: f.write(contents) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True f.close() if not mute: if 'w' in mode: - logger.info('created file: %s' % path) + logger.info('created file: %s', path) if 'a' in mode: - logger.info('appended file: %s' % path) + logger.info('appended file: %s', path) return status @@ -151,8 +151,8 @@ def open_file(filename, mode): f = None try: f = open(filename, mode) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) return f @@ -329,8 +329,8 @@ def read_list(filename): try: with open(filename, 'r') as filehandle: _list = load(filehandle) - except IOError as e: - logger.warning('failed to read %s: %s' % (filename, e)) + except IOError as error: + logger.warning('failed to read %s: %s', filename, error) return convert(_list) @@ -349,9 +349,9 @@ def read_json(filename): if f: try: dictionary = load(f) - except Exception as e: - logger.warning('exception caught: %s' % e) - #raise FileHandlingFailure(str(e)) + except Exception as error: + logger.warning('exception caught: %s', error) + #raise FileHandlingFailure(str(error)) else: f.close() @@ -359,8 +359,8 @@ def read_json(filename): if dictionary != {}: try: dictionary = convert(dictionary) - except Exception as e: - raise ConversionFailure(e) + except Exception as error: + raise ConversionFailure(error) return dictionary @@ -383,8 +383,8 @@ def write_json(filename, data, sort_keys=True, indent=4, separators=(',', ': ')) try: with open(filename, 'w') as fh: dumpjson(data, fh, sort_keys=sort_keys, indent=indent, separators=separators) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True @@ -434,8 +434,8 @@ def remove(path): try: os.remove(path) - except OSError as e: - logger.warning("failed to remove file: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove file: %s (%s, %s)", path, error.errno, error.strerror) return -1 return 0 @@ -449,8 +449,8 @@ def remove_dir_tree(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directory: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove directory: %s (%s, %s)", path, 
error.errno, error.strerror) return -1 return 0 @@ -466,7 +466,7 @@ def remove_files(workdir, files): ec = 0 if type(files) != list: - logger.warning('files parameter not a list: %s' % str(type(list))) + logger.warning('files parameter not a list: %s', str(type(list))) ec = -1 else: for f in files: @@ -524,6 +524,28 @@ def tar_files(wkdir, excludedfiles, logfile_name, attempt=0): return 0 +def move(path1, path2): + """ + Move a file from path1 to path2. + + :param path1: source path (string). + :param path2: destination path2 (string). + """ + + if not os.path.exists(path1): + logger.warning('file copy failure: path does not exist: %s', path1) + raise NoSuchFile("File does not exist: %s" % path1) + + try: + import shutil + shutil.move(path1, path2) + except IOError as error: + logger.warning("exception caught during file move: %s", error) + raise FileHandlingFailure(error) + else: + logger.info("moved %s to %s", path1, path2) + + def copy(path1, path2): """ Copy path1 to path2. @@ -535,16 +557,16 @@ def copy(path1, path2): """ if not os.path.exists(path1): - logger.warning('file copy failure: path does not exist: %s' % path1) + logger.warning('file copy failure: path does not exist: %s', path1) raise NoSuchFile("File does not exist: %s" % path1) try: copy2(path1, path2) - except IOError as e: - logger.warning("exception caught during file copy: %s" % e) - raise FileHandlingFailure(e) + except IOError as error: + logger.warning("exception caught during file copy: %s", error) + raise FileHandlingFailure(error) else: - logger.info("copied %s to %s" % (path1, path2)) + logger.info("copied %s to %s", path1, path2) def find_executable(name): @@ -574,8 +596,8 @@ def get_directory_size(directory="."): try: # convert to int and B size = int(stdout.split()[0]) * 1024 - except Exception as e: - logger.warning('exception caught while trying convert dirsize: %s' % e) + except Exception as error: + logger.warning('exception caught while trying convert dirsize: %s', error) return size @@ -593,13 +615,13 @@ def add_to_total_size(path, total_size): # Get the file size fsize = get_local_file_size(path) if fsize: - logger.info("size of file %s: %d B" % (path, fsize)) + logger.info("size of file %s: %d B", path, fsize) try: total_size += long(fsize) # Python 2 # noqa: F821 except Exception: total_size += int(fsize) # Python 3 (note order in try statement) else: - logger.warning("skipping file %s since it is not present" % path) + logger.warning("skipping file %s since it is not present", path) return total_size @@ -617,10 +639,10 @@ def get_local_file_size(filename): if os.path.exists(filename): try: file_size = os.path.getsize(filename) - except Exception as e: - logger.warning("failed to get file size: %s" % e) + except Exception as error: + logger.warning("failed to get file size: %s", error) else: - logger.warning("local file does not exist: %s" % filename) + logger.warning("local file does not exist: %s", filename) return file_size @@ -661,8 +683,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= try: f = open_file(filename, 'r') - except Exception as e: - logger.warning("failed to open file: %s, %s" % (filename, e)) + except Exception as error: + logger.warning("failed to open file: %s, %s", filename, error) else: firstline = True for line in f: @@ -682,8 +704,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= if convert_to_float: try: field = float(field) - except Exception as e: - logger.warning("failed to convert %s to float: %s 
(aborting)" % (field, e)) + except Exception as error: + logger.warning("failed to convert %s to float: %s (aborting)", field, error) return None tabledict[key].append(field) i += 1 @@ -884,7 +906,7 @@ def verify_file_list(list_of_files): diff = diff_lists(list_of_files, filtered_list) if diff: - logger.debug('found %d file(s) that do not exist (e.g. %s)' % (len(diff), diff[0])) + logger.debug('found %d file(s) that do not exist (e.g. %s)', len(diff), diff[0]) return filtered_list @@ -905,8 +927,8 @@ def find_latest_modified_file(list_of_files): try: latest_file = max(list_of_files, key=os.path.getmtime) mtime = int(os.path.getmtime(latest_file)) - except Exception as e: - logger.warning("int conversion failed for mod time: %s" % e) + except Exception as error: + logger.warning("int conversion failed for mod time: %s", error) latest_file = "" mtime = None @@ -925,17 +947,24 @@ def dump(path, cmd="cat"): if os.path.exists(path) or cmd == "echo": _cmd = "%s %s" % (cmd, path) exit_code, stdout, stderr = execute(_cmd) - logger.info("%s:\n%s" % (_cmd, stdout + stderr)) + logger.info("%s:\n%s", _cmd, stdout + stderr) else: - logger.info("path %s does not exist" % path) + logger.info("path %s does not exist", path) -def establish_logging(args, filename=config.Pilot.pilotlog): +def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog, loglevel=0): """ Setup and establish logging. - :param args: pilot arguments object. - :param filename: name of log file. + Option loglevel can be used to decide which (predetermined) logging format to use. + Example: + loglevel=0: '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' + loglevel=1: 'ts=%(asctime)s level=%(levelname)-8s event=%(name)-32s.%(funcName)-25s msg="%(message)s"' + + :param debug: debug mode (Boolean), + :param nopilotlog: True when pilot log is not known (Boolean). + :param filename: name of log file (string). + :param loglevel: selector for logging level (int). 
:return: """ @@ -944,8 +973,8 @@ def establish_logging(args, filename=config.Pilot.pilotlog): _logger.propagate = False console = logging.StreamHandler(sys.stdout) - if args.debug: - format_str = '%(asctime)s | %(levelname)-8s | %(threadName)-19s | %(name)-32s | %(funcName)-25s | %(message)s' + if debug: + format_str = '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' level = logging.DEBUG else: format_str = '%(asctime)s | %(levelname)-8s | %(message)s' @@ -953,7 +982,7 @@ def establish_logging(args, filename=config.Pilot.pilotlog): #rank, maxrank = get_ranks_info() #if rank is not None: # format_str = 'Rank {0} |'.format(rank) + format_str - if args.nopilotlog: + if nopilotlog: logging.basicConfig(level=level, format=format_str, filemode='w') else: logging.basicConfig(filename=filename, level=level, format=format_str, filemode='w') @@ -978,7 +1007,7 @@ def remove_core_dumps(workdir): coredumps = coredumps1 + coredumps2 if coredumps: for coredump in coredumps: - logger.info("removing core dump: %s" % str(coredump)) + logger.info("removing core dump: %s", str(coredump)) remove(coredump) found = True @@ -1049,14 +1078,14 @@ def copy_pilot_source(workdir): diagnostics = "" srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot2') try: - logger.debug('copy %s to %s' % (srcdir, workdir)) + logger.debug('copy %s to %s', srcdir, workdir) cmd = 'cp -r %s/* %s' % (srcdir, workdir) exit_code, stdout, stderr = execute(cmd) if exit_code != 0: diagnostics = 'file copy failed: %d, %s' % (exit_code, stdout) logger.warning(diagnostics) - except Exception as e: - diagnostics = 'exception caught when copying pilot2 source: %s' % e + except Exception as error: + diagnostics = 'exception caught when copying pilot2 source: %s' % error logger.warning(diagnostics) return diagnostics @@ -1072,7 +1101,44 @@ def create_symlink(from_path='', to_path=''): try: os.symlink(from_path, to_path) - except Exception as e: - logger.warning('failed to create symlink from %s to %s: %s' % (from_path, to_path, e)) + except Exception as error: + logger.warning('failed to create symlink from %s to %s: %s', from_path, to_path, error) else: - logger.debug('created symlink from %s to %s' % (from_path, to_path)) + logger.debug('created symlink from %s to %s', from_path, to_path) + + +def locate_file(pattern): + """ + Locate a file defined by the pattern. + + Example: + pattern = os.path.join(os.getcwd(), '**/core.123') + -> /Users/Paul/Development/python/tt/core.123 + + :param pattern: pattern name (string). + :return: path (string). + """ + + path = None + for fname in glob(pattern): + if os.path.isfile(fname): + path = fname + + return path + + +def find_last_line(filename): + """ + Find the last line in a (not too large) file. + + :param filename: file name, full path (string). + :return: last line (string). 
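One caveat worth noting for locate_file() above: the '**' in the example pattern is only expanded across subdirectories when glob is called with recursive=True (Python 3.5+); without it, '**' behaves like a single '*'. A hedged usage sketch:

import glob
import os

pattern = os.path.join(os.getcwd(), '**/core.*')
matches = [f for f in glob.glob(pattern, recursive=True) if os.path.isfile(f)]
path = matches[-1] if matches else None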
+ """ + + last_line = "" + with open(filename) as f: + for line in f: + pass + last_line = line + + return last_line diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index 643253efb..918915497 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import os.path @@ -68,7 +68,7 @@ def remove_job_request_file(): path = get_job_request_file_name() if os.path.exists(path): if remove(path) == 0: - logger.info('removed %s' % path) + logger.info('removed %s', path) else: logger.debug('there is no job request file') diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index e2c451a07..b3b97a0a4 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -5,13 +5,13 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.common.errorcodes import ErrorCodes -from pilot.util.auxiliary import whoami, set_pilot_state, cut_output +from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file from pilot.util.config import config from pilot.util.container import execute -from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list +from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list, copy from pilot.util.parameters import convert_to_int from pilot.util.processes import kill_processes from pilot.util.timing import time_stamp @@ -59,20 +59,49 @@ def looping_job(job, mt): # the payload process is considered to be looping if it's files have not been touched within looping_limit time if time_last_touched: ct = int(time.time()) - logger.info('current time: %d' % ct) - logger.info('last time files were touched: %d' % time_last_touched) - logger.info('looping limit: %d s' % looping_limit) + logger.info('current time: %d', ct) + logger.info('last time files were touched: %d', time_last_touched) + logger.info('looping limit: %d s', looping_limit) if ct - time_last_touched > looping_limit: try: + # first produce core dump and copy it + create_core_dump(pid=job.pid, workdir=job.workdir) + # set debug mode to prevent core file from being removed before log creation + job.debug = True kill_looping_job(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) else: logger.info('no files were touched') return exit_code, diagnostics +def create_core_dump(pid=None, workdir=None): + """ + Create core dump and copy it to work directory + """ + + if not pid or not workdir: + logger.warning('cannot create core file since pid or workdir is unknown') + return + + cmd = 'gdb --pid %d -ex \'generate-core-file\'' % pid + exit_code, stdout, stderr = execute(cmd) + if not exit_code: + path = locate_core_file(pid=pid) + if path: + try: + copy(path, workdir) + except Exception as error: + logger.warning('failed to copy core file: %s', error) + else: + logger.debug('copied core dump to workdir') + + else: + logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr) + + def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. 
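The looping check in looping_job() above boils down to: the payload is considered looping when none of its recently-updated files have been touched within looping_limit seconds. A compact sketch of just that comparison (helper name illustrative):

import time

def is_looping(time_last_touched, looping_limit):
    """Sketch: True if the payload files have not been touched within looping_limit seconds."""
    if not time_last_touched:
        return False  # no file information available yet
    return int(time.time()) - time_last_touched > looping_limit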
@@ -98,14 +127,14 @@ def get_time_for_last_touch(job, mt, looping_limit): # remove unwanted list items (*.py, *.pyc, workdir, ...) files = loopingjob_definitions.remove_unwanted_files(job.workdir, files) if files: - logger.info('found %d files that were recently updated' % len(files)) - logger.debug('recent files:\n%s' % files) + logger.info('found %d files that were recently updated', len(files)) + logger.debug('recent files:\n%s', files) updated_files = verify_file_list(files) # now get the mod times for these file, and identify the most recently update file latest_modified_file, mtime = find_latest_modified_file(updated_files) if latest_modified_file: - logger.info("file %s is the most recently updated file (at time=%d)" % (latest_modified_file, mtime)) + logger.info("file %s is the most recently updated file (at time=%d)", latest_modified_file, mtime) else: logger.warning('looping job algorithm failed to identify latest updated file') return mt.ct_looping_last_touched @@ -120,7 +149,7 @@ def get_time_for_last_touch(job, mt, looping_limit): # cut the output if too long stdout = cut_output(stdout) stderr = cut_output(stderr) - logger.warning('find command failed: %d, %s, %s' % (exit_code, stdout, stderr)) + logger.warning('find command failed: %d, %s, %s', exit_code, stdout, stderr) return mt.ct_looping_last_touched @@ -140,19 +169,19 @@ def kill_looping_job(job): cmd = 'ps -fwu %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ls -ltr %s' % (job.workdir) exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'pstree -g -a' exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # set the relevant error code if job.state == 'stagein': @@ -184,6 +213,6 @@ def get_looping_job_limit(): looping_limit = convert_to_int(config.Pilot.looping_limit_default, default=2 * 3600) looping_limit_min_default = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600) looping_limit = max(looping_limit, looping_limit_min_default) - logger.info("using looping job limit: %d s" % looping_limit) + logger.info("using looping job limit: %d s", looping_limit) return looping_limit diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index d88d8dddf..decfd0c80 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 from os import environ, path, getcwd #, chmod @@ -20,13 +20,56 @@ errors = ErrorCodes() +def containerise_general_command(job, container_options, label='command', container_type='container'): + """ + Containerise a general command by execution in a script that can be run in a container. + + :param job: job object. + :param label: label (string). + :param container_options: container options from queuedata (string). + :param container_type: optional 'container/bash' + :raises PilotException: for general failures. 
+ :return: + """ + + cwd = getcwd() + + if container_type == 'container': + # add bits and pieces needed to run the cmd in a container + pilot_user = environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + try: + cmd = user.create_middleware_container_command(job.workdir, job.debug_command, container_options, label=label, proxy=False) + except PilotException as e: + raise e + else: + logger.warning('not yet implemented') + raise PilotException + + try: + logger.info('*** executing %s (logging will be redirected) ***', label) + exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) + else: + if exit_code == 0: + logger.info('*** %s has finished ***', label) + else: + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) + + def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, external_dir, label='stage-in', container_type='container'): """ Containerise the middleware by performing stage-in/out steps in a script that in turn can be run in a container. + Note: a container will only be used for option container_type='container'. If this is 'bash', then stage-in/out will still be done by a script, but not containerised. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). @@ -37,9 +80,9 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, :param external_dir: input or output files directory (string). :param label: optional 'stage-in/out' (String). 
:param container_type: optional 'container/bash' - :return: :raises StageInFailure: for stage-in failures :raises StageOutFailure: for stage-out failures + :return: """ cwd = getcwd() @@ -61,30 +104,28 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, except PilotException as e: raise e else: - logger.warning('%s will not be done in a container (but it will be done by a script)' % label) + logger.warning('%s will not be done in a container (but it will be done by a script)', label) try: - logger.info('*** executing %s (logging will be redirected) ***' % label) + logger.info('*** executing %s (logging will be redirected) ***', label) exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) - except Exception as e: - logger.info('*** %s has failed ***' % label) - logger.warning('exception caught: %s' % e) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) else: if exit_code == 0: - logger.info('*** %s has finished ***' % label) + logger.info('*** %s has finished ***', label) else: - logger.info('*** %s has failed ***' % label) - logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) # write stdout+stderr to files try: _stdout_name, _stderr_name = get_logfile_names(label) write_file(path.join(job.workdir, _stdout_name), stdout, mute=False) write_file(path.join(job.workdir, _stderr_name), stderr, mute=False) - logger.debug('stage-in/out stdout=\n%s' % stdout) - logger.debug('stage-in/out stderr=\n%s' % stderr) - except PilotException as e: - msg = 'exception caught: %s' % e + except PilotException as exc: + msg = 'exception caught: %s' % exc if label == 'stage-in': raise StageInFailure(msg) else: @@ -93,8 +134,8 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, # handle errors, file statuses, etc (the stage-in/out scripts write errors and file status to a json file) try: handle_updated_job_object(job, xdata, label=label) - except PilotException as e: - raise e + except PilotException as exc: + raise exc def get_script_path(script): @@ -106,8 +147,6 @@ def get_script_path(script): """ srcdir = environ.get('PILOT_SOURCE_DIR', '.') - logger.debug('PILOT_SOURCE_DIR=%s' % srcdir) - _path = path.join(srcdir, 'pilot/scripts') if not path.exists(_path): _path = path.join(srcdir, 'pilot2') @@ -123,6 +162,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext """ Get the middleware container execution command. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). 
@@ -145,8 +186,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext # write file data to file try: status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary) - except Exception as e: - diagnostics = 'exception caught in get_command(): %s' % e + except Exception as exc: + diagnostics = 'exception caught in get_command(): %s' % exc logger.warning(diagnostics) raise PilotException(diagnostics) else: @@ -238,8 +279,8 @@ def handle_updated_job_object(job, xdata, label='stage-in'): fspec.turl = file_dictionary[fspec.lfn][3] fspec.checksum['adler32'] = file_dictionary[fspec.lfn][4] fspec.filesize = file_dictionary[fspec.lfn][5] - except Exception as e: - msg = "exception caught while reading file dictionary: %s" % e + except Exception as exc: + msg = "exception caught while reading file dictionary: %s" % exc logger.warning(msg) if label == 'stage-in': raise StageInFailure(msg) @@ -314,8 +355,8 @@ def get_filedata(data): 'istar': fspec.is_tar, 'accessmode': fspec.accessmode, 'storagetoken': fspec.storage_token} - except Exception as e: - logger.warning('exception caught in get_filedata(): %s' % e) + except Exception as exc: + logger.warning('exception caught in get_filedata(): %s', exc) return file_dictionary @@ -376,19 +417,4 @@ def use_middleware_script(container_type): :return: Boolean (True if middleware should be containerised). """ - # see definition in atlas/container.py, but also see useful code below (in case middleware is available locally) - #:param cmd: middleware command, used to determine if the container should be used or not (string). - #usecontainer = False - #if not config.Container.middleware_container: - # logger.info('container usage for middleware is not allowed by pilot config') - #else: - # # if the middleware is available locally, do not use container - # if find_executable(cmd) == "": - # usecontainer = True - # logger.info('command %s is not available locally, will attempt to use container' % cmd) - # else: - # logger.info('command %s is available locally, no need to use container' % cmd) - - # FOR TESTING - #return True if config.Container.middleware_container_stagein_script else False return True if container_type == 'container' or container_type == 'bash' else False diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 27252ed9a..c1f442d68 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -54,15 +54,15 @@ def job_monitor_tasks(job, mt, args): check_hz() try: cpuconsumptiontime = get_current_cpu_consumption_time(job.pid) - except Exception as e: - diagnostics = "Exception caught: %s" % e + except Exception as error: + diagnostics = "Exception caught: %s" % error logger.warning(diagnostics) exit_code = get_exception_error_code(diagnostics) return exit_code, diagnostics else: job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time for pid=%d: %f (rounded to %d)' % (job.pid, cpuconsumptiontime, job.cpuconsumptiontime)) + logger.info('CPU consumption time for pid=%d: %f (rounded to %d)', job.pid, cpuconsumptiontime, job.cpuconsumptiontime) # check how many cores the payload is using set_number_used_cores(job) @@ -123,7 +123,7 @@ def display_oom_info(payload_pid): payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) - logger.info('oom_score(pilot) = %s, oom_score(payload) = %s' % (pilot_score, payload_score)) + 
logger.info('oom_score(pilot) = %s, oom_score(payload) = %s', pilot_score, payload_score) def get_score(pid): @@ -136,8 +136,8 @@ def get_score(pid): try: score = '%s' % read_file('/proc/%d/oom_score' % pid) - except Exception as e: - logger.warning('caught exception reading oom_score: %s' % e) + except Exception as error: + logger.warning('caught exception reading oom_score: %s', error) score = 'UNKNOWN' else: if score.endswith('\n'): @@ -207,8 +207,8 @@ def verify_memory_usage(current_time, mt, job): # is the used memory within the allowed limit? try: exit_code, diagnostics = memory.memory_usage(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) exit_code = -1 if exit_code != 0: logger.warning('ignoring failure to parse memory monitor output') @@ -291,8 +291,8 @@ def verify_looping_job(current_time, mt, job): # is the job looping? try: exit_code, diagnostics = looping_job(job, mt) - except Exception as e: - diagnostics = 'exception caught in looping job algorithm: %s' % e + except Exception as error: + diagnostics = 'exception caught in looping job algorithm: %s' % error logger.warning(diagnostics) if "No module named" in diagnostics: exit_code = errors.BLACKHOLE @@ -371,15 +371,15 @@ def verify_running_processes(current_time, mt, pid): nproc = get_number_of_child_processes(pid) try: nproc_env = int(os.environ.get('PILOT_MAXNPROC', 0)) - except Exception as e: - logger.warning('failed to convert PILOT_MAXNPROC to int: %s' % e) + except Exception as error: + logger.warning('failed to convert PILOT_MAXNPROC to int: %s', error) else: if nproc > nproc_env: # set the maximum number of found processes os.environ['PILOT_MAXNPROC'] = str(nproc) if nproc_env > 0: - logger.info('maximum number of monitored processes: %d' % nproc_env) + logger.info('maximum number of monitored processes: %d', nproc_env) return 0, "" @@ -417,19 +417,19 @@ def utility_monitor(job): try: proc1 = execute(utility_command, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, queuedata=job.infosys.queuedata) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the # command has been launched job.utilities[utcmd] = [proc1, utility_subprocess_launches + 1, utility_command] else: - logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again' % utcmd) + logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again', utcmd) else: # check the utility output (the selector option adds a substring to the output file name) filename = usercommon.get_utility_command_output_filename(utcmd, selector=True) path = os.path.join(job.workdir, filename) if not os.path.exists(path): - logger.warning('file: %s does not exist' % path) + logger.warning('file: %s does not exist', path) time.sleep(10) @@ -444,10 +444,9 @@ def get_local_size_limit_stdout(bytes=True): try: localsizelimit_stdout = int(config.Pilot.local_size_limit_stdout) - except Exception as e: + except Exception as error: localsizelimit_stdout = 2097152 - logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)' % - (e, localsizelimit_stdout)) + logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)', error, 
localsizelimit_stdout) # convert from kB to B if bytes: @@ -484,17 +483,17 @@ def check_payload_stdout(job): # now loop over all files and check each individually (any large enough file will fail the job) for filename in file_list: - logger.debug('check_payload_stdout: filename=%s' % filename) + logger.debug('check_payload_stdout: filename=%s', filename) if "job.log.tgz" in filename: - logger.info("skipping file size check of file (%s) since it is a special log file" % (filename)) + logger.info("skipping file size check of file (%s) since it is a special log file", filename) continue if os.path.exists(filename): try: # get file size in bytes fsize = os.path.getsize(filename) - except Exception as e: - logger.warning("could not read file size of %s: %s" % (filename, e)) + except Exception as error: + logger.warning("could not read file size of %s: %s", filename, error) else: # is the file too big? localsizelimit_stdout = get_local_size_limit_stdout() @@ -517,9 +516,9 @@ def check_payload_stdout(job): # remove any lingering input files from the work dir exit_code = remove_files(job.workdir, lfns) else: - logger.info("payload log (%s) within allowed size limit (%d B): %d B" % (os.path.basename(filename), localsizelimit_stdout, fsize)) + logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize) else: - logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet" % filename) + logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename) return exit_code, diagnostics @@ -539,7 +538,7 @@ def check_local_space(initial=True): # is there enough local space to run a job? cwd = os.getcwd() - logger.debug('checking local space on %s' % cwd) + logger.debug('checking local space on %s', cwd) spaceleft = convert_mb_to_b(get_local_disk_space(cwd)) # B (diskspace is in MB) free_space_limit = human2bytes(config.Pilot.free_space_limit) if initial else human2bytes(config.Pilot.free_space_limit_running) @@ -549,7 +548,7 @@ def check_local_space(initial=True): ec = errors.NOLOCALSPACE logger.warning(diagnostics) else: - logger.info('sufficient remaining disk space (%d B)' % spaceleft) + logger.info('sufficient remaining disk space (%d B)', spaceleft) return ec, diagnostics @@ -578,11 +577,11 @@ def check_work_dir(job): exit_code = errors.USERDIRTOOLARGE diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \ (job.workdir, workdirsize, maxwdirsize) - logger.fatal("%s" % diagnostics) + logger.fatal("%s", diagnostics) cmd = 'ls -altrR %s' % job.workdir _ec, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # kill the job # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED") @@ -598,13 +597,13 @@ def check_work_dir(job): # remeasure the size of the workdir at this point since the value is stored below workdirsize = get_directory_size(directory=job.workdir) else: - logger.info("size of work directory %s: %d B (within %d B limit)" % (job.workdir, workdirsize, maxwdirsize)) + logger.info("size of work directory %s: %d B (within %d B limit)", job.workdir, workdirsize, maxwdirsize) # Store the measured disk space (the max value will later be sent with the job metrics) if workdirsize > 0: job.add_workdir_size(workdirsize) else: - logger.warning('job work dir does not exist: %s' % job.workdir) + logger.warning('job work dir 
does not exist: %s', job.workdir) else: logger.warning('skipping size check of workdir since it has not been created yet') @@ -621,17 +620,17 @@ def get_max_allowed_work_dir_size(queuedata): try: maxwdirsize = convert_mb_to_b(get_maximum_input_sizes()) # from MB to B, e.g. 16336 MB -> 17,129,537,536 B - except Exception as e: + except Exception as error: max_input_size = get_max_input_size() maxwdirsize = max_input_size + config.Pilot.local_size_limit_stdout * 1024 logger.info("work directory size check will use %d B as a max limit (maxinputsize [%d B] + local size limit for" - " stdout [%d B])" % (maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024)) - logger.warning('conversion caught exception: %s' % e) + " stdout [%d B])", maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024) + logger.warning('conversion caught exception: %s', error) else: # grace margin, as discussed in https://its.cern.ch/jira/browse/ATLASPANDA-482 margin = 10.0 # percent, read later from somewhere maxwdirsize = int(maxwdirsize * (1 + margin / 100.0)) - logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)" % maxwdirsize) + logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)", maxwdirsize) return maxwdirsize @@ -654,8 +653,8 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = int(_maxinputsize) # MB else: # convert to B int _maxinputsize = int(_maxinputsize) * 1024 * 1024 # MB -> B - except Exception as e: - logger.warning("schedconfig.maxinputsize: %s" % e) + except Exception as error: + logger.warning("schedconfig.maxinputsize: %s", error) if megabyte: _maxinputsize = max_input_file_sizes_mb else: @@ -667,9 +666,9 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = max_input_file_sizes if megabyte: - logger.info("max input size = %d MB (pilot default)" % _maxinputsize) + logger.info("max input size = %d MB (pilot default)", _maxinputsize) else: - logger.info("Max input size = %d B (pilot default)" % _maxinputsize) + logger.info("Max input size = %d B (pilot default)", _maxinputsize) return _maxinputsize @@ -693,12 +692,12 @@ def check_output_file_sizes(job): fsize = get_local_file_size(path) max_fsize = human2bytes(config.Pilot.maximum_output_file_size) if fsize and fsize < max_fsize: - logger.info('output file %s is within allowed size limit (%d B < %d B)' % (path, fsize, max_fsize)) + logger.info('output file %s is within allowed size limit (%d B < %d B)', path, fsize, max_fsize) else: exit_code = errors.OUTPUTFILETOOLARGE diagnostics = 'output file %s is not within allowed size limit (%d B > %d B)' % (path, fsize, max_fsize) logger.warning(diagnostics) else: - logger.info('output file size check: skipping output file %s since it does not exist' % path) + logger.info('output file size check: skipping output file %s since it does not exist', path) return exit_code, diagnostics diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8abdbb64f..c51b717ed 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import time @@ -47,8 +47,8 @@ def find_processes_in_group(cpids, pid): try: thispid = int(lines[i].split()[0]) thisppid = int(lines[i].split()[1]) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + 
logger.warning('exception caught: %s', error) if thisppid == pid: find_processes_in_group(cpids, thispid) @@ -84,7 +84,7 @@ def get_process_commands(euid, pids): exit_code, stdout, stderr = execute(cmd, mute=True) if exit_code != 0 or stdout == '': - logger.warning('ps command failed: %d, \"%s\", \"%s\"' % (exit_code, stdout, stderr)) + logger.warning('ps command failed: %d, \"%s\", \"%s\"', exit_code, stdout, stderr) else: # extract the relevant processes p_commands = stdout.split('\n') @@ -153,13 +153,13 @@ def kill_processes(pid): return children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -177,6 +177,8 @@ def kill_processes(pid): kill_process(i) # kill any remaining orphan processes + # note: this should no longer be necessary since ctypes has made sure all subprocesses are parented + # if orphan process killing is not desired, set env var PILOT_NOKILL kill_orphans() @@ -193,13 +195,13 @@ def kill_child_processes(pid): # reverse the process order so that the athena process is killed first (otherwise the stdout will be truncated) children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -229,26 +231,26 @@ def kill_process_group(pgrp): _sleep = True # kill the process gracefully - logger.info("killing group process %d" % pgrp) + logger.info("killing group process %d", pgrp) try: os.killpg(pgrp, signal.SIGTERM) - except Exception as e: - logger.warning("exception thrown when killing child group process under SIGTERM: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) _sleep = False else: - logger.info("SIGTERM sent to process group %d" % pgrp) + logger.info("SIGTERM sent to process group %d", pgrp) if _sleep: _t = 30 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) try: os.killpg(pgrp, signal.SIGKILL) - except Exception as e: - logger.warning("exception thrown when killing child group process with SIGKILL: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) else: - logger.info("SIGKILL sent to process group %d" % pgrp) + logger.info("SIGKILL sent to process group %d", pgrp) status = True return status @@ -268,7 +270,7 @@ def kill_process(pid): kill(pid, signal.SIGTERM) _t = 10 - logger.info("sleeping %d s to allow process to exit" % _t) + logger.info("sleeping %d s to allow process to exit", _t) time.sleep(_t) # now do a hard kill 
just in case some processes haven't gone away @@ -289,10 +291,10 @@ def kill(pid, sig): status = False try: os.kill(pid, sig) - except Exception as e: - logger.warning("exception thrown when killing process %d with signal=%d: %s" % (pid, sig, e)) + except Exception as error: + logger.warning("exception thrown when killing process %d with signal=%d: %s", pid, sig, error) else: - logger.info("killed process %d with signal=%d" % (pid, sig)) + logger.info("killed process %d with signal=%d", pid, sig) status = True return status @@ -311,12 +313,12 @@ def get_number_of_child_processes(pid): n = 0 try: find_processes_in_group(children, pid) - except Exception as e: - logger.warning("exception caught in find_processes_in_group: %s" % e) + except Exception as error: + logger.warning("exception caught in find_processes_in_group: %s", error) else: if pid: n = len(children) - logger.info("number of running child processes to parent process %d: %d" % (pid, n)) + logger.info("number of running child processes to parent process %d: %d", pid, n) else: logger.debug("pid not yet set") return n @@ -333,16 +335,16 @@ def killpg(pid, sig, args): try: os.killpg(int(pid), sig) - except Exception as e: - logger.warning("failed to execute killpg(): %s" % e) + except Exception as error: + logger.warning("failed to execute killpg(): %s", error) cmd = 'kill -%d %s' % (sig, pid) exit_code, rs, stderr = execute(cmd) if exit_code != 0: logger.warning(rs) else: - logger.info("killed orphaned process %s (%s)" % (pid, args)) + logger.info("killed orphaned process %s (%s)", pid, args) else: - logger.info("killed orphaned process group %s (%s)" % (pid, args)) + logger.info("killed orphaned process group %s (%s)", pid, args) def get_pilot_pid_from_processes(_processes, pattern): @@ -362,8 +364,8 @@ def get_pilot_pid_from_processes(_processes, pattern): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'pilot.py' in args and 'python' in args: pilot_pid = pid @@ -403,30 +405,29 @@ def kill_orphans(): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'cvmfs2' in args: - logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'" % - (pid, ppid, args)) + logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'", pid, ppid, args) elif 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args: - logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid, ppid, args) elif ppid == '1': count += 1 - logger.info("found orphan process: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("found orphan process: pid=%s, ppid=%s, args='%s'", pid, ppid, args) if 'bash' in args or ('python' in args and 'pilot.py' in args): logger.info("will not kill bash process") else: killpg(pid, signal.SIGTERM, args) _t = 10 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) killpg(pid, signal.SIGKILL, args) if count == 0: logger.info("did not find any orphan processes") else: - logger.info("found %d orphan process(es)" % count) + logger.info("found 
%d orphan process(es)", count) def get_max_memory_usage_from_cgroups(): @@ -451,19 +452,19 @@ def get_max_memory_usage_from_cgroups(): if ":memory:" in out: pos = out.find('/') path = out[pos:] - logger.info("extracted path = %s" % path) + logger.info("extracted path = %s", path) pre = get_cgroups_base_path() if pre != "": path = pre + os.path.join(path, "memory.max_usage_in_bytes") - logger.info("path to CGROUPS memory info: %s" % path) + logger.info("path to CGROUPS memory info: %s", path) max_memory = read_file(path) else: logger.info("CGROUPS base path could not be extracted - not a CGROUPS site") else: - logger.warning("invalid format: %s (expected ..:memory:[path])" % out) + logger.warning("invalid format: %s (expected ..:memory:[path])", out) else: - logger.info("path %s does not exist (not a CGROUPS site)" % path) + logger.info("path %s does not exist (not a CGROUPS site)", path) return max_memory @@ -516,7 +517,7 @@ def get_instant_cpu_consumption_time(pid): hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) if type(hz) != int: - logger.warning('unknown SC_CLK_TCK: %s' % str(hz)) + logger.warning('unknown SC_CLK_TCK: %s', str(hz)) return 0.0 if pid and hz and hz > 0: @@ -584,21 +585,21 @@ def cleanup(job, args): # make sure the workdir is deleted if args.cleanup: if remove_dir_tree(job.workdir): - logger.info('removed %s' % job.workdir) + logger.info('removed %s', job.workdir) if os.path.exists(job.workdir): - logger.warning('work directory still exists: %s' % job.workdir) + logger.warning('work directory still exists: %s', job.workdir) else: - logger.debug('work directory was removed: %s' % job.workdir) + logger.debug('work directory was removed: %s', job.workdir) else: - logger.info('workdir not removed %s' % job.workdir) + logger.info('workdir not removed %s', job.workdir) # collect any zombie processes job.collect_zombies(tn=10) logger.info("collected zombie processes") if job.pid: - logger.info("will now attempt to kill all subprocesses of pid=%d" % job.pid) + logger.info("will now attempt to kill all subprocesses of pid=%d", job.pid) kill_processes(job.pid) else: logger.warning('cannot kill any subprocesses since job.pid is not set') @@ -628,3 +629,130 @@ def threads_aborted(abort_at=2): aborted = True return aborted + + +def convert_ps_to_dict(output, pattern=r'(\d+) (\d+) (\d+) (.+)'): + """ + Convert output from a ps command to a dictionary. + + Example: ps axo pid,ppid,pgid,cmd + PID PPID PGID COMMAND + 22091 6672 22091 bash + 32581 22091 32581 ps something;sdfsdfds/athena.py ddfg + -> dictionary = { 'PID': [22091, 32581], 'PPID': [22091, 6672], .. , 'COMMAND': ['ps ..', 'bash']} + + :param output: ps stdout (string). + :param pattern: regex pattern matching the ps output (raw string). + :return: dictionary. + """ + + dictionary = {} + first_line = [] # e.g. PID PPID PGID COMMAND + + for line in output.split('\n'): + try: + # remove leading and trailing spaces + line = line.strip() + # remove multiple spaces inside the line + _l = re.sub(' +', ' ', line) + + if first_line == []: + _l = [_f for _f in _l.split(' ') if _f] + first_line = _l + for i in range(len(_l)): + dictionary[_l[i]] = [] + else: # e.g. 
22091 6672 22091 bash
+                match = re.search(pattern, _l)
+                if match:
+                    for i in range(len(first_line)):
+                        try:
+                            var = int(match.group(i + 1))
+                        except Exception:
+                            var = match.group(i + 1)
+                        dictionary[first_line[i]].append(var)
+
+        except Exception as error:
+            logger.warning('unexpected format of utility output: %s', error)
+
+    return dictionary
+
+
+def get_trimmed_dictionary(keys, dictionary):
+    """
+    Return a sub-dictionary with only the given keys.
+
+    :param keys: keys to keep (list).
+    :param dictionary: full dictionary.
+    :return: trimmed dictionary.
+    """
+
+    subdictionary = {}
+    for key in keys:
+        if key in dictionary:
+            subdictionary[key] = dictionary[key]
+
+    return subdictionary
+
+
+def find_cmd_pids(cmd, ps_dictionary):
+    """
+    Find all pids for the given command.
+    Example: cmd = 'athena.py' -> pids = [1234, 2267] (in case there are two pilots running on the WN).
+
+    :param cmd: command (string).
+    :param ps_dictionary: converted ps output (dictionary).
+    """
+
+    pids = []
+    i = -1
+    for _cmd in ps_dictionary.get('COMMAND'):
+        i += 1
+        if cmd in _cmd:
+            pids.append(ps_dictionary.get('PID')[i])
+    return pids
+
+
+def find_pid(pandaid, ps_dictionary):
+    """
+    Find the process id for the command that contains 'export PandaID=%s'.
+
+    :param pandaid: PanDA ID (string).
+    :param ps_dictionary: ps output dictionary.
+    :return: pid (int).
+    """
+
+    pid = -1
+    i = -1
+    pandaid_cmd = 'export PandaID=%s' % pandaid
+    for _cmd in ps_dictionary.get('COMMAND'):
+        i += 1
+        if pandaid_cmd in _cmd:
+            pid = ps_dictionary.get('PID')[i]
+            break
+
+    return pid
+
+
+def is_child(pid, pandaid_pid, dictionary):
+    """
+    Is the given pid a child process of the pandaid_pid?
+    Proceed recursively until the parent pandaid_pid has been found, or return False if it fails to find it.
+    """
+
+    try:
+        # where are we at in the PID list?
+        index = dictionary.get('PID').index(pid)
+    except ValueError:
+        # not in the list
+        return False
+    else:
+        # get the corresponding ppid
+        ppid = dictionary.get('PPID')[index]
+
+        logger.debug('index=%d pid=%s ppid=%s pandaid_pid=%s', index, pid, ppid, pandaid_pid)
+        # is the current parent the same as the pandaid_pid? if yes, we are done
+        if ppid == pandaid_pid:
+            return True
+        else:
+            # try another pid
+            return is_child(ppid, pandaid_pid, dictionary)
diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py
index b00e70675..717d17f56 100644
--- a/pilot/util/tracereport.py
+++ b/pilot/util/tracereport.py
@@ -102,6 +102,13 @@ def init(self, job):
         exit_code, stdout, stderr = execute(cmd)
         self['uuid'] = stdout.replace('-', '')
 
+    def get_value(self, key):
+        """
+        Return the value for the given key, or None if the key does not exist.
+        """
+
+        return self.get(key, None)
+
     def verify_trace(self):
         """
         Verify the trace consistency.
@@ -133,6 +140,11 @@ def send(self):
 
         :return: Boolean.
""" + # only send trace if it is actually required (can be turned off with pilot option) + if environ.get('PILOT_USE_RUCIO_TRACES', 'True') == 'False': + logger.debug('rucio trace does not need to be sent') + return True + url = config.Rucio.url logger.info("tracing server: %s" % url) logger.info("sending tracing report: %s" % str(self)) diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index f23bb86f0..e96d3ba94 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -5,13 +5,13 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os import re -from pilot.util.disk import disk_usage from pilot.info import infosys +from pilot.util.disk import disk_usage import logging logger = logging.getLogger(__name__) @@ -36,8 +36,8 @@ def get_local_disk_space(path): if not diskpipe.close(): try: disk = float(disks.splitlines()[1].split()[3]) - except ValueError as e: - logger.warning('exception caught while trying to convert disk info: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert disk info: %s', error) return disk @@ -56,8 +56,8 @@ def get_meminfo(): if mems.upper().find("MEMTOTAL") != -1: try: mem = float(mems.split()[1]) / 1024 # value listed by command as kB, convert to MB - except ValueError as e: - logger.warning('exception caught while trying to convert meminfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert meminfo: %s', error) break mems = fd.readline() @@ -78,8 +78,8 @@ def get_cpuinfo(): if line.find("cpu MHz") != -1: # Python 2/3 try: cpu = float(line.split(":")[1]) - except ValueError as e: - logger.warning('exception caught while trying to convert cpuinfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert cpuinfo: %s', error) break # command info is the same for all cores, so break here return cpu @@ -114,21 +114,21 @@ def get_disk_space(queuedata): # --- non Job related queue data # jobinfo provider is required to consider overwriteAGIS data coming from Job _maxinputsize = infosys.queuedata.maxwdir - logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B", _maxinputsize) _maxinputsize = queuedata.maxwdir - logger.debug("resolved value: queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value: queuedata.maxwdir=%s B", _maxinputsize) try: du = disk_usage(os.path.abspath(".")) _diskspace = int(du[2] / (1024 * 1024)) # need to convert from B to MB - except ValueError as e: - logger.warning("failed to extract disk space: %s (will use schedconfig default)" % e) + except ValueError as error: + logger.warning("failed to extract disk space: %s (will use schedconfig default)", error) _diskspace = _maxinputsize else: - logger.info("available WN disk space: %d MB" % (_diskspace)) + logger.info("available WN disk space: %d MB", _diskspace) _diskspace = min(_diskspace, _maxinputsize) - logger.info("sending disk space %d MB to dispatcher" % (_diskspace)) + logger.info("sending disk space %d MB to dispatcher", _diskspace) return _diskspace @@ -221,10 +221,8 @@ def check_hz(): """ try: - hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + _ = os.sysconf(os.sysconf_names['SC_CLK_TCK']) except Exception: import traceback logger.fatal('failed 
to read SC_CLK_TCK - will not be able to perform CPU consumption calculation') logger.warning(traceback.format_exc()) - else: - logger.debug('SC_CLK_TCK=%s' % str(hz)) diff --git a/pilot/workflow/generic_hpc.py b/pilot/workflow/generic_hpc.py index 8b5675996..caf3309fc 100644 --- a/pilot/workflow/generic_hpc.py +++ b/pilot/workflow/generic_hpc.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 # - Danila Oleynik danila.oleynik@cern.ch, 2018 import functools @@ -48,9 +48,9 @@ def interrupt(args, signum, frame): """ try: - logger.info('caught signal: %s' % [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 + logger.info('caught signal: %s', [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 except Exception: - logger.info('caught signal: %s' % [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 + logger.info('caught signal: %s', [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 args.graceful_stop.set() @@ -212,11 +212,11 @@ def run(args): logger.debug("Final report: {0}".format(work_report)) add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args) - except Exception as e: + except Exception as error: work_report["jobStatus"] = "failed" - work_report["exitMsg"] = str(e) + work_report["exitMsg"] = str(error) publish_work_report(work_report, worker_attributes_file) - logging.exception('exception caught:') + logging.exception('exception caught: %s', error) traces.pilot['state'] = FAILURE return traces
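The helpers added to pilot/util/processes.py (convert_ps_to_dict(), get_trimmed_dictionary(), find_cmd_pids(), find_pid() and is_child()) are building blocks for mapping a PanDA job onto its payload process tree from a single ps snapshot. The sketch below shows roughly how such pieces combine; it is a hedged, Python 3-only stand-in rather than the pilot API: it splits columns on whitespace instead of using the regex in convert_ps_to_dict(), descends_from() mirrors the recursion in is_child(), and the header of the command column ('CMD' vs 'COMMAND') depends on the ps format specifier, so it is taken from the header row instead of being hard-coded.

import subprocess


def ps_to_dict(output):
    """Convert `ps axo pid,ppid,pgid,cmd` output into {'PID': [...], 'PPID': [...], ...}."""
    table = {}
    header = []
    for line in output.splitlines():
        fields = line.split(None, 3)  # the command column may itself contain spaces
        if not fields:
            continue
        if not header:
            header = fields
            table = {column: [] for column in header}
            continue
        if len(fields) < len(header):
            continue  # skip malformed lines rather than producing ragged columns
        for column, value in zip(header, fields):
            table[column].append(int(value) if value.isdigit() else value)
    return table


def descends_from(pid, ancestor_pid, table):
    """Return True if pid is a descendant of ancestor_pid according to the ps table."""
    try:
        index = table['PID'].index(pid)
    except ValueError:
        return False  # pid not present in the snapshot
    ppid = table['PPID'][index]
    return ppid == ancestor_pid or descends_from(ppid, ancestor_pid, table)


if __name__ == '__main__':
    out = subprocess.run(['ps', 'axo', 'pid,ppid,pgid,cmd'],
                         capture_output=True, text=True).stdout
    table = ps_to_dict(out)
    cmd_column = list(table)[-1]  # 'CMD' or 'COMMAND', depending on ps
    pids = [p for p, c in zip(table['PID'], table[cmd_column]) if 'athena.py' in c]
    print('candidate payload pids:', pids)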