From 0d07511f014b9f33e973dff9cca3bfe6400b7f93 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Wed, 12 May 2021 12:25:08 +0200
Subject: [PATCH 01/96] New version 2.11.3.1

---
 PILOTVERSION            | 2 +-
 pilot/util/constants.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 00d0494d..243cbdb7 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-2.11.2.22
\ No newline at end of file
+2.11.3.1
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index a9054620..a40ec8fd 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -13,8 +13,8 @@
 # Pilot version
 RELEASE = '2'  # released number should be fixed at 2 for Pilot 2
 VERSION = '11'  # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates
-REVISION = '2'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '22'  # build number should be reset to '1' for every new development cycle
+REVISION = '3'  # revision number should be reset to '0' for every new version release, increased for small updates
+BUILD = '1'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1

From 830ab0aeb422282b083574ad585cda7ba75a5212 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 10:52:21 +0200
Subject: [PATCH 02/96] Added diagnostics to failed remote file open verification

---
 pilot/user/atlas/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index 13d0546e..a8657296 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -318,7 +318,7 @@ def get_payload_command(job):
 
         # fail the job if the remote files could not be verified
         if ec != 0:
-            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec)
+            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=diagnostics)
             raise PilotException(diagnostics, code=ec)
     else:
         logger.debug('no remote file open verification')

From 2f8660723e548ad4330a6976c5c9b96488ada09f Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:55:28 +0200
Subject: [PATCH 03/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index efb144bd..d99a7e85 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -41,7 +41,7 @@ testtransfertype: NULL
 pandaserver: https://pandaserver.cern.ch:25443
 # pandaserver: https://aipanda007.cern.ch:25443
 
-# The URL for the iDDS server (update actual URL later)
+# The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
 # The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in ddebug mode)

From ef73e5bbbcb5f8ddf8a6b29b0be9a7c3fc58a065 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:56:19 +0200
Subject: [PATCH 04/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index d99a7e85..3a50a0d0 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -44,7 +44,7 @@ pandaserver: https://pandaserver.cern.ch:25443
 # The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
-# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in ddebug mode)
+# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in debug mode)
 heartbeat: 1800
 debug_heartbeat: 300

From 0b928df490df09477d58ed2089242302f6e6d7a4 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 16:59:09 +0200
Subject: [PATCH 05/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index 3a50a0d0..1ab5b762 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -44,7 +44,7 @@ pandaserver: https://pandaserver.cern.ch:25443
 # The URL for the iDDS server
 iddsserver: https://pandaserver.cern.ch:25443
 
-# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5 * 60=300 s in debug mode)
+# The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode)
 heartbeat: 1800
 debug_heartbeat: 300

From 521374269d68e1be3b4a446dc8525a1775d509bd Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 13 May 2021 19:35:04 +0200
Subject: [PATCH 06/96] Updated log message

---
 pilot/control/job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/control/job.py b/pilot/control/job.py
index 62ab7238..25396851 100644
--- a/pilot/control/job.py
+++ b/pilot/control/job.py
@@ -882,7 +882,7 @@ def validate(queues, traces, args):
         # run the delayed space check now
         proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False
         if proceed_with_local_space_check:
-            logger.debug('pilot will not perform delayed space check')
+            logger.debug('pilot will now perform delayed space check')
             ec, diagnostics = check_local_space()
             if ec != 0:
                 traces.pilot['error_code'] = errors.NOLOCALSPACE

From d67cab8ede3b27b3904136c0b35da1feaa422b35 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 17 May 2021 10:51:48 +0200
Subject: [PATCH 07/96] Updated comment

---
 pilot/util/default.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg
index 1ab5b762..f8de374e 100644
--- a/pilot/util/default.cfg
+++ b/pilot/util/default.cfg
@@ -69,7 +69,7 @@ maximum_input_file_sizes: 14336 MB
 # Size limit of payload stdout size during running. unit is in kB (value = 2 * 1024 ** 2)
 local_size_limit_stdout: 2097152
 
-# Looping job time limits; if job does not write anything in N hours, it is considered a looping job
+# Looping job time limits; if job does not write anything in N minutes, it is considered to be a looping
 looping_verification_time: 900
 # for both production and user analysis jobs, 2*3600
 looping_limit_default: 7200

From f3bbf969bed436274b07e4af0caca673f605ffc9 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 17 May 2021 11:47:21 +0200
Subject: [PATCH 08/96] Now adding resimevents to job metrics also when it is zero

---
 PILOTVERSION                   | 2 +-
 pilot/user/atlas/common.py     | 4 ++--
 pilot/user/atlas/jobmetrics.py | 2 +-
 pilot/util/constants.py        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 243cbdb7..88b6eae4 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-2.11.3.1
\ No newline at end of file
+2.11.3.2
\ No newline at end of file
diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index a8657296..6a7d448c 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -1408,10 +1408,10 @@ def get_resimevents(jobreport_dictionary):
     This information is reported with the jobMetrics.
 
     :param jobreport_dictionary: job report dictionary.
-    :return: resimevents (int)
+    :return: resimevents (int or None)
     """
 
-    resimevents = 0
+    resimevents = None
     executor_dictionary = get_executor_dictionary(jobreport_dictionary)
     if executor_dictionary != {}:
diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py
index 503542e9..20b5d31a 100644
--- a/pilot/user/atlas/jobmetrics.py
+++ b/pilot/user/atlas/jobmetrics.py
@@ -54,7 +54,7 @@ def get_job_metrics_string(job):
         job_metrics += get_job_metrics_entry("dbTime", job.dbtime)
     if job.dbdata and job.dbdata != "":
         job_metrics += get_job_metrics_entry("dbData", job.dbdata)
-    if job.resimevents:
+    if job.resimevents is not None:
         job_metrics += get_job_metrics_entry("resimevents", job.resimevents)
 
     # get the max disk space used by the payload (at the end of a job)
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index a40ec8fd..161089a2 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -14,7 +14,7 @@
 RELEASE = '2'  # released number should be fixed at 2 for Pilot 2
 VERSION = '11'  # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates
 REVISION = '3'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '1'  # build number should be reset to '1' for every new development cycle
+BUILD = '2'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1

From 9787898c020c9ecf75f1b3289975c3cde2e801ae Mon Sep 17 00:00:00 2001
From: Shuwei Ye
Date: Tue, 18 May 2021 14:58:26 -0400
Subject: [PATCH 09/96] Changed gs.py for GCS buckets with automatic bucket name extraction

---
 pilot/copytool/gs.py | 68 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py
index af9cef30..fb5ed595 100644
--- a/pilot/copytool/gs.py
+++ b/pilot/copytool/gs.py
@@ -23,6 +23,7 @@ from .common import resolve_common_transfer_errors
 from pilot.common.errorcodes import ErrorCodes
 from pilot.common.exception import PilotException
 from pilot.util.ruciopath import get_rucio_path
+from pilot.util.config import config
 
 logger = logging.getLogger(__name__)
 errors = ErrorCodes()
@@ -52,17 +53,32 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs):
     :param fspec: file spec data
     :return: dictionary {'surl': surl}
     """
+
     ddm = ddmconf.get(fspec.ddmendpoint)
     if not ddm:
         raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint)
 
-    if ddm.is_deterministic:
-        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn))
-    elif ddm.type in ['OS_ES', 'OS_LOGS']:
-        surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn)
-        fspec.protocol_id = protocol.get('id')
+    dataset = fspec.dataset
+    if dataset:
+        dataset = dataset.replace("#{pandaid}",os.environ['PANDAID'])
     else:
-        raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED', fspec.ddmendpoint)
+        dataset = ""
+
+    remotePath = os.path.join(protocol.get('path', ''), dataset)
+
+    # pilot ID is passed by the envvar GTAG
+    # try:
+    #     rprotocols = ddm.rprotocols
+    #     logger.debug('ddm.rprotocols=%s' % rprotocols)
+    #     if "http_access" in rprotocols:
+    #         http_access = rprotocols["http_access"]
+    #         os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog)
+    #         logger.debug('http_access=%s' % http_access)
+    # except Exception as e:
+    #     logger.warning("Failed in get 'http_access' in ddm.rprotocols")
+ + surl = protocol.get('endpoint', '') + remotePath + logger.info('For GCS bucket, set surl=%s' % surl) # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -72,7 +88,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): def copy_in(files, **kwargs): """ - Download given files from an S3 bucket. + Download given files from a GCS bucket. :param files: list of `FileSpec` objects :raise: PilotException in case of controlled error @@ -103,7 +119,7 @@ def download_file(path, surl, object_name=None): :param path: Path to local file after download (string). :param surl: remote path (string). - :param object_name: S3 object name. If not specified then file_name from path is used. + :param object_name: GCS object name. If not specified then file_name from path is used. :return: True if file was uploaded (else False), diagnostics (string). """ @@ -136,11 +152,22 @@ def copy_out(files, **kwargs): for fspec in files: - path = os.path.join(workdir, fspec.lfn) + logger.info('Going to process fspec.turl=%s' % fspec.turl) + import re + # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) + reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remotePath) = reObj.groups() + + + # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: + for logFile in os.listdir(workdir): + if logFile.endswith("gz"): + continue + path = os.path.join(workdir, logFile) if os.path.exists(path): - bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, fspec.lfn)) - status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn) + objectName = os.path.join(remotePath, logFile) + logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, objectName)) + status, diagnostics = upload_file(path, bucket, object_name=objectName) if not status: ## an error occurred # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() @@ -163,25 +190,32 @@ def copy_out(files, **kwargs): def upload_file(file_name, bucket, object_name=None): """ - Upload a file to an S3 bucket. + Upload a file to a GCS bucket. :param file_name: File to upload. :param bucket: Bucket to upload to (string). - :param object_name: S3 object name. If not specified then file_name is used. + :param object_name: GCS object name. If not specified then file_name is used. :return: True if file was uploaded (else False), diagnostics (string). 
""" - # if S3 object_name was not specified, use file_name + # if GCS object_name was not specified, use file_name if object_name is None: object_name = file_name + # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) + # logger.debug('http_access=%s' % http_access) + # upload the file try: client = storage.Client() gs_bucket = client.get_bucket(bucket) - remote_path = file_name # update me - blob = gs_bucket.blob(remote_path) + logger.info('uploading a file to bucket=%s in full path=%s' % (bucket, object_name)) + blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) + if file_name.endswith(config.Pilot.pilotlog): + url_pilotLog = blob.public_url + os.environ['GTAG'] = url_pilotLog + logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From c54abb2f9a4ecb732e7a708e0dfbf77f4c280765 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 11:27:20 +0200 Subject: [PATCH 10/96] Removed default protocol value from trace - previously set to copy tool name (this is probably irrelevant since the proper value is set later when it is known) --- PILOTVERSION | 2 +- pilot/api/data.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 88b6eae4..6ee41598 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.2 \ No newline at end of file +2.11.3.3 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index ea1ba48f..bf5d73be 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -491,7 +491,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 module = self.copytool_modules[name]['module_name'] self.logger.info('trying to use copytool=%s for activity=%s' % (name, activity)) copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) # Python 2/3 - self.trace_report.update(protocol=name) + #self.trace_report.update(protocol=name) except PilotException as e: caught_errors.append(e) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 161089a2..06625198 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '3' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8273d4719852afd52712c9a8c47ae990efe9167f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 15:18:23 +0200 Subject: [PATCH 11/96] Added new pilot option for turning on/off rucio traces --- pilot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pilot.py b/pilot.py index 1b8ed093..9bcff83a 100755 --- a/pilot.py +++ b/pilot.py @@ -379,6 +379,11 @@ def get_args(): dest='jobtype', default='', help='Job type (managed, user)') + arg_parser.add_argument('--use-rucio-traces', + dest='use_rucio_traces', + type=str2bool, + default=True, + help='Use rucio traces') # HPC options arg_parser.add_argument('--hpc-resource', @@ -467,6 +472,9 @@ def set_environment_variables(args, mainworkdir): # set the (HPC) 
resource name (if set in options) environ['PILOT_RESOURCE_NAME'] = args.hpc_resource + # allow for the possibility of turning off rucio traces + environ['PILOT_USE_RUCIO_TRACES'] = str(args.use_rucio_traces) + # event service executor type environ['PILOT_ES_EXECUTOR_TYPE'] = args.executor_type From 77589e807a2fb65e805eb5380f416b0214c057f0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 May 2021 15:59:14 +0200 Subject: [PATCH 12/96] Pilot now only sends rucio traces if required --- pilot/util/tracereport.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py index b00e7067..9f2f8c09 100644 --- a/pilot/util/tracereport.py +++ b/pilot/util/tracereport.py @@ -133,6 +133,11 @@ def send(self): :return: Boolean. """ + # only send trace if it is actually required (can be turned off with pilot option) + if environ.get('PILOT_USE_RUCIO_TRACES', 'True') == 'False': + logger.debug('rucio trace does not need to be sent') + return True + url = config.Rucio.url logger.info("tracing server: %s" % url) logger.info("sending tracing report: %s" % str(self)) From 396c66d6b6d56e3b8049d815690c01e202514f0e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 21 May 2021 17:13:47 +0200 Subject: [PATCH 13/96] Removed -p all option for xcache kill. Added debug info for xcache message log. Added preliminary functions for advanced debug mode. --- PILOTVERSION | 2 +- pilot/control/data.py | 3 ++ pilot/control/job.py | 70 +++++++++++++++++++++++++------------- pilot/info/jobdata.py | 5 +-- pilot/user/atlas/common.py | 10 ++++-- pilot/util/constants.py | 2 +- 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6ee41598..39d30056 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.3 \ No newline at end of file +2.11.3.7 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index be4fc8a2..d0c710ec 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -445,6 +445,9 @@ def set_xcache_log(line): result = re.findall(pattern, line) if result: os.environ['ALRB_XCACHE_LOG'] = result[0] + logger.debug('extracted xcache log path: ALRB_XCACHE_LOG=\'%s\'' % result[0]) + else: + logger.warning('failed to extract log path for ALRB_XCACHE_LOG from line: \'%s\'' % line) def copytool_in(queues, traces, args): diff --git a/pilot/control/job.py b/pilot/control/job.py index 25396851..95221dd5 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -477,6 +477,37 @@ def get_panda_server(url, port): return pandaserver +def get_debug_command(cmd): + """ + Identify and filter the given debug command. + + Note: only a single command will be allowed from a predefined list: tail, ls, gdb, ps, du. + + :param cmd: raw debug command from job definition (string). + :return: debug_mode (Boolean, True if command is deemed ok), debug_command (string). 
+ """ + + debug_mode = False + debug_command = "" + + allowed_commands = ['tail', 'ls', 'ps', 'gdb', 'du'] + forbidden_commands = ['rm'] + try: + tmp = cmd.split(' ') + com = tmp[0] + opts = tmp[1] + except Exception as e: + logger.warning('failed to identify debug command: %s' % e) + else: + if com not in allowed_commands: + logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) + elif ';' in opts or ';' in opts: + logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) + elif com in forbidden_commands: + logger.warning('command=%s is not allowed' % com) + return debug_mode, debug_command + + def handle_backchannel_command(res, job, args, test_tobekilled=False): """ Does the server update contain any backchannel information? if so, update the job object. @@ -493,9 +524,15 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): res['command'] = 'tobekilled' if 'command' in res and res.get('command') != 'NULL': - # look for 'tobekilled', 'softkill', 'debug', 'debugoff' # warning: server might return comma-separated string, 'debug,tobekilled' - if 'tobekilled' in res.get('command'): + cmd = res.get('command') + # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. + if ' ' in cmd: + try: + job.debug, job.debug_command = get_debug_command(cmd) + except Exception as e: + logger.debug('exception caught in get_debug_command(): %s' % e) + elif 'tobekilled' in cmd: logger.info('pilot received a panda server signal to kill job %s at %s' % (job.jobid, time_stamp())) set_pilot_state(job=job, state="failed") @@ -506,18 +543,18 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): else: logger.debug('no pid to kill') args.abort_job.set() - elif 'softkill' in res.get('command'): + elif 'softkill' in cmd: logger.info('pilot received a panda server signal to softkill job %s at %s' % (job.jobid, time_stamp())) # event service kill instruction - elif 'debug' in res.get('command'): - logger.info('pilot received a command to turn on debug mode from the server') + elif 'debug' in cmd: + logger.info('pilot received a command to turn on standard debug mode from the server') job.debug = True - elif 'debugoff' in res.get('command'): + elif 'debugoff' in cmd: logger.info('pilot received a command to turn off debug mode from the server') job.debug = False else: - logger.warning('received unknown server command via backchannel: %s' % res.get('command')) + logger.warning('received unknown server command via backchannel: %s' % cmd) def add_data_structure_ids(data, version_tag): @@ -723,7 +760,6 @@ def add_memory_info(data, workdir, name=""): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: - #for key in job.utilities utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) except Exception as e: @@ -731,20 +767,6 @@ def add_memory_info(data, workdir, name=""): pass -#def get_list_of_log_files(): -# """ -# Return a list of log files produced by the payload. -# -# :return: list of log files. -# """ -# -# list_of_files = get_files() -# if not list_of_files: # some TRFs produce logs with different naming scheme -# list_of_files = get_files(pattern="log.*") -# -# return list_of_files - - def remove_pilot_logs_from_list(list_of_files): """ Remove any pilot logs from the list of last updated files. 
@@ -753,6 +775,8 @@ def remove_pilot_logs_from_list(list_of_files): :return: list of files (list). """ + # note: better to move experiment specific files to user area + # ignore the pilot log files try: to_be_removed = [config.Pilot.pilotlog, config.Pilot.stageinlog, config.Pilot.stageoutlog, @@ -760,7 +784,7 @@ def remove_pilot_logs_from_list(list_of_files): config.Pilot.remotefileverification_log, config.Pilot.base_trace_report, config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, - 'eventLoopHeartBeat.txt'] + 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot'] except Exception as e: logger.warning('exception caught: %s' % e) to_be_removed = [] diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index fd959d61..f3ec0cb7 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -118,7 +118,8 @@ class JobData(BaseData): attemptnr = 0 # job attempt number destinationdblock = "" ## to be moved to FileSpec (job.outdata) datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) - debug = False # + debug = False # debug mode, when True, pilot will send debug info back to the server + debug_command = 'tail' # debug command (can be defined on the task side) produserid = "" # the user DN (added to trace report) jobdefinitionid = "" # the job definition id (added to trace report) infilesguids = "" # @@ -610,7 +611,7 @@ def clean__jobparams(self, raw, value): :return: updated job parameters (string). """ - #value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' + # value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' logger.info('cleaning jobparams: %s' % value) # user specific pre-filtering diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 6a7d448c..2a1a4502 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1897,7 +1897,7 @@ def get_utility_commands(order=None, job=None): elif order == UTILITY_AFTER_PAYLOAD_FINISHED: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) - elif 'pilotXcache' in job.infosys.queuedata.catchall: + if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_deactivation_command(job.workdir) elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: @@ -1944,11 +1944,15 @@ def xcache_deactivation_command(workdir): copy(path, dest) except Exception as e: logger.warning('exception caught copying xcache log: %s' % e) - + else: + if not path: + logger.warning('ALRB_XCACHE_LOG is not set') + if path and not os.path.exists(path): + logger.warning('path does not exist: %s' % path) command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 - return {'command': command, 'args': '-p all'} + return {'command': command, 'args': ''} def get_utility_command_setup(name, job, setup=None): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 06625198..9a459253 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' 
for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From fca62f243a33a75a93f1fd8492810a6ec03c0f79 Mon Sep 17 00:00:00 2001 From: Shuwei Ye Date: Fri, 21 May 2021 13:55:36 -0400 Subject: [PATCH 14/96] Adjusted gs.py text format to pass the flake8 check --- pilot/copytool/gs.py | 81 +++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index fb5ed595..03be7e77 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -22,7 +22,6 @@ from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException -from pilot.util.ruciopath import get_rucio_path from pilot.util.config import config logger = logging.getLogger(__name__) @@ -60,11 +59,11 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): dataset = fspec.dataset if dataset: - dataset = dataset.replace("#{pandaid}",os.environ['PANDAID']) + dataset = dataset.replace("#{pandaid}", os.environ['PANDAID']) else: - dataset = "" + dataset = "" - remotePath = os.path.join(protocol.get('path', ''), dataset) + remote_path = os.path.join(protocol.get('path', ''), dataset) # pilot ID is passed by the envvar GTAG # try: @@ -72,12 +71,12 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # logger.debug('ddm.rprotocols=%s' % rprotocols) # if "http_access" in rprotocols: # http_access = rprotocols["http_access"] - # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) + # os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog) # logger.debug('http_access=%s' % http_access) # except Exception as e: # logger.warning("Failed in get 'http_access' in ddm.rprotocols") - surl = protocol.get('endpoint', '') + remotePath + surl = protocol.get('endpoint', '') + remote_path logger.info('For GCS bucket, set surl=%s' % surl) # example: @@ -151,39 +150,38 @@ def copy_out(files, **kwargs): workdir = kwargs.pop('workdir') for fspec in files: - - logger.info('Going to process fspec.turl=%s' % fspec.turl) - import re - # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) - reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) - (bucket, remotePath) = reObj.groups() - - - # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: - for logFile in os.listdir(workdir): - if logFile.endswith("gz"): - continue - path = os.path.join(workdir, logFile) - if os.path.exists(path): - objectName = os.path.join(remotePath, logFile) - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, objectName)) - status, diagnostics = upload_file(path, bucket, object_name=objectName) - - if not status: ## an error occurred - # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() - error = resolve_common_transfer_errors(diagnostics, is_stagein=False) + logger.info('Going to process fspec.turl=%s' % fspec.turl) + + import re + # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) + reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remote_path) = reObj.groups() + + # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: + for logfile in os.listdir(workdir): + if logfile.endswith("gz"): + continue + path = os.path.join(workdir, logfile) + if os.path.exists(path): + object_name = os.path.join(remote_path, logfile) + logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, 
object_name)) + status, diagnostics = upload_file(path, bucket, object_name=object_name) + + if not status: ## an error occurred + # create new error code(s) in ErrorCodes.py and set it/them in resolve_common_transfer_errors() + error = resolve_common_transfer_errors(diagnostics, is_stagein=False) + fspec.status = 'failed' + fspec.status_code = error.get('rcode') + raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) + else: + diagnostics = 'local output file does not exist: %s' % path + logger.warning(diagnostics) fspec.status = 'failed' - fspec.status_code = error.get('rcode') - raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) - else: - diagnostics = 'local output file does not exist: %s' % path - logger.warning(diagnostics) - fspec.status = 'failed' - fspec.status_code = errors.STAGEOUTFAILED - raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status) + fspec.status_code = errors.STAGEOUTFAILED + raise PilotException(diagnostics, code=fspec.status_code, state=fspec.status) - fspec.status = 'transferred' - fspec.status_code = 0 + fspec.status = 'transferred' + fspec.status_code = 0 return files @@ -202,9 +200,6 @@ def upload_file(file_name, bucket, object_name=None): if object_name is None: object_name = file_name - # os.environ['GTAG'] = http_access + os.path.join(remotePath, config.Pilot.pilotlog) - # logger.debug('http_access=%s' % http_access) - # upload the file try: client = storage.Client() @@ -213,9 +208,9 @@ def upload_file(file_name, bucket, object_name=None): blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): - url_pilotLog = blob.public_url - os.environ['GTAG'] = url_pilotLog - logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) + url_pilotLog = blob.public_url + os.environ['GTAG'] = url_pilotLog + logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From 37b26eb604ee634a5fbcbfaf7d553880eede97cd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 09:39:21 +0200 Subject: [PATCH 15/96] Corrected xcache kill. Refactored env var support functions. Initial commit for dask class --- PILOTVERSION | 2 +- pilot/api/dask.py | 40 +++++++++++++++++++++++++++++++++++++ pilot/control/data.py | 41 +++++++++++++++++++------------------- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- 5 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 pilot/api/dask.py diff --git a/PILOTVERSION b/PILOTVERSION index 39d30056..bd49a880 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.7 \ No newline at end of file +2.11.3.11 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py new file mode 100644 index 00000000..2fe123a9 --- /dev/null +++ b/pilot/api/dask.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 + +#from pilot.common.exception import NotDefined, NotSameLength, UnknownException +#from pilot.util.filehandling import get_table_from_file +#from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string + +import logging +logger = logging.getLogger(__name__) + + +class Dask(object): + """ + Dask interface class. + """ + + status = None + loadbalancerip = None + + def __init__(self, **kwargs): + """ + Init function. + + :param kwargs: + """ + + pass + + def install(self): + """ + + """ + + pass diff --git a/pilot/control/data.py b/pilot/control/data.py index d0c710ec..314caa52 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -416,38 +416,37 @@ def stage_out_auto(site, files): def xcache_proxy(output): + """ + + """ for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: - set_xcache_proxy(line, remote='REMOTE' in line) - if 'Messages logged in' in line: - set_xcache_log(line) - + remote = 'REMOTE' in line + name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' + pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: + set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + elif 'Messages logged in' in line: + set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') -def set_xcache_proxy(line, remote=None): - - import re - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - if remote: - os.environ['ALRB_XCACHE_PROXY_REMOTE'] = result[0] - else: - os.environ['ALRB_XCACHE_PROXY'] = result[0] +def set_xcache_var(line, name='', pattern=''): + """ + Extract the value of a given environmental variable from a given stdout line. -def set_xcache_log(line): + :param line: line from stdout to be investigated (string). + :param name: name of env var (string). + :param pattern: regex pattern (string). 
+ :return: + """ import re - pattern = r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' pattern = re.compile(pattern) result = re.findall(pattern, line) if result: - os.environ['ALRB_XCACHE_LOG'] = result[0] - logger.debug('extracted xcache log path: ALRB_XCACHE_LOG=\'%s\'' % result[0]) - else: - logger.warning('failed to extract log path for ALRB_XCACHE_LOG from line: \'%s\'' % line) + os.environ[name] = result[0] def copytool_in(queues, traces, args): diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 2a1a4502..1c156b71 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1952,7 +1952,7 @@ def xcache_deactivation_command(workdir): command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 - return {'command': command, 'args': ''} + return {'command': command, 'args': '-p $ALRB_XCACHE_MYPROCESS'} def get_utility_command_setup(name, job, setup=None): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9a459253..e64ab6e2 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '7' # build number should be reset to '1' for every new development cycle +BUILD = '11' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d95d55923da39e02f5b81ca8b73cf361354fc531 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 09:44:40 +0200 Subject: [PATCH 16/96] Added function comments --- pilot/control/data.py | 4 ++++ pilot/util/constants.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 314caa52..53f671c7 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -417,9 +417,13 @@ def stage_out_auto(site, files): def xcache_proxy(output): """ + Extract env vars from xcache stdout and set them. + :param output: command output (string). + :return: """ + # loop over each line in the xcache stdout and identify the needed environmental variables for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: remote = 'REMOTE' in line diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e64ab6e2..6837fa95 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '11' # build number should be reset to '1' for every new development cycle +BUILD = '12' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 3f1df918096d059f1a0a322c54d5f7b1da4a77b1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:08:33 +0200 Subject: [PATCH 17/96] Moved xcache code to atlas area. Corrected xcache output handling. 
Lots of debugging info for xcache [to be removed again] --- PILOTVERSION | 2 +- pilot/control/data.py | 72 ++++++++++++++++-------------- pilot/control/payloads/generic.py | 41 ++++++++++++++--- pilot/user/atlas/common.py | 73 ++++++++++++++++++++++++++++++- pilot/user/generic/common.py | 15 +++++++ pilot/util/constants.py | 2 +- 6 files changed, 162 insertions(+), 43 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bd49a880..c51eef03 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.11 \ No newline at end of file +2.11.3.13 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index 53f671c7..f8e80c94 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -32,7 +32,7 @@ from pilot.util.constants import PILOT_PRE_STAGEIN, PILOT_POST_STAGEIN, PILOT_PRE_STAGEOUT, PILOT_POST_STAGEOUT, LOG_TRANSFER_IN_PROGRESS,\ LOG_TRANSFER_DONE, LOG_TRANSFER_NOT_DONE, LOG_TRANSFER_FAILED, SERVER_UPDATE_RUNNING, MAX_KILL_WAIT_TIME, UTILITY_BEFORE_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove +from pilot.util.filehandling import remove, write_file from pilot.util.processes import threads_aborted from pilot.util.queuehandling import declare_failed_by_kill, put_in_queue from pilot.util.timing import add_to_pilot_timing @@ -415,42 +415,39 @@ def stage_out_auto(site, files): return files -def xcache_proxy(output): +def write_output(filename, output): """ - Extract env vars from xcache stdout and set them. + Write command output to file. - :param output: command output (string). + :param filename: file name (string). + :param output: command stdout/stderr (string). :return: """ - # loop over each line in the xcache stdout and identify the needed environmental variables - for line in output.split('\n'): - if 'ALRB_XCACHE_PROXY' in line: - remote = 'REMOTE' in line - name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' - set_xcache_var(line, name=name, pattern=pattern) - elif 'ALRB_XCACHE_MYPROCESS' in line: - set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') - elif 'Messages logged in' in line: - set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + try: + write_file(filename, output, unique=True) + except PilotException as e: + logger.warning('failed to write utility output to file: %s, %s' % (e, output)) + else: + logger.debug('wrote %s' % filename) -def set_xcache_var(line, name='', pattern=''): +def write_utility_output(workdir, step, stdout, stderr): """ - Extract the value of a given environmental variable from a given stdout line. - - :param line: line from stdout to be investigated (string). - :param name: name of env var (string). - :param pattern: regex pattern (string). + Write the utility command output to stdout, stderr files to the job.workdir for the current step. + -> _stdout.txt, _stderr.txt + Example of step: xcache. + + :param workdir: job workdir (string). + :param step: utility step (string). + :param stdout: command stdout (string). + :param stderr: command stderr (string). 
:return: """ - import re - pattern = re.compile(pattern) - result = re.findall(pattern, line) - if result: - os.environ[name] = result[0] + # dump to files + write_output(os.path.join(workdir, step + '_stdout.txt'), stdout) + write_output(os.path.join(workdir, step + '_stderr.txt'), stderr) def copytool_in(queues, traces, args): @@ -480,15 +477,26 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache start] stdout=%s' % stdout) + logger.debug('[before xcache start] stderr=%s' % stderr) + exit_code, stdout, stderr = execute(cmd.get('command')) - logger.debug('exit_code=%d' % exit_code) - logger.debug('stderr=%s' % stderr) logger.debug('stdout=%s' % stdout) - # move code to user area - xcache_proxy(stdout) + logger.debug('stderr=%s' % stderr) + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache start] stdout=%s' % stdout) + logger.debug('[after xcache start] stderr=%s' % stderr) + + # perform any action necessary after command execution (e.g. stdout processing) + kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} + user.post_prestagein_utility_command(**kwargs) - logger.debug('ALRB_XCACHE_PROXY=%s' % os.environ.get('ALRB_XCACHE_PROXY', '')) - logger.debug('ALRB_XCACHE_PROXY_REMOTE=%s' % os.environ.get('ALRB_XCACHE_PROXY_REMOTE', '')) + # write output to log files + write_utility_output(job.workdir, cmd.get('label', 'utility'), stdout, stderr) # place it in the current stage-in queue (used by the jobs' queue monitoring) put_in_queue(job, queues.current_data_in) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 1d699a0f..3c674fd6 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -92,7 +92,7 @@ def utility_before_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed before the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed before the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -114,7 +114,7 @@ def utility_with_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed with the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed with the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -138,7 +138,7 @@ def get_utility_command(self, order=None): cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -231,7 +231,7 @@ def 
utility_after_payload_finished(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_FINISHED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.debug('utility command to be executed after the payload has finished: %s' % cmd) + logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) return cmd @@ -285,13 +285,20 @@ def write_utility_output(self, workdir, step, stdout, stderr): elif step == 'postprocess': self.__postprocess_stdout_name = name_stdout self.__postprocess_stderr_name = name_stderr - write_file(os.path.join(workdir, step + '_stdout.txt'), stdout, unique=True) + name = os.path.join(workdir, step + '_stdout.txt') + write_file(name, stdout, unique=True) except PilotException as e: logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) + else: + logger.debug('wrote %s' % name) + try: - write_file(os.path.join(workdir, step + '_stderr.txt'), stderr, unique=True) + name = os.path.join(workdir, step + '_stderr.txt') + write_file(name, stderr, unique=True) except PilotException as e: logger.warning('failed to write utility stderr to file: %s, %s' % (e, stderr)) + else: + logger.debug('wrote %s' % name) def pre_payload(self, job): """ @@ -581,6 +588,11 @@ def run(self): # noqa: C901 # now run the main payload, when it finishes, run the postprocess (if necessary) # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': + + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before payload start] stdout=%s' % stdout) + logger.debug('[before payload start] stderr=%s' % stderr) + proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: proc = None @@ -627,6 +639,10 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after payload finish] stdout=%s' % stdout) + logger.debug('[after payload finish] stderr=%s' % stderr) + # stop the utility command (e.g. 
a coprocess if necessary if proc_co: logger.debug('stopping utility command: %s' % utility_cmd) @@ -673,7 +689,18 @@ def run_utility_after_payload_finished(self): exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') elif cmd_after_payload: logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache') + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % stdout) + logger.debug('[before xcache kill] stderr=%s' % stderr) + + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') + + # xcache debug + exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % stdout) + logger.debug('[after xcache kill] stderr=%s' % stderr) return exit_code diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1c156b71..ee0ecd90 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1882,30 +1882,99 @@ def get_utility_commands(order=None, job=None): if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: if job.preprocess.get('command', ''): com = download_command(job.preprocess, job.workdir) + com['label'] = 'preprocess' elif order == UTILITY_WITH_PAYLOAD: - com = {'command': 'NetworkMonitor', 'args': ''} + com = {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} elif order == UTILITY_AFTER_PAYLOAD_STARTED: cmd = config.Pilot.utility_after_payload_started if cmd: - com = {'command': cmd, 'args': ''} + com = {'command': cmd, 'args': '', 'label': cmd.lower()} elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: if job.coprocess.get('command', ''): com = download_command(job.coprocess, job.workdir) + com['label'] = 'coprocess' elif order == UTILITY_AFTER_PAYLOAD and job.postprocess: if job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) + com['label'] = 'postprocess' elif order == UTILITY_AFTER_PAYLOAD_FINISHED: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) + com['label'] = 'postprocess' if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_deactivation_command(job.workdir) + com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_activation_command(job.jobid) + com['label'] = 'xcache' return com +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + label = kwargs.get('label', 'unknown_label') + stdout = kwargs.get('output', None) + + if stdout: + logger.debug('processing stdout for label=%s' % label) + xcache_proxy(stdout) + else: + logger.warning('no output for label=%s' % label) + + alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') + if alrb_xcache_files: + cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' + exit_code, _stdout, _stderr = execute(cmd, usecontainer=False) + logger.debug('cmd=%s:\n\n%s\n\n' % _stdout) + + +def xcache_proxy(output): + """ + Extract env vars from xcache stdout and set them. + + :param output: command output (string). 
+ :return: + """ + + # loop over each line in the xcache stdout and identify the needed environmental variables + for line in output.split('\n'): + if 'ALRB_XCACHE_PROXY' in line: + remote = 'REMOTE' in line + name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' + pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: + set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + elif 'Messages logged in' in line: + set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + elif 'ALRB_XCACHE_FILES' in line: + set_xcache_var(line, name='ALRB_XCACHE_FILES', pattern=r'\ ALRB_XCACHE_FILES\=(.+)') + + +def set_xcache_var(line, name='', pattern=''): + """ + Extract the value of a given environmental variable from a given stdout line. + + :param line: line from stdout to be investigated (string). + :param name: name of env var (string). + :param pattern: regex pattern (string). + :return: + """ + + pattern = re.compile(pattern) + result = re.findall(pattern, line) + if result: + os.environ[name] = result[0] + + def xcache_activation_command(jobid): """ Return the xcache service activation command. diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 7df04fc8..b21442e1 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -256,3 +256,18 @@ def update_server(job): """ pass + + +def post_prestagein_utility_command(**kwargs): + """ + Execute any post pre-stage-in utility commands. + + :param kwargs: kwargs (dictionary). + :return: + """ + + # label = kwargs.get('label', 'unknown_label') + # stdout = kwargs.get('output', None) + + pass + diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6837fa95..c05c9593 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '12' # build number should be reset to '1' for every new development cycle +BUILD = '13' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 515f623dd7dc44201cb8ed2a8f3326c31008f2be Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:49:39 +0200 Subject: [PATCH 18/96] Dask validation --- pilot/api/dask.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2fe123a9..e611a645 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -10,6 +10,7 @@ #from pilot.common.exception import NotDefined, NotSameLength, UnknownException #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string +from pilot.util.container import execute import logging logger = logging.getLogger(__name__) @@ -32,9 +33,46 @@ def __init__(self, **kwargs): pass - def install(self): + def install(self, block=True): """ """ - pass + # can dask be installed? 
+ if not self._validate(): + logger.warning('validation failed') + self.status = 'failed' + else: + logger.info('dask has been validated') + self.status = 'validated' + + def _validate(self): + """ + Make sure that pre-conditions are met before any installation can be attempted. + + Pre-conditions: required libraries and commands + 1. library: dask + 2. library: dask_kubernetes + 3. command: helm + 4. command: kubectl + """ + + try: + import dask + import dask_kubernetes + except Exception as error: + logger.warning('module not available: %s' % error) + return False + + commands = ['helm', 'kubectl'] + found = False + for cmd in commands: + exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) + found = True if not 'not found' in stdout else False + if not found in stdout: + logger.warning(stdout) + break + if not found: + return False + + return True From d6e8c26966ffb8039aef27790d2d91a053209b60 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 12:58:57 +0200 Subject: [PATCH 19/96] Removed args object from establish_logging() --- pilot.py | 2 +- pilot/control/job.py | 2 +- pilot/scripts/open_remote_file.py | 2 +- pilot/scripts/stagein.py | 2 +- pilot/scripts/stageout.py | 2 +- pilot/util/filehandling.py | 9 +++++---- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pilot.py b/pilot.py index 9bcff83a..ac40b52b 100755 --- a/pilot.py +++ b/pilot.py @@ -587,7 +587,7 @@ def get_pilot_source_dir(): set_environment_variables(args, mainworkdir) # setup and establish standard logging - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) # execute main function trace = main() diff --git a/pilot/control/job.py b/pilot/control/job.py index 95221dd5..d5d6b41d 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1676,7 +1676,7 @@ def retrieve(queues, traces, args): # noqa: C901 logging.info('pilot has finished for previous job - re-establishing logging') logging.handlers = [] logging.shutdown() - establish_logging(args) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) pilot_version_banner() getjob_requests = 0 add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 96ab6805..92dec03b 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -99,7 +99,7 @@ def try_open_file(turl): print("remote file open verification not desired") exit(0) - establish_logging(args, filename=logname) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=logname) logger = logging.getLogger(__name__) # get the file info diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index 851ca307..00ea77b7 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -356,7 +356,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageinlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=config.Pilot.stageinlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index f60219d1..2872801b 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -289,7 +289,7 @@ def extract_error_info(err): args.debug = True args.nopilotlog = False - establish_logging(args, filename=config.Pilot.stageoutlog) + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, 
filename=config.Pilot.stageoutlog) logger = logging.getLogger(__name__) #ret = verify_args() diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 7858b4b6..9ebeac46 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -930,11 +930,12 @@ def dump(path, cmd="cat"): logger.info("path %s does not exist" % path) -def establish_logging(args, filename=config.Pilot.pilotlog): +def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): """ Setup and establish logging. - :param args: pilot arguments object. + :param debug: debug mode (Boolean), + :param nopilotlog: True when pilot log is not known (Boolean). :param filename: name of log file. :return: """ @@ -944,7 +945,7 @@ def establish_logging(args, filename=config.Pilot.pilotlog): _logger.propagate = False console = logging.StreamHandler(sys.stdout) - if args.debug: + if debug: format_str = '%(asctime)s | %(levelname)-8s | %(threadName)-19s | %(name)-32s | %(funcName)-25s | %(message)s' level = logging.DEBUG else: @@ -953,7 +954,7 @@ def establish_logging(args, filename=config.Pilot.pilotlog): #rank, maxrank = get_ranks_info() #if rank is not None: # format_str = 'Rank {0} |'.format(rank) + format_str - if args.nopilotlog: + if nopilotlog: logging.basicConfig(level=level, format=format_str, filemode='w') else: logging.basicConfig(filename=filename, level=level, format=format_str, filemode='w') From 14a86fdbc94d6d9c3efa4af408e6e2dab44d6b7a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:00:41 +0200 Subject: [PATCH 20/96] Added logging to dask --- pilot/api/dask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index e611a645..756596c9 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -11,6 +11,7 @@ #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute +from pilot.util.filehandling import establish_logging import logging logger = logging.getLogger(__name__) @@ -57,6 +58,8 @@ def _validate(self): 4. 
command: kubectl """ + establish_logging(debug=True) + try: import dask import dask_kubernetes From 8669f8d93bcfada2f707256feb3c1c08c7bc5cf0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:06:15 +0200 Subject: [PATCH 21/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 756596c9..1c5f32eb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -72,7 +72,7 @@ def _validate(self): for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) found = True if not 'not found' in stdout else False - if not found in stdout: + if found not in stdout: logger.warning(stdout) break if not found: From 92b1571f2e903571ace6e0b9316fddda5cd8872e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 13:06:53 +0200 Subject: [PATCH 22/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 1c5f32eb..d517c31f 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -72,7 +72,7 @@ def _validate(self): for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) found = True if not 'not found' in stdout else False - if found not in stdout: + if not found: logger.warning(stdout) break if not found: From df9f0a002ef87e51293b9b8943be322f01e1a533 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:17:39 +0200 Subject: [PATCH 23/96] Added override values --- pilot/api/dask.py | 56 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index d517c31f..4c5196c2 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -11,7 +11,9 @@ #from pilot.util.filehandling import get_table_from_file #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute -from pilot.util.filehandling import establish_logging +from pilot.util.filehandling import establish_logging, write_file + +import os import logging logger = logging.getLogger(__name__) @@ -24,6 +26,10 @@ class Dask(object): status = None loadbalancerip = None + servicetype = "LoadBalancer" + jupyter = False + overrides = "override_values.yaml" + _workdir = os.getcwd() def __init__(self, **kwargs): """ @@ -32,7 +38,15 @@ def __init__(self, **kwargs): :param kwargs: """ - pass + _servicetype = kwargs.get('servicetype', None) + if _servicetype: + self.servicetype = _servicetype + _jupyter = kwargs.get('jupyter', None) + if _jupyter: + self.jupyter = _jupyter + _overrides = kwargs.get('overrides', None) + if _overrides: + self.overrides = _overrides def install(self, block=True): """ @@ -44,9 +58,17 @@ def install(self, block=True): logger.warning('validation failed') self.status = 'failed' else: - logger.info('dask has been validated') + logger.debug('dask has been validated') self.status = 'validated' + # is the single-dask cluster already running? + cmd = 'kubectl get services' + exit_code, stdout, stderr = execute(cmd, mute=True) + if exit_code: + logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + self.status = 'failed' + else: + def _validate(self): """ Make sure that pre-conditions are met before any installation can be attempted. @@ -56,17 +78,22 @@ def _validate(self): 2. library: dask_kubernetes 3. command: helm 4. command: kubectl + 5. 
copy relevant yaml file(s) """ establish_logging(debug=True) + # import relevant modules try: import dask + logger.debug('dask imported') import dask_kubernetes + logger.debug('dask_kubernetes imported') except Exception as error: logger.warning('module not available: %s' % error) return False + # verify relevant commands commands = ['helm', 'kubectl'] found = False for cmd in commands: @@ -75,7 +102,30 @@ def _validate(self): if not found: logger.warning(stdout) break + else: + logger.debug('%s verified' % cmd) if not found: return False + # create yaml file(s) + self._generate_override_script() + return True + + def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): + """ + + """ + + filename = os.path.join(self._workdir, self.overrides) + if os.path.exists(filename): + logger.info('file \'%s\' already exists - will not override') + return + + script = "" + if not jupyter: + script += 'jupyter:\n\tenabled: false\n\n' + if servicetype: + script += 'scheduler:\n\tserviceType: \"%s\"\n' % servicetype + + write_file(filename, script) From 0d505f3d04e8e563f5411bcc51a55d7f3423a4c9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:18:50 +0200 Subject: [PATCH 24/96] Update --- pilot/api/dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 4c5196c2..2cbeaee1 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -68,6 +68,7 @@ def install(self, block=True): logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) self.status = 'failed' else: + pass def _validate(self): """ From 72bc46db8d13db0253a0d3f74d444fed8f5fe805 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:24:54 +0200 Subject: [PATCH 25/96] Removed tabs --- pilot/api/dask.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2cbeaee1..1565a839 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -68,6 +68,7 @@ def install(self, block=True): logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) self.status = 'failed' else: + # parse output pass def _validate(self): @@ -115,7 +116,11 @@ def _validate(self): def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): """ + Generate a values yaml script, unless it already exists. + :param jupyter: False if jupyter notebook server should be disabled (Boolean). + :param servicetype: name of service type (string). 
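+
+        The generated file is plain YAML, roughly:
+            jupyter:
+              enabled: false
+            scheduler:
+              serviceType: "LoadBalancer"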
+ :return: """ filename = os.path.join(self._workdir, self.overrides) @@ -125,8 +130,10 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): script = "" if not jupyter: - script += 'jupyter:\n\tenabled: false\n\n' + script += 'jupyter:\n enabled: false\n\n' if servicetype: - script += 'scheduler:\n\tserviceType: \"%s\"\n' % servicetype + script += 'scheduler:\n serviceType: \"%s\"\n' % servicetype - write_file(filename, script) + status = write_file(filename, script) + if status: + logger.debug('generated script: %s' % filename) From d13735ef37443030b48d8ae57f8c153973ed732a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 14:40:53 +0200 Subject: [PATCH 26/96] Parsing of kubectl output --- pilot/api/dask.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 1565a839..0442815a 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -12,6 +12,7 @@ #from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file +from pilot.util.parameters import convert_to_int import os @@ -24,6 +25,7 @@ class Dask(object): Dask interface class. """ + servicename = 'single-dask' status = None loadbalancerip = None servicetype = "LoadBalancer" @@ -38,6 +40,9 @@ def __init__(self, **kwargs): :param kwargs: """ + _servicename = kwargs.get('servicename', None) + if _servicename: + self.servicename = _servicename _servicetype = kwargs.get('servicetype', None) if _servicetype: self.servicetype = _servicetype @@ -69,7 +74,8 @@ def install(self, block=True): self.status = 'failed' else: # parse output - pass + dictionary = self._convert_to_dict(stdout) + logger.debug('d=%s' % str(dictionary)) def _validate(self): """ @@ -137,3 +143,35 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): status = write_file(filename, script) if status: logger.debug('generated script: %s' % filename) + + def _convert_to_dict(self, output): + """ + + """ + + dictionary = {} + summary_keys = [] # to keep track of content + header_locked = False + + for line in output.split('\n'): + try: + # Remove empty entries from list (caused by multiple \t) + _l = line.replace('\n', '') + _l = [_f for _f in _l.split('\t') if _f] # Python 3 + + # define dictionary keys + if type(_l[0]) == str and not header_locked: + summary_keys = _l + for key in _l: + dictionary[key] = [] + header_locked = True + else: # sort the memory measurements in the correct columns + for i, key in enumerate(_l): + # for key in _l: + key_entry = summary_keys[i] # e.g. 
Time + value = convert_to_int(key) + dictionary[key_entry].append(value) + except Exception: + logger.warning("unexpected format of utility output: %s" % line) + + return dictionary From c0d7da570d3f9d7ac60655d6ab30cde7de630e79 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:32:44 +0200 Subject: [PATCH 27/96] Checking if service is running --- pilot/api/dask.py | 81 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0442815a..d7b2c01c 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -15,6 +15,7 @@ from pilot.util.parameters import convert_to_int import os +import re import logging logger = logging.getLogger(__name__) @@ -67,15 +68,50 @@ def install(self, block=True): self.status = 'validated' # is the single-dask cluster already running? - cmd = 'kubectl get services' - exit_code, stdout, stderr = execute(cmd, mute=True) - if exit_code: - logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) - self.status = 'failed' - else: - # parse output - dictionary = self._convert_to_dict(stdout) - logger.debug('d=%s' % str(dictionary)) + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is running') + + def is_running(self, name='single-dask-scheduler'): + """ + + """ + + status = False + dictionary = self._get_dictionary(cmd='kubectl get services') + for key in dictionary: + if key == name: + status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False + break + + return status + + def _is_valid_ip(self, ip): + """ + + """ + + regex = "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" + return True if re.search(regex, ip) else False + + def _get_dictionary(self, cmd=None): + """ + + """ + + dictionary = {} + if not cmd: + return dictionary + + exit_code, stdout, stderr = execute(cmd, mute=True) + if exit_code: + logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + self.status = 'failed' + else: + # parse output + dictionary = self._convert_to_dict(stdout) + + return dictionary def _validate(self): """ @@ -153,24 +189,21 @@ def _convert_to_dict(self, output): summary_keys = [] # to keep track of content header_locked = False + dictionary = {} + first_line = [] for line in output.split('\n'): try: # Remove empty entries from list (caused by multiple \t) - _l = line.replace('\n', '') - _l = [_f for _f in _l.split('\t') if _f] # Python 3 - - # define dictionary keys - if type(_l[0]) == str and not header_locked: - summary_keys = _l - for key in _l: - dictionary[key] = [] - header_locked = True - else: # sort the memory measurements in the correct columns - for i, key in enumerate(_l): - # for key in _l: - key_entry = summary_keys[i] # e.g. 
Time - value = convert_to_int(key) - dictionary[key_entry].append(value) + _l = line + _l = [_f for _f in _l.split('\t') if _f] + + if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + first_line = _l[1:] + else: + dictionary[_l[0]] = {} + for i in range(len(_l[1:])): + dictionary[_l[0]][first_line[i]] = _l[1:][i] + except Exception: logger.warning("unexpected format of utility output: %s" % line) From 7480a83dca7eb8f4e7a171ee5bf4f4be223ae3bd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:34:43 +0200 Subject: [PATCH 28/96] Update --- pilot/api/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index d7b2c01c..b17c032e 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -70,7 +70,9 @@ def install(self, block=True): # is the single-dask cluster already running? name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running') + logger.info('service %s is running' % name) + else: + logger.info('service %s is not yet running' % name) def is_running(self, name='single-dask-scheduler'): """ @@ -167,7 +169,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): filename = os.path.join(self._workdir, self.overrides) if os.path.exists(filename): - logger.info('file \'%s\' already exists - will not override') + logger.info('file \'%s\' already exists - will not override' % filename) return script = "" From 02f40e07ac1c3634263afc4d96276cedee2697fa Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:40:16 +0200 Subject: [PATCH 29/96] Update --- pilot/api/dask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index b17c032e..98bde617 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -81,9 +81,12 @@ def is_running(self, name='single-dask-scheduler'): status = False dictionary = self._get_dictionary(cmd='kubectl get services') + logger.debug('d=%s' % str(dictionary)) for key in dictionary: if key == name: + logger.debug('ip:%s' % dictionary[key]['EXTERNAL-IP']) status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False + logger.debug('status=%s' % str(status)) break return status From 172cceb521f01888b4fb777f932c61d42df04358 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:44:25 +0200 Subject: [PATCH 30/96] Update --- pilot/api/dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 98bde617..847e35d7 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -190,13 +190,10 @@ def _convert_to_dict(self, output): """ - dictionary = {} - summary_keys = [] # to keep track of content - header_locked = False - dictionary = {} first_line = [] for line in output.split('\n'): + logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) _l = line @@ -212,4 +209,5 @@ def _convert_to_dict(self, output): except Exception: logger.warning("unexpected format of utility output: %s" % line) + logger.debug('dictionary=%s' % str(dictionary)) return dictionary From f91c677b50f95952bd2764d9fce2388c2e3e197b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:47:12 +0200 Subject: [PATCH 31/96] Update --- pilot/api/dask.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 847e35d7..372a5fdf 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -198,9 
+198,10 @@ def _convert_to_dict(self, output): # Remove empty entries from list (caused by multiple \t) _l = line _l = [_f for _f in _l.split('\t') if _f] - + logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] + logger.debug('first line=%s' % first_line) else: dictionary[_l[0]] = {} for i in range(len(_l[1:])): From 6b92b2191a501eeaa3b091c6f5bcec0dce9215b5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 15:51:18 +0200 Subject: [PATCH 32/96] Update --- pilot/api/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 372a5fdf..0f544a5c 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -196,8 +196,8 @@ def _convert_to_dict(self, output): logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) - _l = line - _l = [_f for _f in _l.split('\t') if _f] + _l = re.sub(' +', ' ', line) + _l = [_f for _f in _l.split(' ') if _f] logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] From 60a9bef3c78ca57df20b309bdfe098ad58f0de49 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:05:41 +0200 Subject: [PATCH 33/96] Installation and uninstallation of dask --- pilot/api/dask.py | 51 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0f544a5c..3f81af00 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -16,6 +16,7 @@ import os import re +from time import sleep import logging logger = logging.getLogger(__name__) @@ -54,6 +55,16 @@ def __init__(self, **kwargs): if _overrides: self.overrides = _overrides + def uninstall(self): + """ + + """ + + cmd = 'helm uninstall %s' % self.servicename + exit_code, stdout, stderr = execute(cmd, mute=True) + if not exit_code: + logger.info('service %s has been uninstalled' % self.servicename) + def install(self, block=True): """ @@ -70,9 +81,29 @@ def install(self, block=True): # is the single-dask cluster already running? 
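         # the dask/dask Helm chart exposes a '<servicename>-scheduler' service; it is
         # considered running once kubectl reports a valid EXTERNAL-IP for it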
name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running' % name) + logger.info('service %s is already running - nothing to install' % name) else: - logger.info('service %s is not yet running' % name) + logger.info('service %s is not yet running - proceed with installation' % name) + + # + override_option = "-f %s" % self.overrides if self.overrides else "" + cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) + #exit_code, stdout, stderr = execute(cmd, mute=True) + exit_code = 0 + if not exit_code: + logger.info('installation of service %s is in progress' % self.servicename) + + if block: + while True: + name = '%s-scheduler' % self.servicename + if self.is_running(name=name): + logger.info('service %s is running' % name) + self.status = 'running' + break + else: + self.status = 'pending' + sleep(2) + # note: in non-blocking mode, status is not getting updated def is_running(self, name='single-dask-scheduler'): """ @@ -81,12 +112,9 @@ def is_running(self, name='single-dask-scheduler'): status = False dictionary = self._get_dictionary(cmd='kubectl get services') - logger.debug('d=%s' % str(dictionary)) for key in dictionary: if key == name: - logger.debug('ip:%s' % dictionary[key]['EXTERNAL-IP']) status = True if self._is_valid_ip(dictionary[key]['EXTERNAL-IP']) else False - logger.debug('status=%s' % str(status)) break return status @@ -181,9 +209,12 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): if servicetype: script += 'scheduler:\n serviceType: \"%s\"\n' % servicetype - status = write_file(filename, script) - if status: - logger.debug('generated script: %s' % filename) + if script: + status = write_file(filename, script) + if status: + logger.debug('generated script: %s' % filename) + else: + self.overrides = None def _convert_to_dict(self, output): """ @@ -193,15 +224,12 @@ def _convert_to_dict(self, output): dictionary = {} first_line = [] for line in output.split('\n'): - logger.debug('line=%s' % line) try: # Remove empty entries from list (caused by multiple \t) _l = re.sub(' +', ' ', line) _l = [_f for _f in _l.split(' ') if _f] - logger.debug('_l=%s' % _l) if first_line == []: # "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE first_line = _l[1:] - logger.debug('first line=%s' % first_line) else: dictionary[_l[0]] = {} for i in range(len(_l[1:])): @@ -210,5 +238,4 @@ def _convert_to_dict(self, output): except Exception: logger.warning("unexpected format of utility output: %s" % line) - logger.debug('dictionary=%s' % str(dictionary)) return dictionary From 49e5c0cef2645a18d39dc4022b8a144751cc05a8 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:10:03 +0200 Subject: [PATCH 34/96] Update --- pilot/api/dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3f81af00..c3b95c39 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -63,6 +63,7 @@ def uninstall(self): cmd = 'helm uninstall %s' % self.servicename exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: + self.status = 'uninstalled' logger.info('service %s has been uninstalled' % self.servicename) def install(self, block=True): @@ -88,8 +89,7 @@ def install(self, block=True): # override_option = "-f %s" % self.overrides if self.overrides else "" cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) - #exit_code, stdout, stderr = execute(cmd, mute=True) - exit_code = 0 + exit_code, stdout, 
stderr = execute(cmd, mute=True) if not exit_code: logger.info('installation of service %s is in progress' % self.servicename) From 5826863b20f1be6145a766e35aa791ceb9ff8972 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:26:06 +0200 Subject: [PATCH 35/96] Connected cluster --- pilot/api/dask.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index c3b95c39..6d488afe 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -34,6 +34,7 @@ class Dask(object): jupyter = False overrides = "override_values.yaml" _workdir = os.getcwd() + cluster = None def __init__(self, **kwargs): """ @@ -55,16 +56,20 @@ def __init__(self, **kwargs): if _overrides: self.overrides = _overrides - def uninstall(self): + def uninstall(self, block=True): """ """ + logger.info('uninstalling service %s' % self.servicename) + if block: + logger.warning('blocking mode not yet implemented') + cmd = 'helm uninstall %s' % self.servicename exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: self.status = 'uninstalled' - logger.info('service %s has been uninstalled' % self.servicename) + logger.info('uninstall of service %s has been requested' % self.servicename) def install(self, block=True): """ @@ -239,3 +244,29 @@ def _convert_to_dict(self, output): logger.warning("unexpected format of utility output: %s" % line) return dictionary + + def connect_cluster(self): + """ + + """ + + logger.info('connecting to HelmCluster') + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + + def scale(self, number): + """ + + """ + + if number > 2: + logger.warning('too large scale: %d (please use <= 2 for now)' % number) + return + if not self.cluster: + self.connect_cluster() + if not self.cluster: + logger.warning('cluster not connected - cannot proceed') + self.status = 'failed' + return + + logger.info('setting scale to: %d' % number) + self.cluster.scale = number From c8cc5d55a1527035326b9e2a58c0f969879704b7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:32:14 +0200 Subject: [PATCH 36/96] Update --- pilot/api/dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 6d488afe..0709e965 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -251,6 +251,7 @@ def connect_cluster(self): """ logger.info('connecting to HelmCluster') + import dask_kubernetes self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) def scale(self, number): From 8126ef221b8c3ef4e91f096748dd38379e3f5a42 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:33:32 +0200 Subject: [PATCH 37/96] Update --- pilot/api/dask.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 0709e965..3174574d 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -251,8 +251,12 @@ def connect_cluster(self): """ logger.info('connecting to HelmCluster') - import dask_kubernetes - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + try: + import dask_kubernetes + except Exception as error: + logger.warning('failed to import dask_kubernetes') + else: + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) def scale(self, number): """ From 6e2b1514061172f9265d520ce73f388a1985c3a1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:34:09 +0200 Subject: [PATCH 38/96] Update --- pilot/api/dask.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3174574d..491af4c4 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -250,13 +250,13 @@ def connect_cluster(self): """ - logger.info('connecting to HelmCluster') try: import dask_kubernetes except Exception as error: logger.warning('failed to import dask_kubernetes') else: self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + logger.info('connected to HelmCluster') def scale(self, number): """ From beed675a6398752445b7185d71f71bf2f740ea05 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 16:37:38 +0200 Subject: [PATCH 39/96] Update --- pilot/api/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 491af4c4..c62c3a6f 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -274,4 +274,4 @@ def scale(self, number): return logger.info('setting scale to: %d' % number) - self.cluster.scale = number + self.cluster.scale(number) From 7a177e4fe326cf9440c0a83a9c03689dd762f65f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 18:56:04 +0200 Subject: [PATCH 40/96] Fixed bad xcache debugging --- PILOTVERSION | 2 +- pilot/api/dask.py | 3 --- pilot/control/data.py | 12 ++++++------ pilot/control/payloads/generic.py | 24 ++++++++++++------------ pilot/util/constants.py | 2 +- 5 files changed, 20 insertions(+), 23 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c51eef03..0bab6e7d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.13 \ No newline at end of file +2.11.3.15 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py index c62c3a6f..3d027847 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -8,11 +8,8 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2021 #from pilot.common.exception import NotDefined, NotSameLength, UnknownException -#from pilot.util.filehandling import get_table_from_file -#from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file -from pilot.util.parameters import convert_to_int import os import re diff --git a/pilot/control/data.py b/pilot/control/data.py index f8e80c94..a754b20d 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -478,18 +478,18 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s' % stdout) - logger.debug('[before xcache start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache start] stdout=%s' % _stdout) + logger.debug('[before xcache start] stderr=%s' % _stderr) exit_code, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s' % stdout) logger.debug('stderr=%s' % stderr) # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s' % stdout) - logger.debug('[after xcache start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args 
--no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache start] stdout=%s' % _stdout) + logger.debug('[after xcache start] stderr=%s' % _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 3c674fd6..c6bf1bf6 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -589,9 +589,9 @@ def run(self): # noqa: C901 # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s' % stdout) - logger.debug('[before payload start] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before payload start] stdout=%s' % _stdout) + logger.debug('[before payload start] stderr=%s' % _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -639,9 +639,9 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s' % stdout) - logger.debug('[after payload finish] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after payload finish] stdout=%s' % _stdout) + logger.debug('[after payload finish] stderr=%s' % _stderr) # stop the utility command (e.g. 
a coprocess if necessary if proc_co: @@ -691,16 +691,16 @@ def run_utility_after_payload_finished(self): logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % stdout) - logger.debug('[before xcache kill] stderr=%s' % stderr) + exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % _stdout) + logger.debug('[before xcache kill] stderr=%s' % _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') # xcache debug - exit_code, stdout, stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % stdout) - logger.debug('[after xcache kill] stderr=%s' % stderr) + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % _stdout) + logger.debug('[after xcache kill] stderr=%s' % _stderr) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c05c9593..50ebe53d 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +BUILD = '15' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From cac5620e0a45a8ec6980dd80c6f80d8b08ac90d0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 19:02:26 +0200 Subject: [PATCH 41/96] Part 1 of 2; fix for postprocesses and xcache --- pilot/control/payloads/generic.py | 8 ++++---- pilot/user/atlas/common.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c6bf1bf6..c50fa634 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -652,8 +652,7 @@ def run(self): # noqa: C901 logger.warning('detected unset exit_code from wait_graceful - reset to -1') exit_code = -1 - if state != 'failed': - exit_code = self.run_utility_after_payload_finished() + exit_code = self.run_utility_after_payload_finished(state) self.post_payload(self.__job) @@ -670,10 +669,11 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self): + def run_utility_after_payload_finished(self, state): """ Run utility command after the main payload has finished. + :param state: payload state; finished/failed (string). :return: exit code (int). 
""" @@ -683,7 +683,7 @@ def run_utility_after_payload_finished(self): except Exception as e: logger.error(e) else: - if cmd_after_payload and self.__job.postprocess: + if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index ee0ecd90..7358c5ba 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1901,7 +1901,7 @@ def get_utility_commands(order=None, job=None): if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' - if 'pilotXcache' in job.infosys.queuedata.catchall: + if 'pilotXcache' in job.infosys.queuedata.catchall: # should be UTILITY_AFTER_PAYLOAD_FINISHED2 com = xcache_deactivation_command(job.workdir) com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: From 4964894a14ec4a203057f1e51030e1991ae31161 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:30:34 +0200 Subject: [PATCH 42/96] Update --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0bab6e7d..d6ebd519 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.15 \ No newline at end of file +2.11.3.17 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c50fa634..92db9b85 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -602,7 +602,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished() + exit_code = self.run_utility_after_payload_finished(True) self.post_payload(self.__job) else: break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 50ebe53d..b276f475 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '15' # build number should be reset to '1' for every new development cycle +BUILD = '17' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 3eb851a87d5261c1dd0fa45c17a415d480d920a7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:36:03 +0200 Subject: [PATCH 43/96] Flake8 corrections --- PILOTVERSION | 2 +- pilot/api/dask.py | 24 ++++++++++++------------ pilot/user/generic/common.py | 1 - pilot/util/constants.py | 2 +- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d6ebd519..492fd442 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.17 \ No newline at end of file +2.11.3.18 \ No newline at end of file diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 3d027847..2f737ac3 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -7,6 +7,12 @@ # 
Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021 +try: + # import dask + import dask_kubernetes +except Exception: + pass + #from pilot.common.exception import NotDefined, NotSameLength, UnknownException from pilot.util.container import execute from pilot.util.filehandling import establish_logging, write_file @@ -126,7 +132,7 @@ def _is_valid_ip(self, ip): """ - regex = "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" + regex = r"^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$" return True if re.search(regex, ip) else False def _get_dictionary(self, cmd=None): @@ -162,22 +168,16 @@ def _validate(self): establish_logging(debug=True) - # import relevant modules - try: - import dask - logger.debug('dask imported') - import dask_kubernetes - logger.debug('dask_kubernetes imported') - except Exception as error: - logger.warning('module not available: %s' % error) - return False + # check imported modules + # dask + # dask_kubernetes # verify relevant commands commands = ['helm', 'kubectl'] found = False for cmd in commands: exit_code, stdout, stderr = execute('which %s' % cmd, mute=True) - found = True if not 'not found' in stdout else False + found = True if 'not found' not in stdout else False if not found: logger.warning(stdout) break @@ -250,7 +250,7 @@ def connect_cluster(self): try: import dask_kubernetes except Exception as error: - logger.warning('failed to import dask_kubernetes') + logger.warning('failed to import dask_kubernetes: %s' % error) else: self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) logger.info('connected to HelmCluster') diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index b21442e1..069494a8 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -270,4 +270,3 @@ def post_prestagein_utility_command(**kwargs): # stdout = kwargs.get('output', None) pass - diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b276f475..92968e7f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '17' # build number should be reset to '1' for every new development cycle +BUILD = '18' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e0c80b56125e78883b64a090dfcd0b5f273057be Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:37:24 +0200 Subject: [PATCH 44/96] Flake8 corrections --- pilot/api/dask.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2f737ac3..ce3f40fb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -247,13 +247,8 @@ def connect_cluster(self): """ - try: - import dask_kubernetes - except Exception as error: - logger.warning('failed to import dask_kubernetes: %s' % error) - else: - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) - logger.info('connected to HelmCluster') + self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) + logger.info('connected to HelmCluster') def scale(self, number): """ From 574bdaa6dad5defcc3047cdfe871368df41f84c2 Mon Sep 17 00:00:00 
2001 From: Paul Nilsson Date: Tue, 25 May 2021 20:42:01 +0200 Subject: [PATCH 45/96] Update --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 4 ++-- pilot/util/constants.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 492fd442..678fd135 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.18 \ No newline at end of file +2.11.3.19 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 7358c5ba..d3c06380 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1932,8 +1932,8 @@ def post_prestagein_utility_command(**kwargs): alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') if alrb_xcache_files: cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' - exit_code, _stdout, _stderr = execute(cmd, usecontainer=False) - logger.debug('cmd=%s:\n\n%s\n\n' % _stdout) + exit_code, _stdout, _stderr = execute(cmd) + logger.debug('cmd=%s:\n\n%s\n\n' % (cmd, _stdout)) def xcache_proxy(output): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 92968e7f..be972231 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '18' # build number should be reset to '1' for every new development cycle +BUILD = '19' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From ab1406bbe3a8e88929c41170bc644116a3400a76 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 May 2021 17:28:45 +0200 Subject: [PATCH 46/96] Now allowing for different dask_kubernetes manager --- pilot/api/dask.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index ce3f40fb..2f7a4066 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -94,6 +94,8 @@ def install(self, block=True): else: logger.info('service %s is not yet running - proceed with installation' % name) + # perform helm updates before actual instqllation + cmd = '' # override_option = "-f %s" % self.overrides if self.overrides else "" cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) @@ -242,13 +244,13 @@ def _convert_to_dict(self, output): return dictionary - def connect_cluster(self): + def connect_cluster(self, release_name=self.servicename, manager=dask_kubernetes.HelmCluster): """ """ - self.cluster = dask_kubernetes.HelmCluster(release_name=self.servicename) - logger.info('connected to HelmCluster') + self.cluster = manager(release_name=self.servicename) + logger.info('connected to %s' % manager.__name__) def scale(self, number): """ @@ -267,3 +269,12 @@ def scale(self, number): logger.info('setting scale to: %d' % number) self.cluster.scale(number) + + def shutdown(self): + """ + Shutdown logging. 
+ + """ + + logging.handlers = [] + logging.shutdown() From 49b0775669533214d339ee8156a1260d48623f4c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 May 2021 17:30:39 +0200 Subject: [PATCH 47/96] Update --- pilot/api/dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pilot/api/dask.py b/pilot/api/dask.py index 2f7a4066..ad051c00 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ -244,12 +244,14 @@ def _convert_to_dict(self, output): return dictionary - def connect_cluster(self, release_name=self.servicename, manager=dask_kubernetes.HelmCluster): + def connect_cluster(self, release_name=None, manager=dask_kubernetes.HelmCluster): """ """ - self.cluster = manager(release_name=self.servicename) + if not release_name: + release_name = self.servicename + self.cluster = manager(release_name=release_name) logger.info('connected to %s' % manager.__name__) def scale(self, number): From 0b2462c2a32f7f61f250164a2911a885ec7570a0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 10:43:58 +0200 Subject: [PATCH 48/96] Updated post-process handling to support multiple post-processes (like xcache + HPO). Added '-b 4' option to xcache start. Now expanding env vars in xml, needed for xcache. --- PILOTVERSION | 2 +- pilot/control/data.py | 2 ++ pilot/control/payloads/generic.py | 18 ++++++++++++------ pilot/user/atlas/common.py | 11 ++++++----- pilot/user/atlas/metadata.py | 1 + pilot/util/constants.py | 9 +++++---- 6 files changed, 27 insertions(+), 16 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 678fd135..e265e54b 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.19 \ No newline at end of file +2.11.3.21 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index a754b20d..5df19f89 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -683,6 +683,7 @@ def get_input_file_dictionary(indata): Return an input file dictionary. Format: {'guid': 'pfn', ..} Normally use_turl would be set to True if direct access is used. + Note: any environment variables in the turls will be expanded :param indata: list of FileSpec objects. :return: file dictionary. 
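For reference, the turl expansion added in the following hunk relies on os.path.expandvars(); a minimal, hypothetical sketch of its effect on an xcache-style turl (the proxy value below is made up for illustration):

    import os

    # hypothetical prefix; in production ALRB_XCACHE_PROXY is exported by 'xcache start'
    os.environ['ALRB_XCACHE_PROXY'] = 'root://localhost:12345//'
    turl = '${ALRB_XCACHE_PROXY}root://some.rse.org:1094//path/DAOD.pool.root.1'
    print(os.path.expandvars(turl))
    # -> root://localhost:12345//root://some.rse.org:1094//path/DAOD.pool.root.1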
@@ -692,6 +693,7 @@ def get_input_file_dictionary(indata): for fspec in indata: ret[fspec.guid] = fspec.turl if fspec.status == 'remote_io' else fspec.lfn + ret[fspec.guid] = os.path.expandvars(ret[fspec.guid]) # correction for ND and mv # in any case use the lfn instead of pfn since there are trf's that have problems with pfn's diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 92db9b85..630639d3 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -23,7 +23,7 @@ from pilot.util.container import execute from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED, \ UTILITY_AFTER_PAYLOAD_FINISHED, PILOT_PRE_SETUP, PILOT_POST_SETUP, PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD, \ - UTILITY_AFTER_PAYLOAD_STARTED2 + UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.filehandling import write_file from pilot.util.processes import kill_processes from pilot.util.timing import add_to_pilot_timing @@ -210,7 +210,7 @@ def utility_after_payload_started_new(self, job): # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job): + def utility_after_payload_finished(self, job, horovod_mode): """ Prepare commands/utilities to run after payload has finished. @@ -219,6 +219,8 @@ def utility_after_payload_finished(self, job): REFACTOR :param job: job object. + :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). + :return: """ cmd = "" @@ -227,8 +229,10 @@ def utility_after_payload_finished(self, job): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + order = UTILITY_AFTER_PAYLOAD_FINISHED if not horovod_mode else UTILITY_AFTER_PAYLOAD_FINISHED2 + # should any additional commands be prepended to the payload execution string? - cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_FINISHED, job=job) + cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) @@ -602,7 +606,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(True) + exit_code = self.run_utility_after_payload_finished(True, horovod_mode=True) self.post_payload(self.__job) else: break @@ -669,17 +673,19 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self, state): + def run_utility_after_payload_finished(self, state, horovod_mode=False): """ Run utility command after the main payload has finished. + In horovod mode, select the corresponding post-process. Otherwise, select different post-process (e.g. Xcache). :param state: payload state; finished/failed (string). + :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). :return: exit code (int). 
""" exit_code = 0 try: - cmd_after_payload = self.utility_after_payload_finished(self.__job) + cmd_after_payload = self.utility_after_payload_finished(self.__job, horovod_mode=horovod_mode) except Exception as e: logger.error(e) else: diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d3c06380..10f11f5d 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -35,7 +35,7 @@ from pilot.util.config import config from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2,\ - UTILITY_BEFORE_STAGEIN + UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum @@ -1898,12 +1898,13 @@ def get_utility_commands(order=None, job=None): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' elif order == UTILITY_AFTER_PAYLOAD_FINISHED: + if 'pilotXcache' in job.infosys.queuedata.catchall: + com = xcache_deactivation_command(job.workdir) + com['label'] = 'xcache_kill' + elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: if job.postprocess and job.postprocess.get('command', ''): com = download_command(job.postprocess, job.workdir) com['label'] = 'postprocess' - if 'pilotXcache' in job.infosys.queuedata.catchall: # should be UTILITY_AFTER_PAYLOAD_FINISHED2 - com = xcache_deactivation_command(job.workdir) - com['label'] = 'xcache_kill' elif order == UTILITY_BEFORE_STAGEIN: if 'pilotXcache' in job.infosys.queuedata.catchall: com = xcache_activation_command(job.jobid) @@ -1990,7 +1991,7 @@ def xcache_activation_command(jobid): # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 command = "%s " % get_asetup(asetup=False) # add 'xcache list' which will also kill any orphaned processes lingering in the system - command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g" % jobid + command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid return {'command': command, 'args': ''} diff --git a/pilot/user/atlas/metadata.py b/pilot/user/atlas/metadata.py index e5d45f3b..25f18d66 100644 --- a/pilot/user/atlas/metadata.py +++ b/pilot/user/atlas/metadata.py @@ -21,6 +21,7 @@ def create_input_file_metadata(file_dictionary, workdir, filename="PoolFileCatal """ Create a Pool File Catalog for the files listed in the input dictionary. The function creates properly formatted XML (pretty printed) and writes the XML to file. + Note: any environment variables in the pfn tags will be expanded (see pilot/control/data::get_input_file_dictionary()). 
Format: dictionary = {'guid': 'pfn', ..} diff --git a/pilot/util/constants.py b/pilot/util/constants.py index be972231..f7104532 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 -VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates +VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '19' # build number should be reset to '1' for every new development cycle +BUILD = '21' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -28,8 +28,9 @@ UTILITY_AFTER_PAYLOAD_STARTED2 = 4 UTILITY_AFTER_PAYLOAD = 5 UTILITY_AFTER_PAYLOAD_FINISHED = 6 -UTILITY_BEFORE_STAGEIN = 7 -UTILITY_WITH_STAGEIN = 8 +UTILITY_AFTER_PAYLOAD_FINISHED2 = 7 +UTILITY_BEFORE_STAGEIN = 8 +UTILITY_WITH_STAGEIN = 9 # Timing constants that allow for additional constants to be defined for values before the pilot is started, ie for # wrapper timing purposes. From dd8e5aa0932a14faf6b18436706b7771505954fc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 11:03:24 +0200 Subject: [PATCH 49/96] Updated post-process handling (added label) to support multiple post-processes (like xcache + HPO). --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 43 +++++++++++++++++-------------- pilot/util/constants.py | 2 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e265e54b..f1cab8c7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.21 \ No newline at end of file +2.11.3.22 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 630639d3..8236f24b 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -210,17 +210,17 @@ def utility_after_payload_started_new(self, job): # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job, horovod_mode): + def utility_after_payload_finished(self, job, order): """ Prepare commands/utilities to run after payload has finished. This command will be executed later. - REFACTOR + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 :param job: job object. - :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). - :return: + :param order: constant used for utility selection (constant). + :return: command (string), label (string). """ cmd = "" @@ -229,15 +229,13 @@ def utility_after_payload_finished(self, job, horovod_mode): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - order = UTILITY_AFTER_PAYLOAD_FINISHED if not horovod_mode else UTILITY_AFTER_PAYLOAD_FINISHED2 - # should any additional commands be prepended to the payload execution string? 
cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) - return cmd + return cmd, cmd_dictionary.get('label') def execute_utility_command(self, cmd, job, label): """ @@ -606,7 +604,7 @@ def run(self): # noqa: C901 # run the post-process command even if there was no main payload if os.environ.get('HARVESTER_HOROVOD', '') != '': logger.info('No need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(True, horovod_mode=True) + exit_code = self.run_utility_after_payload_finished(True, UTILITY_AFTER_PAYLOAD_FINISHED2) self.post_payload(self.__job) else: break @@ -656,7 +654,8 @@ def run(self): # noqa: C901 logger.warning('detected unset exit_code from wait_graceful - reset to -1') exit_code = -1 - exit_code = self.run_utility_after_payload_finished(state) + for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]: + exit_code = self.run_utility_after_payload_finished(state, order) self.post_payload(self.__job) @@ -673,40 +672,44 @@ def run(self): # noqa: C901 return exit_code - def run_utility_after_payload_finished(self, state, horovod_mode=False): + def run_utility_after_payload_finished(self, state, order): """ Run utility command after the main payload has finished. In horovod mode, select the corresponding post-process. Otherwise, select different post-process (e.g. Xcache). + The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 + :param state: payload state; finished/failed (string). - :param horovod_mode: True if HARVESTER_HOROVOD is set (Boolean). + :param order: constant used for utility selection (constant). :return: exit code (int). 
""" exit_code = 0 try: - cmd_after_payload = self.utility_after_payload_finished(self.__job, horovod_mode=horovod_mode) + cmd_after_payload, label = self.utility_after_payload_finished(self.__job, order) except Exception as e: logger.error(e) else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'postprocess') + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) elif cmd_after_payload: logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % _stdout) - logger.debug('[before xcache kill] stderr=%s' % _stderr) + if 'xcache' in cmd_after_payload: + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[before xcache kill] stdout=%s' % _stdout) + logger.debug('[before xcache kill] stderr=%s' % _stderr) - exit_code = self.execute_utility_command(cmd_after_payload, self.__job, 'xcache_kill') + exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % _stdout) - logger.debug('[after xcache kill] stderr=%s' % _stderr) + if 'xcache' in cmd_after_payload: + _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + logger.debug('[after xcache kill] stdout=%s' % _stdout) + logger.debug('[after xcache kill] stderr=%s' % _stderr) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f7104532..bdf4f12b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '21' # build number should be reset to '1' for every new development cycle +BUILD = '22' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e8d2fef985e194d02c3c689a6268ed1237bdad44 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:10:03 +0200 Subject: [PATCH 50/96] Refactored get_utility_commands() --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 108 +++++++++++++++++++++++++---------- pilot/user/generic/common.py | 8 +-- pilot/util/constants.py | 11 ++-- 4 files changed, 88 insertions(+), 41 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f1cab8c7..259ef8c5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.22 \ No newline at end of file +2.11.3.23 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 10f11f5d..b8053c41 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -34,8 +34,7 @@ from pilot.util.auxiliary import is_python3 from pilot.util.config import config from 
pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ - UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2,\ - UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 + UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum @@ -1837,6 +1836,8 @@ def download_command(process, workdir): """ Download the pre/postprocess commands if necessary. + Process FORMAT: {'command': , 'args': , 'label': } + :param process: pre/postprocess dictionary. :param workdir: job workdir (string). :return: updated pre/postprocess dictionary. @@ -1870,46 +1871,87 @@ def get_utility_commands(order=None, job=None): should be returned. If order=UTILITY_WITH_STAGEIN, the commands that should be executed parallel with stage-in will be returned. - FORMAT: {'command': , 'args': } + FORMAT: {'command': , 'args': , 'label': } :param order: optional sorting order (see pilot.util.constants). :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ - com = {} - if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: - if job.preprocess.get('command', ''): - com = download_command(job.preprocess, job.workdir) - com['label'] = 'preprocess' + return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') elif order == UTILITY_WITH_PAYLOAD: - com = {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} + return {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} elif order == UTILITY_AFTER_PAYLOAD_STARTED: - cmd = config.Pilot.utility_after_payload_started - if cmd: - com = {'command': cmd, 'args': '', 'label': cmd.lower()} + return get_utility_after_payload_started() elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: - if job.coprocess.get('command', ''): - com = download_command(job.coprocess, job.workdir) - com['label'] = 'coprocess' - elif order == UTILITY_AFTER_PAYLOAD and job.postprocess: - if job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - com['label'] = 'postprocess' + return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') elif order == UTILITY_AFTER_PAYLOAD_FINISHED: - if 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_deactivation_command(job.workdir) - com['label'] = 'xcache_kill' + return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: - if job.postprocess and job.postprocess.get('command', ''): - com = download_command(job.postprocess, job.workdir) - com['label'] = 'postprocess' + return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') elif order == UTILITY_BEFORE_STAGEIN: - if 'pilotXcache' in job.infosys.queuedata.catchall: - com = xcache_activation_command(job.jobid) - com['label'] = 'xcache' + return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) + +def get_precopostprocess_command(process, workdir, label): + """ + Return the pre/co/post-process command dictionary. 
+ + Command FORMAT: {'command': , 'args': , 'label': } + + The returned command has the structure: { 'command': , } + :param process: pre/co/post-process (dictionary). + :param workdir: working directory (string). + :param label: label (string). + :return: command (dictionary). + """ + + com = {} + if process.get('command', ''): + com = download_command(process, workdir) + com['label'] = label + return com + + +def get_utility_after_payload_started(): + """ + Return the command dictionary for the utility after the payload has started. + + Command FORMAT: {'command': , 'args': , 'label': } + + :return: command (dictionary). + """ + + com = {} + try: + cmd = config.Pilot.utility_after_payload_started + except Exception: + pass + else: + if cmd: + com = {'command': cmd, 'args': '', 'label': cmd.lower()} + return com + + +def get_xcache_command(catchall, workdir, jobid, label, xcache_function): + """ + Return the proper xcache command for either activation or deactivation. + + Command FORMAT: {'command': , 'args': , 'label': } + + :param catchall: queuedata catchall field (string). + :param workdir: job working directory (string). + :param jobid: PanDA job id (string). + :param label: label (string). + :param xcache_function: activation/deactivation function name (function). + :return: command (dictionary). + """ + + com = {} + if 'pilotXcache' in catchall: + com = xcache_function(jobid=jobid, workdir=workdir) + com['label'] = label return com @@ -1976,10 +2018,13 @@ def set_xcache_var(line, name='', pattern=''): os.environ[name] = result[0] -def xcache_activation_command(jobid): +def xcache_activation_command(workdir='', jobid=''): """ Return the xcache service activation command. + Note: the workdir is not used here, but the function prototype needs it in the caller (xcache_deactivation_command needs it). + + :param workdir: unused work directory - do not remove (string). :param jobid: PanDA job id to guarantee that xcache process is unique (int). :return: xcache command (string). """ @@ -1996,13 +2041,16 @@ def xcache_activation_command(jobid): return {'command': command, 'args': ''} -def xcache_deactivation_command(workdir): +def xcache_deactivation_command(workdir='', jobid=''): """ Return the xcache service deactivation command. This service should be stopped after the payload has finished. Copy the messages log before shutting down. + Note: the job id is not used here, but the function prototype needs it in the caller (xcache_activation_command needs it). + :param workdir: payload work directory (string). + :param jobid: unused job id - do not remove (string). :return: xcache command (string). """ diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 069494a8..51f05632 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -12,7 +12,7 @@ from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD +from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED from pilot.util.filehandling import read_file from .setup import get_analysis_trf @@ -130,7 +130,7 @@ def get_utility_commands(order=None, job=None): If the optional order parameter is set, the function should return the list of corresponding commands. E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the payload.
If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. If order=UTILITY_AFTER_PAYLOAD, the commands that should be executed after the payload has been started + string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started should be returned. FORMAT: {'command': , 'args': } @@ -160,14 +160,14 @@ def get_utility_command_execution_order(name): Should the given utility command be executed before or after the payload? :param name: utility name (string). - :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD) + :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) """ # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD else: - return UTILITY_AFTER_PAYLOAD + return UTILITY_AFTER_PAYLOAD_STARTED def post_utility_command_action(name, job): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index bdf4f12b..8fdf5847 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '22' # build number should be reset to '1' for every new development cycle +BUILD = '23' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -26,11 +26,10 @@ UTILITY_WITH_PAYLOAD = 2 UTILITY_AFTER_PAYLOAD_STARTED = 3 UTILITY_AFTER_PAYLOAD_STARTED2 = 4 -UTILITY_AFTER_PAYLOAD = 5 -UTILITY_AFTER_PAYLOAD_FINISHED = 6 -UTILITY_AFTER_PAYLOAD_FINISHED2 = 7 -UTILITY_BEFORE_STAGEIN = 8 -UTILITY_WITH_STAGEIN = 9 +UTILITY_AFTER_PAYLOAD_FINISHED = 5 +UTILITY_AFTER_PAYLOAD_FINISHED2 = 6 +UTILITY_BEFORE_STAGEIN = 7 +UTILITY_WITH_STAGEIN = 8 # Timing constants that allow for additional constants to be defined for values before the pilot is started, ie for # wrapper timing purposes. 
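The helpers introduced in PATCH 50 above (get_precopostprocess_command(), get_utility_after_payload_started() and get_xcache_command()) all return a dictionary of the form {'command': ..., 'args': ..., 'label': ...}, or an empty dictionary when nothing applies for the given order constant. As a rough illustration of how such a dictionary can be consumed, and not code from this patch series, a caller could look like the sketch below; the run_utility() name and the use of the subprocess module are assumptions made for the example (the pilot itself runs commands through pilot.util.container.execute):

    import subprocess

    def run_utility(com):
        # com is a utility dictionary of the form {'command': .., 'args': .., 'label': ..} (illustrative only)
        if not com:
            return 0  # nothing to run for this order constant
        cmdline = '%s %s' % (com.get('command'), com.get('args', ''))
        print('executing %s utility: %s' % (com.get('label', 'utility'), cmdline))
        return subprocess.call(cmdline, shell=True)  # exit code of the utility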
From 9cc2fac9b56fd642d406ac8b5907fff371e6657c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:18:50 +0200 Subject: [PATCH 51/96] Update --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 259ef8c5..b24b5952 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.23 \ No newline at end of file +2.11.3.24 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index b8053c41..75d75f0f 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1888,7 +1888,7 @@ def get_utility_commands(order=None, job=None): return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') elif order == UTILITY_AFTER_PAYLOAD_FINISHED: return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) - elif order == UTILITY_AFTER_PAYLOAD_FINISHED2: + elif order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') elif order == UTILITY_BEFORE_STAGEIN: return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8fdf5847..76949f59 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '23' # build number should be reset to '1' for every new development cycle +BUILD = '24' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 18a30b7b1f9a0cf7283d02c4cf7a3c57f92199ad Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 12:25:52 +0200 Subject: [PATCH 52/96] Refactored validate() --- PILOTVERSION | 2 +- pilot/control/job.py | 51 ++++++++++++++++++++++++++--------------- pilot/util/constants.py | 2 +- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index b24b5952..72a0e522 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.24 \ No newline at end of file +2.11.3.25 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index d5d6b41d..aac2d848 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -833,11 +833,11 @@ def get_payload_log_tail(job): def validate(queues, traces, args): """ - (add description) + Perform validation of job. - :param queues: - :param traces: - :param args: + :param queues: queues object. + :param traces: traces object. + :param args: args object. 
:return: """ @@ -904,21 +904,7 @@ def validate(queues, traces, args): store_jobid(job.jobid, args.sourcedir) # run the delayed space check now - proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False - if proceed_with_local_space_check: - logger.debug('pilot will now perform delayed space check') - ec, diagnostics = check_local_space() - if ec != 0: - traces.pilot['error_code'] = errors.NOLOCALSPACE - # set the corresponding error code - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) - put_in_queue(job, queues.failed_jobs) - else: - put_in_queue(job, queues.validated_jobs) - else: - put_in_queue(job, queues.validated_jobs) - + delayed_space_check(queues, traces, args, job) else: logger.debug('Failed to validate job=%s' % job.jobid) put_in_queue(job, queues.failed_jobs) @@ -933,6 +919,33 @@ def validate(queues, traces, args): logger.debug('[job] validate thread has finished') +def delayed_space_check(queues, traces, args, job): + """ + Run the delayed space check if necessary. + + :param queues: queues object. + :param traces: traces object. + :param args: args object. + :param job: job object. + :return: + """ + + proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False + if proceed_with_local_space_check: + logger.debug('pilot will now perform delayed space check') + ec, diagnostics = check_local_space() + if ec != 0: + traces.pilot['error_code'] = errors.NOLOCALSPACE + # set the corresponding error code + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) + logger.debug('Failed to validate job=%s' % job.jobid) + put_in_queue(job, queues.failed_jobs) + else: + put_in_queue(job, queues.validated_jobs) + else: + put_in_queue(job, queues.validated_jobs) + + def create_k8_link(job_dir): """ Create a soft link to the payload workdir on Kubernetes if SHARED_DIR exists. 
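The delayed_space_check() helper extracted above hands the actual check to check_local_space() from pilot.util.monitoring, which returns an (exit code, diagnostics) pair; a non-zero exit code sends the job to the failed_jobs queue with the NOLOCALSPACE error, otherwise the job proceeds to validated_jobs. The stand-alone sketch below only illustrates that return contract; the shutil-based implementation and the 2 GB threshold are assumptions for the example and not the pilot's actual space check:

    import shutil

    def minimal_local_space_check(path='.', limit_mb=2048):
        # illustrative stand-in for check_local_space(): (0, '') if enough space, else (1, diagnostics)
        free_mb = shutil.disk_usage(path).free // (1024 * 1024)
        if free_mb < limit_mb:
            return 1, 'too little local space in %s: %d MB free (limit %d MB)' % (path, free_mb, limit_mb)
        return 0, ''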
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 76949f59..6199ffca 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '24' # build number should be reset to '1' for every new development cycle +BUILD = '25' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From c7bcc4ccd67427f6a871bff86e780eecece45c1a Mon Sep 17 00:00:00 2001 From: Shuwei Ye Date: Thu, 27 May 2021 07:41:32 -0400 Subject: [PATCH 53/96] Lowercased some variable names in gs.py to comply with flake8 --- pilot/copytool/gs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 03be7e77..68e50b5c 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -154,8 +154,8 @@ def copy_out(files, **kwargs): import re # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) - reObj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) - (bucket, remote_path) = reObj.groups() + reobj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) + (bucket, remote_path) = reobj.groups() # ["pilotlog.txt", "payload.stdout", "payload.stderr"]: for logfile in os.listdir(workdir): @@ -208,9 +208,9 @@ def upload_file(file_name, bucket, object_name=None): blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): - url_pilotLog = blob.public_url - os.environ['GTAG'] = url_pilotLog - logger.debug("Set envvar GTAG with the pilotLot URL=%s" % url_pilotLog) + url_pilotlog = blob.public_url + os.environ['GTAG'] = url_pilotlog + logger.debug("Set envvar GTAG with the pilot log URL=%s" % url_pilotlog) except Exception as e: diagnostics = 'exception caught in gs client: %s' % e logger.critical(diagnostics) From cb8d6fbdf328996ea86b3e7acfe868e373140326 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 14:43:36 +0200 Subject: [PATCH 54/96] Added error code for no ctypes.
Now using ctypes to guarantee orphans actually having parent processes --- PILOTVERSION | 2 +- pilot/common/errorcodes.py | 4 +++- pilot/control/job.py | 35 +++++++++++++++++++++++++++++++++-- pilot/util/constants.py | 2 +- pilot/util/processes.py | 3 ++- 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 72a0e522..7963461e 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.25 \ No newline at end of file +2.11.3.26 \ No newline at end of file diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 08aa02b7..49233a76 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -147,6 +147,7 @@ class ErrorCodes: XRDCPERROR = 1362 KILLPAYLOAD = 1363 # note, not a failure but a kill instruction from Raythena MISSINGCREDENTIALS = 1364 + NOCTYPES = 1365 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -272,7 +273,8 @@ class ErrorCodes: REMOTEFILECOULDNOTBEOPENED: "Remote file could not be opened", XRDCPERROR: "Xrdcp was unable to open file", KILLPAYLOAD: "Raythena has decided to kill payload", - MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer" + MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer", + NOCTYPES: "Python module ctypes not available on worker node" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] diff --git a/pilot/control/job.py b/pilot/control/job.py index aac2d848..820292df 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -893,8 +893,7 @@ def validate(queues, traces, args): # pre-cleanup pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], - 0) # Python 2/3 + utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: utilities.precleanup() except Exception as e: @@ -905,6 +904,9 @@ def validate(queues, traces, args): # run the delayed space check now delayed_space_check(queues, traces, args, job) + + # make sure that ctypes is available (needed at the end by orphan killer) + verify_ctypes(queues, job) else: logger.debug('Failed to validate job=%s' % job.jobid) put_in_queue(job, queues.failed_jobs) @@ -919,6 +921,35 @@ def validate(queues, traces, args): logger.debug('[job] validate thread has finished') +def verify_ctypes(queues, job): + """ + Verify ctypes and make sure all subprocess are parented. + + :param queues: queues object. + :param job: job object. + :return: + """ + + try: + import ctypes + except Exception as e: + diagnostics = 'ctypes python module could not be imported: %s' % e + logger.warning(diagnostics) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) + logger.debug('Failed to validate job=%s' % job.jobid) + put_in_queue(job, queues.failed_jobs) + else: + logger.debug('ctypes python module imported') + + # make sure all children are parented by the pilot + # specifically, this will include any 'orphans', i.e. if the pilot kills all subprocesses at the end, + # 'orphans' will be included (orphans seem like the wrong name) + libc = ctypes.CDLL('libc.so.6') + PR_SET_CHILD_SUBREAPER = 36 + libc.prctl(PR_SET_CHILD_SUBREAPER, 1) + logger.debug('all child subprocesses will be parented') + + def delayed_space_check(queues, traces, args, job): """ Run the delayed space check if necessary. 
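The verify_ctypes() function added above uses prctl(PR_SET_CHILD_SUBREAPER) via ctypes so that any processes orphaned by the payload are reparented to the pilot process rather than to init, which is what later allows the pilot to find and kill them. A slightly more defensive stand-alone sketch of the same call is given below; the errno handling is an addition for illustration only and not part of the patch (the call is Linux-specific, kernel 3.4 or later):

    import ctypes

    PR_SET_CHILD_SUBREAPER = 36  # value taken from linux/prctl.h

    def become_child_subreaper():
        # mark the current process as a child subreaper so orphaned descendants are reparented to it
        libc = ctypes.CDLL('libc.so.6', use_errno=True)
        if libc.prctl(PR_SET_CHILD_SUBREAPER, 1) != 0:
            raise OSError(ctypes.get_errno(), 'prctl(PR_SET_CHILD_SUBREAPER) failed')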
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6199ffca..7339925b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '25' # build number should be reset to '1' for every new development cycle +BUILD = '26' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8abdbb64..8f88b3fa 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -177,7 +177,8 @@ def kill_processes(pid): kill_process(i) # kill any remaining orphan processes - kill_orphans() + # note: this should no longer be necessary since ctypes has made sure all subprocesses are parented + # kill_orphans() def kill_child_processes(pid): From 2cf2b5a710996836908057465bab9934a15778bd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 May 2021 20:45:05 +0200 Subject: [PATCH 55/96] Implemented tail and ls debug commands --- PILOTVERSION | 2 +- pilot/control/job.py | 122 ++++++++++++++++++++++++++++++++++++---- pilot/util/constants.py | 2 +- pilot/util/processes.py | 3 +- 4 files changed, 114 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7963461e..6d74a3a7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.26 \ No newline at end of file +2.11.3.27c \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 820292df..ec76fbc8 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -25,6 +25,7 @@ from json import dumps #, loads from re import findall +from glob import glob from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ExcThread, PilotException #, JobAlreadyRunning @@ -495,16 +496,18 @@ def get_debug_command(cmd): try: tmp = cmd.split(' ') com = tmp[0] - opts = tmp[1] except Exception as e: logger.warning('failed to identify debug command: %s' % e) else: if com not in allowed_commands: logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) - elif ';' in opts or ';' in opts: + elif ';' in cmd or ';' in cmd: logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) elif com in forbidden_commands: logger.warning('command=%s is not allowed' % com) + else: + debug_mode = True + debug_command = cmd return debug_mode, debug_command @@ -527,7 +530,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): # warning: server might return comma-separated string, 'debug,tobekilled' cmd = res.get('command') # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. 
- if ' ' in cmd: + if ' ' in cmd and 'tobekilled' not in cmd: try: job.debug, job.debug_command = get_debug_command(cmd) except Exception as e: @@ -547,16 +550,25 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.info('pilot received a panda server signal to softkill job %s at %s' % (job.jobid, time_stamp())) # event service kill instruction + job.debug_command = 'softkill' elif 'debug' in cmd: logger.info('pilot received a command to turn on standard debug mode from the server') job.debug = True + job.debug_command = 'debug' elif 'debugoff' in cmd: logger.info('pilot received a command to turn off debug mode from the server') job.debug = False + job.debug_command = 'debugoff' else: logger.warning('received unknown server command via backchannel: %s' % cmd) + job.debug = True + # job.debug_command = 'tail payload.stdout' # OK + # job.debug_command = 'ls -ltr workDir' # test with user job + job.debug_command = 'ls -ltr %s' % job.workdir + + def add_data_structure_ids(data, version_tag): """ Add pilot, batch and scheduler ids to the data structure for getJob, updateJob. @@ -622,9 +634,9 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: - stdout_tail = get_payload_log_tail(job) - if stdout_tail: - data['stdout'] = stdout_tail + stdout = get_debug_stdout(job.debug_command, job.workdir) + if stdout: + data['stdout'] = stdout # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': @@ -665,6 +677,82 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data +def get_debug_stdout(debug_command, workdir): + """ + Return the requested output from a given debug command. + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). + """ + + if debug_command == 'debug': + return get_payload_log_tail(workdir) + elif 'tail' in debug_command: + return get_requested_log_tail(debug_command, workdir) + elif 'ls ' in debug_command: + return get_ls(debug_command, workdir) + else: + logger.warning('command not handled yet: %s' % debug_command) + return '' + + +def get_ls(debug_command, workdir): + """ + + """ + + items = debug_command.split(' ') + # cmd = items[0] + options = ' '.join(items[1:]) + path = options.split(' ')[-1] if ' ' in options else options + finalpath = os.path.join(workdir, path) + debug_command = debug_command.replace(path, finalpath) + + ec, stdout, stderr = execute(debug_command) + logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + + return stdout + + +def get_requested_log_tail(debug_command, workdir): + """ + Return the tail of the requested log. + + Examples + tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path + tail log.RAWtoALL <- pilot finds the requested log file + + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). 
+ """ + + _tail = "" + items = debug_command.split(' ') + cmd = items[0] + options = ' '.join(items[1:]) + logger.debug('debug command: %s' % cmd) + logger.debug('debug options: %s' % options) + + # assume that the path is the last of the options; + path = options.split(' ')[-1] if ' ' in options else options + fullpath = os.path.join(workdir, path) + + # find all files with the given pattern and pick the latest updated file (if several) + files = glob(fullpath) + if files: + logger.info('files found: %s' % str(files)) + _tail = get_latest_log_tail(files) + else: + logger.warning('did not find \'%s\' in path %s' % (path, fullpath)) + + if _tail: + logger.debug('tail =\n\n%s\n\n' % _tail) + + return _tail + + def add_error_codes(data, job): """ Add error codes to data structure. @@ -798,16 +886,14 @@ def remove_pilot_logs_from_list(list_of_files): return new_list_of_files -def get_payload_log_tail(job): +def get_payload_log_tail(workdir): """ Return the tail of the payload stdout or its latest updated log file. - :param job: job object. + :param workdir: job work directory (string). :return: tail of stdout (string). """ - stdout_tail = "" - # find the latest updated log file # list_of_files = get_list_of_log_files() # find the latest updated text file @@ -816,10 +902,22 @@ def get_payload_log_tail(job): if not list_of_files: logger.info('no log files were found (will use default %s)' % config.Payload.payloadstdout) - list_of_files = [os.path.join(job.workdir, config.Payload.payloadstdout)] + list_of_files = [os.path.join(workdir, config.Payload.payloadstdout)] + + return get_latest_log_tail(list_of_files) + + +def get_latest_log_tail(files): + """ + Get the tail of the latest updated file from the given file list. + + :param files: files (list). + """ + + stdout_tail = "" try: - latest_file = max(list_of_files, key=os.path.getmtime) + latest_file = max(files, key=os.path.getmtime) logger.info('tail of file %s will be added to heartbeat' % latest_file) # now get the tail of the found log file and protect against potentially large tails diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 7339925b..3242c710 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '26' # build number should be reset to '1' for every new development cycle +BUILD = '27c' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8f88b3fa..213ac5d6 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -178,7 +178,8 @@ def kill_processes(pid): # kill any remaining orphan processes # note: this should no longer be necessary since ctypes has made sure all subprocesses are parented - # kill_orphans() + # if orphan process killing is not desired, set env var PILOT_NOKILL + kill_orphans() def kill_child_processes(pid): From 405fe95ad2151137ef45693b3734e2dd82a2bb56 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 28 May 2021 17:37:52 +0200 Subject: [PATCH 56/96] Skipping xrootd when finding pid for prmon --- PILOTVERSION | 2 +- pilot/control/job.py | 28 +++++++++++++++++++++++++--- pilot/user/atlas/common.py | 2 +- pilot/user/atlas/utilities.py | 2 +- 
pilot/util/constants.py | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6d74a3a7..98c67543 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.27c \ No newline at end of file +2.11.3.27g \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index ec76fbc8..42c2e8b0 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -565,8 +565,10 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): job.debug = True # job.debug_command = 'tail payload.stdout' # OK - # job.debug_command = 'ls -ltr workDir' # test with user job - job.debug_command = 'ls -ltr %s' % job.workdir + # job.debug_command = 'ls -ltr workDir' # test with user jo + # job.debug_command = 'ls -ltr %s' % job.workdir + # 'ps -ef' + job.debug_command = 'ps axo pgid,ppid,comm,args' def add_data_structure_ids(data, version_tag): @@ -692,14 +694,34 @@ def get_debug_stdout(debug_command, workdir): return get_requested_log_tail(debug_command, workdir) elif 'ls ' in debug_command: return get_ls(debug_command, workdir) + elif 'ps ' in debug_command or 'gdb ' in debug_command: + return get_general_command_stdout(debug_command) else: logger.warning('command not handled yet: %s' % debug_command) return '' +def get_general_command_stdout(debug_command): + """ + Return the output from the requested debug command. + + :param debug_command: full debug command (string). + :return: output (string). + """ + + ec, stdout, stderr = execute(debug_command) + logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + + return stdout + + def get_ls(debug_command, workdir): """ + Return the requested ls debug command. + :param debug_command: full debug command (string). + :param workdir: job work directory (string). + :return: output (string). """ items = debug_command.split(' ') @@ -717,7 +739,7 @@ def get_ls(debug_command, workdir): def get_requested_log_tail(debug_command, workdir): """ - Return the tail of the requested log. + Return the tail of the requested debug log. 
Examples tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 75d75f0f..634f0305 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -191,7 +191,7 @@ def open_remote_files(indata, workdir): not_opened += turl if not not_opened else ",%s" % turl if not_opened: ec = errors.REMOTEFILECOULDNOTBEOPENED - diagnostics = "turl not opened:%s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened + diagnostics = "Remote file could not be opened: %s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index a8264668..f04a50e8 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -286,7 +286,7 @@ def get_pid_for_jobid(ps, jobid): pid = None for line in ps.split('\n'): - if jobid in line: + if jobid in line and 'xrootd' not in line: # extract pid _pid = search(r'(\d+) ', line) try: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3242c710..1891c015 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27c' # build number should be reset to '1' for every new development cycle +BUILD = '27g' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From fa2debc10efe22c4ecd9cfe3709a74edc51cbed4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 4 Jun 2021 08:25:31 +0200 Subject: [PATCH 57/96] Many fixes for debug mode, including full containerisation of gdb command (not finished). Fixed resimevents. --- PILOTVERSION | 2 +- pilot.py | 59 +------------ pilot/control/data.py | 10 ++- pilot/control/job.py | 129 +++++++++++++++++++++------- pilot/info/jobdata.py | 2 +- pilot/user/atlas/common.py | 92 ++++++++++++++++++-- pilot/user/atlas/container.py | 27 ++++-- pilot/user/generic/common.py | 14 ++++ pilot/util/constants.py | 2 +- pilot/util/default.cfg | 2 +- pilot/util/filehandling.py | 21 +++++ pilot/util/middleware.py | 47 ++++++++++- pilot/util/processes.py | 153 ++++++++++++++++++++++++++++++++++ 13 files changed, 451 insertions(+), 109 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 98c67543..bceaa76a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.27g \ No newline at end of file +2.11.3.28f \ No newline at end of file diff --git a/pilot.py b/pilot.py index ac40b52b..eee04af1 100755 --- a/pilot.py +++ b/pilot.py @@ -101,62 +101,6 @@ class Args: pass -# rename module to pilot2 to avoid conflict in import with pilot directory -def import_module(**kwargs): - """ - This function allows for importing the pilot code. - - :param kwargs: pilot options (dictionary). - :return: pilot error code (integer). 
- """ - - argument_dictionary = {'-a': kwargs.get('workdir', ''), - '-d': kwargs.get('debug', None), - '-w': kwargs.get('workflow', 'generic'), - '-l': kwargs.get('lifetime', '3600'), - '-q': kwargs.get('queue'), # required - '-r': kwargs.get('resource'), # required - '-s': kwargs.get('site'), # required - '-j': kwargs.get('job_label', 'ptest'), # change default later to 'managed' - '-i': kwargs.get('version_tag', 'PR'), - '-t': kwargs.get('verify_proxy', True), - '-z': kwargs.get('update_server', True), - '--cacert': kwargs.get('cacert', None), - '--capath': kwargs.get('capath'), - '--url': kwargs.get('url', ''), - '-p': kwargs.get('port', '25443'), - '--country-group': kwargs.get('country_group', ''), - '--working-group': kwargs.get('working_group', ''), - '--allow-other-country': kwargs.get('allow_other_country', 'False'), - '--allow-same-user': kwargs.get('allow_same_user', 'True'), - '--pilot-user': kwargs.get('pilot_user', 'generic'), - '--input-dir': kwargs.get('input_dir', ''), - '--output-dir': kwargs.get('output_dir', ''), - '--hpc-resource': kwargs.get('hpc_resource', ''), - '--harvester-workdir': kwargs.get('harvester_workdir', ''), - '--harvester-datadir': kwargs.get('harvester_datadir', ''), - '--harvester-eventstatusdump': kwargs.get('harvester_eventstatusdump', ''), - '--harvester-workerattributes': kwargs.get('harvester_workerattributes', ''), - '--harvester-submitmode': kwargs.get('harvester_submitmode', ''), - '--resource-type': kwargs.get('resource_type', '') - } - - args = Args() - parser = argparse.ArgumentParser() - try: - _items = list(argument_dictionary.items()) # Python 3 - except Exception: - _items = argument_dictionary.iteritems() # Python 2 - for key, value in _items: - print(key, value) - parser.add_argument(key) - parser.parse_args(args=[key, value], namespace=args) # convert back int and bool strings to int and bool?? - - # call main pilot function - - return 0 - - def str2bool(v): """ Helper function to convert string to bool """ @@ -478,6 +422,9 @@ def set_environment_variables(args, mainworkdir): # event service executor type environ['PILOT_ES_EXECUTOR_TYPE'] = args.executor_type + if args.output_dir: + environ['PILOT_OUTPUT_DIR'] = args.output_dir + # keep track of the server urls _port = ":%s" % args.port url = args.url if _port in args.url else args.url + _port diff --git a/pilot/control/data.py b/pilot/control/data.py index 5df19f89..83c731ee 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -721,7 +721,7 @@ def filter_files_for_log(directory): return filtered_files -def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False): +def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], is_looping=False, debugmode=False): """ Create the tarball for the job. @@ -732,11 +732,13 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out :param input_files: list of input files to remove (list). :param output_files: list of output files to remove (list). :param is_looping: True for looping jobs, False by default (Boolean). + :param debugmode: True if debug mode has been switched on (Boolean). :raises LogFileCreationFailure: in case of log file creation problem. 
:return: """ - logger.debug('preparing to create log file') + logger.debug('preparing to create log file (debug mode=%s)' % str(debugmode)) + # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) current_dir = os.getcwd() @@ -747,7 +749,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out if cleanup: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - user.remove_redundant_files(workdir, islooping=is_looping) + user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode) # remove any present input/output files before tarring up workdir for f in input_files + output_files: @@ -894,7 +896,7 @@ def _stage_out_new(job, args): output_files = [fspec.lfn for fspec in job.outdata] create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup, input_files=input_files, output_files=output_files, - is_looping=errors.LOOPINGJOB in job.piloterrorcodes) + is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug) except LogFileCreationFailure as e: logger.warning('failed to create tar file: %s' % e) set_pilot_state(job=job, state="failed") diff --git a/pilot/control/job.py b/pilot/control/job.py index 42c2e8b0..c311d3c4 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -41,15 +41,16 @@ SERVER_UPDATE_UPDATING, SERVER_UPDATE_NOT_DONE from pilot.util.container import execute from pilot.util.filehandling import find_text_files, tail, is_json, copy, remove, write_json, establish_logging, write_file, \ - create_symlink + create_symlink, locate_file from pilot.util.harvester import request_new_jobs, remove_job_request_file, parse_job_definition_file, \ is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, get_event_status_file, \ publish_stageout_files from pilot.util.jobmetrics import get_job_metrics from pilot.util.math import mean +from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process +from pilot.util.processes import cleanup, threads_aborted, kill_process, get_pid_from_command, kill_processes from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp @@ -562,13 +563,14 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): else: logger.warning('received unknown server command via backchannel: %s' % cmd) - - job.debug = True - # job.debug_command = 'tail payload.stdout' # OK - # job.debug_command = 'ls -ltr workDir' # test with user jo + # for testing debug mode + #job.debug = True + # job.debug_command = 'tail payload.stdout' + # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir - # 'ps -ef' - job.debug_command = 'ps axo pgid,ppid,comm,args' + # job.debug_command = 'ps -ef' + # job.debug_command = 'ps axo pid,ppid,pgid,args' + #job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' def add_data_structure_ids(data, version_tag): @@ -636,10 +638,23 @@ def get_data_structure(job, state, args, 
xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: - stdout = get_debug_stdout(job.debug_command, job.workdir) + # for gdb commands, use the proper gdb version (the system one may be too old) + #if 'gdb ' in job.debug_command: + # pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + # user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + # user.preprocess_debug_command(job) + + stdout = get_debug_stdout(job) if stdout: data['stdout'] = stdout + # in case gdb was successfully used, the payload can now be killed + if 'gdb ' in job.debug_command and job.pid: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, + msg='payload was killed after gdb produced requested core file') + logger.debug('will proceed to kill payload processes') + kill_processes(job.pid) + # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': data['coreCount'] = job.corecount @@ -679,42 +694,91 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data -def get_debug_stdout(debug_command, workdir): +def get_debug_stdout(job): """ Return the requested output from a given debug command. - :param debug_command: full debug command (string). - :param workdir: job work directory (string). + :param job: job object. :return: output (string). """ - if debug_command == 'debug': - return get_payload_log_tail(workdir) - elif 'tail' in debug_command: - return get_requested_log_tail(debug_command, workdir) - elif 'ls ' in debug_command: - return get_ls(debug_command, workdir) - elif 'ps ' in debug_command or 'gdb ' in debug_command: - return get_general_command_stdout(debug_command) + if job.debug_command == 'debug': + return get_payload_log_tail(job.workdir) + elif 'tail' in job.debug_command: + return get_requested_log_tail(job.debug_command, job.workdir) + elif 'ls ' in job.debug_command: + return get_ls(job.debug_command, job.workdir) + elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: + return get_general_command_stdout(job) else: - logger.warning('command not handled yet: %s' % debug_command) + logger.warning('command not handled yet: %s' % job.debug_command) return '' -def get_general_command_stdout(debug_command): +def get_general_command_stdout(job): """ Return the output from the requested debug command. - :param debug_command: full debug command (string). + :param job: job object. :return: output (string). """ - ec, stdout, stderr = execute(debug_command) - logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + stdout = '' + + # for gdb, we might have to process the debug command (e.g. 
to identify the proper pid to debug) + if 'gdb ' in job.debug_command and '--pid %' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + job.debug_command = user.process_debug_command(job.debug_command, job.jobid) + + if job.debug_command: + if 'gdb ' in job.debug_command: + logger.info('gdb execution will be done by a script') + try: + containerise_general_command(job, job.infosys.queuedata.container_options, + label='general', + container_type='container') + except PilotException as e: + logger.warning('general containerisation threw a pilot exception: %s' % e) + except Exception as e: + logger.warning('general containerisation threw an exception: %s' % e) + + # in case a core file was produced, locate it + path = locate_core_file(job.debug_command) + if path: + # copy it to the working directory (so it will be saved in the log) + try: + copy(path, job.workdir) + except Exception: + pass + else: + ec, stdout, stderr = execute(job.debug_command) + logger.debug("%s:\n\n%s\n\n" % (job.debug_command, stdout)) return stdout +def locate_core_file(debug_command): + """ + + """ + + path = None + pid = get_pid_from_command(debug_command) + if pid: + filename = 'core.%d' % pid + path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) + if os.path.exists(path): + logger.debug('found core file at: %s' % path) + + else: + logger.debug('did not find %s in %s' % (filename, path)) + else: + logger.warning('cannot locate core file since pid could not be extracted from debug command') + + return path + + def get_ls(debug_command, workdir): """ Return the requested ls debug command. @@ -728,6 +792,8 @@ def get_ls(debug_command, workdir): # cmd = items[0] options = ' '.join(items[1:]) path = options.split(' ')[-1] if ' ' in options else options + if path.startswith('-'): + path = '.' finalpath = os.path.join(workdir, path) debug_command = debug_command.replace(path, finalpath) @@ -2385,10 +2451,15 @@ def job_monitor(queues, traces, args): # noqa: C901 update_time = send_heartbeat_if_time(jobs[i], args, update_time) # note: when sending a state change to the server, the server might respond with 'tobekilled' - if jobs[i].state == 'failed': - logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') - jobs[i].stageout = 'log' # only stage-out log file - put_in_queue(jobs[i], queues.data_out) + try: + jobs[i] + except Exception as e: + logger.warning('detected stale jobs[i] object in job_monitor: %s' % e) + else: + if jobs[i].state == 'failed': + logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') + jobs[i].stageout = 'log' # only stage-out log file + put_in_queue(jobs[i], queues.data_out) # sleep for a while if stage-in has not completed time.sleep(1) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index f3ec0cb7..a39ba3ab 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -89,7 +89,7 @@ class JobData(BaseData): neventsw = 0 # number of events written dbtime = None # dbdata = None # - resimevents = 0 # ReSim events from job report (ATLAS) + resimevents = None # ReSim events from job report (ATLAS) payload = "" # payload name utilities = {} # utility processes { : [, number of launches, command string], .. 
} pid = None # payload pid diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 634f0305..dd77a154 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -38,6 +38,7 @@ from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum +from pilot.util.processes import convert_ps_to_dict, find_cmd_pids, get_trimmed_dictionary, find_pid, is_child from pilot.util.tracereport import TraceReport import logging @@ -1553,22 +1554,24 @@ def cleanup_looping_payload(workdir): remove(path) -def cleanup_payload(workdir, outputfiles=[]): +def cleanup_payload(workdir, outputfiles=[], removecores=True): """ Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. Also remove core dumps. - :param workdir: working directory (string) - :param outputfiles: list of output files + :param workdir: working directory (string). + :param outputfiles: list of output files. + :param removecores: remove core files if True (Boolean). :return: """ - remove_core_dumps(workdir) + if removecores: + remove_core_dumps(workdir) for ampdir in glob('%s/athenaMP-workers-*' % workdir): for (p, d, f) in os.walk(ampdir): for filename in f: - if 'core' in filename or 'pool.root' in filename or 'tmp.' in filename: + if ('core' in filename and removecores) or 'pool.root' in filename or 'tmp.' in filename: path = os.path.join(p, filename) path = os.path.abspath(path) remove(path) @@ -1775,13 +1778,16 @@ def remove_special_files(workdir, dir_list, outputfiles): remove_dir_tree(f) -def remove_redundant_files(workdir, outputfiles=[], islooping=False): +def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=False): """ Remove redundant files and directories prior to creating the log file. + Note: in debug mode, any core files should not be removed before creating the log. + :param workdir: working directory (string). :param outputfiles: list of protected output files (list). :param islooping: looping job variable to make sure workDir is not removed in case of looping (boolean). + :param debugmode: True if debug mode has been switched on (Boolean). :return: """ @@ -1796,7 +1802,7 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False): # remove core and pool.root files from AthenaMP sub directories try: logger.debug('cleaning up payload') - cleanup_payload(workdir, outputfiles) + cleanup_payload(workdir, outputfiles, removecores=not debugmode) except Exception as e: logger.warning("failed to execute cleanup_payload(): %s" % e) @@ -2350,3 +2356,75 @@ def update_server(job): logger.warning('path does not exist: %s' % path) else: logger.debug('no need to update logstash for this job') + + +def preprocess_debug_command(job): + """ + + """ + + return + + + # Should the pilot do the setup or does jobPars already contain the information? 
+ preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + # get the general setup command and then verify it if required + resource_name = get_resource_name() # 'grid' if no hpc_resource is set + resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + cmd = resource.get_setup_command(job, preparesetup) + if not cmd.endswith(';'): + cmd += '; ' + if cmd not in job.debug_command: + job.debug_command = cmd + job.debug_command + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + For gdb, the server might send a command with gdb option --pid %. The pilot need to replace the % with the proper + pid. The default (hardcoded) process will be that of athena.py. The pilot will find the corresponding pid. + + :param debug_command: debug command (string). + :param pandaid: PanDA id (string). + :return: updated debug command (string). + """ + + pandaid_pid = None + if '--pid %' in debug_command: + # replace the % with the pid for athena.py + # note: if athena.py is not yet running, the --pid % will remain. Otherwise the % will be replaced by the pid + # first find the pid (if athena.py is running) + cmd = 'ps axo pid,ppid,pgid,args' + exit_code, stdout, stderr = execute(cmd) + if stdout: + logger.debug('ps=\n\n%s\n' % stdout) + # convert the ps output to a dictionary + dictionary = convert_ps_to_dict(stdout) + # trim this dictionary to reduce the size (only keep the PID and PPID lists) + trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) + # what is the pid of the trf? + pandaid_pid = find_pid(pandaid, dictionary) + # find all athena processes + pids = find_cmd_pids('athena.py', dictionary) + # which of the found pids are children of the trf? (which has an export PandaID=.. attached to it) + for pid in pids: + try: + child = is_child(pid, pandaid_pid, trimmed_dictionary) + except RuntimeError as e: + logger.warning('too many recursions: %s (cannot identify athena process)' % e) + else: + if child: + logger.info('pid=%d is a child process of the trf of this job' % pid) + debug_command = debug_command.replace('--pid %', '--pid %d' % pid) + logger.info('updated debug command: %s' % debug_command) + break + else: + logger.info('pid=%d is not a child process of the trf of this job' % pid) + if not pids: + logger.debug('athena is not yet running (no corresponding pid)') + debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) + + return debug_command diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index c518f249..84c1cfff 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -899,7 +899,7 @@ def create_root_container_command(workdir, cmd): return command -def create_middleware_container_command(workdir, cmd, container_options, label='stagein'): +def create_middleware_container_command(workdir, cmd, container_options, label='stagein', proxy=True): """ Create the stage-in/out container command. 
@@ -924,10 +924,16 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' command = 'cd %s;' % workdir # add bits and pieces for the containerisation - middleware_container = get_middleware_container() - content = get_middleware_container_script(middleware_container, cmd) + middleware_container = get_middleware_container(label=label) + content = get_middleware_container_script(middleware_container, cmd, label=label) # store it in setup.sh - script_name = 'stagein.sh' if label == 'stage-in' else 'stageout.sh' + if label == 'stage-in': + script_name = 'stagein.sh' + elif label == 'stage-out': + script_name = 'stageout.sh' + else: + script_name = 'general.sh' + try: status = write_file(os.path.join(workdir, script_name), content) except PilotException as e: @@ -935,9 +941,10 @@ def create_middleware_container_command(workdir, cmd, container_options, label=' else: if status: # generate the final container command - x509 = os.environ.get('X509_USER_PROXY', '') - if x509: - command += 'export X509_USER_PROXY=%s;' % x509 + if proxy: + x509 = os.environ.get('X509_USER_PROXY', '') + if x509: + command += 'export X509_USER_PROXY=%s;' % x509 command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container @@ -995,13 +1002,17 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): return content -def get_middleware_container(): +def get_middleware_container(label=None): """ Return the middleware container. + :param label: label (string). :return: path (string). """ + if label and label == 'general': + return 'CentOS7' + path = config.Container.middleware_container if path.startswith('/') and not os.path.exists(path): logger.warning('requested middleware container path does not exist: %s (switching to default value)' % path) diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index 51f05632..c747446e 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -270,3 +270,17 @@ def post_prestagein_utility_command(**kwargs): # stdout = kwargs.get('output', None) pass + + +def process_debug_command(debug_command, pandaid): + """ + In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + to the server). + + :param debug_command: debug command (string), payload pid (int). + :param pandaid: PanDA id (string). 
+ :return: updated debug command (string) + """ + + return debug_command diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1891c015..88e0b391 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27g' # build number should be reset to '1' for every new development cycle +BUILD = '28g' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index f8de374e..a1c4a07d 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -46,7 +46,7 @@ iddsserver: https://pandaserver.cern.ch:25443 # The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode) heartbeat: 1800 -debug_heartbeat: 300 +debug_heartbeat: 60 # Heartbeat message file (only used when Pilot is not sending heartbeats to server) heartbeat_message: heartbeat.json diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 9ebeac46..72c4c3f3 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1077,3 +1077,24 @@ def create_symlink(from_path='', to_path=''): logger.warning('failed to create symlink from %s to %s: %s' % (from_path, to_path, e)) else: logger.debug('created symlink from %s to %s' % (from_path, to_path)) + + +def locate_file(pattern): + """ + Locate a file defined by the pattern. + + Example: + pattern = os.path.join(os.getcwd(), '**/core.123') + -> /Users/Paul/Development/python/tt/core.123 + + :param pattern: pattern name (string). + :return: path (string). + """ + + path = None + for fname in glob(pattern): + if os.path.isfile(fname): + path = fname + + return path + diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index d88d8ddd..a9fe101b 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -20,13 +20,56 @@ errors = ErrorCodes() +def containerise_general_command(job, container_options, label='command', container_type='container'): + """ + Containerise a general command by execution in a script that can be run in a container. + + :param job: job object. + :param label: label (string). + :param container_options: container options from queuedata (string). + :param container_type: optional 'container/bash' + :raises PilotException: for general failures. 
+ :return: + """ + + cwd = getcwd() + + if container_type == 'container': + # add bits and pieces needed to run the cmd in a container + pilot_user = environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + try: + cmd = user.create_middleware_container_command(job.workdir, job.debug_command, container_options, label=label, proxy=False) + except PilotException as e: + raise e + else: + logger.warning('not yet implemented') + raise PilotException + + try: + logger.info('*** executing %s (logging will be redirected) ***' % label) + exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) + except Exception as e: + logger.info('*** %s has failed ***' % label) + logger.warning('exception caught: %s' % e) + else: + if exit_code == 0: + logger.info('*** %s has finished ***' % label) + else: + logger.info('*** %s has failed ***' % label) + logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + + def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, external_dir, label='stage-in', container_type='container'): """ Containerise the middleware by performing stage-in/out steps in a script that in turn can be run in a container. + Note: a container will only be used for option container_type='container'. If this is 'bash', then stage-in/out will still be done by a script, but not containerised. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). @@ -37,9 +80,9 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, :param external_dir: input or output files directory (string). :param label: optional 'stage-in/out' (String). :param container_type: optional 'container/bash' - :return: :raises StageInFailure: for stage-in failures :raises StageOutFailure: for stage-out failures + :return: """ cwd = getcwd() @@ -123,6 +166,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext """ Get the middleware container execution command. + Note: this function is tailor made for stage-in/out. + :param job: job object. :param xdata: list of FileSpec objects. :param queue: queue name (string). diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 213ac5d6..e5b94ae8 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -630,3 +630,156 @@ def threads_aborted(abort_at=2): aborted = True return aborted + + +def convert_ps_to_dict(output, pattern=r'(\d+) (\d+) (\d+) (.+)'): + """ + Convert output from a ps command to a dictionary. + + Example: ps axo pid,ppid,pgid,cmd + PID PPID PGID COMMAND + 22091 6672 22091 bash + 32581 22091 32581 ps something;sdfsdfds/athena.py ddfg + -> dictionary = { 'PID': [22091, 32581], 'PPID': [22091, 6672], .. , 'COMMAND': ['ps ..', 'bash']} + + :param output: ps stdout (string). + :param pattern: regex pattern matching the ps output (raw string). + :return: dictionary. + """ + + dictionary = {} + first_line = [] # e.g. PID PPID PGID COMMAND + + for line in output.split('\n'): + try: + # remove leading and trailing spaces + line = line.strip() + # remove multiple spaces inside the line + _l = re.sub(' +', ' ', line) + + if first_line == []: + _l = [_f for _f in _l.split(' ') if _f] + first_line = _l + for i in range(len(_l)): + dictionary[_l[i]] = [] + else: # e.g. 
22091 6672 22091 bash + match = re.search(pattern, _l) + if match: + for i in range(len(first_line)): + try: + var = int(match.group(i + 1)) + except Exception: + var = match.group(i + 1) + dictionary[first_line[i]].append(var) + + except Exception as e: + print("unexpected format of utility output: %s" % e) + + return dictionary + + +def get_trimmed_dictionary(keys, dictionary): + """ + Return a sub-dictionary with only the given keys. + + :param keys: keys to keep (list). + :param dictionary: full dictionary. + :return: trimmed dictionary. + """ + + subdictionary = {} + for key in keys: + if key in dictionary: + subdictionary[key] = dictionary[key] + + return subdictionary + + +def find_cmd_pids(cmd, ps_dictionary): + """ + Find all pids for the given command. + Example. cmd = 'athena.py' -> pids = [1234, 2267] (in case there are two pilots running on the WN). + + :param cmd: command (string). + :param ps_dictionary: converted ps output (dictionary). + """ + + pids = [] + i = -1 + for _cmd in ps_dictionary.get('COMMAND'): + i += 1 + if cmd in _cmd: + pids.append(ps_dictionary.get('PID')[i]) + return pids + + +def find_pid(pandaid, ps_dictionary): + """ + Find the process id for the command that contains 'export PandaID=%d'. + + :param pandaid: PanDA ID (string). + :param ps_dictionaryL ps output dictionary. + :return: pid (int). + """ + + pid = -1 + i = -1 + pandaid_cmd = 'export PandaID=%s' % pandaid + for _cmd in ps_dictionary.get('COMMAND'): + i += 1 + if pandaid_cmd in _cmd: + pid = ps_dictionary.get('PID')[i] + break + + return pid + + +def is_child(pid, pandaid_pid, dictionary): + """ + Is the given pid a child process of the pandaid_pid? + Proceed recursively until the parent pandaid_pid has been found, or return False if it fails to find it. + """ + + try: + # where are we at in the PID list? + index = dictionary.get('PID').index(pid) + except ValueError: + # not in the list + return False + else: + # get the corresponding ppid + ppid = dictionary.get('PPID')[index] + + print(index, pid, ppid, pandaid_pid) + # is the current parent the same as the pandaid_pid? if yes, we are done + if ppid == pandaid_pid: + return True + else: + # try another pid + return is_child(ppid, pandaid_pid, dictionary) + + +def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): + """ + Identify an explicit process id in the given command. + + Example: + cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' + -> pid = 19114 + + :param cmd: command containing a pid (string). + :param pattern: regex pattern (raw string). + :return: pid (int). 
+ """ + + pid = None + match = re.search(pattern, cmd) + if match: + try: + pid = int(match.group(1)) + except Exception: + pid = None + else: + print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) + + return pid From 523442db5a32c6b6e7676fad09223e9604f149ec Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 4 Jun 2021 15:58:37 +0200 Subject: [PATCH 58/96] Now moving raythena/AthenaMP output to shared directory (if --output-dir was used) --- .../workexecutor/plugins/raythenaexecutor.py | 21 +++++++++++++++++- pilot/util/filehandling.py | 22 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py index 202f10ae..dc0140da 100644 --- a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py @@ -16,7 +16,7 @@ from pilot.common.errorcodes import ErrorCodes from pilot.eventservice.esprocess.esprocess import ESProcess from pilot.info.filespec import FileSpec -from pilot.util.filehandling import calculate_checksum +from pilot.util.filehandling import calculate_checksum, move from .baseexecutor import BaseExecutor @@ -62,6 +62,21 @@ def create_file_spec(self, pfn): file_spec = FileSpec(filetype='output', **file_data) return file_spec + def move_output(self, pfn): + """ + Move output file from given PFN path to PILOT_OUTPUT_DIR if set. + + :param pfn: physical file name (string). + :return: + """ + + outputdir = os.environ.get('PILOT_OUTPUT_DIR', None) + if outputdir: + try: + move(pfn, outputdir) + except Exception as e: + logger.warning('failed to move output: %s' % e) + def update_finished_event_ranges(self, out_messagess): """ Update finished event ranges @@ -81,6 +96,10 @@ def update_finished_event_ranges(self, out_messagess): for checksum_key in fspec.checksum: event_range_status[checksum_key] = fspec.checksum[checksum_key] event_ranges.append(event_range_status) + + # move the output to a common area if necessary + self.move_output(out_msg['output']) + event_ranges_status = {"esOutput": {"numEvents": len(event_ranges)}, "eventRanges": event_ranges} event_range_message = {'version': 1, 'eventRanges': json.dumps([event_ranges_status])} self.update_events(event_range_message) diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 72c4c3f3..2c76fb52 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -524,6 +524,28 @@ def tar_files(wkdir, excludedfiles, logfile_name, attempt=0): return 0 +def move(path1, path2): + """ + Move a file from path1 to path2. + + :param path1: source path (string). + :param path2: destination path2 (string). + """ + + if not os.path.exists(path1): + logger.warning('file copy failure: path does not exist: %s' % path1) + raise NoSuchFile("File does not exist: %s" % path1) + + try: + import shutil + shutil.move(path1, path2) + except IOError as e: + logger.warning("exception caught during file move: %s" % e) + raise FileHandlingFailure(e) + else: + logger.info("moved %s to %s" % (path1, path2)) + + def copy(path1, path2): """ Copy path1 to path2. 
From e82677099f9e2e0ed9a6af1df8d4cd752b02ea30 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 15:48:05 +0200 Subject: [PATCH 59/96] Added some debug info for direct access --- PILOTVERSION | 2 +- pilot/api/data.py | 5 +++-- pilot/info/jobdata.py | 2 +- pilot/util/constants.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bceaa76a..ceb642a6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.28f \ No newline at end of file +2.11.3.28 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index bf5d73be..87c4373a 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -896,9 +896,10 @@ def set_status_for_direct_access(self, files, workdir): if not direct_lan and not direct_wan: self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s' % (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, - str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas))) + str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), + fspec.domain)) if direct_lan or direct_wan: fspec.status_code = 0 diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index a39ba3ab..0b53dd63 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -274,7 +274,7 @@ def prepare_infiles(self, data): idat[key] = getattr(self.infosys.queuedata, key) finfo = FileSpec(filetype='input', **idat) - logger.info('added file %s' % lfn) + logger.info('added file \'%s\' with accessmode \'%s\'' % (lfn, accessmode)) ret.append(finfo) return ret diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 88e0b391..c4aa5050 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '28g' # build number should be reset to '1' for every new development cycle +BUILD = '28' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From be5e9ffbbcbecc18ed2e7b7b2184490c69fbe9a7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 15:55:58 +0200 Subject: [PATCH 60/96] Flake8 --- pilot/control/job.py | 14 +++++++------- pilot/user/atlas/common.py | 20 +++++++++----------- pilot/util/filehandling.py | 1 - 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index c311d3c4..09e6252c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -41,7 +41,7 @@ SERVER_UPDATE_UPDATING, SERVER_UPDATE_NOT_DONE from pilot.util.container import execute from pilot.util.filehandling import find_text_files, tail, is_json, copy, remove, write_json, establish_logging, write_file, \ - create_symlink, locate_file + create_symlink from pilot.util.harvester import request_new_jobs, remove_job_request_file, parse_job_definition_file, \ is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, 
get_event_status_file, \ publish_stageout_files @@ -639,10 +639,10 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail of the latest log file touched by the payload if job.debug: # for gdb commands, use the proper gdb version (the system one may be too old) - #if 'gdb ' in job.debug_command: - # pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - # user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - # user.preprocess_debug_command(job) + if 'gdb ' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user.preprocess_debug_command(job) stdout = get_debug_stdout(job) if stdout: @@ -1131,8 +1131,8 @@ def verify_ctypes(queues, job): # specifically, this will include any 'orphans', i.e. if the pilot kills all subprocesses at the end, # 'orphans' will be included (orphans seem like the wrong name) libc = ctypes.CDLL('libc.so.6') - PR_SET_CHILD_SUBREAPER = 36 - libc.prctl(PR_SET_CHILD_SUBREAPER, 1) + pr_set_child_subreaper = 36 + libc.prctl(pr_set_child_subreaper, 1) logger.debug('all child subprocesses will be parented') diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index dd77a154..d1db47ef 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -2360,22 +2360,20 @@ def update_server(job): def preprocess_debug_command(job): """ - + (Currently not used - not needed if e.g. gdb will be run in a container) """ return - - # Should the pilot do the setup or does jobPars already contain the information? - preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + #preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required - resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 - cmd = resource.get_setup_command(job, preparesetup) - if not cmd.endswith(';'): - cmd += '; ' - if cmd not in job.debug_command: - job.debug_command = cmd + job.debug_command + #resource_name = get_resource_name() # 'grid' if no hpc_resource is set + #resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + #cmd = resource.get_setup_command(job, preparesetup) + #if not cmd.endswith(';'): + # cmd += '; ' + #if cmd not in job.debug_command: + # job.debug_command = cmd + job.debug_command def process_debug_command(debug_command, pandaid): diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 2c76fb52..53972c30 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1119,4 +1119,3 @@ def locate_file(pattern): path = fname return path - From de4e150ee8ccfad149b8b2e199313d45cc2c71e4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 7 Jun 2021 16:05:53 +0200 Subject: [PATCH 61/96] Refactoring --- pilot/control/job.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 09e6252c..d9c77078 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -638,22 +638,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None): # in debug mode, also send a tail 
of the latest log file touched by the payload if job.debug: - # for gdb commands, use the proper gdb version (the system one may be too old) - if 'gdb ' in job.debug_command: - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - user.preprocess_debug_command(job) - - stdout = get_debug_stdout(job) - if stdout: - data['stdout'] = stdout - - # in case gdb was successfully used, the payload can now be killed - if 'gdb ' in job.debug_command and job.pid: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, - msg='payload was killed after gdb produced requested core file') - logger.debug('will proceed to kill payload processes') - kill_processes(job.pid) + data['stdout'] = process_debug_mode(job) # add the core count if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': @@ -694,6 +679,32 @@ def get_data_structure(job, state, args, xml=None, metadata=None): return data +def process_debug_mode(job): + """ + Handle debug mode - preprocess debug command, get the output and kill the payload in case of gdb. + + :param job: job object. + :return: stdout from debug command (string). + """ + + # for gdb commands, use the proper gdb version (the system one may be too old) + if 'gdb ' in job.debug_command: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user.preprocess_debug_command(job) + + stdout = get_debug_stdout(job) + if stdout: + # in case gdb was successfully used, the payload can now be killed + if 'gdb ' in job.debug_command and job.pid: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, + msg='payload was killed after gdb produced requested core file') + logger.debug('will proceed to kill payload processes') + kill_processes(job.pid) + + return stdout + + def get_debug_stdout(job): """ Return the requested output from a given debug command. 
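Note on the debug-mode handling refactored in PATCH 61 above: the server-supplied debug command may contain a '--pid %' placeholder; the pilot parses 'ps axo pid,ppid,pgid,args' with the helpers added to pilot/util/processes.py earlier in this series, finds the athena.py child of the trf and substitutes its pid. A rough sketch of that substitution, assuming the pilot source tree is on PYTHONPATH; the ps snippet and pid values are invented for illustration:

from pilot.util.processes import convert_ps_to_dict, find_cmd_pids

# canned output of 'ps axo pid,ppid,pgid,args' (invented values)
ps_output = """  PID  PPID  PGID COMMAND
22091  6672 22091 bash
32581 22091 32581 export PandaID=1234567890; athena.py myJobOptions.py"""

dictionary = convert_ps_to_dict(ps_output)
pids = find_cmd_pids('athena.py', dictionary)            # -> [32581]

debug_command = "gdb --pid % -ex 'generate-core-file'"
if pids:
    debug_command = debug_command.replace('--pid %', '--pid %d' % pids[0])
print(debug_command)                                      # gdb --pid 32581 -ex 'generate-core-file'
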
From 667b4fbd6d349632af4f3be1aa6e59f4a1591eee Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 8 Jun 2021 14:05:25 +0200 Subject: [PATCH 62/96] Will not fail jobs on sites that fail to import ctypes --- PILOTVERSION | 2 +- pilot/control/job.py | 6 +++--- pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ceb642a6..38b6ed73 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.28 \ No newline at end of file +2.11.3.29 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index d9c77078..05e72c5b 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1132,9 +1132,9 @@ def verify_ctypes(queues, job): except Exception as e: diagnostics = 'ctypes python module could not be imported: %s' % e logger.warning(diagnostics) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) - put_in_queue(job, queues.failed_jobs) + #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) + #logger.debug('Failed to validate job=%s' % job.jobid) + #put_in_queue(job, queues.failed_jobs) else: logger.debug('ctypes python module imported') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c4aa5050..8127c5ae 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '28' # build number should be reset to '1' for every new development cycle +BUILD = '30' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b95210f95ab4a537b08572e23841f55fac9b784f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 8 Jun 2021 14:44:21 +0200 Subject: [PATCH 63/96] Fixed case where job object size calculation fails with exception due to object changing size --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 8 +++++++- pilot/util/constants.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 38b6ed73..f2dbf9d1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.29 \ No newline at end of file +2.11.3.31 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 0b53dd63..f3269a24 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -95,6 +95,7 @@ class JobData(BaseData): pid = None # payload pid pgrp = None # payload process group sizes = {} # job object sizes { timestamp: size, .. } + currentsize = 0 # current job object size command = "" # full payload command (set for container jobs) setup = "" # full payload setup (needed by postprocess command) zombies = [] # list of zombie process ids @@ -986,7 +987,12 @@ def get_size(self): :return: size (int). 
""" - return get_object_size(self) + # protect against the case where the object changes size during calculation (rare) + try: + self.currentsize = get_object_size(self) + except Exception: + pass + return self.currentsize def collect_zombies(self, tn=None): """ diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8127c5ae..560bfaf9 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '30' # build number should be reset to '1' for every new development cycle +BUILD = '32' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From a9cac88e98706ba19d50755517b10749888bb898 Mon Sep 17 00:00:00 2001 From: Alexey Anisenkov Date: Wed, 9 Jun 2021 15:27:32 +0700 Subject: [PATCH 64/96] make free space check (check_availablespace) being optional for specific copytools;ignore the check for mv copytool --- pilot/api/data.py | 10 ++++++---- pilot/copytool/mv.py | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index ea1ba48f..7d9246c5 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -857,10 +857,12 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 kwargs['activity'] = activity # verify file sizes and available space for stage-in - if self.infosys.queuedata.maxinputsize != -1: - self.check_availablespace(remain_files) - else: - self.logger.info('skipping input file size check since maxinputsize=-1') + if getattr(copytool, 'check_availablespace', True): + if self.infosys.queuedata.maxinputsize != -1: + self.check_availablespace(remain_files) + else: + self.logger.info('skipping input file size check since maxinputsize=-1') + show_memory_usage() # add the trace report diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 539c6457..73093a92 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -19,6 +19,7 @@ logger = logging.getLogger(__name__) require_replicas = False # indicate if given copytool requires input replicas to be resolved +check_availablespace = False # indicate whether space check should be applied before stage-in transfers using given copytool def create_output_list(files, init_dir, ddmconf): From 196b79d98c9a6c5a2400b335ac10d9231b1e749a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 9 Jun 2021 11:35:05 +0200 Subject: [PATCH 65/96] gdb updates. Now avoiding containerisation of gdb command since core file can't be located. 
--- PILOTVERSION | 2 +- pilot/control/job.py | 25 +++++++++++++------------ pilot/info/jobdata.py | 2 +- pilot/user/atlas/common.py | 23 +++++++++++------------ pilot/user/atlas/container.py | 10 ++++++---- pilot/util/constants.py | 6 +++--- 6 files changed, 35 insertions(+), 33 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f2dbf9d1..1d38d273 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.11.3.31 \ No newline at end of file +2.12.1.39 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 05e72c5b..a080b8b2 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -743,8 +743,8 @@ def get_general_command_stdout(job): job.debug_command = user.process_debug_command(job.debug_command, job.jobid) if job.debug_command: - if 'gdb ' in job.debug_command: - logger.info('gdb execution will be done by a script') + _containerisation = False # set this with some logic instead - not used for now + if _containerisation: try: containerise_general_command(job, job.infosys.queuedata.container_options, label='general', @@ -753,18 +753,19 @@ def get_general_command_stdout(job): logger.warning('general containerisation threw a pilot exception: %s' % e) except Exception as e: logger.warning('general containerisation threw an exception: %s' % e) - - # in case a core file was produced, locate it - path = locate_core_file(job.debug_command) - if path: - # copy it to the working directory (so it will be saved in the log) - try: - copy(path, job.workdir) - except Exception: - pass else: ec, stdout, stderr = execute(job.debug_command) - logger.debug("%s:\n\n%s\n\n" % (job.debug_command, stdout)) + logger.debug("%s (stdout):\n\n%s\n\n" % (job.debug_command, stdout)) + logger.debug("%s (stderr):\n\n%s\n\n" % (job.debug_command, stderr)) + + # in case a core file was produced, locate it + path = locate_core_file(job.debug_command) if 'gdb ' in job.debug_command else '' + if path: + # copy it to the working directory (so it will be saved in the log) + try: + copy(path, job.workdir) + except Exception: + pass return stdout diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index f3269a24..4e6ecbfe 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -120,7 +120,7 @@ class JobData(BaseData): destinationdblock = "" ## to be moved to FileSpec (job.outdata) datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) debug = False # debug mode, when True, pilot will send debug info back to the server - debug_command = 'tail' # debug command (can be defined on the task side) + debug_command = '' # debug command (can be defined on the task side) produserid = "" # the user DN (added to trace report) jobdefinitionid = "" # the job definition id (added to trace report) infilesguids = "" # diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d1db47ef..1a1344de 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -2360,20 +2360,19 @@ def update_server(job): def preprocess_debug_command(job): """ - (Currently not used - not needed if e.g. gdb will be run in a container) + """ - return # Should the pilot do the setup or does jobPars already contain the information? 
- #preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) + preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required - #resource_name = get_resource_name() # 'grid' if no hpc_resource is set - #resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 - #cmd = resource.get_setup_command(job, preparesetup) - #if not cmd.endswith(';'): - # cmd += '; ' - #if cmd not in job.debug_command: - # job.debug_command = cmd + job.debug_command + resource_name = get_resource_name() # 'grid' if no hpc_resource is set + resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + cmd = resource.get_setup_command(job, preparesetup) + if not cmd.endswith(';'): + cmd += '; ' + if cmd not in job.debug_command: + job.debug_command = cmd + job.debug_command def process_debug_command(debug_command, pandaid): @@ -2398,7 +2397,7 @@ def process_debug_command(debug_command, pandaid): cmd = 'ps axo pid,ppid,pgid,args' exit_code, stdout, stderr = execute(cmd) if stdout: - logger.debug('ps=\n\n%s\n' % stdout) + #logger.debug('ps=\n\n%s\n' % stdout) # convert the ps output to a dictionary dictionary = convert_ps_to_dict(stdout) # trim this dictionary to reduce the size (only keep the PID and PPID lists) @@ -2421,7 +2420,7 @@ def process_debug_command(debug_command, pandaid): break else: logger.info('pid=%d is not a child process of the trf of this job' % pid) - if not pids: + if not pids or '--pid %' in debug_command: logger.debug('athena is not yet running (no corresponding pid)') debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 84c1cfff..2a81807a 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -971,7 +971,7 @@ def get_root_container_script(cmd): return content -def get_middleware_container_script(middleware_container, cmd, asetup=False): +def get_middleware_container_script(middleware_container, cmd, asetup=False, label=''): """ Return the content of the middleware container script. If asetup is True, atlasLocalSetup will be added to the command. @@ -991,9 +991,11 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False): content += 'export ALRB_LOCAL_PY3=YES; ' if asetup: # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/..;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet; content += get_asetup(asetup=False) - content += sitename + 'lsetup rucio davix xrootd; ' - content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd - + if label == 'stagein' or label == 'stageout': + content += sitename + 'lsetup rucio davix xrootd; ' + content += 'python3 %s ' % cmd if is_python3() else 'python %s' % cmd + else: + content += cmd if not asetup: content += '\nexit $?' 
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 560bfaf9..b24d9de6 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 -VERSION = '11' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '32' # build number should be reset to '1' for every new development cycle +VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates +REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '39' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8bb688d33f5fa97a80b7d2e839c6da2e1766e2de Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 11 Jun 2021 19:41:31 +0200 Subject: [PATCH 66/96] Testing du. Lazy logging updates. Pylint fixes --- PILOTVERSION | 2 +- pilot.py | 19 +-- pilot/api/analytics.py | 17 ++- pilot/api/dask.py | 28 ++-- pilot/api/data.py | 152 ++++++++------------ pilot/api/es_data.py | 4 +- pilot/common/pluginfactory.py | 7 +- pilot/control/data.py | 131 ++++++++--------- pilot/control/interceptor.py | 9 +- pilot/control/job.py | 30 ++-- pilot/control/payloads/eventservice.py | 8 +- pilot/control/payloads/eventservicemerge.py | 6 +- pilot/control/payloads/generic.py | 106 +++++++------- pilot/user/atlas/dbrelease.py | 2 +- pilot/user/atlas/setup.py | 4 +- pilot/util/constants.py | 2 +- 16 files changed, 240 insertions(+), 287 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1d38d273..6fca09e1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.39 \ No newline at end of file +2.12.1.40b \ No newline at end of file diff --git a/pilot.py b/pilot.py index eee04af1..82fc4d99 100755 --- a/pilot.py +++ b/pilot.py @@ -10,6 +10,7 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 from __future__ import print_function # Python 2 (2to3 complains about this) +from __future__ import absolute_import import argparse import logging @@ -68,7 +69,7 @@ def main(): infosys.init(args.queue) # check if queue is ACTIVE if infosys.queuedata.state != 'ACTIVE': - logger.critical('specified queue is NOT ACTIVE: %s -- aborting' % infosys.queuedata.name) + logger.critical('specified queue is NOT ACTIVE: %s -- aborting', infosys.queuedata.name) return errors.PANDAQUEUENOTACTIVE except PilotException as error: logger.fatal(error) @@ -81,14 +82,14 @@ def main(): environ['PILOT_SITENAME'] = infosys.queuedata.resource #args.site # TODO: replace with singleton # set requested workflow - logger.info('pilot arguments: %s' % str(args)) + logger.info('pilot arguments: %s', str(args)) workflow = __import__('pilot.workflow.%s' % args.workflow, globals(), locals(), [args.workflow], 0) # Python 3, -1 -> 0 # execute workflow try: exit_code = workflow.run(args) except Exception as e: - logger.fatal('main pilot function caught exception: %s' % e) + logger.fatal('main pilot function caught exception: %s', e) exit_code = None return exit_code @@ -450,9 +451,9 @@ def wrap_up(initdir, mainworkdir, args): try: rmtree(mainworkdir) except Exception as e: - logging.warning("failed to remove %s: %s" % (mainworkdir, e)) + logging.warning("failed to remove %s: %s", mainworkdir, e) else: - logging.info("removed %s" % mainworkdir) + 
logging.info("removed %s", mainworkdir) # in Harvester mode, create a kill_worker file that will instruct Harvester that the pilot has finished if args.harvester: @@ -464,15 +465,15 @@ def wrap_up(initdir, mainworkdir, args): except Exception: exit_code = trace else: - logging.info('traces error code: %d' % exit_code) + logging.info('traces error code: %d', exit_code) if trace.pilot['nr_jobs'] <= 1: if exit_code != 0: - logging.info('an exit code was already set: %d (will be converted to a standard shell code)' % exit_code) + logging.info('an exit code was already set: %d (will be converted to a standard shell code)', exit_code) elif trace.pilot['nr_jobs'] > 0: if trace.pilot['nr_jobs'] == 1: - logging.getLogger(__name__).info('pilot has finished (%d job was processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d job was processed)', trace.pilot['nr_jobs']) else: - logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)' % trace.pilot['nr_jobs']) + logging.getLogger(__name__).info('pilot has finished (%d jobs were processed)', trace.pilot['nr_jobs']) exit_code = SUCCESS elif trace.pilot['state'] == FAILURE: logging.critical('pilot workflow failure -- aborting') diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 3b509b57..aa7e047b 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from .services import Services from pilot.common.exception import NotDefined, NotSameLength, UnknownException @@ -146,21 +146,20 @@ def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision= y = y[:-2] if (len(x) > 7 and len(y) > 7) and len(x) == len(y): - logger.info('fitting %s vs %s' % (y_name, x_name)) + logger.info('fitting %s vs %s', y_name, x_name) try: fit = self.fit(x, y) _slope = self.slope() except Exception as e: - logger.warning('failed to fit data, x=%s, y=%s: %s' % (str(x), str(y), e)) + logger.warning('failed to fit data, x=%s, y=%s: %s', str(x), str(y), e) else: if _slope: slope = float_to_rounded_string(fit.slope(), precision=precision) chi2 = float_to_rounded_string(fit.chi2(), precision=0) # decimals are not needed for chi2 if slope != "": - logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)' % - (slope, len(x), chi2)) + logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)', slope, len(x), chi2) else: - logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)' % (str(x), str(y))) + logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)', str(x), str(y)) return {"slope": slope, "chi2": chi2} @@ -182,8 +181,8 @@ def extract_from_table(self, table, x_name, y_name): y2_name = y_name.split('+')[1] y1_value = table.get(y1_name, []) y2_value = table.get(y2_name, []) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) x = [] y = [] else: @@ -238,7 +237,7 @@ def __init__(self, **kwargs): self.set_intersect() self.set_chi2() else: - logger.warning("\'%s\' model is not implemented" % self._model) + logger.warning("\'%s\' model is not implemented", self._model) raise NotImplementedError() def fit(self): diff --git a/pilot/api/dask.py b/pilot/api/dask.py index ad051c00..ab5ff3eb 100644 --- a/pilot/api/dask.py +++ b/pilot/api/dask.py @@ 
-64,7 +64,7 @@ def uninstall(self, block=True): """ - logger.info('uninstalling service %s' % self.servicename) + logger.info('uninstalling service %s', self.servicename) if block: logger.warning('blocking mode not yet implemented') @@ -72,7 +72,7 @@ def uninstall(self, block=True): exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: self.status = 'uninstalled' - logger.info('uninstall of service %s has been requested' % self.servicename) + logger.info('uninstall of service %s has been requested', self.servicename) def install(self, block=True): """ @@ -90,9 +90,9 @@ def install(self, block=True): # is the single-dask cluster already running? name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is already running - nothing to install' % name) + logger.info('service %s is already running - nothing to install', name) else: - logger.info('service %s is not yet running - proceed with installation' % name) + logger.info('service %s is not yet running - proceed with installation', name) # perform helm updates before actual instqllation cmd = '' @@ -101,13 +101,13 @@ def install(self, block=True): cmd = 'helm install %s %s dask/dask' % (override_option, self.servicename) exit_code, stdout, stderr = execute(cmd, mute=True) if not exit_code: - logger.info('installation of service %s is in progress' % self.servicename) + logger.info('installation of service %s is in progress', self.servicename) if block: while True: name = '%s-scheduler' % self.servicename if self.is_running(name=name): - logger.info('service %s is running' % name) + logger.info('service %s is running', name) self.status = 'running' break else: @@ -148,7 +148,7 @@ def _get_dictionary(self, cmd=None): exit_code, stdout, stderr = execute(cmd, mute=True) if exit_code: - logger.warning('failed to execute \'%s\': %s' % (cmd, stdout)) + logger.warning('failed to execute \'%s\': %s', cmd, stdout) self.status = 'failed' else: # parse output @@ -184,7 +184,7 @@ def _validate(self): logger.warning(stdout) break else: - logger.debug('%s verified' % cmd) + logger.debug('%s verified', cmd) if not found: return False @@ -204,7 +204,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): filename = os.path.join(self._workdir, self.overrides) if os.path.exists(filename): - logger.info('file \'%s\' already exists - will not override' % filename) + logger.info('file \'%s\' already exists - will not override', filename) return script = "" @@ -216,7 +216,7 @@ def _generate_override_script(self, jupyter=False, servicetype='LoadBalancer'): if script: status = write_file(filename, script) if status: - logger.debug('generated script: %s' % filename) + logger.debug('generated script: %s', filename) else: self.overrides = None @@ -240,7 +240,7 @@ def _convert_to_dict(self, output): dictionary[_l[0]][first_line[i]] = _l[1:][i] except Exception: - logger.warning("unexpected format of utility output: %s" % line) + logger.warning("unexpected format of utility output: %s", line) return dictionary @@ -252,7 +252,7 @@ def connect_cluster(self, release_name=None, manager=dask_kubernetes.HelmCluster if not release_name: release_name = self.servicename self.cluster = manager(release_name=release_name) - logger.info('connected to %s' % manager.__name__) + logger.info('connected to %s', manager.__name__) def scale(self, number): """ @@ -260,7 +260,7 @@ def scale(self, number): """ if number > 2: - logger.warning('too large scale: %d (please use <= 2 for now)' % number) + 
logger.warning('too large scale: %d (please use <= 2 for now)', number) return if not self.cluster: self.connect_cluster() @@ -269,7 +269,7 @@ def scale(self, number): self.status = 'failed' return - logger.info('setting scale to: %d' % number) + logger.info('setting scale to: %d', number) self.cluster.scale(number) def shutdown(self): diff --git a/pilot/api/data.py b/pilot/api/data.py index cbfabad6..7f104b54 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 @@ -69,7 +69,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ super(StagingClient, self).__init__() if not logger: - logger = logging.getLogger('%s.%s' % (__name__, 'null')) + logger = logging.getLogger('%s.%s', __name__, 'null') logger.disabled = True self.logger = logger @@ -99,7 +99,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg) self.trace_report.send() raise PilotException("failed to resolve acopytools settings") - logger.info('configured copytools per activity: acopytools=%s' % self.acopytools) + logger.info('configured copytools per activity: acopytools=%s', self.acopytools) # get an initialized trace report (has to be updated for get/put if not defined before) self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) @@ -268,7 +268,7 @@ def resolve_replicas(self, files, use_vp=False): # add signature lifetime for signed URL storages query.update(signature_lifetime=24 * 3600) # note: default is otherwise 1h - logger.info('calling rucio.list_replicas() with query=%s' % query) + logger.info('calling rucio.list_replicas() with query=%s', query) try: replicas = c.list_replicas(**query) @@ -278,7 +278,7 @@ def resolve_replicas(self, files, use_vp=False): show_memory_usage() replicas = list(replicas) - logger.debug("replicas received from Rucio: %s" % replicas) + logger.debug("replicas received from Rucio: %s", replicas) files_lfn = dict(((e.scope, e.lfn), e) for e in xfiles) for replica in replicas: @@ -294,18 +294,18 @@ def resolve_replicas(self, files, use_vp=False): self.trace_report.update(validateStart=time.time()) status = True if fdat.filesize != replica['bytes']: - logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s" - % (fdat.lfn, fdat.filesize, replica['bytes'], fdat)) + logger.warning("Filesize of input file=%s mismatched with value from Rucio replica: filesize=%s, replica.filesize=%s, fdat=%s", + fdat.lfn, fdat.filesize, replica['bytes'], fdat) status = False if not fdat.filesize: fdat.filesize = replica['bytes'] - logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s" % (fdat.lfn, replica['bytes'])) + logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s", fdat.lfn, replica['bytes']) for ctype in ['adler32', 'md5']: if fdat.checksum.get(ctype) != replica[ctype] and replica[ctype]: - logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s" - % (fdat.lfn, fdat.checksum, replica[ctype], 
fdat)) + logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s", + fdat.lfn, fdat.checksum, replica[ctype], fdat) status = False if not fdat.checksum.get(ctype) and replica[ctype]: @@ -489,33 +489,32 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] - self.logger.info('trying to use copytool=%s for activity=%s' % (name, activity)) + self.logger.info('trying to use copytool=%s for activity=%s', name, activity) copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) # Python 2/3 #self.trace_report.update(protocol=name) - except PilotException as e: - caught_errors.append(e) - self.logger.debug('error: %s' % e) + except PilotException as error: + caught_errors.append(error) + self.logger.debug('error: %s', error) continue - except Exception as e: - self.logger.warning('failed to import copytool module=%s, error=%s' % (module, e)) + except Exception as error: + self.logger.warning('failed to import copytool module=%s, error=%s', module, error) continue try: - #self.logger.debug('kwargs=%s' % str(kwargs)) result = self.transfer_files(copytool, remain_files, activity, **kwargs) - self.logger.debug('transfer_files() using copytool=%s completed with result=%s' % (copytool, str(result))) + self.logger.debug('transfer_files() using copytool=%s completed with result=%s', copytool, str(result)) show_memory_usage() break - except PilotException as e: - self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) - except TimeoutException as e: - self.logger.warning('function timed out: %s' % e) - caught_errors.append(e) - except Exception as e: - self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s' % (copytool, e)) - caught_errors.append(e) + except PilotException as error: + self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s', copytool, error) + caught_errors.append(error) + except TimeoutException as error: + self.logger.warning('function timed out: %s', error) + caught_errors.append(error) + except Exception as error: + self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s', copytool, error) + caught_errors.append(error) import traceback self.logger.error(traceback.format_exc()) @@ -537,7 +536,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? - self.logger.warning('caught time-out exception: %s' % caught_errors[0]) + self.logger.warning('caught time-out exception: %s', caught_errors[0]) else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? 
details = str(caught_errors) + ":" + 'failed to transfer files using copytools=%s' % copytools @@ -575,13 +574,13 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found error = 'Failed to resolve protocol for file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, allowed_schemas, fspec) - self.logger.error("resolve_protocol: %s" % error) + self.logger.error("resolve_protocol: %s", error) raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?) protocol = protocols[0] - self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'" % (protocol, fspec.lfn)) + self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'", protocol, fspec.lfn) resolve_surl = getattr(copytool, 'resolve_surl', None) if not callable(resolve_surl): @@ -608,7 +607,7 @@ def resolve_protocols(self, files): ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: error = 'Failed to resolve output ddmendpoint by name=%s (from PanDA), please check configuration.' % fdat.ddmendpoint - self.logger.error("resolve_protocols: %s, fspec=%s" % (error, fdat)) + self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) raise PilotException(error, code=ErrorCodes.NOSTORAGE) protocols = [] @@ -689,13 +688,13 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom pschemas = 'any' if primary_schemas and not primary_schemas[0] else ','.join(primary_schemas or []) error = 'Failed to find replica for file=%s, domain=%s, allowed_schemas=%s, pschemas=%s, fspec=%s' % (fspec.lfn, domain, schemas, pschemas, fspec) - self.logger.info("resolve_replica: %s" % error) + self.logger.info("resolve_replica: %s", error) return # prefer SRM protocol for surl -- to be verified, can it be deprecated? rse_replicas = replicas.get(replica['ddmendpoint'], []) surl = self.get_preferred_replica(rse_replicas, ['srm']) or rse_replicas[0] - self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s" % (surl['pfn'], surl['ddmendpoint'])) + self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s", surl['pfn'], surl['ddmendpoint']) return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} @@ -719,42 +718,10 @@ def get_direct_access_variables(self, job): if job and not job.is_analysis() and job.transfertype != 'direct': # task forbids direct access allow_direct_access = False - self.logger.info('switched off direct access mode for production job since transfertype=%s' % job.transfertype) + self.logger.info('switched off direct access mode for production job since transfertype=%s', job.transfertype) return allow_direct_access, direct_access_type - #def set_accessmodes_for_direct_access(self, files, direct_access_type): ## TO BE DEPRECATED (anisyonk) - # """ - # Update the FileSpec accessmodes for direct access and sort the files to get candidates for remote_io coming - # first in order to exclude them from checking of available space for stage-in. - # - # :param files: FileSpec objects. - # :param direct_access_type: type of direct access (LAN or WAN) (string). 
- # :return: - # """ - # - # # sort the files - # files = sorted(files, key=lambda x: x.is_directaccess(ensure_replica=False), reverse=True) - # - # # populate allowremoteinputs for each FileSpec object - # for fdata in files: - # is_directaccess = fdata.is_directaccess(ensure_replica=False) - # if is_directaccess and direct_access_type == 'WAN': ## is it the same for ES workflow ?? -- test and verify/FIXME LATER - # fdata.allowremoteinputs = True - # self.logger.info("check direct access for lfn=%s: allow_direct_access=true, fdata.is_directaccess()=%s =>" - # " is_directaccess=%s, allowremoteinputs=%s" % (fdata.lfn, - # fdata.is_directaccess(ensure_replica=False), - # is_directaccess, fdata.allowremoteinputs)) - # # must update accessmode for user jobs (it is only set already for production jobs) - # if fdata.accessmode != 'direct' and is_directaccess and fdata.accessmode != 'copy': - # fdata.accessmode = 'direct' - # - # # reset accessmode if direct access is not to be used - # if fdata.accessmode == 'direct' and not is_directaccess: - # fdata.accessmode = '' - # - # self.logger.info('accessmode for LFN=%s: %s (is_directaccess=%s)' % (fdata.lfn, fdata.accessmode, is_directaccess)) - def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C901 """ Automatically stage in files using the selected copy tool module. @@ -780,7 +747,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # overwrite allowed_schemas for VP jobs if kwargs['use_vp']: allowed_schemas = ['root'] - self.logger.debug('overwrote allowed_schemas for VP job: %s' % str(allowed_schemas)) + self.logger.debug('overwrote allowed_schemas for VP job: %s', str(allowed_schemas)) for fspec in files: resolve_replica = getattr(copytool, 'resolve_replica', None) @@ -796,11 +763,11 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 fspec.is_directaccess(ensure_replica=False) else None) replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='lan') else: - self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)" % (fspec.lfn, fspec.allow_lan)) + self.logger.info("[stage-in] LAN access is DISABLED for lfn=%s (fspec.allow_lan=%s)", fspec.lfn, fspec.allow_lan) if not replica and fspec.allow_lan: - self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No LAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) # check remote replicas if not replica and fspec.allow_wan: @@ -812,8 +779,8 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='wan') if not replica and fspec.allow_wan: - self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s" % - (fspec.lfn, primary_schemas, allowed_schemas)) + self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", + fspec.lfn, primary_schemas, allowed_schemas) if not replica: raise ReplicasNotFound('No replica found for lfn=%s (allow_lan=%s, allow_wan=%s)' % (fspec.lfn, fspec.allow_lan, fspec.allow_wan)) @@ -826,8 +793,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if replica.get('domain'): fspec.domain = replica['domain'] - self.logger.info("[stage-in] found replica 
to be used for lfn=%s: ddmendpoint=%s, pfn=%s" % - (fspec.lfn, fspec.ddmendpoint, fspec.turl)) + self.logger.info("[stage-in] found replica to be used for lfn=%s: ddmendpoint=%s, pfn=%s", fspec.lfn, fspec.ddmendpoint, fspec.turl) # prepare files (resolve protocol/transfer url) if getattr(copytool, 'require_input_protocols', False) and files: @@ -845,7 +811,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if not copytool.is_valid_for_copy_in(remain_files): msg = 'input is not valid for transfers using copytool=%s' % copytool self.logger.warning(msg) - self.logger.debug('input: %s' % remain_files) + self.logger.debug('input: %s', remain_files) self.trace_report.update(clientState='NO_REPLICA', stateReason=msg) self.trace_report.send() raise PilotException('invalid input data for transfer operation') @@ -867,7 +833,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # add the trace report kwargs['trace_report'] = self.trace_report - self.logger.info('ready to transfer (stage-in) files: %s' % remain_files) + self.logger.info('ready to transfer (stage-in) files: %s', remain_files) # use bulk downloads if necessary # if kwargs['use_bulk_transfer'] @@ -896,12 +862,11 @@ def set_status_for_direct_access(self, files, workdir): # direct_lan = True if not direct_lan and not direct_wan: - self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) + self.logger.debug('direct lan/wan transfer will not be used for lfn=%s', fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, - str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), - fspec.domain)) + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s, domain=%s', + fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, + str(self.direct_localinput_allowed_schemas), str(self.direct_remoteinput_allowed_schemas), fspec.domain) if direct_lan or direct_wan: fspec.status_code = 0 @@ -911,8 +876,8 @@ def set_status_for_direct_access(self, files, workdir): if alrb_xcache_proxy and direct_lan: #fspec.is_directaccess(ensure_replica=False): fspec.turl = '${ALRB_XCACHE_PROXY}' + fspec.turl - self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s' % - (fspec.lfn, direct_lan, direct_wan, fspec.turl)) + self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s', + fspec.lfn, direct_lan, direct_wan, fspec.turl) # send trace localsite = os.environ.get('RUCIO_LOCAL_SITE_ID') @@ -934,7 +899,7 @@ def set_status_for_direct_access(self, files, workdir): if not os.path.exists(_workdir): path = os.path.join('/srv', config.Pilot.base_trace_report) if not os.path.exists(path): - self.logger.debug('writing base trace report to: %s' % path) + self.logger.debug('writing base trace report to: %s', path) write_json(path, self.trace_report) else: self.trace_report.send() @@ -948,7 +913,7 @@ def check_availablespace(self, files): """ for f in files: - self.logger.debug('lfn=%s filesize=%d accessmode=%s' % (f.lfn, f.filesize, f.accessmode)) + self.logger.debug('lfn=%s filesize=%d accessmode=%s', f.lfn, f.filesize, f.accessmode) maxinputsize = 
convert_mb_to_b(get_maximum_input_sizes()) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) @@ -959,12 +924,11 @@ def check_availablespace(self, files): (len(files), totalsize, maxinputsize) raise SizeTooLarge(error) - self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)" % - (totalsize, maxinputsize)) + self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)", totalsize, maxinputsize) # get available space available_space = convert_mb_to_b(get_local_disk_space(os.getcwd())) - self.logger.info("locally available space: %d B" % available_space) + self.logger.info("locally available space: %d B", available_space) # are we within the limit? if totalsize > available_space: @@ -1019,17 +983,17 @@ def prepare_destinations(self, files, activities): # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s" % (activity, storages)) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s" % (activity, ddm)) + self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) + self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) for e in files: if not e.ddmendpoint: # no preferences => use default destination self.logger.info("[prepare_destinations][%s]: fspec.ddmendpoint is not set for lfn=%s" - " .. will use default ddm=%s as (local) destination" % (activity, e.lfn, ddm)) + " .. will use default ddm=%s as (local) destination", activity, e.lfn, ddm) e.ddmendpoint = ddm elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => assume it as final (non local) alternative destination self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations" - " .. will consider default ddm=%s for transfer and tag %s as alt. location" % (activity, e.ddmendpoint, ddm, e.ddmendpoint)) + " .. will consider default ddm=%s for transfer and tag %s as alt. 
location", activity, e.ddmendpoint, ddm, e.ddmendpoint) e.ddmendpoint = ddm e.ddmendpoint_alt = e.ddmendpoint # consider me later @@ -1135,10 +1099,10 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not copytool.is_valid_for_copy_out(files): self.logger.warning('Input is not valid for transfers using copytool=%s' % copytool) - self.logger.debug('Input: %s' % files) + self.logger.debug('Input: %s', files) raise PilotException('Invalid input for transfer operation') - self.logger.info('ready to transfer (stage-out) files: %s' % files) + self.logger.info('ready to transfer (stage-out) files: %s', files) if self.infosys: kwargs['copytools'] = self.infosys.queuedata.copytools diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 708e6de7..e246cbd9 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern,ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -46,7 +46,7 @@ def prepare_sources(self, files, activities=None): fspec.scope = 'transient' if storage_id: fspec.ddmendpoint = self.infosys.get_ddmendpoint(storage_id) - logger.info("Processed file with storage id: %s" % fspec) + logger.info("Processed file with storage id: %s", fspec) class StageOutESClient(StageOutClient): diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index cf2b5f27..27925299 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -6,6 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021 import logging @@ -30,11 +31,11 @@ def get_plugin(self, confs): class_name = confs['class'] if class_name is None: - logger.error("[class] is not defined in confs: %s" % confs) + logger.error("[class] is not defined in confs: %s", confs) return None if class_name not in self.classMap: - logger.info("Trying to import %s" % class_name) + logger.info("Trying to import %s", class_name) components = class_name.split('.') mod = __import__('.'.join(components[:-1])) for comp in components[1:]: @@ -48,7 +49,7 @@ def get_plugin(self, confs): args[key] = confs[key] cls = self.classMap[class_name] - logger.info("Importing %s with args: %s" % (cls, args)) + logger.info("Importing %s with args: %s", cls, args) impl = cls(**args) return impl diff --git a/pilot/control/data.py b/pilot/control/data.py index 83c731ee..6f820d33 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 @@ -63,7 +63,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. 
@@ -107,8 +107,8 @@ def skip_special_files(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: user.update_stagein(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) def update_indata(job): @@ -124,7 +124,7 @@ def update_indata(job): if fspec.status == 'no_transfer': toberemoved.append(fspec) for fspec in toberemoved: - logger.info('removing fspec object (lfn=%s) from list of input files' % fspec.lfn) + logger.info('removing fspec object (lfn=%s) from list of input files', fspec.lfn) job.indata.remove(fspec) @@ -193,11 +193,11 @@ def _stage_in(args, job): pilot.util.middleware.containerise_middleware(job, job.indata, args.queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, args.input_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-in containerisation threw a pilot exception: %s' % e) - except Exception as e: + except PilotException as error: + logger.warning('stage-in containerisation threw a pilot exception: %s', error) + except Exception as error: import traceback - logger.warning('stage-in containerisation threw an exception: %s' % e) + logger.warning('stage-in containerisation threw an exception: %s', error) logger.error(traceback.format_exc()) else: try: @@ -224,17 +224,17 @@ def _stage_in(args, job): msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception as error: - logger.error('failed to stage-in: error=%s' % error) + logger.error('failed to stage-in: error=%s', error) logger.info('summary of transferred files:') - for e in job.indata: - status = e.status if e.status else "(not transferred)" - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + for infile in job.indata: + status = infile.status if infile.status else "(not transferred)" + logger.info(" -- lfn=%s, status_code=%s, status=%s", infile.lfn, infile.status_code, status) # write time stamps to pilot timing file add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) - remain_files = [e for e in job.indata if e.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']] logger.info("stage-in finished") if not remain_files else logger.info("stage-in failed") return not remain_files @@ -255,8 +255,8 @@ def get_rse(data, lfn=""): if lfn == "": try: return data[0].ddmendpoint - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as error: + logger.warning("exception caught: %s", error) logger.warning("end point is currently unknown") return "unknown" @@ -426,10 +426,10 @@ def write_output(filename, output): try: write_file(filename, output, unique=True) - except PilotException as e: - logger.warning('failed to write utility output to file: %s, %s' % (e, output)) + except PilotException as error: + logger.warning('failed to write utility output to file: %s, %s', error, output) else: - logger.debug('wrote %s' % filename) + logger.debug('wrote %s', filename) def write_utility_output(workdir, step, stdout, stderr): @@ -479,17 +479,17 @@ def copytool_in(queues, traces, args): if cmd: # xcache debug exit_code, _stdout, 
_stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s' % _stdout) - logger.debug('[before xcache start] stderr=%s' % _stderr) + logger.debug('[before xcache start] stdout=%s', _stdout) + logger.debug('[before xcache start] stderr=%s', _stderr) exit_code, stdout, stderr = execute(cmd.get('command')) - logger.debug('stdout=%s' % stdout) - logger.debug('stderr=%s' % stderr) + logger.debug('stdout=%s', stdout) + logger.debug('stderr=%s', stderr) # xcache debug exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s' % _stdout) - logger.debug('[after xcache start] stderr=%s' % _stderr) + logger.debug('[after xcache start] stdout=%s', _stdout) + logger.debug('[after xcache start] stderr=%s', _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} @@ -530,7 +530,7 @@ def copytool_in(queues, traces, args): # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) # now create input file metadata if required by the payload if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic': @@ -538,12 +538,12 @@ def copytool_in(queues, traces, args): user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 file_dictionary = get_input_file_dictionary(job.indata) xml = user.create_input_file_metadata(file_dictionary, job.workdir) - logger.info('created input file metadata:\n%s' % xml) + logger.info('created input file metadata:\n%s', xml) else: # remove the job from the current stage-in queue _job = queues.current_data_in.get(block=True, timeout=1) if _job: - logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid) + logger.debug('job %s has been removed from the current_data_in queue', _job.jobid) logger.warning('stage-in failed, adding job object to failed_data_in queue') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED) set_pilot_state(job=job, state="failed") @@ -607,7 +607,7 @@ def copytool_out(queues, traces, args): if is_already_processed(queues, processed_jobs): continue - logger.info('will perform stage-out for job id=%s' % job.jobid) + logger.info('will perform stage-out for job id=%s', job.jobid) if args.abort_job.is_set(): traces.pilot['command'] = 'abort' @@ -669,7 +669,7 @@ def is_already_processed(queues, processed_jobs): for jobid in processed_jobs: if jobid in jobids: - logger.warning('output from job %s has already been staged out' % jobid) + logger.warning('output from job %s has already been staged out', jobid) found = True break if found: @@ -737,7 +737,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out :return: """ - logger.debug('preparing to create log file (debug mode=%s)' % str(debugmode)) + logger.debug('preparing to create log file (debug mode=%s)', str(debugmode)) # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) @@ -755,7 +755,7 @@ def create_log(workdir, logfile_name, 
tarball_name, cleanup, input_files=[], out for f in input_files + output_files: path = os.path.join(workdir, f) if os.path.exists(path): - logger.info('removing file: %s' % path) + logger.info('removing file: %s', path) remove(path) # rename the workdir for the tarball creation @@ -765,7 +765,7 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out workdir = newworkdir fullpath = os.path.join(workdir, logfile_name) # /some/path/to/dirname/log.tgz - logger.info('will create archive %s' % fullpath) + logger.info('will create archive %s', fullpath) try: cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % (fullpath, tarball_name) exit_code, stdout, stderr = execute(cmd) @@ -774,11 +774,11 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out else: if pilot_home != current_dir: os.chdir(pilot_home) - logger.debug('stdout = %s' % stdout) + logger.debug('stdout = %s', stdout) try: os.rename(workdir, orgworkdir) - except Exception as e: - logger.debug('exception caught: %s' % e) + except Exception as error: + logger.debug('exception caught: %s', error) def _do_stageout(job, xdata, activity, queue, title, output_dir=''): @@ -793,7 +793,7 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): :return: True in case of success transfers """ - logger.info('prepare to stage-out %d %s file(s)' % (len(xdata), title)) + logger.info('prepare to stage-out %d %s file(s)', len(xdata), title) label = 'stage-out' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? @@ -805,10 +805,10 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): pilot.util.middleware.containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, job.infosys.queuedata.container_options, output_dir, label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) - except PilotException as e: - logger.warning('stage-out containerisation threw a pilot exception: %s' % e) - except Exception as e: - logger.warning('stage-out containerisation threw an exception: %s' % e) + except PilotException as error: + logger.warning('stage-out containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('stage-out containerisation threw an exception: %s', error) else: try: logger.info('stage-out will not be done in a container') @@ -838,16 +838,14 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): logger.debug('stage-out client completed') logger.info('summary of transferred files:') - for e in xdata: - if not e.status: + for iofile in xdata: + if not iofile.status: status = "(not transferred)" else: - status = e.status - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) + status = iofile.status + logger.info(" -- lfn=%s, status_code=%s, status=%s", iofile.lfn, iofile.status_code, status) - remain_files = [e for e in xdata if e.status not in ['transferred']] - logger.debug('remain_files=%s' % str(remain_files)) - logger.debug('xdata=%s' % str(xdata)) + remain_files = [iofile for iofile in xdata if iofile.status not in ['transferred']] return not remain_files @@ -897,8 +895,8 @@ def _stage_out_new(job, args): create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup, input_files=input_files, output_files=output_files, is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug) - except LogFileCreationFailure as e: - logger.warning('failed to create 
tar file: %s' % e) + except LogFileCreationFailure as error: + logger.warning('failed to create tar file: %s', error) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE) return False @@ -918,32 +916,27 @@ def _stage_out_new(job, args): # generate fileinfo details to be send to Panda fileinfo = {} - for e in job.outdata + job.logdata: - if e.status in ['transferred']: - logger.debug('got surl=%s' % e.surl) - logger.debug('got turl=%s' % e.turl) - fileinfo[e.lfn] = {'guid': e.guid, 'fsize': e.filesize, - 'adler32': e.checksum.get('adler32'), - 'surl': e.turl} + for iofile in job.outdata + job.logdata: + if iofile.status in ['transferred']: + fileinfo[iofile.lfn] = {'guid': iofile.guid, + 'fsize': iofile.filesize, + 'adler32': iofile.checksum.get('adler32'), + 'surl': iofile.turl} job.fileinfo = fileinfo - logger.info('prepared job.fileinfo=%s' % job.fileinfo) # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS if not is_success: # set error code + message (a more precise error code might have been set already) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED) set_pilot_state(job=job, state="failed") - logger.warning('stage-out failed') # with error: %d, %s (setting job state to failed)' % - # logger.warning('stage-out failed with error: %d, %s (setting job state to failed)' % - # (job['pilotErrorCode'], job['pilotErrorDiag'])) - # send_state(job, args, 'failed') + logger.warning('stage-out failed') return False logger.info('stage-out finished correctly') if not job.state or (job.state and job.state == 'stageout'): # is the job state already set? if so, don't change the state (unless it's the stageout state) - logger.debug('changing job state from %s to finished' % job.state) + logger.debug('changing job state from %s to finished', job.state) set_pilot_state(job=job, state="finished") # send final server update since all transfers have finished correctly @@ -984,13 +977,10 @@ def queue_monitoring(queues, traces, args): # TODO: put in data_out queue instead? 
if not _stage_out_new(job, args): - logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs " - "queue" % job.jobid) - #queues.failed_data_out.put(job) + logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue", job.jobid) put_in_queue(job, queues.failed_data_out) else: - logger.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid) - #queues.failed_jobs.put(job) + logger.info("job %s failed during stage-in, adding job object to failed_jobs queue", job.jobid) put_in_queue(job, queues.failed_jobs) # monitor the finished_data_out queue @@ -1020,12 +1010,11 @@ def queue_monitoring(queues, traces, args): set_pilot_state(job=job, state="failed") if not _stage_out_new(job, args): logger.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, " - "adding job object to failed_jobs queue" % job.jobid) + "adding job object to failed_jobs queue", job.jobid) else: logger.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job " - "object to failed_jobs queue" % job.jobid) + "object to failed_jobs queue", job.jobid) - #queues.failed_jobs.put(job) put_in_queue(job, queues.failed_jobs) if abort: diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index e7987a3a..31f4c395 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -5,7 +5,9 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 + +# Note: leave this module for now - the code might be useful for reuse import time @@ -29,9 +31,6 @@ def run(args): :returns: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'receive': receive, 'send': send} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 @@ -48,7 +47,7 @@ def run(args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. 
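
The control() functions touched in data.py and interceptor.py above (and in job.py below) all rely on the same thread/bucket pattern for surfacing worker exceptions. A minimal sketch of that pattern, assuming the pilot's ExcThread simply catches exceptions in run() and puts sys.exc_info() into its bucket; the monitoring helper below is illustrative:

import queue
import sys
import threading

class ExcThread(threading.Thread):
    """Sketch: thread wrapper that reports exceptions via a queue ('bucket')."""

    def __init__(self, bucket, target, kwargs, name):
        super().__init__(target=target, kwargs=kwargs, name=name)
        self.bucket = bucket

    def run(self):
        try:
            super().run()
        except Exception:
            # hand sys.exc_info() to the monitoring loop instead of dying silently
            self.bucket.put(sys.exc_info())

    def get_bucket(self):
        return self.bucket

def monitor_buckets(threads):
    """Sketch of the polling loop used by the control() functions."""
    while any(thread.is_alive() for thread in threads):
        for thread in threads:
            try:
                exc = thread.get_bucket().get(block=True, timeout=1)
            except queue.Empty:
                continue
            exc_type, exc_obj, exc_trace = exc
            print("thread '%s' received an exception from bucket: %s" % (thread.name, exc_obj))
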
diff --git a/pilot/control/job.py b/pilot/control/job.py index a080b8b2..94b1c94c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 from __future__ import print_function # Python 2 @@ -72,9 +72,6 @@ def control(queues, traces, args): :return: """ - # t = threading.current_thread() - # logger.debug('job.control is run by thread: %s' % t.name) - targets = {'validate': validate, 'retrieve': retrieve, 'create_data_payload': create_data_payload, 'queue_monitor': queue_monitor, 'job_monitor': job_monitor} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, @@ -92,7 +89,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. @@ -142,8 +139,8 @@ def _validate_job(job): try: kwargs = {'job': job} job.usecontainer = container.do_use_container(**kwargs) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) return True if user.verify_job(job) else False @@ -564,8 +561,12 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.warning('received unknown server command via backchannel: %s' % cmd) # for testing debug mode - #job.debug = True - # job.debug_command = 'tail payload.stdout' + + + + job.debug = True + job.debug_command = 'du -dk' + # job.debug_command = 'tail -30 payload.stdout' # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir # job.debug_command = 'ps -ef' @@ -688,7 +689,7 @@ def process_debug_mode(job): """ # for gdb commands, use the proper gdb version (the system one may be too old) - if 'gdb ' in job.debug_command: + if job.debug_command.startswith('gdb '): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 user.preprocess_debug_command(job) @@ -696,7 +697,7 @@ def process_debug_mode(job): stdout = get_debug_stdout(job) if stdout: # in case gdb was successfully used, the payload can now be killed - if 'gdb ' in job.debug_command and job.pid: + if job.debug_command.startswith('gdb ') and job.pid: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL, msg='payload was killed after gdb produced requested core file') logger.debug('will proceed to kill payload processes') @@ -715,15 +716,16 @@ def get_debug_stdout(job): if job.debug_command == 'debug': return get_payload_log_tail(job.workdir) - elif 'tail' in job.debug_command: + elif 'tail ' in job.debug_command: return get_requested_log_tail(job.debug_command, job.workdir) elif 'ls ' in job.debug_command: return get_ls(job.debug_command, job.workdir) elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: return get_general_command_stdout(job) else: - logger.warning('command not handled yet: %s' % job.debug_command) - return '' + # general command, execute and return output + exit_code, stdout, stderr = execute(job.debug_command) + 
return stdout def get_general_command_stdout(job): diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 1d601739..07829a73 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -53,11 +53,11 @@ def run_payload(self, job, cmd, out, err): logger.fatal('could not define payload command') return None - logger.info("payload execution command: %s" % executable) + logger.info("payload execution command: %s", executable) try: payload = {'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job} - logger.debug("payload: %s" % payload) + logger.debug("payload: %s", payload) logger.info("Starting EventService WorkExecutor") executor_type = self.get_executor_type() @@ -66,14 +66,14 @@ def run_payload(self, job, cmd, out, err): executor.start() logger.info("EventService WorkExecutor started") - logger.info("ESProcess started with pid: %s" % executor.get_pid()) + logger.info("ESProcess started with pid: %s", executor.get_pid()) job.pid = executor.get_pid() if job.pid: job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) except Exception as e: - logger.error('could not execute: %s' % str(e)) + logger.error('could not execute: %s', str(e)) return None return executor diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index a23c00b2..5c3d454d 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -6,7 +6,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import os @@ -25,9 +25,9 @@ def untar_file(self, lfn, job): pfn = os.path.join(job.workdir, lfn) command = "tar -xf %s -C %s" % (pfn, job.workdir) - logger.info("Untar file: %s" % command) + logger.info("untar file: %s", command) exit_code, stdout, stderr = execute(command) - logger.info("exit_code: %s, stdout: %s, stderr: %s\n" % (exit_code, stdout, stderr)) + logger.info("exit_code: %s, stdout: %s, stderr: %s\n", exit_code, stdout, stderr) def utility_before_payload(self, job): """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8236f24b..94fba2af 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -92,7 +92,7 @@ def utility_before_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed before the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed before the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -114,7 +114,7 @@ def utility_with_payload(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed with the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed with the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -138,7 +138,7 @@ def get_utility_command(self, order=None): cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = 
'%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed after the payload: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed after the payload: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd @@ -156,7 +156,7 @@ def utility_after_payload_started(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) # how should this command be executed? utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) @@ -166,8 +166,8 @@ def utility_after_payload_started(self, job): try: proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) @@ -191,7 +191,7 @@ def utility_after_payload_started_new(self, job): cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command to be executed after the payload: %s' % cmd) + logger.info('utility command to be executed after the payload: %s', cmd) return cmd @@ -203,8 +203,8 @@ def utility_after_payload_started_new(self, job): # try: # proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, # stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) -# except Exception as e: -# logger.error('could not execute: %s' % e) +# except Exception as error: +# logger.error('could not execute: %s', error) # else: # # store process handle in job object, and keep track on how many times the command has been launched # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) @@ -233,7 +233,7 @@ def utility_after_payload_finished(self, job, order): cmd_dictionary = user.get_utility_commands(order=order, job=job) if cmd_dictionary: cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) - logger.info('utility command (\'%s\') to be executed after the payload has finished: %s' % (cmd_dictionary.get('label', 'utility'), cmd)) + logger.info('utility command (\'%s\') to be executed after the payload has finished: %s', cmd_dictionary.get('label', 'utility'), cmd) return cmd, cmd_dictionary.get('label') @@ -249,7 +249,7 @@ def execute_utility_command(self, cmd, job, label): exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: - logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details' % (cmd, exit_code)) + logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code) if label == 'preprocess': err = errors.PREPROCESSFAILURE elif label == 'postprocess': @@ -289,18 +289,18 @@ def write_utility_output(self, workdir, 
step, stdout, stderr): self.__postprocess_stderr_name = name_stderr name = os.path.join(workdir, step + '_stdout.txt') write_file(name, stdout, unique=True) - except PilotException as e: - logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) + except PilotException as error: + logger.warning('failed to write utility stdout to file: %s, %s', error, stdout) else: - logger.debug('wrote %s' % name) + logger.debug('wrote %s', name) try: name = os.path.join(workdir, step + '_stderr.txt') write_file(name, stderr, unique=True) - except PilotException as e: - logger.warning('failed to write utility stderr to file: %s, %s' % (e, stderr)) + except PilotException as error: + logger.warning('failed to write utility stderr to file: %s, %s', error, stderr) else: - logger.debug('wrote %s' % name) + logger.debug('wrote %s', name) def pre_payload(self, job): """ @@ -331,13 +331,13 @@ def run_command(self, cmd, label=None): """ if label: - logger.info('\n\n%s:\n\n%s\n' % (label, cmd)) + logger.info('\n\n%s:\n\n%s\n', label, cmd) if label == 'coprocess': try: out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb') err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb') - except Exception as e: - logger.warning('failed to open coprocess stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open coprocess stdout/err: %s', error) out = None err = None else: @@ -346,14 +346,14 @@ def run_command(self, cmd, label=None): try: proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err, usecontainer=False, cwd=self.__job.workdir, job=self.__job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None if type(proc) == tuple and not proc[0]: logger.error('failed to execute command') return None - logger.info('started %s -- pid=%s executable=%s' % (label, proc.pid, cmd)) + logger.info('started %s -- pid=%s executable=%s', label, proc.pid, cmd) return proc @@ -374,25 +374,23 @@ def run_payload(self, job, cmd, out, err): # add time for PILOT_PRE_PAYLOAD self.pre_payload(job) - logger.info("\n\npayload execution command:\n\n%s\n" % cmd) + logger.info("\n\npayload execution command:\n\n%s\n", cmd) try: proc = execute(cmd, workdir=job.workdir, returnproc=True, usecontainer=True, stdout=out, stderr=err, cwd=job.workdir, job=job) - except Exception as e: - logger.error('could not execute: %s' % str(e)) + except Exception as error: + logger.error('could not execute: %s', error) return None if type(proc) == tuple and not proc[0]: logger.error('failed to execute payload') return None - logger.info('started -- pid=%s executable=%s' % (proc.pid, cmd)) + logger.info('started -- pid=%s executable=%s', proc.pid, cmd) job.pid = proc.pid job.pgrp = os.getpgid(job.pid) set_pilot_state(job=job, state="running") #_cmd = self.utility_with_payload(job) - #if _cmd: - # logger.info('could have executed: %s' % _cmd) self.utility_after_payload_started(job) @@ -457,13 +455,13 @@ def wait_graceful(self, args, proc): for i in range(60): # Python 2/3 if args.graceful_stop.is_set(): breaker = True - logger.info('breaking -- sending SIGTERM pid=%s' % proc.pid) + logger.info('breaking -- sending SIGTERM pid=%s', proc.pid) os.killpg(os.getpgid(proc.pid), signal.SIGTERM) # proc.terminate() break time.sleep(1) if breaker: - logger.info('breaking -- sleep 3s before sending SIGKILL pid=%s' % proc.pid) + logger.info('breaking 
-- sleep 3s before sending SIGKILL pid=%s', proc.pid) time.sleep(3) proc.kill() break @@ -471,7 +469,7 @@ def wait_graceful(self, args, proc): exit_code = proc.poll() if iteration % 10 == 0: - logger.info('running: iteration=%d pid=%s exit_code=%s' % (iteration, proc.pid, exit_code)) + logger.info('running: iteration=%d pid=%s exit_code=%s', iteration, proc.pid, exit_code) if exit_code is not None: break else: @@ -504,7 +502,7 @@ def get_payload_command(self, job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) self.__traces.pilot['error_code'] = job.piloterrorcodes[0] logger.fatal( - 'could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code']) + 'could not define payload command (traces error set to: %d)', self.__traces.pilot['error_code']) return cmd @@ -527,19 +525,19 @@ def run_preprocess(self, job): if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload - logger.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) + logger.info("\n\npreprocess execution command:\n\n%s\n", cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: logger.fatal('no more HP points - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) - logger.fatal('cannot continue since preprocess failed: exit_code=%d' % exit_code) + logger.fatal('cannot continue since preprocess failed: exit_code=%d', exit_code) else: # in case the preprocess produced a command, chmod it path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) if os.path.exists(path): - logger.debug('chmod 0o755: %s' % path) + logger.debug('chmod 0o755: %s', path) os.chmod(path, 0o755) return exit_code @@ -566,7 +564,7 @@ def run(self): # noqa: C901 # abort when nothing more to run, or when the preprocess returns a special exit code iteration = 0 while True: - logger.info('payload iteration loop #%d' % (iteration + 1)) + logger.info('payload iteration loop #%d', iteration + 1) os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration show_memory_usage() @@ -592,8 +590,8 @@ def run(self): # noqa: C901 if os.environ.get('HARVESTER_HOROVOD', '') == '': exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s' % _stdout) - logger.debug('[before payload start] stderr=%s' % _stderr) + logger.debug('[before payload start] stdout=%s', _stdout) + logger.debug('[before payload start] stderr=%s', _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -622,7 +620,7 @@ def run(self): # noqa: C901 # allow for a secondary command to be started after the payload (e.g. 
a coprocess) utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2) if utility_cmd: - logger.debug('starting utility command: %s' % utility_cmd) + logger.debug('starting utility command: %s', utility_cmd) label = 'coprocess' if 'coprocess' in utility_cmd else None proc_co = self.run_command(utility_cmd, label=label) @@ -639,15 +637,15 @@ def run(self): # noqa: C901 else: state = 'finished' if exit_code == 0 else 'failed' set_pilot_state(job=self.__job, state=state) - logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n' % (proc.pid, exit_code, self.__job.state)) + logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state) exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s' % _stdout) - logger.debug('[after payload finish] stderr=%s' % _stderr) + logger.debug('[after payload finish] stdout=%s', _stdout) + logger.debug('[after payload finish] stderr=%s', _stderr) # stop the utility command (e.g. a coprocess if necessary if proc_co: - logger.debug('stopping utility command: %s' % utility_cmd) + logger.debug('stopping utility command: %s', utility_cmd) kill_processes(proc_co.pid) if exit_code is None: @@ -692,24 +690,24 @@ def run_utility_after_payload_finished(self, state, order): else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) elif cmd_after_payload: - logger.info("\n\npostprocess execution command:\n\n%s\n" % cmd_after_payload) + logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) # xcache debug if 'xcache' in cmd_after_payload: _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s' % _stdout) - logger.debug('[before xcache kill] stderr=%s' % _stderr) + logger.debug('[before xcache kill] stdout=%s', _stdout) + logger.debug('[before xcache kill] stderr=%s', _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug if 'xcache' in cmd_after_payload: _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s' % _stdout) - logger.debug('[after xcache kill] stderr=%s' % _stderr) + logger.debug('[after xcache kill] stdout=%s', _stdout) + logger.debug('[after xcache kill] stderr=%s', _stderr) return exit_code @@ -727,11 +725,11 @@ def stop_utilities(self): if utproc: user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 sig = user.get_utility_command_kill_signal(utcmd) - logger.info("stopping process \'%s\' with signal %d" % (utcmd, sig)) + logger.info("stopping process \'%s\' with signal %d", utcmd, sig) try: os.killpg(os.getpgid(utproc.pid), sig) - except Exception as e: - logger.warning('exception caught: %s (ignoring)' % e) + except Exception as error: + logger.warning('exception caught: %s (ignoring)', error) user.post_utility_command_action(utcmd, self.__job) @@ -748,4 +746,4 @@ def rename_log_files(self, iteration): if 
os.path.exists(name): os.rename(name, name + '%d' % iteration) else: - logger.warning('cannot rename %s since it does not exist' % name) + logger.warning('cannot rename %s since it does not exist', name) diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index cbbec484..5f090b5d 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -58,7 +58,7 @@ def get_dbrelease_dir(): :return: path to DBRelease (string). """ - path = os.path.expandvars('$VO_ATLAS_SW_DIR/database/DBRelease') if 'VO_ATLAS_SW_DIR' in os.environ else os.path.expandvars('$OSG_APP/database/DBRelease') + path = os.path.join(os.environ.get('VO_ATLAS_SW_DIR', 'OSG_APP'), 'database/DBRelease') if path == "" or path.startswith('OSG_APP'): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index 87d77faf..f21dfe27 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -413,8 +413,8 @@ def get_payload_environment_variables(cmd, job_id, task_id, attempt_nr, processi def get_writetoinput_filenames(writetofile): """ Extract the writeToFile file name(s). - writeToFile='tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00:AOD.15760866._000002.pool.root.1' - -> return 'tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00' + writeToFile='tmpin_mc16_13TeV.blah:AOD.15760866._000002.pool.root.1' + -> return 'tmpin_mc16_13TeV.blah' :param writetofile: string containing file name information. :return: list of file names diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b24d9de6..55163017 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '39' # build number should be reset to '1' for every new development cycle +BUILD = '40b' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 552b54885dbe2d6d25e2a4efedc26ef0136f3e61 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 14 Jun 2021 14:17:05 +0200 Subject: [PATCH 67/96] General debug commands now supported. Added event number to job metrics (to be tested). Now creating core dump when looping is detected, added to log. 
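
The "general debug commands" referred to here are server-supplied commands that the pilot now validates and executes directly, returning their stdout with the heartbeat (see the get_debug_command()/get_debug_stdout() changes below). A minimal sketch of that flow; the allowed and forbidden lists are illustrative only, and subprocess stands in for the pilot's own execute() wrapper:

import shlex
import subprocess

ALLOWED_COMMANDS = ['du', 'ls', 'ps', 'tail', 'gdb']   # illustrative, not the pilot's actual list
FORBIDDEN_COMMANDS = ['rm', 'mv', 'cp']                # illustrative, not the pilot's actual list

def run_debug_command(cmd):
    """Sketch: validate a server-supplied debug command, run it and return its stdout."""
    base = cmd.split(' ')[0]
    if ';' in cmd or base in FORBIDDEN_COMMANDS or base not in ALLOWED_COMMANDS:
        return ''  # rejected, nothing to report back
    completed = subprocess.run(shlex.split(cmd), capture_output=True, text=True)
    return completed.stdout
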
--- PILOTVERSION | 2 +- pilot/control/job.py | 124 +++++++++++++-------------------- pilot/control/payload.py | 2 +- pilot/user/atlas/jobmetrics.py | 82 +++++++++++++++++++--- pilot/util/auxiliary.py | 55 ++++++++++++++- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 17 +++++ pilot/util/loopingjob.py | 32 ++++++++- pilot/util/processes.py | 26 ------- 9 files changed, 226 insertions(+), 116 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6fca09e1..abaf199a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.40b \ No newline at end of file +2.12.1.40 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 94b1c94c..58b6c84e 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -33,7 +33,7 @@ from pilot.util import https from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, get_pilot_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - is_python3, show_memory_usage, has_instruction_sets + is_python3, show_memory_usage, has_instruction_sets, locate_core_file from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -50,7 +50,7 @@ from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process, get_pid_from_command, kill_processes +from pilot.util.processes import cleanup, threads_aborted, kill_process, kill_processes from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp @@ -158,14 +158,14 @@ def verify_error_code(job): """ if job.piloterrorcode == 0 and len(job.piloterrorcodes) > 0: - logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s' % str(job.piloterrorcodes)) + logger.warning('piloterrorcode set to first piloterrorcodes list entry: %s', str(job.piloterrorcodes)) job.piloterrorcode = job.piloterrorcodes[0] if job.piloterrorcode != 0 and job.is_analysis(): if errors.is_recoverable(code=job.piloterrorcode): job.piloterrorcode = -abs(job.piloterrorcode) job.state = 'failed' - logger.info('failed user job is recoverable (error code=%s)' % job.piloterrorcode) + logger.info('failed user job is recoverable (error code=%s)', job.piloterrorcode) else: logger.info('failed user job is not recoverable') else: @@ -184,8 +184,6 @@ def get_proper_state(job, state): :return: valid server state (string). 
""" - logger.debug('state=%s' % state) - logger.debug('serverstate=%s' % job.serverstate) if job.serverstate == "finished" or job.serverstate == "failed": pass elif job.serverstate == "" and state != "finished" and state != "failed": @@ -194,7 +192,6 @@ def get_proper_state(job, state): job.serverstate = state else: job.serverstate = 'running' - logger.debug('serverstate=%s' % job.serverstate) return job.serverstate @@ -219,7 +216,7 @@ def publish_harvester_reports(state, args, data, job, final): # publish work report if not publish_work_report(data, path): - logger.debug('failed to write to workerAttributesFile %s' % path) + logger.debug('failed to write to workerAttributesFile %s', path) return False # check if we are in final state then write out information for output files @@ -227,9 +224,9 @@ def publish_harvester_reports(state, args, data, job, final): # Use the job information to write Harvester event_status.dump file event_status_file = get_event_status_file(args) if publish_stageout_files(job, event_status_file): - logger.debug('wrote log and output files to file %s' % event_status_file) + logger.debug('wrote log and output files to file %s', event_status_file) else: - logger.warning('could not write log and output files to file %s' % event_status_file) + logger.warning('could not write log and output files to file %s', event_status_file) return False # publish job report @@ -258,8 +255,8 @@ def write_heartbeat_to_file(data): path = os.path.join(os.environ.get('PILOT_HOME'), config.Pilot.heartbeat_message) if write_json(path, data): - logger.debug('heartbeat dictionary: %s' % data) - logger.debug('wrote heartbeat to file %s' % path) + logger.debug('heartbeat dictionary: %s', data) + logger.debug('wrote heartbeat to file %s', path) return True else: return False @@ -289,7 +286,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if state == 'finished' or state == 'failed' or state == 'holding': final = True os.environ['SERVER_UPDATE'] = SERVER_UPDATE_UPDATING - logger.info('job %s has %s - %s final server update' % (job.jobid, state, tag)) + logger.info('job %s has %s - %s final server update', job.jobid, state, tag) # make sure that job.state is 'failed' if there's a set error code if job.piloterrorcode or job.piloterrorcodes: @@ -301,7 +298,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) verify_error_code(job) else: final = False - logger.info('job %s has state \'%s\' - %s heartbeat' % (job.jobid, state, tag)) + logger.info('job %s has state \'%s\' - %s heartbeat', job.jobid, state, tag) # build the data structure needed for getJob, updateJob data = get_data_structure(job, state, args, xml=xml, metadata=metadata) @@ -323,7 +320,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt = 0 done = False while attempt < max_attempts and not done: - logger.info('job update attempt %d/%d' % (attempt + 1, max_attempts)) + logger.info('job update attempt %d/%d', attempt + 1, max_attempts) # get the URL for the PanDA server from pilot options or from config pandaserver = get_panda_server(args.url, args.port) @@ -334,8 +331,8 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) attempt += 1 time_after = int(time.time()) - logger.info('server updateJob request completed in %ds for job %s' % (time_after - time_before, job.jobid)) - logger.info("server responded with: res = %s" % str(res)) + logger.info('server updateJob request completed in %ds for job 
%s', time_after - time_before, job.jobid) + logger.info("server responded with: res = %s", str(res)) show_memory_usage() @@ -351,9 +348,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) logger.info('skipping job update for fake test job') return True - except Exception as e: - logger.warning('exception caught while sending https request: %s' % e) - logger.warning('possibly offending data: %s' % data) + except Exception as error: + logger.warning('exception caught while sending https request: %s', error) + logger.warning('possibly offending data: %s', data) pass if final: @@ -400,7 +397,7 @@ def get_job_status_from_server(job_id, url, port): # open connection ret = https.request('{pandaserver}/server/panda/getStatus'.format(pandaserver=pandaserver), data=data) response = ret[1] - logger.info("response: %s" % str(response)) + logger.info("response: %s", str(response)) if response: try: # decode the response @@ -410,21 +407,21 @@ def get_job_status_from_server(job_id, url, port): status = response['status'] # e.g. 'holding' attempt_nr = int(response['attemptNr']) # e.g. '0' status_code = int(response['StatusCode']) # e.g. '0' - except Exception as e: + except Exception as error: logger.warning( - "exception: dispatcher did not return allowed values: %s, %s" % (str(ret), e)) + "exception: dispatcher did not return allowed values: %s, %s", str(ret), error) status = "unknown" attempt_nr = -1 status_code = 20 else: - logger.debug('server job status=%s, attempt_nr=%d, status_code=%d' % (status, attempt_nr, status_code)) + logger.debug('server job status=%s, attempt_nr=%d, status_code=%d', status, attempt_nr, status_code) else: - logger.warning("dispatcher did not return allowed values: %s" % str(ret)) + logger.warning("dispatcher did not return allowed values: %s", str(ret)) status = "unknown" attempt_nr = -1 status_code = 20 - except Exception as e: - logger.warning("could not interpret job status from dispatcher: %s" % e) + except Exception as error: + logger.warning("could not interpret job status from dispatcher: %s", error) status = 'unknown' attempt_nr = -1 status_code = -1 @@ -471,7 +468,7 @@ def get_panda_server(url, port): if default in pandaserver: rnd = random.choice([socket.getfqdn(vv) for vv in set([v[-1][0] for v in socket.getaddrinfo(default, 25443, socket.AF_INET)])]) pandaserver = pandaserver.replace(default, rnd) - logger.debug('updated %s to %s' % (default, pandaserver)) + logger.debug('updated %s to %s', default, pandaserver) return pandaserver @@ -494,15 +491,15 @@ def get_debug_command(cmd): try: tmp = cmd.split(' ') com = tmp[0] - except Exception as e: - logger.warning('failed to identify debug command: %s' % e) + except Exception as error: + logger.warning('failed to identify debug command: %s', error) else: if com not in allowed_commands: - logger.warning('command=%s is not in the list of allowed commands: %s' % (com, str(allowed_commands))) + logger.warning('command=%s is not in the list of allowed commands: %s', com, str(allowed_commands)) elif ';' in cmd or ';' in cmd: - logger.warning('debug command cannot contain \';\': \'%s\'' % cmd) + logger.warning('debug command cannot contain \';\': \'%s\'', cmd) elif com in forbidden_commands: - logger.warning('command=%s is not allowed' % com) + logger.warning('command=%s is not allowed', com) else: debug_mode = True debug_command = cmd @@ -531,11 +528,10 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): if ' ' in cmd and 'tobekilled' not in cmd: try: job.debug, 
job.debug_command = get_debug_command(cmd) - except Exception as e: - logger.debug('exception caught in get_debug_command(): %s' % e) + except Exception as error: + logger.debug('exception caught in get_debug_command(): %s', error) elif 'tobekilled' in cmd: - logger.info('pilot received a panda server signal to kill job %s at %s' % - (job.jobid, time_stamp())) + logger.info('pilot received a panda server signal to kill job %s at %s', job.jobid, time_stamp()) set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL) if job.pid: @@ -545,8 +541,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.debug('no pid to kill') args.abort_job.set() elif 'softkill' in cmd: - logger.info('pilot received a panda server signal to softkill job %s at %s' % - (job.jobid, time_stamp())) + logger.info('pilot received a panda server signal to softkill job %s at %s', job.jobid, time_stamp()) # event service kill instruction job.debug_command = 'softkill' elif 'debug' in cmd: @@ -558,20 +553,17 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): job.debug = False job.debug_command = 'debugoff' else: - logger.warning('received unknown server command via backchannel: %s' % cmd) + logger.warning('received unknown server command via backchannel: %s', cmd) # for testing debug mode - - - - job.debug = True - job.debug_command = 'du -dk' + # job.debug = True + # job.debug_command = 'du -sk' # job.debug_command = 'tail -30 payload.stdout' # job.debug_command = 'ls -ltr workDir' # not really tested # job.debug_command = 'ls -ltr %s' % job.workdir # job.debug_command = 'ps -ef' # job.debug_command = 'ps axo pid,ppid,pgid,args' - #job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' + # job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' def add_data_structure_ids(data, version_tag): @@ -647,13 +639,13 @@ def get_data_structure(job, state, args, xml=None, metadata=None): #data['coreCount'] = mean(job.corecounts) if job.corecounts else job.corecount if job.corecounts: _mean = mean(job.corecounts) - logger.info('mean actualcorecount: %f' % _mean) + logger.info('mean actualcorecount: %f', _mean) data['meanCoreCount'] = _mean # get the number of events, should report in heartbeat in case of preempted. 
if job.nevents != 0: data['nEvents'] = job.nevents - logger.info("total number of processed events: %d (read)" % job.nevents) + logger.info("total number of processed events: %d (read)", job.nevents) else: logger.info("payload/TRF did not report the number of read events") @@ -725,6 +717,7 @@ def get_debug_stdout(job): else: # general command, execute and return output exit_code, stdout, stderr = execute(job.debug_command) + logger.info('debug_command: %s:\n\n%s\n', job.debug_command, stdout) return stdout @@ -751,17 +744,17 @@ def get_general_command_stdout(job): containerise_general_command(job, job.infosys.queuedata.container_options, label='general', container_type='container') - except PilotException as e: - logger.warning('general containerisation threw a pilot exception: %s' % e) - except Exception as e: - logger.warning('general containerisation threw an exception: %s' % e) + except PilotException as error: + logger.warning('general containerisation threw a pilot exception: %s', error) + except Exception as error: + logger.warning('general containerisation threw an exception: %s', error) else: ec, stdout, stderr = execute(job.debug_command) - logger.debug("%s (stdout):\n\n%s\n\n" % (job.debug_command, stdout)) - logger.debug("%s (stderr):\n\n%s\n\n" % (job.debug_command, stderr)) + logger.debug("%s (stdout):\n\n%s\n\n", job.debug_command, stdout) + logger.debug("%s (stderr):\n\n%s\n\n", job.debug_command, stderr) # in case a core file was produced, locate it - path = locate_core_file(job.debug_command) if 'gdb ' in job.debug_command else '' + path = locate_core_file(cmd=job.debug_command) if 'gdb ' in job.debug_command else '' if path: # copy it to the working directory (so it will be saved in the log) try: @@ -772,27 +765,6 @@ def get_general_command_stdout(job): return stdout -def locate_core_file(debug_command): - """ - - """ - - path = None - pid = get_pid_from_command(debug_command) - if pid: - filename = 'core.%d' % pid - path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) - if os.path.exists(path): - logger.debug('found core file at: %s' % path) - - else: - logger.debug('did not find %s in %s' % (filename, path)) - else: - logger.warning('cannot locate core file since pid could not be extracted from debug command') - - return path - - def get_ls(debug_command, workdir): """ Return the requested ls debug command. 
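
The bulk of the hunks above convert eager %-style string interpolation inside logging calls into lazy logging arguments. A minimal sketch of the difference, written outside any pilot module purely for illustration:

    import logging
    logger = logging.getLogger(__name__)

    state = 'running'
    # eager: the message is rendered even when DEBUG logging is disabled
    logger.debug('serverstate=%s' % state)
    # lazy (the form used throughout these hunks): formatting is deferred
    # until the record is actually emitted
    logger.debug('serverstate=%s', state)
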
diff --git a/pilot/control/payload.py b/pilot/control/payload.py index f6bef60e..61cb033c 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -371,7 +371,7 @@ def perform_initial_payload_error_analysis(job, exit_code): logger.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) else: # check if core dumps exist, if so remove them and return True - if remove_core_dumps(job.workdir): + if remove_core_dumps(job.workdir) and not job.debug: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) else: logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 20b5d31a..d0802040 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -5,16 +5,18 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.api import analytics from pilot.util.jobmetrics import get_job_metrics_entry +from pilot.util.filehandling import find_last_line from .cpu import get_core_count from .common import get_db_info, get_resimevents from .utilities import get_memory_monitor_output_filename import os +import re import logging logger = logging.getLogger(__name__) @@ -31,7 +33,7 @@ def get_job_metrics_string(job): # report core count (will also set corecount in job object) corecount = get_core_count(job) - logger.debug('job definition core count: %d' % corecount) + logger.debug('job definition core count: %d', corecount) #if corecount is not None and corecount != "NULL" and corecount != 'null': # job_metrics += get_job_metrics_entry("coreCount", corecount) @@ -69,14 +71,32 @@ def get_job_metrics_string(job): if max_space > zero: job_metrics += get_job_metrics_entry("workDirSize", max_space) else: - logger.info("will not add max space = %d B to job metrics" % max_space) + logger.info("will not add max space = %d B to job metrics", max_space) # get analytics data - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + job_metrics = add_analytics_data(job_metrics, job.workdir, job.state) + + # extract event number from file and add to job metrics if it exists + job_metrics = add_event_number(job_metrics, job.workdir) + + return job_metrics + + +def add_analytics_data(job_metrics, workdir, state): + """ + Add the memory leak+chi2 analytics data to the job metrics. + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :param state: job state (string). + :return: updated job metrics (string). + """ + + path = os.path.join(workdir, get_memory_monitor_output_filename()) if os.path.exists(path): client = analytics.Analytics() # do not include tails on final update - tails = False if (job.state == "finished" or job.state == "failed" or job.state == "holding") else True + tails = False if (state == "finished" or state == "failed" or state == "holding") else True data = client.get_fitted_data(path, tails=tails) slope = data.get("slope", "") chi2 = data.get("chi2", "") @@ -88,6 +108,28 @@ def get_job_metrics_string(job): return job_metrics +def add_event_number(job_metrics, workdir): + """ + Extract event number from file and add to job metrics if it exists + + :param job_metrics: job metrics (string). + :param workdir: work directory (string). + :return: updated job metrics (string). 
+ """ + + path = os.path.join(workdir, 'eventLoopHeartBeat.txt') + if os.path.exists(path): + last_line = find_last_line(path) + if last_line: + event_number = get_number_in_string(last_line) + if event_number: + job_metrics += get_job_metrics_entry("eventnumber", event_number) + else: + logger.debug('file %s does not exist (skip for now)', path) + + return job_metrics + + def get_job_metrics(job): """ Return a properly formatted job metrics string. @@ -109,17 +151,41 @@ def get_job_metrics(job): job_metrics = job_metrics.lstrip().rstrip() if job_metrics != "": - logger.debug('job metrics=\"%s\"' % (job_metrics)) + logger.debug('job metrics=\"%s\"', job_metrics) else: logger.debug("no job metrics (all values are zero)") # is job_metrics within allowed size? if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)" % (len(job_metrics))) + logger.warning("job_metrics out of size (%d)",len(job_metrics)) # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] job_metrics = " ".join(job_metrics.split(" ")[:-1]) - logger.warning("job_metrics has been reduced to: %s" % (job_metrics)) + logger.warning("job_metrics has been reduced to: %s", job_metrics) return job_metrics + + +def get_number_in_string(line, pattern=r'\ done\ processing\ event\ \#(\d+)\,'): + """ + Extract a number from the given string. + + E.g. file eventLoopHeartBeat.txt contains + done processing event #20166959, run #276689 22807 events read so far <<<=== + This function will return 20166959 as in int. + + :param line: line from a file (string). + :param pattern: reg ex pattern (raw string). + :return: extracted number (int). + """ + + event_number = None + match = re.search(pattern, line) + if match: + try: + event_number = int(match.group(1)) + except Exception: + pass + + return event_number diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index bd938c7b..c1a33cec 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -5,9 +5,10 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os +import re import sys from collections import Set, Mapping, deque, OrderedDict @@ -539,3 +540,55 @@ def has_instruction_sets(instruction_sets): ret += '|%s' % i.upper() if ret else i.upper() return ret + + +def locate_core_file(cmd=None, pid=None): + """ + Locate the core file produced by gdb. + + :param cmd: optional command containing pid corresponding to core file (string). + :param pid: optional pid to use with core file (core.pid) (int). + :return: path to core file (string). + """ + + path = None + if not pid and cmd: + pid = get_pid_from_command(cmd) + if pid: + filename = 'core.%d' % pid + path = os.path.join(os.environ.get('PILOT_HOME', '.'), filename) + if os.path.exists(path): + logger.debug('found core file at: %s', path) + + else: + logger.debug('did not find %s in %s', filename, path) + else: + logger.warning('cannot locate core file since pid could not be extracted from command') + + return path + + +def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): + """ + Identify an explicit process id in the given command. + + Example: + cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' + -> pid = 19114 + + :param cmd: command containing a pid (string). + :param pattern: regex pattern (raw string). + :return: pid (int). 
+ """ + + pid = None + match = re.search(pattern, cmd) + if match: + try: + pid = int(match.group(1)) + except Exception: + pid = None + else: + print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) + + return pid diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 55163017..ef399020 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '40b' # build number should be reset to '1' for every new development cycle +BUILD = '40' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 53972c30..79ccb710 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1119,3 +1119,20 @@ def locate_file(pattern): path = fname return path + + +def find_last_line(filename): + """ + Find the last line in a (not too large) file. + + :param filename: file name, full path (string). + :return: last line (string). + """ + + last_line = "" + with open(filename) as f: + for line in f: + pass + last_line = line + + return last_line diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index e2c451a0..e45056de 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -8,10 +8,10 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 from pilot.common.errorcodes import ErrorCodes -from pilot.util.auxiliary import whoami, set_pilot_state, cut_output +from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file from pilot.util.config import config from pilot.util.container import execute -from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list +from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list, copy from pilot.util.parameters import convert_to_int from pilot.util.processes import kill_processes from pilot.util.timing import time_stamp @@ -64,6 +64,10 @@ def looping_job(job, mt): logger.info('looping limit: %d s' % looping_limit) if ct - time_last_touched > looping_limit: try: + # first produce core dump and copy it + create_core_dump(pid=job.pid, workdir=job.workdir) + # set debug mode to prevent core file from being removed before log creation + job.debug = True kill_looping_job(job) except Exception as e: logger.warning('exception caught: %s' % e) @@ -73,6 +77,30 @@ def looping_job(job, mt): return exit_code, diagnostics +def create_core_dump(pid=None, workdir=None): + """ + Create core dump and copy it to work directory + """ + + if not pid or not workdir: + logger.warning('cannot create core file since pid or workdir is unknown') + return + + cmd = 'gdb --pid %d -ex \'generate-core-file\'' % pid + exit_code, stdout, stderr = execute(cmd) + if not exit_code: + path = locate_core_file(pid=pid) + if path: + try: + copy(path, workdir) + except Exception as error: + logger.warning('failed to copy core file: %s', error) + else: + logger.debug('copied core dump to workdir') + + else: + logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr) + def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. 
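
The hunks above add locate_core_file() and get_pid_from_command() to pilot/util/auxiliary.py (the duplicate copy of get_pid_from_command() in pilot/util/processes.py is dropped in the following hunk), and the looping-job killer now calls create_core_dump(pid=job.pid, workdir=job.workdir) and sets job.debug = True so that the copied core file survives until the log is created. A rough usage sketch of the relocated helpers, with an invented pid and path for illustration only:

    from pilot.util.auxiliary import get_pid_from_command, locate_core_file

    cmd = "gdb --pid 19114 -ex 'generate-core-file'"
    pid = get_pid_from_command(cmd)    # extracts 19114 from the gdb command
    path = locate_core_file(pid=pid)   # looks for $PILOT_HOME/core.19114
    if path:
        print('found core file: %s' % path)
    else:
        print('no core file was produced')
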
diff --git a/pilot/util/processes.py b/pilot/util/processes.py index e5b94ae8..6ee9b84d 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -757,29 +757,3 @@ def is_child(pid, pandaid_pid, dictionary): else: # try another pid return is_child(ppid, pandaid_pid, dictionary) - - -def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): - """ - Identify an explicit process id in the given command. - - Example: - cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' - -> pid = 19114 - - :param cmd: command containing a pid (string). - :param pattern: regex pattern (raw string). - :return: pid (int). - """ - - pid = None - match = re.search(pattern, cmd) - if match: - try: - pid = int(match.group(1)) - except Exception: - pid = None - else: - print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) - - return pid From 29a819adafaf33db8855c1a71830f8917d65f82e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 14 Jun 2021 14:31:35 +0200 Subject: [PATCH 68/96] Added prmon to list with unwanted files in looping job killer --- pilot/user/generic/loopingjob_definitions.py | 1 + pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/user/generic/loopingjob_definitions.py b/pilot/user/generic/loopingjob_definitions.py index ad392257..9f64b65c 100644 --- a/pilot/user/generic/loopingjob_definitions.py +++ b/pilot/user/generic/loopingjob_definitions.py @@ -34,6 +34,7 @@ def remove_unwanted_files(workdir, files): _files = [] for _file in files: if not (workdir == _file or + "prmon" in _file or "pilotlog" in _file or ".lib.tgz" in _file or ".py" in _file or diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ef399020..273dcf6c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '40' # build number should be reset to '1' for every new development cycle +BUILD = '41' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From f53b7d5bf3edb87778fa6aa5f24d1b0ae20a0f18 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:31:54 +0200 Subject: [PATCH 69/96] Pylint updates. 
Fixed debug mode to allow 'debug,' --- PILOTVERSION | 2 +- pilot.py | 6 +- pilot/api/data.py | 6 +- pilot/control/job.py | 243 +++++++++++++++++----------------- pilot/control/payload.py | 36 ++--- pilot/user/atlas/dbrelease.py | 48 +++---- pilot/user/atlas/setup.py | 36 +++-- pilot/util/auxiliary.py | 8 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 114 ++++++++-------- pilot/util/harvester.py | 4 +- pilot/util/loopingjob.py | 30 ++--- pilot/util/monitoring.py | 83 ++++++------ pilot/util/processes.py | 101 +++++++------- 14 files changed, 358 insertions(+), 361 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index abaf199a..fb45e883 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.40 \ No newline at end of file +2.12.1.44 \ No newline at end of file diff --git a/pilot.py b/pilot.py index 82fc4d99..c3fc0e7c 100755 --- a/pilot.py +++ b/pilot.py @@ -363,10 +363,10 @@ def create_main_work_dir(args): try: # create the main PanDA Pilot work directory mkdirs(mainworkdir) - except Exception as e: + except PilotException as error: # print to stderr since logging has not been established yet - print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, e), file=sys.stderr) - exit_code = shell_exit_code(e._errorCode) + print('failed to create workdir at %s -- aborting: %s' % (mainworkdir, error), file=sys.stderr) + exit_code = shell_exit_code(error._errorCode) else: mainworkdir = getcwd() diff --git a/pilot/api/data.py b/pilot/api/data.py index 7f104b54..ccdc874d 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -272,8 +272,8 @@ def resolve_replicas(self, files, use_vp=False): try: replicas = c.list_replicas(**query) - except Exception as e: - raise PilotException("Failed to get replicas from Rucio: %s" % e, code=ErrorCodes.RUCIOLISTREPLICASFAILED) + except Exception as error: + raise PilotException("Failed to get replicas from Rucio: %s" % error, code=ErrorCodes.RUCIOLISTREPLICASFAILED) show_memory_usage() @@ -775,7 +775,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 primary_schemas = (self.direct_remoteinput_allowed_schemas if fspec.direct_access_wan and fspec.is_directaccess(ensure_replica=False) else None) xschemas = self.remoteinput_allowed_schemas - allowed_schemas = [e for e in allowed_schemas if e in xschemas] if allowed_schemas else xschemas + allowed_schemas = [schema for schema in allowed_schemas if schema in xschemas] if allowed_schemas else xschemas replica = resolve_replica(fspec, primary_schemas, allowed_schemas, domain='wan') if not replica and fspec.allow_wan: diff --git a/pilot/control/job.py b/pilot/control/job.py index 58b6c84e..11a3de6a 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -488,6 +488,10 @@ def get_debug_command(cmd): allowed_commands = ['tail', 'ls', 'ps', 'gdb', 'du'] forbidden_commands = ['rm'] + + # remove any 'debug,' command that the server might send redundantly + if ',' in cmd and 'debug' in cmd: + cmd = cmd.replace('debug,', '').replace(',debug', '') try: tmp = cmd.split(' ') com = tmp[0] @@ -784,7 +788,7 @@ def get_ls(debug_command, workdir): debug_command = debug_command.replace(path, finalpath) ec, stdout, stderr = execute(debug_command) - logger.debug("%s:\n\n%s\n\n" % (debug_command, stdout)) + logger.debug("%s:\n\n%s\n\n", debug_command, stdout) return stdout @@ -806,8 +810,8 @@ def get_requested_log_tail(debug_command, workdir): items = debug_command.split(' ') cmd = items[0] options = ' '.join(items[1:]) - logger.debug('debug command: %s' % 
cmd) - logger.debug('debug options: %s' % options) + logger.debug('debug command: %s', cmd) + logger.debug('debug options: %s', options) # assume that the path is the last of the options; path = options.split(' ')[-1] if ' ' in options else options @@ -816,13 +820,13 @@ def get_requested_log_tail(debug_command, workdir): # find all files with the given pattern and pick the latest updated file (if several) files = glob(fullpath) if files: - logger.info('files found: %s' % str(files)) + logger.info('files found: %s', str(files)) _tail = get_latest_log_tail(files) else: - logger.warning('did not find \'%s\' in path %s' % (path, fullpath)) + logger.warning('did not find \'%s\' in path %s', path, fullpath) if _tail: - logger.debug('tail =\n\n%s\n\n' % _tail) + logger.debug('tail =\n\n%s\n\n', _tail) return _tail @@ -840,7 +844,7 @@ def add_error_codes(data, job): pilot_error_code = job.piloterrorcode pilot_error_codes = job.piloterrorcodes if pilot_error_codes != []: - logger.warning('pilotErrorCodes = %s (will report primary/first error code)' % str(pilot_error_codes)) + logger.warning('pilotErrorCodes = %s (will report primary/first error code)', str(pilot_error_codes)) data['pilotErrorCode'] = pilot_error_codes[0] else: data['pilotErrorCode'] = pilot_error_code @@ -849,7 +853,7 @@ def add_error_codes(data, job): pilot_error_diag = job.piloterrordiag pilot_error_diags = job.piloterrordiags if pilot_error_diags != []: - logger.warning('pilotErrorDiags = %s (will report primary/first error diag)' % str(pilot_error_diags)) + logger.warning('pilotErrorDiags = %s (will report primary/first error diag)', str(pilot_error_diags)) data['pilotErrorDiag'] = pilot_error_diags[0] else: data['pilotErrorDiag'] = pilot_error_diag @@ -874,7 +878,7 @@ def get_cpu_consumption_time(cpuconsumptiontime): except Exception: constime = None if constime and constime > 10 ** 9: - logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)" % constime) + logger.warning("unrealistic cpuconsumptiontime: %d (reset to -1)", constime) constime = -1 return constime @@ -903,7 +907,7 @@ def add_timing_and_extracts(data, job, state, args): user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 extracts = user.get_log_extracts(job, state) if extracts != "": - logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]' % extracts) + logger.warning('\nXXXXXXXXXXXXXXXXXXXXX[begin log extracts]\n%s\nXXXXXXXXXXXXXXXXXXXXX[end log extracts]', extracts) data['pilotLog'] = extracts[:1024] data['endTime'] = time.time() @@ -924,8 +928,8 @@ def add_memory_info(data, workdir, name=""): try: utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) - except Exception as e: - logger.info('memory information not available: %s' % e) + except Exception as error: + logger.info('memory information not available: %s', error) pass @@ -947,16 +951,15 @@ def remove_pilot_logs_from_list(list_of_files): config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot'] - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) to_be_removed = [] new_list_of_files = [] for filename in list_of_files: - if os.path.basename(filename) not in 
to_be_removed and '/pilot/' not in filename: + if os.path.basename(filename) not in to_be_removed and '/pilot/' not in filename and 'prmon' not in filename: new_list_of_files.append(filename) - #logger.debug('list_of_files=%s' % str(new_list_of_files)) return new_list_of_files @@ -975,7 +978,7 @@ def get_payload_log_tail(workdir): list_of_files = remove_pilot_logs_from_list(list_of_files) if not list_of_files: - logger.info('no log files were found (will use default %s)' % config.Payload.payloadstdout) + logger.info('no log files were found (will use default %s)', config.Payload.payloadstdout) list_of_files = [os.path.join(workdir, config.Payload.payloadstdout)] return get_latest_log_tail(list_of_files) @@ -992,13 +995,13 @@ def get_latest_log_tail(files): try: latest_file = max(files, key=os.path.getmtime) - logger.info('tail of file %s will be added to heartbeat' % latest_file) + logger.info('tail of file %s will be added to heartbeat', latest_file) # now get the tail of the found log file and protect against potentially large tails stdout_tail = latest_file + "\n" + tail(latest_file) stdout_tail = stdout_tail[-2048:] - except Exception as e: - logger.warning('failed to get payload stdout tail: %s' % e) + except Exception as error: + logger.warning('failed to get payload stdout tail: %s', error) return stdout_tail @@ -1024,7 +1027,7 @@ def validate(queues, traces, args): # set the environmental variable for the task id os.environ['PanDA_TaskID'] = str(job.taskid) - logger.info('processing PanDA job %s from task %s' % (job.jobid, job.taskid)) + logger.info('processing PanDA job %s from task %s', job.jobid, job.taskid) if _validate_job(job): @@ -1032,16 +1035,16 @@ def validate(queues, traces, args): os.setpgrp() job_dir = os.path.join(args.mainworkdir, 'PanDA_Pilot-%s' % job.jobid) - logger.debug('creating job working directory: %s' % job_dir) + logger.debug('creating job working directory: %s', job_dir) try: os.mkdir(job_dir) os.chmod(job_dir, 0o770) job.workdir = job_dir - except Exception as e: - logger.debug('cannot create working directory: %s' % str(e)) + except Exception as error: + logger.debug('cannot create working directory: %s', error) traces.pilot['error_code'] = errors.MKDIR job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(traces.pilot['error_code']) - job.piloterrordiag = e + job.piloterrordiag = error put_in_queue(job, queues.failed_jobs) break else: @@ -1051,15 +1054,15 @@ def validate(queues, traces, args): # # stream the job object to file # job_dict = job.to_json() # write_json(os.path.join(job.workdir, 'job.json'), job_dict) -# except Exception as e: -# logger.debug('exception caught: %s' % e) +# except Exception as error: +# logger.debug('exception caught: %s', error) # else: # try: # _job_dict = read_json(os.path.join(job.workdir, 'job.json')) # job_dict = loads(_job_dict) # _job = JobData(job_dict, use_kmap=False) -# except Exception as e: -# logger.warning('exception caught: %s' % e) +# except Exception as error: +# logger.warning('exception caught: %s', error) create_symlink(from_path='../%s' % config.Pilot.pilotlog, to_path=os.path.join(job_dir, config.Pilot.pilotlog)) @@ -1068,8 +1071,8 @@ def validate(queues, traces, args): utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: utilities.precleanup() - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) # store the PanDA job id for the 
wrapper to pick up store_jobid(job.jobid, args.sourcedir) @@ -1080,7 +1083,7 @@ def validate(queues, traces, args): # make sure that ctypes is available (needed at the end by orphan killer) verify_ctypes(queues, job) else: - logger.debug('Failed to validate job=%s' % job.jobid) + logger.debug('Failed to validate job=%s', job.jobid) put_in_queue(job, queues.failed_jobs) # proceed to set the job_aborted flag? @@ -1104,11 +1107,11 @@ def verify_ctypes(queues, job): try: import ctypes - except Exception as e: - diagnostics = 'ctypes python module could not be imported: %s' % e + except Exception as error: + diagnostics = 'ctypes python module could not be imported: %s' % error logger.warning(diagnostics) #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) - #logger.debug('Failed to validate job=%s' % job.jobid) + #logger.debug('Failed to validate job=%s', job.jobid) #put_in_queue(job, queues.failed_jobs) else: logger.debug('ctypes python module imported') @@ -1141,7 +1144,7 @@ def delayed_space_check(queues, traces, args, job): traces.pilot['error_code'] = errors.NOLOCALSPACE # set the corresponding error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) - logger.debug('Failed to validate job=%s' % job.jobid) + logger.debug('Failed to validate job=%s', job.jobid) put_in_queue(job, queues.failed_jobs) else: put_in_queue(job, queues.validated_jobs) @@ -1177,10 +1180,10 @@ def store_jobid(jobid, init_dir): path = os.path.join(os.path.join(init_dir, 'pilot2'), config.Pilot.jobid_file) path = path.replace('pilot2/pilot2', 'pilot2') # dirty fix for bad paths mode = 'a' if os.path.exists(path) else 'w' - logger.debug('path=%s mode=%s' % (path, mode)) + logger.debug('path=%s mode=%s', path, mode) write_file(path, "%s\n" % str(jobid), mode=mode, mute=False) - except Exception as e: - logger.warning('exception caught while trying to store job id: %s' % e) + except Exception as error: + logger.warning('exception caught while trying to store job id: %s', error) def create_data_payload(queues, traces, args): @@ -1331,7 +1334,7 @@ def get_dispatcher_dictionary(args): taskid = get_task_id() if taskid != "" and args.allow_same_user: data['taskID'] = taskid - logger.info("will download a new job belonging to task id: %s" % (data['taskID'])) + logger.info("will download a new job belonging to task id: %s", data['taskID']) if args.resource_type != "": data['resourceType'] = args.resource_type @@ -1401,7 +1404,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge maximum_getjob_requests = 60 if harvester else max_getjob_requests # 1 s apart (if harvester) if getjob_requests > int(maximum_getjob_requests): - logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot' % maximum_getjob_requests) + logger.warning('reached maximum number of getjob requests (%s) -- will abort pilot', maximum_getjob_requests) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ -1415,7 +1418,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge return False if (currenttime - starttime > timefloor) and jobnumber > 0: - logger.warning("the pilot has run out of time (timefloor=%d has been passed)" % timefloor) + logger.warning("the pilot has run out of time (timefloor=%d has been passed)", timefloor) # use singleton: # instruct the pilot to wrap up quickly os.environ['PILOT_WRAP_UP'] = 'QUICKLY' @@ 
-1423,8 +1426,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # timefloor not relevant for the first job if jobnumber > 0: - logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job' % - (timefloor, currenttime - starttime)) + logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job',timefloor, currenttime - starttime) if harvester and jobnumber > 0: # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job @@ -1454,7 +1456,7 @@ def getjob_server_command(url, port): if not findall(port_pattern, url): url = url + ':%s' % port else: - logger.debug('URL already contains port: %s' % url) + logger.debug('URL already contains port: %s', url) else: url = config.Pilot.pandaserver if url == "": @@ -1482,7 +1484,7 @@ def get_job_definition_from_file(path, harvester): if is_json(path): job_definition_list = parse_job_definition_file(path) if not job_definition_list: - logger.warning('no jobs were found in Harvester job definitions file: %s' % path) + logger.warning('no jobs were found in Harvester job definitions file: %s', path) return {} else: # remove the job definition file from the original location, place a renamed copy in the pilot dir @@ -1498,11 +1500,11 @@ def get_job_definition_from_file(path, harvester): with open(path, 'r') as jobdatafile: response = jobdatafile.read() if len(response) == 0: - logger.fatal('encountered empty job definition file: %s' % path) + logger.fatal('encountered empty job definition file: %s', path) res = None # this is a fatal error, no point in continuing as the file will not be replaced else: # parse response message - # logger.debug('%s:\n\n%s\n\n' % (path, response)) + # logger.debug('%s:\n\n%s\n\n', path, response) try: from urlparse import parse_qsl # Python 2 except Exception: @@ -1534,7 +1536,7 @@ def get_job_definition_from_server(args): cmd = getjob_server_command(args.url, args.port) if cmd != "": - logger.info('executing server command: %s' % cmd) + logger.info('executing server command: %s', cmd) res = https.request(cmd, data=data) return res @@ -1585,7 +1587,7 @@ def get_job_definition(args): logger.info('will use a fake PanDA job') res = get_fake_job() elif os.path.exists(path): - logger.info('will read job definition from file %s' % path) + logger.info('will read job definition from file %s', path) res = get_job_definition_from_file(path, args.harvester) else: if args.harvester and args.harvester_submitmode.lower() == 'push': @@ -1733,7 +1735,7 @@ def get_fake_job(input=True): 'taskID': 'NULL', 'logFile': '%s.job.log.tgz' % job_name} else: - logger.warning('unknown test job type: %s' % config.Pilot.testjobtype) + logger.warning('unknown test job type: %s', config.Pilot.testjobtype) if res: if not input: @@ -1747,7 +1749,7 @@ def get_fake_job(input=True): if config.Pilot.testtransfertype == "NULL" or config.Pilot.testtransfertype == 'direct': res['transferType'] = config.Pilot.testtransfertype else: - logger.warning('unknown test transfer type: %s (ignored)' % config.Pilot.testtransfertype) + logger.warning('unknown test transfer type: %s (ignored)', config.Pilot.testtransfertype) if config.Pilot.testjobcommand == 'sleep': res['transformation'] = 'sleep' @@ -1826,7 +1828,7 @@ def retrieve(queues, traces, args): # noqa: C901 # get a job definition from a source (file or server) res = get_job_definition(args) - logger.info('job definition = %s' % str(res)) + logger.info('job 
definition = %s', str(res)) if res is None: logger.fatal('fatal error in job download loop - cannot continue') @@ -1839,7 +1841,7 @@ def retrieve(queues, traces, args): # noqa: C901 if not res: delay = get_job_retrieval_delay(args.harvester) if not args.harvester: - logger.warning('did not get a job -- sleep %d s and repeat' % delay) + logger.warning('did not get a job -- sleep %d s and repeat', delay) for i in range(delay): if args.graceful_stop.is_set(): break @@ -1848,7 +1850,7 @@ def retrieve(queues, traces, args): # noqa: C901 # it seems the PanDA server returns StatusCode as an int, but the aCT returns it as a string # note: StatusCode keyword is not available in job definition files from Harvester (not needed) if 'StatusCode' in res and res['StatusCode'] != '0' and res['StatusCode'] != 0: - logger.warning('did not get a job -- sleep 60s and repeat -- status: %s' % res['StatusCode']) + logger.warning('did not get a job -- sleep 60s and repeat -- status: %s', res['StatusCode']) for i in range(60): if args.graceful_stop.is_set(): break @@ -1864,11 +1866,11 @@ def retrieve(queues, traces, args): # noqa: C901 #try: # job_status, job_attempt_nr, job_status_code = get_job_status_from_server(job.jobid, args.url, args.port) # if job_status == "running": - # pilot_error_diag = "job %s is already running elsewhere - aborting" % (job.jobid) + # pilot_error_diag = "job %s is already running elsewhere - aborting" % job.jobid # logger.warning(pilot_error_diag) # raise JobAlreadyRunning(pilot_error_diag) - #except Exception as e: - # logger.warning("%s" % e) + #except Exception as error: + # logger.warning("%s", error) # write time stamps to pilot timing file # note: PILOT_POST_GETJOB corresponds to START_TIME in Pilot 1 add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) @@ -1941,8 +1943,8 @@ def create_job(dispatcher_response, queue): #job.workdir = os.getcwd() - logger.info('received job: %s (sleep until the job has finished)' % job.jobid) - logger.info('job details: \n%s' % job) + logger.info('received job: %s (sleep until the job has finished)', job.jobid) + logger.info('job details: \n%s', job) # payload environment wants the PANDAID to be set, also used below os.environ['PANDAID'] = job.jobid @@ -1968,13 +1970,13 @@ def has_job_completed(queues, args): else: make_job_report(job) cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') - logger.debug('%s:\n' % cmd) + logger.debug('%s:\n', cmd) ec, stdout, stderr = execute(cmd) logger.debug(stdout) queue_report(queues) job.reset_errors() - logger.info("job %s has completed (purged errors)" % job.jobid) + logger.info("job %s has completed (purged errors)", job.jobid) # cleanup of any remaining processes if job.pid: @@ -1987,14 +1989,14 @@ def has_job_completed(queues, args): #finished_queue_snapshot = list(queues.finished_jobs.queue) #peek = [obj for obj in finished_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (finished)" % jobid) + # logger.info("job %s has completed (finished)", jobid) # return True # is there anything in the failed_jobs queue? 
#failed_queue_snapshot = list(queues.failed_jobs.queue) #peek = [obj for obj in failed_queue_snapshot if jobid == obj.jobid] #if peek: - # logger.info("job %s has completed (failed)" % jobid) + # logger.info("job %s has completed (failed)", jobid) # return True return False @@ -2021,31 +2023,31 @@ def get_job_from_queue(queues, state): else: # make sure that state=failed set_pilot_state(job=job, state=state) - logger.info("job %s has state=%s" % (job.jobid, job.state)) + logger.info("job %s has state=%s", job.jobid, job.state) return job -def is_queue_empty(queues, q): +def is_queue_empty(queues, queue): """ Check if the given queue is empty (without pulling). :param queues: pilot queues object. - :param q: queue name (string). + :param queue: queue name (string). :return: True if queue is empty, False otherwise """ status = False - if q in queues._fields: - _q = getattr(queues, q) - jobs = list(_q.queue) + if queue in queues._fields: + _queue = getattr(queues, queue) + jobs = list(_queue.queue) if len(jobs) > 0: - logger.info('queue %s not empty: found %d job(s)' % (q, len(jobs))) + logger.info('queue %s not empty: found %d job(s)', queue, len(jobs)) else: - logger.info('queue %s is empty' % q) + logger.info('queue %s is empty', queue) status = True else: - logger.warning('queue %s not present in %s' % (q, queues._fields)) + logger.warning('queue %s not present in %s', queue, queues._fields) return status @@ -2072,7 +2074,7 @@ def order_log_transfer(queues, job): while n < nmax: # refresh the log_transfer since it might have changed log_transfer = job.get_status('LOG_TRANSFER') - logger.info('waiting for log transfer to finish (#%d/#%d): %s' % (n + 1, nmax, log_transfer)) + logger.info('waiting for log transfer to finish (#%d/#%d): %s', n + 1, nmax, log_transfer) if is_queue_empty(queues, 'data_out') and \ (log_transfer == LOG_TRANSFER_DONE or log_transfer == LOG_TRANSFER_FAILED): # set in data component logger.info('stage-out of log has completed') @@ -2083,7 +2085,7 @@ def order_log_transfer(queues, job): time.sleep(2) n += 1 - logger.info('proceeding with server update (n=%d)' % n) + logger.info('proceeding with server update (n=%d)', n) def wait_for_aborted_job_stageout(args, queues, job): @@ -2101,9 +2103,9 @@ def wait_for_aborted_job_stageout(args, queues, job): time_since_kill = get_time_since('1', PILOT_KILL_SIGNAL, args) was_killed = was_pilot_killed(args.timing) if was_killed: - logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished' % time_since_kill) - except Exception as e: - logger.warning('exception caught: %s' % e) + logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished', time_since_kill) + except Exception as error: + logger.warning('exception caught: %s', error) time_since_kill = 60 else: if time_since_kill > 60 or time_since_kill < 0: # fail-safe @@ -2113,7 +2115,7 @@ def wait_for_aborted_job_stageout(args, queues, job): # if stage-out has not finished, we need to wait (less than two minutes or the batch system will issue # a hard SIGKILL) max_wait_time = 2 * 60 - time_since_kill - 5 - logger.debug('using max_wait_time = %d s' % max_wait_time) + logger.debug('using max_wait_time = %d s', max_wait_time) t0 = time.time() while time.time() - t0 < max_wait_time: if job in queues.finished_data_out.queue or job in queues.failed_data_out.queue: @@ -2180,14 +2182,14 @@ def queue_monitor(queues, traces, args): # noqa: C901 while i < imax and os.environ.get('PILOT_WRAP_UP', '') == 'NORMAL': 
job = get_finished_or_failed_job(args, queues) if job: - logger.debug('returned job has state=%s' % job.state) + logger.debug('returned job has state=%s', job.state) #if job.state == 'failed': # logger.warning('will abort failed job (should prepare for final server update)') break i += 1 state = get_pilot_state() # the job object is not available, but the state is also kept in PILOT_JOB_STATE if state != 'stage-out': - # logger.info("no need to wait since job state=\'%s\'" % state) + # logger.info("no need to wait since job state=\'%s\'", state) break pause_queue_monitor(1) if not abort_thread else pause_queue_monitor(10) @@ -2197,7 +2199,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 completed_jobids = queues.completed_jobids.queue if queues.completed_jobids else [] if job and job.jobid not in completed_jobids: - logger.info("preparing for final server update for job %s in state=\'%s\'" % (job.jobid, job.state)) + logger.info("preparing for final server update for job %s in state=\'%s\'", job.jobid, job.state) if args.job_aborted.is_set(): # wait for stage-out to finish for aborted job @@ -2214,7 +2216,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 logger.warning('failed to dequeue job: queue is empty (did job fail before job monitor started?)') make_job_report(job) else: - logger.debug('job %s was dequeued from the monitored payloads queue' % _job.jobid) + logger.debug('job %s was dequeued from the monitored payloads queue', _job.jobid) # now ready for the next job (or quit) put_in_queue(job.jobid, queues.completed_jobids) @@ -2250,8 +2252,8 @@ def update_server(job, args): metadata = user.get_metadata(job.workdir) try: user.update_server(job) - except Exception as e: - logger.warning('exception caught in update_server(): %s' % e) + except Exception as error: + logger.warning('exception caught in update_server(): %s', error) if job.fileinfo: send_state(job, args, job.state, xml=dumps(job.fileinfo), metadata=metadata) else: @@ -2266,7 +2268,7 @@ def pause_queue_monitor(delay): :return: """ - logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s' % delay) + logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s', delay) time.sleep(delay) @@ -2323,8 +2325,8 @@ def get_heartbeat_period(debug=False): try: return int(config.Pilot.heartbeat if not debug else config.Pilot.debug_heartbeat) - except Exception as e: - logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)' % e) + except Exception as error: + logger.warning('bad config data for heartbeat period: %s (will use default 1800 s)', error) return 1800 @@ -2338,7 +2340,7 @@ def check_for_abort_job(args, caller=''): """ abort_job = False if args.abort_job.is_set(): - logger.warning('%s detected an abort_job request (signal=%s)' % (caller, args.signal)) + logger.warning('%s detected an abort_job request (signal=%s)', caller, args.signal) logger.warning('in case pilot is running more than one job, all jobs will be aborted') abort_job = True @@ -2371,8 +2373,7 @@ def interceptor(queues, traces, args): jobs = queues.monitored_payloads.queue if jobs: for i in range(len(jobs)): - - logger.info('interceptor loop %d: looking for communication file' % n) + logger.info('interceptor loop %d: looking for communication file', n) time.sleep(30) n += 1 @@ -2439,8 +2440,8 @@ def job_monitor(queues, traces, args): # noqa: C901 # note: when sending a state change to the server, the server might respond with 'tobekilled' 
try: jobs[i] - except Exception as e: - logger.warning('detected stale jobs[i] object in job_monitor: %s' % e) + except Exception as error: + logger.warning('detected stale jobs[i] object in job_monitor: %s', error) else: if jobs[i].state == 'failed': logger.warning('job state is \'failed\' - order log transfer and abort job_monitor() (1)') @@ -2464,9 +2465,9 @@ def job_monitor(queues, traces, args): # noqa: C901 peeking_time = int(time.time()) for i in range(len(jobs)): current_id = jobs[i].jobid - logger.info('monitor loop #%d: job %d:%s is in state \'%s\'' % (n, i, current_id, jobs[i].state)) + logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': - logger.info('will abort job monitoring soon since job state=%s (job is still in queue)' % jobs[i].state) + logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) break # perform the monitoring tasks @@ -2482,8 +2483,8 @@ def job_monitor(queues, traces, args): # noqa: C901 else: try: fail_monitored_job(jobs[i], exit_code, diagnostics, queues, traces) - except Exception as e: - logger.warning('(1) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(1) exception caught: %s (job id=%s)', error, current_id) break # run this check again in case job_monitor_tasks() takes a long time to finish (and the job object @@ -2491,15 +2492,15 @@ def job_monitor(queues, traces, args): # noqa: C901 try: _job = jobs[i] except Exception: - logger.info('aborting job monitoring since job object (job id=%s) has expired' % current_id) + logger.info('aborting job monitoring since job object (job id=%s) has expired', current_id) break # send heartbeat if it is time (note that the heartbeat function might update the job object, e.g. 
# by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(_job, args, update_time) - except Exception as e: - logger.warning('(2) exception caught: %s (job id=%s)' % (e, current_id)) + except Exception as error: + logger.warning('(2) exception caught: %s (job id=%s)', error, current_id) break else: # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -2594,7 +2595,7 @@ def fail_monitored_job(job, exit_code, diagnostics, queues, traces): job.piloterrordiag = diagnostics traces.pilot['error_code'] = exit_code put_in_queue(job, queues.failed_payloads) - logger.info('aborting job monitoring since job state=%s' % job.state) + logger.info('aborting job monitoring since job state=%s', job.state) def make_job_report(job): @@ -2609,37 +2610,37 @@ def make_job_report(job): logger.info('') logger.info('job summary report') logger.info('--------------------------------------------------') - logger.info('PanDA job id: %s' % job.jobid) - logger.info('task id: %s' % job.taskid) + logger.info('PanDA job id: %s', job.jobid) + logger.info('task id: %s', job.taskid) n = len(job.piloterrorcodes) if n > 0: for i in range(n): - logger.info('error %d/%d: %s: %s' % (i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i])) + logger.info('error %d/%d: %s: %s', i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i]) else: logger.info('errors: (none)') if job.piloterrorcode != 0: - logger.info('pilot error code: %d' % job.piloterrorcode) - logger.info('pilot error diag: %s' % job.piloterrordiag) + logger.info('pilot error code: %d', job.piloterrorcode) + logger.info('pilot error diag: %s', job.piloterrordiag) info = "" for key in job.status: info += key + " = " + job.status[key] + " " - logger.info('status: %s' % info) + logger.info('status: %s', info) s = "" if job.is_analysis() and job.state != 'finished': s = '(user job is recoverable)' if errors.is_recoverable(code=job.piloterrorcode) else '(user job is not recoverable)' - logger.info('pilot state: %s %s' % (job.state, s)) - logger.info('transexitcode: %d' % job.transexitcode) - logger.info('exeerrorcode: %d' % job.exeerrorcode) - logger.info('exeerrordiag: %s' % job.exeerrordiag) - logger.info('exitcode: %d' % job.exitcode) - logger.info('exitmsg: %s' % job.exitmsg) - logger.info('cpuconsumptiontime: %d %s' % (job.cpuconsumptiontime, job.cpuconsumptionunit)) - logger.info('nevents: %d' % job.nevents) - logger.info('neventsw: %d' % job.neventsw) - logger.info('pid: %s' % job.pid) - logger.info('pgrp: %s' % str(job.pgrp)) - logger.info('corecount: %d' % job.corecount) - logger.info('event service: %s' % str(job.is_eventservice)) - logger.info('sizes: %s' % str(job.sizes)) + logger.info('pilot state: %s %s', job.state, s) + logger.info('transexitcode: %d', job.transexitcode) + logger.info('exeerrorcode: %d', job.exeerrorcode) + logger.info('exeerrordiag: %s', job.exeerrordiag) + logger.info('exitcode: %d', job.exitcode) + logger.info('exitmsg: %s', job.exitmsg) + logger.info('cpuconsumptiontime: %d %s', job.cpuconsumptiontime, job.cpuconsumptionunit) + logger.info('nevents: %d', job.nevents) + logger.info('neventsw: %d', job.neventsw) + logger.info('pid: %s', job.pid) + logger.info('pgrp: %s', str(job.pgrp)) + logger.info('corecount: %d', job.corecount) + logger.info('event service: %s', str(job.is_eventservice)) + logger.info('sizes: %s', str(job.sizes)) logger.info('--------------------------------------------------') logger.info('') diff 
--git a/pilot/control/payload.py b/pilot/control/payload.py index 61cb033c..33029f0c 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -8,7 +8,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2017-2018 import os @@ -64,7 +64,7 @@ def control(queues, traces, args): pass else: exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s" % (thread.name, exc_obj)) + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception # .. @@ -146,8 +146,8 @@ def _validate_payload(job): user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: status = user.validate(job) - except Exception as e: - logger.fatal('failed to execute user validate() function: %s' % e) + except Exception as error: + logger.fatal('failed to execute user validate() function: %s', error) status = False return status @@ -213,13 +213,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 #queues.monitored_payloads.put(job) put_in_queue(job, queues.monitored_payloads) - logger.info('job %s added to monitored payloads queue' % job.jobid) + logger.info('job %s added to monitored payloads queue', job.jobid) try: out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb') err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb') - except Exception as e: - logger.warning('failed to open payload stdout/err: %s' % e) + except Exception as error: + logger.warning('failed to open payload stdout/err: %s', error) out = None err = None send_state(job, args, 'starting') @@ -230,7 +230,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 break payload_executor = get_payload_executor(args, job, out, err, traces) - logger.info("Got payload executor: %s" % payload_executor) + logger.info("Got payload executor: %s", payload_executor) show_memory_usage() @@ -252,13 +252,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 0) # Python 2/3 try: user.update_output_for_hpo(job) - except Exception as e: - logger.warning('exception caught by update_output_for_hpo(): %s' % e) + except Exception as error: + logger.warning('exception caught by update_output_for_hpo(): %s', error) else: for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning('guid not set: generated guid=%s for lfn=%s', dat.guid, dat.lfn) #if traces.pilot['nr_jobs'] == 1: # logger.debug('faking job failure in first multi-job') @@ -275,8 +275,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: exit_code_interpret = user.interpret(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) #exit_code_interpret = -1 job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.INTERNALPILOTPROBLEM) @@ -298,8 +298,8 @@ def execute_payloads(queues, traces, args): # noqa: C901 except queue.Empty: continue - except Exception as e: - logger.fatal('execute payloads caught an exception (cannot recover): %s, %s' % 
(e, traceback.format_exc())) + except Exception as error: + logger.fatal('execute payloads caught an exception (cannot recover): %s, %s', error, traceback.format_exc()) if job: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONEXCEPTION) #queues.failed_payloads.put(job) @@ -346,7 +346,7 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" ec = 0 - logger.warning('main payload execution returned non-zero exit code: %d' % exit_code) + logger.warning('main payload execution returned non-zero exit code: %d', exit_code) stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": msg = errors.extract_stderr_error(stderr) @@ -357,7 +357,7 @@ def perform_initial_payload_error_analysis(job, exit_code): else: fatal = True if msg != "": - logger.warning("extracted message from stderr:\n%s" % msg) + logger.warning("extracted message from stderr:\n%s", msg) ec = set_error_code_from_stderr(msg, fatal) if not ec: @@ -368,7 +368,7 @@ def perform_initial_payload_error_analysis(job, exit_code): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=msg) else: if job.piloterrorcodes: - logger.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) + logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) else: # check if core dumps exist, if so remove them and return True if remove_core_dumps(job.workdir) and not job.debug: diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index 5f090b5d..c3cf9ee4 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import re @@ -63,9 +63,9 @@ def get_dbrelease_dir(): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: if os.path.exists(path): - logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)" % path) + logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)", path) else: - logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)" % path) + logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)", path) return path @@ -95,14 +95,14 @@ def is_dbrelease_available(version): # is the required DBRelease version available? 
if dir_list: if version in dir_list: - logger.info("found version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.info("found version %s in path %s (%d releases found)", version, path, len(dir_list)) status = True else: - logger.warning("did not find version %s in path %s (%d releases found)" % (version, path, len(dir_list))) + logger.warning("did not find version %s in path %s (%d releases found)", version, path, len(dir_list)) else: - logger.warning("empty DBRelease directory list: %s" % path) + logger.warning("empty DBRelease directory list: %s", path) else: - logger.warning('no such DBRelease path: %s' % path) + logger.warning('no such DBRelease path: %s', path) return status @@ -131,13 +131,13 @@ def create_setup_file(version, path): try: status = write_file(path, txt) - except FileHandlingFailure as e: - logger.warning('failed to create DBRelease setup file: %s' % e) + except FileHandlingFailure as error: + logger.warning('failed to create DBRelease setup file: %s', error) else: - logger.info("Created setup file with the following content:.................................\n%s" % txt) + logger.info("Created setup file with the following content:.................................\n%s", txt) logger.info("...............................................................................") else: - logger.warning('failed to create %s for DBRelease version=%s and directory=%s' % (path, version, d)) + logger.warning('failed to create %s for DBRelease version=%s and directory=%s', path, version, d) return status @@ -158,25 +158,25 @@ def create_dbrelease(version, path): _path = os.path.join(dbrelease_path, version) try: mkdirs(_path, chmod=None) - except PilotException as e: - logger.warning('failed to create directories for DBRelease: %s' % e) + except PilotException as error: + logger.warning('failed to create directories for DBRelease: %s', error) else: - logger.debug('created directories: %s' % _path) + logger.debug('created directories: %s', _path) # create the setup file in the DBRelease directory version_path = os.path.join(dbrelease_path, version) setup_filename = "setup.py" _path = os.path.join(version_path, setup_filename) if create_setup_file(version, _path): - logger.info("created DBRelease setup file: %s" % _path) + logger.info("created DBRelease setup file: %s", _path) # now create a new DBRelease tarball filename = os.path.join(path, "DBRelease-%s.tar.gz" % version) - logger.info("creating file: %s" % filename) + logger.info("creating file: %s", filename) try: tar = tarfile.open(filename, "w:gz") - except Exception as e: - logger.warning("could not create DBRelease tar file: %s" % e) + except Exception as error: + logger.warning("could not create DBRelease tar file: %s", error) else: if tar: # add the setup file to the tar file @@ -186,10 +186,10 @@ def create_dbrelease(version, path): try: _link = os.path.join(path, "DBRelease/current") os.symlink(version, _link) - except Exception as e: - logger.warning("failed to create symbolic link %s: %s" % (_link, e)) + except Exception as error: + logger.warning("failed to create symbolic link %s: %s", _link, error) else: - logger.warning("created symbolic link: %s" % _link) + logger.warning("created symbolic link: %s", _link) # add the symbolic link to the tar file tar.add(_link) @@ -197,17 +197,17 @@ def create_dbrelease(version, path): # done with the tar archive tar.close() - logger.info("created new DBRelease tar file: %s" % filename) + logger.info("created new DBRelease tar file: %s", filename) status = 
True else: logger.warning("failed to open DBRelease tar file") # clean up if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) else: logger.warning("failed to create DBRelease setup file") if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s" % dbrelease_path) + logger.debug("cleaned up directories in path: %s", dbrelease_path) return status diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index f21dfe27..e01aeece 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -196,7 +196,7 @@ def set_inds(dataset): inds = ds break if inds != "": - logger.info("setting INDS environmental variable to: %s" % (inds)) + logger.info("setting INDS environmental variable to: %s", inds) os.environ['INDS'] = inds else: logger.warning("INDS unknown") @@ -219,24 +219,24 @@ def get_analysis_trf(transform, workdir): harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + logger.debug("search_pattern - %s", search_pattern) jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug("jobopt_file = %s workdir = %s", jobopt_file, workdir) try: copy(jobopt_file, workdir) - except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + except Exception as error: + logger.error("could not copy file %s to %s : %s", jobopt_file, workdir, error) if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning('did not detect any / in %s (using full transform name)', transform) transform_name = transform # is the command already available? (e.g. 
if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info('script %s is already available - no need to download again', transform_name) return ec, diagnostics, transform_name original_base_url = "" @@ -255,7 +255,7 @@ def get_analysis_trf(transform, workdir): status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug("attempting to download script: %s", trf) status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -265,11 +265,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug("changing permission of %s to 0o755", path) try: os.chmod(path, 0o755) # Python 2/3 - except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + except Exception as error: + diagnostics = "failed to chmod %s: %s" % (transform_name, error) return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -307,7 +307,7 @@ def download_transform(url, transform_name, workdir): # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info("executing command [trial %d/%d]: %s", trial, max_trials, cmd) exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: @@ -317,14 +317,14 @@ def download_transform(url, transform_name, workdir): diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + logger.fatal('could not download transform: %s', stdout) status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info("curl command returned: %s", stdout) status = True break trial += 1 @@ -456,12 +456,11 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): # if turl.startswith('root://') and turl not in cmd: if turl not in cmd: cmd = cmd.replace(inputfile, turl) - logger.info("replaced '%s' with '%s' in the run command" % (inputfile, turl)) + logger.info("replaced '%s' with '%s' in the run command", inputfile, turl) # replace the LFNs with TURLs in the writetofile input file list (if it exists) if writetofile and turl_dictionary: filenames = get_writetoinput_filenames(writetofile) - logger.info("filenames=%s" % filenames) for fname in filenames: new_lines = [] path = os.path.join(workdir, fname) @@ -479,10 +478,9 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): lines = '\n'.join(new_lines) if lines: write_file(path, lines) - logger.info("lines=%s" % lines) else: - logger.warning("file does not exist: %s" % path) + logger.warning("file does not exist: %s", path) else: - logger.warning("could not find file: %s (cannot locate TURLs for direct access)" % filename) + logger.warning("could not find file: %s (cannot locate TURLs for direct access)", filename) return cmd diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index c1a33cec..e9820e8b 100644 --- a/pilot/util/auxiliary.py +++ 
b/pilot/util/auxiliary.py @@ -98,7 +98,7 @@ def display_architecture_info(): dump("/etc/issue") dump("$MACHTYPE", cmd="echo") else: - logger.info("\n%s" % stdout) + logger.info("\n%s", stdout) def get_batchsystem_jobid(): @@ -309,7 +309,7 @@ def inner(obj): pass # : unbound method iteritems() must be called # with OrderedDict instance as first argument (got nothing instead) - #logger.debug('exception caught for obj=%s: %s' % (str(obj), e)) + #logger.debug('exception caught for obj=%s: %s', (str(obj), e)) # Check for custom object instances - may subclass above too if hasattr(obj, '__dict__'): @@ -376,7 +376,7 @@ def check_for_final_server_update(update_server): if server_update == SERVER_UPDATE_FINAL or server_update == SERVER_UPDATE_TROUBLE: logger.info('server update done, finishing') break - logger.info('server update not finished (#%d/#%d)' % (i + 1, max_i)) + logger.info('server update not finished (#%d/#%d)', i + 1, max_i) sleep(30) i += 1 @@ -444,7 +444,7 @@ def show_memory_usage(): _value = extract_memory_usage_value(_stdout) except Exception: _value = "(unknown)" - logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n' % (_stdout, _value)) + logger.debug('current pilot memory usage:\n\n%s\n\nusage: %s kB\n', _stdout, _value) def get_memory_usage(pid): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 273dcf6c..9eaf93ef 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '41' # build number should be reset to '1' for every new development cycle +BUILD = '44' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 79ccb710..c3f80c37 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -58,8 +58,8 @@ def mkdirs(workdir, chmod=0o770): # Python 2/3 os.makedirs(workdir) if chmod: os.chmod(workdir, chmod) - except Exception as e: - raise MKDirFailure(e) + except Exception as error: + raise MKDirFailure(error) def rmdirs(path): @@ -74,8 +74,8 @@ def rmdirs(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directories %s: %s" % (path, e)) + except OSError as error: + logger.warning("failed to remove directories %s: %s", path, error) else: status = True @@ -122,17 +122,17 @@ def write_file(path, contents, mute=True, mode='w', unique=False): if f: try: f.write(contents) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True f.close() if not mute: if 'w' in mode: - logger.info('created file: %s' % path) + logger.info('created file: %s', path) if 'a' in mode: - logger.info('appended file: %s' % path) + logger.info('appended file: %s', path) return status @@ -151,8 +151,8 @@ def open_file(filename, mode): f = None try: f = open(filename, mode) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) return f @@ -329,8 +329,8 @@ def read_list(filename): try: with open(filename, 'r') as filehandle: _list = load(filehandle) - except IOError as e: - logger.warning('failed to read %s: %s' % (filename, e)) + except IOError as error: + 
logger.warning('failed to read %s: %s', filename, error) return convert(_list) @@ -349,9 +349,9 @@ def read_json(filename): if f: try: dictionary = load(f) - except Exception as e: - logger.warning('exception caught: %s' % e) - #raise FileHandlingFailure(str(e)) + except Exception as error: + logger.warning('exception caught: %s', error) + #raise FileHandlingFailure(str(error)) else: f.close() @@ -359,8 +359,8 @@ def read_json(filename): if dictionary != {}: try: dictionary = convert(dictionary) - except Exception as e: - raise ConversionFailure(e) + except Exception as error: + raise ConversionFailure(error) return dictionary @@ -383,8 +383,8 @@ def write_json(filename, data, sort_keys=True, indent=4, separators=(',', ': ')) try: with open(filename, 'w') as fh: dumpjson(data, fh, sort_keys=sort_keys, indent=indent, separators=separators) - except IOError as e: - raise FileHandlingFailure(e) + except IOError as error: + raise FileHandlingFailure(error) else: status = True @@ -434,8 +434,8 @@ def remove(path): try: os.remove(path) - except OSError as e: - logger.warning("failed to remove file: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove file: %s (%s, %s)", path, error.errno, error.strerror) return -1 return 0 @@ -449,8 +449,8 @@ def remove_dir_tree(path): try: rmtree(path) - except OSError as e: - logger.warning("failed to remove directory: %s (%s, %s)" % (path, e.errno, e.strerror)) + except OSError as error: + logger.warning("failed to remove directory: %s (%s, %s)", path, error.errno, error.strerror) return -1 return 0 @@ -466,7 +466,7 @@ def remove_files(workdir, files): ec = 0 if type(files) != list: - logger.warning('files parameter not a list: %s' % str(type(list))) + logger.warning('files parameter not a list: %s', str(type(list))) ec = -1 else: for f in files: @@ -533,17 +533,17 @@ def move(path1, path2): """ if not os.path.exists(path1): - logger.warning('file copy failure: path does not exist: %s' % path1) + logger.warning('file copy failure: path does not exist: %s', path1) raise NoSuchFile("File does not exist: %s" % path1) try: import shutil shutil.move(path1, path2) - except IOError as e: - logger.warning("exception caught during file move: %s" % e) - raise FileHandlingFailure(e) + except IOError as error: + logger.warning("exception caught during file move: %s", error) + raise FileHandlingFailure(error) else: - logger.info("moved %s to %s" % (path1, path2)) + logger.info("moved %s to %s", path1, path2) def copy(path1, path2): @@ -557,16 +557,16 @@ def copy(path1, path2): """ if not os.path.exists(path1): - logger.warning('file copy failure: path does not exist: %s' % path1) + logger.warning('file copy failure: path does not exist: %s', path1) raise NoSuchFile("File does not exist: %s" % path1) try: copy2(path1, path2) - except IOError as e: - logger.warning("exception caught during file copy: %s" % e) - raise FileHandlingFailure(e) + except IOError as error: + logger.warning("exception caught during file copy: %s", error) + raise FileHandlingFailure(error) else: - logger.info("copied %s to %s" % (path1, path2)) + logger.info("copied %s to %s", path1, path2) def find_executable(name): @@ -596,8 +596,8 @@ def get_directory_size(directory="."): try: # convert to int and B size = int(stdout.split()[0]) * 1024 - except Exception as e: - logger.warning('exception caught while trying convert dirsize: %s' % e) + except Exception as error: + logger.warning('exception caught while trying convert dirsize: %s', error) return 
size @@ -615,13 +615,13 @@ def add_to_total_size(path, total_size): # Get the file size fsize = get_local_file_size(path) if fsize: - logger.info("size of file %s: %d B" % (path, fsize)) + logger.info("size of file %s: %d B", path, fsize) try: total_size += long(fsize) # Python 2 # noqa: F821 except Exception: total_size += int(fsize) # Python 3 (note order in try statement) else: - logger.warning("skipping file %s since it is not present" % path) + logger.warning("skipping file %s since it is not present", path) return total_size @@ -639,10 +639,10 @@ def get_local_file_size(filename): if os.path.exists(filename): try: file_size = os.path.getsize(filename) - except Exception as e: - logger.warning("failed to get file size: %s" % e) + except Exception as error: + logger.warning("failed to get file size: %s", error) else: - logger.warning("local file does not exist: %s" % filename) + logger.warning("local file does not exist: %s", filename) return file_size @@ -683,8 +683,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= try: f = open_file(filename, 'r') - except Exception as e: - logger.warning("failed to open file: %s, %s" % (filename, e)) + except Exception as error: + logger.warning("failed to open file: %s, %s", filename, error) else: firstline = True for line in f: @@ -704,8 +704,8 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= if convert_to_float: try: field = float(field) - except Exception as e: - logger.warning("failed to convert %s to float: %s (aborting)" % (field, e)) + except Exception as error: + logger.warning("failed to convert %s to float: %s (aborting)", field, error) return None tabledict[key].append(field) i += 1 @@ -906,7 +906,7 @@ def verify_file_list(list_of_files): diff = diff_lists(list_of_files, filtered_list) if diff: - logger.debug('found %d file(s) that do not exist (e.g. %s)' % (len(diff), diff[0])) + logger.debug('found %d file(s) that do not exist (e.g. 
%s)', len(diff), diff[0]) return filtered_list @@ -927,8 +927,8 @@ def find_latest_modified_file(list_of_files): try: latest_file = max(list_of_files, key=os.path.getmtime) mtime = int(os.path.getmtime(latest_file)) - except Exception as e: - logger.warning("int conversion failed for mod time: %s" % e) + except Exception as error: + logger.warning("int conversion failed for mod time: %s", error) latest_file = "" mtime = None @@ -947,9 +947,9 @@ def dump(path, cmd="cat"): if os.path.exists(path) or cmd == "echo": _cmd = "%s %s" % (cmd, path) exit_code, stdout, stderr = execute(_cmd) - logger.info("%s:\n%s" % (_cmd, stdout + stderr)) + logger.info("%s:\n%s", _cmd, stdout + stderr) else: - logger.info("path %s does not exist" % path) + logger.info("path %s does not exist", path) def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): @@ -1001,7 +1001,7 @@ def remove_core_dumps(workdir): coredumps = coredumps1 + coredumps2 if coredumps: for coredump in coredumps: - logger.info("removing core dump: %s" % str(coredump)) + logger.info("removing core dump: %s", str(coredump)) remove(coredump) found = True @@ -1072,14 +1072,14 @@ def copy_pilot_source(workdir): diagnostics = "" srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot2') try: - logger.debug('copy %s to %s' % (srcdir, workdir)) + logger.debug('copy %s to %s', srcdir, workdir) cmd = 'cp -r %s/* %s' % (srcdir, workdir) exit_code, stdout, stderr = execute(cmd) if exit_code != 0: diagnostics = 'file copy failed: %d, %s' % (exit_code, stdout) logger.warning(diagnostics) - except Exception as e: - diagnostics = 'exception caught when copying pilot2 source: %s' % e + except Exception as error: + diagnostics = 'exception caught when copying pilot2 source: %s' % error logger.warning(diagnostics) return diagnostics @@ -1095,10 +1095,10 @@ def create_symlink(from_path='', to_path=''): try: os.symlink(from_path, to_path) - except Exception as e: - logger.warning('failed to create symlink from %s to %s: %s' % (from_path, to_path, e)) + except Exception as error: + logger.warning('failed to create symlink from %s to %s: %s', from_path, to_path, error) else: - logger.debug('created symlink from %s to %s' % (from_path, to_path)) + logger.debug('created symlink from %s to %s', from_path, to_path) def locate_file(pattern): diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index 643253ef..91891549 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import os.path @@ -68,7 +68,7 @@ def remove_job_request_file(): path = get_job_request_file_name() if os.path.exists(path): if remove(path) == 0: - logger.info('removed %s' % path) + logger.info('removed %s', path) else: logger.debug('there is no job request file') diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index e45056de..bcb1876b 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 from pilot.common.errorcodes import ErrorCodes from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file @@ -59,9 +59,9 @@ def looping_job(job, mt): # the payload process is considered to be looping if it's files have not been touched within 
looping_limit time if time_last_touched: ct = int(time.time()) - logger.info('current time: %d' % ct) - logger.info('last time files were touched: %d' % time_last_touched) - logger.info('looping limit: %d s' % looping_limit) + logger.info('current time: %d', ct) + logger.info('last time files were touched: %d', time_last_touched) + logger.info('looping limit: %d s', looping_limit) if ct - time_last_touched > looping_limit: try: # first produce core dump and copy it @@ -69,8 +69,8 @@ def looping_job(job, mt): # set debug mode to prevent core file from being removed before log creation job.debug = True kill_looping_job(job) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) else: logger.info('no files were touched') @@ -126,14 +126,14 @@ def get_time_for_last_touch(job, mt, looping_limit): # remove unwanted list items (*.py, *.pyc, workdir, ...) files = loopingjob_definitions.remove_unwanted_files(job.workdir, files) if files: - logger.info('found %d files that were recently updated' % len(files)) - logger.debug('recent files:\n%s' % files) + logger.info('found %d files that were recently updated', len(files)) + logger.debug('recent files:\n%s', files) updated_files = verify_file_list(files) # now get the mod times for these file, and identify the most recently update file latest_modified_file, mtime = find_latest_modified_file(updated_files) if latest_modified_file: - logger.info("file %s is the most recently updated file (at time=%d)" % (latest_modified_file, mtime)) + logger.info("file %s is the most recently updated file (at time=%d)", latest_modified_file, mtime) else: logger.warning('looping job algorithm failed to identify latest updated file') return mt.ct_looping_last_touched @@ -148,7 +148,7 @@ def get_time_for_last_touch(job, mt, looping_limit): # cut the output if too long stdout = cut_output(stdout) stderr = cut_output(stderr) - logger.warning('find command failed: %d, %s, %s' % (exit_code, stdout, stderr)) + logger.warning('find command failed: %d, %s, %s', exit_code, stdout, stderr) return mt.ct_looping_last_touched @@ -168,19 +168,19 @@ def kill_looping_job(job): cmd = 'ps -fwu %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ls -ltr %s' % (job.workdir) exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami() exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) cmd = 'pstree -g -a' exit_code, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # set the relevant error code if job.state == 'stagein': @@ -212,6 +212,6 @@ def get_looping_job_limit(): looping_limit = convert_to_int(config.Pilot.looping_limit_default, default=2 * 3600) looping_limit_min_default = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600) looping_limit = max(looping_limit, looping_limit_min_default) - logger.info("using looping job limit: %d s" % looping_limit) + logger.info("using looping job limit: %d s", looping_limit) return looping_limit diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 27252ed9..c1f442d6 100644 --- 
a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -54,15 +54,15 @@ def job_monitor_tasks(job, mt, args): check_hz() try: cpuconsumptiontime = get_current_cpu_consumption_time(job.pid) - except Exception as e: - diagnostics = "Exception caught: %s" % e + except Exception as error: + diagnostics = "Exception caught: %s" % error logger.warning(diagnostics) exit_code = get_exception_error_code(diagnostics) return exit_code, diagnostics else: job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time for pid=%d: %f (rounded to %d)' % (job.pid, cpuconsumptiontime, job.cpuconsumptiontime)) + logger.info('CPU consumption time for pid=%d: %f (rounded to %d)', job.pid, cpuconsumptiontime, job.cpuconsumptiontime) # check how many cores the payload is using set_number_used_cores(job) @@ -123,7 +123,7 @@ def display_oom_info(payload_pid): payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) - logger.info('oom_score(pilot) = %s, oom_score(payload) = %s' % (pilot_score, payload_score)) + logger.info('oom_score(pilot) = %s, oom_score(payload) = %s', pilot_score, payload_score) def get_score(pid): @@ -136,8 +136,8 @@ def get_score(pid): try: score = '%s' % read_file('/proc/%d/oom_score' % pid) - except Exception as e: - logger.warning('caught exception reading oom_score: %s' % e) + except Exception as error: + logger.warning('caught exception reading oom_score: %s', error) score = 'UNKNOWN' else: if score.endswith('\n'): @@ -207,8 +207,8 @@ def verify_memory_usage(current_time, mt, job): # is the used memory within the allowed limit? try: exit_code, diagnostics = memory.memory_usage(job) - except Exception as e: - logger.warning('caught exception: %s' % e) + except Exception as error: + logger.warning('caught exception: %s', error) exit_code = -1 if exit_code != 0: logger.warning('ignoring failure to parse memory monitor output') @@ -291,8 +291,8 @@ def verify_looping_job(current_time, mt, job): # is the job looping? 
try: exit_code, diagnostics = looping_job(job, mt) - except Exception as e: - diagnostics = 'exception caught in looping job algorithm: %s' % e + except Exception as error: + diagnostics = 'exception caught in looping job algorithm: %s' % error logger.warning(diagnostics) if "No module named" in diagnostics: exit_code = errors.BLACKHOLE @@ -371,15 +371,15 @@ def verify_running_processes(current_time, mt, pid): nproc = get_number_of_child_processes(pid) try: nproc_env = int(os.environ.get('PILOT_MAXNPROC', 0)) - except Exception as e: - logger.warning('failed to convert PILOT_MAXNPROC to int: %s' % e) + except Exception as error: + logger.warning('failed to convert PILOT_MAXNPROC to int: %s', error) else: if nproc > nproc_env: # set the maximum number of found processes os.environ['PILOT_MAXNPROC'] = str(nproc) if nproc_env > 0: - logger.info('maximum number of monitored processes: %d' % nproc_env) + logger.info('maximum number of monitored processes: %d', nproc_env) return 0, "" @@ -417,19 +417,19 @@ def utility_monitor(job): try: proc1 = execute(utility_command, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, queuedata=job.infosys.queuedata) - except Exception as e: - logger.error('could not execute: %s' % e) + except Exception as error: + logger.error('could not execute: %s', error) else: # store process handle in job object, and keep track on how many times the # command has been launched job.utilities[utcmd] = [proc1, utility_subprocess_launches + 1, utility_command] else: - logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again' % utcmd) + logger.warning('detected crashed utility subprocess - too many restarts, will not restart %s again', utcmd) else: # check the utility output (the selector option adds a substring to the output file name) filename = usercommon.get_utility_command_output_filename(utcmd, selector=True) path = os.path.join(job.workdir, filename) if not os.path.exists(path): - logger.warning('file: %s does not exist' % path) + logger.warning('file: %s does not exist', path) time.sleep(10) @@ -444,10 +444,9 @@ def get_local_size_limit_stdout(bytes=True): try: localsizelimit_stdout = int(config.Pilot.local_size_limit_stdout) - except Exception as e: + except Exception as error: localsizelimit_stdout = 2097152 - logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)' % - (e, localsizelimit_stdout)) + logger.warning('bad value in config for local_size_limit_stdout: %s (will use value: %d kB)', error, localsizelimit_stdout) # convert from kB to B if bytes: @@ -484,17 +483,17 @@ def check_payload_stdout(job): # now loop over all files and check each individually (any large enough file will fail the job) for filename in file_list: - logger.debug('check_payload_stdout: filename=%s' % filename) + logger.debug('check_payload_stdout: filename=%s', filename) if "job.log.tgz" in filename: - logger.info("skipping file size check of file (%s) since it is a special log file" % (filename)) + logger.info("skipping file size check of file (%s) since it is a special log file", filename) continue if os.path.exists(filename): try: # get file size in bytes fsize = os.path.getsize(filename) - except Exception as e: - logger.warning("could not read file size of %s: %s" % (filename, e)) + except Exception as error: + logger.warning("could not read file size of %s: %s", filename, error) else: # is the file too big? 
localsizelimit_stdout = get_local_size_limit_stdout() @@ -517,9 +516,9 @@ def check_payload_stdout(job): # remove any lingering input files from the work dir exit_code = remove_files(job.workdir, lfns) else: - logger.info("payload log (%s) within allowed size limit (%d B): %d B" % (os.path.basename(filename), localsizelimit_stdout, fsize)) + logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize) else: - logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet" % filename) + logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename) return exit_code, diagnostics @@ -539,7 +538,7 @@ def check_local_space(initial=True): # is there enough local space to run a job? cwd = os.getcwd() - logger.debug('checking local space on %s' % cwd) + logger.debug('checking local space on %s', cwd) spaceleft = convert_mb_to_b(get_local_disk_space(cwd)) # B (diskspace is in MB) free_space_limit = human2bytes(config.Pilot.free_space_limit) if initial else human2bytes(config.Pilot.free_space_limit_running) @@ -549,7 +548,7 @@ def check_local_space(initial=True): ec = errors.NOLOCALSPACE logger.warning(diagnostics) else: - logger.info('sufficient remaining disk space (%d B)' % spaceleft) + logger.info('sufficient remaining disk space (%d B)', spaceleft) return ec, diagnostics @@ -578,11 +577,11 @@ def check_work_dir(job): exit_code = errors.USERDIRTOOLARGE diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \ (job.workdir, workdirsize, maxwdirsize) - logger.fatal("%s" % diagnostics) + logger.fatal("%s", diagnostics) cmd = 'ls -altrR %s' % job.workdir _ec, stdout, stderr = execute(cmd, mute=True) - logger.info("%s: %s" % (cmd + '\n', stdout)) + logger.info("%s: %s", cmd + '\n', stdout) # kill the job # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED") @@ -598,13 +597,13 @@ def check_work_dir(job): # remeasure the size of the workdir at this point since the value is stored below workdirsize = get_directory_size(directory=job.workdir) else: - logger.info("size of work directory %s: %d B (within %d B limit)" % (job.workdir, workdirsize, maxwdirsize)) + logger.info("size of work directory %s: %d B (within %d B limit)", job.workdir, workdirsize, maxwdirsize) # Store the measured disk space (the max value will later be sent with the job metrics) if workdirsize > 0: job.add_workdir_size(workdirsize) else: - logger.warning('job work dir does not exist: %s' % job.workdir) + logger.warning('job work dir does not exist: %s', job.workdir) else: logger.warning('skipping size check of workdir since it has not been created yet') @@ -621,17 +620,17 @@ def get_max_allowed_work_dir_size(queuedata): try: maxwdirsize = convert_mb_to_b(get_maximum_input_sizes()) # from MB to B, e.g. 
16336 MB -> 17,129,537,536 B - except Exception as e: + except Exception as error: max_input_size = get_max_input_size() maxwdirsize = max_input_size + config.Pilot.local_size_limit_stdout * 1024 logger.info("work directory size check will use %d B as a max limit (maxinputsize [%d B] + local size limit for" - " stdout [%d B])" % (maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024)) - logger.warning('conversion caught exception: %s' % e) + " stdout [%d B])", maxwdirsize, max_input_size, config.Pilot.local_size_limit_stdout * 1024) + logger.warning('conversion caught exception: %s', error) else: # grace margin, as discussed in https://its.cern.ch/jira/browse/ATLASPANDA-482 margin = 10.0 # percent, read later from somewhere maxwdirsize = int(maxwdirsize * (1 + margin / 100.0)) - logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)" % maxwdirsize) + logger.info("work directory size check will use %d B as a max limit (10%% grace limit added)", maxwdirsize) return maxwdirsize @@ -654,8 +653,8 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = int(_maxinputsize) # MB else: # convert to B int _maxinputsize = int(_maxinputsize) * 1024 * 1024 # MB -> B - except Exception as e: - logger.warning("schedconfig.maxinputsize: %s" % e) + except Exception as error: + logger.warning("schedconfig.maxinputsize: %s", error) if megabyte: _maxinputsize = max_input_file_sizes_mb else: @@ -667,9 +666,9 @@ def get_max_input_size(queuedata, megabyte=False): _maxinputsize = max_input_file_sizes if megabyte: - logger.info("max input size = %d MB (pilot default)" % _maxinputsize) + logger.info("max input size = %d MB (pilot default)", _maxinputsize) else: - logger.info("Max input size = %d B (pilot default)" % _maxinputsize) + logger.info("Max input size = %d B (pilot default)", _maxinputsize) return _maxinputsize @@ -693,12 +692,12 @@ def check_output_file_sizes(job): fsize = get_local_file_size(path) max_fsize = human2bytes(config.Pilot.maximum_output_file_size) if fsize and fsize < max_fsize: - logger.info('output file %s is within allowed size limit (%d B < %d B)' % (path, fsize, max_fsize)) + logger.info('output file %s is within allowed size limit (%d B < %d B)', path, fsize, max_fsize) else: exit_code = errors.OUTPUTFILETOOLARGE diagnostics = 'output file %s is not within allowed size limit (%d B > %d B)' % (path, fsize, max_fsize) logger.warning(diagnostics) else: - logger.info('output file size check: skipping output file %s since it does not exist' % path) + logger.info('output file size check: skipping output file %s since it does not exist', path) return exit_code, diagnostics diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 6ee9b84d..c51b717e 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import time @@ -47,8 +47,8 @@ def find_processes_in_group(cpids, pid): try: thispid = int(lines[i].split()[0]) thisppid = int(lines[i].split()[1]) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) if thisppid == pid: find_processes_in_group(cpids, thispid) @@ -84,7 +84,7 @@ def get_process_commands(euid, pids): exit_code, stdout, stderr = execute(cmd, mute=True) if exit_code != 0 or stdout == '': - logger.warning('ps command 
failed: %d, \"%s\", \"%s\"' % (exit_code, stdout, stderr)) + logger.warning('ps command failed: %d, \"%s\", \"%s\"', exit_code, stdout, stderr) else: # extract the relevant processes p_commands = stdout.split('\n') @@ -153,13 +153,13 @@ def kill_processes(pid): return children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -195,13 +195,13 @@ def kill_child_processes(pid): # reverse the process order so that the athena process is killed first (otherwise the stdout will be truncated) children.reverse() - logger.info("process IDs to be killed: %s (in reverse order)" % str(children)) + logger.info("process IDs to be killed: %s (in reverse order)", str(children)) # find which commands are still running try: cmds = get_process_commands(os.geteuid(), children) - except Exception as e: - logger.warning("get_process_commands() threw an exception: %s" % e) + except Exception as error: + logger.warning("get_process_commands() threw an exception: %s", error) else: if len(cmds) <= 1: logger.warning("found no corresponding commands to process id(s)") @@ -231,26 +231,26 @@ def kill_process_group(pgrp): _sleep = True # kill the process gracefully - logger.info("killing group process %d" % pgrp) + logger.info("killing group process %d", pgrp) try: os.killpg(pgrp, signal.SIGTERM) - except Exception as e: - logger.warning("exception thrown when killing child group process under SIGTERM: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) _sleep = False else: - logger.info("SIGTERM sent to process group %d" % pgrp) + logger.info("SIGTERM sent to process group %d", pgrp) if _sleep: _t = 30 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) try: os.killpg(pgrp, signal.SIGKILL) - except Exception as e: - logger.warning("exception thrown when killing child group process with SIGKILL: %s" % e) + except Exception as error: + logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) else: - logger.info("SIGKILL sent to process group %d" % pgrp) + logger.info("SIGKILL sent to process group %d", pgrp) status = True return status @@ -270,7 +270,7 @@ def kill_process(pid): kill(pid, signal.SIGTERM) _t = 10 - logger.info("sleeping %d s to allow process to exit" % _t) + logger.info("sleeping %d s to allow process to exit", _t) time.sleep(_t) # now do a hard kill just in case some processes haven't gone away @@ -291,10 +291,10 @@ def kill(pid, sig): status = False try: os.kill(pid, sig) - except Exception as e: - logger.warning("exception thrown when killing process %d with signal=%d: %s" % (pid, sig, e)) + except Exception as error: + logger.warning("exception thrown when killing process %d with signal=%d: %s", pid, sig, error) else: - logger.info("killed process %d with signal=%d" % (pid, sig)) + logger.info("killed process %d with signal=%d", pid, sig) status = True return status @@ -313,12 +313,12 @@ def 
get_number_of_child_processes(pid): n = 0 try: find_processes_in_group(children, pid) - except Exception as e: - logger.warning("exception caught in find_processes_in_group: %s" % e) + except Exception as error: + logger.warning("exception caught in find_processes_in_group: %s", error) else: if pid: n = len(children) - logger.info("number of running child processes to parent process %d: %d" % (pid, n)) + logger.info("number of running child processes to parent process %d: %d", pid, n) else: logger.debug("pid not yet set") return n @@ -335,16 +335,16 @@ def killpg(pid, sig, args): try: os.killpg(int(pid), sig) - except Exception as e: - logger.warning("failed to execute killpg(): %s" % e) + except Exception as error: + logger.warning("failed to execute killpg(): %s", error) cmd = 'kill -%d %s' % (sig, pid) exit_code, rs, stderr = execute(cmd) if exit_code != 0: logger.warning(rs) else: - logger.info("killed orphaned process %s (%s)" % (pid, args)) + logger.info("killed orphaned process %s (%s)", pid, args) else: - logger.info("killed orphaned process group %s (%s)" % (pid, args)) + logger.info("killed orphaned process group %s (%s)", pid, args) def get_pilot_pid_from_processes(_processes, pattern): @@ -364,8 +364,8 @@ def get_pilot_pid_from_processes(_processes, pattern): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'pilot.py' in args and 'python' in args: pilot_pid = pid @@ -405,30 +405,29 @@ def kill_orphans(): args = ids.group(3) try: pid = int(pid) - except Exception as e: - logger.warning('failed to convert pid to int: %s' % e) + except Exception as error: + logger.warning('failed to convert pid to int: %s', error) continue if 'cvmfs2' in args: - logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'" % - (pid, ppid, args)) + logger.info("ignoring possible orphan process running cvmfs2: pid=%s, ppid=%s, args=\'%s\'", pid, ppid, args) elif 'pilots_starter.py' in args or 'runpilot2-wrapper.sh' in args: - logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("ignoring pilot launcher: pid=%s, ppid=%s, args='%s'", pid, ppid, args) elif ppid == '1': count += 1 - logger.info("found orphan process: pid=%s, ppid=%s, args='%s'" % (pid, ppid, args)) + logger.info("found orphan process: pid=%s, ppid=%s, args='%s'", pid, ppid, args) if 'bash' in args or ('python' in args and 'pilot.py' in args): logger.info("will not kill bash process") else: killpg(pid, signal.SIGTERM, args) _t = 10 - logger.info("sleeping %d s to allow processes to exit" % _t) + logger.info("sleeping %d s to allow processes to exit", _t) time.sleep(_t) killpg(pid, signal.SIGKILL, args) if count == 0: logger.info("did not find any orphan processes") else: - logger.info("found %d orphan process(es)" % count) + logger.info("found %d orphan process(es)", count) def get_max_memory_usage_from_cgroups(): @@ -453,19 +452,19 @@ def get_max_memory_usage_from_cgroups(): if ":memory:" in out: pos = out.find('/') path = out[pos:] - logger.info("extracted path = %s" % path) + logger.info("extracted path = %s", path) pre = get_cgroups_base_path() if pre != "": path = pre + os.path.join(path, "memory.max_usage_in_bytes") - logger.info("path to CGROUPS memory info: %s" % path) + logger.info("path to CGROUPS memory info: %s", path) max_memory = read_file(path) else: logger.info("CGROUPS base 
path could not be extracted - not a CGROUPS site") else: - logger.warning("invalid format: %s (expected ..:memory:[path])" % out) + logger.warning("invalid format: %s (expected ..:memory:[path])", out) else: - logger.info("path %s does not exist (not a CGROUPS site)" % path) + logger.info("path %s does not exist (not a CGROUPS site)", path) return max_memory @@ -518,7 +517,7 @@ def get_instant_cpu_consumption_time(pid): hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) if type(hz) != int: - logger.warning('unknown SC_CLK_TCK: %s' % str(hz)) + logger.warning('unknown SC_CLK_TCK: %s', str(hz)) return 0.0 if pid and hz and hz > 0: @@ -586,21 +585,21 @@ def cleanup(job, args): # make sure the workdir is deleted if args.cleanup: if remove_dir_tree(job.workdir): - logger.info('removed %s' % job.workdir) + logger.info('removed %s', job.workdir) if os.path.exists(job.workdir): - logger.warning('work directory still exists: %s' % job.workdir) + logger.warning('work directory still exists: %s', job.workdir) else: - logger.debug('work directory was removed: %s' % job.workdir) + logger.debug('work directory was removed: %s', job.workdir) else: - logger.info('workdir not removed %s' % job.workdir) + logger.info('workdir not removed %s', job.workdir) # collect any zombie processes job.collect_zombies(tn=10) logger.info("collected zombie processes") if job.pid: - logger.info("will now attempt to kill all subprocesses of pid=%d" % job.pid) + logger.info("will now attempt to kill all subprocesses of pid=%d", job.pid) kill_processes(job.pid) else: logger.warning('cannot kill any subprocesses since job.pid is not set') @@ -672,8 +671,8 @@ def convert_ps_to_dict(output, pattern=r'(\d+) (\d+) (\d+) (.+)'): var = match.group(i + 1) dictionary[first_line[i]].append(var) - except Exception as e: - print("unexpected format of utility output: %s" % e) + except Exception as error: + print("unexpected format of utility output: %s", error) return dictionary From 0dd9750c6292fb4b764708bcf524def728a52d3c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:38:29 +0200 Subject: [PATCH 70/96] Pylint updates --- pilot/control/monitor.py | 47 +++++++++++++------------- pilot/control/payloads/eventservice.py | 4 +-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 8770f82b..390776ac 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -6,7 +6,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. @@ -56,7 +56,7 @@ def control(queues, traces, args): try: # overall loop counter (ignoring the fact that more than one job may be running) - n = 0 + niter = 0 while not args.graceful_stop.is_set(): # every seconds, run the monitoring checks @@ -84,8 +84,8 @@ def control(queues, traces, args): args.graceful_stop.set() break else: - if n % 60 == 0: - logger.info('%d s have passed since pilot start' % time_since_start) + if niter % 60 == 0: + logger.info('%d s have passed since pilot start', time_since_start) time.sleep(1) # time to check the CPU? 
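
Nearly every hunk in this commit and the preceding ones applies the same two pylint-driven conventions: exception variables are renamed from `e` to `error`, and logger calls pass their arguments lazily instead of pre-formatting the message with the % operator, which silences the logging-not-lazy warning and defers string interpolation until the record is actually emitted. A minimal sketch of the pattern follows; the helper and its names are illustrative only and are not taken from the pilot code:

    import logging
    import os

    logger = logging.getLogger(__name__)

    def remove_path(path):
        """Hypothetical helper showing the logging style adopted in these patches."""
        try:
            os.remove(path)
        except OSError as error:
            # eager style flagged by pylint (logging-not-lazy):
            #     logger.warning('failed to remove %s: %s' % (path, error))
            # lazy style used throughout these commits; the message is only
            # interpolated if the WARNING record is actually emitted
            logger.warning('failed to remove %s: %s', path, error)
            return -1
        return 0
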
@@ -93,12 +93,12 @@ def control(queues, traces, args): processes = get_process_info('python pilot2/pilot.py', pid=getpid()) if processes: logger.info('-' * 100) - logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s' % (getpid(), processes[0], processes[1], processes[2])) - n = processes[3] - if n > 1: - logger.info('there are %d such processes running' % n) + logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s', getpid(), processes[0], processes[1], processes[2]) + nproc = processes[3] + if nproc > 1: + logger.info('there are %d such processes running', nproc) else: - logger.info('there is %d such process running' % n) + logger.info('there is %d such process running', nproc) logger.info('-' * 100) tcpu = time.time() @@ -111,20 +111,19 @@ def control(queues, traces, args): for thread in threading.enumerate(): # logger.info('thread name: %s' % thread.name) if not thread.is_alive(): - logger.fatal('thread \'%s\' is not alive' % thread.name) + logger.fatal('thread \'%s\' is not alive', thread.name) # args.graceful_stop.set() - n += 1 + niter += 1 - except Exception as e: - print(("monitor: exception caught: %s" % e)) - raise PilotException(e) + except Exception as error: + print(("monitor: exception caught: %s" % error)) + raise PilotException(error) logger.info('[monitor] control thread has ended') #def log_lifetime(sig, frame, traces): -# logger.info('lifetime: %i used, %i maximum' % (int(time.time() - traces.pilot['lifetime_start']), -# traces.pilot['lifetime_max'])) +# logger.info('lifetime: %i used, %i maximum', int(time.time() - traces.pilot['lifetime_start']), traces.pilot['lifetime_max']) def get_process_info(cmd, user=None, args='aufx', pid=None): @@ -194,7 +193,7 @@ def run_checks(queues, args): t_max = 2 * 60 logger.warning('pilot monitor received instruction that abort_job has been requested') - logger.warning('will wait for a maximum of %d seconds for threads to finish' % t_max) + logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) t0 = time.time() while time.time() - t0 < t_max: if args.job_aborted.is_set(): @@ -210,7 +209,7 @@ def run_checks(queues, args): args.graceful_stop.set() if not args.job_aborted.is_set(): - logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect' % t_max) + logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) t_max = 10 t0 = time.time() while time.time() - t0 < t_max: @@ -241,20 +240,20 @@ def get_max_running_time(lifetime, queuedata): # use the schedconfig value if set, otherwise use the pilot option lifetime value if not queuedata: logger.warning('queuedata could not be extracted from queues, will use default for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if queuedata.maxtime: try: max_running_time = int(queuedata.maxtime) - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s', error) logger.warning('failed to convert maxtime from queuedata, will use default value for max running time ' - '(%d s)' % max_running_time) + '(%d s)', max_running_time) else: if max_running_time == 0: max_running_time = lifetime # fallback to default value - logger.info('will use default value for max running time: %d s' % max_running_time) + logger.info('will use default value for max running time: %d s', max_running_time) else: - logger.info('will use queuedata.maxtime value for max running time: %d s' % max_running_time) + 
logger.info('will use queuedata.maxtime value for max running time: %d s', max_running_time) return max_running_time diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 07829a73..3e0390d3 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -72,8 +72,8 @@ def run_payload(self, job, cmd, out, err): job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) - except Exception as e: - logger.error('could not execute: %s', str(e)) + except Exception as error: + logger.error('could not execute: %s', str(error)) return None return executor From f26dfdef150c9a57485492c2dba37fb16ac2466b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 15 Jun 2021 14:48:53 +0200 Subject: [PATCH 71/96] Flake8 corrections --- pilot/control/job.py | 2 +- pilot/user/atlas/jobmetrics.py | 2 +- pilot/util/loopingjob.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 11a3de6a..a980120f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1426,7 +1426,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # timefloor not relevant for the first job if jobnumber > 0: - logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job',timefloor, currenttime - starttime) + logger.info('since timefloor=%d s and only %d s has passed since launch, pilot can run another job', timefloor, currenttime - starttime) if harvester and jobnumber > 0: # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index d0802040..0d974175 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -157,7 +157,7 @@ def get_job_metrics(job): # is job_metrics within allowed size? if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)",len(job_metrics)) + logger.warning("job_metrics out of size (%d)", len(job_metrics)) # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index bcb1876b..b3b97a0a 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -101,6 +101,7 @@ def create_core_dump(pid=None, workdir=None): else: logger.warning('failed to execute command: %s, stdout+err=%s', cmd, stdout + stderr) + def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. From 54b5a0a8393cdc99e5df0de7430bf429c30e8f7a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 11:24:30 +0200 Subject: [PATCH 72/96] Pylint corrections. 
Fixes for localSite problem in traces --- PILOTVERSION | 2 +- pilot/control/job.py | 5 ++- pilot/copytool/common.py | 6 ++-- pilot/copytool/gfal.py | 3 +- pilot/copytool/gs.py | 23 +++++++------- pilot/copytool/lsm.py | 27 ++++++++-------- pilot/copytool/mv.py | 12 +++---- pilot/copytool/objectstore.py | 10 +++--- pilot/copytool/rucio.py | 59 ++++++++++++++++++----------------- pilot/copytool/s3.py | 20 ++++++------ pilot/copytool/xrdcp.py | 31 +++++++++--------- pilot/util/auxiliary.py | 44 ++++++++++++++++++++++++++ pilot/util/constants.py | 2 +- pilot/util/tracereport.py | 7 +++++ pilot/util/workernode.py | 30 +++++++++--------- pilot/workflow/generic_hpc.py | 12 +++---- 16 files changed, 175 insertions(+), 118 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index fb45e883..a14d3a59 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.44 \ No newline at end of file +2.12.1.46 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index a980120f..449af861 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -33,7 +33,7 @@ from pilot.util import https from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, get_pilot_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - is_python3, show_memory_usage, has_instruction_sets, locate_core_file + is_python3, show_memory_usage, has_instruction_sets, locate_core_file, get_display_info from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -661,11 +661,14 @@ def get_data_structure(job, state, args, xml=None, metadata=None): data['cpuConsumptionUnit'] = job.cpuconsumptionunit + "+" + get_cpu_model() instruction_sets = has_instruction_sets(['AVX2']) + product, vendor = get_display_info() if instruction_sets: if 'cpuConsumptionUnit' in data: data['cpuConsumptionUnit'] += '+' + instruction_sets else: data['cpuConsumptionUnit'] = instruction_sets + if product and vendor: + logger.debug('cpuConsumptionUnit: could have added: product=%s, vendor=%s', product, vendor) # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) diff --git a/pilot/copytool/common.py b/pilot/copytool/common.py index ce5f0df3..12381b3d 100644 --- a/pilot/copytool/common.py +++ b/pilot/copytool/common.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Mario Lassnig, mario.lassnig@cern.ch, 2020 import logging @@ -61,8 +61,8 @@ def verify_catalog_checksum(fspec, path): checksum_local = calculate_checksum(path, algorithm=checksum_type) if checksum_type == 'ad32': checksum_type = 'adler32' - logger.info('checksum (catalog): %s (type: %s)' % (checksum_catalog, checksum_type)) - logger.info('checksum (local): %s' % checksum_local) + logger.info('checksum (catalog): %s (type: %s)', checksum_catalog, checksum_type) + logger.info('checksum (local): %s', checksum_local) if checksum_local and checksum_local != '' and checksum_local != checksum_catalog: diagnostics = 'checksum verification failed for LFN=%s: checksum (catalog)=%s != checksum (local)=%s' % \ (fspec.lfn, checksum_catalog, checksum_local) diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index 2c184c89..54034e7f 100644 --- 
a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -57,7 +57,8 @@ def copy_in(files, **kwargs): if not check_for_gfal(): raise StageInFailure("No GFAL2 tools found") - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 68e50b5c..ddaa68d6 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -6,6 +6,7 @@ # # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021 +# - Shuwei import os import logging @@ -73,11 +74,11 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # http_access = rprotocols["http_access"] # os.environ['GTAG'] = http_access + os.path.join(remote_path, config.Pilot.pilotlog) # logger.debug('http_access=%s' % http_access) - # except Exception as e: + # except Exception: # logger.warning("Failed in get 'http_access' in ddm.rprotocols") surl = protocol.get('endpoint', '') + remote_path - logger.info('For GCS bucket, set surl=%s' % surl) + logger.info('For GCS bucket, set surl=%s', surl) # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -97,7 +98,7 @@ def copy_in(files, **kwargs): dst = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(dst, fspec.lfn) - logger.info('downloading surl=%s to local file %s' % (fspec.surl, path)) + logger.info('downloading surl=%s to local file %s', fspec.surl, path) status, diagnostics = download_file(path, fspec.surl, object_name=fspec.lfn) if not status: ## an error occurred @@ -131,8 +132,8 @@ def download_file(path, surl, object_name=None): target = pathlib.Path(object_name) with target.open(mode="wb") as downloaded_file: client.download_blob_to_file(surl, downloaded_file) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -150,7 +151,7 @@ def copy_out(files, **kwargs): workdir = kwargs.pop('workdir') for fspec in files: - logger.info('Going to process fspec.turl=%s' % fspec.turl) + logger.info('Going to process fspec.turl=%s', fspec.turl) import re # bucket = re.sub(r'gs://(.*?)/.*', r'\1', fspec.turl) @@ -164,7 +165,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, logfile) if os.path.exists(path): object_name = os.path.join(remote_path, logfile) - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, object_name)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, object_name) status, diagnostics = upload_file(path, bucket, object_name=object_name) if not status: ## an error occurred @@ -204,15 +205,15 @@ def upload_file(file_name, bucket, object_name=None): try: client = storage.Client() gs_bucket = client.get_bucket(bucket) - logger.info('uploading a file to bucket=%s in full path=%s' % (bucket, object_name)) + logger.info('uploading a file to bucket=%s in full path=%s', bucket, object_name) blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name) if file_name.endswith(config.Pilot.pilotlog): url_pilotlog = blob.public_url os.environ['GTAG'] = url_pilotlog - logger.debug("Set envvar GTAG 
with the pilotLot URL=%s" % url_pilotlog) - except Exception as e: - diagnostics = 'exception caught in gs client: %s' % e + logger.debug("Set envvar GTAG with the pilotLot URL=%s", url_pilotlog) + except Exception as error: + diagnostics = 'exception caught in gs client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 8f63cd46..67d8b791 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -7,7 +7,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import os import logging @@ -75,7 +75,9 @@ def copy_in(files, **kwargs): copysetup = get_copysetup(copytools, 'lsm') trace_report = kwargs.get('trace_report') #allow_direct_access = kwargs.get('allow_direct_access') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report @@ -99,17 +101,16 @@ def copy_in(files, **kwargs): source = fspec.turl destination = os.path.join(dst, fspec.lfn) - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) exit_code, stdout, stderr = move(source, destination, dst_in=True, copysetup=copysetup) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) error = resolve_common_transfer_errors(stderr, is_stagein=True) fspec.status = 'failed' fspec.status_code = error.get('rcode') - logger.warning('error=%d' % error.get('rcode')) trace_report.update(clientState=error.get('state') or 'STAGEIN_ATTEMPT_FAILED', stateReason=error.get('error'), timeEnd=time()) trace_report.send() @@ -186,7 +187,7 @@ def copy_out(files, **kwargs): except Exception: opts = " ".join(["%s %s" % (k, v) for (k, v) in list(opts.items())]) # Python 3 - logger.info("transferring file %s from %s to %s" % (fspec.lfn, source, destination)) + logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) nretries = 1 # input parameter to function? 
for retry in range(nretries): @@ -246,7 +247,7 @@ def move_all_files_in(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) source = entry['source'] + '/' + entry['name'] destination = os.path.join(entry['destination'], entry['name']) @@ -255,7 +256,7 @@ def move_all_files_in(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -276,7 +277,7 @@ def move_all_files_out(files, nretries=1): stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) destination = entry['destination'] + '/' + entry['name'] source = os.path.join(entry['source'], entry['name']) @@ -285,7 +286,7 @@ def move_all_files_out(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) return exit_code, stdout, stderr else: # all successful break @@ -321,16 +322,16 @@ def move(source, destination, dst_in=True, copysetup="", options=None): try: exit_code, stdout, stderr = execute(cmd, usecontainer=False, copytool=True) #, timeout=get_timeout(fspec.filesize)) - except Exception as e: + except Exception as error: if dst_in: exit_code = ErrorCodes.STAGEINFAILED else: exit_code = ErrorCodes.STAGEOUTFAILED - stdout = 'exception caught: e' % e + stdout = 'exception caught: e' % error stderr = '' logger.warning(stdout) - logger.info('exit_code=%d, stdout=%s, stderr=%s' % (exit_code, stdout, stderr)) + logger.info('exit_code=%d, stdout=%s, stderr=%s', exit_code, stdout, stderr) return exit_code, stdout, stderr diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 73093a92..3ff42143 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 # - David Cameron, david.cameron@cern.ch, 2018-2019 @@ -48,12 +48,12 @@ def create_output_list(files, init_dir, ddmconf): # resolve token value from fspec.ddmendpoint token = ddmconf.get(fspec.ddmendpoint).token if not token: - logger.info('No space token info for %s' % fspec.ddmendpoint) + logger.info('No space token info for %s', fspec.ddmendpoint) else: arcturl = re.sub(r'((:\d+)/)', r'\2;autodir=no;spacetoken=%s/' % token, arcturl) arcturl += ':checksumtype=%s:checksumvalue=%s' % (checksumtype, checksum) - logger.info('Adding to output.list: %s %s' % (fspec.lfn, arcturl)) + logger.info('Adding to output.list: %s %s', fspec.lfn, arcturl) # 
Write output.list with open(os.path.join(init_dir, 'output.list'), 'a') as f: f.write('%s %s\n' % (fspec.lfn, arcturl)) @@ -124,7 +124,7 @@ def copy_out(files, copy_type="mv", **kwargs): raise StageOutFailure(stdout) # Create output list for ARC CE if necessary - logger.debug('init_dir for output.list=%s' % os.path.dirname(kwargs.get('workdir'))) + logger.debug('init_dir for output.list=%s', os.path.dirname(kwargs.get('workdir'))) output_dir = kwargs.get('output_dir', '') if not output_dir: create_output_list(files, os.path.dirname(kwargs.get('workdir')), kwargs.get('ddmconf', None)) @@ -168,11 +168,11 @@ def move_all_files(files, copy_type, workdir): # resolve canonical path source = os.path.realpath(source) - logger.info("transferring file %s from %s to %s" % (name, source, destination)) + logger.info("transferring file %s from %s to %s", name, source, destination) exit_code, stdout, stderr = copy_method(source, destination) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) fspec.status = 'failed' if fspec.filetype == 'input': fspec.status_code = ErrorCodes.STAGEINFAILED diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index a8ccb38d..e13c20e4 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -7,7 +7,7 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2021 import os import json @@ -73,7 +73,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # :return: protocol as dictionary # """ # -# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)" % (fspec.lfn, fspec.ddmendpoint, activity)) +# logger.info("Resolving protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s)", fspec.lfn, fspec.ddmendpoint, activity) # # activity = get_ddm_activity(activity) # protocols = ddm.arprotocols.get(activity) @@ -87,7 +87,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): # logger.error(err) # raise PilotException(err) # protocol = protocols_allow[0] -# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s" % (fspec.lfn, fspec.ddmendpoint, activity, protocol)) +# logger.info("Resolved protocol for file(lfn: %s, ddmendpoint: %s) with activity(%s): %s", fspec.lfn, fspec.ddmendpoint, activity, protocol) # return protocol @@ -109,7 +109,7 @@ def copy_in(files, **kwargs): for fspec in files: cmd = [] - logger.info("To transfer file: %s" % fspec) + logger.info("To transfer file: %s", fspec) if fspec.protocol_id: ddm = ddmconf.get(fspec.ddmendpoint) if ddm: @@ -212,7 +212,7 @@ def copy_out(files, **kwargs): cwd = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(cwd, 'rucio_upload.json') if not os.path.exists(path): - logger.error('Failed to resolve Rucio summary JSON, wrong path? file=%s' % path) + logger.error('Failed to resolve Rucio summary JSON, wrong path? 
file=%s', path) else: with open(path, 'rb') as f: summary = json.load(f) diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 626821eb..675688e5 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -48,7 +48,7 @@ def verify_stage_out(fspec): from rucio.rse import rsemanager as rsemgr rse_settings = rsemgr.get_rse_info(fspec.ddmendpoint) uploaded_file = {'name': fspec.lfn, 'scope': fspec.scope} - logger.info('Checking file: %s' % str(fspec.lfn)) + logger.info('Checking file: %s', str(fspec.lfn)) return rsemgr.exists(rse_settings, [uploaded_file]) @@ -66,15 +66,16 @@ def copy_in(files, **kwargs): trace_report = kwargs.get('trace_report') use_pcache = kwargs.get('use_pcache') #job = kwargs.get('job') - #use_pcache = job.infosys.queuedata.use_pcache if job else False - logger.debug('use_pcache=%s' % use_pcache) # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + logger.debug('RUCIO_LOCAL_SITE_ID=%s', os.environ.get('RUCIO_LOCAL_SITE_ID', '')) + logger.debug('trace_report[localSite]=%s', trace_report.get_value('localSite')) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: - logger.info('rucio copytool, downloading file with scope:%s lfn:%s' % (str(fspec.scope), str(fspec.lfn))) + logger.info('rucio copytool, downloading file with scope:%s lfn:%s', str(fspec.scope), str(fspec.lfn)) # update the trace report localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) @@ -152,7 +153,7 @@ def copy_in(files, **kwargs): def get_protocol(trace_report_out): """ - Extract the protocol used for the transdfer from the dictionary returned by rucio. + Extract the protocol used for the transfer from the dictionary returned by rucio. :param trace_report_out: returned rucio transfer dictionary (dictionary). :return: protocol (string). 
@@ -160,8 +161,8 @@ def get_protocol(trace_report_out): try: p = trace_report_out[0].get('protocol') - except Exception as e: - logger.warning('exception caught: %s' % e) + except Exception as error: + logger.warning('exception caught: %s' % error) p = '' return p @@ -481,21 +482,21 @@ def _stage_in_api(dst, fspec, trace_report, trace_report_out, transfer_timeout, result = download_client.download_pfns([f], 1, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) else: result = download_client.download_dids([f], trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) return ec, trace_report_out @@ -552,18 +553,18 @@ def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None): logger.info('*** rucio API downloading files (taking over logging) ***') try: result = download_client.download_pfns(file_list, num_threads, trace_custom_fields=trace_pattern, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API download client failed ***') - logger.warning('caught exception: %s' % e) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.warning('caught exception: %s', error) + logger.debug('trace_report_out=%s', trace_report_out) # only raise an exception if the error info cannot be extracted if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error else: logger.info('*** rucio API download client finished ***') - logger.debug('client returned %s' % result) + logger.debug('client returned %s', result) def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout): @@ -607,31 +608,31 @@ def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, tra logger.debug('summary_file_path=%s' % summary_file_path) logger.debug('trace_report_out=%s' % trace_report_out) result = upload_client.upload([f], summary_file_path=summary_file_path, traces_copy_out=trace_report_out) - except Exception as e: + except Exception as error: logger.warning('*** rucio API upload client failed ***') - logger.warning('caught exception: %s' % e) + logger.warning('caught exception: %s', error) import traceback logger.error(traceback.format_exc()) - logger.debug('trace_report_out=%s' % trace_report_out) + logger.debug('trace_report_out=%s', trace_report_out) if not trace_report_out: - raise e + raise error if not trace_report_out[0].get('stateReason'): - raise e + raise error ec = -1 except UnboundLocalError: logger.warning('*** rucio API upload client failed ***') logger.warning('rucio still needs a bug fix of the summary in the uploadclient') else: logger.warning('*** rucio API upload client finished ***') - logger.debug('client 
returned %s' % result) + logger.debug('client returned %s', result) try: file_exists = verify_stage_out(fspec) logger.info('file exists at the storage: %s' % str(file_exists)) if not file_exists: raise StageOutFailure('physical check after upload failed') - except Exception as e: - msg = 'file existence verification failed with: %s' % e + except Exception as error: + msg = 'file existence verification failed with: %s' % error logger.info(msg) raise StageOutFailure(msg) diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 365f49cb..a0a480bc 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -81,7 +81,7 @@ def copy_in(files, **kwargs): bucket = 'bucket' # UPDATE ME path = os.path.join(dst, fspec.lfn) - logger.info('downloading object %s from bucket=%s to local file %s' % (fspec.lfn, bucket, path)) + logger.info('downloading object %s from bucket=%s to local file %s', fspec.lfn, bucket, path) status, diagnostics = download_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -113,12 +113,12 @@ def download_file(path, bucket, object_name=None): try: s3 = boto3.client('s3') s3.download_file(bucket, object_name, path) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics @@ -140,7 +140,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, fspec.lfn) if os.path.exists(path): bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to bucket=%s using object name=%s' % (path, bucket, fspec.lfn)) + logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, fspec.lfn) status, diagnostics = upload_file(path, bucket, object_name=fspec.lfn) if not status: ## an error occurred @@ -181,12 +181,12 @@ def upload_file(file_name, bucket, object_name=None): s3_client = boto3.client('s3') #response = s3_client.upload_file(file_name, bucket, object_name) s3_client.upload_file(file_name, bucket, object_name) - except ClientError as e: - diagnostics = 'S3 ClientError: %s' % e + except ClientError as error: + diagnostics = 'S3 ClientError: %s' % error logger.critical(diagnostics) return False, diagnostics - except Exception as e: - diagnostics = 'exception caught in s3_client: %s' % e + except Exception as error: + diagnostics = 'exception caught in s3_client: %s' % error logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index 9eafbfc5..bfcd2f75 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -6,7 +6,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # Reimplemented by Alexey Anisenkov @@ -42,28 +42,28 @@ def _resolve_checksum_option(setup, **kwargs): if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to check xrdcp client version" % cmd) + logger.info("Execute command (%s) to check xrdcp client version", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info("return code: %s" % rcode) - logger.info("return output: %s" % (stdout + stderr)) + logger.info("return code: %s", rcode) + logger.info("return output: %s", stdout + stderr) 
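    # A note on the logging conversions in this hunk and throughout the pylint patch: the
    # preferred "lazy" form hands the arguments to the logger,
    #     logger.info("return code: %s", rcode)
    # instead of pre-formatting the message with %,
    #     logger.info("return code: %s" % rcode)
    # so the string is only interpolated when the record is actually emitted (this is the
    # behaviour pylint's logging-not-lazy check enforces; the example lines are taken from
    # the surrounding diff).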
cmd = "%s -h" % copy_command if setup: cmd = "source %s; %s" % (setup, cmd) - logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum.." % cmd) + logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum..", cmd) rcode, stdout, stderr = execute(cmd, **kwargs) output = stdout + stderr - logger.info("return code: %s" % rcode) - logger.debug("return output: %s" % output) + logger.info("return code: %s", rcode) + logger.debug("return output: %s", output) coption = "" checksum_type = 'adler32' ## consider only adler32 for now if rcode: - logger.error('FAILED to execute command=%s: %s' % (cmd, output)) + logger.error('FAILED to execute command=%s: %s', cmd, output) else: if "--cksum" in output: coption = "--cksum %s:print" % checksum_type @@ -73,7 +73,7 @@ def _resolve_checksum_option(setup, **kwargs): coption = "-md5" if coption: - logger.info("Use %s option to get the checksum for %s command" % (coption, copy_command)) + logger.info("Use %s option to get the checksum for %s command", coption, copy_command) return coption @@ -96,7 +96,7 @@ def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, * #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout)) rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info('rcode=%d, stdout=%s, stderr=%s' % (rcode, stdout, stderr)) + logger.info('rcode=%d, stdout=%s, stderr=%s', rcode, stdout, stderr) if rcode: ## error occurred error = resolve_common_transfer_errors(stdout + stderr, is_stagein=is_stagein) @@ -138,7 +138,8 @@ def copy_in(files, **kwargs): coption = _resolve_checksum_option(setup, **kwargs) trace_report = kwargs.get('trace_report') - localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) + # note, env vars might be unknown inside middleware contrainers, if so get the value already in the trace report + localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite')) for fspec in files: # update the trace report localsite = localsite if localsite else fspec.ddmendpoint @@ -243,7 +244,7 @@ def get_file_info_from_output(output): return None, None, None if not ("xrootd" in output or "XRootD" in output or "adler32" in output): - logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s" % output) + logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s", output) return None, None, None pattern = r"(?Pmd5|adler32):\ (?P[a-zA-Z0-9]+)\ \S+\ (?P[0-9]+)" # Python 3 (added r) @@ -258,10 +259,10 @@ def get_file_info_from_output(output): if filesize: try: filesize = int(filesize) - except ValueError as e: - logger.warning('failed to convert filesize to int: %s' % e) + except ValueError as error: + logger.warning('failed to convert filesize to int: %s', error) filesize = None else: - logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s" % (pattern, output)) + logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s", pattern, output) return filesize, checksum, checksum_type diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index e9820e8b..23314ae6 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -592,3 +592,47 @@ def get_pid_from_command(cmd, pattern=r'gdb --pid (\d+)'): print('no match for pattern \'%s\' in command=\'%s\'' % (pattern, cmd)) return pid + + +def list_hardware(): + """ + Execute lshw to list local hardware. 
+ + :return: lshw output (string). + """ + + exit_code, stdout, stderr = execute('lshw -numeric -C display', mute=True) + if 'Command not found' in stdout or 'Command not found' in stderr: + stdout = '' + return stdout + + +def get_display_info(): + """ + Extract the product and vendor from the lshw command. + E.g. + product: GD 5446 [1013:B8] + vendor: Cirrus Logic [1013] + -> GD 5446, Cirrus Logic + + :return: product (string), vendor (string). + """ + + vendor = '' + product = '' + stdout = list_hardware() + if stdout: + vendor_pattern = re.compile(r'vendor\:\ (.+)\ .') + product_pattern = re.compile(r'product\:\ (.+)\ .') + + for line in stdout.split('\n'): + if 'vendor' in line: + result = re.findall(vendor_pattern, line) + if result: + vendor = result[0] + elif 'product' in line: + result = re.findall(product_pattern, line) + if result: + product = result[0] + + return product, vendor diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9eaf93ef..667d188b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '44' # build number should be reset to '1' for every new development cycle +BUILD = '46' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py index 9f2f8c09..717d17f5 100644 --- a/pilot/util/tracereport.py +++ b/pilot/util/tracereport.py @@ -102,6 +102,13 @@ def init(self, job): exit_code, stdout, stderr = execute(cmd) self['uuid'] = stdout.replace('-', '') + def get_value(self, key): + """ + + """ + + return self.get(key, None) + def verify_trace(self): """ Verify the trace consistency. 
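The new get_value() accessor added to the trace report above is what the copytool changes in this patch series rely on: RUCIO_LOCAL_SITE_ID may be undefined inside middleware containers, so copy_in() falls back to the localSite value already carried by the trace report. A minimal sketch of the lookup pattern used in gfal.py, lsm.py, rucio.py and xrdcp.py (a fragment only: trace_report and fspec are the objects already passed to copy_in, and it assumes localSite was stored in the trace report earlier in the workflow):

    import os

    # the environment variable wins when defined, otherwise reuse the value from the trace report
    localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', trace_report.get_value('localSite'))
    # per-file fallback to the ddm endpoint if neither source provided a value
    localsite = localsite if localsite else fspec.ddmendpoint
    trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize)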
diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index f23bb86f..e15ade4f 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -5,13 +5,13 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 import os import re -from pilot.util.disk import disk_usage from pilot.info import infosys +from pilot.util.disk import disk_usage import logging logger = logging.getLogger(__name__) @@ -36,8 +36,8 @@ def get_local_disk_space(path): if not diskpipe.close(): try: disk = float(disks.splitlines()[1].split()[3]) - except ValueError as e: - logger.warning('exception caught while trying to convert disk info: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert disk info: %s', error) return disk @@ -56,8 +56,8 @@ def get_meminfo(): if mems.upper().find("MEMTOTAL") != -1: try: mem = float(mems.split()[1]) / 1024 # value listed by command as kB, convert to MB - except ValueError as e: - logger.warning('exception caught while trying to convert meminfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert meminfo: %s', error) break mems = fd.readline() @@ -78,8 +78,8 @@ def get_cpuinfo(): if line.find("cpu MHz") != -1: # Python 2/3 try: cpu = float(line.split(":")[1]) - except ValueError as e: - logger.warning('exception caught while trying to convert cpuinfo: %s' % e) + except ValueError as error: + logger.warning('exception caught while trying to convert cpuinfo: %s', error) break # command info is the same for all cores, so break here return cpu @@ -114,21 +114,21 @@ def get_disk_space(queuedata): # --- non Job related queue data # jobinfo provider is required to consider overwriteAGIS data coming from Job _maxinputsize = infosys.queuedata.maxwdir - logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value from global infosys.queuedata instance: infosys.queuedata.maxwdir=%s B", _maxinputsize) _maxinputsize = queuedata.maxwdir - logger.debug("resolved value: queuedata.maxwdir=%s B" % _maxinputsize) + logger.debug("resolved value: queuedata.maxwdir=%s B", _maxinputsize) try: du = disk_usage(os.path.abspath(".")) _diskspace = int(du[2] / (1024 * 1024)) # need to convert from B to MB - except ValueError as e: - logger.warning("failed to extract disk space: %s (will use schedconfig default)" % e) + except ValueError as error: + logger.warning("failed to extract disk space: %s (will use schedconfig default)", error) _diskspace = _maxinputsize else: - logger.info("available WN disk space: %d MB" % (_diskspace)) + logger.info("available WN disk space: %d MB", _diskspace) _diskspace = min(_diskspace, _maxinputsize) - logger.info("sending disk space %d MB to dispatcher" % (_diskspace)) + logger.info("sending disk space %d MB to dispatcher", _diskspace) return _diskspace @@ -226,5 +226,3 @@ def check_hz(): import traceback logger.fatal('failed to read SC_CLK_TCK - will not be able to perform CPU consumption calculation') logger.warning(traceback.format_exc()) - else: - logger.debug('SC_CLK_TCK=%s' % str(hz)) diff --git a/pilot/workflow/generic_hpc.py b/pilot/workflow/generic_hpc.py index 8b567599..caf3309f 100644 --- a/pilot/workflow/generic_hpc.py +++ b/pilot/workflow/generic_hpc.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016 -# - Paul Nilsson, paul.nilsson@cern.ch, 
2018-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 # - Danila Oleynik danila.oleynik@cern.ch, 2018 import functools @@ -48,9 +48,9 @@ def interrupt(args, signum, frame): """ try: - logger.info('caught signal: %s' % [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 + logger.info('caught signal: %s', [v for v, k in signal.__dict__.iteritems() if k == signum][0]) # Python 2 except Exception: - logger.info('caught signal: %s' % [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 + logger.info('caught signal: %s', [v for v, k in list(signal.__dict__.items()) if k == signum][0]) # Python 3 args.graceful_stop.set() @@ -212,11 +212,11 @@ def run(args): logger.debug("Final report: {0}".format(work_report)) add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args) - except Exception as e: + except Exception as error: work_report["jobStatus"] = "failed" - work_report["exitMsg"] = str(e) + work_report["exitMsg"] = str(error) publish_work_report(work_report, worker_attributes_file) - logging.exception('exception caught:') + logging.exception('exception caught: %s', error) traces.pilot['state'] = FAILURE return traces From a77cd07f1dc162c0062a494cc1b041d54d916974 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 11:36:59 +0200 Subject: [PATCH 73/96] Pylint corrections. Fixes for localSite problem in traces --- pilot/control/job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 449af861..77ea47ab 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1974,7 +1974,7 @@ def has_job_completed(queues, args): make_job_report(job) cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') logger.debug('%s:\n', cmd) - ec, stdout, stderr = execute(cmd) + _, stdout, _ = execute(cmd) logger.debug(stdout) queue_report(queues) @@ -2375,7 +2375,7 @@ def interceptor(queues, traces, args): # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = queues.monitored_payloads.queue if jobs: - for i in range(len(jobs)): + for _ in range(len(jobs)): logger.info('interceptor loop %d: looking for communication file', n) time.sleep(30) From cb0adac8757468d0e5a6983f4931acf1058485d9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 13:25:05 +0200 Subject: [PATCH 74/96] Pylint corrections --- pilot/control/data.py | 133 ++++++++++++------------- pilot/control/job.py | 35 +++---- pilot/control/monitor.py | 29 +++--- pilot/control/payload.py | 37 ++++--- pilot/control/payloads/eventservice.py | 6 +- pilot/control/payloads/generic.py | 30 +++--- 6 files changed, 133 insertions(+), 137 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 6f820d33..20151b92 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -271,7 +271,7 @@ def get_rse(data, lfn=""): return rse -def stage_in_auto(site, files): +def stage_in_auto(files): """ Separate dummy implementation for automatic stage-in outside of pilot workflows. 
Should be merged with regular stage-in functionality later, but we need to have @@ -289,47 +289,47 @@ def stage_in_auto(site, files): '--no-subdir'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['destination']): - f['status'] = 'failed' - f['errmsg'] = 'Destination directory does not exist: %s' % f['destination'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['destination']): + _file['status'] = 'failed' + _file['errmsg'] = 'Destination directory does not exist: %s' % _file['destination'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully downloaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully downloaded.' + _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--dir', f['destination']] - tmp_executable.append('%s:%s' % (f['scope'], - f['name'])) + tmp_executable += ['--dir', _file['destination']] + tmp_executable.append('%s:%s' % (_file['scope'], + _file['name'])) process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - f['errno'] = 2 + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully downloaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully downloaded.' else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -337,7 +337,7 @@ def stage_in_auto(site, files): return files -def stage_out_auto(site, files): +def stage_out_auto(files): """ Separate dummy implementation for automatic stage-out outside of pilot workflows. Should be merged with regular stage-out functionality later, but we need to have @@ -351,63 +351,60 @@ def stage_out_auto(site, files): 'rucio', '-v', 'upload'] # quickly remove non-existing destinations - for f in files: - if not os.path.exists(f['file']): - f['status'] = 'failed' - f['errmsg'] = 'Source file does not exist: %s' % f['file'] - f['errno'] = 1 + for _file in files: + if not os.path.exists(_file['file']): + _file['status'] = 'failed' + _file['errmsg'] = 'Source file does not exist: %s' % _file['file'] + _file['errno'] = 1 else: - f['status'] = 'running' - f['errmsg'] = 'File not yet successfully uploaded.' - f['errno'] = 2 + _file['status'] = 'running' + _file['errmsg'] = 'File not yet successfully uploaded.' 
+ _file['errno'] = 2 - for f in files: - if f['errno'] == 1: + for _file in files: + if _file['errno'] == 1: continue tmp_executable = objectcopy.deepcopy(executable) - tmp_executable += ['--rse', f['rse']] + tmp_executable += ['--rse', _file['rse']] - if 'no_register' in list(f.keys()) and f['no_register']: # Python 2/3 + if 'no_register' in list(_file.keys()) and _file['no_register']: # Python 2/3 tmp_executable += ['--no-register'] - if 'summary' in list(f.keys()) and f['summary']: # Python 2/3 + if 'summary' in list(_file.keys()) and _file['summary']: # Python 2/3 tmp_executable += ['--summary'] - if 'lifetime' in list(f.keys()): # Python 2/3 - tmp_executable += ['--lifetime', str(f['lifetime'])] + if 'lifetime' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--lifetime', str(_file['lifetime'])] - if 'guid' in list(f.keys()): # Python 2/3 - tmp_executable += ['--guid', f['guid']] + if 'guid' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--guid', _file['guid']] - if 'attach' in list(f.keys()): # Python 2/3 - tmp_executable += ['--scope', f['scope'], '%s:%s' % (f['attach']['scope'], f['attach']['name']), f['file']] + if 'attach' in list(_file.keys()): # Python 2/3 + tmp_executable += ['--scope', _file['scope'], '%s:%s' % (_file['attach']['scope'], _file['attach']['name']), _file['file']] else: - tmp_executable += ['--scope', f['scope'], f['file']] + tmp_executable += ['--scope', _file['scope'], _file['file']] - process = subprocess.Popen(tmp_executable, - bufsize=-1, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - f['errno'] = 2 + process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _file['errno'] = 2 while True: time.sleep(0.5) exit_code = process.poll() if exit_code is not None: - stdout, stderr = process.communicate() + _, stderr = process.communicate() if exit_code == 0: - f['status'] = 'done' - f['errno'] = 0 - f['errmsg'] = 'File successfully uploaded.' + _file['status'] = 'done' + _file['errno'] = 0 + _file['errmsg'] = 'File successfully uploaded.' 
else: - f['status'] = 'failed' - f['errno'] = 3 + _file['status'] = 'failed' + _file['errno'] = 3 try: # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - f['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as e: - f['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % str(e) + _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] + except Exception as error: + _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error break else: continue @@ -478,16 +475,16 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') logger.debug('[before xcache start] stdout=%s', _stdout) logger.debug('[before xcache start] stderr=%s', _stderr) - exit_code, stdout, stderr = execute(cmd.get('command')) + _, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s', stdout) logger.debug('stderr=%s', stderr) # xcache debug - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') logger.debug('[after xcache start] stdout=%s', _stdout) logger.debug('[after xcache start] stderr=%s', _stderr) @@ -711,7 +708,7 @@ def filter_files_for_log(directory): """ filtered_files = [] maxfilesize = 10 - for root, dirnames, filenames in os.walk(directory): + for root, _, filenames in os.walk(directory): for filename in filenames: location = os.path.join(root, filename) if os.path.exists(location): # do not include broken links @@ -752,8 +749,8 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out user.remove_redundant_files(workdir, islooping=is_looping, debugmode=debugmode) # remove any present input/output files before tarring up workdir - for f in input_files + output_files: - path = os.path.join(workdir, f) + for fname in input_files + output_files: + path = os.path.join(workdir, fname) if os.path.exists(path): logger.info('removing file: %s', path) remove(path) @@ -768,9 +765,9 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out logger.info('will create archive %s', fullpath) try: cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" 
% (fullpath, tarball_name) - exit_code, stdout, stderr = execute(cmd) - except Exception as e: - raise LogFileCreationFailure(e) + _, stdout, _ = execute(cmd) + except Exception as error: + raise LogFileCreationFailure(error) else: if pilot_home != current_dir: os.chdir(pilot_home) diff --git a/pilot/control/job.py b/pilot/control/job.py index 77ea47ab..cf6b8394 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -17,6 +17,7 @@ import hashlib import random import socket +import logging try: import Queue as queue # noqa: N813 @@ -56,9 +57,7 @@ from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() @@ -351,7 +350,6 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) except Exception as error: logger.warning('exception caught while sending https request: %s', error) logger.warning('possibly offending data: %s', data) - pass if final: os.environ['SERVER_UPDATE'] = SERVER_UPDATE_TROUBLE @@ -723,7 +721,7 @@ def get_debug_stdout(job): return get_general_command_stdout(job) else: # general command, execute and return output - exit_code, stdout, stderr = execute(job.debug_command) + _, stdout, _ = execute(job.debug_command) logger.info('debug_command: %s:\n\n%s\n', job.debug_command, stdout) return stdout @@ -756,7 +754,7 @@ def get_general_command_stdout(job): except Exception as error: logger.warning('general containerisation threw an exception: %s', error) else: - ec, stdout, stderr = execute(job.debug_command) + _, stdout, stderr = execute(job.debug_command) logger.debug("%s (stdout):\n\n%s\n\n", job.debug_command, stdout) logger.debug("%s (stderr):\n\n%s\n\n", job.debug_command, stderr) @@ -790,7 +788,7 @@ def get_ls(debug_command, workdir): finalpath = os.path.join(workdir, path) debug_command = debug_command.replace(path, finalpath) - ec, stdout, stderr = execute(debug_command) + _, stdout, _ = execute(debug_command) logger.debug("%s:\n\n%s\n\n", debug_command, stdout) return stdout @@ -933,7 +931,6 @@ def add_memory_info(data, workdir, name=""): data.update(utility_node) except Exception as error: logger.info('memory information not available: %s', error) - pass def remove_pilot_logs_from_list(list_of_files): @@ -1142,8 +1139,8 @@ def delayed_space_check(queues, traces, args, job): proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False if proceed_with_local_space_check: logger.debug('pilot will now perform delayed space check') - ec, diagnostics = check_local_space() - if ec != 0: + exit_code, diagnostics = check_local_space() + if exit_code != 0: traces.pilot['error_code'] = errors.NOLOCALSPACE # set the corresponding error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOLOCALSPACE, msg=diagnostics) @@ -1398,8 +1395,8 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # pilot can report the error with a server update) proceed_with_local_space_check = False if (submitmode.lower() == 'push' and update_server) else True if proceed_with_local_space_check: - ec, diagnostics = check_local_space() - if ec != 0: + exit_code, diagnostics = check_local_space() + if exit_code != 0: traces.pilot['error_code'] = errors.NOLOCALSPACE return False else: @@ -1515,8 +1512,8 @@ def 
get_job_definition_from_file(path, harvester): datalist = parse_qsl(response, keep_blank_values=True) # convert to dictionary - for d in datalist: - res[d[0]] = d[1] + for data in datalist: + res[data[0]] = data[1] if os.path.exists(path): remove(path) @@ -1716,11 +1713,11 @@ def get_fake_job(input=True): 'destinationDblock': job_name, 'dispatchDBlockToken': 'NULL', 'jobPars': '-a sources.20115461.derivation.tgz -r ./ -j "Reco_tf.py ' - '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' - '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' - ' --useLocalIO --accessmode=copy -o ' - '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' - '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), + '--inputAODFile AOD.07709524._000050.pool.root.1 --outputDAODFile test.pool.root ' + '--reductionConf HIGG3D1" -i "[\'AOD.07709524._000050.pool.root.1\']" -m "[]" -n "[]" --trf' + ' --useLocalIO --accessmode=copy -o ' + '"{\'IROOT\': [(\'DAOD_HIGG3D1.test.pool.root\', \'%s.root\')]}" ' + '--sourceURL https://aipanda012.cern.ch:25443' % (job_name), 'attemptNr': '0', 'swRelease': 'Atlas-20.7.6', 'nucleus': 'NULL', @@ -1845,7 +1842,7 @@ def retrieve(queues, traces, args): # noqa: C901 delay = get_job_retrieval_delay(args.harvester) if not args.harvester: logger.warning('did not get a job -- sleep %d s and repeat', delay) - for i in range(delay): + for _ in range(delay): if args.graceful_stop.is_set(): break time.sleep(1) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 390776ac..99ba180b 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -41,15 +41,15 @@ def control(queues, traces, args): :return: """ - t0 = time.time() - traces.pilot['lifetime_start'] = t0 # ie referring to when pilot monitoring began - traces.pilot['lifetime_max'] = t0 + t_0 = time.time() + traces.pilot['lifetime_start'] = t_0 # ie referring to when pilot monitoring began + traces.pilot['lifetime_max'] = t_0 threadchecktime = int(config.Pilot.thread_check) # for CPU usage debugging cpuchecktime = int(config.Pilot.cpu_check) - tcpu = t0 + tcpu = t_0 queuedata = get_queuedata_from_job(queues) max_running_time = get_max_running_time(args.lifetime, queuedata) @@ -74,8 +74,7 @@ def control(queues, traces, args): time_since_start = get_time_since_start(args) grace_time = 10 * 60 if time_since_start - grace_time > max_running_time: - logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' % - (max_running_time, grace_time)) + logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot', max_running_time, grace_time) logger.info('setting REACHED_MAXTIME and graceful stop') environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME' # TODO: use singleton instead # do not set graceful stop if pilot has not finished sending the final job update @@ -109,7 +108,7 @@ def control(queues, traces, args): if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0: # get all threads for thread in threading.enumerate(): - # logger.info('thread name: %s' % thread.name) + # logger.info('thread name: %s', thread.name) if not thread.is_alive(): logger.fatal('thread \'%s\' is not alive', thread.name) # args.graceful_stop.set() @@ -150,14 +149,14 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): """ processes = [] - n = 0 + num = 0 if not user: user = getuid() pattern = re.compile(r"\S+|[-+]?\d*\.\d+|\d+") arguments = ['ps', '-u', 
user, args, '--no-headers'] process = Popen(arguments, stdout=PIPE, stderr=PIPE) - stdout, notused = process.communicate() + stdout, _ = process.communicate() for line in stdout.splitlines(): found = re.findall(pattern, line) if found is not None: @@ -166,12 +165,12 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): mem = found[3] command = ' '.join(found[10:]) if cmd in command: - n += 1 + num += 1 if processid == str(pid): processes = [cpu, mem, command] if processes: - processes.append(n) + processes.append(num) return processes @@ -194,8 +193,8 @@ def run_checks(queues, args): t_max = 2 * 60 logger.warning('pilot monitor received instruction that abort_job has been requested') logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) - t0 = time.time() - while time.time() - t0 < t_max: + t_0 = time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() @@ -211,8 +210,8 @@ def run_checks(queues, args): if not args.job_aborted.is_set(): logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) t_max = 10 - t0 = time.time() - while time.time() - t0 < t_max: + t_0 = time.time() + while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') args.abort_job.clear() diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 33029f0c..b51063df 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -203,7 +203,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid] if len(peek) == 0: put_in_queue(job, queues.validated_payloads) - for i in range(10): # Python 3 + for _ in range(10): # Python 3 if args.graceful_stop.is_set(): break time.sleep(1) @@ -329,8 +329,7 @@ def set_cpu_consumption_time(job): job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconsumptionunit = "s" job.cpuconversionfactor = 1.0 - logger.info('CPU consumption time: %f %s (rounded to %d %s)' % - (cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit)) + logger.info('CPU consumption time: %f %s (rounded to %d %s)', cpuconsumptiontime, job.cpuconsumptionunit, job.cpuconsumptiontime, job.cpuconsumptionunit) def perform_initial_payload_error_analysis(job, exit_code): @@ -345,7 +344,7 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" - ec = 0 + exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": @@ -358,14 +357,14 @@ def perform_initial_payload_error_analysis(job, exit_code): fatal = True if msg != "": logger.warning("extracted message from stderr:\n%s", msg) - ec = set_error_code_from_stderr(msg, fatal) + exit_code = set_error_code_from_stderr(msg, fatal) - if not ec: - ec = errors.resolve_transform_error(exit_code, stderr) - if ec != 0: + if not exit_code: + exit_code = errors.resolve_transform_error(exit_code, stderr) + if exit_code != 0: if msg: - msg = errors.format_diagnostics(ec, msg) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=msg) + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) else: if job.piloterrorcodes: 
logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) @@ -390,23 +389,23 @@ def set_error_code_from_stderr(msg, fatal): """ if "Failed invoking the NEWUSER namespace runtime" in msg: - ec = errors.SINGULARITYNEWUSERNAMESPACE + exit_code = errors.SINGULARITYNEWUSERNAMESPACE elif "Failed to create user namespace" in msg: - ec = errors.SINGULARITYFAILEDUSERNAMESPACE + exit_code = errors.SINGULARITYFAILEDUSERNAMESPACE elif "command not found" in msg: - ec = errors.TRANSFORMNOTFOUND + exit_code = errors.TRANSFORMNOTFOUND elif "SL5 is unsupported" in msg: - ec = errors.UNSUPPORTEDSL5OS + exit_code = errors.UNSUPPORTEDSL5OS elif "resource temporarily unavailable" in msg: - ec = errors.SINGULARITYRESOURCEUNAVAILABLE + exit_code = errors.SINGULARITYRESOURCEUNAVAILABLE elif "unrecognized arguments" in msg: - ec = errors.UNRECOGNIZEDTRFARGUMENTS + exit_code = errors.UNRECOGNIZEDTRFARGUMENTS elif fatal: - ec = errors.UNRECOGNIZEDTRFSTDERR + exit_code = errors.UNRECOGNIZEDTRFSTDERR else: - ec = 0 + exit_code = 0 - return ec + return exit_code def validate_post(queues, traces, args): diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 3e0390d3..dc36ec72 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -100,15 +100,15 @@ def wait_graceful(self, args, proc): :return: """ - t1 = time.time() + t_1 = time.time() while proc.is_alive(): if args.graceful_stop.is_set(): logger.debug("Graceful stop is set, stopping work executor") proc.stop() break - if time.time() > t1 + 300: # 5 minutes + if time.time() > t_1 + 300: # 5 minutes logger.info("Process is still running") - t1 = time.time() + t_1 = time.time() time.sleep(2) while proc.is_alive(): diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 94fba2af..747e6acb 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -349,7 +349,7 @@ def run_command(self, cmd, label=None): except Exception as error: logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute command') return None @@ -381,7 +381,7 @@ def run_payload(self, job, cmd, out, err): except Exception as error: logger.error('could not execute: %s', error) return None - if type(proc) == tuple and not proc[0]: + if isinstance(proc, tuple) and not proc[0]: logger.error('failed to execute payload') return None @@ -405,13 +405,17 @@ def extract_setup(self, cmd): :return: updated secondary command (string). 
""" - def cut_str_from(_cmd, s): - # cut the string from the position of the given _cmd - return _cmd[:_cmd.find(s)] + def cut_str_from(_cmd, _str): + """ + Cut the string from the position of the given _cmd + """ + return _cmd[:_cmd.find(_str)] def cut_str_from_last_semicolon(_cmd): - # cut the string from the last semicolon - # NOTE: this will not work if jobParams also contain ; + """ + Cut the string from the last semicolon + NOTE: this will not work if jobParams also contain ; + """ # remove any trailing spaces and ;-signs _cmd = _cmd.strip() _cmd = _cmd[:-1] if _cmd.endswith(';') else _cmd @@ -452,7 +456,7 @@ def wait_graceful(self, args, proc): time.sleep(0.1) iteration += 1 - for i in range(60): # Python 2/3 + for _ in range(60): # Python 2/3 if args.graceful_stop.is_set(): breaker = True logger.info('breaking -- sending SIGTERM pid=%s', proc.pid) @@ -519,9 +523,9 @@ def run_preprocess(self, job): try: # note: this might update the jobparams cmd_before_payload = self.utility_before_payload(job) - except Exception as e: - logger.error(e) - raise e + except Exception as error: + logger.error(error) + raise error if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload @@ -685,8 +689,8 @@ def run_utility_after_payload_finished(self, state, order): exit_code = 0 try: cmd_after_payload, label = self.utility_after_payload_finished(self.__job, order) - except Exception as e: - logger.error(e) + except Exception as error: + logger.error(error) else: if cmd_after_payload and self.__job.postprocess and state != 'failed': cmd_after_payload = self.__job.setup + cmd_after_payload From 5c09bb5e4b9381bd81dad4ebddb810494ad3b197 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 17 Jun 2021 15:03:40 +0200 Subject: [PATCH 75/96] Flake8 correction. UTF-8 fix for Popen --- pilot/util/container.py | 11 +++++------ pilot/util/workernode.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pilot/util/container.py b/pilot/util/container.py index 3ab76b66..0a6c20b0 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -80,12 +80,11 @@ def execute(executable, **kwargs): exe = ['/bin/bash', '-c', executable] # try: intercept exception such as OSError -> report e.g. 
error.RESOURCEUNAVAILABLE: "Resource temporarily unavailable" - process = subprocess.Popen(exe, - bufsize=-1, - stdout=stdout, - stderr=stderr, - cwd=cwd, - preexec_fn=setpgrp) #setsid) + if is_python3(): + process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp, encoding='utf-8') # Python 3 + else: + process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp) # Python 2 + if returnproc: return process else: diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index e15ade4f..e96d3ba9 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -221,7 +221,7 @@ def check_hz(): """ try: - hz = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + _ = os.sysconf(os.sysconf_names['SC_CLK_TCK']) except Exception: import traceback logger.fatal('failed to read SC_CLK_TCK - will not be able to perform CPU consumption calculation') From c70d879fb21cce147ed0233e8c58c400ddd833ee Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 10:56:05 +0200 Subject: [PATCH 76/96] Fix pylint issues --- pilot/user/atlas/common.py | 1413 ++++++++++++++++++++++-------------- 1 file changed, 857 insertions(+), 556 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1a1344de..d02faaf8 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -8,40 +8,74 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021 # - Wen Guan, wen.guan@cern.ch, 2018 -import os -import re -import fnmatch from collections import defaultdict +import fnmatch from glob import glob +import logging +import os +import re +from random import randint from signal import SIGTERM, SIGUSR1 +from typing import Type +# from tarfile import ExFileObject try: from functools import reduce # Python 3 -except Exception: +except ImportError: pass from .container import create_root_container_command from .dbrelease import get_dbrelease_version, create_dbrelease -from .setup import should_pilot_prepare_setup, is_standard_atlas_job, get_asetup,\ - set_inds, get_analysis_trf, get_payload_environment_variables, replace_lfns_with_turls -from .utilities import get_memory_monitor_setup, get_network_monitor_setup, post_memory_monitor_action,\ - get_memory_monitor_summary_filename, get_prefetcher_setup, get_benchmark_setup, get_memory_monitor_output_filename,\ - get_metadata_dict_from_txt - -from pilot.util.auxiliary import get_resource_name, show_memory_usage +from .setup import ( + should_pilot_prepare_setup, + is_standard_atlas_job, + get_asetup, + set_inds, + get_analysis_trf, + get_payload_environment_variables, + replace_lfns_with_turls, +) +from .utilities import ( + get_memory_monitor_setup, + get_network_monitor_setup, + post_memory_monitor_action, + get_memory_monitor_summary_filename, + get_prefetcher_setup, + get_benchmark_setup, + get_memory_monitor_output_filename, + get_metadata_dict_from_txt, +) + +from pilot.util.auxiliary import get_resource_name, show_memory_usage, is_python3 from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException -from pilot.util.auxiliary import is_python3 from pilot.util.config import config -from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ - UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_BEFORE_STAGEIN, UTILITY_AFTER_PAYLOAD_FINISHED2 +from pilot.util.constants import ( + UTILITY_BEFORE_PAYLOAD, + UTILITY_WITH_PAYLOAD, 
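The execute() change above opens the subprocess pipes in text mode on Python 3 by passing encoding='utf-8', so communicate() returns str rather than bytes and downstream string handling keeps working unchanged. A minimal illustration of the difference (Python 3.6+):

    import subprocess

    # without an encoding, Python 3 pipes return bytes
    raw = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE).communicate()[0]
    print(type(raw))   # <class 'bytes'>

    # with encoding='utf-8' the pipes are text streams and return str
    txt = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE,
                           encoding='utf-8').communicate()[0]
    print(type(txt))   # <class 'str'>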
+ UTILITY_AFTER_PAYLOAD_STARTED, + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_STARTED2, + UTILITY_BEFORE_STAGEIN, + UTILITY_AFTER_PAYLOAD_FINISHED2 +) from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ - copy_pilot_source, write_file, read_json, read_file, update_extension, get_local_file_size, calculate_checksum -from pilot.util.processes import convert_ps_to_dict, find_cmd_pids, get_trimmed_dictionary, find_pid, is_child +from pilot.util.filehandling import ( + copy, copy_pilot_source, calculate_checksum, + get_guid, get_local_file_size, + remove, remove_dir_tree, remove_core_dumps, read_file, read_json, + update_extension, + write_file, + # read_list +) +from pilot.util.processes import ( + convert_ps_to_dict, + find_pid, find_cmd_pids, + get_trimmed_dictionary, + is_child +) from pilot.util.tracereport import TraceReport -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -49,8 +83,9 @@ def sanity_check(): """ - Perform an initial sanity check before doing anything else in a given workflow. - This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort + Perform an initial sanity check before doing anything else in a + given workflow. This function can be used to verify importing of + modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. :return: exit code (0 if all is ok, otherwise non-zero exit code). @@ -61,7 +96,8 @@ def sanity_check(): #try: # from rucio.client.downloadclient import DownloadClient # from rucio.client.uploadclient import UploadClient - # # note: must do something with Download/UploadClients or flake8 will complain - but do not instantiate + # # note: must do something with Download/UploadClients or flake8 + # will complain - but do not instantiate #except Exception as e: # logger.warning('sanity check failed: %s' % e) # exit_code = errors.MIDDLEWAREIMPORTFAILURE @@ -81,7 +117,9 @@ def validate(job): status = True if 'DBRelease' in job.jobparams: - logger.debug('encountered DBRelease info in job parameters - will attempt to create a local DBRelease file') + logger.debug(( + 'encountered DBRelease info in job parameters - ' + 'will attempt to create a local DBRelease file')) version = get_dbrelease_version(job.jobparams) if version: status = create_dbrelease(version, job.workdir) @@ -94,20 +132,22 @@ def validate(job): if status: if job.imagename and job.imagename.startswith('/'): if os.path.exists(job.imagename): - logger.info('verified that image exists: %s' % job.imagename) + logger.info('verified that image exists: %s', job.imagename) else: status = False - logger.warning('image does not exist: %s' % job.imagename) + logger.warning('image does not exist: %s', job.imagename) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.IMAGENOTFOUND) # cleanup job parameters if only copy-to-scratch #if job.only_copy_to_scratch(): # logger.debug('job.params=%s' % job.jobparams) # if ' --usePFCTurl' in job.jobparams: - # logger.debug('cleaning up --usePFCTurl from job parameters since all input is copy-to-scratch') + # logger.debug('cleaning up --usePFCTurl from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --usePFCTurl', '') # if ' --directIn' in job.jobparams: - # logger.debug('cleaning up --directIn from job parameters since all input is 
copy-to-scratch') + # logger.debug('cleaning up --directIn from job parameters + # since all input is copy-to-scratch') # job.jobparams = job.jobparams.replace(' --directIn', '') return status @@ -122,7 +162,7 @@ def open_remote_files(indata, workdir): :return: exit code (int), diagnostics (string). """ - ec = 0 + exitcode = 0 diagnostics = "" not_opened = "" @@ -140,22 +180,25 @@ def open_remote_files(indata, workdir): final_script_path = os.path.join(workdir, script) os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) - d1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) - d2 = os.path.join(workdir, script_path) - full_script_path = d1 if os.path.exists(d1) else d2 + dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) + dir2 = os.path.join(workdir, script_path) + full_script_path = dir1 if os.path.exists(dir1) else dir2 if not os.path.exists(full_script_path): # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - script path does not exist: %s' % full_script_path + diagnostics = ( + 'cannot perform file open test - script path does ' + 'not exist: %s' % full_script_path + ) logger.warning(diagnostics) - logger.warning('tested both path=%s and path=%s (none exists)' % (d1, d2)) - return ec, diagnostics, not_opened + logger.warning('tested both path=%s and path=%s (none exists)', dir1, dir2) + return exitcode, diagnostics, not_opened try: copy(full_script_path, final_script_path) - except Exception as e: + except PilotException as exc: # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % e + diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % exc logger.warning(diagnostics) - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened else: # correct the path when containers have been used final_script_path = os.path.join('.', script) @@ -165,38 +208,45 @@ def open_remote_files(indata, workdir): show_memory_usage() - logger.info('*** executing file open verification script:\n\n\'%s\'\n\n' % cmd) + logger.info('*** executing file open verification script:\n\n\'%s\'\n\n', cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) if config.Pilot.remotefileverification_log: - write_file(os.path.join(workdir, config.Pilot.remotefileverification_log), stdout + stderr, mute=False) + fpath = os.path.join(workdir, config.Pilot.remotefileverification_log) + write_file(fpath, stdout + stderr, mute=False) show_memory_usage() # error handling if exit_code: - logger.warning('script %s finished with ec=%d' % (script, exit_code)) + logger.warning('script %s finished with ec=%d', script, exit_code) else: - dictionary_path = os.path.join(workdir, config.Pilot.remotefileverification_dictionary) + dictionary_path = os.path.join( + workdir, + config.Pilot.remotefileverification_dictionary + ) if not dictionary_path: - logger.warning('file does not exist: %s' % dictionary_path) + logger.warning('file does not exist: %s', dictionary_path) else: file_dictionary = read_json(dictionary_path) if not file_dictionary: - logger.warning('could not read dictionary from %s' % dictionary_path) + logger.warning('could not read dictionary from %s', dictionary_path) else: not_opened = "" for turl in file_dictionary: opened = file_dictionary[turl] - logger.info('turl could be 
opened: %s' % turl) if opened else logger.info('turl could not be opened: %s' % turl) if not opened: + logger.info('turl could not be opened: %s', turl) not_opened += turl if not not_opened else ",%s" % turl + else: + logger.info('turl could be opened: %s', turl) + if not_opened: - ec = errors.REMOTEFILECOULDNOTBEOPENED + exitcode = errors.REMOTEFILECOULDNOTBEOPENED diagnostics = "Remote file could not be opened: %s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') - return ec, diagnostics, not_opened + return exitcode, diagnostics, not_opened def get_file_open_command(script_path, turls): @@ -217,19 +267,22 @@ def extract_turls(indata): :return: comma-separated list of turls (string). """ - turls = "" - for f in indata: - if f.status == 'remote_io': - turls += f.turl if not turls else ",%s" % f.turl + # turls = "" + # for filespc in indata: + # if filespc.status == 'remote_io': + # turls += filespc.turl if not turls else ",%s" % filespc.turl + # return turls - return turls + return ",".join( + fspec.turl for fspec in indata if fspec.status == 'remote_io' + ) def process_remote_file_traces(path, job, not_opened_turls): """ Report traces for remote files. - The function reads back the base trace report (common part of all traces) and updates it per file before reporting - it to the Rucio server. + The function reads back the base trace report (common part of all traces) + and updates it per file before reporting it to the Rucio server. :param path: path to base trace report (string). :param job: job object. @@ -239,8 +292,8 @@ def process_remote_file_traces(path, job, not_opened_turls): try: base_trace_report = read_json(path) - except PilotException as e: - logger.warning('failed to open base trace report (cannot send trace reports): %s' % e) + except PilotException as exc: + logger.warning('failed to open base trace report (cannot send trace reports): %s', exc) else: if not base_trace_report: logger.warning('failed to read back base trace report (cannot send trace reports)') @@ -262,13 +315,13 @@ def process_remote_file_traces(path, job, not_opened_turls): if trace_report: trace_report.send() else: - logger.warning('failed to create trace report for turl=%s' % fspec.turl) + logger.warning('failed to create trace report for turl=%s', fspec.turl) def get_payload_command(job): """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload, including the + sourcing of all setup files and setting of environment variables. :param job: job object. :raises PilotException: TrfDownloadFailure. @@ -285,52 +338,55 @@ def get_payload_command(job): # Is it a user job or not? 
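The rewritten extract_turls() above replaces incremental string concatenation with a single join over a generator expression. A small standalone check that the two forms agree, using a namedtuple as a stand-in for FileSpec (an assumption made only for this sketch):

    from collections import namedtuple

    FSpec = namedtuple('FSpec', ['turl', 'status'])
    indata = [FSpec('root://a//f1.root', 'remote_io'),
              FSpec('root://a//f2.root', 'transferred'),
              FSpec('root://a//f3.root', 'remote_io')]

    # old form: incremental concatenation
    turls = ""
    for fspec in indata:
        if fspec.status == 'remote_io':
            turls += fspec.turl if not turls else ",%s" % fspec.turl

    # new form: one join over a generator expression
    joined = ",".join(fspec.turl for fspec in indata if fspec.status == 'remote_io')
    assert turls == joined  # 'root://a//f1.root,root://a//f3.root'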
userjob = job.is_analysis() - logger.info('pilot is running a user analysis job') if userjob else logger.info('pilot is running a production job') + logger.info('pilot is running a %s job', 'user analysis' if userjob else 'production') resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + + # Python 3, level -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) # get the general setup command and then verify it if required cmd = resource.get_setup_command(job, preparesetup) if cmd: - ec, diagnostics = resource.verify_setup_command(cmd) - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) - raise PilotException(diagnostics, code=ec) + exitcode, diagnostics = resource.verify_setup_command(cmd) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) + raise PilotException(diagnostics, code=exitcode) # make sure that remote file can be opened before executing payload catchall = job.infosys.queuedata.catchall.lower() if job.infosys.queuedata.catchall else '' if config.Pilot.remotefileverification_log and 'remoteio_test=false' not in catchall: - ec = 0 + exitcode = 0 diagnostics = "" not_opened_turls = "" try: - ec, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) - except Exception as e: - logger.warning('caught exception: %s' % e) + exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) + except PilotException as exc: + logger.warning('caught exception: %s', exc) else: # read back the base trace report path = os.path.join(job.workdir, config.Pilot.base_trace_report) if not os.path.exists(path): - logger.warning('base trace report does not exist (%s) - input file traces should already have been sent' % path) + logger.warning(( + 'base trace report does not exist (%s) - input file ' + 'traces should already have been sent', path)) else: process_remote_file_traces(path, job, not_opened_turls) # fail the job if the remote files could not be verified - if ec != 0: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec, msg=diagnostics) - raise PilotException(diagnostics, code=ec) + if exitcode != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode, msg=diagnostics) + raise PilotException(diagnostics, code=exitcode) else: logger.debug('no remote file open verification') if is_standard_atlas_job(job.swrelease): - # Normal setup (production and user jobs) logger.info("preparing normal production/analysis job setup command") cmd = get_normal_payload_command(cmd, job, preparesetup, userjob) - - else: # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease - + else: + # Generic, non-ATLAS specific jobs, or at least a job with undefined swRelease logger.info("generic job (non-ATLAS specific or with undefined swRelease)") cmd = get_generic_payload_command(cmd, job, preparesetup, userjob) @@ -341,7 +397,8 @@ def get_payload_command(job): # only if not using a user container if not job.imagename: site = os.environ.get('PILOT_SITENAME', '') - variables = get_payload_environment_variables(cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, userjob) + variables = get_payload_environment_variables( + cmd, job.jobid, job.taskid, job.attemptnr, job.processingtype, site, 
userjob) cmd = ''.join(variables) + cmd # prepend PanDA job id in case it is not there already (e.g. runcontainer jobs) @@ -350,24 +407,36 @@ def get_payload_command(job): cmd = cmd.replace(';;', ';') - # For direct access in prod jobs, we need to substitute the input file names with the corresponding TURLs + # For direct access in prod jobs, we need to substitute the input file names + # with the corresponding TURLs # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) #if not userjob and use_direct_access and job.transfertype == 'direct': - if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic + + ## ported from old logic + if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic but still it looks strange (anisyonk) - ## the "PoolFileCatalog.xml" should already contains proper TURLs values as it created by create_input_file_metadata() - ## if the case is just to patch `writetofile` file, than logic should be cleaned and decoupled - ## anyway, instead of parsing the file, it's much more easy to generate properly `writetofile` content from the beginning with TURL data + ## the "PoolFileCatalog.xml" should already contains proper TURLs + ## values as it created by create_input_file_metadata() if the case + ## is just to patch `writetofile` file, than logic should be cleaned + ## and decoupled anyway, instead of parsing the file, it's much easier + ## to generate properly `writetofile` content from the beginning + ## with TURL data lfns = job.get_lfns_and_guids()[0] - cmd = replace_lfns_with_turls(cmd, job.workdir, "PoolFileCatalog.xml", lfns, writetofile=job.writetofile) + cmd = replace_lfns_with_turls( + cmd, + job.workdir, + "PoolFileCatalog.xml", + lfns, + writetofile=job.writetofile + ) # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) show_memory_usage() - logger.info('payload run command: %s' % cmd) + logger.info('payload run command: %s', cmd) return cmd @@ -379,27 +448,30 @@ def get_normal_payload_command(cmd, job, preparesetup, userjob): :param cmd: any preliminary command setup (string). :param job: job object. :param userjob: True for user analysis jobs, False otherwise (bool). - :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters. + :param preparesetup: True if the pilot should prepare the setup, + False if already in the job parameters. :return: normal payload command (string). 
""" - # set the INDS env variable (used by runAthena but also for EventIndex production jobs) + # set the INDS env variable + # (used by runAthena but also for EventIndex production jobs) set_inds(job.datasetin) # realDatasetsIn if userjob: # Try to download the trf (skip when user container is to be used) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) + # Correct for multi-core if necessary (especially important in + # case coreCount=1 to limit parallel make) cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: # Add Database commands if they are set by the local site @@ -437,19 +509,19 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): #if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") # logger.warning('overwrote job.transformation, now set to: %s' % job.transformation) - ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) - if ec != 0: + exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) + if exitcode != 0: raise TrfDownloadFailure(diagnostics) - else: - logger.debug('user analysis trf: %s' % trf_name) + + logger.debug('user analysis trf: %s', trf_name) if preparesetup: _cmd = get_analysis_run_command(job, trf_name) else: _cmd = job.jobparams - # correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make) - # only if not using a user container + # correct for multi-core if necessary (especially important in case + # coreCount=1 to limit parallel make), only if not using a user container if not job.imagename: cmd += "; " + add_makeflags(job.corecount, "") + _cmd else: @@ -471,7 +543,8 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): def add_athena_proc_number(cmd): """ - Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to the payload command if necessary. + Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to + the payload command if necessary. :param cmd: payload execution command (string). :return: updated payload execution command (string). 
@@ -480,13 +553,13 @@ def add_athena_proc_number(cmd): # get the values if they exist try: value1 = int(os.environ['ATHENA_PROC_NUMBER_JOB']) - except Exception as e: - logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s', exc) value1 = None try: value2 = int(os.environ['ATHENA_CORE_NUMBER']) - except Exception as e: - logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s' % e) + except (TypeError, ValueError) as exc: + logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s', exc) value2 = None if "ATHENA_PROC_NUMBER" not in cmd: @@ -496,9 +569,13 @@ def add_athena_proc_number(cmd): if value1 > 1: cmd = 'export ATHENA_PROC_NUMBER=%d;' % value1 + cmd else: - logger.info("will not add ATHENA_PROC_NUMBER to cmd since the value is %s" % str(value1)) + logger.info(( + "will not add ATHENA_PROC_NUMBER to cmd " + "since the value is %s", str(value1))) else: - logger.warning("don't know how to set ATHENA_PROC_NUMBER (could not find it in os.environ)") + logger.warning(( + "don't know how to set ATHENA_PROC_NUMBER " + "(could not find it in os.environ)")) else: logger.info("ATHENA_PROC_NUMBER already in job command") @@ -506,9 +583,13 @@ def add_athena_proc_number(cmd): if value2 > 1: cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd else: - logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s" % str(value2)) + logger.info(( + "will not add ATHENA_CORE_NUMBER to cmd since the " + "value is %s", str(value2))) else: - logger.warning('there is no ATHENA_CORE_NUMBER in os.environ (cannot add it to payload command)') + logger.warning(( + 'there is no ATHENA_CORE_NUMBER in os.environ ' + '(cannot add it to payload command)')) return cmd @@ -534,7 +615,8 @@ def verify_release_string(release): def add_makeflags(job_core_count, cmd): """ - Correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make). + Correct for multi-core if necessary (especially important in + case coreCount=1 to limit parallel make). :param job_core_count: core count from the job definition (int). :param cmd: payload execution command (string). @@ -544,16 +626,18 @@ def add_makeflags(job_core_count, cmd): # ATHENA_PROC_NUMBER is set in Node.py using the schedconfig value try: core_count = int(os.environ.get('ATHENA_PROC_NUMBER')) - except Exception: + except (TypeError, ValueError): core_count = -1 + if core_count == -1: try: core_count = int(job_core_count) - except Exception: + except (TypeError, ValueError): pass else: if core_count >= 1: - # Note: the original request (AF) was to use j%d and not -j%d, now using the latter + # Note: the original request (AF) was to use j%d + # and not -j%d, now using the latter cmd += "export MAKEFLAGS=\'-j%d QUICK=1 -l1\';" % (core_count) # make sure that MAKEFLAGS is always set @@ -567,10 +651,12 @@ def get_analysis_run_command(job, trf_name): """ Return the proper run command for the user job. - Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn + Example output: + export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. + :param trf_name: name of the transform that will run the job (string). + Used when containers are not used. :return: command (string). 
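add_athena_proc_number() and add_makeflags() above now catch only (TypeError, ValueError) around the int() conversions instead of a bare Exception. A minimal sketch of the same pattern as a small helper (the helper name is invented for illustration):

    import os

    def env_int(name, default=-1):
        """Return the environment variable as an int, or default if unset or non-numeric."""
        try:
            return int(os.environ.get(name))
        except (TypeError, ValueError):   # None -> TypeError, 'abc' -> ValueError
            return default

    core_count = env_int('ATHENA_PROC_NUMBER')
    if core_count >= 1:
        print("export MAKEFLAGS='-j%d QUICK=1 -l1';" % core_count)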
""" @@ -579,7 +665,8 @@ def get_analysis_run_command(job, trf_name): # get relevant file transfer info #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) # check if the input files are to be accessed locally (ie if prodDBlockToken is set to local) - #if job.is_local(): ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) + #if job.is_local(): # logger.debug('switched off direct access for local prodDBlockToken') # use_direct_access = False # use_pfc_turl = False @@ -601,12 +688,12 @@ def get_analysis_run_command(job, trf_name): # check if image is on disk as defined by envar PAYLOAD_CONTAINER_LOCATION payload_container_location = os.environ.get('PAYLOAD_CONTAINER_LOCATION') if payload_container_location is not None: - logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s" % payload_container_location) + logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s", payload_container_location) # get container name containername = imagename.rsplit('/')[-1] image_location = os.path.join(payload_container_location, containername) if os.path.exists(image_location): - logger.debug("image exists at %s" % image_location) + logger.debug("image exists at %s", image_location) imagename = image_location # restore the image name if necessary @@ -621,15 +708,19 @@ def get_analysis_run_command(job, trf_name): # cmd += ' --directIn' if job.has_remoteio(): - logger.debug('direct access (remoteio) is used to access some input files: --usePFCTurl and --directIn will be added to payload command') + logger.debug(( + 'direct access (remoteio) is used to access some input files: ' + '--usePFCTurl and --directIn will be added to payload command')) if '--usePFCTurl' not in cmd: cmd += ' --usePFCTurl' if '--directIn' not in cmd: cmd += ' --directIn' # update the payload command for forced accessmode - ## -- REDUNDANT logic, since it should be done from the beginning at the step of FileSpec initialization (anisyonk) - #cmd = update_forced_accessmode(log, cmd, job.transfertype, job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) + ## -- REDUNDANT logic, since it should be done from the beginning at + ## the step of FileSpec initialization (anisyonk) + #cmd = update_forced_accessmode(log, cmd, job.transfertype, + # job.jobparams, trf_name) ## DEPRECATE ME (anisyonk) # add guids when needed # get the correct guids list (with only the direct access files) @@ -644,16 +735,19 @@ def get_analysis_run_command(job, trf_name): return cmd -## SHOULD NOT BE USED since payload cmd should be properly generated from the beginning (consider final directio settings) (anisyonk) -def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## DEPRECATE ME (anisyonk) +## SHOULD NOT BE USED since payload cmd should be properly generated +## from the beginning (consider final directio settings) (anisyonk) +## DEPRECATE ME (anisyonk) +def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): """ Update the payload command for forced accessmode. - accessmode is an option that comes from HammerCloud and is used to force a certain input file access mode; i.e. - copy-to-scratch or direct access. + accessmode is an option that comes from HammerCloud and is used to + force a certain input file access mode; i.e. copy-to-scratch or direct access. :param log: logging object. :param cmd: payload command. 
- :param transfertype: transfer type (.e.g 'direct') from the job definition with priority over accessmode (string). + :param transfertype: transfer type (.e.g 'direct') from the job + definition with priority over accessmode (string). :param jobparams: job parameters (string). :param trf_name: transformation name (string). :return: updated payload command string. @@ -669,7 +763,7 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D for _mode in list(_accessmode_dic.keys()): # Python 2/3 if _mode in jobparams: # any accessmode set in jobPars should overrule schedconfig - logger.info("enforcing %s" % _accessmode_dic[_mode][0]) + logger.info("enforcing %s", _accessmode_dic[_mode][0]) if _mode == "--accessmode=copy": # make sure direct access is turned off accessmode_usect = True @@ -709,7 +803,8 @@ def update_forced_accessmode(log, cmd, transfertype, jobparams, trf_name): ## D cmd = cmd.replace("./%s" % trf_name, "export X509_USER_PROXY=%s;./%s" % (os.environ.get('X509_USER_PROXY'), trf_name)) - # if both direct access and the accessmode loop added a directIn switch, remove the first one from the string + # if both direct access and the accessmode loop added a + # directIn switch, remove the first one from the string if cmd.count("directIn") > 1: cmd = cmd.replace(' --directIn', ' ', 1) @@ -721,8 +816,10 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): Extract the correct guid from the input file list. The guids list is used for direct reading. 1. extract input file list for direct reading from job parameters - 2. for each input file in this list, find the corresponding guid from the input file guid list - Since the job parameters string is entered by a human, the order of the input files might not be the same. + 2. for each input file in this list, find the corresponding guid from + the input file guid list. + Since the job parameters string is entered by a human, the order of + the input files might not be the same. :param jobparams: job parameters. :param infiles: input file list. @@ -750,22 +847,23 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): tail = match.group(3) body = match.group(2).split(',') attr = match.group(4).split(',') - for idx in range(len(body)): - lfn = '%s%s%s%s' % (head, body[idx], tail, attr[idx]) + + for idx, item in enumerate(body): + lfn = '%s%s%s%s' % (head, item, tail, attr[idx]) infiles.append(lfn) else: infiles = [compactinfiles] - if _infiles != []: - for infile in _infiles: - # get the corresponding index from the inputFiles list, which has the same order as infilesguids - try: - index = infiles.index(infile) - except Exception as e: - logger.warning("exception caught: %s (direct reading will fail)" % e) - else: - # add the corresponding guid to the list - guidlist.append(infilesguids[index]) + for infile in _infiles: + # get the corresponding index from the inputFiles list, + # which has the same order as infilesguids + try: + index = infiles.index(infile) + except ValueError as exc: + logger.warning("exception caught: %s (direct reading will fail)", exc) + else: + # add the corresponding guid to the list + guidlist.append(infilesguids[index]) return guidlist @@ -775,7 +873,8 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) Return information about desired file transfer. :param job: job object - :return: use copy tool (boolean), use direct access (boolean), use PFC Turl (boolean). 
+ :return: use copy tool (boolean), use direct access (boolean), + use PFC Turl (boolean). """ use_copy_tool = True @@ -783,10 +882,14 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) use_pfc_turl = False # check with schedconfig - if (job.infosys.queuedata.direct_access_lan or job.infosys.queuedata.direct_access_wan or job.transfertype == 'direct') and not job.is_build_job(): + is_lan = job.infosys.queuedata.direct_access_lan + is_wan = job.infosys.queuedata.direct_access_wan + if not job.is_build_job() and (is_lan or is_wan or job.transfertype == 'direct'): # override if all input files are copy-to-scratch if job.only_copy_to_scratch(): - logger.info('all input files are copy-to-scratch (--usePFCTurl and --directIn will not be set)') + logger.info(( + 'all input files are copy-to-scratch ' + '(--usePFCTurl and --directIn will not be set)')) else: logger.debug('--usePFCTurl and --directIn will be set') use_copy_tool = False @@ -799,17 +902,19 @@ def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) def update_job_data(job): """ This function can be used to update/add data to the job object. - E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information - is extracted from the metadata field and added to other job object fields. + E.g. user specific information can be extracted from other job object fields. + In the case of ATLAS, information is extracted from the metadata field and + added to other job object fields. :param job: job object :return: """ ## comment from Alexey: - ## it would be better to reallocate this logic (as well as parse metadata values)directly to Job object - ## since in general it's Job related part - ## later on once we introduce VO specific Job class (inherited from JobData) this can be easily customized + ## it would be better to reallocate this logic (as well as parse + ## metadata values)directly to Job object since in general it's Job + ## related part. 
Later on once we introduce VO specific Job class + ## (inherited from JobData) this can be easily customized # get label "all" or "log" stageout = get_stageout_label(job) @@ -817,7 +922,7 @@ def update_job_data(job): if 'exeErrorDiag' in job.metadata: job.exeerrordiag = job.metadata['exeErrorDiag'] if job.exeerrordiag: - logger.warning('payload failed: exeErrorDiag=%s' % job.exeerrordiag) + logger.warning('payload failed: exeErrorDiag=%s', job.exeerrordiag) # determine what should be staged out job.stageout = stageout # output and log file or only log file @@ -825,37 +930,47 @@ def update_job_data(job): work_attributes = None try: work_attributes = parse_jobreport_data(job.metadata) - except Exception as e: - logger.warning('failed to parse job report (cannot set job.nevents): %s' % e) + except Exception as exc: + logger.warning('failed to parse job report (cannot set job.nevents): %s', exc) else: - # note: the number of events can be set already at this point if the value was extracted from the job report - # (a more thorough search for this value is done later unless it was set here) + # note: the number of events can be set already at this point + # if the value was extracted from the job report (a more thorough + # search for this value is done later unless it was set here) nevents = work_attributes.get('nEvents', 0) if nevents: job.nevents = nevents - # extract output files from the job report if required, in case the trf has created additional (overflow) files - # also make sure all guids are assigned (use job report value if present, otherwise generate the guid) + # extract output files from the job report if required, in case the trf + # has created additional (overflow) files. Also make sure all guids are + # assigned (use job report value if present, otherwise generate the guid) if job.metadata and not job.is_eventservice: - extract_output_file_guids(job) # keep this for now, complicated to merge with verify_output_files? + # keep this for now, complicated to merge with verify_output_files? + extract_output_file_guids(job) try: verify_output_files(job) - except Exception as e: - logger.warning('exception caught while trying verify output files: %s' % e) + except Exception as exc: + logger.warning('exception caught while trying verify output files: %s', exc) else: if not job.allownooutput: # i.e. 
if it's an empty list/string, do nothing - logger.debug("will not try to extract output files from jobReport for user job (and allowNoOut list is empty)") + logger.debug(( + "will not try to extract output files from jobReport " + "for user job (and allowNoOut list is empty)")) else: # remove the files listed in allowNoOutput if they don't exist remove_no_output_files(job) ## validate output data (to be moved into the JobData) - ## warning: do no execute this code unless guid lookup in job report has failed - pilot should only generate guids + ## warning: do no execute this code unless guid lookup in job report + # has failed - pilot should only generate guids ## if they are not present in job report for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn)) + logger.warning( + 'guid not set: generated guid=%s for lfn=%s', + dat.guid, + dat.lfn + ) def get_stageout_label(job): @@ -878,7 +993,7 @@ def get_stageout_label(job): if job.exeerrorcode == 0: stageout = "all" else: - logger.info('payload failed: exeErrorCode=%d' % job.exeerrorcode) + logger.info('payload failed: exeErrorCode=%d', job.exeerrorcode) stageout = "log" return stageout @@ -894,11 +1009,13 @@ def update_output_for_hpo(job): try: new_outdata = discover_new_outdata(job) - except Exception as e: - logger.warning('exception caught while discovering new outdata: %s' % e) + except Exception as exc: + logger.warning('exception caught while discovering new outdata: %s', exc) else: if new_outdata: - logger.info('replacing job outdata with discovered output (%d file(s))' % len(new_outdata)) + logger.info(( + 'replacing job outdata with discovered output ' + '(%d file(s))', len(new_outdata))) job.outdata = new_outdata @@ -918,12 +1035,22 @@ def discover_new_outdata(job): if new_output: # create new FileSpec objects out of the new output for outfile in new_output: - # note: guid will be taken from job report after this function has been called - files = [{'scope': outdata_file.scope, 'lfn': outfile, 'workdir': job.workdir, - 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, - 'ddmendpoint_alt': None, 'filesize': new_output[outfile]['filesize'], - 'checksum': new_output[outfile]['checksum'], 'guid': ''}] - # do not abbreviate the following two lines as otherwise the content of xfiles will be a list of generator objects + # note: guid will be taken from job report + # after this function has been called + files = [{ + 'scope': outdata_file.scope, + 'lfn': outfile, + 'workdir': job.workdir, + 'dataset': outdata_file.dataset, + 'ddmendpoint': outdata_file.ddmendpoint, + 'ddmendpoint_alt': None, + 'filesize': new_output[outfile]['filesize'], + 'checksum': new_output[outfile]['checksum'], + 'guid': '' + }] + + # do not abbreviate the following two lines as otherwise + # the content of xfiles will be a list of generator objects _xfiles = [FileSpec(type='output', **f) for f in files] new_outdata += _xfiles @@ -958,29 +1085,43 @@ def discover_new_output(name_pattern, workdir): if filesize and checksum: new_output[lfn] = {'path': path, 'filesize': filesize, 'checksum': checksum} else: - logger.warning('failed to create file info (filesize=%d, checksum=%s) for lfn=%s' % - (filesize, checksum, lfn)) + logger.warning( + 'failed to create file info (filesize=%d, checksum=%s) for lfn=%s', + filesize, + checksum, + lfn + ) + return new_output def extract_output_file_guids(job): """ - Extract output file info from the job report and make 
sure all guids are assigned (use job report value if present, - otherwise generate the guid - note: guid generation is done later, not in this function since this function - might not be called if metadata info is not found prior to the call). + Extract output file info from the job report and make sure all guids\ + are assigned (use job report value if present, otherwise generate the guid.\ + Note: guid generation is done later, not in this function since + this function might not be called if metadata info is not found prior + to the call). :param job: job object. :return: """ - # make sure there is a defined output file list in the job report - unless it is allowed by task parameter allowNoOutput + # make sure there is a defined output file list in the job report - + # unless it is allowed by task parameter allowNoOutput if not job.allownooutput: output = job.metadata.get('files', {}).get('output', []) if output: - logger.info('verified that job report contains metadata for %d file(s)' % len(output)) + logger.info(( + 'verified that job report contains metadata ' + 'for %d file(s)', len(output))) else: - logger.warning('job report contains no output files and allowNoOutput is not set') #- will fail job since allowNoOutput is not set') - #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOOUTPUTINJOBREPORT) + #- will fail job since allowNoOutput is not set') + logger.warning(( + 'job report contains no output ' + 'files and allowNoOutput is not set')) + #job.piloterrorcodes, job.piloterrordiags = + # errors.add_error_code(errors.NOOUTPUTINJOBREPORT) return # extract info from metadata (job report JSON) @@ -991,20 +1132,27 @@ def extract_output_file_guids(job): lfn = fdat['name'] # verify the guid if the lfn is known - # only extra guid if the file is known by the job definition (March 18 change, v 2.5.2) + # only extra guid if the file is known by the + # job definition (March 18 change, v 2.5.2) if lfn in data: data[lfn].guid = fdat['file_guid'] - logger.info('set guid=%s for lfn=%s (value taken from job report)' % (data[lfn].guid, lfn)) + logger.info(( + 'set guid=%s for lfn=%s ' + '(value taken from job report)', data[lfn].guid, lfn)) else: # found new entry - logger.warning('pilot no longer considers output files not mentioned in job definition (lfn=%s)' % lfn) + logger.warning(( + 'pilot no longer considers output files not mentioned ' + 'in job definition (lfn=%s)', lfn)) continue #if job.outdata: # kw = {'lfn': lfn, - # 'scope': job.outdata[0].scope, ## take value from 1st output file? + # . # take value from 1st output file? + # 'scope': job.outdata[0].scope, # 'guid': fdat['file_guid'], # 'filesize': fdat['file_size'], - # 'dataset': dat.get('dataset') or job.outdata[0].dataset ## take value from 1st output file? + # # take value from 1st output file? 
+ # 'dataset': dat.get('dataset') or job.outdata[0].dataset # } # spec = FileSpec(filetype='output', **kw) # extra.append(spec) @@ -1013,25 +1161,28 @@ def extract_output_file_guids(job): for fspec in job.outdata: if fspec.guid != data[fspec.lfn].guid: fspec.guid = data[fspec.lfn].guid - logger.debug('reset guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('reset guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: if fspec.guid: - logger.debug('verified guid=%s for lfn=%s' % (fspec.guid, fspec.lfn)) + logger.debug('verified guid=%s for lfn=%s', fspec.guid, fspec.lfn) else: - logger.warning('guid not set for lfn=%s' % fspec.lfn) + logger.warning('guid not set for lfn=%s', fspec.lfn) #if extra: - #logger.info('found extra output files in job report, will overwrite output file list: extra=%s' % extra) + #logger.info('found extra output files in job report, + # will overwrite output file list: extra=%s' % extra) #job.outdata = extra def verify_output_files(job): """ - Make sure that the known output files from the job definition are listed in the job report and number of processed events - is greater than zero. If the output file is not listed in the job report, then if the file is listed in allowNoOutput - remove it from stage-out, otherwise fail the job. + Make sure that the known output files from the job definition are listed + in the job report and number of processed events is greater than zero. + If the output file is not listed in the job report, then if the file is + listed in allowNoOutput remove it from stage-out, otherwise fail the job. - Note from Rod: fail scenario: The output file is not in output:[] or is there with zero events. Then if allownooutput is not - set - fail the job. If it is set, then do not store the output, and finish ok. + Note from Rod: fail scenario: The output file is not in output:[] or is + there with zero events. Then if allownooutput is not set - fail the job. + If it is set, then do not store the output, and finish ok. :param job: job object. :return: Boolean (and potentially updated job.outdata list) @@ -1048,38 +1199,50 @@ def verify_output_files(job): return True # get list of output files from job report - # (if None is returned, it means the job report is from an old release and does not contain an output list) + # (if None is returned, it means the job report is from an old release + # and does not contain an output list) output = job.metadata.get('files', {}).get('output', None) if not output and output is not None: # ie empty list, output=[] - are all known output files in allowNoOutput? 
- logger.warning('encountered an empty output file list in job report, consulting allowNoOutput list') + logger.warning(( + 'encountered an empty output file list in job report, ' + 'consulting allowNoOutput list')) failed = False for lfn in lfns_jobdef: if lfn not in job.allownooutput: if job.is_analysis(): - logger.warning('lfn %s is not in allowNoOutput list - ignore for user job' % lfn) + logger.warning(( + 'lfn %s is not in allowNoOutput list - ' + 'ignore for user job', + lfn + )) else: failed = True - logger.warning('lfn %s is not in allowNoOutput list - job will fail' % lfn) + logger.warning( + 'lfn %s is not in allowNoOutput list - job will fail', + lfn + ) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) break else: - logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out' % lfn) + logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out', lfn) remove_from_stageout(lfn, job) elif output is None: # ie job report is ancient / output could not be extracted - logger.warning('output file list could not be extracted from job report (nothing to verify)') + logger.warning(( + 'output file list could not be extracted from job report ' + '(nothing to verify)')) else: verified, nevents = verify_extracted_output_files(output, lfns_jobdef, job) - failed = True if not verified else False + failed = (not verified) if nevents > 0 and not failed and job.nevents == 0: job.nevents = nevents - logger.info('number of events from summed up output files: %d' % nevents) + logger.info('number of events from summed up output files: %d', nevents) else: - logger.info('number of events previously set to %d' % job.nevents) + logger.info('number of events previously set to %d', job.nevents) - status = True if not failed else False + status = (not failed) if status: logger.info('output file verification succeeded') @@ -1103,7 +1266,9 @@ def verify_extracted_output_files(output, lfns_jobdef, job): failed = False nevents = 0 output_jobrep = {} # {lfn: nentries, ..} - logger.debug('extracted output file list from job report - make sure all known output files are listed') + logger.debug(( + 'extracted output file list from job report - ' + 'make sure all known output files are listed')) # first collect the output files from the job report for dat in output: @@ -1118,45 +1283,68 @@ def verify_extracted_output_files(output, lfns_jobdef, job): for lfn in lfns_jobdef: if lfn not in output_jobrep and lfn not in job.allownooutput: if job.is_analysis(): - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput', lfn)) else: - logger.warning( - 'output file %s from job definition is not present in job report and is not listed in allowNoOutput - job will fail' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report and is not listed in allowNoOutput - ' + 'job will fail', lfn)) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break + if lfn not in output_jobrep and lfn in job.allownooutput: - logger.warning( - 'output file %s from job definition is not present in job report but is listed in allowNoOutput - remove from stage-out' % lfn) + logger.warning(( + 'output file %s from job definition is not present ' + 'in job report but is listed in 
allowNoOutput - ' + 'remove from stage-out', lfn)) remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": - logger.warning('encountered file with nentries=UNDEFINED - will ignore %s' % lfn) - continue - elif nentries is None and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, but has no events and is not listed in allowNoOutput - will ignore' % lfn) - continue - elif nentries is None and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, nentries is None and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) - elif type(nentries) is int and nentries == 0 and lfn not in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is not listed in allowNoOutput - will ignore' % lfn) - elif type(nentries) is int and nentries == 0 and lfn in job.allownooutput: - logger.warning( - 'output file %s is listed in job report, has zero events and is listed in allowNoOutput - remove from stage-out' % lfn) - remove_from_stageout(lfn, job) + logger.warning(( + 'encountered file with nentries=UNDEFINED - ' + 'will ignore %s', lfn)) + + elif nentries is None: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'but has no events and is not listed in ' + 'allowNoOutput - will ignore', lfn)) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'nentries is None and is listed in allowNoOutput - ' + 'remove from stage-out', lfn)) + remove_from_stageout(lfn, job) + + elif nentries == 0: + + if lfn not in job.allownooutput: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is not listed in ' + 'allowNoOutput - will ignore', lfn)) + else: + logger.warning(( + 'output file %s is listed in job report, ' + 'has zero events and is listed in allowNoOutput - ' + 'remove from stage-out', lfn)) + remove_from_stageout(lfn, job) + elif type(nentries) is int and nentries: - logger.info('output file %s has %d event(s)' % (lfn, nentries)) + logger.info('output file %s has %d event(s)', lfn, nentries) nevents += nentries else: # should not reach this step - logger.warning('case not handled for output file %s with %s event(s) (ignore)' % (lfn, str(nentries))) + logger.warning(( + 'case not handled for output file %s with %s event(s) ' + '(ignore)', lfn, str(nentries))) - status = False if failed else True + status = (not failed) return status, nevents @@ -1172,7 +1360,7 @@ def remove_from_stageout(lfn, job): outdata = [] for fspec in job.outdata: if fspec.lfn == lfn: - logger.info('removing %s from stage-out list' % lfn) + logger.info('removing %s from stage-out list', lfn) else: outdata.append(fspec) job.outdata = outdata @@ -1180,7 +1368,8 @@ def remove_from_stageout(lfn, job): def remove_no_output_files(job): """ - Remove files from output file list if they are listed in allowNoOutput and do not exist. + Remove files from output file list if they are listed in + allowNoOutput and do not exist. :param job: job object. 
:return: @@ -1194,15 +1383,22 @@ def remove_no_output_files(job): if filename in job.allownooutput: if os.path.exists(path): - logger.info("file %s is listed in allowNoOutput but exists (will not be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput but exists " + "(will not be removed from list of files to be " + "staged-out)", filename)) _outfiles.append(filename) else: - logger.info("file %s is listed in allowNoOutput and does not exist (will be removed from list of files to be staged-out)" % filename) + logger.info(( + "file %s is listed in allowNoOutput and does not exist " + "(will be removed from list of files to be staged-out)", filename)) else: if os.path.exists(path): - logger.info("file %s is not listed in allowNoOutput (will be staged-out)" % filename) + logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) else: - logger.warning("file %s is not listed in allowNoOutput and does not exist (job will fail)" % filename) + logger.warning(( + "file %s is not listed in allowNoOutput and " + "does not exist (job will fail)", filename)) _outfiles.append(filename) # now remove the unwanted fspecs @@ -1223,12 +1419,15 @@ def get_outfiles_records(subfiles): """ res = {} - for f in subfiles: - res[f['name']] = {'guid': f['file_guid'], - 'size': f['file_size']} - nentries = f.get('nentries', 'UNDEFINED') + for subfile in subfiles: + res[subfile['name']] = { + 'guid': subfile['file_guid'], + 'size': subfile['file_size'] + } + + nentries = subfile.get('nentries', 'UNDEFINED') if type(nentries) == int: - res[f['name']]['nentries'] = nentries + res[subfile['name']]['nentries'] = nentries else: logger.warning("nentries is undefined in job report") @@ -1241,14 +1440,15 @@ def get(self, path, dst_dict, dst_key): if len(keys) == 0: return last_key = keys.pop() - v = self + me_ = self for key in keys: - if key in v and isinstance(v[key], dict): - v = v[key] - else: + if not (key in me_ and isinstance(me_[key], dict)): return - if last_key in v: - dst_dict[dst_key] = v[last_key] + + me_ = me_[key] + + if last_key in me_: + dst_dict[dst_key] = me_[last_key] def parse_jobreport_data(job_report): @@ -1271,25 +1471,25 @@ def parse_jobreport_data(job_report): work_attributes["outputfiles"] = [] if "ATHENA_PROC_NUMBER" in os.environ: - logger.debug("ATHENA_PROC_NUMBER: {0}".format(os.environ["ATHENA_PROC_NUMBER"])) + logger.debug("ATHENA_PROC_NUMBER: %s", os.environ["ATHENA_PROC_NUMBER"]) work_attributes['core_count'] = int(os.environ["ATHENA_PROC_NUMBER"]) core_count = int(os.environ["ATHENA_PROC_NUMBER"]) - dq = DictQuery(job_report) - dq.get("resource/transform/processedEvents", work_attributes, "nEvents") - dq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") - dq.get("resource/machine/node", work_attributes, "node") - dq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") - dq.get("resource/dbTimeTotal", work_attributes, "dbTime") - dq.get("resource/dbDataTotal", work_attributes, "dbData") - dq.get("exitCode", work_attributes, "transExitCode") - dq.get("exitMsg", work_attributes, "exeErrorDiag") - dq.get("files/input", work_attributes, "inputfiles") - dq.get("files/output", work_attributes, "outputfiles") + dictq = DictQuery(job_report) + dictq.get("resource/transform/processedEvents", work_attributes, "nEvents") + dictq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") + dictq.get("resource/machine/node", work_attributes, "node") + 
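parse_jobreport_data() pulls values out of the nested job report with the DictQuery helper shown above, addressing them by a slash-separated path. A condensed, runnable version of that lookup on a toy report (the report content is invented):

    class DictQuery(dict):
        """Slash-path lookup that copies a value into a destination dict if present."""
        def get(self, path, dst_dict, dst_key):
            keys = path.split("/")
            last_key = keys.pop()
            node = self
            for key in keys:
                if not (key in node and isinstance(node[key], dict)):
                    return
                node = node[key]
            if last_key in node:
                dst_dict[dst_key] = node[last_key]

    report = DictQuery({'resource': {'transform': {'processedEvents': 1000}}})
    work_attributes = {}
    report.get("resource/transform/processedEvents", work_attributes, "nEvents")
    print(work_attributes)  # {'nEvents': 1000}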
dictq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") + dictq.get("resource/dbTimeTotal", work_attributes, "dbTime") + dictq.get("resource/dbDataTotal", work_attributes, "dbData") + dictq.get("exitCode", work_attributes, "transExitCode") + dictq.get("exitMsg", work_attributes, "exeErrorDiag") + dictq.get("files/input", work_attributes, "inputfiles") + dictq.get("files/output", work_attributes, "outputfiles") outputfiles_dict = {} - for of in work_attributes['outputfiles']: - outputfiles_dict.update(get_outfiles_records(of['subFiles'])) + for opf in work_attributes['outputfiles']: + outputfiles_dict.update(get_outfiles_records(opf['subFiles'])) work_attributes['outputfiles'] = outputfiles_dict if work_attributes['inputfiles']: @@ -1302,20 +1502,36 @@ def parse_jobreport_data(job_report): if 'resource' in job_report and 'executor' in job_report['resource']: j = job_report['resource']['executor'] + + # Original version exc_report = [] fin_report = defaultdict(int) + try: _tmplist = filter(lambda d: 'memory' in d and ('Max' or 'Avg' in d['memory']), j.itervalues()) # Python 2 except Exception: _tmplist = [d for d in iter(list(j.values())) if 'memory' in d and ('Max' or 'Avg' in d['memory'])] # Python 3 - for v in _tmplist: - if 'Avg' in v['memory']: - exc_report.extend(list(v['memory']['Avg'].items())) # Python 2/3 - if 'Max' in v['memory']: - exc_report.extend(list(v['memory']['Max'].items())) # Python 2/3 - for x in exc_report: - fin_report[x[0]] += x[1] + + for item in _tmplist: + if 'Avg' in item['memory']: + exc_report.extend(list(item['memory']['Avg'].items())) # Python 2/3 + if 'Max' in item['memory']: + exc_report.extend(list(item['memory']['Max'].items())) # Python 2/3 + + for item in exc_report: + fin_report[item[0]] += item[1] + + # Proposed version + # fin_report_brinick = defaultdict(int) + # for value in j.values(): + # mem = value.get('memory') + # for key in ('Avg', 'Max'): + # for subk, subv in mem.get(key, {}).items(): + # fin_report_brinick[subk] += subv + # logger.debug("Original code yields fin_report: %s", fin_report) + # logger.debug("Proposed code yields fin_report: %s", fin_report_brinick) + work_attributes.update(fin_report) workdir_size = get_workdir_size() @@ -1325,8 +1541,8 @@ def parse_jobreport_data(job_report): work_attributes["dbTime"], work_attributes["dbData"], workdir_size) - del(work_attributes["dbData"]) - del(work_attributes["dbTime"]) + del work_attributes["dbData"] + del work_attributes["dbTime"] return work_attributes @@ -1337,9 +1553,9 @@ def get_workdir_size(): :return: """ - c, o, e = execute('du -s', shell=True) - if o is not None: - return o.split()[0] + _, stdout, _ = execute('du -s', shell=True) + if stdout is not None: + return stdout.split()[0] return None @@ -1366,42 +1582,6 @@ def get_executor_dictionary(jobreport_dictionary): return executor_dictionary -def get_number_of_events_deprecated(jobreport_dictionary): # TODO: remove this function - """ - Extract the number of events from the job report. - - :param jobreport_dictionary: - :return: - """ - - nevents = {} # FORMAT: { format : total_events, .. 
} - nmax = 0 - - executor_dictionary = get_executor_dictionary(jobreport_dictionary) - if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'nevents' in executor_dictionary[format]: - if format in nevents: - nevents[format] += executor_dictionary[format]['nevents'] - else: - nevents[format] = executor_dictionary[format]['nevents'] - else: - logger.warning("format %s has no such key: nevents" % (format)) - - # Now find the largest number of events among the different formats - if nevents != {}: - try: - nmax = max(nevents.values()) - except Exception as e: - logger.warning("exception caught: %s" % (e)) - nmax = 0 - else: - logger.warning("did not find the number of events in the job report") - nmax = 0 - - return nmax - - def get_resimevents(jobreport_dictionary): """ Extract and add up the resimevents from the job report. @@ -1415,11 +1595,11 @@ def get_resimevents(jobreport_dictionary): executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "ReSim", Python 2/3 - if 'resimevents' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "ReSim", Python 2/3 + if 'resimevents' in executor_dictionary[fmt]: try: - resimevents = int(executor_dictionary[format]['resimevents']) - except Exception: + resimevents = int(executor_dictionary[fmt]['resimevents']) + except (KeyError, ValueError, TypeError): pass else: break @@ -1431,8 +1611,9 @@ def get_db_info(jobreport_dictionary): """ Extract and add up the DB info from the job report. This information is reported with the jobMetrics. - Note: this function adds up the different dbData and dbTime's in the different executor steps. In modern job - reports this might have been done already by the transform and stored in dbDataTotal and dbTimeTotal. + Note: this function adds up the different dbData and dbTime's in + the different executor steps. In modern job reports this might have + been done already by the transform and stored in dbDataTotal and dbTimeTotal. :param jobreport_dictionary: job report dictionary. 
:return: db_time (int), db_data (long) @@ -1441,26 +1622,26 @@ def get_db_info(jobreport_dictionary): db_time = 0 try: db_data = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: db_data = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'dbData' in executor_dictionary[format]: + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + if 'dbData' in executor_dictionary[fmt]: try: - db_data += executor_dictionary[format]['dbData'] + db_data += executor_dictionary[fmt]['dbData'] except Exception: pass else: - logger.warning("format %s has no such key: dbData" % format) - if 'dbTime' in executor_dictionary[format]: + logger.warning("format %s has no such key: dbData", fmt) + if 'dbTime' in executor_dictionary[fmt]: try: - db_time += executor_dictionary[format]['dbTime'] + db_time += executor_dictionary[fmt]['dbTime'] except Exception: pass else: - logger.warning("format %s has no such key: dbTime" % format) + logger.warning("format %s has no such key: dbTime", fmt) return db_time, db_data @@ -1477,17 +1658,16 @@ def get_db_info_str(db_time, db_data): try: zero = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: zero = 0 # Python 3 + db_data_s = "" if db_data != zero: db_data_s = "%s" % (db_data) - else: - db_data_s = "" + + db_time_s = "" if db_time != 0: db_time_s = "%.2f" % (db_time) - else: - db_time_s = "" return db_time_s, db_data_s @@ -1500,24 +1680,24 @@ def get_cpu_times(jobreport_dictionary): Note: this function is used with Event Service jobs :param jobreport_dictionary: - :return: cpu_conversion_unit (unit), total_cpu_time, conversion_factor (output consistent with set_time_consumed()) + :return: cpu_conversion_unit (unit), total_cpu_time, + conversion_factor (output consistent with set_time_consumed()) """ try: total_cpu_time = long(0) # Python 2 # noqa: F821 - except Exception: + except NameError: total_cpu_time = 0 # Python 3 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for format in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 - if 'cpuTime' in executor_dictionary[format]: - try: - total_cpu_time += executor_dictionary[format]['cpuTime'] - except Exception: - pass - else: - logger.warning("format %s has no such key: cpuTime" % (format)) + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + try: + total_cpu_time += executor_dictionary[fmt]['cpuTime'] + except KeyError: + logger.warning("format %s has no such key: cpuTime", fmt) + except Exception: + pass conversion_factor = 1.0 cpu_conversion_unit = "s" @@ -1546,15 +1726,15 @@ def cleanup_looping_payload(workdir): :return: """ - for (p, d, f) in os.walk(workdir): - for filename in f: + for (root, _, files) in os.walk(workdir): + for filename in files: if 'pool.root' in filename: - path = os.path.join(p, filename) + path = os.path.join(root, filename) path = os.path.abspath(path) remove(path) -def cleanup_payload(workdir, outputfiles=[], removecores=True): +def cleanup_payload(workdir, outputfiles=None, removecores=True): """ Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. Also remove core dumps. 
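The outputfiles=[] to outputfiles=None signature change in cleanup_payload() above (and later in remove_redundant_files()) is the usual Python guard against mutable default arguments, paired with the "if outputfiles is None: outputfiles = []" check inside the function. A minimal standalone sketch of the pitfall being avoided — not Pilot code, just the generic behaviour:

    def risky(files=[]):        # the same list object is reused on every call
        files.append('x')
        return files

    def safe(files=None):       # a fresh list is created per call instead
        if files is None:
            files = []
        files.append('x')
        return files

    print(risky())   # ['x']
    print(risky())   # ['x', 'x']   state leaks between calls
    print(safe())    # ['x']
    print(safe())    # ['x']        independent lists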
@@ -1565,26 +1745,31 @@ def cleanup_payload(workdir, outputfiles=[], removecores=True): :return: """ + if outputfiles is None: + outputfiles = [] + if removecores: remove_core_dumps(workdir) for ampdir in glob('%s/athenaMP-workers-*' % workdir): - for (p, d, f) in os.walk(ampdir): - for filename in f: - if ('core' in filename and removecores) or 'pool.root' in filename or 'tmp.' in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) + for (root, _, files) in os.walk(ampdir): + for filename in files: + path = os.path.abspath(os.path.join(root, filename)) + + if ('core' in filename and removecores) or \ + 'pool.root' in filename or \ + 'tmp.' in filename: remove(path) + for outfile in outputfiles: if outfile in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) remove(path) def get_redundant_path(): """ - Return the path to the file containing the redundant files and directories to be removed prior to log file creation. + Return the path to the file containing the redundant files + and directories to be removed prior to log file creation. :return: file path (string). """ @@ -1601,20 +1786,26 @@ def get_redundant_path(): def get_redundants(): """ Get list of redundant files and directories (to be removed). - The function will return the content of an external file. It that can't be read, then a list defined in this - function will be returned instead. Any updates to the external file must be propagated to this function. + The function will return the content of an external file. It that + can't be read, then a list defined in this function will be returned instead. + Any updates to the external file must be propagated to this function. :return: files and directories list """ # try to read the list from the external file filename = get_redundant_path() - if os.path.exists(filename) and False: # do not use the cvmfs file since it is not being updated - dir_list = read_list(filename) - if dir_list: - return dir_list - logger.debug('list of redundant files could not be read from external file: %s (will use internal list)' % filename) + # do not use the cvmfs file since it is not being updated + # If you uncomment this block, need to also uncomment the read_list import + # if os.path.exists(filename) and False: + # dir_list = read_list(filename) + # if dir_list: + # return dir_list + + logger.debug(( + 'list of redundant files could not be read from external file: %s ' + '(will use internal list)', filename)) # else return the following dir_list = ["AtlasProduction*", @@ -1683,7 +1874,8 @@ def get_redundants(): def remove_archives(workdir): """ - Explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command + Explicitly remove any soft linked archives (.a files) since + they will be dereferenced by the tar command (--dereference option). 
:param workdir: working directory (string) @@ -1691,15 +1883,15 @@ def remove_archives(workdir): """ matches = [] - for root, dirnames, filenames in os.walk(workdir): + for root, _, filenames in os.walk(workdir): for filename in fnmatch.filter(filenames, '*.a'): matches.append(os.path.join(root, filename)) - for root, dirnames, filenames in os.walk(os.path.dirname(workdir)): + for root, _, filenames in os.walk(os.path.dirname(workdir)): for filename in fnmatch.filter(filenames, 'EventService_premerge_*.tar'): matches.append(os.path.join(root, filename)) - if matches != []: - for f in matches: - remove(f) + + for match in matches: + remove(match) def cleanup_broken_links(workdir): @@ -1711,28 +1903,26 @@ def cleanup_broken_links(workdir): """ broken = [] - for root, dirs, files in os.walk(workdir): + for root, _, files in os.walk(workdir): for filename in files: path = os.path.join(root, filename) - if os.path.islink(path): - target_path = os.readlink(path) - # Resolve relative symlinks - if not os.path.isabs(target_path): - target_path = os.path.join(os.path.dirname(path), target_path) - if not os.path.exists(target_path): - broken.append(path) - else: - # If it's not a symlink we're not interested. + if not os.path.islink(path): continue - if broken: - for p in broken: - remove(p) + target_path = os.readlink(path) + # Resolve relative symlinks + if not os.path.isabs(target_path): + target_path = os.path.join(os.path.dirname(path), target_path) + if not os.path.exists(target_path): + broken.append(path) + + for brok in broken: + remove(brok) -def ls(workdir): +def list_work_dir(workdir): cmd = 'ls -lF %s' % workdir - ec, stdout, stderr = execute(cmd) + _, stdout, stderr = execute(cmd) logger.debug('%s:\n' % stdout + stderr) @@ -1752,33 +1942,32 @@ def remove_special_files(workdir, dir_list, outputfiles): to_delete = [] for _dir in dir_list: files = glob(os.path.join(workdir, _dir)) + if not files: + continue + exclude = [] + for exc in exceptions_list: + for item in files: + if exc in item: + exclude.append(os.path.abspath(item)) - if files: - for exc in exceptions_list: - for f in files: - if exc in f: - exclude.append(os.path.abspath(f)) - _files = [] - for f in files: - if f not in exclude: - _files.append(os.path.abspath(f)) - to_delete += _files + _files = [os.path.abspath(item) for item in files if item not in exclude] + to_delete += _files exclude_files = [] - for of in outputfiles: - exclude_files.append(os.path.join(workdir, of)) - - for f in to_delete: - if f not in exclude_files: - logger.debug('removing %s' % f) - if os.path.isfile(f): - remove(f) + for opf in outputfiles: + exclude_files.append(os.path.join(workdir, opf)) + + for item in to_delete: + if item not in exclude_files: + logger.debug('removing %s', item) + if os.path.isfile(item): + remove(item) else: - remove_dir_tree(f) + remove_dir_tree(item) -def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=False): +def remove_redundant_files(workdir, outputfiles=None, islooping=False, debugmode=False): """ Remove redundant files and directories prior to creating the log file. @@ -1786,28 +1975,32 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F :param workdir: working directory (string). :param outputfiles: list of protected output files (list). - :param islooping: looping job variable to make sure workDir is not removed in case of looping (boolean). + :param islooping: looping job variable to make sure workDir + is not removed in case of looping (boolean). 
:param debugmode: True if debug mode has been switched on (Boolean). :return: """ + if outputfiles is None: + outputfiles = [] + logger.debug("removing redundant files prior to log creation") workdir = os.path.abspath(workdir) - ls(workdir) + list_work_dir(workdir) # get list of redundant files and directories (to be removed) dir_list = get_redundants() # remove core and pool.root files from AthenaMP sub directories + logger.debug('cleaning up payload') try: - logger.debug('cleaning up payload') cleanup_payload(workdir, outputfiles, removecores=not debugmode) - except Exception as e: - logger.warning("failed to execute cleanup_payload(): %s" % e) + except OSError as exc: + logger.warning("failed to execute cleanup_payload(): %s", exc) - # explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command - # (--dereference option) + # explicitly remove any soft linked archives (.a files) + # since they will be dereferenced by the tar command (--dereference option) logger.debug('removing archives') remove_archives(workdir) @@ -1824,7 +2017,7 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F # remove at least root files from workDir (ie also in the case of looping job) cleanup_looping_payload(path) if not islooping: - logger.debug('removing \'workDir\' from workdir=%s' % workdir) + logger.debug('removing \'workDir\' from workdir=%s', workdir) remove_dir_tree(path) # remove additional dirs @@ -1832,10 +2025,10 @@ def remove_redundant_files(workdir, outputfiles=[], islooping=False, debugmode=F for additional in additionals: path = os.path.join(workdir, additional) if os.path.exists(path): - logger.debug('removing \'%s\' from workdir=%s' % (additional, workdir)) + logger.debug('removing \'%s\' from workdir=%s', additional, workdir) remove_dir_tree(path) - ls(workdir) + list_work_dir(workdir) def download_command(process, workdir): @@ -1854,9 +2047,9 @@ def download_command(process, workdir): # download the command if necessary if cmd.startswith('http'): # Try to download the trf (skip when user container is to be used) - ec, diagnostics, cmd = get_analysis_trf(cmd, workdir) - if ec != 0: - logger.warning('cannot execute command due to previous error: %s' % cmd) + exitcode, _, cmd = get_analysis_trf(cmd, workdir) + if exitcode != 0: + logger.warning('cannot execute command due to previous error: %s', cmd) return {} # update the preprocess command (the URL should be stripped) @@ -1867,15 +2060,26 @@ def download_command(process, workdir): def get_utility_commands(order=None, job=None): """ - Return a dictionary of utility commands and arguments to be executed in parallel with the payload. - This could e.g. be memory and network monitor commands. A separate function can be used to determine the - corresponding command setups using the utility command name. - If the optional order parameter is set, the function should return the list of corresponding commands. - E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the - payload. If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started - should be returned. If order=UTILITY_WITH_STAGEIN, the commands that should be executed parallel with stage-in will - be returned. 
+ Return a dictionary of utility commands and arguments to be executed + in parallel with the payload. This could e.g. be memory and network + monitor commands. A separate function can be used to determine the + corresponding command setups using the utility command name. If the + optional order parameter is set, the function should return the list + of corresponding commands. + + For example: + + If order=UTILITY_BEFORE_PAYLOAD, the function should return all + commands that are to be executed before the payload. + + If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be + prepended to the payload execution string. + + If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be + executed after the payload has been started should be returned. + + If order=UTILITY_WITH_STAGEIN, the commands that should be executed + parallel with stage-in will be returned. FORMAT: {'command': , 'args': , 'label': } @@ -1886,18 +2090,38 @@ def get_utility_commands(order=None, job=None): if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') - elif order == UTILITY_WITH_PAYLOAD: + + if order == UTILITY_WITH_PAYLOAD: return {'command': 'NetworkMonitor', 'args': '', 'label': 'networkmonitor'} - elif order == UTILITY_AFTER_PAYLOAD_STARTED: + + if order == UTILITY_AFTER_PAYLOAD_STARTED: return get_utility_after_payload_started() - elif order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: + + if order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: return get_precopostprocess_command(job.coprocess, job.workdir, 'coprocess') - elif order == UTILITY_AFTER_PAYLOAD_FINISHED: - return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_kill', xcache_deactivation_command) - elif order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: + + if order == UTILITY_AFTER_PAYLOAD_FINISHED: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_kill', + xcache_deactivation_command, + ) + + if order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: return get_precopostprocess_command(job.postprocess, job.workdir, 'postprocess') - elif order == UTILITY_BEFORE_STAGEIN: - return get_xcache_command(job.infosys.queuedata.catchall, job.workdir, job.jobid, 'xcache_start', xcache_activation_command) + + if order == UTILITY_BEFORE_STAGEIN: + return get_xcache_command( + job.infosys.queuedata.catchall, + job.workdir, + job.jobid, + 'xcache_start', + xcache_activation_command, + ) + + return None def get_precopostprocess_command(process, workdir, label): @@ -1973,16 +2197,16 @@ def post_prestagein_utility_command(**kwargs): stdout = kwargs.get('output', None) if stdout: - logger.debug('processing stdout for label=%s' % label) + logger.debug('processing stdout for label=%s', label) xcache_proxy(stdout) else: - logger.warning('no output for label=%s' % label) + logger.warning('no output for label=%s', label) alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') if alrb_xcache_files: cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' - exit_code, _stdout, _stderr = execute(cmd) - logger.debug('cmd=%s:\n\n%s\n\n' % (cmd, _stdout)) + _, _stdout, _ = execute(cmd) + logger.debug('cmd=%s:\n\n%s\n\n', cmd, _stdout) def xcache_proxy(output): @@ -1996,16 +2220,31 @@ def xcache_proxy(output): # loop over each line in the xcache stdout and identify the needed environmental variables for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: - remote = 
'REMOTE' in line - name = 'ALRB_XCACHE_PROXY_REMOTE' if remote else 'ALRB_XCACHE_PROXY' - pattern = r'\ export\ ALRB_XCACHE_PROXY_REMOTE\=\"(.+)\"' if remote else r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"' + suffix = '_REMOTE' if 'REMOTE' in line else '' + name = 'ALRB_XCACHE_PROXY%s' % suffix + pattern = r'\ export\ ALRB_XCACHE_PROXY%s\=\"(.+)\"' % suffix set_xcache_var(line, name=name, pattern=pattern) + elif 'ALRB_XCACHE_MYPROCESS' in line: - set_xcache_var(line, name='ALRB_XCACHE_MYPROCESS', pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_MYPROCESS', + pattern=r'\ ALRB_XCACHE_MYPROCESS\=(.+)' + ) + elif 'Messages logged in' in line: - set_xcache_var(line, name='ALRB_XCACHE_LOG', pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_LOG', + pattern=r'xcache\ started\ successfully.\ \ Messages\ logged\ in\ (.+)' + ) + elif 'ALRB_XCACHE_FILES' in line: - set_xcache_var(line, name='ALRB_XCACHE_FILES', pattern=r'\ ALRB_XCACHE_FILES\=(.+)') + set_xcache_var( + line, + name='ALRB_XCACHE_FILES', + pattern=r'\ ALRB_XCACHE_FILES\=(.+)' + ) def set_xcache_var(line, name='', pattern=''): @@ -2028,7 +2267,8 @@ def xcache_activation_command(workdir='', jobid=''): """ Return the xcache service activation command. - Note: the workdir is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). + Note: the workdir is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). :param workdir: unused work directory - do not remove (string). :param jobid: PanDA job id to guarantee that xcache process is unique (int). @@ -2036,13 +2276,17 @@ def xcache_activation_command(workdir='', jobid=''): """ # a successful startup will set ALRB_XCACHE_PROXY and ALRB_XCACHE_PROXY_REMOTE - # so any file access with root://... should be replaced with one of the above - # (depending on whether you are on the same machine or not) + # so any file access with root://... should be replaced with one of + # the above (depending on whether you are on the same machine or not) # example: # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 command = "%s " % get_asetup(asetup=False) - # add 'xcache list' which will also kill any orphaned processes lingering in the system - command += "lsetup xcache; xcache list; xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid + + # add 'xcache list' which will also kill any + # orphaned processes lingering in the system + command += ( + "lsetup xcache; xcache list; " + "xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid) return {'command': command, 'args': ''} @@ -2053,7 +2297,8 @@ def xcache_deactivation_command(workdir='', jobid=''): This service should be stopped after the payload has finished. Copy the messages log before shutting down. - Note: the job id is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). + Note: the job id is not used here, but the function prototype + needs it in the called (xcache_activation_command needs it). :param workdir: payload work directory (string). :param jobid: unused job id - do not remove (string). 
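The refactored xcache_proxy() above only builds the regex patterns and names; the actual extraction is delegated to set_xcache_var(), whose body is not part of these hunks. Assuming that helper simply stores the first capture group in the environment, the mechanism can be sketched as follows (illustrative only, the real helper may differ in detail):

    import os
    import re

    def set_xcache_var_sketch(line, name='', pattern=''):
        # store the first regex match from the xcache stdout line in the environment
        found = re.findall(pattern, line)
        if found:
            os.environ[name] = found[0]

    line = ' export ALRB_XCACHE_PROXY="http://localhost:48123//"'
    set_xcache_var_sketch(line, name='ALRB_XCACHE_PROXY',
                          pattern=r'\ export\ ALRB_XCACHE_PROXY\=\"(.+)\"')
    print(os.environ.get('ALRB_XCACHE_PROXY'))  # http://localhost:48123//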
@@ -2062,17 +2307,17 @@ def xcache_deactivation_command(workdir='', jobid=''): path = os.environ.get('ALRB_XCACHE_LOG', None) if path and os.path.exists(path): - logger.debug('copying xcache messages log file (%s) to work dir (%s)' % (path, workdir)) + logger.debug('copying xcache messages log file (%s) to work dir (%s)', path, workdir) dest = os.path.join(workdir, 'xcache-messages.log') try: copy(path, dest) - except Exception as e: - logger.warning('exception caught copying xcache log: %s' % e) + except Exception as exc: + logger.warning('exception caught copying xcache log: %s', exc) else: if not path: logger.warning('ALRB_XCACHE_LOG is not set') if path and not os.path.exists(path): - logger.warning('path does not exist: %s' % path) + logger.warning('path does not exist: %s', path) command = "%s " % get_asetup(asetup=False) command += "lsetup xcache; xcache kill" # -C centos7 @@ -2091,11 +2336,23 @@ def get_utility_command_setup(name, job, setup=None): """ if name == 'MemoryMonitor': - # must know if payload is running in a container or not (enables search for pid in ps output) + # must know if payload is running in a container or not + # (enables search for pid in ps output) use_container = job.usecontainer or 'runcontainer' in job.transformation - dump_ps = True if "PRMON_DEBUG" in job.infosys.queuedata.catchall else False - setup, pid = get_memory_monitor_setup(job.pid, job.pgrp, job.jobid, job.workdir, job.command, use_container=use_container, - transformation=job.transformation, outdata=job.outdata, dump_ps=dump_ps) + dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) + + setup, pid = get_memory_monitor_setup( + job.pid, + job.pgrp, + job.jobid, + job.workdir, + job.command, + use_container=use_container, + transformation=job.transformation, + outdata=job.outdata, + dump_ps=dump_ps + ) + _pattern = r"([\S]+)\ ." pattern = re.compile(_pattern) _name = re.findall(pattern, setup.split(';')[-1]) @@ -2105,21 +2362,24 @@ def get_utility_command_setup(name, job, setup=None): logger.warning('trf name could not be identified in setup string') # update the pgrp if the pid changed - if job.pid != pid and pid != --1: - logger.debug('updating pgrp=%d for pid=%d' % (job.pgrp, pid)) + if pid not in (job.pid, -1): + logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) try: job.pgrp = os.getpgid(pid) - except Exception as e: - logger.warning('os.getpgid(%d) failed with: %s' % (pid, e)) + except Exception as exc: + logger.warning('os.getpgid(%d) failed with: %s', pid, exc) return setup - elif name == 'NetworkMonitor' and setup: + + if name == 'NetworkMonitor' and setup: return get_network_monitor_setup(setup, job) - elif name == 'Prefetcher': + + if name == 'Prefetcher': return get_prefetcher_setup(job) - elif name == 'Benchmark': + + if name == 'Benchmark': return get_benchmark_setup(job) - else: - return "" + + return "" def get_utility_command_execution_order(name): @@ -2133,12 +2393,13 @@ def get_utility_command_execution_order(name): # example implementation if name == 'NetworkMonitor': return UTILITY_WITH_PAYLOAD - elif name == 'MemoryMonitor': - return UTILITY_AFTER_PAYLOAD_STARTED - else: - logger.warning('unknown utility name: %s' % name) + + if name == 'MemoryMonitor': return UTILITY_AFTER_PAYLOAD_STARTED + logger.warning('unknown utility name: %s', name) + return UTILITY_AFTER_PAYLOAD_STARTED + def post_utility_command_action(name, job): """ @@ -2193,7 +2454,7 @@ def verify_lfn_length(outdata): :return: error code (int), diagnostics (string). 
""" - ec = 0 + exitcode = 0 diagnostics = "" max_length = 255 @@ -2202,10 +2463,10 @@ def verify_lfn_length(outdata): if len(fspec.lfn) > max_length: diagnostics = "LFN too long (length: %d, must be less than %d characters): %s" % \ (len(fspec.lfn), max_length, fspec.lfn) - ec = errors.LFNTOOLONG + exitcode = errors.LFNTOOLONG break - return ec, diagnostics + return exitcode, diagnostics def verify_ncores(corecount): @@ -2227,25 +2488,30 @@ def verify_ncores(corecount): except Exception: athena_proc_number = None - # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not overwrite it - # Otherwise, set it to the value of job.coreCount - # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, otherwise use ATHENA_PROC_NUMBER directly; - # ATHENA_PROC_NUMBER_JOB will always be the value from the job definition) + # Note: if ATHENA_PROC_NUMBER is set (by the wrapper), then do not + # overwrite it. Otherwise, set it to the value of job.coreCount + # (actually set ATHENA_PROC_NUMBER_JOB and use it if it exists, + # otherwise use ATHENA_PROC_NUMBER directly; ATHENA_PROC_NUMBER_JOB + # will always be the value from the job definition) if athena_proc_number: - logger.info("encountered a set ATHENA_PROC_NUMBER (%d), will not overwrite it" % athena_proc_number) + logger.info(( + "encountered a set ATHENA_PROC_NUMBER (%d), " + "will not overwrite it", athena_proc_number)) logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') - os.environ['ATHENA_CORE_NUMBER'] = "%s" % athena_proc_number + os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: - os.environ['ATHENA_PROC_NUMBER_JOB'] = "%s" % corecount - os.environ['ATHENA_CORE_NUMBER'] = "%s" % corecount - logger.info("set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s (ATHENA_PROC_NUMBER will not be overwritten)" % corecount) + os.environ['ATHENA_PROC_NUMBER_JOB'] = str(corecount) + os.environ['ATHENA_CORE_NUMBER'] = str(corecount) + logger.info(( + "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " + "(ATHENA_PROC_NUMBER will not be overwritten)", corecount)) def verify_job(job): """ Verify job parameters for specific errors. Note: - in case of problem, the function should set the corresponding pilot error code using + in case of problem, the function should set the corresponding pilot error code using: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) :param job: job object @@ -2255,11 +2521,11 @@ def verify_job(job): status = False # are LFNs of correct lengths? - ec, diagnostics = verify_lfn_length(job.outdata) - if ec != 0: + exitcode, diagnostics = verify_lfn_length(job.outdata) + if exitcode != 0: logger.fatal(diagnostics) job.piloterrordiag = diagnostics - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode) else: status = True @@ -2292,7 +2558,7 @@ def get_metadata(workdir): path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None - logger.debug('metadata=%s' % str(metadata)) + logger.debug('metadata=%s', str(metadata)) return metadata @@ -2304,12 +2570,7 @@ def should_update_logstash(frequency=10): :param frequency: :return: return True once per 'frequency' times. 
""" - - from random import randint - if randint(0, frequency - 1) == 0: - return True - else: - return False + return randint(0, frequency - 1) == 0 def update_server(job): @@ -2323,39 +2584,57 @@ def update_server(job): """ # attempt to read memory_monitor_output.txt and convert it to json - if should_update_logstash(): - path = os.path.join(job.workdir, get_memory_monitor_output_filename()) - if os.path.exists(path): - # convert memory monitor text output to json and return the selection (don't store it, log has already been created) - metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) - if metadata_dictionary: - # the output was previously written to file, update the path and tell curl to send it - new_path = update_extension(path=path, extension='json') - #out = read_json(new_path) - #logger.debug('prmon json=\n%s' % out) - # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) - url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' - #cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ - # (str(metadata_dictionary).replace("'", '"'), url) - # curl --connect-timeout 20 --max-time 120 -H "Content-Type: application/json" -X POST --upload-file test.json - # https://pilot.atlas-ml.org - cmd = "curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST --upload-file %s %s" % (new_path, url) - #cmd = "curl --connect-timeout 20 --max-time 120 -F 'data=@%s' %s" % (new_path, url) - # send metadata to logstash - try: - exit_code, stdout, stderr = execute(cmd, usecontainer=False) - except Exception as e: - logger.warning('exception caught: %s' % e) - else: - logger.debug('sent prmon JSON dictionary to logstash server') - logger.debug('stdout: %s' % stdout) - logger.debug('stderr: %s' % stderr) - else: - logger.warning('no prmon json available - cannot send anything to logstash server') + if not should_update_logstash(): + logger.debug('no need to update logstash for this job') + return + + path = os.path.join(job.workdir, get_memory_monitor_output_filename()) + if not os.path.exists(path): + logger.warning('path does not exist: %s', path) + return + + # convert memory monitor text output to json and return the selection + # (don't store it, log has already been created) + metadata_dictionary = get_metadata_dict_from_txt(path, storejson=True, jobid=job.jobid) + if metadata_dictionary: + # the output was previously written to file, + # update the path and tell curl to send it + new_path = update_extension(path=path, extension='json') + + #out = read_json(new_path) + #logger.debug('prmon json=\n%s' % out) + # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) + url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' + + # cmd = ( + # "curl --connect-timeout 20 --max-time 120 " + # "-H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ + # (str(metadata_dictionary).replace("'", '"'), url) + #) + + # curl --connect-timeout 20 --max-time 120 -H + # "Content-Type: application/json" -X POST --upload-file test.json + # https://pilot.atlas-ml.org + cmd = ( + "curl --connect-timeout 20 --max-time 120 " + "-H \"Content-Type: application/json\" " + "-X POST " + "--upload-file %s %s" % (new_path, url) + ) + #cmd = "curl --connect-timeout 20 --max-time 120 -F + # 'data=@%s' %s" % (new_path, url) + # send metadata to logstash + try: + _, stdout, stderr = execute(cmd, usecontainer=False) + 
except Exception as exc: + logger.warning('exception caught: %s', exc) else: - logger.warning('path does not exist: %s' % path) + logger.debug('sent prmon JSON dictionary to logstash server') + logger.debug('stdout: %s', stdout) + logger.debug('stderr: %s', stderr) else: - logger.debug('no need to update logstash for this job') + msg = 'no prmon json available - cannot send anything to logstash server' + logger.warning(msg) def preprocess_debug_command(job): @@ -2367,7 +2646,11 @@ def preprocess_debug_command(job): preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required resource_name = get_resource_name() # 'grid' if no hpc_resource is set - resource = __import__('pilot.user.atlas.resource.%s' % resource_name, globals(), locals(), [resource_name], 0) # Python 3, -1 -> 0 + + # Python 3, level: -1 -> 0 + modname = 'pilot.user.atlas.resource.%s' % resource_name + resource = __import__(modname, globals(), locals(), [resource_name], 0) + cmd = resource.get_setup_command(job, preparesetup) if not cmd.endswith(';'): cmd += '; ' @@ -2377,51 +2660,69 @@ def preprocess_debug_command(job): def process_debug_command(debug_command, pandaid): """ - In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. - This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown + In debug mode, the server can send a special debug command to the piloti + via the updateJob backchannel. This function can be used to process that + command, i.e. to identify a proper pid to debug (which is unknown to the server). - For gdb, the server might send a command with gdb option --pid %. The pilot need to replace the % with the proper - pid. The default (hardcoded) process will be that of athena.py. The pilot will find the corresponding pid. + For gdb, the server might send a command with gdb option --pid %. + The pilot need to replace the % with the proper pid. The default + (hardcoded) process will be that of athena.py. The pilot will find the + corresponding pid. :param debug_command: debug command (string). :param pandaid: PanDA id (string). :return: updated debug command (string). """ + if '--pid %' not in debug_command: + return debug_command + pandaid_pid = None - if '--pid %' in debug_command: - # replace the % with the pid for athena.py - # note: if athena.py is not yet running, the --pid % will remain. Otherwise the % will be replaced by the pid - # first find the pid (if athena.py is running) - cmd = 'ps axo pid,ppid,pgid,args' - exit_code, stdout, stderr = execute(cmd) - if stdout: - #logger.debug('ps=\n\n%s\n' % stdout) - # convert the ps output to a dictionary - dictionary = convert_ps_to_dict(stdout) - # trim this dictionary to reduce the size (only keep the PID and PPID lists) - trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) - # what is the pid of the trf? - pandaid_pid = find_pid(pandaid, dictionary) - # find all athena processes - pids = find_cmd_pids('athena.py', dictionary) - # which of the found pids are children of the trf? (which has an export PandaID=.. 
attached to it) - for pid in pids: - try: - child = is_child(pid, pandaid_pid, trimmed_dictionary) - except RuntimeError as e: - logger.warning('too many recursions: %s (cannot identify athena process)' % e) - else: - if child: - logger.info('pid=%d is a child process of the trf of this job' % pid) - debug_command = debug_command.replace('--pid %', '--pid %d' % pid) - logger.info('updated debug command: %s' % debug_command) - break - else: - logger.info('pid=%d is not a child process of the trf of this job' % pid) - if not pids or '--pid %' in debug_command: - logger.debug('athena is not yet running (no corresponding pid)') - debug_command = '' # reset the command to prevent the payload from being killed (will be killed when gdb has run) + + # replace the % with the pid for athena.py + # note: if athena.py is not yet running, the --pid % will remain. + # Otherwise the % will be replaced by the pid first find the pid + # (if athena.py is running) + cmd = 'ps axo pid,ppid,pgid,args' + _, stdout, _ = execute(cmd) + if stdout: + #logger.debug('ps=\n\n%s\n' % stdout) + # convert the ps output to a dictionary + dictionary = convert_ps_to_dict(stdout) + + # trim this dictionary to reduce the size + # (only keep the PID and PPID lists) + trimmed_dictionary = get_trimmed_dictionary(['PID', 'PPID'], dictionary) + + # what is the pid of the trf? + pandaid_pid = find_pid(pandaid, dictionary) + + # find all athena processes + pids = find_cmd_pids('athena.py', dictionary) + + # which of the found pids are children of the trf? + # (which has an export PandaID=.. attached to it) + for pid in pids: + try: + child = is_child(pid, pandaid_pid, trimmed_dictionary) + except RuntimeError as rte: + logger.warning(( + 'too many recursions: %s ' + '(cannot identify athena process)', rte)) + else: + if child: + logger.info('pid=%d is a child process of the trf of this job', pid) + debug_command = debug_command.replace('--pid %', '--pid %d' % pid) + logger.info('updated debug command: %s', debug_command) + break + logger.info('pid=%d is not a child process of the trf of this job', pid) + + if not pids or '--pid %' in debug_command: + logger.debug('athena is not yet running (no corresponding pid)') + + # reset the command to prevent the payload from being killed + # (will be killed when gdb has run) + debug_command = '' return debug_command From d40f349e46655b1115f3a0a4dcff9a560f4b6724 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 11:04:49 +0200 Subject: [PATCH 77/96] Remove trailing whitespace --- pilot/user/atlas/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d02faaf8..3e26f401 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -16,7 +16,6 @@ import re from random import randint from signal import SIGTERM, SIGUSR1 -from typing import Type # from tarfile import ExFileObject try: @@ -1043,7 +1042,7 @@ def discover_new_outdata(job): 'workdir': job.workdir, 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, - 'ddmendpoint_alt': None, + 'ddmendpoint_alt': None, 'filesize': new_output[outfile]['filesize'], 'checksum': new_output[outfile]['checksum'], 'guid': '' From 11c1a4e4f6b448ced3ee127b6c9538b8e896558f Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 11:07:57 +0200 Subject: [PATCH 78/96] Fix flake8 issues --- pilot/user/atlas/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pilot/user/atlas/common.py 
b/pilot/user/atlas/common.py index 3e26f401..40c6a511 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1755,9 +1755,11 @@ def cleanup_payload(workdir, outputfiles=None, removecores=True): for filename in files: path = os.path.abspath(os.path.join(root, filename)) - if ('core' in filename and removecores) or \ - 'pool.root' in filename or \ - 'tmp.' in filename: + core_file = ('core' in filename and removecores) + pool_root_file = 'pool.root' in filename + tmp_file = 'tmp.' in filename + + if core_file or pool_root_file or tmp_file: remove(path) for outfile in outputfiles: From 401dbfbc357d477d877019e04afe956eb98fa08e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 21 Jun 2021 13:04:57 +0200 Subject: [PATCH 79/96] Avoiding the decode problem with strings. Added protection against UTF-8 failures while parsing stdout --- PILOTVERSION | 2 +- pilot/user/atlas/diagnose.py | 5 ++++- pilot/util/constants.py | 2 +- pilot/util/container.py | 13 ++++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index a14d3a59..d2b75bf7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.46 \ No newline at end of file +2.12.1.49 \ No newline at end of file diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index 9f131c93..7b3dbaae 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -70,7 +70,10 @@ def interpret(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) # interpret the exit info from the payload - interpret_payload_exit_info(job) + try: + interpret_payload_exit_info(job) + except Exception as error: + logger.warning('exception caught while interpreting payload exit info: %s', error) return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 667d188b..9f9f2120 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '46' # build number should be reset to '1' for every new development cycle +BUILD = '49' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index 0a6c20b0..c81e5f22 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -91,10 +91,17 @@ def execute(executable, **kwargs): stdout, stderr = process.communicate() exit_code = process.poll() + # this should not be necessary since encoding is set above for Py3 (it is necessary if encoding above is removed) # for Python 3, convert from byte-like object to str - if is_python3(): - stdout = stdout.decode('utf-8') - stderr = stderr.decode('utf-8') + #import sys + #if is_python3(): + # logger.debug('Using python version=%s' % str(sys.version_info)) + # try: + # stdout = stdout.decode('utf-8') + # stderr = stderr.decode('utf-8') + # except Exception as error: + # logger.warning('exception caught: %s (can be ignored)', error) + # remove any added \n if stdout and stdout.endswith('\n'): stdout = stdout[:-1] From 7aa6bd8bdfefd03cdf69b862228c1c8f29c9903d Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 21 Jun 2021 13:31:37 +0200 Subject: [PATCH 80/96] Improve code in jobreport parsing function --- 
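Note: the patch below adds a proposed defaultdict-based replacement for the memory aggregation in parse_jobreport_data() and logs its result next to the original for comparison. On a toy executor dictionary (made-up numbers, for illustration only) the proposed loop reduces to:

    from collections import defaultdict

    # made-up executor section of a job report, only for illustration
    executors = {
        'RAWtoESD': {'memory': {'Avg': {'rss': 100, 'vmem': 200},
                                'Max': {'rss': 150, 'vmem': 250}}},
        'ESDtoAOD': {'memory': {'Avg': {'rss': 50},
                                'Max': {'rss': 80}}},
    }

    fin_report = defaultdict(int)
    for value in executors.values():
        mem = value.get('memory', {})
        for key in ('Avg', 'Max'):
            for subk, subv in mem.get(key, {}).items():
                fin_report[subk] += subv

    print(dict(fin_report))  # {'rss': 380, 'vmem': 450}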
pilot/user/atlas/common.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 1a1344de..20beb8af 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -1251,7 +1251,7 @@ def get(self, path, dst_dict, dst_key): dst_dict[dst_key] = v[last_key] -def parse_jobreport_data(job_report): +def parse_jobreport_data(job_report): # noqa: C901 """ Parse a job report and extract relevant fields. @@ -1316,6 +1316,19 @@ def parse_jobreport_data(job_report): exc_report.extend(list(v['memory']['Max'].items())) # Python 2/3 for x in exc_report: fin_report[x[0]] += x[1] + + # Proposed version + fin_report_brinick = defaultdict(int) + for value in j.values(): + mem = value.get('memory', {}) + for key in ('Avg', 'Max'): + for subk, subv in mem.get(key, {}).items(): + fin_report_brinick[subk] += subv + + # Compare output from the original and the proposed versions + logger.debug("Original code yields fin_report: %s", fin_report) + logger.debug("Proposed code yields fin_report: %s", fin_report_brinick) + work_attributes.update(fin_report) workdir_size = get_workdir_size() From df87ff5f8d0b19487689950ffc97c84df686f154 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 21 Jun 2021 16:07:18 +0200 Subject: [PATCH 81/96] Version update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d2b75bf7..edc0161f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.49 \ No newline at end of file +2.12.1.50 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9f9f2120..e3d9bebc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '49' # build number should be reset to '1' for every new development cycle +BUILD = '50' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 2020e3d5f9c6453a6b31a72ea982ccd8071a0225 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Tue, 22 Jun 2021 11:50:27 +0200 Subject: [PATCH 82/96] Fix logging calls --- pilot/user/atlas/common.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 6f5254ea..09dd695a 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -369,7 +369,7 @@ def get_payload_command(job): if not os.path.exists(path): logger.warning(( 'base trace report does not exist (%s) - input file ' - 'traces should already have been sent', path)) + 'traces should already have been sent'), path) else: process_remote_file_traces(path, job, not_opened_turls) @@ -570,7 +570,7 @@ def add_athena_proc_number(cmd): else: logger.info(( "will not add ATHENA_PROC_NUMBER to cmd " - "since the value is %s", str(value1))) + "since the value is %s"), str(value1)) else: logger.warning(( "don't know how to set ATHENA_PROC_NUMBER " @@ -1014,7 +1014,7 @@ def update_output_for_hpo(job): if new_outdata: logger.info(( 'replacing job outdata with discovered output ' - '(%d file(s))', len(new_outdata))) + '(%d file(s))'), 
len(new_outdata)) job.outdata = new_outdata @@ -1113,7 +1113,7 @@ def extract_output_file_guids(job): if output: logger.info(( 'verified that job report contains metadata ' - 'for %d file(s)', len(output))) + 'for %d file(s)'), len(output)) else: #- will fail job since allowNoOutput is not set') logger.warning(( @@ -1137,11 +1137,11 @@ def extract_output_file_guids(job): data[lfn].guid = fdat['file_guid'] logger.info(( 'set guid=%s for lfn=%s ' - '(value taken from job report)', data[lfn].guid, lfn)) + '(value taken from job report)'), data[lfn].guid, lfn) else: # found new entry logger.warning(( 'pilot no longer considers output files not mentioned ' - 'in job definition (lfn=%s)', lfn)) + 'in job definition (lfn=%s)'), lfn) continue #if job.outdata: @@ -1212,9 +1212,9 @@ def verify_output_files(job): if job.is_analysis(): logger.warning(( 'lfn %s is not in allowNoOutput list - ' - 'ignore for user job', + 'ignore for user job'), lfn - )) + ) else: failed = True logger.warning( @@ -1284,12 +1284,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): if job.is_analysis(): logger.warning(( 'output file %s from job definition is not present ' - 'in job report and is not listed in allowNoOutput', lfn)) + 'in job report and is not listed in allowNoOutput'), lfn) else: logger.warning(( 'output file %s from job definition is not present ' 'in job report and is not listed in allowNoOutput - ' - 'job will fail', lfn)) + 'job will fail'), lfn) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break @@ -1298,14 +1298,14 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s from job definition is not present ' 'in job report but is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": logger.warning(( 'encountered file with nentries=UNDEFINED - ' - 'will ignore %s', lfn)) + 'will ignore %s'), lfn) elif nentries is None: @@ -1313,12 +1313,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s is listed in job report, ' 'but has no events and is not listed in ' - 'allowNoOutput - will ignore', lfn)) + 'allowNoOutput - will ignore'), lfn) else: logger.warning(( 'output file %s is listed in job report, ' 'nentries is None and is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) elif nentries == 0: @@ -1327,12 +1327,12 @@ def verify_extracted_output_files(output, lfns_jobdef, job): logger.warning(( 'output file %s is listed in job report, ' 'has zero events and is not listed in ' - 'allowNoOutput - will ignore', lfn)) + 'allowNoOutput - will ignore'), lfn) else: logger.warning(( 'output file %s is listed in job report, ' 'has zero events and is listed in allowNoOutput - ' - 'remove from stage-out', lfn)) + 'remove from stage-out'), lfn) remove_from_stageout(lfn, job) elif type(nentries) is int and nentries: @@ -1341,7 +1341,7 @@ def verify_extracted_output_files(output, lfns_jobdef, job): else: # should not reach this step logger.warning(( 'case not handled for output file %s with %s event(s) ' - '(ignore)', lfn, str(nentries))) + '(ignore)'), lfn, str(nentries)) status = (not failed) return status, nevents @@ -1385,19 +1385,19 @@ def remove_no_output_files(job): logger.info(( "file %s is listed in allowNoOutput but exists " "(will not be removed 
from list of files to be " - "staged-out)", filename)) + "staged-out)"), filename) _outfiles.append(filename) else: logger.info(( "file %s is listed in allowNoOutput and does not exist " - "(will be removed from list of files to be staged-out)", filename)) + "(will be removed from list of files to be staged-out)"), filename) else: if os.path.exists(path): logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) else: logger.warning(( "file %s is not listed in allowNoOutput and " - "does not exist (job will fail)", filename)) + "does not exist (job will fail)"), filename) _outfiles.append(filename) # now remove the unwanted fspecs @@ -1784,7 +1784,7 @@ def get_redundants(): logger.debug(( 'list of redundant files could not be read from external file: %s ' - '(will use internal list)', filename)) + '(will use internal list)'), filename) # else return the following dir_list = ["AtlasProduction*", @@ -2475,7 +2475,7 @@ def verify_ncores(corecount): if athena_proc_number: logger.info(( "encountered a set ATHENA_PROC_NUMBER (%d), " - "will not overwrite it", athena_proc_number)) + "will not overwrite it"), athena_proc_number) logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: @@ -2483,7 +2483,7 @@ def verify_ncores(corecount): os.environ['ATHENA_CORE_NUMBER'] = str(corecount) logger.info(( "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " - "(ATHENA_PROC_NUMBER will not be overwritten)", corecount)) + "(ATHENA_PROC_NUMBER will not be overwritten)"), corecount) def verify_job(job): @@ -2688,7 +2688,7 @@ def process_debug_command(debug_command, pandaid): except RuntimeError as rte: logger.warning(( 'too many recursions: %s ' - '(cannot identify athena process)', rte)) + '(cannot identify athena process)'), rte) else: if child: logger.info('pid=%d is a child process of the trf of this job', pid) From 8fc2d5f5dd86015f10ec0d4adb271b150c98f0ea Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 10:27:13 +0200 Subject: [PATCH 83/96] Added handling for new preprocess exit codes. Removed thread name info from log messages. No longer dumping stage-in/out in main log. 
Cleanup and pylint corrections --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 14 ++++-- pilot/util/constants.py | 2 +- pilot/util/container.py | 10 ++--- pilot/util/filehandling.py | 12 ++++-- pilot/util/middleware.py | 71 +++++++++++-------------------- 6 files changed, 53 insertions(+), 58 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index edc0161f..24b55189 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.50 \ No newline at end of file +2.12.1.51 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 747e6acb..8968938c 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -249,6 +249,7 @@ def execute_utility_command(self, cmd, job, label): exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: + ignored_exit_codes = [160, 161, 162] logger.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details', cmd, exit_code) if label == 'preprocess': err = errors.PREPROCESSFAILURE @@ -256,8 +257,10 @@ def execute_utility_command(self, cmd, job, label): err = errors.POSTPROCESSFAILURE else: err = 0 # ie ignore - if err and exit_code != 160: # ignore no-more-data-points exit code + if err and exit_code not in ignored_exit_codes: # ignore no-more-data-points exit codes job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) + if exit_code in ignored_exit_codes: + job.transexitcode = exit_code # write output to log files self.write_utility_output(job.workdir, label, stdout, stderr) @@ -532,7 +535,11 @@ def run_preprocess(self, job): logger.info("\n\npreprocess execution command:\n\n%s\n", cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: - logger.fatal('no more HP points - time to abort processing loop') + logger.warning('no more HP points - time to abort processing loop') + elif exit_code == 161: + logger.warning('no more HP points but at least one point was processed - time to abort processing loop') + elif exit_code == 162: + logger.warning('loop count reached the limit - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) @@ -568,6 +575,7 @@ def run(self): # noqa: C901 # abort when nothing more to run, or when the preprocess returns a special exit code iteration = 0 while True: + logger.info('payload iteration loop #%d', iteration + 1) os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration show_memory_usage() @@ -577,7 +585,7 @@ def run(self): # noqa: C901 exit_code = self.run_preprocess(self.__job) jobparams_post = self.__job.jobparams if exit_code: - if exit_code == 160: + if exit_code >= 160 and exit_code <= 162: exit_code = 0 # wipe the output file list since there won't be any new files # any output files from previous iterations, should have been transferred already diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e3d9bebc..a13e7af7 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '50' # build number should be reset to 
'1' for every new development cycle +BUILD = '51' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index c81e5f22..89a6d0ed 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 import subprocess from os import environ, getcwd, setpgrp #, getpgid #setsid @@ -72,7 +72,7 @@ def execute(executable, **kwargs): secret_key = sub_cmd.split('S3_SECRET_KEY=')[1] secret_key = 'S3_SECRET_KEY=' + secret_key executable_readable = executable_readable.replace(secret_key, 'S3_SECRET_KEY=********') - logger.info('executing command: %s' % executable_readable) + logger.info('executing command: %s', executable_readable) if mode == 'python': exe = ['/usr/bin/python'] + executable.split() @@ -95,7 +95,7 @@ def execute(executable, **kwargs): # for Python 3, convert from byte-like object to str #import sys #if is_python3(): - # logger.debug('Using python version=%s' % str(sys.version_info)) + # logger.debug('Using python version=%s', str(sys.version_info)) # try: # stdout = stdout.decode('utf-8') # stderr = stderr.decode('utf-8') @@ -134,8 +134,8 @@ def containerise_executable(executable, **kwargs): diagnostics = "" try: executable = container.wrapper(executable, **kwargs) - except Exception as e: - diagnostics = 'failed to execute wrapper function: %s' % e + except Exception as exc: + diagnostics = 'failed to execute wrapper function: %s' % exc logger.fatal(diagnostics) else: if executable == "": diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index c3f80c37..caf0beeb 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -952,13 +952,19 @@ def dump(path, cmd="cat"): logger.info("path %s does not exist", path) -def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog): +def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotlog, loglevel=0): """ Setup and establish logging. + Option loglevel can be used to decide which (predetermined) logging format to use. + Example: + loglevel=0: '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' + loglevel=1: 'ts=%(asctime)s level=%(levelname)-8s event=%(name)-32s.%(funcName)-25s msg="%(message)s"' + :param debug: debug mode (Boolean), :param nopilotlog: True when pilot log is not known (Boolean). - :param filename: name of log file. + :param filename: name of log file (string). + :param loglevel: selector for logging level (int). 
:return: """ @@ -968,7 +974,7 @@ def establish_logging(debug=True, nopilotlog=False, filename=config.Pilot.pilotl console = logging.StreamHandler(sys.stdout) if debug: - format_str = '%(asctime)s | %(levelname)-8s | %(threadName)-19s | %(name)-32s | %(funcName)-25s | %(message)s' + format_str = '%(asctime)s | %(levelname)-8s | %(name)-32s | %(funcName)-25s | %(message)s' level = logging.DEBUG else: format_str = '%(asctime)s | %(levelname)-8s | %(message)s' diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index a9fe101b..decfd0c8 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 from os import environ, path, getcwd #, chmod @@ -47,17 +47,17 @@ def containerise_general_command(job, container_options, label='command', contai raise PilotException try: - logger.info('*** executing %s (logging will be redirected) ***' % label) + logger.info('*** executing %s (logging will be redirected) ***', label) exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) - except Exception as e: - logger.info('*** %s has failed ***' % label) - logger.warning('exception caught: %s' % e) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) else: if exit_code == 0: - logger.info('*** %s has finished ***' % label) + logger.info('*** %s has finished ***', label) else: - logger.info('*** %s has failed ***' % label) - logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, external_dir, @@ -104,30 +104,28 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, except PilotException as e: raise e else: - logger.warning('%s will not be done in a container (but it will be done by a script)' % label) + logger.warning('%s will not be done in a container (but it will be done by a script)', label) try: - logger.info('*** executing %s (logging will be redirected) ***' % label) + logger.info('*** executing %s (logging will be redirected) ***', label) exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False) - except Exception as e: - logger.info('*** %s has failed ***' % label) - logger.warning('exception caught: %s' % e) + except Exception as exc: + logger.info('*** %s has failed ***', label) + logger.warning('exception caught: %s', exc) else: if exit_code == 0: - logger.info('*** %s has finished ***' % label) + logger.info('*** %s has finished ***', label) else: - logger.info('*** %s has failed ***' % label) - logger.debug('%s script returned exit_code=%d' % (label, exit_code)) + logger.info('*** %s has failed ***', label) + logger.debug('%s script returned exit_code=%d', label, exit_code) # write stdout+stderr to files try: _stdout_name, _stderr_name = get_logfile_names(label) write_file(path.join(job.workdir, _stdout_name), stdout, mute=False) write_file(path.join(job.workdir, _stderr_name), stderr, mute=False) - logger.debug('stage-in/out stdout=\n%s' % stdout) - logger.debug('stage-in/out stderr=\n%s' % stderr) - except PilotException as e: - msg = 'exception caught: %s' % e + except PilotException as exc: + msg = 'exception caught: %s' % exc if label == 'stage-in': raise 
StageInFailure(msg) else: @@ -136,8 +134,8 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, # handle errors, file statuses, etc (the stage-in/out scripts write errors and file status to a json file) try: handle_updated_job_object(job, xdata, label=label) - except PilotException as e: - raise e + except PilotException as exc: + raise exc def get_script_path(script): @@ -149,8 +147,6 @@ def get_script_path(script): """ srcdir = environ.get('PILOT_SOURCE_DIR', '.') - logger.debug('PILOT_SOURCE_DIR=%s' % srcdir) - _path = path.join(srcdir, 'pilot/scripts') if not path.exists(_path): _path = path.join(srcdir, 'pilot2') @@ -190,8 +186,8 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, ext # write file data to file try: status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary) - except Exception as e: - diagnostics = 'exception caught in get_command(): %s' % e + except Exception as exc: + diagnostics = 'exception caught in get_command(): %s' % exc logger.warning(diagnostics) raise PilotException(diagnostics) else: @@ -283,8 +279,8 @@ def handle_updated_job_object(job, xdata, label='stage-in'): fspec.turl = file_dictionary[fspec.lfn][3] fspec.checksum['adler32'] = file_dictionary[fspec.lfn][4] fspec.filesize = file_dictionary[fspec.lfn][5] - except Exception as e: - msg = "exception caught while reading file dictionary: %s" % e + except Exception as exc: + msg = "exception caught while reading file dictionary: %s" % exc logger.warning(msg) if label == 'stage-in': raise StageInFailure(msg) @@ -359,8 +355,8 @@ def get_filedata(data): 'istar': fspec.is_tar, 'accessmode': fspec.accessmode, 'storagetoken': fspec.storage_token} - except Exception as e: - logger.warning('exception caught in get_filedata(): %s' % e) + except Exception as exc: + logger.warning('exception caught in get_filedata(): %s', exc) return file_dictionary @@ -421,19 +417,4 @@ def use_middleware_script(container_type): :return: Boolean (True if middleware should be containerised). """ - # see definition in atlas/container.py, but also see useful code below (in case middleware is available locally) - #:param cmd: middleware command, used to determine if the container should be used or not (string). 
- #usecontainer = False - #if not config.Container.middleware_container: - # logger.info('container usage for middleware is not allowed by pilot config') - #else: - # # if the middleware is available locally, do not use container - # if find_executable(cmd) == "": - # usecontainer = True - # logger.info('command %s is not available locally, will attempt to use container' % cmd) - # else: - # logger.info('command %s is available locally, no need to use container' % cmd) - - # FOR TESTING - #return True if config.Container.middleware_container_stagein_script else False return True if container_type == 'container' or container_type == 'bash' else False From e2346903cc7ce59110e377d2589ea77e59dcbc28 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 10:30:54 +0200 Subject: [PATCH 84/96] Updated build number after merge with Brinick's pylint updates --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 24b55189..2430ed91 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.51 \ No newline at end of file +2.12.1.52 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a13e7af7..54a51713 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '51' # build number should be reset to '1' for every new development cycle +BUILD = '52' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 347b22cd4eb74146008ec93c5b6f99259a504a97 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Wed, 23 Jun 2021 11:35:22 +0200 Subject: [PATCH 85/96] Fix initialisation bug and logging bug self.trace_report was being used before being bound to the instance. logging.getLogger was being called with multiple args. 
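The logging half of this fix deserves a short illustration: logging.getLogger() accepts a single dotted name, so the old printf-style call with three arguments raised a TypeError whenever no logger was passed in, while the companion change simply moves the self.trace_report assignment above the acopytools validation so the trace report is bound before any error path can touch it. A minimal standalone sketch of the corrected null-logger fallback follows; get_null_logger is an illustrative helper, not part of the Pilot API:

    import logging

    def get_null_logger(name=__name__):
        """Return a disabled logger, mirroring the corrected fallback in StagingClient.__init__."""
        # The old call, logging.getLogger('%s.%s', name, 'null'), raises TypeError: getLogger() takes one name.
        logger = logging.getLogger(name + '.null')
        logger.disabled = True
        return logger

    log = get_null_logger()
    log.info('this message is swallowed by the disabled logger')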
--- pilot/api/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index ccdc874d..c6ca6a99 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -69,7 +69,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ super(StagingClient, self).__init__() if not logger: - logger = logging.getLogger('%s.%s', __name__, 'null') + logger = logging.getLogger(__name__ + '.null') logger.disabled = True self.logger = logger @@ -93,6 +93,9 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ if not self.acopytools.get('default'): self.acopytools['default'] = self.get_default_copytools(default_copytools) + # get an initialized trace report (has to be updated for get/put if not defined before) + self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) + if not self.acopytools: msg = 'failed to initilize StagingClient: no acopytools options found, acopytools=%s' % self.acopytools logger.error(msg) @@ -101,9 +104,6 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ raise PilotException("failed to resolve acopytools settings") logger.info('configured copytools per activity: acopytools=%s', self.acopytools) - # get an initialized trace report (has to be updated for get/put if not defined before) - self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', '')) - def set_acopytools(self): """ Set the internal acopytools. From 5c5b62b58098d03079cf194dc3207a9763a521e1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 16:55:03 +0200 Subject: [PATCH 86/96] Merge with Brinick's pylint updates. Number of concurrent remote file open attempts can now be set with catchall --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 17 +---- pilot/scripts/open_remote_file.py | 107 ++++++++++++++++++++++++++++-- pilot/user/atlas/common.py | 34 ++++++++-- pilot/util/auxiliary.py | 24 ++++++- pilot/util/constants.py | 2 +- 6 files changed, 155 insertions(+), 31 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2430ed91..2d3b6bc8 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.52 \ No newline at end of file +2.12.1.54 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 4e6ecbfe..52ea3255 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -30,7 +30,7 @@ from .basedata import BaseData from .filespec import FileSpec -from pilot.util.auxiliary import get_object_size +from pilot.util.auxiliary import get_object_size, get_key_value from pilot.util.constants import LOG_TRANSFER_NOT_DONE from pilot.util.filehandling import get_guid, get_valid_path_from_list from pilot.util.timing import get_elapsed_real_time @@ -201,7 +201,7 @@ def init(self, infosys): # prepend IMAGE_BASE to imagename if necessary (for testing purposes) image_base = os.environ.get('IMAGE_BASE', '') if not image_base and 'IMAGE_BASE' in infosys.queuedata.catchall: - image_base = self.get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') + image_base = get_key_value(infosys.queuedata.catchall, key='IMAGE_BASE') if image_base: paths = [os.path.join(image_base, os.path.basename(self.imagename)), os.path.join(image_base, self.imagename)] @@ -211,19 +211,6 @@ def init(self, infosys): #if image_base and not os.path.isabs(self.imagename) and not self.imagename.startswith('docker'): # self.imagename = os.path.join(image_base,
self.imagename) - def get_key_value(self, catchall, key='SOMEKEY'): - """ - Return the value corresponding to key in catchall. - :param catchall: catchall free string. - :param key: key name (string). - :return: value (string). - """ - - # ignore any non-key-value pairs that might be present in the catchall string - s = dict(s.split('=', 1) for s in catchall.split() if '=' in s) - - return s.get(key) - def prepare_infiles(self, data): """ Construct FileSpec objects for input files from raw dict `data` diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 92dec03b..cd3d127b 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -4,15 +4,21 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 import argparse import os import logging +import threading +import queue import ROOT +from collections import namedtuple from pilot.util.config import config -from pilot.util.filehandling import establish_logging, write_json +from pilot.util.filehandling import ( + establish_logging, + write_json, +) logger = logging.getLogger(__name__) @@ -31,6 +37,12 @@ def get_args(): action='store_true', default=False, help='Enable debug mode for logging messages') + arg_parser.add_argument('-t', + dest='nthreads', + default=1, + required=False, + type=int, + help='Number of concurrent file open threads') arg_parser.add_argument('-w', dest='workdir', required=False, @@ -50,10 +62,26 @@ def get_args(): def message(msg): + """ + Print message to stdout or to log. + Note: not using lazy formatting. + + :param msg: message (string). + :return: + """ + print(msg) if not logger else logger.info(msg) def get_file_lists(turls): + """ + Return a dictionary with the turls. + Format: {'turls': } + + :param turls: comma separated turls (string) + :return: turls dictionary. + """ + _turls = [] try: @@ -64,7 +92,17 @@ def get_file_lists(turls): return {'turls': _turls} -def try_open_file(turl): +def try_open_file(turl, queues): + """ + Attempt to open a remote file. + Successfully opened turls will be put in the queues.opened queue. Unsuccessful turls will be placed in + the queues.unopened queue. + + :param turl: turl (string). + :param queues: queues collection. + :return: + """ + turl_opened = False try: in_file = ROOT.TFile.Open(turl) @@ -75,7 +113,29 @@ def try_open_file(turl): in_file.Close() turl_opened = True - return turl_opened + queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) + + +def spawn_file_open_thread(queues, file_list): + """ + Spawn a thread for the try_open_file(). + + :param queues: queue collection. + :param file_list: files to open (list). + :return: thread. 
+ """ + + thread = None + try: + turl = file_list.pop(0) + except IndexError: + pass + else: + # create and start thread for the current turl + thread = threading.Thread(target=try_open_file, args=(turl, queues)) + thread.start() + + return thread if __name__ == '__main__': @@ -106,10 +166,43 @@ def try_open_file(turl): file_list_dictionary = get_file_lists(args.turls) turls = file_list_dictionary.get('turls') processed_turls_dictionary = {} + + queues = namedtuple('queues', ['result', 'opened', 'unopened']) + queues.result = queue.Queue() + queues.opened = queue.Queue() + queues.unopened = queue.Queue() + threads = [] + if turls: - message('got TURLs: %s' % str(turls)) - for turl in turls: - processed_turls_dictionary[turl] = try_open_file(turl) + # make N calls to begin with + for index in range(args.nthreads): + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + while turls: + + try: + _ = queues.result.get(block=True) + except Exception as error: + message("caught exception: %s" % error) + + thread = spawn_file_open_thread(queues, turls) + if thread: + threads.append(thread) + + # wait until all threads have finished + [thread.join() for thread in threads] + + opened_turls = list(queues.opened.queue) + opened_turls.sort() + unopened_turls = list(queues.unopened.queue) + unopened_turls.sort() + + for turl in opened_turls: + processed_turls_dictionary[turl] = True + for turl in unopened_turls: + processed_turls_dictionary[turl] = False # write dictionary to file with results _status = write_json(os.path.join(args.workdir, config.Pilot.remotefileverification_dictionary), processed_turls_dictionary) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 09dd695a..cd05e799 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -45,7 +45,13 @@ get_metadata_dict_from_txt, ) -from pilot.util.auxiliary import get_resource_name, show_memory_usage, is_python3 +from pilot.util.auxiliary import ( + get_resource_name, + show_memory_usage, + is_python3, + get_key_value, +) + from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException from pilot.util.config import config @@ -152,12 +158,13 @@ def validate(job): return status -def open_remote_files(indata, workdir): +def open_remote_files(indata, workdir, nthreads): """ Verify that direct i/o files can be opened. :param indata: list of FileSpec. :param workdir: working directory (string). + :param nthreads: number of concurrent file open threads (int). :return: exit code (int), diagnostics (string). """ @@ -202,7 +209,7 @@ def open_remote_files(indata, workdir): # correct the path when containers have been used final_script_path = os.path.join('.', script) - _cmd = get_file_open_command(final_script_path, turls) + _cmd = get_file_open_command(final_script_path, turls, nthreads) cmd = create_root_container_command(workdir, _cmd) show_memory_usage() @@ -248,14 +255,16 @@ def open_remote_files(indata, workdir): return exitcode, diagnostics, not_opened -def get_file_open_command(script_path, turls): +def get_file_open_command(script_path, turls, nthreads): """ :param script_path: path to script (string). + :param turls: comma-separated turls (string). + :param nthreads: number of concurrent file open threads (int). :return: comma-separated list of turls (string). 
""" - return "%s --turls=%s -w %s" % (script_path, turls, os.path.dirname(script_path)) + return "%s --turls=%s -w %s -t %s" % (script_path, turls, os.path.dirname(script_path), str(nthreads)) def extract_turls(indata): @@ -317,6 +326,19 @@ def process_remote_file_traces(path, job, not_opened_turls): logger.warning('failed to create trace report for turl=%s', fspec.turl) +def get_nthreads(catchall): + """ + Extract number of concurrent file open threads from catchall. + Return nthreads=1 if nopenfiles=.. is not present in catchall. + + :param catchall: queuedata catchall (string). + :return: number of threads (int). + """ + + _nthreads = get_key_value(catchall, key='nopenfiles') + return _nthreads if _nthreads else 1 + + def get_payload_command(job): """ Return the full command for executing the payload, including the @@ -360,7 +382,7 @@ def get_payload_command(job): diagnostics = "" not_opened_turls = "" try: - exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) + exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir, get_nthreads(catchall)) except PilotException as exc: logger.warning('caught exception: %s', exc) else: diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 23314ae6..f0277273 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -22,9 +22,17 @@ zero_depth_bases = (str, bytes, Number, range, bytearray) # Python 3 iteritems = 'items' +from pilot.util.constants import ( + SUCCESS, + FAILURE, + SERVER_UPDATE_FINAL, + SERVER_UPDATE_NOT_DONE, + SERVER_UPDATE_TROUBLE, + get_pilot_version, +) + from pilot.common.errorcodes import ErrorCodes from pilot.util.container import execute -from pilot.util.constants import SUCCESS, FAILURE, SERVER_UPDATE_FINAL, SERVER_UPDATE_NOT_DONE, SERVER_UPDATE_TROUBLE, get_pilot_version from pilot.util.filehandling import dump import logging @@ -636,3 +644,17 @@ def get_display_info(): product = result[0] return product, vendor + + +def get_key_value(catchall, key='SOMEKEY'): + """ + Return the value corresponding to key in catchall. + :param catchall: catchall free string. + :param key: key name (string). + :return: value (string). 
+ """ + + # ignore any non-key-value pairs that might be present in the catchall string + _dic = dict(_str.split('=', 1) for _str in catchall.split() if '=' in _str) + + return _dic.get(key) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 54a51713..210a974f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '52' # build number should be reset to '1' for every new development cycle +BUILD = '54' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 76d399ad129510b314752d6e4e1ab4a98ade1532 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 23 Jun 2021 16:57:12 +0200 Subject: [PATCH 87/96] Flake8 correction --- pilot/scripts/open_remote_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index cd3d127b..ff0de545 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -192,7 +192,7 @@ def spawn_file_open_thread(queues, file_list): threads.append(thread) # wait until all threads have finished - [thread.join() for thread in threads] + [_thread.join() for _thread in threads] opened_turls = list(queues.opened.queue) opened_turls.sort() From 90125ed08a8c3c3f09698811dc703c155461e93f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 25 Jun 2021 17:21:50 +0200 Subject: [PATCH 88/96] Popen corrections for Python 3 and utf-8 --- pilot/util/container.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pilot/util/container.py b/pilot/util/container.py index 89a6d0ed..f220f6c0 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -42,8 +42,8 @@ def execute(executable, **kwargs): mute = kwargs.get('mute', False) mode = kwargs.get('mode', 'bash') cwd = kwargs.get('cwd', getcwd()) - stdout = kwargs.get('stdout', subprocess.PIPE) - stderr = kwargs.get('stderr', subprocess.PIPE) + stdout_name = kwargs.get('stdout', subprocess.PIPE) + stderr_name = kwargs.get('stderr', subprocess.PIPE) usecontainer = kwargs.get('usecontainer', False) returnproc = kwargs.get('returnproc', False) job = kwargs.get('job') @@ -80,33 +80,33 @@ def execute(executable, **kwargs): exe = ['/bin/bash', '-c', executable] # try: intercept exception such as OSError -> report e.g. 
error.RESOURCEUNAVAILABLE: "Resource temporarily unavailable" - if is_python3(): - process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp, encoding='utf-8') # Python 3 + if is_python3(): # Python 3 + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp, + encoding='utf-8', + errors='replace') else: - process = subprocess.Popen(exe, bufsize=-1, stdout=stdout, stderr=stderr, cwd=cwd, preexec_fn=setpgrp) # Python 2 - + process = subprocess.Popen(exe, + bufsize=-1, + stdout=stdout_name, + stderr=stderr_name, + cwd=cwd, + preexec_fn=setpgrp) if returnproc: return process else: stdout, stderr = process.communicate() exit_code = process.poll() - # this should not be necessary since encoding is set above for Py3 (it is necessary if encoding above is removed) - # for Python 3, convert from byte-like object to str - #import sys - #if is_python3(): - # logger.debug('Using python version=%s', str(sys.version_info)) - # try: - # stdout = stdout.decode('utf-8') - # stderr = stderr.decode('utf-8') - # except Exception as error: - # logger.warning('exception caught: %s (can be ignored)', error) - # remove any added \n if stdout and stdout.endswith('\n'): stdout = stdout[:-1] - return exit_code, stdout, stderr + return exit_code, stdout, stderr def containerise_executable(executable, **kwargs): From fe032cf2db59899da8bee4cf00dfd0755cee7932 Mon Sep 17 00:00:00 2001 From: Brinick Simmons Date: Mon, 28 Jun 2021 12:47:04 +0200 Subject: [PATCH 89/96] Add nojekyll file to build docs workflow --- .github/workflows/build-docs.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index 351e3fee..fef80173 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -30,13 +30,16 @@ jobs: run: | cd ./doc make github - cd .. 
+ + - name: Add nojekyll file to repo root dir + run: | + touch .nojekyll - name: Push docs to repo run: | git config user.name "brinick" git config user.email "brinick@users.noreply.github.com" - git add docs - git commit -m "Adding documentation" + git add docs .nojekyll + git commit -m "Adding Pilot documentation" git push From 395e0fa3a0a76e3518cfc5f898364c244f4b9435 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 15:53:09 +0200 Subject: [PATCH 90/96] Improved check for singularity errors --- PILOTVERSION | 2 +- pilot/api/data.py | 8 ++++---- pilot/common/errorcodes.py | 8 ++++++-- pilot/control/data.py | 6 +----- pilot/control/payload.py | 38 ++++++++++++++++++++------------------ pilot/util/constants.py | 2 +- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2d3b6bc8..22f7df92 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.54 \ No newline at end of file +2.12.1.58 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index c6ca6a99..bd538531 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -522,10 +522,10 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 caught_errors[-1].get_error_code() == ErrorCodes.MISSINGOUTPUTFILE: raise caught_errors[-1] - remain_files = [f for f in files if f.status not in ['remote_io', 'transferred', 'no_transfer']] + remain_files = [fspec for fspec in files if fspec.status not in ['remote_io', 'transferred', 'no_transfer']] if remain_files: # failed or incomplete transfer - # Propagate message from first error back up + # propagate message from first error back up errmsg = str(caught_errors[0]) if caught_errors else '' if caught_errors and "Cannot authenticate" in str(caught_errors): code = ErrorCodes.STAGEINAUTHENTICATIONFAILURE @@ -1065,13 +1065,13 @@ def transfer_files(self, copytool, files, activity, **kwargs): if not fspec.ddmendpoint: # ensure that output destination is properly set if 'mv' not in self.infosys.queuedata.copytools: - msg = 'No output RSE defined for file=%s' % fspec.lfn + msg = 'no output RSE defined for file=%s' % fspec.lfn self.logger.error(msg) raise PilotException(msg, code=ErrorCodes.NOSTORAGE, state='NO_OUTPUTSTORAGE_DEFINED') pfn = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(kwargs.get('workdir', ''), fspec.lfn) if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK): - msg = "Error: output pfn file does not exist: %s" % pfn + msg = "output pfn file does not exist: %s" % pfn self.logger.error(msg) self.trace_report.update(clientState='MISSINGOUTPUTFILE', stateReason=msg) self.trace_report.send() diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 49233a76..01986479 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -403,8 +403,12 @@ def resolve_transform_error(self, exit_code, stderr): elif exit_code == -1: ec = self.UNKNOWNTRFFAILURE else: - # do not assign a pilot error code for unidentified transform error, return 0 - ec = 0 + # singularity errors can appear even with no exit code set + if "Singularity is not installed" in stderr: + ec = self.SINGULARITYNOTINSTALLED + else: + # do not assign a pilot error code for unidentified transform error, return 0 + ec = 0 return ec diff --git a/pilot/control/data.py b/pilot/control/data.py index 20151b92..79bb33f6 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -1006,11 +1006,7 @@ def queue_monitoring(queues, traces, args): job.stageout = "log" 
set_pilot_state(job=job, state="failed") if not _stage_out_new(job, args): - logger.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, " - "adding job object to failed_jobs queue", job.jobid) - else: - logger.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job " - "object to failed_jobs queue", job.jobid) + logger.info("job %s failed during stage-out", job.jobid) put_in_queue(job, queues.failed_jobs) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index b51063df..03cdc0fb 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -342,11 +342,15 @@ def perform_initial_payload_error_analysis(job, exit_code): :return: """ + # look for singularity errors (the exit code can be zero in this case) + stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) + if stderr: + exit_code = errors.resolve_transform_error(exit_code, stderr) + if exit_code != 0: msg = "" exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) - stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": msg = errors.extract_stderr_error(stderr) if msg == "": @@ -359,8 +363,6 @@ def perform_initial_payload_error_analysis(job, exit_code): logger.warning("extracted message from stderr:\n%s", msg) exit_code = set_error_code_from_stderr(msg, fatal) - if not exit_code: - exit_code = errors.resolve_transform_error(exit_code, stderr) if exit_code != 0: if msg: msg = errors.format_diagnostics(exit_code, msg) @@ -388,22 +390,22 @@ def set_error_code_from_stderr(msg, fatal): :return: error code (int). """ - if "Failed invoking the NEWUSER namespace runtime" in msg: - exit_code = errors.SINGULARITYNEWUSERNAMESPACE - elif "Failed to create user namespace" in msg: - exit_code = errors.SINGULARITYFAILEDUSERNAMESPACE - elif "command not found" in msg: - exit_code = errors.TRANSFORMNOTFOUND - elif "SL5 is unsupported" in msg: - exit_code = errors.UNSUPPORTEDSL5OS - elif "resource temporarily unavailable" in msg: - exit_code = errors.SINGULARITYRESOURCEUNAVAILABLE - elif "unrecognized arguments" in msg: - exit_code = errors.UNRECOGNIZEDTRFARGUMENTS - elif fatal: + exit_code = 0 + error_map = {errors.SINGULARITYNEWUSERNAMESPACE: "Failed invoking the NEWUSER namespace runtime", + errors.SINGULARITYFAILEDUSERNAMESPACE: "Failed to create user namespace", + errors.SINGULARITYRESOURCEUNAVAILABLE: "resource temporarily unavailable", + errors.SINGULARITYNOTINSTALLED: "Singularity is not installed", + errors.TRANSFORMNOTFOUND: "command not found", + errors.UNSUPPORTEDSL5OS: "SL5 is unsupported", + errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments",} + + for key, value in error_map.items(): + if value in msg: + exit_code = key + break + + if fatal and not exit_code: exit_code = errors.UNRECOGNIZEDTRFSTDERR - else: - exit_code = 0 return exit_code diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 210a974f..7c8499b7 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '54' # build number should be reset to '1' for every new development cycle +BUILD = '58' # build number should be reset to '1' for 
every new development cycle SUCCESS = 0 FAILURE = 1 From 74d382962c69a20ace5066b38fc682252c291849 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 16:56:23 +0200 Subject: [PATCH 91/96] Added missing result queue in remote file open script --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 1 + pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 22f7df92..e7a26e32 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.58 \ No newline at end of file +2.12.1.59 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index ff0de545..b6e739d5 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -114,6 +114,7 @@ def try_open_file(turl, queues): turl_opened = True queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) + queues.result.put(turl) def spawn_file_open_thread(queues, file_list): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 7c8499b7..c241977e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '58' # build number should be reset to '1' for every new development cycle +BUILD = '59' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 514f83e949e4a58b401de4f7c0b0e262935e97fc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Jun 2021 17:53:41 +0200 Subject: [PATCH 92/96] Added log messages --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 5 ++++- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e7a26e32..ca156888 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.59 \ No newline at end of file +2.12.1.60 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index b6e739d5..69eb80b3 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -105,6 +105,7 @@ def try_open_file(turl, queues): turl_opened = False try: + message('opening %s' % turl) in_file = ROOT.TFile.Open(turl) except Exception as error: message('caught exception: %s' % error) @@ -112,7 +113,7 @@ def try_open_file(turl, queues): if in_file and in_file.IsOpen(): in_file.Close() turl_opened = True - + message('closed %s' % turl) queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) queues.result.put(turl) @@ -174,6 +175,8 @@ def spawn_file_open_thread(queues, file_list): queues.unopened = queue.Queue() threads = [] + message('will attempt to open %d file(s) using %d thread(s)' % (len(turls), args.nthreads)) + if turls: # make N calls to begin with for index in range(args.nthreads): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c241977e..5209e811 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new 
version release, increased for small updates -BUILD = '59' # build number should be reset to '1' for every new development cycle +BUILD = '60' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8d8c1428197986462b6bfc30888e8bcae8ebcafc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 12:05:36 +0200 Subject: [PATCH 93/96] Flake8 correction --- pilot/control/payload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 03cdc0fb..6da11b4a 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -397,7 +397,7 @@ def set_error_code_from_stderr(msg, fatal): errors.SINGULARITYNOTINSTALLED: "Singularity is not installed", errors.TRANSFORMNOTFOUND: "command not found", errors.UNSUPPORTEDSL5OS: "SL5 is unsupported", - errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments",} + errors.UNRECOGNIZEDTRFARGUMENTS: "unrecognized arguments"} for key, value in error_map.items(): if value in msg: From cce6852ed0f6d1c35d7c4ed9295dbc4529f2765e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:13:55 +0200 Subject: [PATCH 94/96] Fixes and cleanup --- PILOTVERSION | 2 +- pilot/common/errorcodes.py | 7 +++++-- pilot/control/data.py | 12 +++++------ pilot/control/payload.py | 25 ++++++++++++++--------- pilot/control/payloads/generic.py | 33 +++++++++++++------------------ pilot/user/atlas/common.py | 4 +--- pilot/util/constants.py | 2 +- 7 files changed, 44 insertions(+), 41 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ca156888..c3a7e952 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.12.1.60 \ No newline at end of file +2.12.1.62 \ No newline at end of file diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 01986479..f6e43197 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -388,6 +388,7 @@ def resolve_transform_error(self, exit_code, stderr): :return: pilot error code (int) """ + ec = 0 if exit_code == 251 and "Not mounting requested bind point" in stderr: ec = self.SINGULARITYBINDPOINTFAILURE elif exit_code == 255 and "No more available loop devices" in stderr: @@ -406,10 +407,12 @@ def resolve_transform_error(self, exit_code, stderr): # singularity errors can appear even with no exit code set if "Singularity is not installed" in stderr: ec = self.SINGULARITYNOTINSTALLED - else: + #else: # do not assign a pilot error code for unidentified transform error, return 0 - ec = 0 + # ec = 0 + if not ec: + ec = exit_code return ec def extract_stderr_error(self, stderr): diff --git a/pilot/control/data.py b/pilot/control/data.py index 79bb33f6..596c3a49 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -475,18 +475,18 @@ def copytool_in(queues, traces, args): cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: # xcache debug - _, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache start] stdout=%s', _stdout) - logger.debug('[before xcache start] stderr=%s', _stderr) + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before xcache start] stdout=%s', _stdout) + #logger.debug('[before xcache start] stderr=%s', _stderr) _, stdout, stderr = execute(cmd.get('command')) logger.debug('stdout=%s', stdout) logger.debug('stderr=%s', stderr) # xcache debug - 
_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache start] stdout=%s', _stdout) - logger.debug('[after xcache start] stderr=%s', _stderr) + #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after xcache start] stdout=%s', _stdout) + #logger.debug('[after xcache start] stderr=%s', _stderr) # perform any action necessary after command execution (e.g. stdout processing) kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout} diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 6da11b4a..65ff94c4 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -22,7 +22,7 @@ from pilot.control.payloads import generic, eventservice, eventservicemerge from pilot.control.job import send_state -from pilot.util.auxiliary import set_pilot_state, show_memory_usage +from pilot.util.auxiliary import set_pilot_state from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config from pilot.util.filehandling import read_file, remove_core_dumps, get_guid @@ -230,9 +230,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 break payload_executor = get_payload_executor(args, job, out, err, traces) - logger.info("Got payload executor: %s", payload_executor) - - show_memory_usage() + logger.info("will use payload executor: %s", payload_executor) # run the payload and measure the execution time job.t0 = os.times() @@ -349,7 +347,6 @@ def perform_initial_payload_error_analysis(job, exit_code): if exit_code != 0: msg = "" - exit_code = 0 logger.warning('main payload execution returned non-zero exit code: %d', exit_code) if stderr != "": msg = errors.extract_stderr_error(stderr) @@ -359,10 +356,15 @@ def perform_initial_payload_error_analysis(job, exit_code): fatal = False else: fatal = True - if msg != "": - logger.warning("extracted message from stderr:\n%s", msg) - exit_code = set_error_code_from_stderr(msg, fatal) + #if msg != "": # redundant since resolve_transform_error is used above + # logger.warning("extracted message from stderr:\n%s", msg) + # exit_code = set_error_code_from_stderr(msg, fatal) + + if msg: + msg = errors.format_diagnostics(exit_code, msg) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) + ''' if exit_code != 0: if msg: msg = errors.format_diagnostics(exit_code, msg) @@ -376,8 +378,13 @@ def perform_initial_payload_error_analysis(job, exit_code): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) else: logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') + ''' else: - logger.info('main payload execution returned zero exit code, but will check it more carefully') + logger.info('main payload execution returned zero exit code') + + # check if core dumps exist, if so remove them and return True + if remove_core_dumps(job.workdir) and not job.debug: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) def set_error_code_from_stderr(msg, fatal): diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8968938c..df5a852f 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -492,15 +492,12 @@ def get_payload_command(self, job): :return: command (string). 
""" - show_memory_usage() - cmd = "" # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' try: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - show_memory_usage() cmd = user.get_payload_command(job) #+ 'sleep 1000' # to test looping jobs except PilotException as error: self.post_setup(job) @@ -564,8 +561,6 @@ def run(self): # noqa: C901 # get the payload command from the user specific code self.pre_setup(self.__job) - show_memory_usage() - cmd = self.get_payload_command(self.__job) # extract the setup in case the preprocess command needs it self.__job.setup = self.extract_setup(cmd) @@ -601,9 +596,9 @@ def run(self): # noqa: C901 # note: no need to run any main payload in HPO Horovod jobs on Kubernetes if os.environ.get('HARVESTER_HOROVOD', '') == '': - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before payload start] stdout=%s', _stdout) - logger.debug('[before payload start] stderr=%s', _stderr) + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[before payload start] stdout=%s', _stdout) + #logger.debug('[before payload start] stderr=%s', _stderr) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) else: @@ -651,9 +646,9 @@ def run(self): # noqa: C901 set_pilot_state(job=self.__job, state=state) logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state) - exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after payload finish] stdout=%s', _stdout) - logger.debug('[after payload finish] stderr=%s', _stderr) + #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + #logger.debug('[after payload finish] stdout=%s', _stdout) + #logger.debug('[after payload finish] stderr=%s', _stderr) # stop the utility command (e.g. 
a coprocess if necessary if proc_co: @@ -708,18 +703,18 @@ def run_utility_after_payload_finished(self, state, order): logger.info("\n\npostprocess execution command:\n\n%s\n", cmd_after_payload) # xcache debug - if 'xcache' in cmd_after_payload: - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[before xcache kill] stdout=%s', _stdout) - logger.debug('[before xcache kill] stderr=%s', _stderr) + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[before xcache kill] stdout=%s', _stdout) + # logger.debug('[before xcache kill] stderr=%s', _stderr) exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) # xcache debug - if 'xcache' in cmd_after_payload: - _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') - logger.debug('[after xcache kill] stdout=%s', _stdout) - logger.debug('[after xcache kill] stderr=%s', _stderr) + #if 'xcache' in cmd_after_payload: + # _exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh') + # logger.debug('[after xcache kill] stdout=%s', _stdout) + # logger.debug('[after xcache kill] stderr=%s', _stderr) return exit_code diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index cd05e799..1fb3be4a 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -604,9 +604,7 @@ def add_athena_proc_number(cmd): if value2 > 1: cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd else: - logger.info(( - "will not add ATHENA_CORE_NUMBER to cmd since the " - "value is %s", str(value2))) + logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s", str(value2)) else: logger.warning(( 'there is no ATHENA_CORE_NUMBER in os.environ ' diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5209e811..7bb1caf9 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '12' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '60' # build number should be reset to '1' for every new development cycle +BUILD = '62' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 02b7d21a738b3ca16a4390f0a7e70ca898311265 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:17:03 +0200 Subject: [PATCH 95/96] Update --- pilot/common/errorcodes.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index f6e43197..68192b14 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -397,19 +397,15 @@ def resolve_transform_error(self, exit_code, stderr): ec = self.SINGULARITYIMAGEMOUNTFAILURE elif exit_code == 255 and "Operation not permitted" in stderr: ec = self.SINGULARITYGENERALFAILURE - elif exit_code == 64 and "Singularity is not installed" in stderr: + elif "Singularity is not installed" in stderr: # exit code should be 64 but not always? 
ec = self.SINGULARITYNOTINSTALLED elif exit_code == 64 and "cannot create directory" in stderr: ec = self.MKDIR elif exit_code == -1: ec = self.UNKNOWNTRFFAILURE - else: - # singularity errors can appear even with no exit code set - if "Singularity is not installed" in stderr: - ec = self.SINGULARITYNOTINSTALLED - #else: - # do not assign a pilot error code for unidentified transform error, return 0 - # ec = 0 + #else: + # do not assign a pilot error code for unidentified transform error, return 0 + # ec = 0 if not ec: ec = exit_code From 90b8a6160490220f9ccbffdfd7ea46d693fe87de Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 29 Jun 2021 16:19:07 +0200 Subject: [PATCH 96/96] Flake8 correction --- pilot/control/payload.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 65ff94c4..c5641c70 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -353,9 +353,9 @@ def perform_initial_payload_error_analysis(job, exit_code): if msg == "": # look for warning messages instead (might not be fatal so do not set UNRECOGNIZEDTRFSTDERR) msg = errors.extract_stderr_warning(stderr) - fatal = False - else: - fatal = True + # fatal = False + #else: + # fatal = True #if msg != "": # redundant since resolve_transform_error is used above # logger.warning("extracted message from stderr:\n%s", msg) # exit_code = set_error_code_from_stderr(msg, fatal)
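A note on the remote-file-open changes collected above (patches 86, 91 and 92): together they implement a small fixed-width thread pool in open_remote_file.py, where a namedtuple holds the result/opened/unopened queues, nthreads workers are started up front, and a new thread is only spawned once the result queue delivers a completion token. The sketch below reproduces that control flow in isolation; try_open and spawn are illustrative stand-ins for try_open_file and spawn_file_open_thread, and a plain filename check replaces ROOT.TFile.Open so the example runs without ROOT.

    import threading
    import queue
    from collections import namedtuple

    def try_open(turl, queues):
        """Stand-in worker: 'open' the file and report the outcome on the shared queues."""
        opened = turl.endswith('.root')  # placeholder for ROOT.TFile.Open() + IsOpen()
        (queues.opened if opened else queues.unopened).put(turl)
        queues.result.put(turl)  # always signal completion (cf. patch 91)

    def spawn(queues, turls):
        """Pop the next turl and start a worker thread for it; return None when the list is empty."""
        try:
            turl = turls.pop(0)
        except IndexError:
            return None
        thread = threading.Thread(target=try_open, args=(turl, queues))
        thread.start()
        return thread

    turls = ['root://site/a.root', 'root://site/b.txt', 'root://site/c.root']
    queues = namedtuple('queues', ['result', 'opened', 'unopened'])
    queues.result, queues.opened, queues.unopened = queue.Queue(), queue.Queue(), queue.Queue()

    nthreads = 2
    threads = [_t for _t in (spawn(queues, turls) for _ in range(nthreads)) if _t]
    while turls:
        queues.result.get(block=True)  # wait for a free slot before opening the next file
        thread = spawn(queues, turls)
        if thread:
            threads.append(thread)
    [_t.join() for _t in threads]
    print('opened: %s, unopened: %s' % (sorted(queues.opened.queue), sorted(queues.unopened.queue)))

The same back-pressure is what get_nthreads() configures through the nopenfiles=N field in the queuedata catchall: the number of simultaneously open remote files never exceeds the requested thread count.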
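A second pattern worth a standalone sketch is the set_error_code_from_stderr() rewrite from patch 90 in pilot/control/payload.py, which replaces an if/elif chain with a single dictionary keyed on pilot error codes. In the sketch below the integer values are invented for illustration only, since the real constants are attributes of pilot.common.errorcodes.ErrorCodes.

    # Hypothetical numeric values; in the Pilot these come from ErrorCodes.
    SINGULARITYNOTINSTALLED = 1650
    TRANSFORMNOTFOUND = 1651
    UNRECOGNIZEDTRFSTDERR = 1652

    ERROR_MAP = {
        SINGULARITYNOTINSTALLED: "Singularity is not installed",
        TRANSFORMNOTFOUND: "command not found",
    }

    def set_error_code_from_stderr(msg, fatal=False):
        """Return the first error code whose marker string appears in msg; fall back only if fatal."""
        for code, marker in ERROR_MAP.items():
            if marker in msg:
                return code
        return UNRECOGNIZEDTRFSTDERR if fatal else 0

    print(set_error_code_from_stderr('sh: athena.py: command not found'))  # -> 1651
    print(set_error_code_from_stderr('some unknown warning', fatal=True))  # -> 1652

Keeping the markers in one mapping also makes it explicit that the lookup stops at the first match, mirroring the break in the patched loop.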