crawlerHPSS.py

#!/usr/bin/env python
b'This script requires python 3.4'

"""
Crawler which currently runs over all HPSS picoDST folders and populates
mongoDB collections.

For detailed documentation, see: README_CrawlerHPSS.md
"""

import sys
import os
import re

import logging as log
import time
import socket
import datetime
import shlex, subprocess

from mongoUtil import mongoDbUtil
import pymongo

from pymongo import results
from pymongo import errors
from pymongo import bulk

from pprint import pprint

##############################################
# -- GLOBAL CONSTANTS

# -- Base path on HPSS; the picoDst folder name is appended to it when
#    building the 'hsi ls' command line
HPSS_BASE_FOLDER = "/nersc/projects/starofl"
# -- Folder names below HPSS_BASE_FOLDER that are iterated by
#    hpssUtil.getFileList()
PICO_FOLDERS     = [ 'picodsts', 'picoDST' ]

##############################################

# -- Check for a proper Python Version
#    Compare the numeric version tuple rather than a slice of the version
#    string, and report the full major.minor.micro version that was found.
if sys.version_info < (3, 0):
    print ('Python version 3.0 or greater required (found: {0}).'.format(
        '.'.join(str(part) for part in sys.version_info[:3])))
    sys.exit(-1)

# ----------------------------------------------------------------------------------
class hpssUtil:
    """Helper Class for HPSS connections and retrieving stuff"""

    # _________________________________________________________
    def __init__(self, target = 'picoDst', pathKeysSchema = 'runyear/system/energy/trigger/production/day%d/runnumber'):
        """Set up target file suffix and the typed path-key schema.

        target         -- file kind to collect; files are matched by the
                          suffix '.<target>.root'
        pathKeysSchema -- path-separator delimited list of key names, one per
                          path component. A key may carry a trailing
                          '%<type>' marker ('s' = str, 'd' = int,
                          'f' = float); keys without marker are strings.
                          In the default schema only 'day' is converted to
                          int, 'runnumber' stays a string.
        """

        # -- Date stamp written as 'lastSeen' for every file found in this run
        self._today = datetime.datetime.today().strftime('%Y-%m-%d')

        self._target           = target
        self._fileSuffix       = '.{0}.root'.format(target)
        self._lengthFileSuffix = len(self._fileSuffix)

        # -- Map of type marker to conversion function
        self._typeMap = {'s': str, 'd': int, 'f': float}

        # -- Get the type from each path key (tailing % char), or 's' for
        #    string if absent.  i.e.
        #    [['runyear', 's'], ['system', 's'], ['day', 'd'], ['runnumber', 's']]
        #    Built for every target, so that _makePicoDstDoc() never hits a
        #    missing attribute when target is not 'picoDst'.
        pathKeys = pathKeysSchema.split(os.path.sep)
        self._typedPathKeys = [k.split('%') if '%' in k else [k, 's'] for k in pathKeys]

    # _________________________________________________________
    def _getTypedPathKeys(self, tokenizedPath):
        """Pick the typed path-key schema matching a tokenized file path.

        - 8 tokens: the default schema of this instance
        - 7 tokens with the first day-like token at index 4 and "GeV" in the
          token at index 2: a schema without the 'production' key
        - anything else: a warning is printed and the default schema is used
        """

        nTokens = len(tokenizedPath)

        # -- Default case
        if nTokens == 8:
            return self._typedPathKeys

        if nTokens == 7:
            def _looksLikeDay(token):
                """True if token parses as an integer not larger than 370."""
                try:
                    return int(token) <= 370
                except ValueError:
                    return False

            # -- Position of the first day-like token, -1 if there is none
            dayPos = next((pos for pos, token in enumerate(tokenizedPath)
                           if _looksLikeDay(token)), -1)

            if dayPos == 4 and "GeV" in tokenizedPath[2]:
                reducedSchema = 'runyear/system/energy/trigger/day%d/runnumber'
                return [key.split('%') if '%' in key else [key, 's']
                        for key in reducedSchema.split(os.path.sep)]

        # -- Neither of the known layouts
        print("SCHEMA NOT KNOWN !!! - use Default", tokenizedPath)
        return self._typedPathKeys

    # _________________________________________________________
    def setCollections(self, collHpssFiles, collHpssPicoDsts, collHpssDuplicates):
        """Store the three collection handles used by the crawler.

        collHpssFiles      -- collection of all files seen on HPSS
        collHpssPicoDsts   -- collection of picoDst documents
        collHpssDuplicates -- collection of picoDsts found more than once
        """

        (self._collHpssFiles,
         self._collHpssPicoDsts,
         self._collHpssDuplicates) = (collHpssFiles,
                                      collHpssPicoDsts,
                                      collHpssDuplicates)

    # _________________________________________________________
    def getFileList(self):
        """Crawl the picoDst folders on HPSS.

        Only the first entry of PICO_FOLDERS is processed.
        NOTE(review): the remaining folders are skipped on purpose or by
        accident - confirm before extending the crawl to all folders.
        """

        for firstFolder in PICO_FOLDERS[:1]:
            self._getFolderContent(firstFolder)

    # _________________________________________________________
    def _getFolderContent(self, picoFolder):
        """List picoFolder on HPSS and parse every "Run" subfolder.

        picoFolder -- folder name below HPSS_BASE_FOLDER

        Every output line of 'hsi -q ls -1' containing "Run" is handed to
        _parseSubFolder().
        """

        # -- Get subfolders from HPSS
        cmdLine = 'hsi -q ls -1 {0}/{1}'.format(HPSS_BASE_FOLDER, picoFolder)
        cmd = shlex.split(cmdLine)

        # -- Context manager closes the pipe and waits for the child process
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:

            # -- Loop of the list of subfolders
            for lineTerminated in p.stdout:
                subFolder = lineTerminated.decode("utf-8").rstrip()
                if "Run" in subFolder:
                    self._parseSubFolder(subFolder)

    # _________________________________________________________
    def _parseSubFolder(self, subFolder):
        """Crawl subFolder recursively and register all files found.

        The output of 'hsi -q ls -lR' is parsed block-by-block: a line
        starting with subFolder opens a block (a folder path), an empty line
        closes it. Every non-directory entry inside a block is upserted into
        the files collection. For files not seen before:
          - tar files are read via _parseTarFile() and the number of
            contained target files is stored as 'filesInTar'
          - picoDst files are collected and inserted in batches
        """

        cmdLine = 'hsi -q ls -lR {0}'.format(subFolder)
        cmd = shlex.split(cmdLine)

        inBlock = False
        listPicoDsts = []

        # -- Context manager closes the pipe and waits for the child process
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:

            # -- Parse ls output line-by-line -> utilizing output blocks in ls
            for lineTerminated in p.stdout:
                line = lineTerminated.decode("utf-8").rstrip('\t\n')
                lineCleaned = ' '.join(line.split())

                # -- Start of a block: path of the folder listed next
                if lineCleaned.startswith(subFolder):
                    inBlock = True
                    self._currentBlockPath = line.rstrip(':')
                    continue

                # -- Empty line: end of the block
                if not lineCleaned:
                    inBlock = False
                    self._currentBlockPath = ""
                    continue

                # -- Skip lines outside of blocks and directory entries
                if not inBlock or lineCleaned.startswith('d'):
                    continue

                # -- stderr is merged into stdout, so a line may not be a
                #    listing entry at all - report it and carry on
                try:
                    doc = self._parseLine(lineCleaned)
                except (IndexError, ValueError):
                    print("Error parsing HPSS listing line:", lineCleaned)
                    continue

                # -- update lastSeen and insert if not in yet
                #    (returns the document as it was before the update,
                #     i.e. None if it has just been inserted)
                ret = self._collHpssFiles.find_one_and_update({'fileFullPath': doc['fileFullPath']},
                                                              {'$set': {'lastSeen': self._today},
                                                               '$setOnInsert' : doc},
                                                              upsert = True)

                # -- document already there - do nothing
                if ret:
                    continue

                # -- new document inserted - add the picoDst(s)
                if doc['fileType'] == "tar":
                    # -- get picoDsts within tar files
                    nDocsInTar = self._parseTarFile(doc)
                    if nDocsInTar == -1:
                        # NOTE(review): the document has already been upserted
                        # above, so this tar file is not parsed again on the
                        # next run - confirm that this is intended.
                        print("Error: reading tar file {0} - fix manually, file has "
                              "not been added to HPSS_Files collection.".format(doc['fileFullPath']))
                    else:
                        self._collHpssFiles.find_one_and_update({'fileFullPath': doc['fileFullPath']},
                                                                {'$set': {'filesInTar': nDocsInTar}})
                    continue

                if doc['fileType'] == "picoDst":
                    listPicoDsts.append(self._makePicoDstDoc(doc['fileFullPath'], doc['fileSize']))

                    # -- Flush in batches to limit the size of the list
                    if len(listPicoDsts) >= 10000:
                        self._insertPicoDsts(listPicoDsts)
                        listPicoDsts[:] = []

        # -- Insert picoDsts in collection
        self._insertPicoDsts(listPicoDsts)

    # _________________________________________________________
    def _parseLine(self, line):
        """Turn one whitespace-normalized "ls -l" entry into a file record.

        Column 5 is taken as file size and column 9 as file name; the name
        is appended to the path of the current block.

        return
             - dict with 'fileFullPath', 'fileSize' and 'fileType', where
               fileType is one of "tar", "idx", "picoDst" or "other"
        """

        columns = line.split(' ', 9)

        baseName = columns[8]
        fullPath = "{0}/{1}".format(self._currentBlockPath, baseName)
        nBytes   = int(columns[4])

        # -- First matching suffix decides the type
        suffixTypes = ((".tar", "tar"),
                       (".idx", "idx"),
                       (".picoDst.root", "picoDst"))
        kind = next((label for suffix, label in suffixTypes
                     if baseName.endswith(suffix)), "other")

        # -- return record
        return {'fileFullPath': fullPath, 'fileSize': nBytes, 'fileType': kind}

    # _________________________________________________________
    def _parseTarFile(self, hpssDoc):
        """Get Content of tar file and parse it.

           The listing of 'htar -tf' is read line-by-line; every entry ending
           with the target file suffix is turned into a picoDst document and
           inserted via _insertPicoDsts().

           hpssDoc -- file record of the tar file (needs 'fileFullPath')

           return
                - number of target documents found in tar file
                - -1 if error reading tar file (missing index file)
           """

        cmdLine = 'htar -tf {0}'.format(hpssDoc['fileFullPath'])
        cmd = shlex.split(cmdLine)

        # -- Line printed by htar if the index file of the tar file is missing
        missingIdxLine = 'ERROR: No such file: {0}.idx'.format(hpssDoc['fileFullPath'])

        listDocs = []

        # -- Context manager closes the pipe and waits for the child process,
        #    also on the early return below
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
            for lineTerminated in p.stdout:
                line = lineTerminated.decode("utf-8").rstrip('\t\n')
                lineCleaned = ' '.join(line.split())

                # -- Skip status line and directory entries
                if lineCleaned == "HTAR: HTAR SUCCESSFUL" or \
                        lineCleaned.startswith('HTAR: d'):
                    continue

                if lineCleaned == missingIdxLine:
                    print('ERROR no IDX file ...', lineCleaned, '... recovering')
                    return -1

                lineTokenized = lineCleaned.split(' ', 7)

                # -- A regular entry has exactly 7 tokens
                if len(lineTokenized) != 7:
                    # -- Entries marked as deleted in the tar file carry an
                    #    extra 'D' token and are skipped silently
                    if len(lineTokenized) > 7 and lineTokenized[2] == 'D':
                        continue

                    print("Error tokenizing hTar line:", lineTokenized)
                    continue

                fileFullPath = lineTokenized[6]

                # -- select only target
                #    (checked before the size is parsed, so that other
                #     7-token lines cannot fail in the int conversion)
                if not fileFullPath.endswith(self._fileSuffix):
                    continue

                fileSize = int(lineTokenized[3])

                # -- make PicoDst document and add it to list
                listDocs.append(self._makePicoDstDoc(fileFullPath, fileSize, hpssDoc=hpssDoc, isInTarFile=True))

        # -- Count before inserting: _insertPicoDsts removes duplicates from the list
        nDocsInTar = len(listDocs)

        # -- Insert picoDsts in collection
        self._insertPicoDsts(listDocs)

        return nDocsInTar

    # _________________________________________________________
    def _makePicoDstDoc(self, fileFullPath, fileSize, hpssDoc=None, isInTarFile=False):
        """Build the picoDst document for one file.

        fileFullPath -- path of the file (on HPSS or inside a tar file)
        fileSize     -- size of the file
        hpssDoc      -- file record of the enclosing tar file, read only if
                        isInTarFile is set
        isInTarFile  -- True if the file is stored inside a tar file

        return
             - dict with file information and a 'starDetails' sub document
               built from the path components and the file name
        """

        # -- The relative path starts at the first "/Run" of the full path
        #    (the whole path is used if there is none)
        relativePath = fileFullPath[fileFullPath.find("/Run") + 1:]

        # -- Basic document
        doc = {
               'filePath':     relativePath,
               'fileFullPath': fileFullPath,
               'fileSize':     fileSize,
               'target':       self._target,
               'isInTarFile':  isInTarFile,
               'staging':      {'stageMarkerXRD': False},
            }

        if isInTarFile:
            doc['fileFullPathTar'] = hpssDoc['fileFullPath']

        # -- Path components and the key schema matching them
        pathTokens = relativePath.split(os.path.sep)
        keySpecs = self._getTypedPathKeys(pathTokens)

        # -- STAR details: one converted value per path key
        starDetails = {}
        for spec, token in zip(keySpecs, pathTokens):
            starDetails[spec[0]] = self._typeMap[spec[1]](token)

        # -- The stream is the leading part of the file name, up to the run number
        streamPattern = re.compile('(st_.*)_{}'.format(starDetails.get('runnumber', '')))
        nameParts = streamPattern.split(pathTokens[-1])

        if len(nameParts) != 3 or len(nameParts[0]) != 0:
            # -- File name does not follow the expected pattern
            print('xxx: ', nameParts, starDetails)
            starDetails['stream'] = 'xx'
            starDetails['picoType'] = 'xx'
        else:
            starDetails['stream'] = nameParts[1]

            # -- Remainder of the name without leading char and file suffix
            remainder = nameParts[-1][1:-self._lengthFileSuffix]
            remainderParts = remainder.split('_')

            if len(remainderParts) == 2:
                starDetails['picoType'] = remainderParts[0]
            else:
                starDetails['picoType'] = remainder

        doc['starDetails'] = starDetails

        return doc

    # _________________________________________________________
    def _insertPicoDsts(self, listDocs):
        """Insert list of picoDsts in to collections.

        Documents whose 'filePath' is already present in the picoDst
        collection (among the entries with the run year of the first
        document) are inserted in to the duplicates collection, all others
        in to the picoDst collection.

        listDocs -- list of picoDst documents; modified in place, the
                    duplicates are removed from it
        """

        # -- Empty list
        if not listDocs:
            return

        # -- Existing entries are looked up by the run year of the first document
        query = {'starDetails.runyear': listDocs[0]['starDetails']['runyear']}

        # -- Index the new documents by filePath, so that every existing
        #    entry costs a single lookup and not a scan of the whole list
        indicesByPath = {}
        for idx, doc in enumerate(listDocs):
            indicesByPath.setdefault(doc['filePath'], []).append(idx)

        # -- Every existing entry moves the first not yet moved document with
        #    the same filePath on the extra list: listDuplicates
        duplicateIndices = []
        for entry in self._collHpssPicoDsts.find(query, {'filePath': True, '_id': False}):
            pending = indicesByPath.get(entry['filePath'])
            if pending:
                duplicateIndices.append(pending.pop(0))

        listDuplicates = [listDocs[idx] for idx in duplicateIndices]

        # -- Clean listDocs in place, keeping the order of the remaining documents
        if duplicateIndices:
            moved = set(duplicateIndices)
            listDocs[:] = [doc for idx, doc in enumerate(listDocs) if idx not in moved]

        # -- Insert list of picoDsts in to HpssPicoDsts collection
        if listDocs:
            print("Insert List: Add {0} picoDsts".format(len(listDocs)))
            self._collHpssPicoDsts.insert_many(listDocs, ordered=False)

        # -- Insert list of duplicate picoDsts in to HpssDuplicates collection
        if listDuplicates:
            print("Insert List: Add {0} duplicate picoDsts".format(len(listDuplicates)))
            self._collHpssDuplicates.insert_many(listDuplicates, ordered=False)


# ____________________________________________________________________________
def checkForHPSSTransfer():
    """Check for ongoing transfer of files into HPSS

    return
         - True if a line of the 'qstat -u starofl' output contains
           "tarToHPSS", False otherwise
    """

    cmdLine = 'qstat -u starofl'
    cmd = shlex.split(cmdLine)

    # -- Context manager closes the pipe and waits for the child process,
    #    also on the early return
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
        for lineTerminated in p.stdout:
            if "tarToHPSS" in lineTerminated.decode("utf-8"):
                return True

    return False

# ____________________________________________________________________________
def main():
    """initialize and run

    Aborts if a transfer into HPSS is ongoing, otherwise connects to
    mongoDB and crawls HPSS.
    """

    # -- Check for ongoing transfer into HPSS
    if checkForHPSSTransfer():
        print ("Abort - Data is currently moved to HPSS")
        return

    # -- Connect to mongoDB
    dbUtil = mongoDbUtil("", "admin")

    # -- Close the connection also if the crawl fails
    try:
        collHpssFiles      = dbUtil.getCollection("HPSS_Files")
        collHpssPicoDsts   = dbUtil.getCollection("HPSS_PicoDsts")
        collHpssDuplicates = dbUtil.getCollection("HPSS_Duplicates")

        hpss = hpssUtil()
        hpss.setCollections(collHpssFiles, collHpssPicoDsts, collHpssDuplicates)
        hpss.getFileList()
    finally:
        dbUtil.close()

# ____________________________________________________________________________
# -- Run main() only when executed as a script; main() returns None on
#    normal completion, so the exit status is 0 then
if __name__ == "__main__":
    sys.exit(main())