scripts/spooker

#!/usr/bin/env bash
# This script is designed to be called from the
# - onerror
# - oncomplete
# sections of Snakefiles to gather info about:
# 1. the pipeline
# 2. the user
# 3. other metadata
# This data is then tar-gzipped and saved to a common location.
# For runs on BIOWULF:
# - tarball is saved to /scratch/ccbrpipeliner using the spook tool
# - a cronjob then picks up the tarball and saves it to /data/CCBR_Pipeliner/userdata/ccbrpipeliner
# - [TODO] another cronjob then reads files under /data/CCBR_Pipeliner/userdata/ccbrpipeliner and generates
# detailed HTML reports about pipeline usages.
# For runs on FRCE:
# - tarball is saved to /mnt/projects/CCBR-Pipelines/pipelines/userdata/ccbrpipeliner by simple cp command
# - [TODO] a cronjob then picks up the tarball and moves it to biowulf at /data/CCBR_Pipeliner/userdata/ccbrpipeliner/frce
# - [TODO] another cronjob then reads files under /data/CCBR_Pipeliner/userdata/ccbrpipeliner/frce and adds to the
# detailed HTML report about pipeline usages.

# Requires 2 inputs:
# 1. Pipeline's output directory ... absolute path
# 2. Name of the pipeline ... e.g. RENEE or XAVIER

# Path this script was invoked as; its basename prefixes every log message.
SCRIPTNAME="$0"
# Quote the path (it may contain spaces) and pass "--" so that a name starting
# with "-" is not mistaken for an option to basename.
SCRIPTBASENAME=$(basename -- "$SCRIPTNAME")

# Exactly 2 arguments are required ... PIPELINE_OUTDIR and PIPELINE_NAME
if [[ "$#" != "2" ]];then
    echo "$SCRIPTBASENAME FAILED!: ERROR: 2 arguments expected!"
    echo "$SCRIPTBASENAME FAILED!: ERROR: Argument 1: pipeline outdir"
    echo "$SCRIPTBASENAME FAILED!: ERROR: Argument 2: pipeline name"
    exit 1
fi

# A pipeline now fails when any of its stages fails; the PIPELINE_VERSION
# fallback below relies on this to notice a missing or failing executable.
set -o pipefail
PIPELINE_OUTDIR=$1
PIPELINE_NAME=$2

# Size of the output directory in bytes (first column of `du -bs`).
# The path is quoted so directories containing spaces are measured correctly.
PIPELINE_OUTDIR_SIZE=$(du -bs -- "$PIPELINE_OUTDIR" | awk '{print $1}')
# printf (not echo) so that a name such as "-n" is treated as data.
PIPELINE_NAME_UPPER=$(printf '%s\n' "$PIPELINE_NAME" | tr '[:lower:]' '[:upper:]')
PIPELINE_NAME_LOWER=$(printf '%s\n' "$PIPELINE_NAME" | tr '[:upper:]' '[:lower:]')
# Absolute path of the pipeline executable (lower-cased name); empty when it
# is not in PATH. `type -P` is a builtin and prints nothing on a miss.
PIPELINE_PATH=$(type -P "$PIPELINE_NAME_LOWER")
# Last whitespace-separated field of the last line printed by `<pipeline> --version`;
# "UNKNOWN" is emitted when the pipeline command fails or cannot be found.
PIPELINE_VERSION=$("$PIPELINE_NAME_LOWER" --version 2>/dev/null | tail -n1 | awk '{print $NF}' || echo "UNKNOWN")

# Timestamp (YYMMDDhhmmss) used as the base name of every file created here.
DT=$(date +%y%m%d%H%M%S)
archivefile="${PIPELINE_OUTDIR}/${DT}.tar.gz"
treefile="${PIPELINE_OUTDIR}/${DT}.tree.json"
metadata="${PIPELINE_OUTDIR}/${DT}.json"

# Resolve the scontrol executable from PATH. The script later calls
# `scontrol show config` to identify the cluster, so stop right away
# when the command is unavailable.
SCONTROL=$(type -P scontrol)
if [[ -z "$SCONTROL" ]];then
    printf '%s\n' \
        "$SCRIPTBASENAME FAILED!: ERROR: scontrol command not in PATH!" \
        "$SCRIPTBASENAME FAILED!: ERROR: usage metadata cannot be collected!!"
    exit 1
fi

# Create the archive with all metadata.
# The tar command line is assembled in an array (not a string) so that paths
# containing spaces or glob characters reach tar as single, literal arguments.
dryrunlogfile=""
if [[ -d "$PIPELINE_OUTDIR" ]];then
    tar_cmd=(tar czvf "$archivefile")

    # find the newest dryrun log (by modification time) without parsing `ls`
    for candidate in "${PIPELINE_OUTDIR}"/dryrun*log;do
        # an unmatched glob is left as the literal pattern, so skip non-files
        [[ -f "$candidate" ]] || continue
        if [[ -z "$dryrunlogfile" || "$candidate" -nt "$dryrunlogfile" ]];then
            dryrunlogfile="$candidate"
        fi
    done
    if [[ -n "$dryrunlogfile" ]];then
        tar_cmd+=("$dryrunlogfile")
    fi

    # gather some info ... one "KEY: value" line per item
    {
        echo "PIPELINE_OUTDIR: $PIPELINE_OUTDIR"
        echo "PIPELINE_OUTDIR_SIZE: $PIPELINE_OUTDIR_SIZE"
        echo "PIPELINE_NAME: $PIPELINE_NAME_UPPER"
        echo "PIPELINE_PATH: $PIPELINE_PATH"
        echo "PIPELINE_VERSION: $PIPELINE_VERSION"
        echo "USER: $USER"
        # GROUPS is a special Bash variable (assignments to it have no effect),
        # so the output of `groups` is used inline. It is left unquoted on
        # purpose: word splitting collapses it onto a single line.
        # shellcheck disable=SC2046
        echo "GROUPS:" $(groups)
        echo "DATE: $DT"
    } > "$metadata"

    # JSON listing of the output directory tree
    tree -J "$PIPELINE_OUTDIR" > "$treefile"
    tar_cmd+=("$metadata" "$treefile")

# files from pipelines written in snakemake
    if [[ -d "${PIPELINE_OUTDIR}/logfiles" ]];then
        logdir="${PIPELINE_OUTDIR}/logfiles"
        for thisfile in "snakemake.log" "snakemake.log.jobby" "master.log" "runtime_statistics.json";do
            absthisfile="${logdir}/${thisfile}"
            # only add the log files that actually exist
            if [[ -f "$absthisfile" ]];then
                tar_cmd+=("$absthisfile")
            fi
        done
    fi

# [TODO] files from pipelines written in nextflow
# [TODO] ... add nextflow related files here ...

    echo "$SCRIPTBASENAME: ${tar_cmd[*]}"
    "${tar_cmd[@]}" && echo "$SCRIPTBASENAME: $archivefile created!"
    # the temporary metadata/tree files now live inside the archive (if tar succeeded)
    rm -f -- "$metadata" "$treefile"

else # PIPELINE_OUTDIR does not exist!
    echo "$SCRIPTBASENAME FAILED!: ERROR: $PIPELINE_OUTDIR does not exist!"
    echo "$SCRIPTBASENAME FAILED!: ERROR: usage metadata cannot be collected!!"
    exit 1
fi

# Check if you are on BIOWULF or FRCE, based on the ClusterName value
# reported by `scontrol show config`.
clustername=$(scontrol show config|grep -i clustername|awk '{print $NF}')
if [[ "$clustername" == "biowulf" ]];then ISBIOWULF=true;else ISBIOWULF=false;fi
if [[ "$clustername" == "fnlcr" ]];then ISFRCE=true;else ISFRCE=false;fi

if [[ $ISBIOWULF == true || $ISFRCE == true ]];then
    if [[ $ISBIOWULF == true ]];then
        # BIOWULF: the archive is handed over with the spook tool
        SPOOK=$(type -P spook)
        if [[ -z "$SPOOK" ]];then
            echo "$SCRIPTBASENAME: spook is NOT in PATH."
            echo "$SCRIPTBASENAME: trying to add it by sourcing /data/CCBR_Pipeliner/cronjobs/scripts/setup"
            . "/data/CCBR_Pipeliner/cronjobs/scripts/setup"
            SPOOK=$(type -P spook)
            if [[ -z "$SPOOK" ]];then
                echo "$SCRIPTBASENAME FAILED!: ERROR: spook is still not in PATH!"
                echo "$SCRIPTBASENAME FAILED!: ERROR: usage metadata cannot be collected!!"
                exit 1
            fi
        fi
        echo "$SCRIPTBASENAME: spook is now in PATH:$SPOOK"
        SPOOK_COPY2DIR="/scratch/ccbrpipeliner"
    fi
    if [[ $ISFRCE == true ]];then
        # FRCE: the archive is copied with a plain cp
        SPOOK_COPY2DIR="/mnt/projects/CCBR-Pipelines/pipelines/userdata/ccbrpipeliner"
    fi
    echo "$SCRIPTBASENAME: SPOOK_COPY2DIR: $SPOOK_COPY2DIR"

    # copy over the metadata archive (skipped when the archive was not created)
    if [[ -f "$archivefile" ]];then
        # Build the command as an array so that paths containing spaces are
        # passed as single arguments instead of being word-split.
        if [[ $ISBIOWULF == true ]];then
            copy_cmd=("$SPOOK" -f "$archivefile" -d "$SPOOK_COPY2DIR")
        else
            copy_cmd=(cp -rv "$archivefile" "$SPOOK_COPY2DIR")
        fi
        echo "$SCRIPTBASENAME: ${copy_cmd[*]}"
        # Report a failed copy on both clusters instead of ignoring it.
        if ! "${copy_cmd[@]}";then
            echo "$SCRIPTBASENAME FAILED!: ERROR: $archivefile could NOT be copied to $SPOOK_COPY2DIR!"
            exit 1
        fi
    fi

else # not biowulf or frce ... so exit
    echo "$SCRIPTBASENAME FAILED!: ERROR: Neither on BIOWULF Nor on FRCE"
    echo "$SCRIPTBASENAME FAILED!: ERROR: $archivefile created but NOT copied!"
    exit 1
fi