forked from glencoesoftware/omero-ms-image-region
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregen-memo-files.sh
executable file
·205 lines (190 loc) · 7.45 KB
/
regen-memo-files.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/bin/bash
# Copyright (C) 2019 Glencoe Software, Inc. All rights reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# You will probably want to run the following before you start:
#   yum install parallel
#   parallel --bibtex
#######################################
# Print the command-line help and terminate the script.
# Arguments: $1 - exit status to terminate with (0 when omitted)
# Outputs:   the help text on stdout
# Returns:   never; always exits
#######################################
usage() {
  # Unquoted here-doc delimiter on purpose: $0 and the $( date ... ) examples
  # are expanded when the help is printed.
  cat <<EOF
Usage:
$0 [OPTIONS]
Regenerates Bio-Formats memo files in parallel

This utility queries the OMERO database for a list of filesets, splits the output
into several input files and runs the memoregenerator utility using GNU parallel.

 OPTIONS:
 --batch-size Maximum number of entries in each input file sent to parallel (default: 500)
 --cache-options Memofile cache options [/path/to/dir | inplace] (required)
 --csv Bypass sql query and use this csv for image list
 --db Database connection string
 --dry-run Pass --dry-run to GNU parallel so the memoregenerator commands are printed instead of run
 --force-image-regen Force regeneration of image list even if it exists already
 --help Display usage and exit
 --jobs Maximum number of jobs to parallelize (default: number of processing units available)
 --memoizer-home Location of image-region micro-service (default: current directory)
 --no-ask Do not ask for confirmation
 --no-wait Do not wait to start generating -- DO IT NOW

Examples:
 Regenerate memo files using the current cache directory and all available CPUs
 $0 --cache-options inplace
 Regenerate memo files offline using a secondary cache directory and 4 CPUs
 $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )
 Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string
 $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )
EOF
  exit "${1:-0}"
}
#######################################
# Split the image list into batch files and run memoregenerator over them
# with GNU parallel.
# Globals (read):    DATESTR, MEMOIZER_HOME, FULL_CSV, BATCH_SIZE, JOBS,
#                    CACHE_OPTIONS, DRYRUN
# Globals (written): JAVA_OPTS (exported)
# Side effects:      enables command tracing (set -x), changes the working
#                    directory to rslt.${DATESTR} and writes the input.NNN
#                    batches, the parallel job log/results and the "timed"
#                    report there.
# Returns:           1 when BATCH_SIZE/JOBS are not positive integers or the
#                    results directory cannot be entered; otherwise the status
#                    of the last command run.
#######################################
run_split_parallel_os_dep() {
  local csv total nfiles
  local -a parallel_opts

  # Both values end up in a divisor below; reject anything that is not a
  # positive integer instead of dividing by zero or evaluating arbitrary text.
  if ! [[ "${BATCH_SIZE}" =~ ^[1-9][0-9]*$ && "${JOBS}" =~ ^[1-9][0-9]*$ ]]; then
    echo "--batch-size and --jobs must be positive integers" >&2
    return 1
  fi

  set -x
  if ! cd "rslt.${DATESTR}"; then
    echo "Cannot enter results directory rslt.${DATESTR}" >&2
    return 1
  fi

  # The JVMs are started from the results directory, so "." makes heap dumps
  # land there. A relative "rslt.${DATESTR}" would be resolved inside the
  # results directory itself, where no such directory exists.
  export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=. -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"

  # The caller moves the image list into the results directory under this
  # name, so a path given with --csv no longer exists at this point. Only
  # fall back to FULL_CSV when the moved file is not present.
  csv="image-list-${DATESTR}.csv"
  if [[ ! -f "${csv}" ]]; then
    csv="${FULL_CSV}"
  fi

  # Split the CSV file into N * JOBS files of at most BATCH_SIZE entries
  # using round-robin distribution.
  total=$(( $(wc -l < "${csv}") ))
  nfiles=$(( ((total - 1) / (BATCH_SIZE * JOBS) + 1) * JOBS ))
  split -a 3 -d -n "r/${nfiles}" -- "${csv}" input.

  # Keep the options in an array so each one stays a single word.
  parallel_opts=(
    --halt now,fail=1
    --eta
    --jobs "${JOBS}"
    --joblog "parallel-${JOBS}cpus.log"
    --files
    --use-cpus-instead-of-cores
    --results .
  )
  if [[ -n "${DRYRUN}" ]]; then
    parallel_opts+=("${DRYRUN}")
  fi

  /usr/bin/time -p -o timed parallel "${parallel_opts[@]}" \
    "${MEMOIZER_HOME}/bin/memoregenerator" \
    "--config=${MEMOIZER_HOME}/conf/config.yaml" \
    "${CACHE_OPTIONS}" ::: input.*
}
#######################################
# Parse the command-line options into the script's global settings.
# Globals (written): DRYRUN, NO_ASK, NO_WAIT, FORCE_IMAGE_REGEN, DB,
#                    BATCH_SIZE, JOBS, MEMOIZER_HOME, CACHE_OPTIONS, FULL_CSV
# Arguments: the script's command line
# Exits:     through usage for --help or an unknown option; with status 1
#            when an option that takes a value is given without one
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --help)
        usage 0
        ;;
      --dry-run)
        DRYRUN="--dry-run"
        shift
        ;;
      --no-ask)
        NO_ASK="1"
        shift
        ;;
      --no-wait)
        NO_WAIT="1"
        shift
        ;;
      --force-image-regen)
        FORCE_IMAGE_REGEN="1"
        shift
        ;;
      --db | --batch-size | --jobs | --memoizer-home | --cache-options | --csv)
        # Stop here rather than carrying on with the setting silently missing
        # and the rest of the command line ignored.
        if [[ -z "${2:-}" ]]; then
          echo "No parameter specified for $1" >&2
          exit 1
        fi
        case "$1" in
          --db) DB=$2 ;;
          --batch-size) BATCH_SIZE=$2 ;;
          --jobs) JOBS=$2 ;;
          --memoizer-home) MEMOIZER_HOME=$2 ;;
          --cache-options) CACHE_OPTIONS=$2 ;;
          --csv) FULL_CSV=$2 ;;
        esac
        shift 2
        ;;
      "")
        # An explicitly empty argument ends option parsing.
        break
        ;;
      *)
        echo "Unknown keywords $*"
        usage 1
        ;;
    esac
  done
}
parse_args "$@"
# Per-run suffix (date plus this shell's PID) used to name the results
# directory and the default image list.
DATESTR="$( date "+%Y%m%d" ).$$"

# Turn the --cache-options value into the flag handed to memoregenerator.
case "${CACHE_OPTIONS}" in
  "")
    echo "Missing --cache-options : must specify a directory or 'inplace'"
    usage 1
    ;;
  inplace)
    CACHE_OPTIONS="--inplace"
    ;;
  *)
    CACHE_OPTIONS="--cache-dir=${CACHE_OPTIONS}"
    ;;
esac

# Fall back to the documented defaults for anything not given on the
# command line.
[[ -n "${BATCH_SIZE}" ]] || {
  echo "Setting batch size to 500"
  BATCH_SIZE=500
}
[[ -n "${MEMOIZER_HOME}" ]] || {
  echo "Setting memoizer-home to cwd (${PWD})"
  MEMOIZER_HOME=${PWD}
}
# From here on, abort on any unhandled command failure.
set -e

# Decide how many memo regeneration jobs run in parallel:
#   --jobs N    use N
#   --jobs max  use the number of processing units reported by nproc
#   (unset)     same as max, or 2 when nproc reports nothing
# nproc is allowed to fail so that the fallback of 2 stays reachable under
# "set -e".
MAX_JOBS=$(nproc 2>/dev/null || true)
if [[ -z "${JOBS}" || "${JOBS}" == "max" ]]; then
  JOBS="${MAX_JOBS}"
fi
if [[ -z "${JOBS}" ]]; then
  JOBS=2
fi
# JOBS is used in arithmetic below; refuse anything that is not a positive
# integer rather than letting the shell evaluate arbitrary text.
if ! [[ "${JOBS}" =~ ^[1-9][0-9]*$ ]]; then
  echo "Invalid --jobs value '${JOBS}': expected a positive integer or 'max'" >&2
  exit 1
fi

echo "Setting ${JOBS} parallel memo regeneration jobs"
# Each job's JVM is started with -Xmx2g, hence two gigabytes per job.
echo "This operation can consume up to $((JOBS * 2))GB of memory"
echo "Current memory usage (in GB) is"
echo ""
# Informational only: a missing or older "free" must not stop the run.
free --giga || true
#######################################
# Print the value of one key from the micro-service configuration file.
# Globals:   MS_CONFIG (read) - path of the configuration file
# Arguments: $1 - key to look up, e.g. omero.db.user
# Outputs:   the text between the first and second ':' of every matching
#            line, with whitespace and double quotes removed (empty when the
#            key is absent)
#######################################
memo_config_value() {
  grep -F -- "$1" "${MS_CONFIG}" | awk -F: '{ print $2 }' | sed -re 's/\s+//g' -e 's/\"//g'
}

# Use a per-run file name unless a list was supplied with --csv.
if [[ -z "${FULL_CSV}" ]]; then
  FULL_CSV="image-list-${DATESTR}.csv"
fi

# Reuse an existing list unless --force-image-regen asked for a fresh one.
if [[ -f "${FULL_CSV}" && -z "${FORCE_IMAGE_REGEN}" ]]; then
  echo "existing images file"
else
  if [[ -f "${FULL_CSV}" ]]; then
    echo "CSV (${FULL_CSV}) exists but --force-image-regen was given, regenerating from database..."
  else
    echo "CSV (${FULL_CSV}) not found, generating from database..."
  fi
  echo "running sql to generate images file"
  # NOTE(review): with --dry-run the trace below shows the connection string,
  # including the database password.
  if [[ -n "${DRYRUN}" ]]; then
    set -x
  fi
  if [[ -z "${DB}" ]]; then
    # No --db given: build the connection string from the micro-service
    # configuration, defaulting every part that is not configured.
    MS_CONFIG="${MEMOIZER_HOME}/conf/config.yaml"
    DB_USER=$(memo_config_value omero.db.user)
    DB_HOST=$(memo_config_value omero.db.host)
    DB_NAME=$(memo_config_value omero.db.name)
    DB_PASS=$(memo_config_value omero.db.pass)
    # A DB_PORT already present in the environment keeps precedence.
    DB_PORT=${DB_PORT:-$(memo_config_value omero.db.port)}
    PSQL_OPTIONS="postgresql://${DB_USER:-omero}:${DB_PASS:-omero}@${DB_HOST:-localhost}:${DB_PORT:-5432}/${DB_NAME:-omero}"
  else
    PSQL_OPTIONS=${DB}
  fi
  # PSQL_OPTIONS is deliberately left unquoted so a --db value made of several
  # psql options keeps working.
  # NOTE(review): the trailing "omero" positional argument is kept exactly as
  # before; confirm how psql treats it next to a connection URI.
  # shellcheck disable=SC2086
  psql ${PSQL_OPTIONS} omero -f "${MEMOIZER_HOME}/memo_regenerator.sql" > "${FULL_CSV}"
fi
# Trace the remaining commands when --dry-run was requested.
if [[ -n "${DRYRUN}" ]]; then
  set -x
fi

# Ask for confirmation unless --no-ask was given; only a literal "yes"
# continues.
if [[ -z "${NO_ASK}" ]]; then
  # read reports failure at end of input. Accept a final answer that lacks a
  # trailing newline, and explain the abort when stdin delivers nothing at
  # all instead of letting "set -e" end the script silently.
  if ! read -r -p "Are you sure you want to regenerate memo files? (yes/no) " ANS \
    && [[ -z "${ANS}" ]]; then
    echo "No answer received on standard input; use --no-ask to skip the confirmation" >&2
    exit 1
  fi
  if [[ "${ANS}" != "yes" ]]; then
    echo "quitting..."
    exit 0
  fi
fi
# Run the regeneration when the image list exists and is not empty.
if [[ -s "${FULL_CSV}" ]]; then
  # Reading through a redirect keeps the file name out of wc's output.
  NUM_IMAGES=$(( $(wc -l < "${FULL_CSV}") ))
  if [[ -n "${NO_WAIT}" ]]; then
    echo "${NUM_IMAGES} images to process using ${JOBS} threads...starting"
  else
    echo "${NUM_IMAGES} images to process using ${JOBS} threads... 5 seconds to cancel."
    sleep 5s
  fi
  mkdir -p "rslt.${DATESTR}"
  mv -v "${FULL_CSV}" "rslt.${DATESTR}/image-list-${DATESTR}.csv"
  # run_split_parallel_os_dep changes into the results directory before it
  # reads ${FULL_CSV}, so the variable has to name the moved file relative to
  # that directory. Without this a list supplied via --csv is not found.
  FULL_CSV="image-list-${DATESTR}.csv"
  run_split_parallel_os_dep
else
  echo "No images to process"
fi