Skip to content

Commit 8d351f1

Browse files
authored
exit codes, json optional
1 parent 7270b70 commit 8d351f1

File tree

1 file changed

+95
-55
lines changed

1 file changed

+95
-55
lines changed

Diff for: collectSacctDB.sh

+95-55
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#
88
# trigger with, for example:
99
# crontab entry:
10-
# 37 3,16 1,16 * * sh -c "[ -x ${SACCT_DIR}/bin/collectSacctDB.sh ] && \
10+
# 37 3 1,16 * * sh -c "[ -x ${SACCT_DIR}/bin/collectSacctDB.sh ] && \
1111
# ( cd ${SACCT_DIR}/$(hostname -s); ${SACCT_DIR}/bin/collectSacctDB.sh )"
1212
#
1313
# To Do:
@@ -16,8 +16,15 @@
1616
# - set SACCT_DIR
1717
# - set DEFAULT_INITIALIZED date
1818
# - set timeouts, esp. for json sacct query
19+
# - leave a breadcrumb recording the timeout used, esp. if the previous run had no json output
20+
# - check for timeout_long breadcrumb, increase from TIMEOUT_LONG if found
21+
# - SKIP_JSON=false
1922
# - verify env
2023
# - slurm env & commands appear reasonable
24+
# - refine collect/timeout/wait
25+
# - slice query into smaller time chunks if no json output in a TIMEOUT period
26+
# - instead of sleeping, poll for output, possibly warn, if no progress made
27+
# then if no json ever collected, leave timeout_long breadcrumb
2128

2229
set -e
2330
set -u
@@ -48,14 +55,30 @@ findEarliestDBJobRecord() {
4855
SLURM_VERSION="SlurmVersion"
4956
TSTAMP_YMD=`date +%Y-%b-%d`
5057
TSTAMP=`date +%FT%T`
51-
TIMEOUT_SHORTLONG_MULTIPLIER=10
52-
TIMEOUT_SHORT=1
58+
59+
SKIP_JSON="true"
60+
61+
TIMEOUT_SAFE_MIN=5
62+
# tune TIMEOUT_SHORTLONG_MULTIPLIER for your db's responsiveness and size
63+
TIMEOUT_SHORTLONG_MULTIPLIER=20
64+
TIMEOUT_SHORT=${TIMEOUT_SAFE_MIN}
5365
TIMEOUT_LONG=$((${TIMEOUT_SHORT} * ${TIMEOUT_SHORTLONG_MULTIPLIER}))
5466
TIMEOUT_UNITS="m"
5567
TIMEOUT=${TIMEOUT_SHORT}${TIMEOUT_UNITS}
5668
TIMEOUT_LONG=${TIMEOUT_LONG}${TIMEOUT_UNITS}
69+
70+
#Exit codes
71+
EX_OK=0
72+
EX_TXTJSON_MISMATCH=1
73+
EX_TXTDELIM_MISMATCH=2
74+
EX_JSONDELIM_MISMATCH=3
75+
EX_OVERWRITE=10
76+
EX_INCOMPLETE=11
77+
EX_PREP_DATE=20
78+
5779
TMPDIR=/tmp/$(basename $0 .sh).$(id -u -n).$$
5880
TMPFILE=${TMPDIR}/delim-conv.$$
81+
5982
MSG_INCOMPLETE="output not complete"
6083
MSG_JOBID_MISMATCH="jobid mismatch in output formats"
6184
GENERATED="Generated"
@@ -70,6 +93,7 @@ DELTA="∆"
7093
COMMA=","
7194
SINGLE_QUOTE="'"
7295
READ_ONLY="0444"
96+
ISATTY=""
7397

7498
FIELDS="jobidraw%9,jobid,priority,qos,partition,nnodes%6,ntasks,submit%24,eligible%24,start%24,end%24,timelimit%24,state,reservation,nodelist,flags%36,submitline%-7,account,user,consumedenergy,consumedenergyraw,failednode%-80"
7599
ARGS=" -a -X "
@@ -83,99 +107,114 @@ START="${START_PREFIX}${INITIALIZED}${START_SUFFIX}"
83107

84108

85109
# Emit one diagnostic line: on /dev/tty when $1 is non-empty, else on stdout.
# A redirection has to be written literally; text such as ">/dev/tty" held in
# a variable is passed to echo as an ordinary argument, not re-parsed.
_Prep_say() {
  local use_tty=$1
  shift
  if [[ -n "${use_tty}" ]] ; then
    echo "$*" >/dev/tty
  else
    echo "$*"
  fi
}

#######################################
# Prep: sanity-check the configured start date and create the scratch directory.
# Globals:
#   INITIALIZED          read; replaced by DEFAULT_INITIALIZED when empty
#   DEFAULT_INITIALIZED  read
#   ISATTY               set to ">/dev/tty" when /dev/tty can be opened
#   TMPDIR               read; the directory is created
#   EX_PREP_DATE         read; exit status used when `date` rejects INITIALIZED
# Outputs:
#   Diagnostics go to /dev/tty when it can be opened, otherwise to stdout.
# Returns:
#   Exits the script with EX_PREP_DATE if INITIALIZED is not a parsable date.
#######################################
Prep() {
  local to_tty=""

  # The /dev/tty node exists even when the process has no controlling
  # terminal, so also confirm that it can actually be opened for writing.
  if [[ -c /dev/tty ]] && ( : >/dev/tty ) 2>/dev/null ; then
    to_tty="yes"
    ISATTY=">/dev/tty"
  fi

  # sanity check
  # Tested inside the `if` itself: with `set -e` in effect a bare failing
  # `date` would end the script before `$?` could be inspected.
  if ! date -d "${INITIALIZED}" +%s >/dev/null 2>&1 ; then
    _Prep_say "${to_tty}" "date failure determining when the db was initialized."
    exit "${EX_PREP_DATE}"
  fi

  # NOTE(review): the default is applied after the check above, so
  # DEFAULT_INITIALIZED itself is never validated here — confirm intended.
  if [ -z "${INITIALIZED}" ] ; then
    INITIALIZED=${DEFAULT_INITIALIZED}
    _Prep_say "${to_tty}" "initialized: ${DEFAULT_INITIALIZED}"
  fi

  mkdir -p "${TMPDIR}"
}
98125

99126
#######################################
# CollectSlurmData: start the sacct queries in the background and wait for them.
# Globals (read):
#   SKIP_JSON, TIMEOUT, TIMEOUT_LONG, JSON_ARGS, FIELDS, ARGS, P_ARGS, START,
#   OUT, OUT_DELIM, OUT_JSON
# Outputs:
#   sacct output is written to OUT and OUT_DELIM, and to OUT_JSON when the
#   JSON query is not skipped.
#######################################
CollectSlurmData() {
  # json takes much (much) longer
  #DEBUG_TIME="/usr/bin/time -v"
  # timeout ${TIMEOUT_LONG} ${DEBUG_TIME} sacct ${JSON_ARGS} ${START} > ${OUT_JSON} &

  # The JSON query runs only when SKIP_JSON is empty; any non-empty value
  # (the string "false" included) skips it.
  # The argument variables are left unquoted on purpose: ARGS holds several
  # options in one string, and JSON_ARGS, P_ARGS and START presumably do too.
  # File names and timeout values are quoted so unusual paths cannot split.
  if [[ -z "${SKIP_JSON}" ]] ; then
    nohup timeout "${TIMEOUT_LONG}" sacct ${JSON_ARGS} ${START} > "${OUT_JSON}" &
  fi

  # these don't take very long
  timeout "${TIMEOUT}" sacct --format="${FIELDS}" ${ARGS} ${START} > "${OUT}" &
  timeout "${TIMEOUT}" sacct --format="${FIELDS}" ${P_ARGS} ${START} > "${OUT_DELIM}" &

  # A bare `wait` blocks until every background job has finished, which
  # includes the JSON query when it was started above.
  wait
  return
}
107140

108141
#######################################
# ConvertDelimiters: rewrite OUT_DELIM in place as quoted, comma-separated text.
# Per line, in this order:
#   1. every COMMA already in the data becomes "|"
#   2. the line is wrapped in single quotes
#   3. every DELTA becomes COMMA (DELTA is presumably the field separator the
#      delimited sacct query was asked to use — confirm against P_ARGS)
# The result is then passed through iconv, which (-c) drops characters it
# cannot convert.
# Globals (read): OUT_DELIM, TMPFILE, COMMA, DELTA
#######################################
ConvertDelimiters() {
  # The output overwrites the input, so the edited text is staged in TMPFILE.
  sed \
    -e "s/${COMMA}/|/g" \
    -e "s/^/'/" \
    -e "s/$/'/" \
    -e "s|${DELTA}|${COMMA}|g" \
    "${OUT_DELIM}" > "${TMPFILE}"
  iconv -c -f us-ascii -t UTF-8//TRANSLIT "${TMPFILE}" > "${OUT_DELIM}"
  rm -f -- "${TMPFILE}"
  return
}
114148

115149
#######################################
# CheckValid: cross-check the collected outputs and, when they agree and are
# complete, move them into the dated directory and make them read-only.
# Globals (read):
#   OUT, OUT_DELIM, OUT_JSON, SKIP_JSON, JSON_TAIL_LINES, COMMA, SINGLE_QUOTE,
#   WARNING, MSG_JOBID_MISMATCH, MSG_INCOMPLETE, TSTAMP, TSTAMP_YMD, GENERATED,
#   SLURM_VERSION, TMPDIR, READ_ONLY, ISATTY, EX_*
# Globals (written):
#   last_jobid_txt, last_jobid_delim (deliberately left non-local, as before)
# Outputs:
#   Mismatch reports go to stdout and to "${WARNING}<pair>" files; the
#   "incomplete" notice goes to /dev/tty when usable, otherwise to stdout.
# Returns:
#   EX_OK on success; the code of the last mismatch detected
#   (EX_TXTJSON_MISMATCH, EX_JSONDELIM_MISMATCH, EX_TXTDELIM_MISMATCH);
#   EX_OVERWRITE when ${TSTAMP_YMD}/${GENERATED} already exists;
#   EX_INCOMPLETE when a required output file is missing or empty.
#######################################
CheckValid() {
  local rc=${EX_OK}
  local last_jobid_json=""

  # The first field of the last line of each file is taken as its last job id.
  last_jobid_txt=$(tail -1 "${OUT}" | awk '{print $1;}')
  last_jobid_delim=$(tail -1 "${OUT_DELIM}" | awk -F "${COMMA}" '{print $1;}' | sed "s/${SINGLE_QUOTE}//g")

  # XXX JSON *not* fatal: See SchedMD ticket# 20797, https://support.schedmd.com/show_bug.cgi?id=20797
  if [[ -z "${SKIP_JSON}" ]] ; then
    last_jobid_json=$(tail "-${JSON_TAIL_LINES}" "${OUT_JSON}" | grep job_id | grep -v '"job_id": 0,' | sed 's/,$//' | sort | uniq | awk '{print $2;}')

    if [[ -n "${last_jobid_json}" ]] ; then
      if [[ "${last_jobid_txt}" != "${last_jobid_json}" ]] ; then
        {
          echo "${MSG_JOBID_MISMATCH}"
          echo " txt: ${last_jobid_txt}"
          echo " json: ${last_jobid_json}"
        } | tee "${WARNING}txt,json"
        rc=${EX_TXTJSON_MISMATCH}
      fi
      if [[ "${last_jobid_json}" != "${last_jobid_delim}" ]] ; then
        {
          echo "${MSG_JOBID_MISMATCH}"
          echo " json: ${last_jobid_json}"
          echo " delim: ${last_jobid_delim}"
        } | tee "${WARNING}json,delim"
        rc=${EX_JSONDELIM_MISMATCH}
      fi
    else
      # ...for TIMEOUT_LONG
      wait
    fi
  fi ## SKIP_JSON

  if [[ "${last_jobid_txt}" != "${last_jobid_delim}" ]] ; then
    {
      echo "${MSG_JOBID_MISMATCH}"
      echo " txt: ${last_jobid_txt}"
      echo " delim: ${last_jobid_delim}"
    } | tee "${WARNING}txt,delim"
    rc=${EX_TXTDELIM_MISMATCH}
  fi

  if [[ "${rc}" -eq ${EX_OK} ]] ; then
    if [[ -e "${TSTAMP_YMD}/${GENERATED}" ]] ; then
      # A sealed result for this date already exists; do not overwrite it.
      rc=${EX_OVERWRITE}
    else
      # if incomplete or in error, leave behind all of the breadcrumb temporary files
      # we record the slurm version as the output format and fields may differ between slurm versions
      # The JSON file is only required when the JSON query was not skipped.
      if [[ -s "${OUT}" && -s "${OUT_DELIM}" ]] \
        && [[ -s "${OUT_JSON}" || -n "${SKIP_JSON}" ]] ; then
        mkdir -p "${TSTAMP_YMD}"
        echo "${TSTAMP}" > "${TSTAMP_YMD}/${GENERATED}"
        sinfo --version | awk '{print $2}' > "${TMPDIR}/${SLURM_VERSION}"
        mv "${TMPDIR}"/* "${TSTAMP_YMD}/"

        # completed successfully, seal it
        ( cd "${TSTAMP_YMD}" && chmod "${READ_ONLY}" * )
        # TMPDIR is expanded now (shell-quoted) so the trap removes this exact path.
        trap "rm -rf -- $(printf '%q' "${TMPDIR}")" 0
      else
        # A redirection kept in a variable is not re-parsed after expansion,
        # so it is spelled out here. ISATTY only says a tty device exists;
        # also check that it can be opened (it cannot without a controlling
        # terminal).
        if [[ -n "${ISATTY}" ]] && ( : >/dev/tty ) 2>/dev/null ; then
          echo "${MSG_INCOMPLETE} See: ${TMPDIR}" >/dev/tty
        else
          echo "${MSG_INCOMPLETE} See: ${TMPDIR}"
        fi
        rc=${EX_INCOMPLETE}
      fi
    fi
  fi
  return ${rc}
}
172211

173212
#######################################
# main: run the collection steps in order and exit with CheckValid's status.
# Globals: EX_OK (read)
# Does not return; always ends in `exit`.
#######################################
main() {
  local rc
  rc=${EX_OK}

  Prep
  CollectSlurmData
  ConvertDelimiters

  # potentially long to return (TIMEOUT_LONG)
  CheckValid
  rc=$?

  exit ${rc}
}
@@ -184,3 +223,4 @@ main $*
184223
exit $?
185224

186225
#python3 src/__main__.py -i${OUT} -t36 -n${CLUSTERNAME} -c368
226+
#vi: set background=dark paste

0 commit comments

Comments
 (0)