7
7
#
# trigger with, for example:
# crontab entry:
# 37 3 1,16 * * sh -c "[ -x ${SACCT_DIR}/bin/collectSacctDB.sh ] && \
#   ( cd ${SACCT_DIR}/$(hostname -s); ${SACCT_DIR}/bin/collectSacctDB.sh )"
#
# To Do:
# - set SACCT_DIR
# - set DEFAULT_INITIALIZED date
# - set timeouts, esp. for json sacct query
# - set breadcrumb recording the previous timeout used, esp. if the previous run had no json output
# - check for timeout_long breadcrumb, increase from TIMEOUT_LONG if found
# - SKIP_JSON=false
# - verify env
# - slurm env & commands appear reasonable
# - refine collect/timeout/wait
# - slice query into smaller time chunks if no json output in a TIMEOUT period
# - instead of sleeping, poll for output, possibly warn, if no progress made
#   then if no json ever collected, leave timeout_long breadcrumb
21
28
22
29
# Strict mode: abort on the first unhandled command failure (-e) and on any
# expansion of an unset variable (-u).
# NOTE(review): pipefail is not enabled.  Turning it on would change the
# behavior of the grep-based job-id pipelines in CheckValid, because grep exits
# non-zero when nothing matches.
set -e
set -u
@@ -48,14 +55,30 @@ findEarliestDBJobRecord() {
48
55
# ---------------------------------------------------------------------------
# Run-time configuration: file names, timestamps, timeouts, exit codes and the
# scratch directory used while a collection is in progress.
# ---------------------------------------------------------------------------

# Name of the file (inside the output directory) that records the Slurm version.
SLURM_VERSION="SlurmVersion"

# Both timestamps come from a single date(1) call so that the directory name
# (TSTAMP_YMD) and the recorded generation time (TSTAMP) always describe the
# same instant, even when the script starts right at midnight.
_COLLECT_STAMPS=$(date '+%Y-%b-%d %FT%T')
TSTAMP_YMD=${_COLLECT_STAMPS%% *}
TSTAMP=${_COLLECT_STAMPS##* }
unset _COLLECT_STAMPS

# JSON collection is skipped whenever this is NON-EMPTY.  The consumers test it
# with -z, so "false" also skips; set it to the empty string to collect JSON.
SKIP_JSON="true"

TIMEOUT_SAFE_MIN=5
# tune for your db responsiveness and size (TIMEOUT_SHORTLONG_MULTIPLIER)
TIMEOUT_SHORTLONG_MULTIPLIER=20
TIMEOUT_SHORT=${TIMEOUT_SAFE_MIN}
TIMEOUT_UNITS="m"
# Durations handed to timeout(1): a number followed by the unit suffix.
TIMEOUT=${TIMEOUT_SHORT}${TIMEOUT_UNITS}
TIMEOUT_LONG=$(( TIMEOUT_SHORT * TIMEOUT_SHORTLONG_MULTIPLIER ))${TIMEOUT_UNITS}

# Exit codes
EX_OK=0
EX_TXTJSON_MISMATCH=1
EX_TXTDELIM_MISMATCH=2
EX_JSONDELIM_MISMATCH=3
EX_OVERWRITE=10
EX_INCOMPLETE=11
EX_PREP_DATE=20

# Per-run scratch area: /tmp/<script name>.<user name>.<pid>
TMPDIR=/tmp/$(basename -- "$0" .sh).$(id -u -n).$$
TMPFILE=${TMPDIR}/delim-conv.$$

MSG_INCOMPLETE="output not complete"
MSG_JOBID_MISMATCH="jobid mismatch in output formats"
GENERATED="Generated"
@@ -70,6 +93,7 @@ DELTA="∆"
70
93
# Literal characters and modes used while rewriting and sealing the output.
COMMA=","
SINGLE_QUOTE="'"
READ_ONLY="0444"

# Empty until Prep detects a terminal; Prep then stores ">/dev/tty" here.
# NOTE(review): a redirection kept in a variable is not re-parsed when the
# variable is expanded, so `echo msg ${ISATTY}` passes the text ">/dev/tty" to
# echo as an ordinary argument instead of redirecting the output.
ISATTY=""

# sacct output columns (name%width) requested for the text and delimited dumps.
FIELDS="jobidraw%9,jobid,priority,qos,partition,nnodes%6,ntasks,submit%24,eligible%24,start%24,end%24,timelimit%24,state,reservation,nodelist,flags%36,submitline%-7,account,user,consumedenergy,consumedenergyraw,failednode%-80"
# Options for the plain-text sacct query; expanded unquoted so they word-split.
ARGS="-a -X "
@@ -83,99 +107,114 @@ START="${START_PREFIX}${INITIALIZED}${START_SUFFIX}"
83
107
84
108
85
109
# Print a diagnostic for Prep: on the controlling terminal when Prep found one
# that can be written to, otherwise (or if that write fails) on stdout.
_PrepSay () {
  if [[ -z "${ISATTY}" ]] || ! { printf '%s\n' "$*" > /dev/tty ; } 2> /dev/null ; then
    printf '%s\n' "$*"
  fi
}

# Prep: one-time setup before any data is collected.
#   Globals read:    EX_PREP_DATE, DEFAULT_INITIALIZED, TMPDIR
#   Globals written: ISATTY (">/dev/tty" when a terminal is usable),
#                    INITIALIZED (defaulted when empty)
#   Exits:           EX_PREP_DATE when date(1) cannot parse INITIALIZED
#   Side effects:    creates TMPDIR
Prep () {
  # -c only proves the device node exists; without a controlling terminal
  # (e.g. under cron) opening it still fails, so actually try to open it.
  if [[ -c /dev/tty ]] && { : > /dev/tty ; } 2> /dev/null ; then
    ISATTY=">/dev/tty"
  fi

  # sanity check
  # The test is made in the `if` itself: with `set -e` active a bare failing
  # `date` would abort the script before a following `$?` check could run, so
  # neither the message nor EX_PREP_DATE would ever be produced.
  if ! date -d "${INITIALIZED}" +%s > /dev/null 2>&1 ; then
    _PrepSay "date failure determining when the db was initialized."
    exit "${EX_PREP_DATE}"
  fi

  if [[ -z "${INITIALIZED}" ]] ; then
    INITIALIZED=${DEFAULT_INITIALIZED}
    _PrepSay "initialized: ${DEFAULT_INITIALIZED}"
    # NOTE(review): START appears to be assembled from INITIALIZED before Prep
    # runs; if so, this default does not reach the sacct queries -- confirm.
  fi

  mkdir -p -- "${TMPDIR}"
}
98
125
99
126
# CollectSlurmData: run the sacct queries in parallel and block until all of
# them have finished or been killed by timeout(1).
#   Globals read: SKIP_JSON, TIMEOUT, TIMEOUT_LONG, FIELDS, ARGS, P_ARGS,
#                 JSON_ARGS, START, OUT, OUT_DELIM, OUT_JSON
#   Outputs:      OUT (text), OUT_DELIM (delimited) and, unless SKIP_JSON is
#                 non-empty, OUT_JSON
# ARGS, P_ARGS, JSON_ARGS and START are expanded unquoted on purpose: each
# holds several command-line words.  File names and the field list are quoted.
CollectSlurmData () {
  # json takes much (much) longer
  # DEBUG_TIME="/usr/bin/time -v"
  # timeout ${TIMEOUT_LONG} ${DEBUG_TIME} sacct ${JSON_ARGS} ${START} > ${OUT_JSON} &
  if [[ -z "${SKIP_JSON}" ]] ; then
    nohup timeout "${TIMEOUT_LONG}" sacct ${JSON_ARGS} ${START} > "${OUT_JSON}" &
  fi

  # these don't take very long
  timeout "${TIMEOUT}" sacct --format="${FIELDS}" ${ARGS} ${START} > "${OUT}" &
  timeout "${TIMEOUT}" sacct --format="${FIELDS}" ${P_ARGS} ${START} > "${OUT_DELIM}" &

  # A bare `wait` blocks on every background job, including the JSON query
  # when it was started, and returns 0 regardless of how the jobs exited;
  # completeness is judged afterwards from the output files.
  wait
}
107
140
108
141
# ConvertDelimiters: rewrite the delimited sacct dump (OUT_DELIM) in place.
#   Each line has its embedded commas turned into '|', is wrapped in single
#   quotes, and then has every DELTA field separator replaced by a comma.
#   The result is passed through iconv (us-ascii -> UTF-8, -c drops anything
#   it cannot convert) and written back to OUT_DELIM.
#   Globals read: COMMA, DELTA, OUT_DELIM, TMPFILE
ConvertDelimiters () {
  # iconv cannot read and write the same file, hence the staging copy.
  local staged="${OUT_DELIM}${DELTA}"

  # One sed applying the four substitutions in order replaces the former
  # three-process pipeline plus temp file and second sed pass.
  sed -e "s/${COMMA}/|/g" \
      -e "s/^/'/" \
      -e "s/\$/'/" \
      -e "s|${DELTA}|${COMMA}|g" \
      "${OUT_DELIM}" > "${staged}"

  iconv -c -f us-ascii -t UTF-8//TRANSLIT "${staged}" > "${OUT_DELIM}"

  # TMPFILE is no longer written here; it is still removed in case an earlier
  # run left one behind.
  rm -f "${TMPFILE}" "${staged}"
}
114
148
115
149
# CheckValid: cross-check the collected outputs and, when they agree and are
# complete, move them into the dated output directory and make them read-only.
#   Globals read:    OUT, OUT_DELIM, OUT_JSON, SKIP_JSON, JSON_TAIL_LINES,
#                    COMMA, SINGLE_QUOTE, WARNING, MSG_JOBID_MISMATCH,
#                    MSG_INCOMPLETE, TSTAMP, TSTAMP_YMD, GENERATED,
#                    SLURM_VERSION, TMPDIR, READ_ONLY, ISATTY, EX_*
#   Globals written: last_jobid_txt, last_jobid_delim (left global as before)
#   Returns:
#     EX_OK                  outputs sealed in TSTAMP_YMD
#     EX_TXTJSON_MISMATCH    last job id differs between text and json
#     EX_JSONDELIM_MISMATCH  last job id differs between json and delimited
#     EX_TXTDELIM_MISMATCH   last job id differs between text and delimited
#     EX_OVERWRITE           TSTAMP_YMD/GENERATED already exists
#     EX_INCOMPLETE          a required output file is empty or missing
#   When several mismatches occur, the code of the last one detected wins.
CheckValid () {
  local rc=${EX_OK}
  local last_jobid_json=""
  local incomplete_msg=""

  last_jobid_txt=$(tail -n 1 "${OUT}" | awk '{print $1;}')
  last_jobid_delim=$(tail -n 1 "${OUT_DELIM}" \
    | awk -F "${COMMA}" '{print $1;}' \
    | sed "s/${SINGLE_QUOTE}//g")

  # XXX JSON *not* fatal: See SchedMD ticket# 20797, https://support.schedmd.com/show_bug.cgi?id=20797
  if [[ -z "${SKIP_JSON}" ]] ; then
    last_jobid_json=$(tail -n "${JSON_TAIL_LINES}" "${OUT_JSON}" \
      | grep job_id \
      | grep -v '"job_id": 0,' \
      | sed 's/,$//' \
      | sort -u \
      | awk '{print $2;}')

    if [[ -n "${last_jobid_json}" ]] ; then
      if [[ "${last_jobid_txt}" != "${last_jobid_json}" ]] ; then
        {
          echo "${MSG_JOBID_MISMATCH}"
          echo "txt: ${last_jobid_txt}"
          echo "json: ${last_jobid_json}"
        } | tee "${WARNING}txt,json"
        rc=${EX_TXTJSON_MISMATCH}
      fi
      if [[ "${last_jobid_json}" != "${last_jobid_delim}" ]] ; then
        {
          echo "${MSG_JOBID_MISMATCH}"
          echo "json: ${last_jobid_json}"
          echo "delim: ${last_jobid_delim}"
        } | tee "${WARNING}json,delim"
        rc=${EX_JSONDELIM_MISMATCH}
      fi
    else
      # ...for TIMEOUT_LONG
      # NOTE(review): the json tail is not re-read after this wait.
      wait
    fi
  fi # SKIP_JSON

  if [[ "${last_jobid_txt}" != "${last_jobid_delim}" ]] ; then
    {
      echo "${MSG_JOBID_MISMATCH}"
      echo "txt: ${last_jobid_txt}"
      echo "delim: ${last_jobid_delim}"
    } | tee "${WARNING}txt,delim"
    rc=${EX_TXTDELIM_MISMATCH}
  fi

  if [[ "${rc}" -eq "${EX_OK}" ]] ; then
    if [[ -e "${TSTAMP_YMD}/${GENERATED}" ]] ; then
      rc=${EX_OVERWRITE}
    # if incomplete or in error, leave behind all of the breadcrumb temporary files
    # we record the slurm version as the output format and fields may differ between slurm versions
    elif [[ -s "${OUT}" && -s "${OUT_DELIM}" ]] \
      && [[ -s "${OUT_JSON}" || -n "${SKIP_JSON}" ]] ; then
      mkdir -p -- "${TSTAMP_YMD}"
      echo "${TSTAMP}" > "${TSTAMP_YMD}/${GENERATED}"
      sinfo --version | awk '{print $2}' > "${TMPDIR}/${SLURM_VERSION}"
      mv -- "${TMPDIR}"/* "${TSTAMP_YMD}/"

      # completed successfully, seal it
      # chmod is given explicit paths rather than `cd dir ; chmod *`, so a
      # failed cd can never cause the current directory to be chmod-ed.
      chmod "${READ_ONLY}" "${TSTAMP_YMD}"/*
      trap 'rm -rf -- "${TMPDIR:?}"' EXIT
    else
      incomplete_msg="${MSG_INCOMPLETE} See: ${TMPDIR}"
      # ISATTY holds the text ">/dev/tty"; expanding it after echo would only
      # print that text.  Redirect explicitly, falling back to stdout when no
      # terminal was detected or the terminal cannot be written.
      if [[ -z "${ISATTY}" ]] \
        || ! { echo "${incomplete_msg}" > /dev/tty ; } 2> /dev/null ; then
        echo "${incomplete_msg}"
      fi
      rc=${EX_INCOMPLETE}
    fi
  fi

  return "${rc}"
}
172
211
173
212
# main: run the collection steps in order and exit with CheckValid's status.
#   Globals read: EX_OK
#   Exits:        with the return code of CheckValid (see the EX_* codes)
main () {
  local rc=${EX_OK}

  Prep
  CollectSlurmData
  ConvertDelimiters
  # CheckValid is potentially long to return (TIMEOUT_LONG).
  # It is called as a plain command on purpose: inside a `CheckValid || ...`
  # list the shell would ignore `set -e` for everything the function runs.
  CheckValid
  rc=$?
  exit "${rc}"
}
@@ -184,3 +223,4 @@ main $*
184
223
exit $?
185
224
186
225
# python3 src/__main__.py -i${OUT} -t36 -n${CLUSTERNAME} -c368
226
+ # vi: set background=dark paste
0 commit comments