-
Notifications
You must be signed in to change notification settings - Fork 3
/
run_with_perf_counters.sh
executable file
·74 lines (64 loc) · 3.06 KB
/
run_with_perf_counters.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
# Which build of the DGEMM benchmark to run; the trial loop executes
# ./dgemm_test_<EXE_VERSION>.exe. Other choices: "default", "MMAP_2MB", "MMAP_1GB".
EXE_VERSION="MMAP"
# Directory that holds the background performance counter monitoring program
PERFCOUNTERDIRECTORY="../Util/periodic-performance-counters"
# The problem size needs to be large enough to give repeatable, near-asymptotic performance
# Using 24 cores on a single Xeon Platinum 8160, performance is close to asymptotic for
# sizes of 12000 or larger.
# The (internal) iteration count should be large enough to distinguish between runs with
# consistently slow performance and runs with slow performance due to external interference.
#
# When using the "periodic-perf-counters" tool to collect performance counter statistics,
# the combination of SIZE and ITERS should be chosen so that each trial runs for at least
# 100 performance counter samples. This allows accurate results when computing deltas
# using the samples nearest the start TSC of iteration 1 (excluding iteration 0) and
# nearest the end of the final iteration.
#
# Most of the results in the paper used SIZE=20000 and ITERS=22, giving an execution
# time of just over 11 seconds per iteration, and about 235 seconds from the start
# of iteration 1 (skipping iteration 0) to the end of iteration 22. This allows
# use of 1 second performance counter sampling with negligible errors due to
# lack of precise start/stop synchronization.
#
# The number of trials should typically be at least 200, since only about one run in
# every 100-200 shows a strong snoop filter conflict. If this is being run on multiple
# servers, only the aggregate (#nodes * NUMTRIALS) needs to be large. Most
# of the results in the paper included 651 trials (21 trials on each of 31 nodes).
#
# Problem size (SIZE), internal iteration count (ITERS) and number of trials
# (NUMTRIALS). Two presets are available, selected by the SHORT_TEST
# environment variable:
#   SHORT_TEST unset or non-zero (default): quick test run --
#       SIZE=10000, ITERS=6, NUMTRIALS=10. These are the values this script
#       has always ended up using, because the short-test assignments used to
#       unconditionally overwrite the full-size ones.
#   SHORT_TEST=0: full-size run -- SIZE=20000 and ITERS=22 (the values used
#       for most of the results in the paper), NUMTRIALS=200.
# Example: SHORT_TEST=0 ./run_with_perf_counters.sh
if [[ "${SHORT_TEST:-1}" != 0 ]]; then
  # for short tests
  SIZE=10000
  ITERS=6
  NUMTRIALS=10
else
  SIZE=20000
  ITERS=22
  NUMTRIALS=200
fi
# Trials are numbered 0..MAXTRIAL
MAXTRIAL=$(( NUMTRIALS - 1 ))
# Number of cores (threads) for MKL to use -- if left unset, the default is
# all available cores.
MKL_NUM_THREADS=24
# Thread placement policy: spread threads across the available cores.
# (The "numactl" command in the trial loop binds to node 0, which keeps all
# threads on the same socket.)
KMP_AFFINITY=scatter
# Both settings must reach the benchmark process through its environment.
export MKL_NUM_THREADS KMP_AFFINITY
# Pick a logical processor to run the background performance monitoring code.
# Overhead is reduced if this is an unused core.
# Pinning to a single logical processor makes it easy to see the overhead
# in the performance monitoring output for the selected logical processor.
PERFCOUNTERPROC=93

# "perf_counters" uses a bunch of local files, so it has to be started from
# its own home directory. Stop here if that directory cannot be entered:
# carrying on would launch ./perf_counters from the wrong place and then run
# the whole benchmark without any counter data being collected.
if ! pushd "$PERFCOUNTERDIRECTORY"; then
  echo "ERROR: cannot enter perf_counters directory '$PERFCOUNTERDIRECTORY'" >&2
  exit 1
fi
# Same reasoning for a missing or non-executable monitor binary: a background
# launch failure would otherwise go unnoticed until the final signal is sent.
if [[ ! -x ./perf_counters ]]; then
  echo "ERROR: ./perf_counters not found or not executable in $PWD" >&2
  exit 1
fi
# Launch in the background, pinned to PERFCOUNTERPROC, and save the PID so a
# signal can be sent to it at job completion.
taskset -c "$PERFCOUNTERPROC" ./perf_counters &
PERF_COUNT_PID=$!
popd
# Run the benchmark once per trial, numbered 0..MAXTRIAL, with both memory and
# CPUs bound to NUMA node 0. Each trial's stdout is written to
# log.<EXE_VERSION>.<NNN>, where NNN is the trial number zero-padded to three
# digits; "time" reports the duration of every trial.
for (( TRIAL = 0; TRIAL <= MAXTRIAL; TRIAL++ )); do
  # Zero-padded label, assigned directly without spawning a subshell
  printf -v LABEL '%.3d' "$TRIAL"
  # A failing trial is reported on stderr but does not stop the remaining
  # trials, matching the previous keep-going behavior.
  time numactl --membind=0 --cpunodebind=0 \
    "./dgemm_test_${EXE_VERSION}.exe" "$SIZE" "$ITERS" > "log.${EXE_VERSION}.${LABEL}" \
    || echo "WARNING: trial ${LABEL} exited with a non-zero status" >&2
done
# All trials are finished: send SIGCONT to the background perf_counters
# process (PID saved at launch) as its cue to dump the collected data, then
# pause so it has time to finish writing before this script exits.
printf '%s\n' "Benchmark done -- signalling perf_counters to dump its output"
kill -s CONT "$PERF_COUNT_PID"
printf '%s\n' "sleeping for 10 seconds to let perf_counters finish writing its output"
sleep 10