Add microbenchmark for FC + add fusion (#20780)

anko-intel · web-flow · commit 9653ab402de4 · 2021-12-23T13:38:52.000+01:00
+ utils scripts to run it
diff --git a/benchmark/python/dnnl/fc_add.py b/benchmark/python/dnnl/fc_add.py
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import time
+import gc
+import sys
+import mxnet as mx
+from mxnet.gluon import nn
+from mxnet.contrib import quantization
+
+# Benchmark cases as (input shape, num_hidden); shape[1] is also passed as in_units of fc0.
+sizes = [
+    ((  1, 224),   512),
+    ((  1, 224),  4096),
+    (( 16, 1024), 1024),
+    (( 32, 4096), 1024),
+    (( 32, 4096), 4096),
+    ((512,  512), 4096)]
+
+rounds = 1000  # timed forward passes per case
+warmup = 10  # extra forward passes executed before the timer starts
+
+test_header = "--no_test_header" not in sys.argv  # print the test title above each table
+table_header = "--no_table_header" not in sys.argv  # print the table header rows
+table_left_colums = "--no_size_column" not in sys.argv  # print the Shape and Hidden columns
+dump_graph = "--dump_graph" in sys.argv  # export the last network of every benchmark
+
+def dump_graph_fn(net, postfix):  # export `net` as /tmp/fc_add_<postfix> when --dump_graph is given
+    if dump_graph:
+        net.export("/tmp/fc_add_" + postfix)
+
+def operator_string(elemwise_add):  # label of the add variant, used in headers and dump names
+    return 'elemwise_add' if elemwise_add else 'npi_add'
+
+def print_header(header):  # print the test title and the pipe-delimited table header
+    print("\n")
+    print(header if test_header else "", "\n")  # title is replaced by "" with --no_test_header
+    if table_header:
+        if table_left_colums:
+            print("|    Shape    | Hidden | Mean [ms] |" )
+            print("|------------:|-------:|----------:|" )
+        else:  # --no_size_column: only the result column is printed
+            print(" Mean [ms] |" )
+            print("----------:|" )
+
+def print_value(shape, hidden, mean):  # print one table row; `mean` is shown with 3 decimals
+    if table_left_colums:
+        print("| ({:4},{:4}) | {:6} | {:9.3f} |".format(shape[0], shape[1], hidden, mean))
+    else:  # --no_size_column: print the result cell only
+        print(" {:9.3f} |".format(mean))
+
+
+def measure(net, data0, data1, data2, shape, nhid):
+    mx.nd.waitall()
+    gc.collect()
+    gc.disable()
+    for i in range(rounds + warmup):
+        if i == warmup:
+            start_time = time.time()
+        o = net(data0, data1, data2)
+        o.wait_to_read()
+    end_time = time.time()
+    run_time = (end_time - start_time)
+    print_value(shape, nhid, 1000 * run_time / rounds)
+    gc.enable()
+
+
+class FCWithSum(nn.HybridBlock):  # two Dense layers whose outputs are summed with a third input
+    def __init__(self, num_in, num_hidden, elemwise_add, **kwargs):
+        super(FCWithSum, self).__init__(**kwargs)
+        self.fc0 = nn.Dense(units=num_hidden, in_units=num_in)
+        self.fc1 = nn.Dense(units=num_hidden)  # in_units is not given here, unlike fc0
+        self.elemwise_add = elemwise_add  # True: mx.nd.elemwise_add, False: `+` on the arrays
+
+    def forward(self, data0, data1, data2):  # returns fc1(data1) + (data2 + fc0(data0))
+        _fc0 = self.fc0(data0)
+        _fc1 = self.fc1(data1)
+        if  self.elemwise_add:  # operands are converted np -> nd and the result back to np
+            _sum0 = mx.nd.elemwise_add(data2.as_nd_ndarray(), _fc0.as_nd_ndarray()).as_np_ndarray()
+            _sum1 = mx.nd.elemwise_add(_fc1.as_nd_ndarray(), _sum0.as_nd_ndarray()).as_np_ndarray()
+        else:
+            _sum0 = data2 + _fc0
+            _sum1 = _fc1 + _sum0
+        return _sum1
+
+def benchmark_float(elemwise_add):  # time the float network for every entry of `sizes`
+    header = operator_string(elemwise_add) + ', float'
+    print_header(header)
+    for shape, nhid in sizes:
+        net = FCWithSum(shape[1], nhid, elemwise_add)
+        net.initialize()
+        net.hybridize(static_alloc=True, static_shape=True)
+        data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        shape2 = (shape[0], nhid)  # shape of the third input that is added to the Dense outputs
+        data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0)
+        net.optimize_for(data0, data1, data2, backend='ONEDNN')
+        measure(net, data0, data1, data2, shape, nhid)
+    dump_graph_fn(net, operator_string(elemwise_add) + '_float')  # only the last net is dumped
+
+class CalibIter(mx.io.DataIter):  # iterator over one fixed batch; not used by the code shown here
+    def __init__(self, batch, data_shape, batch_size):
+        super(CalibIter, self).__init__(batch_size)
+        self.label_shape = (batch_size,)
+        self.data_shape = data_shape
+        if isinstance(data_shape, tuple):  # a single shape is wrapped as one input named 'data'
+            self.provide_data = [('data', data_shape)]
+        else:  # any other value is stored as given
+            self.provide_data = data_shape
+        self.provide_label = []
+        self.batch = batch
+    def __iter__(self):  # yields the stored batch exactly once per iteration
+        yield self.batch
+
+def benchmark_int8(quantize_mode, quantize_granularity, elemwise_add):  # time the quantized net
+    header = operator_string(elemwise_add) + ', mode = ' + quantize_mode + \
+             ', granularity = ' + quantize_granularity
+    print_header(header)
+    for shape, nhid in sizes:
+        net = FCWithSum(shape[1], nhid, elemwise_add)
+        net.initialize()
+        net.hybridize(static_alloc=True, static_shape=True)
+        data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0)
+        shape2 = (shape[0], nhid)  # shape of the third input that is added to the Dense outputs
+        data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0)
+        data = mx.gluon.data.ArrayDataset(data0, data1, data2)  # benchmark inputs double as calibration data
+        calib_data = mx.gluon.data.DataLoader(data, batch_size=1)
+        net = quantization.quantize_net(net,
+                                        device=mx.cpu(),
+                                        exclude_layers=None,
+                                        exclude_operators=None,
+                                        calib_mode='naive',
+                                        calib_data=calib_data,
+                                        num_calib_batches=1,
+                                        quantize_mode=quantize_mode,
+                                        quantize_granularity=quantize_granularity
+                                        )
+        net.hybridize(static_alloc=True, static_shape=True)  # hybridize the net returned by quantize_net
+        measure(net, data0, data1, data2, shape, nhid)
+    dump_graph_fn(net, operator_string(elemwise_add) + \
+                    '_' + str(quantize_mode) + '_' + str(quantize_granularity))  # last net only
+
+for elemwise_add in [True, False]:  # float runs: elemwise_add first, then npi_add
+    benchmark_float(elemwise_add)
+
+for quantize_mode in ['smart', 'full']:  # int8 runs: every mode / granularity / operator combination
+    for quantize_granularity in ['tensor-wise', 'channel-wise']:
+        for elemwise_add in [True, False]:
+            benchmark_int8(quantize_mode, quantize_granularity, elemwise_add)
diff --git a/benchmark/python/dnnl/run.sh b/benchmark/python/dnnl/run.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script for running a python benchmark with the OMP parameters properly set for it
+
+check_parametrs() {  # print usage to stderr and fail when no arguments are left
+	if [ "$#" -eq 0 ] ; then
+		echo "Please give python script to run as parameter." >&2
+		echo "Optionally you can give number of threads to use and python scripts parameters:" >&2
+		echo "    $(basename "$0")  [num_threads] python_script [python script parameters]" >&2
+		exit 1  # non-zero status so that callers can detect the usage error
+	fi
+}
+
+check_parametrs "$@"
+# Default thread count: number of physical cores = sockets * cores per socket.
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+# An all-digit first argument is the thread count; it is consumed before the script name.
+integer_reg='^[0-9]+$'
+if [[ $1 =~ $integer_reg ]] ; then
+	if (($1 > $NUM_CORES)); then
+		echo >&2
+		echo "WARNING: given number of threads = $1" \
+			" is greater than number of physical cores = $NUM_CORES." >&2
+		echo >&2
+	fi
+	NUM_CORES=$1
+	shift
+	check_parametrs "$@"
+fi
+
+CORES={0}:${NUM_CORES}:1  # OMP_PLACES interval: NUM_CORES places from CPU 0 with stride 1
+# Echo the command, then run python directly: "$@" keeps every argument intact without eval.
+INSTRUCTION="OMP_NUM_THREADS=${NUM_CORES} OMP_PROC_BIND=TRUE OMP_PLACES=${CORES} python3 -u $*"
+echo "$INSTRUCTION" >&2
+OMP_NUM_THREADS=${NUM_CORES} OMP_PROC_BIND=TRUE OMP_PLACES=${CORES} python3 -u "$@"
diff --git a/benchmark/python/dnnl/run_per_thread.sh b/benchmark/python/dnnl/run_per_thread.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script for running a python benchmark for a range of OMP thread counts
+
+
+help_and_exit() {  # print usage to stderr and stop with a failure status
+	echo "Usage:" >&2
+	echo "    $(basename "$0")  [start_num_threads step_num_threads end_num_threads] python_script [python script parameters]" >&2
+	echo "Number of threads range parameters and python script parameters are optional." >&2
+	exit 1
+}
+
+if [ "$#" -eq 0 ] ; then
+	help_and_exit
+fi
+# Default range: 1 .. number of physical cores (sockets * cores per socket), step 1.
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+
+NT_START=1
+NT_STEP=1
+NT_END=$NUM_CORES
+# An all-digit first argument means that an explicit "start step end" range is given.
+integer_reg='^[0-9]+$'
+signed_integer_reg='^[+-]?[0-9]+$'  # at most one sign; the step may be negative
+if [[ $1 =~ $integer_reg ]] ; then
+	if [[ $2 =~ $signed_integer_reg ]] && [[ $3 =~ $integer_reg ]]; then
+		NT_START=$1
+		NT_STEP=$2
+		NT_END=$3
+		shift 3
+		if [ "$#" -eq 0 ] ; then
+			help_and_exit
+		fi
+	else
+		echo "Provide 3 numbers for threads range: start, step and the end." >&2
+		help_and_exit
+	fi
+fi
+
+NT_SEQUENCE=`seq $NT_START $NT_STEP $NT_END`
+if [ -z "$NT_SEQUENCE" ]; then
+	echo "Given threads range produces an empty sequence." >&2
+	help_and_exit
+else
+	echo "Start python script $1 for following number of threads:"  >&2
+	echo $NT_SEQUENCE  >&2
+fi
+TMP_FILES=""  # reset so that a value inherited from the environment is never pasted
+RUN_SCRIPT="$(dirname "$0")/run.sh"
+for NT in $NT_SEQUENCE;
+do
+	TMP_FILE=/tmp/_result_${NT}.txt
+	echo  1>${TMP_FILE}  # every column file starts with an empty line
+	if [[ $NT -eq $NT_START ]]; then  # first column keeps the size column and the test headers
+		echo "NUM_THREADS = $NT" 1>>${TMP_FILE}
+		"$RUN_SCRIPT" $NT "$@" 1>>${TMP_FILE}
+	else  # later columns hold only the thread count and the measured values
+		echo " $NT" 1>>${TMP_FILE}
+		"$RUN_SCRIPT" $NT "$@" --no_size_column --no_test_header 1>>${TMP_FILE}
+	fi
+	TMP_FILES+=" ${TMP_FILE}"
+done
+paste -d "" ${TMP_FILES}  # join the per-thread columns side by side into one table