From 11a24e0d0ef0750bd6ab1c482b15098dce0356b6 Mon Sep 17 00:00:00 2001
From: dkorchevgithub <63178227+dkorchevgithub@users.noreply.github.com>
Date: Wed, 19 Aug 2020 20:10:23 -0700
Subject: [PATCH 1/3] initial code for ONNX (#9)

* adding ONNX related code

* updating predict

* running fixed batch size ONNX

* fixed batch size works for separate index input
---
 recommendation/dlrm/pytorch/python/main.py | 55 +++++++++++++
 recommendation/dlrm/pytorch/run_common.sh  | 29 ++++---
 .../python/backend_onnxruntime.py          | 78 +++++++++++++++++++
 3 files changed, 153 insertions(+), 9 deletions(-)
 create mode 100755 v0.5/recommendation/python/backend_onnxruntime.py

diff --git a/recommendation/dlrm/pytorch/python/main.py b/recommendation/dlrm/pytorch/python/main.py
index 8ed44847e..4dc294cfa 100755
--- a/recommendation/dlrm/pytorch/python/main.py
+++ b/recommendation/dlrm/pytorch/python/main.py
@@ -77,6 +77,24 @@
         "model": "dlrm",
         "max-batchsize": 2048,
     },
+    "dlrm-kaggle-onnxruntime": {
+        "dataset": "kaggle",
+        "inputs": "continuous and categorical features",
+        "outputs": "probability",
+        "backend": "onnxruntime",
+        "model": "dlrm",
+        "max-batchsize": 128,
+    },
+    "dlrm-terabyte-onnxruntime": {
+        "dataset": "terabyte",
+        "inputs": "continuous and categorical features",
+        "outputs": "probability",
+        "backend": "onnxruntime",
+        "model": "dlrm",
+        "max-batchsize": 2048,
+    },
+
+
 }

 SCENARIO_MAP = {
@@ -196,6 +214,43 @@ def get_backend(backend, dataset, max_ind_range, data_sub_sample_rate, use_gpu):
         else:
             raise ValueError("only kaggle|terabyte dataset options are supported")

+    elif backend == "onnxruntime":
+        from backend_onnxruntime import BackendOnnxruntime
+
+        # NOTE: pass model parameters here; the following options are available
+        if dataset == "kaggle":
+            # 1. Criteo Kaggle Display Advertising Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
+            backend = BackendOnnxruntime(
+                m_spa=16,
+                ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572]),
+                ln_bot=np.array([13,512,256,64,16]),
+                ln_top=np.array([367,512,256,1]),
+                use_gpu=use_gpu
+            )
+        elif dataset == "terabyte":
+            if max_ind_range == 10000000:
+                # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-ind-range=10000000)
+                backend = BackendOnnxruntime(
+                    m_spa=64,
+                    ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61,9758201,1333352,313829,10,2208,11156,122,4,970,14,9994222,7267859,9946608,415421,12420,101,36]),
+                    ln_bot=np.array([13,512,256,64]),
+                    ln_top=np.array([415,512,512,256,1]),
+                    use_gpu=use_gpu
+                )
+            elif max_ind_range == 40000000:
+                # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-ind-range=40000000)
+                backend = BackendOnnxruntime(
+                    m_spa=128,
+                    ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36]),
+                    ln_bot=np.array([13,512,256,128]),
+                    ln_top=np.array([479,1024,1024,512,256,1]),
+                    use_gpu=use_gpu
+                )
+            else:
+                raise ValueError("only --max-ind-range 10M or 40M is supported")
+        else:
+            raise ValueError("only kaggle|terabyte dataset options are supported")
+
     else:
         raise ValueError("unknown backend: " + backend)
     return backend
diff --git a/recommendation/dlrm/pytorch/run_common.sh b/recommendation/dlrm/pytorch/run_common.sh
index 39c051a5d..99c7e3570 100755
--- a/recommendation/dlrm/pytorch/run_common.sh
+++ b/recommendation/dlrm/pytorch/run_common.sh
@@ -23,21 +23,22 @@ device="cpu"

 for i in $* ; do
     case $i in
-        pytorch) backend=$i; shift;;
+        pytorch|onnxruntime) backend=$i; shift;;
         dlrm) model=$i; shift;;
         kaggle|terabyte) dataset=$i; shift;;
         cpu|gpu) device=$i; shift;;
     esac
 done
+
 # debugging
-# echo $backend
-# echo $model
-# echo $dataset
-# echo $device
-# echo $MODEL_DIR
-# echo $DATA_DIR
-# echo $DLRM_DIR
-# echo $EXTRA_OPS
+echo $backend
+echo $model
+echo $dataset
+echo $device
+echo $MODEL_DIR
+echo $DATA_DIR
+echo $DLRM_DIR
+echo $EXTRA_OPS

 if [ $device == "cpu" ] ; then
     export CUDA_VISIBLE_DEVICES=""
@@ -47,6 +48,7 @@ else
 fi

 name="$model-$dataset-$backend"
+echo $name

 #
 # pytorch
@@ -59,6 +61,15 @@ if [ $name == "dlrm-terabyte-pytorch" ] ; then
     model_path="$MODEL_DIR/dlrm_terabyte.pytorch"
     profile=dlrm-terabyte-pytorch
 fi
+if [ $name == "dlrm-kaggle-onnxruntime" ] ; then
+    model_path="$MODEL_DIR/dlrm_kaggle.onnxruntime"
+    profile=dlrm-kaggle-onnxruntime
+fi
+if [ $name == "dlrm-terabyte-onnxruntime" ] ; then
+    model_path="$MODEL_DIR/dlrm_terabyte.onnxruntime"
+    profile=dlrm-terabyte-onnxruntime
+fi
+
 # debugging
 # echo $model_path
 # echo $profile
diff --git a/v0.5/recommendation/python/backend_onnxruntime.py b/v0.5/recommendation/python/backend_onnxruntime.py
new file mode 100755
index 000000000..f756d7d0b
--- /dev/null
+++ b/v0.5/recommendation/python/backend_onnxruntime.py
@@ -0,0 +1,78 @@
+"""
+onnxruntime backend (https://github.com/microsoft/onnxruntime)
+"""
+
+# pylint: disable=unused-argument,missing-docstring,useless-super-delegation
+
+import onnxruntime as rt
+import numpy as np
+import backend
+import torch
+
+
+class BackendOnnxruntime(backend.Backend):
+    def __init__(self, m_spa, ln_emb, ln_bot, ln_top, use_gpu=False, mini_batch_size=1):
+        super(BackendOnnxruntime, self).__init__()
+
+    def version(self):
+        return rt.__version__
+
+    def name(self):
+        """Name of the runtime."""
+        return "onnxruntime"
+
+    def load(self, model_path, inputs=None, outputs=None):
+
+        inputs = None
+        outputs = None
+        print("onnx load", model_path, inputs, outputs)
+        """Load model and find input/outputs from the model file."""
+        opt = rt.SessionOptions()
+        # enable level 3 optimizations
+        # FIXME: enable below once onnxruntime 0.5 is released
+        # opt.set_graph_optimization_level(3)
+        self.sess = rt.InferenceSession(model_path, opt)
+
+        # get input and output names
+#        if inputs is None:
+        self.inputs = [meta.name for meta in self.sess.get_inputs()]
+#        else:
+#            self.inputs = inputs
+
+#        if outputs is None:
+        self.outputs = [meta.name for meta in self.sess.get_outputs()]
+#        else:
+#            self.outputs = outputs
+
+        print("inputs", self.inputs)
+        print("outputs", self.outputs)
+        #self.outputs = ["predict"]
+
+        return self
+
+    def predict(self, batch_dense_X, batch_lS_o, batch_lS_i):
+        #print("onnx predict")
+        """Run the prediction."""
+
+        dict_inputs = {}
+        # dict_inputs[self.inputs[0]] = batch_dense_X.numpy().astype(np.float32)
+        # dict_inputs[self.inputs[1]] = batch_lS_o.numpy().astype(np.int64)
+        # dict_inputs[self.inputs[2]] = batch_lS_i.numpy().astype(np.int64)
+
+        ind = 0
+
+        for i in self.inputs:
+
+            if "input.1" == i:
+                dict_inputs[i] = batch_dense_X.numpy().astype(np.float32)
+
+            elif "lS_o" == i:
+                dict_inputs[i] = batch_lS_o.numpy().astype(np.int64)
+
+            else:
+                dict_inputs[i] = batch_lS_i[ind].numpy().astype(np.int64)
+                ind = ind + 1
+
+        prediction = self.sess.run(output_names=self.outputs, input_feed=dict_inputs)
+        # print("prediction", prediction)
+
+        return torch.tensor(prediction, requires_grad=False).view(-1,1)

From 3f69366cdd33177fd3d0b7501601c505efa0fc1b Mon Sep 17 00:00:00 2001
From: mnaumovfb
Date: Tue, 25 Aug 2020 12:03:17 -0700
Subject: [PATCH 2/3] Some adjustments to onnxruntime backend

---
 recommendation/dlrm/pytorch/run_common.sh | 21 ++++---
 .../python/backend_onnxruntime.py         | 59 ++++++++++---------
 2 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/recommendation/dlrm/pytorch/run_common.sh b/recommendation/dlrm/pytorch/run_common.sh
index 99c7e3570..8cb4a4671 100755
--- a/recommendation/dlrm/pytorch/run_common.sh
+++ b/recommendation/dlrm/pytorch/run_common.sh
@@ -29,16 +29,15 @@ for i in $* ; do
         cpu|gpu) device=$i; shift;;
     esac
 done
-
 # debugging
-echo $backend
-echo $model
-echo $dataset
-echo $device
-echo $MODEL_DIR
-echo $DATA_DIR
-echo $DLRM_DIR
-echo $EXTRA_OPS
+# echo $backend
+# echo $model
+# echo $dataset
+# echo $device
+# echo $MODEL_DIR
+# echo $DATA_DIR
+# echo $DLRM_DIR
+# echo $EXTRA_OPS

 if [ $device == "cpu" ] ; then
     export CUDA_VISIBLE_DEVICES=""
@@ -47,8 +46,8 @@ else
     extra_args="--use-gpu"
 fi
 name="$model-$dataset-$backend"
-
-echo $name
+# debugging
+# echo $name

 #
 # pytorch
diff --git a/v0.5/recommendation/python/backend_onnxruntime.py b/v0.5/recommendation/python/backend_onnxruntime.py
index f756d7d0b..189484da8 100755
--- a/v0.5/recommendation/python/backend_onnxruntime.py
+++ b/v0.5/recommendation/python/backend_onnxruntime.py
@@ -22,46 +22,36 @@ def name(self):
         return "onnxruntime"

     def load(self, model_path, inputs=None, outputs=None):
-
-        inputs = None
-        outputs = None
-        print("onnx load", model_path, inputs, outputs)
         """Load model and find input/outputs from the model file."""
         opt = rt.SessionOptions()
         # enable level 3 optimizations
         # FIXME: enable below once onnxruntime 0.5 is released
         # opt.set_graph_optimization_level(3)
+        # print("onnx load", model_path, inputs, outputs)
         self.sess = rt.InferenceSession(model_path, opt)

-        # get input and output names
-#        if inputs is None:
-        self.inputs = [meta.name for meta in self.sess.get_inputs()]
-#        else:
-#            self.inputs = inputs
-
-#        if outputs is None:
-        self.outputs = [meta.name for meta in self.sess.get_outputs()]
-#        else:
-#            self.outputs = outputs
-
-        print("inputs", self.inputs)
-        print("outputs", self.outputs)
-        #self.outputs = ["predict"]
+        # get input and output names
+        if True: #not inputs:
+            self.inputs = [meta.name for meta in self.sess.get_inputs()]
+        else:
+            self.inputs = inputs
+        if True: #not outputs:
+            self.outputs = [meta.name for meta in self.sess.get_outputs()]
+        else:
+            self.outputs = outputs
         return self

     def predict(self, batch_dense_X, batch_lS_o, batch_lS_i):
-        #print("onnx predict")
         """Run the prediction."""
+        # print("onnx predict")
+        # print(self.inputs)
+        # print(self.outputs)

         dict_inputs = {}
-        # dict_inputs[self.inputs[0]] = batch_dense_X.numpy().astype(np.float32)
-        # dict_inputs[self.inputs[1]] = batch_lS_o.numpy().astype(np.int64)
-        # dict_inputs[self.inputs[2]] = batch_lS_i.numpy().astype(np.int64)

+        # Dmitriy's approach to build dictionaries
         ind = 0
-
         for i in self.inputs:
-
             if "input.1" == i:
                 dict_inputs[i] = batch_dense_X.numpy().astype(np.float32)

             elif "lS_o" == i:
                 dict_inputs[i] = batch_lS_o.numpy().astype(np.int64)

             else:
                 dict_inputs[i] = batch_lS_i[ind].numpy().astype(np.int64)
                 ind = ind + 1
+        '''
+        # Maxim's approach to build dictionaries
+        dict_inputs[self.inputs[0]] = batch_dense_X.numpy().astype(np.float32)
+        dict_inputs[self.inputs[1]] = batch_lS_o.numpy().astype(np.int64)
+        if False: #torch.is_tensor(batch_lS_i): # approach 1: tensor
+            dict_inputs[self.inputs[2]] = batch_lS_i.numpy().astype(np.int64)
+        else: # approach 2: list
+            for j in range(26): # 26 sparse features
+                dict_inputs[self.inputs[j+2]] = batch_lS_i[j].numpy().astype(np.int64)
+        '''
+        # predict and return output
+        # print(dict_inputs)
+        output = self.sess.run(output_names=self.outputs, input_feed=dict_inputs)
+        output = torch.tensor(output, requires_grad=False).view(-1, 1)
+        # print("output", output)
+        # print("output.shape", output.shape)

-        prediction = self.sess.run(output_names=self.outputs, input_feed=dict_inputs)
-        # print("prediction", prediction)
-
-        return torch.tensor(prediction, requires_grad=False).view(-1,1)
+        return output

From cf0d0b3e71d0889bf24e3e75c8d22c9895e808f0 Mon Sep 17 00:00:00 2001
From: mnaumovfb
Date: Fri, 28 Aug 2020 01:48:22 -0700
Subject: [PATCH 3/3] Fixing a latent bug when a fixed number of samples is
 aggregated and --mlperf-bin-loader is used

---
 recommendation/dlrm/pytorch/python/criteo.py      |  5 ++++-
 v0.5/recommendation/python/backend_onnxruntime.py | 11 ++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/recommendation/dlrm/pytorch/python/criteo.py b/recommendation/dlrm/pytorch/python/criteo.py
index de08b82f1..77c7184c2 100755
--- a/recommendation/dlrm/pytorch/python/criteo.py
+++ b/recommendation/dlrm/pytorch/python/criteo.py
@@ -253,7 +253,10 @@ def load_query_samples(self, sample_list):

             s = self.random_offsets[l]
             e = self.random_offsets[l+1]
-            ls = [self.test_data[i] for i in range(s, e)]
+            if self.use_mlperf_bin_loader and self.samples_to_aggregate > 1:
+                ls = [self.test_data[l]]
+            else:
+                ls = [self.test_data[i] for i in range(s, e)]

             if self.use_mlperf_bin_loader:
                 # NOTE: in binary dataset the values are transformed
                 ls_t = list(zip(*ls))
diff --git a/v0.5/recommendation/python/backend_onnxruntime.py b/v0.5/recommendation/python/backend_onnxruntime.py
index 189484da8..6c1198a84 100755
--- a/v0.5/recommendation/python/backend_onnxruntime.py
+++ b/v0.5/recommendation/python/backend_onnxruntime.py
@@ -45,7 +45,16 @@ def predict(self, batch_dense_X, batch_lS_o, batch_lS_i):
         # print("onnx predict")
         # print(self.inputs)
         # print(self.outputs)
-
+
+        '''
+        incoming_bs = batch_dense_X.shape[0]
+        model_saved_bs = 2048
+        if (incoming_bs != model_saved_bs):
+            print("WARNING: mismatch between incoming " + str(incoming_bs) + " and model saved " + str(model_saved_bs) + " mini-batch size")
+            fake_output = torch.zeros(size=(incoming_bs,1), dtype=torch.float32)
+            return fake_output
+        '''
+
         dict_inputs = {}

         # Dmitriy's approach to build dictionaries
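
For reviewers: below is a minimal smoke-test sketch for the new onnxruntime backend, not part of the patch series. It assumes a DLRM model already exported to ONNX with a fixed batch size and graph inputs named "input.1" (dense features), "lS_o" (offsets), and one int64 index input per embedding table -- the names predict() matches against. The model path "dlrm_kaggle.onnx", the batch size, and the dummy data are all placeholders.

    # smoke_test_backend_onnxruntime.py -- a sketch under the assumptions above
    import numpy as np
    import torch

    from backend_onnxruntime import BackendOnnxruntime

    batch_size = 128   # must match the fixed batch size baked into the ONNX export
    num_tables = 26    # sparse (categorical) features in the Criteo datasets

    # constructor arguments mirror the Kaggle config in main.py; the backend
    # currently ignores them, so placeholder table sizes are fine here
    backend = BackendOnnxruntime(
        m_spa=16,
        ln_emb=np.ones(num_tables, dtype=np.int64),
        ln_bot=np.array([13, 512, 256, 64, 16]),
        ln_top=np.array([367, 512, 256, 1]),
    )
    backend.load("dlrm_kaggle.onnx")  # placeholder model path

    # one batch of dummy inputs, shaped like the Criteo pipeline produces them:
    # a dense 13-vector per sample, per-table offsets, one index tensor per table
    batch_dense_X = torch.rand(batch_size, 13)
    batch_lS_o = torch.stack([torch.arange(batch_size)] * num_tables)
    batch_lS_i = [torch.zeros(batch_size, dtype=torch.long) for _ in range(num_tables)]

    out = backend.predict(batch_dense_X, batch_lS_o, batch_lS_i)
    print(out.shape)  # expected: torch.Size([batch_size, 1])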