diff --git a/.gitmodules b/.gitmodules index e0ffec11bfd0..6a22aeafe752 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git + url = https://github.com/yinghu5/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 6e94643bdf1d..66279663326b 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 6e94643bdf1d51a505b147f28c358fb71070b8fd +Subproject commit 66279663326be19b162e7f797cc58c43dae54f90 diff --git a/3rdparty/onnx-tensorrt b/3rdparty/onnx-tensorrt index f1c7aa63d88d..1e209e546061 160000 --- a/3rdparty/onnx-tensorrt +++ b/3rdparty/onnx-tensorrt @@ -1 +1 @@ -Subproject commit f1c7aa63d88d8d8ef70490f2ebb6b33f7450218b +Subproject commit 1e209e546061173ccc37b25bbca69a795c6c86e4 diff --git a/3rdparty/sparse-matrix/Makefile b/3rdparty/sparse-matrix/Makefile deleted file mode 100644 index 214312f6586c..000000000000 --- a/3rdparty/sparse-matrix/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -CC = g++ -C = gcc -MKLROOT = /opt/intel/mkl - -ifneq ($(USE_INTEL_PATH),) - MKLROOT = $(USE_INTEL_PATH)/mkl -endif - -CFLAGS = -fpic -O2 -I/opt/intel/mkl/include -c -Wall -Werror -DMKL_ILP64 -m64 -std=c++11 -LDFLAGS = -Wl,--start-group -L${MKLROOT}/../compiler/lib/intel64 ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl - -default: libsparse_matrix.so - -libsparse_matrix.so: sparse_matrix.o - $(CC) -shared -o libsparse_matrix.so sparse_matrix.o $(LDFLAGS) - -sparse_matrix.o: sparse_matrix.cc sparse_matrix.h - $(CC) $(CFLAGS) sparse_matrix.cc - -clean: - $(RM) libsparse_matrix.so *.o *~ diff --git a/3rdparty/sparse-matrix/sparse_matrix.cc b/3rdparty/sparse-matrix/sparse_matrix.cc deleted file mode 100644 index fa362f0f8a18..000000000000 --- a/3rdparty/sparse-matrix/sparse_matrix.cc +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include -#include -#include "sparse_matrix.h" - - - -bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, - float* values, float* X, float* y, - int rows, int cols, int X_columns) -{ - - sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; - sparse_status_t status; - sparse_matrix_t A = NULL; - sparse_layout_t layout = SPARSE_LAYOUT_ROW_MAJOR; - float one, zero; - one = (float)1.0; - zero = (float)0.0; - - MKL_INT* rows_end = rows_start + 1; - status = mkl_sparse_s_create_csr(&A, indexing, rows, cols, rows_start, rows_end, col_indx, values); - - if (status != SPARSE_STATUS_SUCCESS) - { - std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; - return false; - } - sparse_operation_t operation = SPARSE_OPERATION_NON_TRANSPOSE; - struct matrix_descr descrA; - descrA.type = SPARSE_MATRIX_TYPE_GENERAL; - - status = mkl_sparse_s_mm(operation, one, A, descrA, layout, X, X_columns, X_columns, zero, y, X_columns); - if (status != SPARSE_STATUS_SUCCESS) - { - std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; - return false; - } - - mkl_sparse_destroy(A); - - return true; - -} diff --git a/3rdparty/sparse-matrix/sparse_matrix.h b/3rdparty/sparse-matrix/sparse_matrix.h deleted file mode 100644 index 93054a80b374..000000000000 --- a/3rdparty/sparse-matrix/sparse_matrix.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ -#define 
MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ - - -#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) -#define SP_INT64 __int64 -#define SP_UINT64 unsigned __int64 -#else -#define SP_INT64 long long int -#define SP_UINT64 unsigned long long int -#endif - - -#if defined _WIN32 || defined __CYGWIN__ - #ifdef BUILDING_DLL - #ifdef __GNUC__ - #define SPM_API_PUBLIC __attribute__ ((dllexport)) - #else - #define SPM_API_PUBLIC __declspec(dllexport) // Note: actually gcc seems to also supports this syntax. - #endif - #else - #ifdef __GNUC__ - #define SPM_API_PUBLIC __attribute__ ((dllimport)) - #else - #define SPM_API_PUBLIC __declspec(dllimport) // Note: actually gcc seems to also supports this syntax. - #endif - #endif - #define SPM_API_LOCAL -#else - #if __GNUC__ >= 4 - #define SPM_API_PUBLIC __attribute__ ((visibility ("default"))) - #define SPM_API_LOCAL __attribute__ ((visibility ("hidden"))) - #else - #define SPM_API_PUBLIC - #define SPM_API_LOCAL - #endif -#endif - - - -extern "C" -{ - extern SPM_API_PUBLIC bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, - float* values, float* X, float* y, int rows, int cols, int X_columns); - -} - -#endif //MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d014e96ff77f..6f8c33b6a23d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,7 @@ mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) -mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF) +mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT." OFF) mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." 
OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) @@ -766,10 +766,16 @@ install(TARGETS ${MXNET_INSTALL_TARGETS} # https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html # https://cmake.org/cmake/help/v3.0/module/GNUInstallDirs.html -install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -install(DIRECTORY 3rdparty/tvm/nnvm/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dlpack/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +if(USE_MKLDNN) + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mkldnn/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +endif() +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/mshadow/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/mshadow) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/mxnet) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/nnvm/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if (INSTALL_EXAMPLES) - install(DIRECTORY example DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}) + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/example DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}) endif() if (USE_SIGNAL_HANDLER) diff --git a/CODEOWNERS b/CODEOWNERS index a9655ecf0e1f..9d4145c6b0f2 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -23,20 +23,19 @@ /julia/ @iblis17 # C++ base -/src/kvstore/ @rahul003 @anirudh2290 -/include/ @anirudh2290 @pllarroy -/src/c_api/ @anirudh2290 +/src/kvstore/ @rahul003 @anirudh2290 @eric-haibin-lin +/include/ @anirudh2290 @pllarroy @eric-haibin-lin +/src/c_api/ @anirudh2290 @eric-haibin-lin /src/common/ @anirudh2290 -/src/engine/ @anirudh2290 -/src/executor/ @anirudh2290 -/src/imperative/ @anirudh2290 -/src/io/ @anirudh2290 -/src/ndarray/ @anirudh2290 -/src/nnvm/ @anirudh2290 -/src/operator/ @anirudh2290 -/src/profiler/ @anirudh2290 -/src/kvstore/ @eric-haibin-lin -/src/storage/ @anirudh2290 +/src/engine/ @anirudh2290 @eric-haibin-lin +/src/executor/ @anirudh2290 @eric-haibin-lin +/src/imperative/ @anirudh2290 @eric-haibin-lin +/src/io/ @anirudh2290 @eric-haibin-lin +/src/ndarray/ @anirudh2290 @eric-haibin-lin +/src/nnvm/ @anirudh2290 @eric-haibin-lin +/src/operator/ @anirudh2290 @eric-haibin-lin +/src/profiler/ @anirudh2290 @eric-haibin-lin +/src/storage/ @anirudh2290 @eric-haibin-lin /tests/cpp/ @anirudh2290 /cpp-package/ @nswamy @pllarroy /src/ @pllarroy diff --git a/Makefile b/Makefile index 29cfd573665c..df0fe8809456 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,6 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif - # setup opencv ifeq ($(USE_OPENCV), 1) CFLAGS += -DMXNET_USE_OPENCV=1 @@ -416,14 +415,6 @@ ifeq ($(USE_DIST_KVSTORE), 1) LDFLAGS += $(PS_LDFLAGS_A) endif -#sparse-matrix -ifeq ($(USE_BLAS), mkl) - SPARSE_MATRIX_DIR = $(ROOTDIR)/3rdparty/sparse-matrix - LIB_DEP += $(SPARSE_MATRIX_DIR)/libsparse_matrix.so - CFLAGS += -I$(SPARSE_MATRIX_DIR) - LDFLAGS += -L$(SPARSE_MATRIX_DIR) -lsparse_matrix -Wl,-rpath,'$${ORIGIN}' -endif - .PHONY: clean all extra-packages test lint docs clean_all rcpplint rcppexport roxygen\ cython2 cython3 cython cyclean @@ -561,30 +552,11 @@ ifeq ($(UNAME_S), Darwin) endif endif -ifeq ($(USE_BLAS), mkl) -ifeq ($(UNAME_S), Darwin) - install_name_tool -change '@rpath/libsparse_matrix.dylib' 
'@loader_path/libsparse_matrix.dylib' $@ -endif -endif - $(PS_PATH)/build/libps.a: PSLITE PSLITE: $(MAKE) CXX="$(CXX)" DEPS_PATH="$(DEPS_PATH)" -C $(PS_PATH) ps -ifeq ($(USE_BLAS), mkl) -$(SPARSE_MATRIX_DIR)/libsparse_matrix.so: SPARSE_MATRIX - -SPARSE_MATRIX: -ifeq ($(USE_INTEL_PATH), NONE) - $(MAKE) -C $(SPARSE_MATRIX_DIR) -else - $(MAKE) -C $(SPARSE_MATRIX_DIR) USE_INTEL_PATH=$(USE_INTEL_PATH) -endif - mkdir -p $(ROOTDIR)/lib - cp $(SPARSE_MATRIX_DIR)/libsparse_matrix.so $(ROOTDIR)/lib/ -endif - $(DMLC_CORE)/libdmlc.a: DMLCCORE DMLCCORE: @@ -620,7 +592,7 @@ cpplint: --exclude_path src/operator/contrib/ctc_include pylint: - pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py + python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py doc: docs @@ -661,10 +633,6 @@ rpkg: cp -rf lib/libmklml_intel.so R-package/inst/libs; \ fi - if [ -e "lib/libsparse_matrix.so" ]; then \ - cp -rf lib/libsparse_matrix.so R-package/inst/libs; \ - fi - mkdir -p R-package/inst/include cp -rl include/* R-package/inst/include Rscript -e "if(!require(devtools)){install.packages('devtools', repo = 'https://cloud.r-project.org/')}" @@ -710,7 +678,6 @@ clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - - cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - $(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) @@ -721,7 +688,6 @@ clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN) (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - - cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - endif diff --git a/ci/build.py b/ci/build.py index 1c7a4f8b3231..3d6fb9b4d893 100755 --- a/ci/build.py +++ b/ci/build.py @@ -331,6 +331,7 @@ def container_run(platform: str, ret = wait_result.get('StatusCode', 200) if ret != 0: logging.error("Container exited with an error 😞") + logging.info("Executed command for reproduction:\n\n%s\n", " ".join(sys.argv)) else: logging.info("Container exited with success 👍") except Exception as e: diff --git a/ci/build_windows.py b/ci/build_windows.py index e8658995b68e..7ec24395e22e 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -44,6 +44,8 @@ class BuildFlavour(Enum): WIN_CPU = 'WIN_CPU' WIN_CPU_MKLDNN = 'WIN_CPU_MKLDNN' + WIN_CPU_MKLDNN_MKL = 'WIN_CPU_MKLDNN_MKL' + WIN_CPU_MKL = 'WIN_CPU_MKL' WIN_GPU = 'WIN_GPU' WIN_GPU_MKLDNN = 'WIN_GPU_MKLDNN' @@ -72,8 +74,34 @@ class BuildFlavour(Enum): '-DUSE_LAPACK=1 ' '-DUSE_DIST_KVSTORE=0 ' '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' '-DCMAKE_BUILD_TYPE=Release') + , 'WIN_CPU_MKLDNN_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' + '-DCMAKE_BUILD_TYPE=Release') + + , 'WIN_CPU_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=0 ' + 
'-DCMAKE_BUILD_TYPE=Release') , 'WIN_GPU': ('-DUSE_CUDA=1 ' '-DUSE_CUDNN=1 ' '-DUSE_NVRTC=1 ' @@ -218,6 +246,8 @@ def main(): os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build" if 'CUDA_PATH' not in os.environ: os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2" + if 'MKL_ROOT' not in os.environ: + os.environ["MKL_ROOT"] = "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries\\windows\\mkl" windows_build(args) elif system == 'Linux' or system == 'Darwin': diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu index 8bf2442731af..cb3ae82acd05 100644 --- a/ci/docker/Dockerfile.build.centos7_gpu +++ b/ci/docker/Dockerfile.build.centos7_gpu @@ -18,7 +18,9 @@ # # Dockerfile to build and run MXNet on CentOS 7 for GPU -FROM nvidia/cuda:9.1-cudnn7-devel-centos7 +FROM nvidia/cuda:10.0-devel-centos7 + +ENV CUDNN_VERSION=7.3.1.20 WORKDIR /work/deps @@ -28,6 +30,8 @@ COPY install/centos7_ccache.sh /work/ RUN /work/centos7_ccache.sh COPY install/centos7_python.sh /work/ RUN /work/centos7_python.sh +COPY install/centos7_cudnn.sh /work/ +RUN /work/centos7_cudnn.sh ARG USER_ID=0 COPY install/centos7_adduser.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_base_gpu b/ci/docker/Dockerfile.build.ubuntu_base_gpu index 99b79f513bee..94e49b6fb297 100644 --- a/ci/docker/Dockerfile.build.ubuntu_base_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_base_gpu @@ -19,12 +19,17 @@ # Dockerfile to run the MXNet Installation Tests on Ubuntu 16.04 # This should run in an empty docker with ubuntu and cuda. -FROM nvidia/cuda:9.1-cudnn7-devel +FROM nvidia/cuda:10.0-devel-ubuntu16.04 + +ENV CUDNN_VERSION=7.3.1.20 WORKDIR /work/deps RUN apt-get update && apt-get -y install sudo +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda index 9ed0cbbe3e52..08c67cd660f8 100644 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -21,7 +21,9 @@ # package generation, requiring the actual CUDA library to be # present -FROM nvidia/cuda:9.1-cudnn7-devel +FROM nvidia/cuda:10.0-devel-ubuntu16.04 + +ENV CUDNN_VERSION=7.3.1.20 WORKDIR /work/deps @@ -43,6 +45,8 @@ COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh COPY install/ubuntu_mklml.sh /work/ RUN /work/ubuntu_mklml.sh +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh # Special case because the CPP-Package requires the CUDA runtime libs # and not only stubs (which are provided by the base image) diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu index 8fcbcbbb9674..63fd5973fcd0 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_gpu @@ -18,7 +18,9 @@ # # Dockerfile to run MXNet on Ubuntu 16.04 for GPU -FROM nvidia/cuda:9.1-cudnn7-devel +FROM nvidia/cuda:10.0-devel-ubuntu16.04 + +ENV CUDNN_VERSION=7.3.1.20 WORKDIR /work/deps @@ -72,6 +74,9 @@ ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + COPY runtime_functions.sh /work/ WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt index f4844115c0fd..8ad90aedeb6f 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt +++ 
b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt @@ -39,3 +39,4 @@ COPY runtime_functions.sh /work/ WORKDIR /work/mxnet ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib +ENV CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:/usr/local/cuda-10.0/targets/x86_64-linux/include/ diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu index deeed8b0d52a..934aded5101d 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -18,7 +18,9 @@ # # Dockerfile to run MXNet on Ubuntu 16.04 for CPU -FROM nvidia/cuda:9.1-cudnn7-devel +FROM nvidia/cuda:10.0-devel-ubuntu16.04 + +ENV CUDNN_VERSION=7.3.1.20 WORKDIR /work/deps @@ -70,6 +72,9 @@ RUN /work/ubuntu_tutorials.sh COPY install/ubuntu_nightly_tests.sh /work/ RUN /work/ubuntu_nightly_tests.sh +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ diff --git a/ci/docker/install/centos7_cudnn.sh b/ci/docker/install/centos7_cudnn.sh new file mode 100755 index 000000000000..43ff89b121cc --- /dev/null +++ b/ci/docker/install/centos7_cudnn.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + +set -ex + +# Multipackage installation does not fail in yum +CUDNN_DOWNLOAD_SUM=4e15a323f2edffa928b4574f696fc0e449a32e6bc35c9ccb03a47af26c2de3fa +curl -fsSL http://developer.download.nvidia.com/compute/redist/cudnn/v7.3.1/cudnn-10.0-linux-x64-v7.3.1.20.tgz -O +echo "$CUDNN_DOWNLOAD_SUM cudnn-10.0-linux-x64-v7.3.1.20.tgz" | sha256sum -c - +tar --no-same-owner -xzf cudnn-10.0-linux-x64-v7.3.1.20.tgz -C /usr/local +rm cudnn-10.0-linux-x64-v7.3.1.20.tgz +ldconfig + diff --git a/ci/docker/install/ubuntu_cudnn.sh b/ci/docker/install/ubuntu_cudnn.sh new file mode 100755 index 000000000000..12b64865a219 --- /dev/null +++ b/ci/docker/install/ubuntu_cudnn.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + +# Assumes base image is from nvidia/cuda + +set -ex + +apt-get update || true +apt-get install -y libcudnn7=7.3.1.20-1+cuda10.0 libcudnn7-dev=7.3.1.20-1+cuda10.0 + diff --git a/ci/docker/install/ubuntu_nvidia.sh b/ci/docker/install/ubuntu_nvidia.sh index 7012b897ff91..36eb21b8a03e 100755 --- a/ci/docker/install/ubuntu_nvidia.sh +++ b/ci/docker/install/ubuntu_nvidia.sh @@ -22,4 +22,4 @@ set -ex # Retrieve ppa:graphics-drivers and install nvidia-drivers. # Note: DEBIAN_FRONTEND required to skip the interactive setup steps apt update -DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends cuda-9-1 +DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends cuda-10-0 diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh index 1fb7bf165b9a..2d8b019372c7 100755 --- a/ci/docker/install/ubuntu_publish.sh +++ b/ci/docker/install/ubuntu_publish.sh @@ -66,5 +66,5 @@ python2 get-pip.py apt-get remove -y python3-urllib3 -pip2 install nose cpplint==1.3.0 pylint==1.9.3 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==2.1.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index ee05058f227e..23158ba4c068 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python.sh @@ -30,5 +30,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py python3 get-pip.py python2 get-pip.py -pip2 install nose cpplint==1.3.0 pylint==1.9.3 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==2.1.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index c3610d2452e0..091ffdf2551d 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -565,7 +565,7 @@ build_ubuntu_cpu_mkldnn_mkl() { } build_ubuntu_gpu() { - build_ubuntu_gpu_cuda91_cudnn7 + build_ubuntu_gpu_cuda100_cudnn7 } build_ubuntu_gpu_tensorrt() { @@ -665,7 +665,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() { -j$(nproc) } -build_ubuntu_gpu_cuda91_cudnn7() { +build_ubuntu_gpu_cuda100_cudnn7() { set -ex # unfortunately this build has problems in 3rdparty dependencies with ccache and make # build_ccache_wrappers diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 23230ac0442f..b460eb35a519 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -33,7 +33,7 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li 
// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, lib/libsparse_matrix.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_tensorrt_lib = 'build/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*' mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*' @@ -218,12 +218,12 @@ def compile_unix_mkldnn_nocudnn_gpu() { } def compile_unix_full_gpu() { - return ['GPU: CUDA9.1+cuDNN7': { + return ['GPU: CUDA10.0+cuDNN7': { node(NODE_LINUX_CPU) { ws('workspace/build-gpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda100_cudnn7', false) utils.pack_lib('gpu', mx_lib_cpp_examples, true) } } @@ -515,6 +515,49 @@ def compile_windows_cpu() { }] } +def compile_windows_cpu_mkldnn() { + return ['Build CPU MKLDNN windows':{ + node(NODE_WINDOWS_CPU) { + ws('workspace/build-cpu-mkldnn') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git_win() + powershell 'py -3 ci/build_windows.py -f WIN_CPU_MKLDNN' + stash includes: 'windows_package.7z', name: 'windows_package_cpu_mkldnn' + } + } + } + }] +} + +def compile_windows_cpu_mkldnn_mkl() { + return ['Build CPU MKLDNN MKL windows':{ + node(NODE_WINDOWS_CPU) { + ws('workspace/build-cpu-mkldnn-mkl') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git_win() + powershell 'py -3 ci/build_windows.py -f WIN_CPU_MKLDNN_MKL' + stash includes: 'windows_package.7z', name: 'windows_package_cpu_mkldnn_mkl' + } + } + } + }] +} + +def compile_windows_cpu_mkl() { + return ['Build CPU MKL windows':{ + node(NODE_WINDOWS_CPU) { + ws('workspace/build-cpu-mkl') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git_win() + powershell 'cd $Env:MKL_ROOT' + powershell 'py -3 ci/build_windows.py -f WIN_CPU_MKL' + stash includes: 'windows_package.7z', name: 'windows_package_cpu_mkl' + } + } + } + }] +} + def compile_windows_gpu() { return ['Build GPU windows':{ node(NODE_WINDOWS_CPU) { diff --git a/ci/jenkins/Jenkinsfile_windows_cpu b/ci/jenkins/Jenkinsfile_windows_cpu index 5bc40d625930..4475796771d1 100644 --- a/ci/jenkins/Jenkinsfile_windows_cpu +++ b/ci/jenkins/Jenkinsfile_windows_cpu @@ -34,7 +34,10 @@ utils.assign_node_labels(utility: 'utility', windows_cpu: 'mxnetwindows-cpu') utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_windows_cpu() + custom_steps.compile_windows_cpu(), + custom_steps.compile_windows_cpu_mkldnn(), + custom_steps.compile_windows_cpu_mkldnn_mkl(), + 
custom_steps.compile_windows_cpu_mkl() ]) utils.parallel_stage('Tests', [ diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake index 5f4af2d89c91..e16594794ae8 100644 --- a/cmake/ChooseBlas.cmake +++ b/cmake/ChooseBlas.cmake @@ -18,14 +18,14 @@ set(BLAS "Open" CACHE STRING "Selected BLAS library") set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") -if(USE_MKL_IF_AVAILABLE) - if(NOT MKL_FOUND) - find_package(MKL) - endif() - if(MKL_FOUND) - if(USE_MKLDNN) - set(BLAS "open") - else() +if(DEFINED USE_BLAS) + set(BLAS "${USE_BLAS}") +else() + if(USE_MKL_IF_AVAILABLE) + if(NOT MKL_FOUND) + find_package(MKL) + endif() + if(MKL_FOUND) set(BLAS "MKL") endif() endif() diff --git a/cmake/DownloadMKLML.cmake b/cmake/DownloadMKLML.cmake index 7b0e5ecf7c9c..8de85300e2fc 100644 --- a/cmake/DownloadMKLML.cmake +++ b/cmake/DownloadMKLML.cmake @@ -21,7 +21,7 @@ message(STATUS "Downloading MKLML...") set(MKLDNN_RELEASE v0.18) set(MKLML_RELEASE_FILE_SUFFIX 2019.0.3.20190220) - +#windows MD5 have issue set(MKLML_LNX_MD5 76354b74325cd293aba593d7cbe36b3f) set(MKLML_WIN_MD5 02286cb980f12af610c05e99dbd78755) set(MKLML_MAC_MD5 3b28da686a25a4cf995ca4fc5e30e514) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index 70405566d8ae..51fca23c1161 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -43,55 +43,6 @@ endif() # ---[ Root folders set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -if(USE_MKLDNN) - - find_path(MKL_ROOT include/mkl_blas.h - PATHS $ENV{MKL_ROOT} - ${INTEL_ROOT}/mklml - ${DIRECT_DEPENDENCY_ROOTS} - DOC "Folder contains MKL" - ) - - # ---[ Find include dir - find_path(MKL_INCLUDE_DIR mkl_blas.h PATHS ${MKL_ROOT} PATH_SUFFIXES include) - set(__looked_for MKL_INCLUDE_DIR) - - # ---[ Find libraries - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(__path_suffixes lib lib/ia32) - else() - set(__path_suffixes lib lib/intel64) - endif() - - set(__mkl_libs "") - - if(WIN32) - list(APPEND __mkl_libs mklml_intel) - else() - list(APPEND __mkl_libs mklml_gnu) - endif() - list(APPEND __mkl_libs mkldnn) - - foreach (__lib ${__mkl_libs}) - set(__mkl_lib "${__lib}") - string(TOUPPER ${__mkl_lib} __mkl_lib_upper) - - if(MKL_USE_STATIC_LIBS) - set(__mkl_lib "lib${__mkl_lib}.a") - endif() - - find_library(${__mkl_lib_upper}_LIBRARY - NAMES ${__mkl_lib} - PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.." 
- PATH_SUFFIXES ${__path_suffixes} - DOC "The path to Intel(R) MKL ${__mkl_lib} library") - mark_as_advanced(${__mkl_lib_upper}_LIBRARY) - - list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY) - list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY}) - endforeach() - -else(USE_MKLDNN) # ---[ Options mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON) @@ -193,7 +144,7 @@ else(USE_MKLDNN) list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY}) endif() -endif(USE_MKLDNN) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for}) diff --git a/contrib/clojure-package/.gitignore b/contrib/clojure-package/.gitignore index 71d812e56ecd..8efd090255a5 100644 --- a/contrib/clojure-package/.gitignore +++ b/contrib/clojure-package/.gitignore @@ -39,8 +39,10 @@ examples/visualization/test-vis.pdf src/.DS_Store src/org/.DS_Store test/test-ndarray.clj +test/test-ndarray-random.clj test/test-ndarray-api.clj test/test-symbol.clj +test/test-symbol-random.clj test/test-symbol-api.clj src/org/apache/clojure_mxnet/gen/* diff --git a/contrib/clojure-package/examples/bert-qa/README.md b/contrib/clojure-package/examples/bert-qa/README.md index 9a21bcdfd66b..55f13e671c00 100644 --- a/contrib/clojure-package/examples/bert-qa/README.md +++ b/contrib/clojure-package/examples/bert-qa/README.md @@ -57,9 +57,8 @@ Some sample questions and answers are provide in the `squad-sample.edn` file. So * `lein install` in the root of the main project directory * cd into this project directory and do `lein run`. This will execute the cpu version. - -`lein run :cpu` - to run with cpu -`lein run :gpu` - to run with gpu + * `lein run` or `lein run :cpu` to run with cpu + * `lein run :gpu` to run with gpu ## Background diff --git a/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj index 836684e04977..9dcc783ff1ac 100644 --- a/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj +++ b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj @@ -15,13 +15,10 @@ ;; limitations under the License. 
;; - (ns bert-qa.infer (:require [clojure.string :as string] - [clojure.reflect :as r] [cheshire.core :as json] [clojure.java.io :as io] - [clojure.set :as set] [org.apache.clojure-mxnet.dtype :as dtype] [org.apache.clojure-mxnet.context :as context] [org.apache.clojure-mxnet.layout :as layout] @@ -30,11 +27,7 @@ [clojure.pprint :as pprint])) (def model-path-prefix "model/static_bert_qa") -;; epoch number of the model -(def epoch 2) -;; the vocabulary used in the model -(def model-vocab "model/vocab.json") -;; the input question + ;; the maximum length of the sequence (def seq-length 384) @@ -60,16 +53,13 @@ (into tokens (repeat (- num (count tokens)) pad-item)))) (defn get-vocab [] - (let [vocab (json/parse-stream (clojure.java.io/reader "model/vocab.json"))] + (let [vocab (json/parse-stream (io/reader "model/vocab.json"))] {:idx->token (get vocab "idx_to_token") :token->idx (get vocab "token_to_idx")})) (defn tokens->idxs [token->idx tokens] (let [unk-idx (get token->idx "[UNK]")] - (mapv #(get token->idx % unk-idx) tokens))) - -(defn idxs->tokens [idx->token idxs] - (mapv #(get idx->token %) idxs)) + (mapv #(get token->idx % unk-idx) tokens))) (defn post-processing [result tokens] (let [output1 (ndarray/slice-axis result 2 0 1) @@ -131,22 +121,23 @@ :tokens tokens :qa-map qa-map})) -(defn infer [ctx] - (let [ctx (context/default-context) - predictor (make-predictor ctx) - {:keys [idx->token token->idx]} (get-vocab) +(defn infer + ([] (infer (context/default-context))) + ([ctx] + (let [predictor (make-predictor ctx) + {:keys [idx->token token->idx]} (get-vocab) ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/ - question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] - (doseq [qa-map question-answers] - (let [{:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) - result (first (infer/predict-with-ndarray predictor input-batch)) - answer (post-processing result tokens)] - (println "===============================") - (println " Question Answer Data") - (pprint/pprint qa-map) - (println) - (println " Predicted Answer: " answer) - (println "==============================="))))) + question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] + (doseq [qa-map question-answers] + (let [{:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) + result (first (infer/predict-with-ndarray predictor input-batch)) + answer (post-processing result tokens)] + (println "===============================") + (println " Question Answer Data") + (pprint/pprint qa-map) + (println) + (println " Predicted Answer: " answer) + (println "===============================")))))) (defn -main [& args] (let [[dev] args] @@ -156,4 +147,8 @@ (comment - (infer :cpu)) + (infer) + + (infer (context/gpu)) + + ) diff --git a/contrib/clojure-package/src/dev/generator.clj b/contrib/clojure-package/src/dev/generator.clj index 864c67ff6bcd..d1f59dc5082e 100644 --- a/contrib/clojure-package/src/dev/generator.clj +++ b/contrib/clojure-package/src/dev/generator.clj @@ -29,6 +29,7 @@ (defn clojure-case + "Transforms a scala string (function name) to clojure case" [string] (-> string (clojure.string/replace #"(\s+)([A-Z][a-z]+)" "$1-$2") @@ -57,10 +58,9 @@ count pos?)) - (defn increment-param-name [pname] (if-let [num-str (re-find #"-\d" pname)] - (str + (str (first (clojure.string/split pname #"-")) "-" (inc (Integer/parseInt (last (clojure.string/split num-str #"-"))))) @@ -130,15 +130,33 @@ (.write w fstr)) 
(.write w "\n")))) +(defn remove-prefix + [prefix s] + (let [regex (re-pattern (str prefix "(.*)")) + replacement "$1"] + (clojure.string/replace s regex replacement))) + +(defn in-namespace-random? [op-name] + (or (clojure.string/includes? op-name "random_") + (clojure.string/includes? op-name "sample_"))) + +(defn op-name->namespace-type [op-name] + (cond + (#{"uniform" "normal"} op-name) :deprecated + (clojure.string/includes? op-name "random_") :random + (clojure.string/includes? op-name "sample_") :random + :else :core)) + ;;;;;;; Common operations (def libinfo (Base/_LIB)) + (def op-names (let [l ($ ListBuffer/empty)] - (do (.mxListAllOpNames libinfo l) - (remove #(or (= "Custom" %) - (re-matches #"^_.*" %)) - (util/buffer->vec l))))) + (.mxListAllOpNames libinfo l) + (->> l + (util/buffer->vec) + (remove #(or (= "Custom" %) (re-matches #"^_.*" %)))))) (defn- parse-arg-type [s] (let [[_ var-arg-type _ set-arg-type arg-spec _ type-req _ default-val] (re-find #"(([\w-\[\]\s]+)|\{([^}]+)\})\s*(\([^)]+\))?(,\s*(optional|required)(,\s*default=(.*))?)?" s)] @@ -288,8 +306,6 @@ `(~'defn ~function-name ~@(remove nil? (gen-symbol-function-arity op-name op-values function-name)))))) - - (def symbol-gen-ns "(ns org.apache.clojure-mxnet.symbol (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max min repeat reverse set sort take to-array empty sin @@ -300,7 +316,9 @@ (defn generate-symbol-file [] (println "Generating symbol file") - (write-to-file all-symbol-functions symbol-gen-ns "src/org/apache/clojure_mxnet/gen/symbol.clj")) + (write-to-file all-symbol-functions + symbol-gen-ns + "src/org/apache/clojure_mxnet/gen/symbol.clj")) ;;;;;;; NDArray @@ -322,21 +340,17 @@ count pos?)) - (def ndarray-public-to-hand-gen (filter is-ndarray-hand-gen? ndarray-public-no-default)) (def ndarray-public-to-gen (get-public-to-gen-methods ndarray-public-to-hand-gen ndarray-public-no-default)) - (count ndarray-public-to-hand-gen) ;=> 15 (count ndarray-public-to-gen) ;=> 486 (->> ndarray-public-to-hand-gen (map :name) (into #{})) - - (defn gen-ndarray-function-arity [op-name op-values] (for [[param-count info] op-values] (let [targets (->> (mapv :parameter-types info) @@ -380,7 +394,8 @@ (def all-ndarray-functions (gen-ndarray-functions ndarray-public-to-gen)) -(def ndarray-gen-ns "(ns org.apache.clojure-mxnet.ndarray +(def ndarray-gen-ns + "(ns org.apache.clojure-mxnet.ndarray (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max min repeat reverse set sort take to-array empty shuffle ref]) @@ -395,6 +410,17 @@ ;;;;;;; SymbolAPI +(defn fn-name->random-fn-name + [fn-name] + (cond + (clojure.string/starts-with? fn-name "-random-") + (remove-prefix "-random-" fn-name) + + (clojure.string/starts-with? fn-name "-sample-") + (str (remove-prefix "-sample-" fn-name) "-like") + + :else fn-name)) + (defn symbol-api-coerce-param [{:keys [name sym type optional?]}] (let [coerced-param (case type @@ -435,47 +461,80 @@ (~(symbol (str "SymbolAPI/" op-name)) ~@coerced-params))))) -(defn gen-symbol-api-function [op-name] - (let [{:keys [fn-name fn-description args]} (gen-op-info op-name) - params (mapv (fn [{:keys [name type optional?] :as opts}] - (assoc opts - :sym (symbol name) - :optional? (or optional? - (= "NDArray-or-Symbol" type)))) - (conj args - {:name "name" - :type "String" - :optional? true - :description "Name of the symbol"} - {:name "attr" - :type "Map[String, String]" - :optional? 
true - :description "Attributes of the symbol"})) - doc (clojure.string/join - "\n\n " - (-> (gen-symbol-api-doc fn-description params) - (clojure.string/split #"\n"))) - default-call (gen-symbol-api-default-arity op-name params)] - `(~'defn ~(symbol fn-name) - ~doc - ~@default-call))) - -(def all-symbol-api-functions - (mapv gen-symbol-api-function op-names)) - -(def symbol-api-gen-ns "(ns - ^{:doc \"Experimental\"} - org.apache.clojure-mxnet.symbol-api - (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max - min repeat reverse set sort take to-array empty sin - get apply shuffle ref]) - (:require [org.apache.clojure-mxnet.util :as util] - [org.apache.clojure-mxnet.shape :as mx-shape]) - (:import (org.apache.mxnet SymbolAPI)))") +(defn symbol-api-gen-ns + [random-namespace?] + (str + "(ns\n" + " ^{:doc \"Experimental\"}\n" + (if random-namespace? + " org.apache.clojure-mxnet.symbol-random-api\n" + " org.apache.clojure-mxnet.symbol-api\n") + " (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max\n" + " min repeat reverse set sort take to-array empty sin\n" + " get apply shuffle ref])\n" + " (:require [org.apache.clojure-mxnet.util :as util]\n" + " [org.apache.clojure-mxnet.shape :as mx-shape])\n" + " (:import (org.apache.mxnet SymbolAPI)))")) + +(defn make-gen-symbol-api-function + [{:keys [fn-name->fn-name] :or {fn-name->fn-name identity}}] + (fn [op-name] + (let [{:keys [fn-name fn-description args]} + (-> op-name (gen-op-info) (update :fn-name fn-name->fn-name)) + params (mapv (fn [{:keys [name type optional?] :as opts}] + (assoc opts + :sym (symbol name) + :optional? (or optional? + (= "NDArray-or-Symbol" type)))) + (conj args + {:name "name" + :type "String" + :optional? true + :description "Name of the symbol"} + {:name "attr" + :type "Map[String, String]" + :optional? 
true + :description "Attributes of the symbol"})) + doc (clojure.string/join + "\n\n " + (-> (gen-symbol-api-doc fn-description params) + (clojure.string/split #"\n"))) + default-call (gen-symbol-api-default-arity op-name params)] + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call)))) + +(def gen-symbol-api-function + (make-gen-symbol-api-function {})) + +(def gen-symbol-random-api-function + (make-gen-symbol-api-function {:fn-name->fn-name fn-name->random-fn-name})) + +(defn all-symbol-api-functions [op-names] + (->> op-names + (filter #(= :core (op-name->namespace-type %))) + (mapv gen-symbol-api-function))) + +(count (all-symbol-api-functions op-names)) ;215 -(defn generate-symbol-api-file [] +(defn all-symbol-random-api-functions [op-names] + (->> op-names + (filter #(= :random (op-name->namespace-type %))) + (mapv gen-symbol-random-api-function))) + +(count (all-symbol-random-api-functions op-names)) ;16 + +(defn generate-symbol-api-file [op-names] (println "Generating symbol-api file") - (write-to-file all-symbol-api-functions symbol-api-gen-ns "src/org/apache/clojure_mxnet/gen/symbol_api.clj")) + (write-to-file (all-symbol-api-functions op-names) + (symbol-api-gen-ns false) + "src/org/apache/clojure_mxnet/gen/symbol_api.clj")) + +(defn generate-symbol-random-api-file [op-names] + (println "Generating symbol-random-api file") + (write-to-file (all-symbol-random-api-functions op-names) + (symbol-api-gen-ns true) + "src/org/apache/clojure_mxnet/gen/symbol_random_api.clj")) ;;;;;;; NDArrayAPI @@ -519,57 +578,94 @@ `(~(mapv :sym req-params) (~(symbol fn-name) ~req-args)))) -(defn gen-ndarray-api-function [op-name] - (let [{:keys [fn-name fn-description args]} (gen-op-info op-name) - params (mapv (fn [{:keys [name] :as opts}] - (assoc opts :sym (symbol name))) - (conj args {:name "out" - :type "NDArray-or-Symbol" - :optional? true - :description "Output array."})) - doc (clojure.string/join - "\n\n " - (-> (gen-ndarray-api-doc fn-description params) - (clojure.string/split #"\n"))) - opt-params (filter :optional? params) - req-params (remove :optional? params) - req-call (gen-ndarray-api-required-arity fn-name req-params) - default-call (gen-ndarray-api-default-arity op-name params)] - (if (= 1 (count req-params)) - `(~'defn ~(symbol fn-name) - ~doc - ~@default-call) - `(~'defn ~(symbol fn-name) - ~doc - ~req-call - ~default-call)))) - -(def all-ndarray-api-functions - (mapv gen-ndarray-api-function op-names)) - -(def ndarray-api-gen-ns "(ns - ^{:doc \"Experimental\"} - org.apache.clojure-mxnet.ndarray-api - (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max - min repeat reverse set sort take to-array empty shuffle - ref]) - (:require [org.apache.clojure-mxnet.shape :as mx-shape] - [org.apache.clojure-mxnet.util :as util]) - (:import (org.apache.mxnet NDArrayAPI)))") - - -(defn generate-ndarray-api-file [] +(defn make-gen-ndarray-api-function + [{:keys [fn-name->fn-name] :or {fn-name->fn-name identity}}] + (fn [op-name] + (let [{:keys [fn-name fn-description args]} + (-> op-name (gen-op-info) (update :fn-name fn-name->fn-name)) + params (mapv (fn [{:keys [name] :as opts}] + (assoc opts :sym (symbol name))) + (conj args {:name "out" + :type "NDArray-or-Symbol" + :optional? true + :description "Output array."})) + doc (clojure.string/join + "\n\n " + (-> (gen-ndarray-api-doc fn-description params) + (clojure.string/split #"\n"))) + opt-params (filter :optional? params) + req-params (remove :optional? 
params) + req-call (gen-ndarray-api-required-arity fn-name req-params) + default-call (gen-ndarray-api-default-arity op-name params)] + (if (= 1 (count req-params)) + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call) + `(~'defn ~(symbol fn-name) + ~doc + ~req-call + ~default-call))))) + +(def gen-ndarray-api-function + (make-gen-ndarray-api-function {})) + +(def gen-ndarray-random-api-function + (make-gen-ndarray-api-function {:fn-name->fn-name fn-name->random-fn-name})) + +(defn all-ndarray-api-functions [op-names] + (->> op-names + (filter #(= :core (op-name->namespace-type %))) + (mapv gen-ndarray-api-function))) + +(count (all-ndarray-api-functions op-names)) ; 213 + +(defn all-ndarray-random-api-functions [op-names] + (->> op-names + (filter #(= :random (op-name->namespace-type %))) + (mapv gen-ndarray-random-api-function))) + +(count (all-ndarray-random-api-functions op-names)) ;16 + +(defn ndarray-api-gen-ns [random-namespace?] + (str + "(ns\n" + " ^{:doc \"Experimental\"}\n" + (if random-namespace? + " org.apache.clojure-mxnet.ndarray-random-api\n" + " org.apache.clojure-mxnet.ndarray-api\n") + " (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max\n" + " min repeat reverse set sort take to-array empty shuffle\n" + " ref])\n" + " (:require [org.apache.clojure-mxnet.shape :as mx-shape]\n" + " [org.apache.clojure-mxnet.util :as util])\n" + " (:import (org.apache.mxnet NDArrayAPI)))")) + +(defn generate-ndarray-api-file [op-names] (println "Generating ndarray-api file") - (write-to-file all-ndarray-api-functions - ndarray-api-gen-ns + (write-to-file (all-ndarray-api-functions op-names) + (ndarray-api-gen-ns false) "src/org/apache/clojure_mxnet/gen/ndarray_api.clj")) +(defn generate-ndarray-random-api-file [op-names] + (println "Generating ndarray-random-api file") + (write-to-file (all-ndarray-random-api-functions op-names) + (ndarray-api-gen-ns true) + "src/org/apache/clojure_mxnet/gen/ndarray_random_api.clj")) + + ;;; autogen the files (do (generate-ndarray-file) - (generate-ndarray-api-file) + + ;; NDArrayAPI + (generate-ndarray-api-file op-names) + (generate-ndarray-random-api-file op-names) + (generate-symbol-file) - (generate-symbol-api-file)) + + ;; SymbolAPI + (generate-symbol-api-file op-names) + (generate-symbol-random-api-file op-names)) (comment @@ -580,8 +676,14 @@ (gen-symbol-api-function "Activation") + (gen-ndarray-random-api-function "random_randint") + + (gen-ndarray-random-api-function "sample_normal") + + (gen-symbol-random-api-function "random_poisson") + ;; This generates a file with the bulk of the nd-array functions (generate-ndarray-file) ;; This generates a file with the bulk of the symbol functions - (generate-symbol-file) ) + (generate-symbol-file)) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj index 70359a6ef9b7..e222775c60f6 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj @@ -16,10 +16,10 @@ (ns org.apache.clojure-mxnet.ndarray-api "Experimental NDArray API" - (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max - min repeat reverse set sort take to-array empty shuffle - ref]) - + (:refer-clojure + :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) (:require [org.apache.clojure-mxnet.base :as base] 
[org.apache.clojure-mxnet.context :as mx-context] [org.apache.clojure-mxnet.shape :as mx-shape] diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj new file mode 100644 index 000000000000..1f45b6d4d646 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj @@ -0,0 +1,28 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.ndarray-random-api + "Experimental NDArray Random API" + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [clojure.reflect :as r] + [t6.from-scala.core :refer [$] :as $]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/ndarray_random_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj new file mode 100644 index 000000000000..76f6fdefc334 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj @@ -0,0 +1,32 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + +(ns org.apache.clojure-mxnet.symbol-random-api + "Experimental Symbol Random API" + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.executor :as ex] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [t6.from-scala.core :refer [$] :as $] + [org.apache.clojure-mxnet.ndarray :as ndarray]) + (:import (org.apache.mxnet SymbolAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/symbol_random_api")) diff --git a/contrib/clojure-package/test/dev/generator_test.clj b/contrib/clojure-package/test/dev/generator_test.clj index cf28241c59e8..acc81afcbcd5 100644 --- a/contrib/clojure-package/test/dev/generator_test.clj +++ b/contrib/clojure-package/test/dev/generator_test.clj @@ -27,6 +27,20 @@ (is (= "foo-bar" (gen/clojure-case "Foo_Bar"))) (is (= "div+" (gen/clojure-case "/+")))) +(deftest fn-name->random-fn-name + (is (= "poisson" (gen/fn-name->random-fn-name "-random-poisson"))) + (is (= "poisson-like" (gen/fn-name->random-fn-name "-sample-poisson")))) + +(deftest remove-prefix + (is (= "randint" (gen/remove-prefix "-random-" "-random-randint"))) + (is (= "exponential" (gen/remove-prefix "-sample-" "-sample-exponential")))) + +(deftest in-namespace-random? + (is (gen/in-namespace-random? "random_randint")) + (is (gen/in-namespace-random? "sample_poisson")) + (is (not (gen/in-namespace-random? "rnn"))) + (is (not (gen/in-namespace-random? "activation")))) + (defn ndarray-reflect-info [name] (->> gen/ndarray-public-no-default (filter #(= name (str (:name %)))) @@ -317,14 +331,25 @@ (deftest test-write-to-file (testing "symbol-api" (let [fname "test/test-symbol-api.clj" - _ (gen/write-to-file [(first gen/all-symbol-api-functions) - (second gen/all-symbol-api-functions)] - gen/symbol-api-gen-ns + fns (gen/all-symbol-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/symbol-api-gen-ns false) fname) good-contents (slurp "test/good-test-symbol-api.clj") contents (slurp fname)] (is (= good-contents contents)))) + (testing "symbol-random-api" + (let [fname "test/test-symbol-random-api.clj" + fns (gen/all-symbol-random-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/symbol-api-gen-ns true) + fname) + good-contents (slurp "test/good-test-symbol-random-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + + (testing "symbol" (let [fname "test/test-symbol.clj" _ (gen/write-to-file [(first gen/all-symbol-functions)] @@ -336,14 +361,24 @@ (testing "ndarray-api" (let [fname "test/test-ndarray-api.clj" - _ (gen/write-to-file [(first gen/all-ndarray-api-functions) - (second gen/all-ndarray-api-functions)] - gen/ndarray-api-gen-ns + fns (gen/all-ndarray-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/ndarray-api-gen-ns false) fname) good-contents (slurp "test/good-test-ndarray-api.clj") contents (slurp fname)] (is (= good-contents contents)))) + (testing "ndarray-random-api" + (let [fname "test/test-ndarray-random-api.clj" + fns (gen/all-ndarray-random-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/ndarray-api-gen-ns true) + fname) + good-contents (slurp "test/good-test-ndarray-random-api.clj") + contents (slurp fname)] + (is 
(= good-contents contents)))) + (testing "ndarray" (let [fname "test/test-ndarray.clj" _ (gen/write-to-file [(first gen/all-ndarray-functions)] diff --git a/contrib/clojure-package/test/good-test-ndarray-random-api.clj b/contrib/clojure-package/test/good-test-ndarray-random-api.clj new file mode 100644 index 000000000000..230e1033c008 --- /dev/null +++ b/contrib/clojure-package/test/good-test-ndarray-random-api.clj @@ -0,0 +1,95 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.ndarray-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). + + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. (optional)" + ([] (exponential {})) + ([{:keys [lam shape ctx dtype out], + :or {lam nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). + + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. 
(optional)" + ([] (gamma {})) + ([{:keys [alpha beta shape ctx dtype out], + :or {alpha nil, beta nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + diff --git a/contrib/clojure-package/test/good-test-symbol-random-api.clj b/contrib/clojure-package/test/good-test-symbol-random-api.clj new file mode 100644 index 000000000000..7202d2e27d12 --- /dev/null +++ b/contrib/clojure-package/test/good-test-symbol-random-api.clj @@ -0,0 +1,118 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.symbol-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + (:import (org.apache.mxnet SymbolAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). + + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [lam shape ctx dtype name attr], + :or {lam nil, shape nil, ctx nil, dtype nil, name nil, attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). 
+ + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [alpha beta shape ctx dtype name attr], + :or + {alpha nil, + beta nil, + shape nil, + ctx nil, + dtype nil, + name nil, + attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + diff --git a/cpp-package/tests/travis/setup.sh b/cpp-package/tests/travis/setup.sh index 5a3813ee34eb..e0c850ed39a9 100755 --- a/cpp-package/tests/travis/setup.sh +++ b/cpp-package/tests/travis/setup.sh @@ -19,5 +19,5 @@ if [ ${TASK} == "lint" ]; then - pip install cpplint 'pylint==1.4.4' 'astroid==1.3.6' --user + pip3 install cpplint 'pylint==2.3.1' --user fi diff --git a/docs/api/python/gluon/contrib.md b/docs/api/python/gluon/contrib.md index 790f6b496516..a940f697de69 100644 --- a/docs/api/python/gluon/contrib.md +++ b/docs/api/python/gluon/contrib.md @@ -59,6 +59,17 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p PixelShuffle3D ``` +### Convolutional neural network + +```eval_rst +.. currentmodule:: mxnet.gluon.contrib.cnn + +.. autosummary:: + :nosignatures: + + DeformableConvolution +``` + ### Recurrent neural network ```eval_rst @@ -116,6 +127,10 @@ In the rest of this document, we list routines provided by the `gluon.contrib` p .. automodule:: mxnet.gluon.contrib.nn :members: :imported-members: + +.. automodule:: mxnet.gluon.contrib.cnn + :members: + :imported-members: .. 
automodule:: mxnet.gluon.contrib.rnn :members: diff --git a/docs/api/python/ndarray/linalg.md b/docs/api/python/ndarray/linalg.md index 41436c3ba2d1..b73d9680a874 100644 --- a/docs/api/python/ndarray/linalg.md +++ b/docs/api/python/ndarray/linalg.md @@ -51,10 +51,14 @@ In the rest of this document, we list routines provided by the `ndarray.linalg` potri trmm trsm - sumlogdiag syrk gelqf syevd + sumlogdiag + extractdiag + makediag + extracttrian + maketrian ``` ## API Reference diff --git a/docs/api/python/symbol/linalg.md b/docs/api/python/symbol/linalg.md index f1891e29f896..5b467b501247 100644 --- a/docs/api/python/symbol/linalg.md +++ b/docs/api/python/symbol/linalg.md @@ -51,10 +51,14 @@ In the rest of this document, we list routines provided by the `symbol.linalg` p potri trmm trsm - sumlogdiag syrk gelqf syevd + sumlogdiag + extractdiag + makediag + extracttrian + maketrian ``` ## API Reference diff --git a/docs/architecture/note_data_loading.md b/docs/architecture/note_data_loading.md index 293b67572cb3..a60bf905d0e4 100644 --- a/docs/architecture/note_data_loading.md +++ b/docs/architecture/note_data_loading.md @@ -83,7 +83,7 @@ In MXNet, we rely on the binary recordIO format implemented in dmlc-core. In MXNet's binary RecordIO, we store each data instance as a record. **kMagic** is a *magic number* indicating the start of a record. **Lrecord** encodes length and a continue flag. -In lrecord, +In lrecord, - cflag == 0: this is a complete record - cflag == 1: start of a multiple-records - cflag == 2: middle of multiple-records @@ -228,7 +228,11 @@ dataiter = mx.io.ImageRecordIter( # Backend Parameter, preprocessing thread number preprocess_threads=4, # Backend Parameter, prefetch buffer size - prefetch_buffer=1) + prefetch_buffer=1, + # Optional, the device context which data loader optimized for, could be 'gpu' or 'cpu' + ctx="gpu", + # The out data type, could be 'float32' 'int8' or 'uint8' + dtype="float32") ``` Generally, to create a data iterator, you need to provide five kinds of parameters: diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 095c214e66b3..c5ebd54c55a1 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -80,16 +80,20 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_GPU_MEM_POOL_RESERVE - Values: Int ```(default=5)``` - The percentage of GPU memory to reserve for things other than the GPU array, such as kernel launch or cudnn handle space. - - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value. + - If you see a strange out-of-memory error from the kernel launch, after multiple iterations, try setting this to a larger value. + * MXNET_GPU_MEM_POOL_TYPE - Values: String ```(default=Naive)``` - The type of memory pool. - Choices: - Naive: A simple memory pool that allocates memory for the exact requested size and cache memory buffers. If a buffered memory chunk matches the size of a new request, the chunk from the memory pool will be returned and reused. - Round: A memory pool that always rounds the requested memory size and allocates memory of the rounded size. MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF defines how to round up a memory size. Caching and allocating buffered memory works in the same way as the naive memory pool. + - Unpooled: No memory pool is used. + * MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF - Values: Int ```(default=24)``` - The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. 
If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T. + * MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE - Values: Int ```(default=2097152)``` - When using the naive pool type, memory allocations larger than this threshhold are rounded up to a multiple of this value. diff --git a/docs/install/requirements.txt b/docs/install/requirements.txt index dfc3f70c96fb..b3620d607740 100644 --- a/docs/install/requirements.txt +++ b/docs/install/requirements.txt @@ -3,6 +3,6 @@ h5py==2.8.0rc1 nose nose-timer numpy<=1.15.2,>=1.8.2 -pylint==1.8.3 +pylint==2.3.1; python_version >= '3.0' requests<2.19.0,>=2.18.4 scipy==1.0.1 diff --git a/docs/tutorials/embedded/wine_detector.md b/docs/tutorials/embedded/wine_detector.md index f0ae8273203e..6a9372cda3b7 100644 --- a/docs/tutorials/embedded/wine_detector.md +++ b/docs/tutorials/embedded/wine_detector.md @@ -91,6 +91,7 @@ The next step is to create a python script to load the model, and run inference import mxnet as mx import numpy as np +import time import cv2, os, urllib from collections import namedtuple Batch = namedtuple('Batch', ['data']) @@ -100,13 +101,14 @@ with open('synset.txt', 'r') as f: synsets = [l.rstrip() for l in f] # Load the network parameters -sym, arg_params, aux_params = mx.model.load_checkpoint('Inception_BN', 0) +sym, arg_params, aux_params = mx.model.load_checkpoint('Inception-BN', 126) + # Load the network into an MXNet module and bind the corresponding parameters mod = mx.mod.Module(symbol=sym, context=mx.cpu()) mod.bind(for_training=False, data_shapes=[('data', (1,3,224,224))]) mod.set_params(arg_params, aux_params) - + ''' Function to predict objects by giving the model a pointer to an image file and running a forward pass through the model. @@ -129,14 +131,14 @@ def predict(filename, mod, synsets, N=5): img = np.swapaxes(img, 1, 2) img = img[np.newaxis, :] print "pre-processed image in "+str(time.time()-tic) - + toc = time.time() mod.forward(Batch([mx.nd.array(img)])) prob = mod.get_outputs()[0].asnumpy() prob = np.squeeze(prob) print "forward pass in "+str(time.time()-toc) - - + + topN = [] a = np.argsort(prob)[::-1] for i in a[0:N]: @@ -156,7 +158,7 @@ def predict_from_url(url, N=5): return predict(filename, mod, synsets, N) # Code to predict on a local file -def predict_from_local_file(filename, N=5): +def predict_from_local_file(filename, N=5): return predict(filename, mod, synsets, N) ``` @@ -164,11 +166,24 @@ Now that we have defined inception_predict.py we can test that the model is runn ```bash python ->>> import inception_predict ->>> predict_from_url("http://imgur.com/HzafyBA") +>>> from inception_predict import * +>>> predict_from_url("https://i.imgur.com/HzafyBA.jpg") ``` -This should give a reasonable prediction for the fluffy cow in this [image](http://imgur.com/HzafyBA). +This should give a reasonable prediction for the fluffy cow in this [image](http://imgur.com/HzafyBA). 
+ +``` +pre-processed image in 0.20366191864 +forward pass in 63.2164611816 +probability=0.718524, class=n02403003 ox +probability=0.176381, class=n02389026 sorrel +probability=0.095558, class=n03868242 oxcart +probability=0.002765, class=n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +probability=0.001262, class=n03935335 piggy bank, penny bank +[(0.71852392, 'n02403003 ox'), (0.17638102, 'n02389026 sorrel'), (0.09555836, 'n03868242 oxcart'), +(0.0027645244, 'n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis'), +(0.0012616422, 'n03935335 piggy bank, penny bank')] +``` ## Running an Inception on Real-Time Video From PiCamera @@ -194,11 +209,11 @@ while True: camera.start_preview() camera.capture(filename) camera.stop_preview() - + # Run inception prediction on image print "Predicting" topn = inception_predict.predict_from_local_file(filename, N=5) - + # Print the top N most likely objects in image (default set to 5, change this in the function call above) print topn ``` @@ -209,7 +224,7 @@ You can then run this file by entering the following command: python camera_test.py ``` -If camera_test.py is working you should see a preview every few seconds of the image that is being captured and fed to the model, as well as predicted classes for objects in the image being written to the terminal. +If camera_test.py is working you should see a preview every few seconds of the image that is being captured and fed to the model, as well as predicted classes for objects in the image being written to the terminal. Try pointing the PiCamera at a few different objects and see what predictions the network comes out with. @@ -248,13 +263,13 @@ def customCallback(client, userdata, message): # Usage usageInfo = """Usage: - + Use certificate based mutual authentication: python wine_alerter.py -e -r -c -k - + Use MQTT over WebSocket: python wine_alerter.py -e -r -w - + Type "python wine_alerter.py -h" for available options. """ @@ -272,7 +287,7 @@ helpInfo = """-e, --endpoint -h, --help Help information """ - + # Read in command-line parameters useWebsocket = False host = "" @@ -367,10 +382,10 @@ while True: camera.capture(filename) camera.stop_preview() topn = inception_predict.predict_from_local_file(filename, N=5) - + # Check if either of the top two predictions are wine related and publish a message if it is # you can change 'wine' here to anything you want to alert the server about detecting - if 'wine' in topn[0][1] or 'wine' in topn[1][1]: + if 'wine' in topn[0][1] or 'wine' in topn[1][1]: myAWSIoTMQTTClient.publish("sdk/test/Python", "New Message: WINE DETECTED!", 0) ``` diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 8dca7e86d509..01c59b16def7 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -40,12 +40,22 @@ vision/index.md ``` -MXNet tutorials can be found in this section. A variety of language bindings are available for MXNet (including Python, Scala, C++ and R) and we have a different tutorial section for each language. +MXNet tutorials can be found in this section. A variety of language bindings are available for MXNet (including Python, Scala, Java, Clojure, C++ and R) and we have a different tutorial section for each language. Are you new to MXNet, and don't have a preference on language? We currently recommend starting with Python, and specifically the Gluon APIs (versus Module APIs) as they're more flexible and easier to debug. 
Another great resource for learning MXNet is our [examples section](https://github.com/apache/incubator-mxnet/tree/master/example) which includes a wide variety of models (from basic to state-of-the-art) for a wide variety of tasks including: object detection, style transfer, reinforcement learning, and many others. +**Contents:** + * [Python Tutorials](#python-tutorials) + * [Scala Tutorials](#scala-tutorials) + * [Java Tutorials](#java-tutorials) + * [Clojure Tutorials](#clojure-tutorials) + * [C++ Tutorials](#c---tutorials) + * [R Tutorials](#r-tutorials) + * [Perl Tutorials](#perl-tutorials) + * [Contributing Tutorials](#contributing-tutorials) +
## Python Tutorials @@ -57,8 +67,8 @@ A comprehensive introduction to Gluon can be found at [Dive into Deep Learning]( Use the tutorial selector below to filter to the relevant tutorials. You might see a download link in the top right corner of some tutorials. Use this to download a Jupyter Notebook version of the tutorial, and re-run and adjust the code as you wish. - + Select API: 
@@ -186,6 +196,13 @@ Select API: * [MXNet-Java Examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
+## Clojure Tutorials +* [How to use the NDArray API to perform vector/matrix/tensor operations](../api/clojure/ndarray.html) +* [Multi-GPU and multi-host distributed training with the KVStore API](../api/clojure/kvstore.html) +* [How to use the Symbol API to assemble neural networks from layers](../api/clojure/symbol.html) +* [How to use the Module API](../api/clojure/module.html) (deprecated) +
+ ## C++ Tutorials * Models diff --git a/docs/tutorials/mkldnn/MKLDNN_README.md b/docs/tutorials/mkldnn/MKLDNN_README.md index c5779670cd87..ea8634f2d320 100644 --- a/docs/tutorials/mkldnn/MKLDNN_README.md +++ b/docs/tutorials/mkldnn/MKLDNN_README.md @@ -1,20 +1,20 @@ - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + # Build/Install MXNet with MKL-DNN A better training and inference performance is expected to be achieved on Intel-Architecture CPUs with MXNet built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating system, including Linux, Windows and MacOS. @@ -108,14 +108,13 @@ On Windows, you can use [Micrsoft Visual Studio 2015](https://www.visualstudio.c To build and install MXNet yourself, you need the following dependencies. Install the required dependencies: 1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition. -2. Download and Install [CMake 3](https://cmake.org/) if it is not already installed. -3. Download and install [OpenCV 3](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download). +2. Download and Install [CMake 3](https://cmake.org/files/v3.14/cmake-3.14.0-win64-x64.msi) if it is not already installed. +3. Download and install [OpenCV 3](https://sourceforge.net/projects/opencvlibrary/files/3.4.5/opencv-3.4.5-vc14_vc15.exe/download). 4. Unzip the OpenCV package. -5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable. -6. If you have Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. If you want to use MKL blas, you should set ```-DUSE_BLAS=mkl``` when cmake. Typically, you can find the directory in -```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```. -7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip`` along with openBLAS and add them to PATH. -8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```. +5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g.,```OpenCV_DIR = C:\opencv\build ```). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable. +6. If you have Intel Math Kernel Library (Intel MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. If you want to use MKL blas, you should set ```-DUSE_BLAS=mkl``` when cmake. Typically, you can find the directory in ```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl```. +7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). or build the latest version of OpenBLAS from source. 
Note that you should also download ```mingw64.dll.zip``` along with openBLAS and add them to PATH. +8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Downloads\OpenBLAS\```. After you have installed all of the required dependencies, build the MXNet source code: @@ -123,96 +122,107 @@ After you have installed all of the required dependencies, build the MXNet sourc ``` git clone --recursive https://github.com/apache/incubator-mxnet.git ``` - -2. Copy file `3rdparty/mkldnn/config_template.vcxproj` to incubator-mxnet root. - -3. Start a Visual Studio command prompt. - -4. Use [CMake 3](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the -[CMake 3](https://cmake.org/) command: +2. Start a Visual Studio command prompt by click windows Start menu>>Visual Studio 2015>>VS2015 X64 Native Tools Command Prompt, +go to MXNet source directory, for example +```cd C:\incubator-mxnet\``` +3. Use [CMake 3](https://cmake.org/) to create a Visual Studio solution in ```./build```. Make sure to specify the architecture in the +command: ``` mkdir build cd build cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release ``` - -5. In Visual Studio, open the solution file,```.sln```, and compile it. -These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. -Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/``` - -6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet. +4. Enable Intel MKL-DNN and Intel MKL as BLAS library by the command: +``` +"C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\bin\mklvars.bat" intel64 +cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=mkl -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -DMKL_ROOT="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl" -DUSE_MKL_IF_AVAILABLE=1 +``` +5. After the CMake successfully completed, compile the the MXNet source code by using following command: +```r +msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount +``` + Or + in Visual Studio, open the solution file,```.sln```, and compile it. + These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/``` + +6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml*.dll`, `libiomp5.dll`, `libopenblas*.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet. 
**Visual Studio 2017** To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies: 1. If [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) is not already installed, download and install it. You can download and install the free community edition. -2. Download and install [CMake 3](https://cmake.org/files/v3.11/cmake-3.11.0-rc4-win64-x64.msi) if it is not already installed. -3. Download and install [OpenCV](https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.4.1/opencv-3.4.1-vc14_vc15.exe/download). +2. Download and Install [CMake 3](https://cmake.org/files/v3.14/cmake-3.14.0-win64-x64.msi) if it is not already installed. +3. Download and install [OpenCV 3](https://sourceforge.net/projects/opencvlibrary/files/3.4.5/opencv-3.4.5-vc14_vc15.exe/download). 4. Unzip the OpenCV package. -5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g., ```OpenCV_DIR = C:\utils\opencv\build```). -6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](https://sourceforge.net/projects/openblas/files/v0.2.20/OpenBLAS%200.2.20%20version.zip/download). -7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories (e.g., ```OpenBLAS_HOME = C:\utils\OpenBLAS```). +5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g.,```OpenCV_DIR = C:\opencv\build ```). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc15\bin``` for example) to the ``PATH`` variable. +6. If you have Intel Math Kernel Library (Intel MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. If you want to use MKL blas, you should set ```-DUSE_BLAS=mkl``` when cmake. Typically, you can find the directory in ```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl```. +7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). or build the latest version of OpenBLAS from source. Note that you should also download ```mingw64.dll.zip``` along with openBLAS and add them to PATH. +8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Downloads\OpenBLAS\```. After you have installed all of the required dependencies, build the MXNet source code: 1. Start ```cmd``` in windows. - 2. Download the MXNet source code from GitHub by using following command: ```r cd C:\ git clone --recursive https://github.com/apache/incubator-mxnet.git -``` - -3. Copy file `3rdparty/mkldnn/config_template.vcxproj` to incubator-mxnet root. - -4. Follow [this link](https://docs.microsoft.com/en-us/visualstudio/install/modify-visual-studio) to modify ```Individual components```, and check ```VC++ 2017 version 15.4 v14.11 toolset```, and click ```Modify```. -5. Change the version of the Visual studio 2017 to v14.11 using the following command (by default the VS2017 is installed in the following path): - -```r -"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11 ``` - -6. 
Create a build dir using the following command and go to the directory, for example: +3. Start a Visual Studio command prompt by click windows Start menu>>Visual Studio 2017>>VS2017 X64 Native Tools Command Prompt, go to the MXNet source directory, for example ```r -mkdir C:\build -cd C:\build +cd C:\incubator-mxnet ``` -7. CMake the MXNet source code by using following command: + Step 4 and step 5 are optional, for who want to use vs2015 toolset in VS2017 environment. +4. (Optional) Follow [this link](https://docs.microsoft.com/en-us/visualstudio/install/modify-visual-studio) to modify ```Individual components```, and check ```VC++ 2017 version 15.4 v14.11 toolset```, and click ```Modify```. +5. (Optional)Change the version of the Visual studio 2017 to v14.11 using the following command (by default the VS2017 is installed in the following path): ```r -cmake -G "Visual Studio 15 2017 Win64" .. -T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release +"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11 +``` +6. Create a build dir using the following command and go to the directory, make the MXNet source code by using following command: +``` +mkdir build +cd build +cmake -G "Visual Studio 15 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release +``` +7. Enable Intel MKL-DNN and Intel MKL as BLAS library by the command: +``` +"C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\bin\mklvars.bat" intel64 +cmake -G "Visual Studio 15 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=mkl -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -DMKL_ROOT="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl" -DUSE_MKL_IF_AVAILABLE=1 ``` 8. After the CMake successfully completed, compile the the MXNet source code by using following command: - ```r msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount ``` -9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet. +9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml*.dll`, `libiomp5.dll`, `libopenblas*.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading MXNet.

Verify MXNet with python

+Preinstall python and some dependent modules: +``` +pip install numpy==1.15.2 graphviz==0.8.4 requests +set PYTHONPATH=[workdir]\incubator-mxnet\python +``` +or install mxnet ``` cd python sudo python setup.py install python -c "import mxnet as mx;print((mx.nd.ones((2, 3))*2).asnumpy());" - +``` Expected Output: - +``` [[ 2. 2. 2.] [ 2. 2. 2.]] ``` - ### Verify whether MKL-DNN works After MXNet is installed, you can verify if MKL-DNN backend works well with a single Convolution layer. - ``` import mxnet as mx import numpy as np @@ -295,7 +305,7 @@ Then by running above code snippet, you probably will get the following output m Numpy + Intel(R) MKL: THREADING LAYER: (null) Numpy + Intel(R) MKL: setting Intel(R) MKL to use INTEL OpenMP runtime Numpy + Intel(R) MKL: preloading libiomp5.so runtime -MKL_VERBOSE Intel(R) MKL 2018.0 Update 1 Product build 20171007 for Intel(R) 64 architecture Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) enabled processors, Lnx 2.40GHz lp64 intel_thread NMICDev:0 +MKL_VERBOSE Intel(R) MKL 2019.0 Update 3 Product build 20190125 for Intel(R) 64 architecture Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) enabled processors, Lnx 2.40GHz lp64 intel_thread NMICDev:0 MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1380,0x7f7f7400a280,12) 8.93ms CNR:OFF Dyn:1 FastMM:1 TID:0 NThr:40 WDiv:HOST:+0.000 ``` @@ -306,14 +316,14 @@ Graph optimization by subgraph feature are available in master branch. You can b ``` export MXNET_SUBGRAPH_BACKEND=MKLDNN ``` - -When `MKLDNN` backend is enabled, advanced control options are avaliable: - -``` -export MXNET_DISABLE_MKLDNN_CONV_OPT=1 # disable MKLDNN convolution optimization pass -export MXNET_DISABLE_MKLDNN_FC_OPT=1 # disable MKLDNN FullyConnected optimization pass -``` - + +When `MKLDNN` backend is enabled, advanced control options are avaliable: + +``` +export MXNET_DISABLE_MKLDNN_CONV_OPT=1 # disable MKLDNN convolution optimization pass +export MXNET_DISABLE_MKLDNN_FC_OPT=1 # disable MKLDNN FullyConnected optimization pass +``` + This limitations of this experimental feature are: diff --git a/example/gluon/super_resolution/data.py b/example/gluon/data.py similarity index 100% rename from example/gluon/super_resolution/data.py rename to example/gluon/data.py diff --git a/example/gluon/lstm_crf/lstm_crf.py b/example/gluon/lstm_crf/lstm_crf.py index 011dcfbc4aea..6cdc6e95a383 100644 --- a/example/gluon/lstm_crf/lstm_crf.py +++ b/example/gluon/lstm_crf/lstm_crf.py @@ -118,7 +118,7 @@ def _score_sentence(self, feats, tags_array): self.transitions.data()[to_scalar(tags_array[idx+1]), to_scalar(tags_array[idx])] + feat[to_scalar(tags_array[idx+1])] score = score + self.transitions.data()[self.tag2idx[STOP_TAG], - to_scalar(tags.array[int(tags_array.shape[0]-1)])] + to_scalar(tags_array[int(tags_array.shape[0]-1)])] return score def _viterbi_decode(self, feats): diff --git a/example/quantization/README.md b/example/quantization/README.md index fc9a26755b4e..93a14cf473ad 100644 --- a/example/quantization/README.md +++ b/example/quantization/README.md @@ -27,17 +27,19 @@ The following models have been tested on Linux systems. 
| Model | Source | Dataset | FP32 Accuracy (top-1/top-5)| INT8 Accuracy (top-1/top-5)| |:---|:---|---|:---:|:---:| +| [ResNet18-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) |70.07%/89.30%|69.85%/89.23%| | [ResNet50-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 75.87%/92.72% | 75.71%/92.65% | -| [ResNet101-V1](#4) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.3%/93.58% | 77.09%/93.41% | -|[Squeezenet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|57.01%/79.71%|56.62%/79.55%| -|[MobileNet 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|69.76%/89.32%|69.61%/89.09%| +| [ResNet101-V1](#3) | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html) | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec) | 77.3%/93.58% | 77.09%/93.41% | +|[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|57.01%/79.71%|56.62%/79.55%| +|[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|69.76%/89.32%|69.61%/89.09%| +|[MobileNetV2 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|70.14%/89.60%|69.53%/89.24%| |[Inception V3](#7)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.49%/93.10% |76.38%/93% | |[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.76%/93.03%|76.48%/92.96%| |[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.09%/90.60%|72.00%/90.53%| | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP | | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP | -

<h3 id='3'>ResNet50-V1</h3>
+<h3 id='3'>ResNet18/50/101-V1</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: @@ -52,47 +54,46 @@ The model would be automatically replaced in fusion and quantization format. It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --param-file=./model/resnet50_v1-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --param-file=./model/resnet50_v1-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --param-file=./model/resnet50_v1-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --param-file=./model/resnet50_v1-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True ``` -

<h3 id='4'>ResNet101-V1</h3>
+<h3 id='4'>SqueezeNet 1.0</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: ``` -python imagenet_gen_qsym_mkldnn.py --model=resnet101_v1 --num-calib-batches=5 --calib-mode=naive +python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive ``` - The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference. ``` # USE MKLDNN AS SUBGRAPH BACKEND export MXNET_SUBGRAPH_BACKEND=MKLDNN -# Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --param-file=./model/resnet101_v1-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +# Launch FP32 Inference +python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --param-file=./model/squeezenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --param-file=./model/resnet101_v1-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/squeezenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference -python imagenet_inference.py --symbol-file=./model/resnet101_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True -python imagenet_inference.py --symbol-file=./model/resnet101_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True ``` -

<h3 id='5'>SqueezeNet 1.0</h3>
+<h3 id='5'>MobileNet 1.0</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: ``` -python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive +python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive ``` The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference. @@ -101,22 +102,22 @@ The model would be automatically replaced in fusion and quantization format. It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --param-file=./model/squeezenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --param-file=./model/mobilenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/squeezenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference -python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True -python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True ``` -

<h3 id='6'>MobileNet 1.0</h3>
+<h3 id='6'>MobileNetV2 1.0</h3>
The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models: ``` -python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive +python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 --calib-mode=naive ``` The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference. @@ -125,14 +126,14 @@ The model would be automatically replaced in fusion and quantization format. It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --param-file=./model/mobilenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --param-file=./model/mobilenetv2_1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenetv2_1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference -python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True -python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True +python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True ```
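As an optional sanity check that the conversion above actually produced an INT8 graph, the generated quantized symbol file can be inspected and its quantization-related operators counted. This is a sketch, not part of the example scripts; the file path assumes the MobileNetV2 commands above were run from the `example/quantization` directory:

```python
import json

# Quantized symbol written by imagenet_gen_qsym_mkldnn.py above (assumed path).
symbol_file = './model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json'

with open(symbol_file) as f:
    graph = json.load(f)

# An MXNet symbol JSON holds one dict per node; quantization-related ops
# typically contain "quantize" in their op name.
quantized_nodes = [n['op'] for n in graph['nodes'] if 'quantize' in n['op']]
print('%d of %d nodes are quantization-related' % (len(quantized_nodes), len(graph['nodes'])))
```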

<h3 id='7'>Inception-V3</h3>

@@ -149,10 +150,10 @@ The model would be automatically replaced in fusion and quantization format. It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --param-file=./model/inceptionv3-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --param-file=./model/inceptionv3-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --param-file=./model/inceptionv3-quantized-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --param-file=./model/inceptionv3-quantized-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True @@ -174,10 +175,10 @@ The model would be automatically replaced in fusion and quantization format. It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --param-file=./model/imagenet1k-resnet-152-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --param-file=./model/imagenet1k-resnet-152-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True @@ -199,10 +200,10 @@ The model would be automatically replaced in fusion and quantization format. 
It export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --param-file=./model/imagenet1k-inception-bn-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --param-file=./model/imagenet1k-inception-bn-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu # Launch dummy data Inference python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True @@ -243,7 +244,7 @@ Some tips on quantization configs: export MXNET_SUBGRAPH_BACKEND=MKLDNN # Launch FP32 Inference -python imagenet_inference.py --symbol-file=./model/custom-symbol.json --param-file=./model/custom-0000.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/custom-symbol.json --param-file=./model/custom-0000.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu ``` 3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script. Notice that you should exclude conv/pool layers that have negative data since Intel® MKL-DNN only supports `uint8` quantization temporarily. You should also exclude all fc layers in your model. 
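As an illustration of tip 3, a minimal sketch of what such a branch could look like in `imagenet_gen_qsym_mkldnn.py`, following the same pattern the script uses for the built-in models elsewhere in this diff; the layer names (`custom_flatten0`, `custom_fc0`, `custom_conv0_fwd`) are placeholders for your own model's layers, not real names:

```python
    elif args.model == 'custom':
        # Normalization applied to the calibration dataset (adjust to your model).
        rgb_mean = '123.68,116.779,103.939'
        rgb_std = '58.393,57.12,57.375'
        # Exclude fully-connected layers and any conv/pool layers that receive
        # negative inputs, since only uint8 quantization is supported for now.
        excluded_sym_names += ['custom_flatten0', 'custom_fc0']
        if exclude_first_conv:
            excluded_sym_names += ['custom_conv0_fwd']
```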
@@ -260,7 +261,7 @@ python imagenet_gen_qsym_mkldnn.py --model=custom --num-calib-batches=5 --calib- ``` # Launch INT8 Inference -python imagenet_inference.py --symbol-file=./model/*.json --param-file=./model/*.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu --data-nthreads=1 +python imagenet_inference.py --symbol-file=./model/*.json --param-file=./model/*.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu # Launch dummy data Inference python imagenet_inference.py --symbol-file=./model/*.json --batch-size=* --num-inference-batches=500 --ctx=cpu --benchmark=True diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py index 2ef137273cca..06a1272caf21 100644 --- a/example/quantization/imagenet_gen_qsym_mkldnn.py +++ b/example/quantization/imagenet_gen_qsym_mkldnn.py @@ -92,11 +92,13 @@ def save_params(fname, arg_params, aux_params, logger=None): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support') - parser.add_argument('--model', type=str, choices=['resnet50_v1', + parser.add_argument('--model', type=str, choices=['resnet18_v1', + 'resnet50_v1', 'resnet101_v1', 'inceptionv3', 'squeezenet1.0', 'mobilenet1.0', + 'mobilenetv2_1.0', 'imagenet1k-resnet-152', 'imagenet1k-inception-bn', 'custom'], @@ -163,7 +165,13 @@ def save_params(fname, arg_params, aux_params, logger=None): download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset) # download model - if args.model in ['resnet50_v1', 'resnet101_v1', 'squeezenet1.0', 'mobilenet1.0', 'inceptionv3']: + if args.model in ['resnet18_v1', + 'resnet50_v1', + 'resnet101_v1', + 'squeezenet1.0', + 'mobilenet1.0', + 'mobilenetv2_1.0', + 'inceptionv3']: logger.info('model %s is converted from GluonCV' % args.model) args.use_gluon_model = True if args.use_gluon_model == True: @@ -216,7 +224,7 @@ def save_params(fname, arg_params, aux_params, logger=None): excluded_sym_names += ['flatten'] if exclude_first_conv: excluded_sym_names += ['conv_1'] - elif args.model in ['resnet50_v1', 'resnet101_v1']: + elif args.model in ['resnet18_v1', 'resnet50_v1', 'resnet101_v1']: rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' if exclude_first_conv: @@ -234,6 +242,12 @@ def save_params(fname, arg_params, aux_params, logger=None): 'mobilenet0_pool0_fwd'] if exclude_first_conv: excluded_sym_names += ['mobilenet0_conv0_fwd'] + elif args.model == 'mobilenetv2_1.0': + rgb_mean = '123.68,116.779,103.939' + rgb_std = '58.393, 57.12, 57.375' + excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0'] + if exclude_first_conv: + excluded_sym_names += ['mobilenetv20_conv0_fwd'] elif args.model == 'inceptionv3': rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py index 47e206303e99..e78546140403 100644 --- a/example/quantization/imagenet_inference.py +++ b/example/quantization/imagenet_inference.py @@ -217,45 +217,21 @@ def benchmark_score(symbol_file, ctx, batch_size, num_batches, data_layer_type, logger.info('Dataset for inference: %s' % dataset) # creating data iterator - if data_layer_type == 'int8': - data = mx.io.ImageRecordInt8Iter(path_imgrec=dataset, - label_width=1, - 
preprocess_threads=data_nthreads, - batch_size=batch_size, - data_shape=data_shape, - label_name=label_name, - rand_crop=False, - rand_mirror=False, - shuffle=args.shuffle_dataset, - shuffle_chunk_seed=args.shuffle_chunk_seed, - seed=args.shuffle_seed, - **combine_mean_std) - elif data_layer_type == 'uint8': - data = mx.io.ImageRecordUInt8Iter(path_imgrec=dataset, - label_width=1, - preprocess_threads=data_nthreads, - batch_size=batch_size, - data_shape=data_shape, - label_name=label_name, - rand_crop=False, - rand_mirror=False, - shuffle=args.shuffle_dataset, - shuffle_chunk_seed=args.shuffle_chunk_seed, - seed=args.shuffle_seed, - **combine_mean_std) - else: #float32 - data = mx.io.ImageRecordIter(path_imgrec=dataset, - label_width=1, - preprocess_threads=data_nthreads, - batch_size=batch_size, - data_shape=data_shape, - label_name=label_name, - rand_crop=False, - rand_mirror=False, - shuffle=args.shuffle_dataset, - shuffle_chunk_seed=args.shuffle_chunk_seed, - seed=args.shuffle_seed, - **combine_mean_std) + data = mx.io.ImageRecordIter( + path_imgrec=dataset, + label_width=1, + preprocess_threads=data_nthreads, + batch_size=batch_size, + data_shape=data_shape, + label_name=label_name, + rand_crop=False, + rand_mirror=False, + shuffle=args.shuffle_dataset, + shuffle_chunk_seed=args.shuffle_chunk_seed, + seed=args.shuffle_seed, + dtype=data_layer_type, + ctx=args.ctx, + **combine_mean_std) # loading model sym, arg_params, aux_params = load_model(symbol_file, param_file, logger) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 21018e3e1b06..f79f224029b2 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -823,6 +823,7 @@ MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayFromDLPack(DLManagedTensorHandle dlpack, NDArrayHandle *out_handle); + /*! * \brief Delete a dlpack tensor * \param dlpack the pointer of the input DLManagedTensor @@ -2064,7 +2065,6 @@ MXNET_DLL int MXExecutorReshapeEx(int partial_shaping, */ MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, SymbolHandle *out); - /*! * \brief set a call back to notify the completion of operation */ diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 58f222dc1e85..53414016e39e 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -16,7 +16,7 @@ # under the License. # coding: utf-8 -# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument +# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass """ctypes library of mxnet and helper functions.""" from __future__ import absolute_import diff --git a/python/mxnet/contrib/onnx/mx2onnx/_export_helper.py b/python/mxnet/contrib/onnx/mx2onnx/_export_helper.py index 781fb4cfbbc1..e73ff70fa5b0 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/_export_helper.py +++ b/python/mxnet/contrib/onnx/mx2onnx/_export_helper.py @@ -40,7 +40,7 @@ def load_module(sym_filepath, params_filepath): params : params object Model weights including both arg and aux params. 
""" - if not (os.path.isfile(sym_filepath) and os.path.isfile(params_filepath)): + if not (os.path.isfile(sym_filepath) and os.path.isfile(params_filepath)): # pylint: disable=no-else-raise raise ValueError("Symbol and params files provided are invalid") else: try: diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py index f9d170d81c13..35f4ff451cdb 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py +++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py @@ -762,7 +762,7 @@ def convert_leakyrelu(node, **kwargs): act_name = {"elu": "Elu", "leaky": "LeakyRelu", "prelu": "PRelu", "selu": "Selu"} - if act_type == "prelu" or act_type == "selu": + if act_type in ("prelu", "selu"): node = onnx.helper.make_node( act_name[act_type], inputs=input_nodes, diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py index ce55a0b7d66a..48ede28ab022 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py @@ -178,7 +178,7 @@ def _fix_channels(op_name, attrs, inputs, proto_obj): these attributes. We check the shape of weights provided to get the number. """ weight_name = inputs[1].name - if not weight_name in proto_obj._params: + if not weight_name in proto_obj._params: # pylint: disable=no-else-raise raise ValueError("Unable to get channels/units attr from onnx graph.") else: wshape = proto_obj._params[weight_name].shape diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index 9e5f8c1e2311..b94b5a8da32a 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -61,6 +61,7 @@ def _quantize_params(qsym, params, th_dict): if name.endswith(('weight_quantize', 'bias_quantize')): original_name = name[:-len('_quantize')] param = params[original_name] + # pylint: disable=unbalanced-tuple-unpacking val, vmin, vmax = ndarray.contrib.quantize(data=param, min_range=ndarray.min(param), max_range=ndarray.max(param), diff --git a/python/mxnet/contrib/tensorrt.py b/python/mxnet/contrib/tensorrt.py index 4ff39c4b4829..d94600e1cee3 100644 --- a/python/mxnet/contrib/tensorrt.py +++ b/python/mxnet/contrib/tensorrt.py @@ -16,95 +16,50 @@ # under the License. """ Module to enable the use of TensorRT optimized graphs.""" - -import ctypes -import logging import os -from .. import symbol as sym - -from ..base import _LIB, SymbolHandle, MXNetError -from ..base import check_call - - -def set_use_tensorrt(status): +def set_use_fp16(status): """ - Set an environment variable which will enable or disable the use of TensorRT in the backend. - Note: this is useful for A/B testing purposes. - :param status: Boolean, true if TensorRT optimization should be applied, False for legacy - behaviour. + Set an environment variable which will enable or disable the use of FP16 precision in + TensorRT + Note: The mode FP16 force the whole TRT node to be executed in FP16 + :param status: Boolean, True if TensorRT should run in FP16, False for FP32 """ - os.environ["MXNET_USE_TENSORRT"] = str(int(status)) - + os.environ["MXNET_TENSORRT_USE_FP16"] = str(int(status)) -def get_use_tensorrt(): +def get_use_fp16(): """ - Get an environment variable which describes if TensorRT is currently enabled in the backend. - Note: this is useful for A/B testing purposes. 
- :return: Boolean, true if TensorRT optimization should be applied, False for legacy - behaviour. + Get an environment variable which describes if TensorRT is currently running in FP16 + :return: Boolean, true if TensorRT is running in FP16, False for FP32 """ - return bool(int(os.environ.get("MXNET_USE_TENSORRT", 0)) == 1) + return bool(int(os.environ.get("MXNET_TENSORRT_USE_FP16", 1)) == 1) - -def get_optimized_symbol(executor): +def init_tensorrt_params(sym, arg_params, aux_params): """ - Take an executor's underlying symbol graph and return its generated optimized version. - - Parameters - ---------- - executor : - An executor for which you want to see an optimized symbol. Getting an optimized symbol - is useful to compare and verify the work TensorRT has done against a legacy behaviour. - - Returns - ------- - symbol : nnvm::Symbol - The nnvm symbol optimized. - """ - handle = SymbolHandle() - try: - check_call(_LIB.MXExecutorGetOptimizedSymbol(executor.handle, ctypes.byref(handle))) - result = sym.Symbol(handle=handle) - return result - except MXNetError: - logging.error('Error while trying to fetch TRT optimized symbol for graph. Please ensure ' - 'build was compiled with MXNET_USE_TENSORRT enabled.') - raise - - -def tensorrt_bind(symbol, ctx, all_params, type_dict=None, stype_dict=None, group2ctx=None, - **kwargs): - """Bind current symbol to get an optimized trt executor. - - Parameters - ---------- - symbol : Symbol - The symbol you wish to bind, and optimize with TensorRT. - - ctx : Context - The device context the generated executor to run on. - - all_params : Dict of str->ndarray - A dictionary of mappings from parameter names to parameter NDArrays. - - type_dict : Dict of str->numpy.dtype - Input type dictionary, name->dtype - - stype_dict : Dict of str->str - Input storage type dictionary, name->storage_type - - group2ctx : Dict of string to mx.Context - The dict mapping the `ctx_group` attribute to the context assignment. - - kwargs : Dict of str->shape - Input shape dictionary, name->shape - - Returns - ------- - executor : mxnet.Executor - An optimized TensorRT executor. 
+ Set weights in attributes of TensorRT nodes + :param sym: Symbol, the symbol graph should contains some TensorRT nodes + :param arg_params: arg_params + :param aux_params: aux_params + :return arg_params, aux_params: remaining params that are not in TensorRT nodes """ - kwargs['shared_buffer'] = all_params - return symbol.simple_bind(ctx, type_dict=type_dict, stype_dict=stype_dict, - group2ctx=group2ctx, **kwargs) + for s in sym.get_internals(): + new_params_names = "" + tensorrt_params = {} + if 'subgraph_params_names' in s.list_attr(): + keys = s.list_attr()['subgraph_params_names'].split(';') + for k in keys: + if k in arg_params: + new_params_names += k + ";" + tensorrt_params['subgraph_param_' + k] = arg_params[k] + arg_params.pop(k) + elif k in aux_params: + new_params_names += k + ";" + tensorrt_params['subgraph_param_' + k] = aux_params[k] + aux_params.pop(k) + new_attrs = {} + for k, v in tensorrt_params.items(): + new_attrs[k] = str(v.handle.value) + if len(new_attrs) > 0: + s._set_attr(**new_attrs) + s._set_attr(subgraph_params_names=new_params_names[:-1]) + return arg_params, aux_params diff --git a/python/mxnet/contrib/text/vocab.py b/python/mxnet/contrib/text/vocab.py index ede2ca535712..6e9920d601b6 100644 --- a/python/mxnet/contrib/text/vocab.py +++ b/python/mxnet/contrib/text/vocab.py @@ -210,7 +210,7 @@ def to_tokens(self, indices): tokens = [] for idx in indices: - if not isinstance(idx, int) or idx > max_idx: + if not isinstance(idx, int) or idx > max_idx: # pylint: disable=no-else-raise raise ValueError('Token index %d in the provided `indices` is invalid.' % idx) else: tokens.append(self.idx_to_token[idx]) diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index f708fb900227..83be8a39ba32 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -22,4 +22,6 @@ from . import rnn +from . import cnn + from . import data diff --git a/python/mxnet/gluon/contrib/cnn/__init__.py b/python/mxnet/gluon/contrib/cnn/__init__.py new file mode 100644 index 000000000000..501b9ea829b8 --- /dev/null +++ b/python/mxnet/gluon/contrib/cnn/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import +"""Contrib convolutional neural network module.""" + +from . import conv_layers +from .conv_layers import * + +__all__ = conv_layers.__all__ diff --git a/python/mxnet/gluon/contrib/cnn/conv_layers.py b/python/mxnet/gluon/contrib/cnn/conv_layers.py new file mode 100644 index 000000000000..9dd208702932 --- /dev/null +++ b/python/mxnet/gluon/contrib/cnn/conv_layers.py @@ -0,0 +1,221 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
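Usage sketch for the reworked `python/mxnet/contrib/tensorrt.py` API above: partition the graph for TensorRT, move the weights into the TensorRT subgraph nodes with `init_tensorrt_params`, and optionally force FP16. The `get_backend_symbol('TensorRT')` call, the checkpoint name, and the input shape are assumptions about the subgraph flow this PR targets, not something shown in the hunk itself:
```python
import mxnet as mx
from mxnet.contrib import tensorrt as trt   # assumes a TensorRT-enabled build

# Hypothetical FP32 checkpoint.
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet50_v1', 0)

trt_sym = sym.get_backend_symbol('TensorRT')            # assumed subgraph backend name
arg_params, aux_params = trt.init_tensorrt_params(trt_sym, arg_params, aux_params)
trt.set_use_fp16(True)                                   # optional: run the TRT engine in FP16

executor = trt_sym.simple_bind(ctx=mx.gpu(0), data=(1, 3, 224, 224), grad_req='null')
executor.copy_params_from(arg_params, aux_params)
out = executor.forward(is_train=False,
                       data=mx.nd.zeros((1, 3, 224, 224), ctx=mx.gpu(0)))[0]
```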
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ +"""Custom convolutional neural network layers in model_zoo.""" + +__all__ = ['DeformableConvolution'] + +from .... import symbol +from ...block import HybridBlock +from ....base import numeric_types +from ...nn import Activation + +class DeformableConvolution(HybridBlock): + """2-D Deformable Convolution v_1 (Dai, 2017). + Normal Convolution uses sampling points in a regular grid, while the sampling + points of Deformablem Convolution can be offset. The offset is learned with a + separate convolution layer during the training. Both the convolution layer for + generating the output features and the offsets are included in this gluon layer. + + Parameters + ---------- + channels : int, + The dimensionality of the output space + i.e. the number of output channels in the convolution. + kernel_size : int or tuple/list of 2 ints, (Default value = (1,1)) + Specifies the dimensions of the convolution window. + strides : int or tuple/list of 2 ints, (Default value = (1,1)) + Specifies the strides of the convolution. + padding : int or tuple/list of 2 ints, (Default value = (0,0)) + If padding is non-zero, then the input is implicitly zero-padded + on both sides for padding number of points. + dilation : int or tuple/list of 2 ints, (Default value = (1,1)) + Specifies the dilation rate to use for dilated convolution. + groups : int, (Default value = 1) + Controls the connections between inputs and outputs. + At groups=1, all inputs are convolved to all outputs. + At groups=2, the operation becomes equivalent to having two convolution + layers side by side, each seeing half the input channels, and producing + half the output channels, and both subsequently concatenated. + num_deformable_group : int, (Default value = 1) + Number of deformable group partitions. + layout : str, (Default value = NCHW) + Dimension ordering of data and weight. Can be 'NCW', 'NWC', 'NCHW', + 'NHWC', 'NCDHW', 'NDHWC', etc. 'N', 'C', 'H', 'W', 'D' stands for + batch, channel, height, width and depth dimensions respectively. + Convolution is performed over 'D', 'H', and 'W' dimensions. + use_bias : bool, (Default value = True) + Whether the layer for generating the output features uses a bias vector. + in_channels : int, (Default value = 0) + The number of input channels to this layer. If not specified, + initialization will be deferred to the first time `forward` is called + and input channels will be inferred from the shape of input data. + activation : str, (Default value = None) + Activation function to use. See :func:`~mxnet.ndarray.Activation`. + If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). 
+ weight_initializer : str or `Initializer`, (Default value = None) + Initializer for the `weight` weights matrix for the convolution layer + for generating the output features. + bias_initializer : str or `Initializer`, (Default value = zeros) + Initializer for the bias vector for the convolution layer + for generating the output features. + offset_weight_initializer : str or `Initializer`, (Default value = zeros) + Initializer for the `weight` weights matrix for the convolution layer + for generating the offset. + offset_bias_initializer : str or `Initializer`, (Default value = zeros), + Initializer for the bias vector for the convolution layer + for generating the offset. + offset_use_bias: bool, (Default value = True) + Whether the layer for generating the offset uses a bias vector. + + Inputs: + - **data**: 4D input tensor with shape + `(batch_size, in_channels, height, width)` when `layout` is `NCHW`. + For other layouts shape is permuted accordingly. + + Outputs: + - **out**: 4D output tensor with shape + `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`. + out_height and out_width are calculated as:: + + out_height = floor((height+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1 + out_width = floor((width+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1])+1 + """ + + def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1, + num_deformable_group=1, layout='NCHW', use_bias=True, in_channels=0, activation=None, + weight_initializer=None, bias_initializer='zeros', + offset_weight_initializer='zeros', offset_bias_initializer='zeros', offset_use_bias=True, + op_name='DeformableConvolution', adj=None, prefix=None, params=None): + super(DeformableConvolution, self).__init__(prefix=prefix, params=params) + with self.name_scope(): + self._channels = channels + self._in_channels = in_channels + + assert layout in ('NCHW', 'NHWC'), "Only supports 'NCHW' and 'NHWC' layout for now" + if isinstance(kernel_size, numeric_types): + kernel_size = (kernel_size,) * 2 + if isinstance(strides, numeric_types): + strides = (strides,) * len(kernel_size) + if isinstance(padding, numeric_types): + padding = (padding,) * len(kernel_size) + if isinstance(dilation, numeric_types): + dilation = (dilation,) * len(kernel_size) + self._op_name = op_name + + offset_channels = 2 * kernel_size[0] * kernel_size[1] * num_deformable_group + self._kwargs_offset = { + 'kernel': kernel_size, 'stride': strides, 'dilate': dilation, + 'pad': padding, 'num_filter': offset_channels, 'num_group': groups, + 'no_bias': not offset_use_bias, 'layout': layout} + + self._kwargs_deformable_conv = { + 'kernel': kernel_size, 'stride': strides, 'dilate': dilation, + 'pad': padding, 'num_filter': channels, 'num_group': groups, + 'num_deformable_group': num_deformable_group, + 'no_bias': not use_bias, 'layout': layout} + + if adj: + self._kwargs_offset['adj'] = adj + self._kwargs_deformable_conv['adj'] = adj + + dshape = [0] * (len(kernel_size) + 2) + dshape[layout.find('N')] = 1 + dshape[layout.find('C')] = in_channels + + op = getattr(symbol, 'Convolution') + offset = op(symbol.var('data', shape=dshape), **self._kwargs_offset) + + offsetshapes = offset.infer_shape_partial()[0] + + self.offset_weight = self.params.get('offset_weight', shape=offsetshapes[1], + init=offset_weight_initializer, + allow_deferred_init=True) + + if offset_use_bias: + self.offset_bias = self.params.get('offset_bias', shape=offsetshapes[2], + init=offset_bias_initializer, + 
allow_deferred_init=True) + else: + self.offset_bias = None + + deformable_conv_weight_shape = [0] * (len(kernel_size) + 2) + deformable_conv_weight_shape[0] = channels + deformable_conv_weight_shape[2] = kernel_size[0] + deformable_conv_weight_shape[3] = kernel_size[1] + + self.deformable_conv_weight = self.params.get('deformable_conv_weight', + shape=deformable_conv_weight_shape, + init=weight_initializer, + allow_deferred_init=True) + + if use_bias: + self.deformable_conv_bias = self.params.get('deformable_conv_bias', shape=(channels,), + init=bias_initializer, + allow_deferred_init=True) + else: + self.deformable_conv_bias = None + + if activation: + self.act = Activation(activation, prefix=activation + '_') + else: + self.act = None + + def hybrid_forward(self, F, x, offset_weight, deformable_conv_weight, offset_bias=None, deformable_conv_bias=None): + if offset_bias is None: + offset = F.Convolution(x, offset_weight, cudnn_off=True, **self._kwargs_offset) + else: + offset = F.Convolution(x, offset_weight, offset_bias, cudnn_off=True, **self._kwargs_offset) + + if deformable_conv_bias is None: + act = F.contrib.DeformableConvolution(data=x, offset=offset, weight=deformable_conv_weight, + name='fwd', **self._kwargs_deformable_conv) + else: + act = F.contrib.DeformableConvolution(data=x, offset=offset, weight=deformable_conv_weight, + bias=deformable_conv_bias, name='fwd', + **self._kwargs_deformable_conv) + + if self.act: + act = self.act(act) + return act + + def _alias(self): + return 'deformable_conv' + + def __repr__(self): + s = '{name}({mapping}, kernel_size={kernel}, stride={stride}' + len_kernel_size = len(self._kwargs_deformable_conv['kernel']) + if self._kwargs_deformable_conv['pad'] != (0,) * len_kernel_size: + s += ', padding={pad}' + if self._kwargs_deformable_conv['dilate'] != (1,) * len_kernel_size: + s += ', dilation={dilate}' + if hasattr(self, 'out_pad') and self.out_pad != (0,) * len_kernel_size: + s += ', output_padding={out_pad}'.format(out_pad=self.out_pad) + if self._kwargs_deformable_conv['num_group'] != 1: + s += ', groups={num_group}' + if self.deformable_conv_bias is None: + s += ', bias=False' + if self.act: + s += ', {}'.format(self.act) + s += ')' + shape = self.deformable_conv_weight.shape + return s.format(name=self.__class__.__name__, + mapping='{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]), + **self._kwargs_deformable_conv) diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index ebe136e30208..6cbf988fc94a 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -199,10 +199,10 @@ class SyncBatchNorm(BatchNorm): Initializer for the beta weight. gamma_initializer: str or `Initializer`, default 'ones' Initializer for the gamma weight. - moving_mean_initializer: str or `Initializer`, default 'zeros' - Initializer for the moving mean. - moving_variance_initializer: str or `Initializer`, default 'ones' - Initializer for the moving variance. + running_mean_initializer: str or `Initializer`, default 'zeros' + Initializer for the running mean. + running_variance_initializer: str or `Initializer`, default 'ones' + Initializer for the running variance. 
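A short usage sketch for the `DeformableConvolution` block defined above. Shapes are illustrative, and the underlying `contrib.DeformableConvolution` operator has historically shipped a GPU kernel, so a GPU context may be required depending on the build:
```python
import mxnet as mx
from mxnet.gluon.contrib.cnn import DeformableConvolution

ctx = mx.gpu(0)   # the deformable convolution operator may need a GPU context
net = DeformableConvolution(channels=16, kernel_size=(3, 3), padding=(1, 1),
                            num_deformable_group=1, in_channels=8, activation='relu')
net.initialize(ctx=ctx)

x = mx.nd.random.uniform(shape=(2, 8, 32, 32), ctx=ctx)   # NCHW input
y = net(x)
print(y.shape)   # (2, 16, 32, 32): 3x3 kernel, stride 1, padding 1 keeps H and W
```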
Inputs: diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index f8566dd05aa5..3d6976c32740 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -305,10 +305,10 @@ class BatchNorm(HybridBlock): Initializer for the beta weight. gamma_initializer: str or `Initializer`, default 'ones' Initializer for the gamma weight. - moving_mean_initializer: str or `Initializer`, default 'zeros' - Initializer for the moving mean. - moving_variance_initializer: str or `Initializer`, default 'ones' - Initializer for the moving variance. + running_mean_initializer: str or `Initializer`, default 'zeros' + Initializer for the running mean. + running_variance_initializer: str or `Initializer`, default 'ones' + Initializer for the running variance. in_channels : int, default 0 Number of channels (feature maps) in input data. If not specified, initialization will be deferred to the first time `forward` is called diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 2e130d498c14..f660b97f8789 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -16,7 +16,7 @@ # under the License. # coding: utf-8 -# pylint: disable= +# pylint: disable=unnecessary-pass """Neural network parameter.""" __all__ = ['DeferredInitializationError', 'Parameter', 'Constant', 'ParameterDict', 'tensor_types'] diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index a95417cf523b..6935c2752e1a 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -249,7 +249,7 @@ def _init_kvstore(self): @property def learning_rate(self): - if not isinstance(self._optimizer, opt.Optimizer): + if not isinstance(self._optimizer, opt.Optimizer): # pylint: disable=no-else-raise raise UserWarning("Optimizer has to be defined before its learning " "rate can be accessed.") else: @@ -263,7 +263,7 @@ def set_learning_rate(self, lr): lr : float The new learning rate of the optimizer. 
""" - if not isinstance(self._optimizer, opt.Optimizer): + if not isinstance(self._optimizer, opt.Optimizer): # pylint: disable=no-else-raise raise UserWarning("Optimizer has to be defined before its learning " "rate is mutated.") else: diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index b00cc043d493..861542220927 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -340,7 +340,7 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ break except Exception as e: retries -= 1 - if retries <= 0: + if retries <= 0: # pylint: disable=no-else-raise raise e else: print('download failed due to {}, retrying, {} attempt{} left' diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index d5b5ecab528a..a70e5723072f 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -809,7 +809,7 @@ def next(self): pad = batch_size - i # handle padding for the last batch if pad != 0: - if self.last_batch_handle == 'discard': + if self.last_batch_handle == 'discard': # pylint: disable=no-else-raise raise StopIteration # if the option is 'roll_over', throw StopIteration and cache the data elif self.last_batch_handle == 'roll_over' and \ diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 8bcf724ac4d2..f7dc27b72951 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -1374,7 +1374,7 @@ def next(self): pad = batch_size - i # handle padding for the last batch if pad != 0: - if self.last_batch_handle == 'discard': + if self.last_batch_handle == 'discard': # pylint: disable=no-else-raise raise StopIteration # if the option is 'roll_over', throw StopIteration and cache the data elif self.last_batch_handle == 'roll_over' and \ diff --git a/python/mxnet/io/io.py b/python/mxnet/io/io.py index 2bd1d6115ac3..2a42840bcf22 100644 --- a/python/mxnet/io/io.py +++ b/python/mxnet/io/io.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +# coding: utf-8 +# pylint: disable=unnecessary-pass """Data iterators for common data formats.""" from __future__ import absolute_import from collections import namedtuple diff --git a/python/mxnet/model.py b/python/mxnet/model.py index f44ff041e35d..9ff23b7afbc5 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -640,7 +640,7 @@ def _init_iter(self, X, y, is_train): """Initialize the iterator given input.""" if isinstance(X, (np.ndarray, nd.NDArray)): if y is None: - if is_train: + if is_train: # pylint: disable=no-else-raise raise ValueError('y must be specified when X is numpy.ndarray') else: y = np.zeros(X.shape[0]) diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index ca8463153686..754e369b4e63 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-# pylint: disable=fixme, too-many-arguments, too-many-locals +# pylint: disable=fixme, too-many-arguments, too-many-locals, no-else-raise # pylint: disable=too-many-public-methods, too-many-branches, too-many-lines """`BaseModule` defines an API for modules.""" diff --git a/python/mxnet/module/python_module.py b/python/mxnet/module/python_module.py index 886851efc305..df1648e82694 100644 --- a/python/mxnet/module/python_module.py +++ b/python/mxnet/module/python_module.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=too-many-instance-attributes, too-many-arguments +# pylint: disable=too-many-instance-attributes, too-many-arguments, unnecessary-pass """Provide some handy classes for user to implement a simple computation module in Python easily. """ diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 1718a2c68d13..601bc682db38 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -514,7 +514,7 @@ def isfinite(data): [0. 0. 0. 1.] """ - is_data_not_nan = data == data + is_data_not_nan = data == data # pylint: disable=comparison-with-itself is_data_not_infinite = data.abs() != np.inf return ndarray.logical_and(is_data_not_infinite, is_data_not_nan) @@ -542,7 +542,7 @@ def isnan(data): [1. 0.] """ - return data != data + return data != data # pylint: disable=comparison-with-itself def adamw_update(weight, grad, mean, var, rescale_grad, lr, eta, beta1=0.9, beta2=0.999, epsilon=1e-8, wd=0, clip_gradient=-1, out=None, name=None, **kwargs): diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 97cfd827c7fe..d912d38930a5 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -47,7 +47,7 @@ "imdecode", "lesser", "lesser_equal", "logical_and", "logical_or", "logical_xor", "maximum", "minimum", "moveaxis", "modulo", "multiply", "not_equal", "onehot_encode", "power", "subtract", "true_divide", "waitall", "_new_empty_handle", "histogram", - "split_v2", "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack"] + "split_v2", "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack", "from_numpy"] _STORAGE_TYPE_UNDEFINED = -1 _STORAGE_TYPE_DEFAULT = 0 @@ -4115,3 +4115,108 @@ def from_dlpack(dlpack): # delete the deleter of the old dlpack ctypes.pythonapi.PyCapsule_SetDestructor(dlpack, None) return NDArray(handle=handle) + +class DLContext(ctypes.Structure): + _fields_ = [("device_type", ctypes.c_int), + ("device_id", ctypes.c_int)] + + +class DLDataType(ctypes.Structure): + _fields_ = [("type_code", ctypes.c_uint8), + ("bits", ctypes.c_uint8), + ("lanes", ctypes.c_uint16)] + TYPE_MAP = { + "int32": (0, 32, 1), + "int64": (0, 64, 1), + "bool": (1, 1, 1), + "uint32": (1, 32, 1), + "uint64": (1, 64, 1), + "float32": (2, 32, 1), + "float64": (2, 64, 1), + } + + +class DLTensor(ctypes.Structure): + _fields_ = [("data", ctypes.c_void_p), + ("ctx", DLContext), + ("ndim", ctypes.c_int), + ("dtype", DLDataType), + ("shape", ctypes.POINTER(ctypes.c_int64)), + ("strides", ctypes.POINTER(ctypes.c_int64)), + ("byte_offset", ctypes.c_uint64)] + +class DLManagedTensor(ctypes.Structure): + pass + + +DeleterFunc = ctypes.CFUNCTYPE(None, ctypes.POINTER(DLManagedTensor)) + + +DLManagedTensor._fields_ = [("dl_tensor", DLTensor), # pylint: disable=protected-access + ("manager_ctx", ctypes.c_void_p), + ("deleter", DeleterFunc)] + + +@DeleterFunc +def dl_managed_tensor_deleter(dl_managed_tensor_handle): + void_p = 
dl_managed_tensor_handle.contents.manager_ctx + pyobj = ctypes.cast(void_p, ctypes.py_object) + ctypes.pythonapi.Py_DecRef(pyobj) + + +def from_numpy(ndarray, zero_copy=True): + """Returns an MXNet's NDArray backed by Numpy's ndarray. + + Parameters + ---------- + ndarray: numpy.ndarray + input data + + zero_copy: bool + Whether we use DLPack's zero-copy conversion to convert to MXNet's NDArray. + This is only available for c-contiguous arrays, i.e. array.flags[C_CONTIGUOUS] == True. + + Returns + ------- + NDArray + a NDArray backed by a dlpack tensor + + """ + + def _make_manager_ctx(obj): + pyobj = ctypes.py_object(obj) + void_p = ctypes.c_void_p.from_buffer(pyobj) + ctypes.pythonapi.Py_IncRef(pyobj) + return void_p + + def _make_dl_tensor(array): + if str(array.dtype) not in DLDataType.TYPE_MAP: + raise ValueError(str(array.dtype) + " is not supported.") + dl_tensor = DLTensor() + dl_tensor.data = array.ctypes.data_as(ctypes.c_void_p) + dl_tensor.ctx = DLContext(1, 0) + dl_tensor.ndim = array.ndim + dl_tensor.dtype = DLDataType.TYPE_MAP[str(array.dtype)] + dl_tensor.shape = array.ctypes.shape_as(ctypes.c_int64) + dl_tensor.strides = None + dl_tensor.byte_offset = 0 + return dl_tensor + + def _make_dl_managed_tensor(array): + c_obj = DLManagedTensor() + c_obj.dl_tensor = _make_dl_tensor(array) + c_obj.manager_ctx = _make_manager_ctx(array) + c_obj.deleter = dl_managed_tensor_deleter + return c_obj + + if not zero_copy: + return array(ndarray, dtype=ndarray.dtype) + + if not ndarray.flags['C_CONTIGUOUS']: + raise ValueError("Only c-contiguous arrays are supported for zero-copy") + c_obj = _make_dl_managed_tensor(ndarray) + address = ctypes.addressof(c_obj) + address = ctypes.cast(address, ctypes.c_void_p) + handle = NDArrayHandle() + check_call(_LIB.MXNDArrayFromDLPack(address, ctypes.byref(handle))) + return NDArray(handle=handle) diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 928079749db5..4987cb57b6ea 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -639,7 +639,7 @@ def __getitem__(self, key): if isinstance(key, int): raise Exception("__getitem__ with int key is not implemented for RowSparseNDArray yet") if isinstance(key, py_slice): - if key.step is not None or key.start is not None or key.stop is not None: + if key.step is not None or key.start is not None or key.stop is not None: # pylint: disable=no-else-raise raise Exception('RowSparseNDArray only supports [:] for __getitem__') else: return self @@ -1102,7 +1102,7 @@ def row_sparse_array(arg1, shape=None, ctx=None, dtype=None): # construct a row sparse array from (D0, D1 ..) or (data, indices) if isinstance(arg1, tuple): arg_len = len(arg1) - if arg_len < 2: + if arg_len < 2: # pylint: disable=no-else-raise raise ValueError("Unexpected length of input tuple: " + str(arg_len)) elif arg_len > 2: # empty ndarray with shape diff --git a/python/mxnet/ndarray_doc.py b/python/mxnet/ndarray_doc.py index 9d6258a89a3d..20e541f94d0b 100644 --- a/python/mxnet/ndarray_doc.py +++ b/python/mxnet/ndarray_doc.py @@ -16,7 +16,7 @@ # under the License. 
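A small sketch of the new zero-copy `from_numpy` path added above. The array must be C-contiguous and its dtype must appear in `DLDataType.TYPE_MAP`; non-contiguous inputs can fall back to a copy with `zero_copy=False`:
```python
import numpy as np
import mxnet as mx

a = np.arange(12, dtype=np.float32).reshape(3, 4)   # C-contiguous float32
b = mx.nd.from_numpy(a)                              # zero-copy, backed by a DLPack tensor
print(b)

# A Fortran-ordered view is not C-contiguous, so request an explicit copy instead.
c = mx.nd.from_numpy(np.asfortranarray(a), zero_copy=False)
```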
# coding: utf-8 -# pylint: disable=unused-argument, too-many-arguments +# pylint: disable=unused-argument, too-many-arguments, unnecessary-pass """Extra symbol documents""" from __future__ import absolute_import as _abs import re as _re diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py index 2c69b9b46521..33e9b89a032c 100644 --- a/python/mxnet/operator.py +++ b/python/mxnet/operator.py @@ -16,7 +16,7 @@ # under the License. # coding: utf-8 -# pylint: disable=invalid-name, protected-access, too-many-arguments, no-self-use, too-many-locals, broad-except, too-many-lines +# pylint: disable=invalid-name, protected-access, too-many-arguments, no-self-use, too-many-locals, broad-except, too-many-lines, unnecessary-pass """numpy interface for operators.""" from __future__ import absolute_import diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 2e7fe86c5af9..613ae8985aca 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -298,7 +298,7 @@ def set_learning_rate(self, lr): lr : float The new learning rate of the optimizer. """ - if self.lr_scheduler is not None: + if self.lr_scheduler is not None: # pylint: disable=no-else-raise raise UserWarning("LRScheduler of the optimizer has already been " "defined. Note that set_learning_rate can mutate " "the value of the learning rate of the optimizer " diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index bdc63235d702..225df3beb0dc 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -80,6 +80,8 @@ def open(self): self.writable = False else: raise ValueError("Invalid flag %s"%self.flag) + # pylint: disable=not-callable + # It's bug from pylint(astroid). See https://github.com/PyCQA/pylint/issues/1699 self.pid = current_process().pid self.is_open = True @@ -114,6 +116,8 @@ def __setstate__(self, d): def _check_pid(self, allow_reset=False): """Check process id to ensure integrity, reset if in new process.""" + # pylint: disable=not-callable + # It's bug from pylint(astroid). 
See https://github.com/PyCQA/pylint/issues/1699 if not self.pid == current_process().pid: if allow_reset: self.reset() diff --git a/python/mxnet/rnn/rnn_cell.py b/python/mxnet/rnn/rnn_cell.py index 6738aa19e9cd..cc9e6067e9ee 100644 --- a/python/mxnet/rnn/rnn_cell.py +++ b/python/mxnet/rnn/rnn_cell.py @@ -515,7 +515,7 @@ def __call__(self, inputs, states): bias=self._hB, num_hidden=self._num_hidden * 3, name="%s_h2h" % name) - + # pylint: disable=unbalanced-tuple-unpacking i2h_r, i2h_z, i2h = symbol.SliceChannel(i2h, num_outputs=3, name="%s_i2h_slice" % name) h2h_r, h2h_z, h2h = symbol.SliceChannel(h2h, num_outputs=3, name="%s_h2h_slice" % name) @@ -1419,7 +1419,7 @@ def __call__(self, inputs, states): seq_idx = self._counter name = '%st%d_' % (self._prefix, seq_idx) i2h, h2h = self._conv_forward(inputs, states, name) - + # pylint: disable=unbalanced-tuple-unpacking i2h_r, i2h_z, i2h = symbol.SliceChannel(i2h, num_outputs=3, name="%s_i2h_slice" % name) h2h_r, h2h_z, h2h = symbol.SliceChannel(h2h, num_outputs=3, name="%s_h2h_slice" % name) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 4bf60a6a1fcd..467d612700ec 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -90,7 +90,7 @@ def __iter__(self): """ - return (self[i] for i in self.list_outputs()) + return (self[i] for i in range(len(self))) def __add__(self, other): """x.__add__(y) <=> x+y @@ -1013,7 +1013,6 @@ def _infer_type_impl(self, partial, *args, **kwargs): return (arg_types, out_types, aux_types) else: return (None, None, None) - # pylint: enable=too-many-locals def infer_shape(self, *args, **kwargs): """Infers the shapes of all arguments and all outputs given the known shapes of @@ -1071,6 +1070,7 @@ def infer_shape(self, *args, **kwargs): List of auxiliary state shapes. The order is same as the order of list_auxiliary_states(). 
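The `__iter__` change above yields outputs by position instead of by output name, which avoids ambiguity when a grouped symbol has duplicate output names. A tiny illustration with hypothetical symbols:
```python
import mxnet as mx

a = mx.sym.var('a')
b = mx.sym.var('b')
grouped = mx.sym.Group([a + b, a - b, a * b])

# Each element is a single-output Symbol, yielded in output order.
for i, out in enumerate(grouped):
    print(i, out.list_outputs())
```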
""" + # pylint: disable=too-many-locals try: res = self._infer_shape_impl(False, *args, **kwargs) if res[1] is None: diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index bbb12dd5d7af..d80fab58be42 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -206,7 +206,7 @@ def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): return mx.nd.array(output_arr).tostype("csr") col_max = col_max * 2 - if unused_nnz > 0: + if unused_nnz > 0: # pylint: disable=no-else-raise raise ValueError("not supported for this density: %s" " for this shape (%s,%s)" % (density, num_rows, num_cols)) else: @@ -1348,7 +1348,7 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', except AssertionError as e: print('Predict Err: ctx %d vs ctx %d at %s'%(i, max_idx, name)) traceback.print_exc() - if raise_on_err: + if raise_on_err: # pylint: disable=no-else-raise raise e else: print(str(e)) @@ -1375,7 +1375,7 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', except AssertionError as e: print('Train Err: ctx %d vs ctx %d at %s'%(i, max_idx, name)) traceback.print_exc() - if raise_on_err: + if raise_on_err: # pylint: disable=no-else-raise raise e else: print(str(e)) @@ -1455,7 +1455,7 @@ def download(url, fname=None, dirname=None, overwrite=False, retries=5): break except Exception as e: retries -= 1 - if retries <= 0: + if retries <= 0: # pylint: disable=no-else-raise raise e else: print("download failed, retrying, {} attempt{} left" @@ -1536,7 +1536,7 @@ def get_mnist_iterator(batch_size, input_shape, num_parts=1, part_index=0): """ get_mnist_ubyte() - flat = False if len(input_shape) == 3 else True + flat = False if len(input_shape) == 3 else True # pylint: disable=simplifiable-if-expression train_dataiter = mx.io.MNISTIter( image="data/train-images-idx3-ubyte", @@ -1990,7 +1990,7 @@ def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='defa if w_stype == 'default': w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': + elif w_stype in ('row_sparse', 'csr'): w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) w1 = w2.copyto(default_context()).tostype('default') else: @@ -1998,7 +1998,7 @@ def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='defa if g_stype == 'default': g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': + elif g_stype in ('row_sparse', 'csr'): g2 = rand_ndarray(shape, g_stype, dtype=dtype) g1 = g2.copyto(default_context()).tostype('default') else: diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala index 3be8e060fd6f..7fbdae5b3e21 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala @@ -247,11 +247,23 @@ abstract class BaseModule { /** * Run prediction and collect the outputs. - * @param evalData + * @param evalData dataIter to do the Inference * @param numBatch Default is -1, indicating running all the batches in the data iterator. * @param reset Default is `True`, indicating whether we should reset the data iter before start * doing prediction. * @return The return value will be a list `[out1, out2, out3]`. 
+ * The concatenation process will be like + * {{{ + * outputBatches = [ + * [a1, a2, a3], // batch a + * [b1, b2, b3] // batch b + * ] + * result = [ + * NDArray, // [a1, b1] + * NDArray, // [a2, b2] + * NDArray, // [a3, b3] + * ] + * }}} * Where each element is concatenation of the outputs for all the mini-batches. */ def predict(evalData: DataIter, numBatch: Int = -1, reset: Boolean = true) @@ -264,7 +276,8 @@ abstract class BaseModule { s"in mini-batches (${out.size})." + "Maybe bucketing is used?") ) - val concatenatedOutput = outputBatches.map(out => NDArray.concatenate(out)) + val oBT = outputBatches.transpose + val concatenatedOutput = oBT.map(out => NDArray.concatenate(out)) outputBatches.foreach(_.foreach(_.dispose())) concatenatedOutput } diff --git a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java index 86c7eb29d2ef..1b7042d49795 100644 --- a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java +++ b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java @@ -86,7 +86,7 @@ public void testGenerated(){ NDArray$ NDArray = NDArray$.MODULE$; float[] arr = new float[]{1.0f, 2.0f, 3.0f}; NDArray nd = new NDArray(arr, new Shape(new int[]{3}), new Context("cpu", 0)); - float result = NDArray.norm(NDArray.new normParam(nd))[0].toArray()[0]; + float result = NDArray.norm(new normParam(nd))[0].toArray()[0]; float cal = 0.0f; for (float ele : arr) { cal += ele * ele; @@ -94,7 +94,7 @@ public void testGenerated(){ cal = (float) Math.sqrt(cal); assertTrue(Math.abs(result - cal) < 1e-5); NDArray dotResult = new NDArray(new float[]{0}, new Shape(new int[]{1}), new Context("cpu", 0)); - NDArray.dot(NDArray.new dotParam(nd, nd).setOut(dotResult)); + NDArray.dot(new dotParam(nd, nd).setOut(dotResult)); assertTrue(Arrays.equals(dotResult.toArray(), new float[]{14.0f})); } } diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala index 3e753a18d247..5aed01bde693 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala @@ -23,6 +23,34 @@ import org.apache.mxnet.optimizer._ import org.apache.mxnet.io._ class ModuleSuite extends FunSuite with BeforeAndAfterAll { + + class myModule(symbol : Symbol) extends Module (symbol) { + override def predictEveryBatch(evalData: DataIter, + numBatch: Int = 1, reset: Boolean = true): + IndexedSeq[IndexedSeq[NDArray]] = { + val data = IndexedSeq( + NDArray.ones(Shape(1, 10, 1)), + NDArray.ones(Shape(1, 10, 1)), + NDArray.ones(Shape(1, 10, 4)) + ) + List.fill(numBatch)(data).toIndexedSeq + } + } + + test("predict") { + val sym = Symbol.Variable("data") + val mod = new myModule(sym) + val dummyIter = new NDArrayIter(IndexedSeq(NDArray.ones(1))) + var output = mod.predict(dummyIter, 1) + require(output(0).shape == Shape(1, 10, 1)) + require(output(1).shape == Shape(1, 10, 1)) + require(output(2).shape == Shape(1, 10, 4)) + output = mod.predict(dummyIter, 2) + require(output(0).shape == Shape(2, 10, 1)) + require(output(1).shape == Shape(2, 10, 1)) + require(output(2).shape == Shape(2, 10, 4)) + } + test ("model dtype") { val dType = DType.Float32 val dShape = Shape(3, 8, 7) diff --git a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java 
b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java index b40a4e94afbd..dd17b1d4a0a5 100644 --- a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java +++ b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java @@ -68,15 +68,15 @@ private static int argmax(float[] prob) { */ static List postProcessing(NDArray result, List tokens) { NDArray[] output = NDArray.split( - NDArray.new splitParam(result, 2).setAxis(2)); + new splitParam(result, 2).setAxis(2)); // Get the formatted logits result NDArray startLogits = output[0].reshape(new int[]{0, -3}); NDArray endLogits = output[1].reshape(new int[]{0, -3}); // Get Probability distribution float[] startProb = NDArray.softmax( - NDArray.new softmaxParam(startLogits))[0].toArray(); + new softmaxParam(startLogits))[0].toArray(); float[] endProb = NDArray.softmax( - NDArray.new softmaxParam(endLogits))[0].toArray(); + new softmaxParam(endLogits))[0].toArray(); int startIdx = argmax(startProb); int endIdx = argmax(endProb); return tokens.subList(startIdx, endIdx + 1); diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala index a5102d6624ef..e939b2ebf9e7 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala @@ -152,7 +152,8 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { def javaClassGen(FILE_PATH : String) : String = { val notGenerated = Set("Custom") val absClassFunctions = functionsToGenerate(false, false, true) - val absFuncs = absClassFunctions.filterNot(ele => notGenerated.contains(ele.name)) + val (absFuncs, paramClassUncleaned) = + absClassFunctions.filterNot(ele => notGenerated.contains(ele.name)) .groupBy(_.name.toLowerCase).map(ele => { /* Pattern matching for not generating deprecated method * Group all method name in lowercase @@ -166,7 +167,8 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { } }).map(absClassFunction => { generateJavaAPISignature(absClassFunction) - }).toSeq + }).toSeq.unzip + val paramClass = paramClassUncleaned.filterNot(_.isEmpty) val packageName = "NDArrayBase" val packageDef = "package org.apache.mxnet.javaapi" writeFile( @@ -174,7 +176,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { packageDef, packageName, "import org.apache.mxnet.annotation.Experimental", - absFuncs) + absFuncs, Some(paramClass)) } /** @@ -248,7 +250,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { * @param func The function case class * @return A formatted string for the function */ - def generateJavaAPISignature(func : Func) : String = { + def generateJavaAPISignature(func : Func) : (String, String) = { val useParamObject = func.listOfArgs.count(arg => arg.isOptional) >= 2 var argDef = ListBuffer[String]() var classDef = ListBuffer[String]() @@ -287,22 +289,23 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { | } | def getOut() = this.out | """.stripMargin - s"""$scalaDocNoParam + (s"""$scalaDocNoParam | $experimentalTag | def ${func.name}(po: ${func.name}Param) : $returnType - | /** + | """.stripMargin, + s"""/** | * This Param Object is specifically used for ${func.name} | ${requiredParam.mkString("\n")} | */ | class 
${func.name}Param(${argDef.mkString(",")}) { | ${classDef.mkString("\n ")} - | }""".stripMargin + | }""".stripMargin) } else { argDef += "out : NDArray" - s"""$scalaDoc + (s"""$scalaDoc |$experimentalTag | def ${func.name}(${argDef.mkString(", ")}) : $returnType - | """.stripMargin + | """.stripMargin, "") } } @@ -316,7 +319,8 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { * @return A MD5 string */ def writeFile(FILE_PATH: String, packageDef: String, className: String, - imports: String, absFuncs: Seq[String]): String = { + imports: String, absFuncs: Seq[String], + paramClass: Option[Seq[String]] = None): String = { val finalStr = s"""/* @@ -343,7 +347,9 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { |// scalastyle:off |abstract class $className { |${absFuncs.mkString("\n")} - |}""".stripMargin + |} + |${paramClass.getOrElse(Seq()).mkString("\n")} + |""".stripMargin val pw = new PrintWriter(new File(FILE_PATH + s"$className.scala")) diff --git a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java index 32e2d84dcdbf..4361c06edf32 100644 --- a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java +++ b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java @@ -37,7 +37,7 @@ public static void main(String[] args) { // random NDArray random = NDArray.random_uniform( - NDArray.new random_uniformParam() + new random_uniformParam() .setLow(0.0f) .setHigh(2.0f) .setShape(new Shape(new int[]{10, 10})) diff --git a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java index 56a414307f46..646adf5550b1 100644 --- a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java +++ b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java @@ -38,7 +38,7 @@ public static void main(String[] args) { System.out.println(eleAdd); // norm (L2 Norm) - NDArray normed = NDArray.norm(NDArray.new normParam(nd))[0]; + NDArray normed = NDArray.norm(new normParam(nd))[0]; System.out.println(normed); } } diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index 5352fcfe0951..8fade7df223e 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -29,9 +29,6 @@ #include "./c_api_common.h" #include "../executor/graph_executor.h" #include "../common/utils.h" -#if MXNET_USE_TENSORRT -#include "../executor/trt_graph_executor.h" -#endif // MXNET_USE_TENSORRT int MXExecutorPrint(ExecutorHandle handle, const char **out_str) { Executor *exec = static_cast(handle); @@ -448,38 +445,12 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, std::vector in_arg_vec; std::vector arg_grad_vec; std::vector aux_state_vec; -#if MXNET_USE_TENSORRT - // If we've built with TensorRT support we by default return an TRTExecutor. - // Users can override this behaviour via env var, which is useful for example for A/B - // performance testing. - if (dmlc::GetEnv("MXNET_USE_TENSORRT", false)) { - *out = exec::TrtGraphExecutor::TensorRTBind(*sym, ctx, ctx_map, &in_arg_ctx_vec, - &arg_grad_ctx_vec, &aux_state_ctx_vec, - &arg_shape_map, &arg_dtype_map, &arg_stype_map, - &grad_req_type_vec, shared_arg_name_set, - &in_arg_vec, &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? 
&shared_buffer_map : nullptr, - reinterpret_cast(shared_exec_handle)); - } else { - // Checks to see if this env var has been set to true or false by the user. - // If the user is using a TensorRT build, but has not enabled TRT at inference time, warn - // them and describe further steps. - const int unset_indicator = std::numeric_limits::quiet_NaN(); - if (dmlc::GetEnv("MXNET_USE_TENSORRT", unset_indicator) == unset_indicator) { - LOG(INFO) << "TensorRT not enabled by default. Please set the MXNET_USE_TENSORRT " - "environment variable to 1 or call mx.contrib.tensorrt.set_use_tensorrt(True) " - "to enable."; - } -#endif // MXNET_USE_TENSORRT - *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, - aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, - grad_req_type_vec, shared_arg_name_set, &in_arg_vec, - &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? &shared_buffer_map : nullptr, - reinterpret_cast(shared_exec_handle)); -#if MXNET_USE_TENSORRT - } -#endif // MXNET_USE_TENSORRT + *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); // copy ndarray ptrs to ret->handles so that front end // can access them @@ -808,38 +779,12 @@ int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, std::vector in_arg_vec; std::vector arg_grad_vec; std::vector aux_state_vec; -#if MXNET_USE_TENSORRT - // If we've built with TensorRT support we by default return an TRTExecutor. - // Users can override this behaviour via env var, which is useful for example for A/B - // performance testing. - if (dmlc::GetEnv("MXNET_USE_TENSORRT", false)) { - *out = exec::TrtGraphExecutor::TensorRTBind(*sym, ctx, ctx_map, &in_arg_ctx_vec, - &arg_grad_ctx_vec, &aux_state_ctx_vec, - &arg_shape_map, &arg_dtype_map, &arg_stype_map, - &grad_req_type_vec, shared_arg_name_set, - &in_arg_vec, &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? &shared_buffer_map : nullptr, - reinterpret_cast(shared_exec_handle)); - } else { - // Checks to see if this env var has been set to true or false by the user. - // If the user is using a TensorRT build, but has not enabled TRT at inference time, warn - // them and describe further steps. - const int unset_indicator = std::numeric_limits::quiet_NaN(); - if (dmlc::GetEnv("MXNET_USE_TENSORRT", unset_indicator) == unset_indicator) { - LOG(INFO) << "TensorRT not enabled by default. Please set the MXNET_USE_TENSORRT " - "environment variable to 1 or call mx.contrib.tensorrt.set_use_tensorrt(True) " - "to enable."; - } -#endif // MXNET_USE_TENSORRT - *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, - aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, - grad_req_type_vec, shared_arg_name_set, &in_arg_vec, - &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? &shared_buffer_map : nullptr, - reinterpret_cast(shared_exec_handle)); -#if MXNET_USE_TENSORRT - } -#endif // MXNET_USE_TENSORRT + *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? 
&shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); // copy ndarray ptrs to ret->handles so that front end // can access them @@ -1091,14 +1036,9 @@ int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, auto s = new nnvm::Symbol(); API_BEGIN(); -#if MXNET_USE_TENSORRT - auto exec = static_cast(handle); + auto exec = static_cast(handle); *s = exec->GetOptimizedSymbol(); *out = s; -#else - LOG(FATAL) << "GetOptimizedSymbol may only be used when MXNet is compiled with " - "MXNET_USE_TENSORRT enabled. Please re-compile MXNet with TensorRT support."; -#endif // MXNET_USE_TENSORRT API_END_HANDLE_ERROR(delete s); } diff --git a/src/common/serialization.h b/src/common/serialization.h deleted file mode 100644 index c22d8bc82270..000000000000 --- a/src/common/serialization.h +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file serialization.h - * \brief Serialization of some STL and nnvm data-structures - * \author Clement Fuji Tsang - */ - -#ifndef MXNET_COMMON_SERIALIZATION_H_ -#define MXNET_COMMON_SERIALIZATION_H_ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace mxnet { -namespace common { - -template -inline size_t SerializedSize(const T &obj); - -template -inline size_t SerializedSize(const mxnet::Tuple &obj); - -template -inline size_t SerializedSize(const std::map &obj); - -template<> -inline size_t SerializedSize(const std::string &obj); - -template -inline size_t SerializedSize(const std::tuple &obj); - -template -inline void Serialize(const T &obj, char **buffer); - -template -inline void Serialize(const mxnet::Tuple &obj, char **buffer); - -template -inline void Serialize(const std::map &obj, char **buffer); - -template<> -inline void Serialize(const std::string &obj, char **buffer); - -template -inline void Serialize(const std::tuple &obj, char **buffer); - -template -inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos); - -template<> -inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos); - - -template -struct is_container { - static const bool value = !std::is_pod::value; -}; - -template -inline size_t SerializedSize(const T &obj) { - return sizeof(T); -} - -template -inline size_t SerializedSize(const mxnet::Tuple &obj) { - if (is_container::value) { - size_t sum_val = 4; - for (const 
auto& el : obj) { - sum_val += SerializedSize(el); - } - return sum_val; - } else { - return 4 + (obj.ndim() * sizeof(T)); - } -} - -template -inline size_t SerializedSize(const std::map &obj) { - size_t sum_val = 4; - if (is_container::value && is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.first) + SerializedSize(p.second); - } - } else if (is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.first); - } - sum_val += sizeof(V) * obj.size(); - } else if (is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.second); - } - sum_val += sizeof(K) * obj.size(); - } else { - sum_val += (sizeof(K) + sizeof(V)) * obj.size(); - } - return sum_val; -} - -template<> -inline size_t SerializedSize(const std::string &obj) { - return obj.size() + 4; -} - -template -struct serialized_size_tuple { - template - static inline size_t Compute(const std::tuple &obj) { - return SerializedSize(std::get(obj)) + serialized_size_tuple::Compute(obj); - } -}; - -template<> -struct serialized_size_tuple<0> { - template - static inline size_t Compute(const std::tuple &obj) { - return SerializedSize(std::get<0>(obj)); - } -}; - -template -inline size_t SerializedSize(const std::tuple &obj) { - return serialized_size_tuple::Compute(obj); -} - -// Serializer - -template -inline size_t SerializedContainerSize(const T &obj, char **buffer) { - uint32_t size = obj.size(); - std::memcpy(*buffer, &size, 4); - *buffer += 4; - return (size_t) size; -} - -template -inline void Serialize(const T &obj, char **buffer) { - std::memcpy(*buffer, &obj, sizeof(T)); - *buffer += sizeof(T); -} - -template -inline void Serialize(const mxnet::Tuple &obj, char **buffer) { - uint32_t size = obj.ndim(); - std::memcpy(*buffer, &size, 4); - *buffer += 4; - for (auto& el : obj) { - Serialize(el, buffer); - } -} - -template -inline void Serialize(const std::map &obj, char **buffer) { - SerializedContainerSize(obj, buffer); - for (auto& p : obj) { - Serialize(p.first, buffer); - Serialize(p.second, buffer); - } -} - -template<> -inline void Serialize(const std::string &obj, char **buffer) { - auto size = SerializedContainerSize(obj, buffer); - std::memcpy(*buffer, &obj[0], size); - *buffer += size; -} - -template -struct serialize_tuple { - template - static inline void Compute(const std::tuple &obj, char **buffer) { - serialize_tuple::Compute(obj, buffer); - Serialize(std::get(obj), buffer); - } -}; - -template<> -struct serialize_tuple<0> { - template - static inline void Compute(const std::tuple &obj, char **buffer) { - Serialize(std::get<0>(obj), buffer); - } -}; - -template -inline void Serialize(const std::tuple &obj, char **buffer) { - serialize_tuple::Compute(obj, buffer); -} - -// Deserializer - -template -inline size_t DeserializedContainerSize(T *obj, const std::string &buffer, size_t *curr_pos) { - uint32_t size = obj->size(); - std::memcpy(&size, &buffer[*curr_pos], 4); - *curr_pos += 4; - return (size_t) size; -} - -template -inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos) { - std::memcpy(obj, &buffer[*curr_pos], sizeof(T)); - *curr_pos += sizeof(T); -} - -template -inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos) { - uint32_t size = obj->ndim(); - std::memcpy(&size, &buffer[*curr_pos], 4); - *curr_pos += 4; - obj->SetDim(size); - for (size_t i = 0; i < size; ++i) { - Deserialize((*obj)[i], buffer, curr_pos); - } -} - -template -inline void Deserialize(std::map *obj, const 
std::string &buffer, size_t *curr_pos) { - auto size = DeserializedContainerSize(obj, buffer, curr_pos); - K first; - for (size_t i = 0; i < size; ++i) { - Deserialize(&first, buffer, curr_pos); - Deserialize(&(*obj)[first], buffer, curr_pos); - } -} - -template<> -inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos) { - auto size = DeserializedContainerSize(obj, buffer, curr_pos); - obj->resize(size); - std::memcpy(&(obj->front()), &buffer[*curr_pos], size); - *curr_pos += size; -} - -template -struct deserialize_tuple { - template - static inline void Compute(std::tuple *obj, - const std::string &buffer, size_t *curr_pos) { - deserialize_tuple::Compute(obj, buffer, curr_pos); - Deserialize(&std::get(*obj), buffer, curr_pos); - } -}; - -template<> -struct deserialize_tuple<0> { - template - static inline void Compute(std::tuple *obj, - const std::string &buffer, size_t *curr_pos) { - Deserialize(&std::get<0>(*obj), buffer, curr_pos); - } -}; - -template -inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos) { - deserialize_tuple::Compute(obj, buffer, curr_pos); -} - - -template -inline void Serialize(const T& obj, std::string* serialized_data) { - serialized_data->resize(SerializedSize(obj)); - char* curr_pos = &(serialized_data->front()); - Serialize(obj, &curr_pos); - CHECK_EQ((int64_t)curr_pos - (int64_t)&(serialized_data->front()), - serialized_data->size()); -} - -template -inline void Deserialize(T* obj, const std::string& serialized_data) { - size_t curr_pos = 0; - Deserialize(obj, serialized_data, &curr_pos); - CHECK_EQ(curr_pos, serialized_data.size()); -} - -} // namespace common -} // namespace mxnet -#endif // MXNET_COMMON_SERIALIZATION_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index dd4132301346..acf20de54beb 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -202,18 +202,6 @@ Graph InferStorageType(Graph&& graph, StorageTypeVector&& storage_type_inputs = StorageTypeVector(), const std::string& storage_type_attr_key = ""); -#if MXNET_USE_TENSORRT -/*! - * \brief Replace subgraphs by TRT (forward only) - */ -Graph ReplaceSubgraph(Graph&& g, - const std::unordered_set& set_subgraph, - std::unordered_map* const params_map); - -std::vector> GetTrtCompatibleSubsets(const Graph& g, - std::unordered_map* const params_map); -#endif - } // namespace exec } // namespace mxnet diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 4a4505581920..e726d29765ac 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -102,6 +102,16 @@ void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) os << "Total " << 11 << " TempSpace resource requested\n"; } +/*! + * \brief Return the "optimized" symbol contained in the executor graph. 
+ */ +nnvm::Symbol GraphExecutor::GetOptimizedSymbol() { + Symbol ret; + ret.outputs = std::vector(graph_.outputs.begin(), + graph_.outputs.begin() + num_forward_outputs_); + return ret.Copy(); +} + void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback, bool monitor_all) { CHECK(callback) << "invalid callback"; monitor_callback_ = callback; diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index b556a2bd0fe9..9a8660916357 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -68,6 +68,7 @@ class GraphExecutor : public Executor { const std::unordered_map& arg_grad_map() const override; const std::unordered_map& aux_state_map() const override; void Print(std::ostream &os) const override; // NOLINT(*) + nnvm::Symbol GetOptimizedSymbol(); void SetMonitorCallback(const MonitorCallback& callback, bool monitor_all = false) override; // Initialize the rest of attributes // after setting up arguments. diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc deleted file mode 100644 index f847d59a1298..000000000000 --- a/src/executor/tensorrt_pass.cc +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file tensorrt_pass.cc - * \brief Replace TRT compatible subgraphs by TRT engines - * \author Clement Fuji Tsang - */ - -#if MXNET_USE_TENSORRT - -#include -#include -#include -#include -#include -#include - -#include "../operator/contrib/nnvm_to_onnx-inl.h" -#include "./exec_pass.h" -#include "./onnx_to_tensorrt.h" - -namespace mxnet { -namespace exec { - -using NodePtr = nnvm::NodePtr; - -/*! 
- * \brief Custom graph class, which will contain bi-directional nodes - * we need to compute DFS and reverse DFS for graph partitioning - */ -class BidirectionalGraph { - public: - struct Node { - nnvm::Node* nnvmptr; - std::vector inputs; - std::vector outputs; - }; - std::vector nodes; - std::unordered_map nnvm2nid; - std::vector outputs; - static const std::unordered_set unconditionalTRTop; - - explicit BidirectionalGraph(const Graph &g) { - auto& idx = g.indexed_graph(); - auto num_nodes = idx.num_nodes(); - nodes.reserve(num_nodes); - nnvm2nid.reserve(num_nodes); - outputs.reserve(idx.outputs().size()); - DFSVisit(g.outputs, [this](const nnvm::NodePtr& n) { - BidirectionalGraph::Node new_node; - new_node.nnvmptr = n.get(); - nnvm2nid[n.get()] = static_cast(nodes.size()); - nodes.emplace_back(std::move(new_node)); - }); - for (const auto& it : nnvm2nid) { - nnvm::Node* nnvmnode = it.first; - uint32_t nid = it.second; - for (auto& n : nnvmnode->inputs) { - uint32_t input_nid = nnvm2nid[n.node.get()]; - nodes[input_nid].outputs.emplace_back(&nodes[nid]); - nodes[nid].inputs.emplace_back(&nodes[input_nid]); - } - } - for (auto& e : g.outputs) { - uint32_t nid = nnvm2nid[e.node.get()]; - outputs.emplace_back(&nodes[nid]); - } - } - - template - void DFS(const std::vector& heads, bool reverse, FVisit fvisit) { - std::unordered_set visited; - std::vector vec(heads.begin(), heads.end()); - visited.reserve(heads.size()); - while (!vec.empty()) { - Node* vertex = vec.back(); - vec.pop_back(); - if (visited.count(vertex) == 0) { - visited.insert(vertex); - fvisit(vertex); - std::vector nexts = reverse ? vertex->inputs : vertex->outputs; - for (Node* node : nexts) { - if (visited.count(node) == 0) { - vec.emplace_back(node); - } - } - } - } - } - - using t_pairset = std::pair, std::unordered_set>; - using t_pairvec = std::pair, std::vector>; - using t_uncomp_map = std::unordered_map>; - - std::unordered_set naive_grow_subgraph(Node* head, - std::unordered_set* set_unused, - t_uncomp_map* uncomp_map) { - std::unordered_set subgraph; - std::unordered_set uncomp_set; - std::deque stack; - stack.emplace_back(head); - while (!stack.empty()) { - Node* vertex = stack.back(); - stack.pop_back(); - if (set_unused->count(vertex) && !uncomp_set.count(vertex)) { - set_unused->erase(vertex); - subgraph.insert(vertex); - uncomp_set.insert((*uncomp_map)[vertex].begin(), (*uncomp_map)[vertex].end()); - for (Node* input : vertex->inputs) { - if (set_unused->count(input) && !uncomp_set.count(input)) { - stack.emplace_back(input); - } - } - for (Node* output : vertex->outputs) { - if (set_unused->count(output) && !uncomp_set.count(output)) { - stack.emplace_back(output); - } - } - } - } - return subgraph; - } - - std::vector> get_subsets( - std::unordered_map* const params_map) { - std::vector> subgraphs; - std::unordered_set set_nonTRTnodes; - std::unordered_set set_allnodes(nodes.size()); - std::vector separation_sets; - for (Node& node : nodes) { - if (!IsTRTCompatible(node.nnvmptr)) { - set_nonTRTnodes.insert(&node); - std::unordered_set in_graph; - std::unordered_set out_graph; - std::vector dummy_head; - dummy_head.emplace_back(&node); - DFS(dummy_head, false, [&out_graph](Node* node) { - out_graph.insert(node); - }); - DFS(dummy_head, true, [&in_graph](Node* node) { - in_graph.insert(node); - }); - separation_sets.emplace_back(std::make_pair(in_graph, out_graph)); - } - set_allnodes.emplace(&node); - } - t_uncomp_map uncomp_map; - std::unordered_set set_TRTnodes; - set_TRTnodes.insert(set_allnodes.begin(), 
set_allnodes.end()); - for (Node* n : set_nonTRTnodes) { - set_TRTnodes.erase(n); - } - for (Node* n : set_TRTnodes) { - for (t_pairset p : separation_sets) { - if (p.first.count(n)) { - uncomp_map[n].insert(p.second.begin(), p.second.end()); - } else if (p.second.count(n)) { - uncomp_map[n].insert(p.first.begin(), p.first.end()); - } - } - for (Node* nonTRTn : set_nonTRTnodes) { - uncomp_map[n].erase(nonTRTn); - } - } - std::unordered_set set_unused; - set_unused.reserve(set_TRTnodes.size()); - - for (auto& n : set_TRTnodes) { - if (n->nnvmptr->attrs.op != nullptr || params_map->count(n->nnvmptr->attrs.name)) { - set_unused.insert(n); - } - } - std::unordered_set visited; - std::deque stack(outputs.begin(), outputs.end()); - while (!stack.empty()) { - Node* vertex = stack.front(); - stack.pop_front(); - if (!visited.count(vertex)) { - visited.insert(vertex); - if (set_unused.count(vertex)) { - subgraphs.emplace_back(naive_grow_subgraph(vertex, &set_unused, &uncomp_map)); - } - for (Node* input : vertex->inputs) { - stack.emplace_back(input); - } - } - } - - return subgraphs; - } - - - private: - friend class Graph; - - bool IsTRTCompatible(nnvm::Node* nodeptr) { - if (nodeptr->op() == nullptr) { - return true; - } - - const std::string op_name = nodeptr->op()->name; - if (op_name == "Pooling") { - return (nodeptr->attrs.dict.at("pool_type") == "avg" || - nodeptr->attrs.dict.at("pool_type") == "max"); - } - - if (unconditionalTRTop.count(op_name)) { - return true; - } - - if (op_name == "Activation") { - return nodeptr->attrs.dict.at("act_type") == "relu" || - nodeptr->attrs.dict.at("act_type") == "tanh" || - nodeptr->attrs.dict.at("act_type") == "sigmoid"; - } - - return false; - } -}; // class BidirectionalGraph - -/*! - * \brief function which transform std::vector back to Attrs (dmlc::any) - */ -const std::unordered_set BidirectionalGraph::unconditionalTRTop = { - "Convolution", - "BatchNorm", - "elemwise_add", - "elemwise_sub", - "elemwise_mul", - "rsqrt", - "pad", - "Pad", - "mean", - "FullyConnected", - "Flatten", - "SoftmaxOutput", -}; - - -using NodeEntrySet = std::unordered_set; - -/*! - * \brief get the output nodes of the subgraph in the main graph - * \return a vector of the output nodes -*/ -std::vector GetSubgraphNodeEntries(Graph g, - std::unordered_set set_subgraph) { - std::vector outputs; - NodeEntrySet _outputs; - for (auto& e : g.outputs) { - if (set_subgraph.count(e.node.get())) { - _outputs.insert(e); - } - } - DFSVisit(g.outputs, [&set_subgraph, &_outputs](const nnvm::NodePtr &node){ - if (!set_subgraph.count(node.get())) { - for (auto& e : node->inputs) { - if (set_subgraph.count(e.node.get())) { - _outputs.insert(e); - } - } - } - }); - outputs.insert(outputs.begin(), _outputs.begin(), _outputs.end()); - return outputs; -} - - -/*! 
- * \brief get the nodes outside of the subgraph for which outputs are used in the subgraph - * \return a vector the nodes -*/ -std::vector GetSubgraphInterfaceNodes(Graph g, - std::unordered_set set_subgraph) { - std::vector inputs; - NodeEntrySet _inputs; - DFSVisit(g.outputs, [&set_subgraph, &_inputs](const nnvm::NodePtr &node){ - if (set_subgraph.count(node.get())) { - for (auto& e : node->inputs) { - if (!set_subgraph.count(e.node.get())) { - _inputs.insert(e); - } - } - } - }); - inputs.insert(inputs.begin(), _inputs.begin(), _inputs.end()); - return inputs; -} - -std::unordered_map GetGraphInputsMap(const Graph& g) { - std::unordered_map outputs; - auto& idx = g.indexed_graph(); - outputs.reserve(idx.num_nodes()); - std::vector input_nodes = idx.input_nodes(); - for (size_t i = 0; i < input_nodes.size(); ++i) { - outputs[input_nodes[i]] = static_cast(i); - } - return outputs; -} - -/*! - * \brief Dummy function which creates a fake TensorRT Node - */ -nnvm::NodePtr ConvertNnvmGraphToOnnx(const nnvm::Graph &g, - std::unordered_map* const params_map) { - auto p = nnvm::Node::Create(); - p->attrs.op = nnvm::Op::Get("_trt_op"); - op::ONNXParam onnx_param = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(g, params_map); - p->attrs.dict["serialized_output_map"] = onnx_param.serialized_output_map; - p->attrs.dict["serialized_input_map"] = onnx_param.serialized_input_map; - p->attrs.dict["serialized_onnx_graph"] = onnx_param.serialized_onnx_graph; - if (p->op()->attr_parser != nullptr) { - p->op()->attr_parser(&(p->attrs)); - } - return p; -} - -/*! - * \brief Update attributes of the graph (such as some inputs properties) - */ -Graph UpdateSubgraphAttrs(Graph&& subgraph, const Graph& g, - const std::unordered_map& old2new, - const nnvm::NodeEntryMap& main_input_entry_to_sub) { - const auto& idx = g.indexed_graph(); - const auto& sub_idx = subgraph.indexed_graph(); - - const auto& shape = g.GetAttr("shape"); - const auto& dtype = g.GetAttr("dtype"); - const auto& storage_type = g.GetAttr("storage_type"); - const auto& shape_inputs = g.GetAttr("shape_inputs"); - const auto& dtype_inputs = g.GetAttr("dtype_inputs"); - const auto& storage_type_inputs = g.GetAttr("storage_type_inputs"); - - mxnet::ShapeVector sub_shape(sub_idx.num_node_entries()); - nnvm::DTypeVector sub_dtype(sub_idx.num_node_entries()); - StorageTypeVector sub_storage_type(sub_idx.num_node_entries()); - mxnet::ShapeVector sub_shape_inputs(sub_idx.input_nodes().size()); - nnvm::DTypeVector sub_dtype_inputs(sub_idx.input_nodes().size()); - StorageTypeVector sub_storage_type_inputs(sub_idx.input_nodes().size()); - - const std::unordered_map inputsindex2pos = GetGraphInputsMap(g); - const std::unordered_map sub_inputsindex2pos = GetGraphInputsMap(subgraph); - // map attributes from graph to subgraph - for (auto& p : old2new) { - const uint32_t nid = idx.node_id(p.first); - const uint32_t sub_nid = sub_idx.node_id(p.second.get()); - const nnvm::Op* op = sub_idx[sub_nid].source->op(); - if (op == nullptr) { // if it's an input node, there is only one output node entry - const uint32_t sub_i = sub_idx.entry_id(sub_nid, 0); - const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); - const uint32_t i = idx.entry_id(nid, 0); - - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - sub_shape_inputs[sub_input_i] = shape_inputs[inputsindex2pos.at(nid)]; - sub_dtype_inputs[sub_input_i] = dtype_inputs[inputsindex2pos.at(nid)]; - sub_storage_type_inputs[sub_input_i] = 
storage_type_inputs[inputsindex2pos.at(nid)]; - - } else { - for (size_t oi = 0; oi < op->num_outputs; ++oi) { - const uint32_t sub_i = sub_idx.entry_id(sub_nid, oi); - const uint32_t i = idx.entry_id(nid, oi); - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - } - } - } - // old2new doesn't contain placeholder / interfaces - for (auto& p : main_input_entry_to_sub) { - nnvm::NodeEntry main_entry = p.first; - nnvm::NodeEntry sub_entry = p.second; - const uint32_t sub_nid = sub_idx.node_id(sub_entry.node.get()); - const uint32_t sub_i = sub_idx.entry_id(sub_entry); - const uint32_t i = idx.entry_id(main_entry); - const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - sub_shape_inputs[sub_input_i] = sub_shape[sub_i]; - sub_dtype_inputs[sub_input_i] = sub_dtype[sub_i]; - sub_storage_type_inputs[sub_input_i] = sub_storage_type[sub_i]; - } - subgraph.attrs["shape"] = - std::make_shared(std::move(sub_shape)); - subgraph.attrs["dtype"] = - std::make_shared(std::move(sub_dtype)); - subgraph.attrs["storage_type"] = - std::make_shared(std::move(sub_storage_type)); - subgraph.attrs["shape_inputs"] = - std::make_shared(std::move(sub_shape_inputs)); - subgraph.attrs["dtype_inputs"] = - std::make_shared(std::move(sub_dtype_inputs)); - subgraph.attrs["storage_type_inputs"] = - std::make_shared(std::move(sub_storage_type_inputs)); - - return subgraph; -} - -/*! - * \brief Generate a name for a new TRT node, avoid collision if some TRT_nodes are already defined - */ -const std::string GetNewTrtName(const Graph& g, const Graph& subgraph) { - const std::string name_prefix("TRT_node"); - std::unordered_set name_set; - DFSVisit(g.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) { - if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) { - name_set.insert(node->attrs.name); - } - }); - // name inside the subgraph will be avaible as they will be removed - DFSVisit(subgraph.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) { - if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) { - name_set.erase(node->attrs.name); - } - }); - uint32_t name_suffix = 0; - std::string full_name = name_prefix + std::to_string(name_suffix); - while (name_set.count(full_name)) { - full_name = name_prefix + std::to_string(++name_suffix); - } - return full_name; -} - -/*! - * \brief helper function to display what nodes are in a specific subset - */ -void dispNodesSet(Graph g, std::unordered_set s) { - DFSVisit(g.outputs, [&s](const nnvm::NodePtr n){ - if (s.count(n.get())) { - std::cout << " Y " << n->attrs.name << std::endl; - } else { - std::cout << " N " << n->attrs.name << std::endl; - } - }); -} - -/*! 
- * \brief Replace a set of nodes by a TensorRT node - */ -Graph ReplaceSubgraph(Graph&& g, - const std::unordered_set& set_subgraph, - std::unordered_map* const params_map) { - // Create MXNet subgraph - Graph subgraph; - - const auto sub_outputs_in_main = GetSubgraphNodeEntries(g, set_subgraph); - subgraph.outputs = sub_outputs_in_main; - // old2new will link raw pointer of the nodes in the graph to - // the corresponding shared_ptr of the nodes in the generated subgraph - std::unordered_map old2new; - std::deque stack; - std::unordered_set visited; - int32_t reservation = set_subgraph.size(); - old2new.reserve(reservation); - visited.reserve(reservation); - - // Create the shared_ptr using the same raw pointer don't really matter - for (auto& n : set_subgraph) { - old2new[n] = std::make_shared(*n); - } - - // To generate a subgraph an input have to be replace by data node (no op) - // and it have to be agnostic to the node from which it's an output - // (For exemple even if two inputs are two different outputs from the same node) - nnvm::NodeEntryMap main_input_entry_to_sub; - for (auto& e : GetSubgraphInterfaceNodes(g, set_subgraph)) { - auto node = nnvm::Node::Create(); - node->attrs.name = e.node->attrs.name + "_" + std::to_string(e.index); - auto new_e = nnvm::NodeEntry{node, 0, 0}; - main_input_entry_to_sub[e] = new_e; - } - - for (nnvm::NodeEntry& e : subgraph.outputs) { - e.node = old2new[e.node.get()]; - stack.emplace_back(e.node.get()); - } - // link all nodes in the subgraph to nodes in the subgraph instead of main graph - while (!stack.empty()) { - auto vertex = stack.front(); - stack.pop_front(); - if (!visited.count(vertex)) { - visited.insert(vertex); - for (auto& e : vertex->inputs) { - auto it = main_input_entry_to_sub.find(e); - if (it != main_input_entry_to_sub.end()) { - e = it->second; - } else { - e.node = old2new[e.node.get()]; - } - stack.emplace_back(e.node.get()); - } - } - } - // Remove the control dependencies of the subgraph to nodes that are not in the subgraph - DFSVisit(subgraph.outputs, [&set_subgraph, &old2new](const nnvm::NodePtr& node) { - std::remove_if(node->control_deps.begin(), - node->control_deps.end(), - [&set_subgraph](nnvm::NodePtr n_ptr) { - return !set_subgraph.count(n_ptr.get()); - }); - for (nnvm::NodePtr& n_ptr : node->control_deps) { - n_ptr = old2new[n_ptr.get()]; - } - }); - - subgraph = UpdateSubgraphAttrs(std::move(subgraph), g, old2new, main_input_entry_to_sub); - auto& sub_idx = subgraph.indexed_graph(); - - auto trtnodeptr = ConvertNnvmGraphToOnnx(subgraph, params_map); - trtnodeptr->attrs.name = GetNewTrtName(g, subgraph); - - // Insert new trt node and unplug replaced nodes - std::unordered_map sub_input_entryid_to_main; - for (auto& p : main_input_entry_to_sub) { - sub_input_entryid_to_main[sub_idx.entry_id(p.second)] = p.first; - } - - // Plug the nodes from the main graph as inputs of the trt node - trtnodeptr->inputs.resize(main_input_entry_to_sub.size()); - { - uint32_t counter = 0; - for (uint32_t i : sub_idx.input_nodes()) { - auto it = sub_input_entryid_to_main.find(sub_idx.entry_id(i, 0)); - if (it != sub_input_entryid_to_main.end()) { - trtnodeptr->inputs[counter++] = it->second; - } - } - } - nnvm::NodeEntryMap sub_outputs_in_main_to_pos; - for (uint32_t i = 0; i < sub_outputs_in_main.size(); ++i) { - sub_outputs_in_main_to_pos[sub_outputs_in_main[i]] = i; - } - // Plug the trt node as inputs to the main graph nodes - DFSVisit(g.outputs, [&sub_outputs_in_main_to_pos, &trtnodeptr](const nnvm::NodePtr& n) { - for (auto& 
e : n->inputs) { - auto it = sub_outputs_in_main_to_pos.find(e); - if (it != sub_outputs_in_main_to_pos.end()) { - e.index = it->second; - e.node = trtnodeptr; - } - } - }); - - for (auto& output : g.outputs) { - auto it = sub_outputs_in_main_to_pos.find(output); - if (it != sub_outputs_in_main_to_pos.end()) { - output.index = it->second; - output.node = trtnodeptr; - } - } - - Graph new_graph; - new_graph.outputs = g.outputs; - return new_graph; -} - -std::vector> GetTrtCompatibleSubsets(const Graph& g, - std::unordered_map* const params_map) { - BidirectionalGraph biG = BidirectionalGraph(g); - std::vector> subsets = biG.get_subsets(params_map); - std::vector> nnvm_subsets(subsets.size(), - std::unordered_set()); - for (size_t i = 0; i < subsets.size(); ++i) { - nnvm_subsets[i].reserve(subsets[i].size()); - for (auto& n : subsets[i]) { - nnvm_subsets[i].insert(n->nnvmptr); - } - } - return nnvm_subsets; -} - -} // namespace exec -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.cc b/src/executor/trt_graph_executor.cc deleted file mode 100644 index c923922d5184..000000000000 --- a/src/executor/trt_graph_executor.cc +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#if MXNET_USE_TENSORRT - -#include "trt_graph_executor.h" - -#include -#include -#include "./onnx_to_tensorrt.h" -#include "../operator/contrib/tensorrt-inl.h" -#include "../common/utils.h" -#include "../common/exec_utils.h" - - -namespace mxnet { -namespace exec { - -using namespace mxnet::common; - - /*! - * \brief TrtGraphExecutor initializer for simple bind flow in - * which only certain input shapes and dtypes are provided by users. - * The initializer uses these shapes and dtypes to perform - * shape and dtype inferences, and then create NDArrays - * to populate data entries of the graph. The created NDArrays - * for in_args, arg_grads and aux_states are passed to the - * front end to attach the created executor. - * In front end, if the simple_bind flow is trigger by - * _bind_ith_exec, the shared data arrays of DataParallelExecutorGroup - * and shared executor will be taken into account in creating - * NDArrays for in_args, arg_grads, and aux_states for reusing - * already allocated memory. - * - * This version of an executor exports the computation graph to TensorRT make use of fused - * kernels and other runtime enhancements. TRT will compile the sub-graphs to executable fused - * operators without intervention from the user. Operators in the original graph that are not - * supported by TRT will continue to be executed normally by MXNet. 
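// Illustration only (not part of the removed pass): the "supported by TRT" test
// mentioned in the comment above boiled down to a small allow-list plus a few
// per-operator attribute checks, as in IsTRTCompatible earlier in this diff.
// A condensed, self-contained sketch of that check, with a trimmed-down op set
// and an invented function name:
#include <string>
#include <unordered_map>
#include <unordered_set>

bool LooksTrtCompatible(const std::string& op,
                        const std::unordered_map<std::string, std::string>& attrs) {
  static const std::unordered_set<std::string> always_ok = {
      "Convolution", "BatchNorm", "FullyConnected", "elemwise_add"};
  if (always_ok.count(op)) return true;
  if (op == "Activation") {
    auto it = attrs.find("act_type");
    return it != attrs.end() &&
           (it->second == "relu" || it->second == "tanh" || it->second == "sigmoid");
  }
  return false;  // anything else keeps executing on the normal MXNet path
}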
- * - */ -void TrtGraphExecutor::Init(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set& shared_arg_names, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec, - std::unordered_map* shared_buffer, - Executor* shared_exec, - const nnvm::NodeEntryMap& feed_dict) { - symbol = symbol.Copy(); - nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, - *aux_state_ctxes, *grad_req_types); - - if (need_grad_) { - LOG(FATAL) << "You may be attempting to use TensorRT for training. TensorRT is an inference " - "only library. To re-enable legacy MXNet graph execution, which will support " - "training, set the MXNET_USE_TENSORRT environment variable to 0, or call " - "mx.contrib.tensorrt.set_use_tensorrt(False)"; - } - - if (shared_buffer == nullptr || shared_buffer->empty()) { - LOG(FATAL) << "MXNET_USE_TENSORRT = 1 but shared_buffer is empty. " - << "Please provide weights and other parameters, such as " - << "BatchNorm moments, via the shared_buffer, during simple bind call."; - } - - // The following code of shape and dtype inferences and argument - // initialization is for simple_bind only. Regular bind operation - // should do this differently. - - // Initialize arg_shapes and arg_dtypes for shape and type inferences. - // It contains all in_args and aux_states' shapes and types in a certain order. - const nnvm::IndexedGraph& idx = g.indexed_graph(); - mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); - nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); - StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const std::string& name = idx[nid].source->attrs.name; - auto it1 = arg_shape_map->find(name); - if (arg_shape_map->end() != it1) { - arg_shapes[i] = it1->second; - } - auto it2 = arg_dtype_map->find(name); - if (arg_dtype_map->end() != it2) { - arg_dtypes[i] = it2->second; - } - auto it3 = arg_stype_map->find(name); - if (arg_stype_map->end() != it3) { - arg_stypes[i] = it3->second; - } - } - g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); - if (g.GetAttr("shape_num_unknown_nodes") != 0U) { - HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("shape")); - } - - g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); - if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { - HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("dtype")); - } - - g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); - if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { - HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("storage_type")); - } - - auto trt_groups = GetTrtCompatibleSubsets(g, shared_buffer); - for (const auto &trt_group : trt_groups) { - if (trt_group.size() > 1) { - g = ReplaceSubgraph(std::move(g), trt_group, shared_buffer); - g = ReinitGraph(std::move(g), default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, - aux_state_ctxes, grad_req_types, arg_shape_map, arg_dtype_map, - arg_stype_map, shared_buffer); - } - } - - InitArguments(g.indexed_graph(), g.GetAttr("shape"), - 
g.GetAttr("dtype"), - g.GetAttr("storage_type"), - *in_arg_ctxes, *arg_grad_ctxes, *aux_state_ctxes, - *grad_req_types, shared_arg_names, shared_exec, - shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); - - // The above code of shape and dtype inferences and argument - // initialization is for simple_bind only. Regular bind operation - // should do this differently. - - // Initialize the rest attributes of the graph. - // This function can be called by regular bind - // operation flow as well. - FinishInitGraph(symbol, g, shared_exec, feed_dict); -} -/*! - * \brief Initialize in_args, arg_grads, and aux_states - * and their data_entry_ of the executor using - * shared_buffer from DataParallelExecutorGroup - * and shared_exec if available. - */ -void TrtGraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, - const mxnet::ShapeVector& inferred_shapes, - const nnvm::DTypeVector& inferred_dtypes, - const StorageTypeVector& inferred_stypes, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - const std::unordered_set& shared_arg_names, - const Executor* shared_exec, - std::unordered_map* shared_buffer, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec) { - // initialize in_args, arg_grads, and aux_states and populate grad_store_ - data_entry_.resize(idx.num_node_entries()); - size_t arg_top = 0, aux_top = 0; - const auto& mutable_nodes = idx.mutable_input_nodes(); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const uint32_t eid = idx.entry_id(nid, 0); - const mxnet::TShape& inferred_shape = inferred_shapes[eid]; - const int inferred_dtype = inferred_dtypes[eid]; - const auto inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; - const std::string& arg_name = idx[nid].source->attrs.name; - // aux_states - if (mutable_nodes.count(nid)) { - if (nullptr != shared_exec) { - const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); - CHECK(inferred_stype == kDefaultStorage && aux_nd.storage_type() == kDefaultStorage) - << "Non-default storage type detected when creating auxilliary NDArray. The allocated " - << "memory of shared_exec.aux_array cannot be resued for argument: " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_shape, aux_nd.shape()) - << "Inferred shape does not match shared_exec.aux_array's shape." - " Therefore, the allocated memory for shared_exec.aux_array cannot" - " be resued for creating auxilliary NDArray of the argument: " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_dtype, aux_nd.dtype()) - << "Inferred dtype does not match shared_exec.aux_array's dtype." 
- " Therefore, the allocated memory for shared_exec.aux_array cannot" - " be resued for creating auxilliary NDArray of the argument: " - << arg_name << " for the current executor"; - aux_state_vec->emplace_back(aux_nd); - } else { - auto it = shared_buffer->find(arg_name); - if (it != shared_buffer->end()) { - aux_state_vec->push_back(std::move(it->second.Copy(aux_state_ctxes[aux_top]))); - } else { - aux_state_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, - aux_state_ctxes[aux_top], inferred_dtype))); - } - } // if (has_shared_exec) - data_entry_[eid] = aux_state_vec->back(); - aux_state_map_.emplace(arg_name, aux_state_vec->back()); - ++aux_top; - } else { // in_args and grad for in_args - if (shared_arg_names.count(arg_name)) { // model parameter - // model parameter - if (nullptr != shared_exec) { - const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); - auto arg_nd_stype = in_arg_nd.storage_type(); - // for model parameter, both default storage and row_sparse storage can be shared - bool shareable_arg_stype = inferred_stype == kDefaultStorage || - inferred_stype == kRowSparseStorage; - // try to reuse memory from shared_exec - CHECK(shareable_arg_stype) << "Inferred storage type " - << common::stype_string(inferred_stype) - << " does not support memory sharing with shared_exec.arg_array"; - CHECK_EQ(inferred_stype, arg_nd_stype) - << "Inferred stype does not match shared_exec.arg_array's stype" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_shape, in_arg_nd.shape()) - << "Inferred shape does not match shared_exec.arg_array's shape" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_dtype, in_arg_nd.dtype()) - << "Inferred dtype does not match shared_exec.arg_array's dtype" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - in_arg_vec->emplace_back(in_arg_nd); - } else { - // doesn't have shared_exec, or non-default storage - EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], - inferred_dtype, in_arg_vec); - } - // gradient for model parameter - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { - auto grad_oid = grad_store_.size() + num_forward_outputs_; - auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); - auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; - if (nullptr != shared_exec && grad_stype == kDefaultStorage && - shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { - // try to reuse memory from shared_exec - arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); - } else { - // no need to reuse memory from shared_exec for gradient of non-default storage - EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], - inferred_dtype, arg_grad_vec); - } - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } - } else { // !shared_arg_names.count(arg_name) - // model parameter, row_sparse ndarray sharing enabled - auto it = shared_buffer->find(arg_name); - if (it != shared_buffer->end()) { - in_arg_vec->push_back(std::move(it->second.Copy(in_arg_ctxes[arg_top]))); - } else { - 
in_arg_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, - in_arg_ctxes[arg_top], inferred_dtype))); - } - // gradient for model parameter, row_sparse ndarray sharing disabled - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { - auto grad_oid = grad_store_.size() + num_forward_outputs_; - auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); - auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; - bool enable_row_sparse_sharing = false; - arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, - inferred_dtype, grad_stype, - arg_grad_ctxes[arg_top], shared_buffer, - enable_row_sparse_sharing)); - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } // if (shared_arg_names.count(arg_name)) - in_arg_map_.emplace(arg_name, in_arg_vec->back()); - if (!arg_grad_vec->back().is_none()) { - arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); - } - data_entry_[eid] = in_arg_vec->back(); - ++arg_top; - } - } -} - - - /*! - * \brief This function is triggered after each tensorrt subgraph replacement pass. - * Reset arguments of GraphExecutor::Init(...) as some variables (weights and biases) - * are absorbed into the TRT engine it also it reruns attributes inferences accordingly - * to the new topology. - */ -Graph TrtGraphExecutor::ReinitGraph(Graph&& g, const Context &default_ctx, - const std::map &ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::vector *grad_req_types, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::unordered_map *params_map) { - std::unordered_set to_remove_params; - for (auto& el : *params_map) { - to_remove_params.insert(el.first); - } - - DFSVisit(g.outputs, [&to_remove_params](const nnvm::NodePtr n) { - to_remove_params.erase(n->attrs.name); - }); - - for (auto& el : to_remove_params) { - params_map->erase(el); - arg_shape_map->erase(el); - arg_dtype_map->erase(el); - arg_stype_map->erase(el); - } - const auto &idx = g.indexed_graph(); - num_forward_inputs_ = idx.input_nodes().size(); - in_arg_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - arg_grad_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - grad_req_types->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - aux_state_ctxes->resize(idx.mutable_input_nodes().size()); - - // create "device" and "context" attrs for the graph - g = AssignContext(g, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, - *aux_state_ctxes, *grad_req_types, num_forward_inputs_, - num_forward_outputs_); - - // get number of nodes used in forward pass - num_forward_nodes_ = 0; - for (size_t i = 0; i < num_forward_outputs_; ++i) { - num_forward_nodes_ = std::max( - num_forward_nodes_, static_cast(idx.outputs()[i].node_id + 1)); - } - mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); - nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); - StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const std::string &name = idx[nid].source->attrs.name; - auto it1 = arg_shape_map->find(name); - if (arg_shape_map->end() != it1) { - arg_shapes[i] = it1->second; - } - auto it2 = arg_dtype_map->find(name); - if (arg_dtype_map->end() != it2) { - 
arg_dtypes[i] = it2->second; - } - auto it3 = arg_stype_map->find(name); - if (arg_stype_map->end() != it3) { - arg_stypes[i] = it3->second; - } - } - g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); - if (g.GetAttr("shape_num_unknown_nodes") != 0U) { - HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("shape")); - } - - g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); - if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { - HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("dtype")); - } - - g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); - - if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { - HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("storage_type")); - } - - return g; -} - - -/*! - * \brief Return the "optimized" symbol contained in the graph. - * For optimization pass such as TensorRT pass - */ -nnvm::Symbol TrtGraphExecutor::GetOptimizedSymbol() { - Symbol ret; - ret.outputs = std::vector(graph_.outputs.begin(), - graph_.outputs.begin() + num_forward_outputs_); - return ret.Copy(); -} - -Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol, - const Context &default_ctx, - const std::map &group2ctx, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map - *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set ¶m_names, - std::vector *in_args, - std::vector *arg_grads, - std::vector *aux_states, - std::unordered_map *shared_buffer, - Executor *shared_exec) { - auto exec = new exec::TrtGraphExecutor(); - exec->Init(std::move(symbol), default_ctx, group2ctx, - in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, - arg_shape_map, arg_dtype_map, arg_stype_map, - grad_req_types, param_names, - in_args, arg_grads, aux_states, - shared_buffer, shared_exec); - return exec; -} - -} // namespace exec - -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.h b/src/executor/trt_graph_executor.h deleted file mode 100644 index a4ec5bf657ae..000000000000 --- a/src/executor/trt_graph_executor.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#ifndef MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ -#define MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ - -#if MXNET_USE_TENSORRT - -#include -#include -#include - -#include "./graph_executor.h" - -namespace mxnet { - -namespace exec { - -class TrtGraphExecutor : public GraphExecutor { - public: - static Executor* TensorRTBind(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& group2ctx, - std::vector *in_arg_ctxes, - std::vector* arg_grad_ctxes, - std::vector* aux_state_ctxes, - std::unordered_map* arg_shape_map, - std::unordered_map* arg_dtype_map, - std::unordered_map* arg_stype_map, - std::vector* grad_req_types, - const std::unordered_set& param_names, - std::vector* in_args, - std::vector* arg_grads, - std::vector* aux_states, - std::unordered_map* - shared_data_arrays = nullptr, - Executor* shared_exec = nullptr); - - virtual void Init(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set& shared_arg_names, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec, - std::unordered_map* shared_buffer = nullptr, - Executor* shared_exec = nullptr, - const nnvm::NodeEntryMap& feed_dict - = nnvm::NodeEntryMap()); - - // Returns symbol representing the TRT optimized graph for comparison purposes. - nnvm::Symbol GetOptimizedSymbol(); - - protected: - Graph ReinitGraph(Graph&& g, const Context &default_ctx, - const std::map &ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::vector *grad_req_types, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::unordered_map *params_map); - - void InitArguments(const nnvm::IndexedGraph& idx, - const mxnet::ShapeVector& inferred_shapes, - const nnvm::DTypeVector& inferred_dtypes, - const StorageTypeVector& inferred_stypes, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - const std::unordered_set& shared_arg_names, - const Executor* shared_exec, - std::unordered_map* shared_buffer, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec) override; -}; - -} // namespace exec - -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT - -#endif // MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ diff --git a/src/initialize.cc b/src/initialize.cc index 00a736abd8ba..7236ced52e93 100644 --- a/src/initialize.cc +++ b/src/initialize.cc @@ -46,8 +46,8 @@ class LibraryInitializer { dmlc::InitLogging("mxnet"); #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE struct sigaction sa; - sigaction(SIGSEGV, NULL, &sa); - if (sa.sa_handler == NULL) { + sigaction(SIGSEGV, nullptr, &sa); + if (sa.sa_handler == nullptr) { signal(SIGSEGV, SegfaultLogger); } #endif diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h index 4bbcb9d21f9a..4d4b37306d8d 100644 --- a/src/io/image_iter_common.h +++ b/src/io/image_iter_common.h @@ -346,8 +346,13 @@ struct ImageDetNormalizeParam : public dmlc::Parameter // Define prefetcher parameters struct PrefetcherParam : public dmlc::Parameter { + enum CtxType { kGPU = 0, kCPU}; /*! \brief number of prefetched batches */ size_t prefetch_buffer; + + /*! 
\brief Context data loader optimized for */ + int ctx; + /*! \brief data type */ dmlc::optional dtype; @@ -355,6 +360,10 @@ struct PrefetcherParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(PrefetcherParam) { DMLC_DECLARE_FIELD(prefetch_buffer).set_default(4) .describe("Maximum number of batches to prefetch."); + DMLC_DECLARE_FIELD(ctx).set_default(kGPU) + .add_enum("cpu", kCPU) + .add_enum("gpu", kGPU) + .describe("Context data loader optimized for."); DMLC_DECLARE_FIELD(dtype) .add_enum("float32", mshadow::kFloat32) .add_enum("float64", mshadow::kFloat64) @@ -362,6 +371,7 @@ struct PrefetcherParam : public dmlc::Parameter { .add_enum("int64", mshadow::kInt64) .add_enum("int32", mshadow::kInt32) .add_enum("uint8", mshadow::kUint8) + .add_enum("int8", mshadow::kInt8) .set_default(dmlc::optional()) .describe("Output data type. ``None`` means no change."); } diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index 0834dd7786ee..5d9e81d3f6b4 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc @@ -44,6 +44,7 @@ #include "../common/utils.h" namespace mxnet { + namespace io { // parser to parse image recordio template @@ -87,7 +88,7 @@ class ImageRecordIOParser2 { ImageRecordParam record_param_; BatchParam batch_param_; ImageNormalizeParam normalize_param_; - PrefetcherParam prefetch_param_; + #if MXNET_USE_OPENCV /*! \brief augmenters */ std::vector > > augmenters_; @@ -133,7 +134,6 @@ inline void ImageRecordIOParser2::Init( record_param_.InitAllowUnknown(kwargs); batch_param_.InitAllowUnknown(kwargs); normalize_param_.InitAllowUnknown(kwargs); - prefetch_param_.InitAllowUnknown(kwargs); n_parsed_ = 0; overflow = false; rnd_.seed(kRandMagic + record_param_.seed); @@ -141,7 +141,7 @@ inline void ImageRecordIOParser2::Init( #pragma omp parallel { // be conservative, set number of real cores - maxthread = std::max(omp_get_num_procs() / 2 - 1, 1); + maxthread = std::max(omp_get_num_procs(), 1); } param_.preprocess_threads = std::min(maxthread, param_.preprocess_threads); #pragma omp parallel num_threads(param_.preprocess_threads) @@ -763,6 +763,113 @@ class ImageRecordIter2 : public IIterator { ImageRecordIOParser2 parser_; }; +template +class ImageRecordIter2CPU : public IIterator { + public: + ImageRecordIter2CPU() { + out_ = new DataBatch(); + var_ = Engine::Get()->NewVariable(); + } + + virtual ~ImageRecordIter2CPU(void) { + Engine::Get()->DeleteVariable([](mxnet::RunContext ctx) {}, Context::CPU(), var_); + delete out_; + } + + virtual void Init(const std::vector>& kwargs) { + parser_.Init(kwargs); + } + + virtual void BeforeFirst(void) { parser_.BeforeFirst(); } + + // From iter_prefetcher.h + virtual bool Next(void) { + bool result = false; + const auto engine = Engine::Get(); + engine->PushSync( + [this, &result](RunContext ctx) { + result = this->parser_.ParseNext(out_); + }, + Context::CPU(), {}, {var_}, FnProperty::kNormal, 0, "DataLoader"); + engine->WaitForVar(var_); + return result; + } + + virtual const DataBatch& Value(void) const { return *out_; } + + private: + /*! \brief Backend thread */ + dmlc::ThreadedIter iter_; + /*! \brief output data */ + DataBatch* out_; + Engine::VarHandle var_; + /*! 
\brief queue to be recycled */ + std::queue recycle_queue_; + /* \brief parser */ + ImageRecordIOParser2 parser_; +}; + +class ImageRecordIter2Wrapper : public IIterator { + public: + ~ImageRecordIter2Wrapper(void) override { + if (record_iter_) delete record_iter_; + } + void Init(const std::vector>& kwargs) override { + PrefetcherParam prefetch_param; + prefetch_param.InitAllowUnknown(kwargs); + int dtype = mshadow::kFloat32; + if (prefetch_param.dtype.has_value()) { + dtype = prefetch_param.dtype.value(); + } + if (prefetch_param.ctx == PrefetcherParam::CtxType::kCPU) { + LOG(INFO) << "Create ImageRecordIter2 optimized for CPU backend."; + switch (dtype) { + case mshadow::kFloat32: + record_iter_ = new ImageRecordIter2CPU(); + break; + case mshadow::kUint8: + record_iter_ = new ImageRecordIter2CPU(); + break; + case mshadow::kInt8: + record_iter_ = new ImageRecordIter2CPU(); + break; + default: + LOG(FATAL) << "unknown dtype for ImageRecordIter2."; + } + } else { + // For gpu + switch (dtype) { + case mshadow::kFloat32: + record_iter_ = new ImageRecordIter2(); + break; + case mshadow::kUint8: + record_iter_ = new ImageRecordIter2(); + break; + case mshadow::kInt8: + record_iter_ = new ImageRecordIter2(); + break; + default: + LOG(FATAL) << "unknown dtype for ImageRecordIter2."; + } + } + record_iter_->Init(kwargs); + } + + void BeforeFirst(void) override { + record_iter_->BeforeFirst(); + } + + // From iter_prefetcher.h + bool Next(void) override { return record_iter_->Next(); } + + const DataBatch &Value(void) const override { + return record_iter_->Value(); + } + + private: + IIterator* record_iter_ = nullptr; +}; + MXNET_REGISTER_IO_ITER(ImageRecordIter) .describe(R"code(Iterates on image RecordIO files @@ -795,12 +902,14 @@ Example:: .add_arguments(ListDefaultAugParams()) .add_arguments(ImageNormalizeParam::__FIELDS__()) .set_body([]() { - return new ImageRecordIter2(); + return new ImageRecordIter2Wrapper(); }); MXNET_REGISTER_IO_ITER(ImageRecordUInt8Iter) .describe(R"code(Iterating on image RecordIO files +.. note:: ImageRecordUInt8Iter is deprecated. Use ImageRecordIter(dtype='uint8') instead. + This iterator is identical to ``ImageRecordIter`` except for using ``uint8`` as the data type instead of ``float``. @@ -817,6 +926,8 @@ the data type instead of ``float``. MXNET_REGISTER_IO_ITER(ImageRecordInt8Iter) .describe(R"code(Iterating on image RecordIO files +.. note:: ``ImageRecordInt8Iter`` is deprecated. Use ImageRecordIter(dtype='int8') instead. + This iterator is identical to ``ImageRecordIter`` except for using ``int8`` as the data type instead of ``float``. 
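The wrapper registered above picks a concrete iterator from the new ``ctx`` option and the requested ``dtype``. A stripped-down sketch of that dispatch pattern follows; ``Iter``, ``CpuIter``, ``GpuIter`` and ``MakeIter`` are made-up names for illustration, not the real MXNet classes or parameter plumbing.

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>

struct Iter { virtual ~Iter() = default; };
template <typename DType> struct CpuIter : Iter {};   // CPU-optimized backend
template <typename DType> struct GpuIter : Iter {};   // default (GPU-oriented) backend

// Mirrors the ctx/dtype switch in ImageRecordIter2Wrapper::Init.
std::unique_ptr<Iter> MakeIter(const std::string& ctx, const std::string& dtype) {
  const bool cpu = (ctx == "cpu");
  if (dtype == "float32") return cpu ? std::unique_ptr<Iter>(new CpuIter<float>())
                                     : std::unique_ptr<Iter>(new GpuIter<float>());
  if (dtype == "uint8")   return cpu ? std::unique_ptr<Iter>(new CpuIter<uint8_t>())
                                     : std::unique_ptr<Iter>(new GpuIter<uint8_t>());
  if (dtype == "int8")    return cpu ? std::unique_ptr<Iter>(new CpuIter<int8_t>())
                                     : std::unique_ptr<Iter>(new GpuIter<int8_t>());
  throw std::invalid_argument("unknown dtype for the record iterator");
}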
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index eddfbcff9ce8..0bfca8c10a1a 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -339,8 +339,8 @@ NDArray NDArray::data_ndarray() const { } struct NDArrayDLManager { - NDArray handle; // ref NDArray - DLManagedTensor tensor; + NDArray handle; // ref NDArray + DLManagedTensor tensor; }; DLManagedTensor* NDArray::ToDLPack() const { @@ -356,13 +356,13 @@ DLManagedTensor* NDArray::ToDLPack() const { } NDArray NDArray::FromDLPack(const DLManagedTensor* tensor) { - const DLTensor &dl_tensor = tensor->dl_tensor; - auto deleter = [tensor](){ - if (tensor->deleter != nullptr) { - tensor->deleter(const_cast(tensor)); + DLManagedTensor tensor_copy = *tensor; + auto deleter = [tensor_copy](){ + if (tensor_copy.deleter != nullptr) { + tensor_copy.deleter(const_cast(&tensor_copy)); } }; - return NDArray(TBlob(dl_tensor), dl_tensor.ctx.device_id, deleter); + return NDArray(TBlob(tensor_copy.dl_tensor), tensor_copy.dl_tensor.ctx.device_id, deleter); } bool NDArray::fresh_out_grad() const { diff --git a/src/operator/contrib/tensorrt-inl.h b/src/operator/contrib/tensorrt-inl.h deleted file mode 100644 index 062d22e35795..000000000000 --- a/src/operator/contrib/tensorrt-inl.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ -#define MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file tensorrt-inl.h - * \brief TensorRT Operator - * \author Marek Kolodziej, Clement Fuji Tsang -*/ - -#if MXNET_USE_TENSORRT - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nnvm_to_onnx-inl.h" -#include "../operator_common.h" -#include "../../common/utils.h" -#include "../../common/serialization.h" -#include "../../executor/exec_pass.h" -#include "../../executor/graph_executor.h" -#include "../../executor/onnx_to_tensorrt.h" - -namespace mxnet { -namespace op { - -using namespace nnvm; -using int64 = ::google::protobuf::int64; - - -using trt_name_to_idx = std::map; - - -struct TRTEngineParam { - nvinfer1::IExecutionContext* trt_executor; - std::vector > binding_map; -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT - -#endif // MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ diff --git a/src/operator/contrib/tensorrt.cc b/src/operator/contrib/tensorrt.cc deleted file mode 100644 index 5b3df70fd825..000000000000 --- a/src/operator/contrib/tensorrt.cc +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
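Stepping back to the NDArray::FromDLPack hunk earlier in this diff: the deleter now captures the DLManagedTensor by value, so it no longer depends on the caller keeping its pointer alive. A self-contained sketch of that capture-by-value pattern; the ``Managed`` struct and ``MakeDeleter`` are invented for illustration only.

#include <functional>

struct Managed {
  void (*deleter)(Managed* self);   // stand-in for DLManagedTensor::deleter
};

// The returned callable owns its own copy of the struct, so it stays valid
// even if the borrowed pointer does not outlive it.
std::function<void()> MakeDeleter(const Managed* borrowed) {
  Managed copy = *borrowed;                 // capture by value, as in FromDLPack
  return [copy]() mutable {
    if (copy.deleter != nullptr) {
      copy.deleter(&copy);
    }
  };
}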
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file trt.cc - * \brief TensorRT operation registration - * \author Marek Kolodziej, Clement Fuji Tsang -*/ - -#if MXNET_USE_TENSORRT - -#include "./tensorrt-inl.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../../common/serialization.h" -#include "../../common/utils.h" - -namespace mxnet { -namespace op { - -OpStatePtr GetPtrMapping(nvinfer1::ICudaEngine* trt_engine, - nnvm_to_onnx::NameToIdx_t input_map, - nnvm_to_onnx::NameToIdx_t output_map) { - TRTEngineParam param; - for (int b = 0; b < trt_engine->getNbBindings(); ++b) { - const std::string& binding_name = trt_engine->getBindingName(b); - if (trt_engine->bindingIsInput(b)) { - param.binding_map.emplace_back(input_map[binding_name], - nnvm_to_onnx::TypeIO::Inputs); - } else { - param.binding_map.emplace_back(output_map[binding_name], - nnvm_to_onnx::TypeIO::Outputs); - } - } - param.trt_executor = trt_engine->createExecutionContext(); - return OpStatePtr::Create(param); -} - -OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/, - const mxnet::ShapeVector& /*ishape*/, - const std::vector& /*itype*/) { - const auto& node_param = nnvm::get(attrs.parsed); - - ::onnx::ModelProto model_proto; - bool success = model_proto.ParseFromString(node_param.serialized_onnx_graph); - if (!success) { - LOG(FATAL) << "Problems parsing serialized ONNX model."; - } - auto graph = model_proto.graph(); - auto first_input_type = graph.input(0).type().tensor_type(); - auto dim_value = first_input_type.shape().dim(0).dim_value(); - auto batch_size = static_cast(dim_value); - // Need to set up max workspace size based on device properties - nvinfer1::ICudaEngine* const trt_engine = ::onnx_to_tensorrt::onnxToTrtCtx( - node_param.serialized_onnx_graph, batch_size, 1 << 30); - - nnvm_to_onnx::NameToIdx_t output_map; - for (auto& el : node_param.output_map) { - output_map[el.first] = std::get<0>(el.second); - } - return GetPtrMapping(trt_engine, node_param.input_map, output_map); -} - -void TRTParamParser(nnvm::NodeAttrs* attrs) { - ONNXParam param_; - - try { - param_.Init(attrs->dict); - common::Deserialize(¶m_.input_map, param_.serialized_input_map); - common::Deserialize(¶m_.output_map, param_.serialized_output_map); - param_.onnx_pb_graph.ParseFromString(param_.serialized_onnx_graph); - } catch (const dmlc::ParamError& e) { - std::ostringstream os; - os << e.what(); - os << ", in operator " << attrs->op->name << "(" - << "name=\"" << attrs->name << "\""; - for (const auto& k : attrs->dict) { - os << ", " << k.first << "=\"" << k.second << "\""; - } - os << ")"; - throw dmlc::ParamError(os.str()); - } - - attrs->parsed = std::move(param_); -} - -inline bool TRTInferShape(const NodeAttrs& attrs, mxnet::ShapeVector* /*in_shape*/, - 
mxnet::ShapeVector* out_shape) { - const auto &node_param = nnvm::get(attrs.parsed); - for (auto& el : node_param.output_map) { - (*out_shape)[std::get<0>(el.second)] = std::get<1>(el.second); - } - return true; -} - -inline bool TRTInferStorageType(const NodeAttrs& /*attrs*/, const int /*dev_mask*/, - DispatchMode* dispatch_mode, - std::vector* /*in_storage_type*/, - std::vector* out_storage_type) { - return storage_type_assign(out_storage_type, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); -} - -inline bool TRTInferType(const NodeAttrs& attrs, std::vector* /*in_dtype*/, - std::vector* out_dtype) { - const auto& node_param = nnvm::get(attrs.parsed); - for (auto& el : node_param.output_map) { - (*out_dtype)[std::get<0>(el.second)] = std::get<3>(el.second); - } - return true; -} - -inline std::vector TRTListInputNames(const NodeAttrs& attrs) { - std::vector output; - const auto& node_param = nnvm::get(attrs.parsed); - output.resize(node_param.input_map.size()); - for (auto& el : node_param.input_map) { - output[el.second] = el.first; - } - return output; -} - -inline std::vector TRTListOutputNames(const NodeAttrs& attrs) { - std::vector output; - const auto& node_param = nnvm::get(attrs.parsed); - output.resize(node_param.output_map.size()); - for (auto& el : node_param.output_map) { - output[std::get<0>(el.second)] = el.first; - } - return output; -} - -NNVM_REGISTER_OP(_trt_op) - .describe(R"code(TRT operation (one engine) -)code" ADD_FILELINE) - .set_num_inputs([](const NodeAttrs& attrs) { - const auto& node_param = nnvm::get(attrs.parsed); - return node_param.input_map.size(); - }) - .set_num_outputs([](const NodeAttrs& attrs) { - const auto& node_param = nnvm::get(attrs.parsed); - return node_param.output_map.size(); - }) - .set_attr_parser(TRTParamParser) - .set_attr("FInferShape", TRTInferShape) - .set_attr("FInferType", TRTInferType) - .set_attr("FListInputNames", TRTListInputNames) - .set_attr("FListOutputNames", TRTListOutputNames) - .set_attr("FCreateOpState", TRTCreateState) - .set_attr("FInferStorageType", TRTInferStorageType); - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index e80358ac636a..5dccba281fd0 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -392,6 +392,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_gOhwi8o: case mkldnn_gOhwi16o: case mkldnn_gOhIw16o4i: + case mkldnn_Goihw16g_s8s8: return mkldnn_goihw; default: LOG(FATAL) << "Unknown MKLDNN format for 5 dimensions: " << desc.data.format; diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h index 2da415877b8b..bd1b47e4c2de 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h @@ -108,7 +108,7 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector() = data_min; diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc index b8c47c3af11b..55028d8c8ccc 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc @@ -72,7 +72,7 @@ static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs, MKLDNNStream::Get()->Submit(); Stream *s = ctx.get_stream(); 
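// The one-element Kernel launch below writes the float range of the int32 convolution output
// (out_data[1] = min, out_data[2] = max) from the calibration ranges of the data and the weight
// passed next in the argument list. With the QuantizationRangeForS8S8MultiplicationStruct used
// here (defined in quantization_utils.h later in this patch), the range works out to
//   max_out = (range_data / 127) * (range_weight / 127) * 2147483647,   min_out = -max_out,
// so, as a rough illustrative example, data calibrated to [-1, 1] and weights calibrated to
// [-0.5, 0.5] give a max_out of roughly 6.66e4.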
const size_t num_inputs = param.no_bias ? 2 : 3; - mxnet_op::Kernel::Launch(s, 1, + mxnet_op::Kernel::Launch(s, 1, out_data[1].data().dptr(), out_data[2].data().dptr(), in_data[num_inputs].data().dptr(), in_data[num_inputs+1].data().dptr(), diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc new file mode 100644 index 000000000000..05da99207651 --- /dev/null +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file mkldnn_quantized_elemwise_add.cc + * \brief + */ + +#if MXNET_USE_MKLDNN == 1 +#include "../quantized_elemwise_add-inl.h" +#include "../../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../quantization_utils.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(QuantizeElemwiseAddParam); + +static inline float GetScale(const NDArray& data, float min, float max) { + auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range; + return data_range / MaxAbs(min, max); +} + +static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + const QuantizeElemwiseAddParam& params = nnvm::get(attrs.parsed); + // A, B, A_min, A_max, B_min, B_max + CHECK_EQ(in_data.size(), 6U) << "should be A, B, A_min, A_max, B_min, B_max"; + // C, C_min, C_max + CHECK_EQ(out_data.size(), 3U) << "should be C, C_min, C_max"; + // Collect data min,max,absmax + const float dataA_min = in_data[quantized_elemwise_add_enum::kAMin].data().dptr()[0]; + const float dataB_min = in_data[quantized_elemwise_add_enum::kBMin].data().dptr()[0]; + const float dataA_max = in_data[quantized_elemwise_add_enum::kAMax].data().dptr()[0]; + const float dataB_max = in_data[quantized_elemwise_add_enum::kBMax].data().dptr()[0]; + const float dataA_absmax = MaxAbs(dataA_min, dataA_max); + const float dataB_absmax = MaxAbs(dataB_min, dataB_max); + + auto dataA_mem = in_data[quantized_elemwise_add_enum::kDataA].GetMKLDNNData(); + auto dataB_mem = in_data[quantized_elemwise_add_enum::kDataB].GetMKLDNNData(); + const bool is_dataA_int8 = (in_data[quantized_elemwise_add_enum::kDataA].dtype() + == mshadow::kInt8); + const size_t dataA_range = is_dataA_int8 ? 
kInt8Range : kUint8Range; + + const float A_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataA], + dataA_min, + dataA_max); + const float B_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataB], + dataB_min, + dataB_max); + // rescaled_mem is for reorder mkldnn memory + mkldnn::memory *rescaled_mem; + + // output default set as int32 + size_t output_data_range = kInt32Range; + auto output_data_type = mkldnn::memory::s32; + // dataA && dataB are uint8 + if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kInt8) { + output_data_range = kInt8Range; + output_data_type = mkldnn::memory::s8; + } else if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kUint8) { + output_data_range = kUint8Range; + output_data_type = mkldnn::memory::u8; + } else { + output_data_range = kInt32Range; + output_data_type = mkldnn::memory::s32; + } + + float output_min = 0; + float output_max = 0; + float out_data_scale = 0; + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + output_min = params.min_calib_range.value(); + output_max = params.max_calib_range.value(); + out_data_scale = output_data_range / MaxAbs(output_min, output_max); + } else { + output_max = dataA_absmax + dataB_absmax; + output_min = -output_max; + } + // 2: scale 0 for dataA, scale 1 for data B + const int scales_num = 2; + std::vector scales(scales_num, 1); + if (in_data[quantized_elemwise_add_enum::kDataA].dtype() + != in_data[quantized_elemwise_add_enum::kDataB].dtype()) { + auto s8_pd = (is_dataA_int8 == true) + ? dataA_mem->get_primitive_desc() + : dataB_mem->get_primitive_desc(); + rescaled_mem = TmpMemMgr::Get()->Alloc(s8_pd); + float u8_reorder_scale = 0; + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + if (is_dataA_int8 == true) { + u8_reorder_scale = out_data_scale / B_scale; + scales[0] = out_data_scale / A_scale; + } else { + u8_reorder_scale = out_data_scale / A_scale; + scales[1] = out_data_scale / B_scale; + } + } else { + // x*dataA_absmax/dataA_range = y*(dataA_absmax+dataB_absmax)/output_range + if (is_dataA_int8 == true) { + u8_reorder_scale = dataB_absmax * output_data_range + / ((dataA_absmax + dataB_absmax) * kUint8Range); + scales[0] = dataA_absmax * output_data_range + / ((dataA_absmax + dataB_absmax) * dataA_range); + } else { + u8_reorder_scale = dataA_absmax * output_data_range + / ((dataA_absmax + dataB_absmax) * dataA_range); + scales[1] = dataB_absmax * output_data_range + / ((dataA_absmax + dataB_absmax) * kInt8Range); + } + } + std::vector reorder_scale = {u8_reorder_scale}; + primitive_attr reorder_attr; + reorder_attr.set_int_output_round_mode(round_mode::round_nearest); + reorder_attr.set_output_scales(0, reorder_scale); + auto u8_mem = (is_dataA_int8 == true) ? 
dataB_mem : dataA_mem; + const auto reorder_pd = mkldnn::reorder::primitive_desc(u8_mem->get_primitive_desc(), + s8_pd, + reorder_attr); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *u8_mem, *rescaled_mem)); + + if (is_dataA_int8 == true) { + dataB_mem = rescaled_mem; + } else { + dataA_mem = rescaled_mem; + } + } else { + // same data type and has same data range + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + scales[0] = out_data_scale / A_scale; + scales[1] = out_data_scale / B_scale; + } else { + scales[0] = dataA_absmax * output_data_range / ((dataA_absmax + dataB_absmax) * dataA_range); + scales[1] = dataB_absmax * output_data_range / ((dataA_absmax + dataB_absmax) * dataA_range); + } + } + + std::vector in_prims; + std::vector in_pds; + in_prims.push_back(*dataA_mem); + in_prims.push_back(*dataB_mem); + in_pds.push_back(dataA_mem->get_primitive_desc()); + in_pds.push_back(dataB_mem->get_primitive_desc()); + size_t i_ndim = in_data[quantized_elemwise_add_enum::kDataA].shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_data[quantized_elemwise_add_enum::kDataA].shape()[i]); + } + mkldnn::memory::format i_fmt = static_cast( + in_pds[quantized_elemwise_add_enum::kDataA].desc().data.format); + auto output_desc = mkldnn::memory::desc(i_dims, output_data_type, i_fmt); + mkldnn::sum::primitive_desc pdesc(output_desc, scales, in_pds); + auto mem = CreateMKLDNNMem(out_data[quantized_elemwise_add_enum::kOut], + pdesc.dst_primitive_desc(), + req[0], + &in_data[0]); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second)); + CommitOutput(out_data[quantized_elemwise_add_enum::kOut], mem); + stream->Submit(); + + out_data[quantized_elemwise_add_enum::kMin].data().dptr()[0] = output_min; + out_data[quantized_elemwise_add_enum::kMax].data().dptr()[0] = output_max; +} + +inline static bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, + DispatchMode* dispatch_mode, std::vector* in_attrs, + std::vector* out_attrs) { + // Check num of inputs: A, B, A_min, A_max, B_min, B_max + CHECK_EQ(in_attrs->size(), 6U); + // Check num of outputs: C, C_min, C_max + CHECK_EQ(out_attrs->size(), 3U); + + return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); +} + +NNVM_REGISTER_OP(_contrib_quantized_elemwise_add) +.set_attr("FInferStorageType", ElemwiseAddStorageType) +.set_attr("FComputeEx", MKLDNNQuantizedElemwiseAddForward) +.set_attr("TIsMKLDNN", true) +.set_attr_parser(ParamParser) +.add_arguments(QuantizeElemwiseAddParam::__FIELDS__()); +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc index cf3d789e2882..e8abab22446e 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc @@ -80,7 +80,7 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs, } Stream *s = ctx.get_stream(); - mxnet_op::Kernel::Launch(s, 1, + mxnet_op::Kernel::Launch(s, 1, min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight); bool is_train = false; diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h 
b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h index ac414c72d51a..03d9b9067b57 100644 --- a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h @@ -34,6 +34,7 @@ namespace mxnet { namespace op { +template static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -45,7 +46,6 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, using red::limits::MaxValue; using red::limits::MinValue; typedef int32_t SrcDType; - typedef int8_t DstDType; // check shapes size_t i_dim = inputs[0].shape().ndim(); size_t o_dim = outputs[0].shape().ndim(); @@ -56,12 +56,21 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, *inputs[2].data().dptr()); float first_scale = first_real_range / first_quantized_range; float second_real_range = real_range; - float second_quantized_range = MinAbs(MaxValue(), - MinValue()); + float second_quantized_range = 0.f; + if (std::is_same::value) { + second_quantized_range = MinAbs(MaxValue(), MinValue()); + *outputs[1].data().dptr() = -second_real_range; + *outputs[2].data().dptr() = second_real_range; + } else if (std::is_same::value) { + second_quantized_range = MaxValue(); + *outputs[1].data().dptr() = 0.f; + *outputs[2].data().dptr() = second_real_range; + } else { + LOG(FATAL) << "Unsupported requantize output type"; + } float second_scale = second_quantized_range / second_real_range; float scale = first_scale * second_scale; - *outputs[1].data().dptr() = -second_real_range; - *outputs[2].data().dptr() = second_real_range; + primitive_attr attr; const int mask = 0; std::vector scales = {scale}; @@ -82,7 +91,7 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs, i_dims[i] = static_cast(in_buffer.shape()[i]); } auto o_desc = mkldnn::memory::desc(i_dims, - (mkldnn::memory::data_type)data_type_enum::type, + (mkldnn::memory::data_type)data_type_enum::type, i_fmt); auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); @@ -99,55 +108,47 @@ static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mshadow; using namespace mxnet_op; + using red::limits::MaxValue; + using red::limits::MinValue; typedef int32_t SrcDType; typedef int8_t DstDType; - Stream *s = ctx.get_stream(); const RequantizeParam& param = nnvm::get(attrs.parsed); float real_range; // Model is calibrated if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { real_range = MaxAbs(param.min_calib_range.value(), param.max_calib_range.value()); - MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); // Model is not calibrated } else { - mxnet::TShape src_shape, dst_shape; - const size_t actual_float_size = sizeof(float); - const size_t actual_quantized_size = sizeof(SrcDType); - const size_t temp_reduce_size = ConfigReduce(s, - inputs[0].shape(), mxnet::TShape(1, 1), &src_shape, &dst_shape); - Tensor temp_space = - ctx.requested[0].get_space_typed( - Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s); - Tensor actual_min_float( - reinterpret_cast(temp_space.dptr_), Shape1(1), s); - Tensor actual_max_float( - reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), s); - const int dev_id = ctx.run_ctx.ctx.dev_id; - TBlob actual_min_quantized(reinterpret_cast( - temp_space.dptr_ + 8), Shape1(1), cpu::kDevMask, dev_id); - TBlob 
actual_max_quantized(reinterpret_cast( - temp_space.dptr_ + 8) + 1, Shape1(1), cpu::kDevMask, dev_id); - Tensor workspace( - temp_space.dptr_+2*actual_float_size+2*actual_quantized_size, - Shape1(temp_reduce_size), s); - broadcast::Reduce( - s, actual_min_quantized.reshape(dst_shape), kWriteTo, - workspace, inputs[0].Reorder2Default().data().reshape(src_shape)); - Kernel::Launch(s, 1, - actual_min_float.dptr_, actual_min_quantized.dptr(), - inputs[1].Reorder2Default().data().dptr(), - inputs[2].Reorder2Default().data().dptr()); - broadcast::Reduce( - s, actual_max_quantized.reshape(dst_shape), kWriteTo, - workspace, inputs[0].Reorder2Default().data().reshape(src_shape)); - Kernel::Launch(s, 1, - actual_max_float.dptr_, actual_max_quantized.dptr(), - inputs[1].Reorder2Default().data().dptr(), - inputs[2].Reorder2Default().data().dptr()); - - real_range = MaxAbs(*actual_min_float.dptr_, *actual_max_float.dptr_); - MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); + NDArray in_buffer = inputs[0].Reorder2Default(); + auto in_ptr = in_buffer.data().dptr(); + auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + SrcDType data_min = MaxValue(); + SrcDType data_max = MinValue(); + std::vector data_maxs(nthreads, data_max); + std::vector data_mins(nthreads, data_min); +#pragma omp parallel for num_threads(nthreads) + for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { + int tid = omp_get_thread_num(); + if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; + if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; + } + for (index_t i = 0; i < nthreads; i++) { + if (data_maxs[i] > data_max) data_max = data_maxs[i]; + if (data_mins[i] < data_min) data_min = data_mins[i]; + } + float src_range = MinAbs(MinValue(), MaxValue()); + SrcDType data_range = MaxAbs(data_min, data_max); + float data_scale = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); + real_range = data_range * data_scale / src_range; + } + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8) { + MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); + } else if (out_type == mshadow::kInt8) { + MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range); + } else { + LOG(FATAL) << "mkldnn requantize op only supports int8 and uint8 as output type"; } } diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h index c540ea441431..e7f7ccdf13b7 100644 --- a/src/operator/quantization/quantization_utils.h +++ b/src/operator/quantization/quantization_utils.h @@ -34,6 +34,7 @@ namespace op { static const size_t kUint8Range = 255; static const size_t kInt8Range = 127; +static const size_t kInt32Range = 0x7fffffff; template MSHADOW_XINLINE int Sign(T val) { @@ -127,39 +128,31 @@ MSHADOW_XINLINE void RequantizeManyInNewRange(size_t count, T2* output, const T1 * \brief Get the scaling factor for converting type T to float. 
*/ template -MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max) { +MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max, bool all_sign) { using mshadow::red::limits::MinValue; using mshadow::red::limits::MaxValue; - const int64_t highest = static_cast(MaxValue()); - const int64_t lowest = static_cast(MinValue()); - const float float_for_one_quantized_level = - (range_max - range_min) / (highest - lowest); - return float_for_one_quantized_level; + float range_data = MaxAbs(range_min, range_max); + float range_T = all_sign ? MinAbs(MinValue(), MaxValue()) : MaxValue(); + return range_data / range_T; } template -MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a, - float min_b, float max_b, - float* min_c, float* max_c) { - using mshadow::red::limits::MinValue; +MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b, + float max_b, float *min_c, float *max_c, + bool all_sign) { using mshadow::red::limits::MaxValue; - const float a_float_for_one_quant_level = - FloatForOneQuantizedLevel(min_a, max_a); - const float b_float_for_one_quant_level = - FloatForOneQuantizedLevel(min_b, max_b); - - const int64_t c_highest = - static_cast(MaxValue()); - const int64_t c_lowest = - static_cast(MinValue()); + using mshadow::red::limits::MinValue; + const float a_float_for_one_quant_level = FloatForOneQuantizedLevel(min_a, max_a, all_sign); + const float b_float_for_one_quant_level = FloatForOneQuantizedLevel(min_b, max_b, all_sign); + const float range_c = + MinAbs(static_cast(MinValue()), static_cast(MaxValue())); const float c_float_for_one_quant_level = - a_float_for_one_quant_level * b_float_for_one_quant_level; - - *min_c = c_float_for_one_quant_level * c_lowest; - *max_c = c_float_for_one_quant_level * c_highest; + a_float_for_one_quant_level * b_float_for_one_quant_level; + *max_c = c_float_for_one_quant_level * range_c; + *min_c = -*max_c; } -struct QuantizationRangeForMultiplicationStruct { +struct QuantizationRangeForS8S8MultiplicationStruct { MSHADOW_XINLINE static void Map(int i, float *min_c, float *max_c, @@ -168,7 +161,20 @@ struct QuantizationRangeForMultiplicationStruct { const float *min_b, const float *max_b) { QuantizationRangeForMultiplication( - min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c); + min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, true); + } +}; + +struct QuantizationRangeForS8U8MultiplicationStruct { + MSHADOW_XINLINE static void Map(int i, + float *min_c, + float *max_c, + const float *min_a, + const float *max_a, + const float *min_b, + const float *max_b) { + QuantizationRangeForMultiplication( + min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, false); } }; @@ -186,6 +192,29 @@ inline size_t ConfigReduce(mshadow::Stream* s, return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape); } +enum QuantizeOutType { kAuto = 0, kInt8, kUint8 }; + +template +static mshadow::TypeFlag GetQuantizeOutputType(const Param ¶m) { + auto out_type = mshadow::kInt8; + if (param.out_type == QuantizeOutType::kAuto) { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + if (param.min_calib_range.value() >= 0.0) { + out_type = mshadow::kUint8; + } else { + out_type = mshadow::kInt8; + } + } + } else if (param.out_type == QuantizeOutType::kInt8) { + out_type = mshadow::kInt8; + } else if (param.out_type == QuantizeOutType::kUint8) { + out_type = mshadow::kUint8; + } else { + LOG(FATAL) << "Unsupported 
out_type in params: " <attrs.op = Op::Get("_contrib_requantize"); requantize_node->attrs.name = "requantize_" + node->attrs.name; + requantize_node->attrs.dict["out_type"] = quantized_dtype; if (requantize_node->op()->attr_parser != nullptr) { requantize_node->op()->attr_parser(&(requantize_node->attrs)); } @@ -398,7 +399,7 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) { node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); node->op()->attr_parser(&(node->attrs)); const QuantizeV2Param& param = nnvm::get(node->attrs.parsed); - if (param.out_type == QuantizeV2Param::OutType::kUint8 && + if (param.out_type == QuantizeOutType::kUint8 && param.min_calib_range.value() < 0.0f) { LOG(WARNING) << "Calibration statistics indicates that node `" << node->attrs.name << "` has negative input, consider use `auto` or `int8` as out_type"; diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h index 2054075fed90..a8cbc0b6fdf5 100644 --- a/src/operator/quantization/quantize_v2-inl.h +++ b/src/operator/quantization/quantize_v2-inl.h @@ -38,16 +38,15 @@ namespace mxnet { namespace op { struct QuantizeV2Param : public dmlc::Parameter { - enum OutType { kAuto = 0, kInt8, kUint8 }; int out_type; dmlc::optional min_calib_range; dmlc::optional max_calib_range; DMLC_DECLARE_PARAMETER(QuantizeV2Param) { DMLC_DECLARE_FIELD(out_type) - .add_enum("auto", kAuto) - .add_enum("int8", kInt8) - .add_enum("uint8", kUint8) - .set_default(kInt8) + .add_enum("auto", QuantizeOutType::kAuto) + .add_enum("int8", QuantizeOutType::kInt8) + .add_enum("uint8", QuantizeOutType::kUint8) + .set_default(QuantizeOutType::kInt8) .describe("Output data type. `auto` can be specified to automatically determine output type " "according to min_calib_range."); DMLC_DECLARE_FIELD(min_calib_range) @@ -61,26 +60,6 @@ struct QuantizeV2Param : public dmlc::Parameter { } }; -static mshadow::TypeFlag GetOutputType(const QuantizeV2Param ¶m) { - auto out_type = mshadow::kInt8; - if (param.out_type == QuantizeV2Param::OutType::kAuto) { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - if (param.min_calib_range.value() >= 0.0) { - out_type = mshadow::kUint8; - } else { - out_type = mshadow::kInt8; - } - } - } else if (param.out_type == QuantizeV2Param::OutType::kInt8) { - out_type = mshadow::kInt8; - } else if (param.out_type == QuantizeV2Param::OutType::kUint8) { - out_type = mshadow::kUint8; - } else { - LOG(FATAL) << "Unsupported out_type in params: " < @@ -143,7 +122,7 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector const QuantizeV2Param ¶m = nnvm::get(attrs.parsed); CHECK(in_attrs->at(0) == mshadow::kFloat32 || in_attrs->at(0) == mshadow::kUint8 || in_attrs->at(0) == mshadow::kInt8); - auto out_type = GetOutputType(param); + auto out_type = GetQuantizeOutputType(param); if (out_type == mshadow::kUint8) { TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); } else if (out_type == mshadow::kInt8) { @@ -170,7 +149,7 @@ class QuantizeV2Operator { using mshadow::red::limits::MinValue; Stream *s = ctx.get_stream(); const QuantizeV2Param ¶m = nnvm::get(attrs_.parsed); - auto out_type = GetOutputType(param); + auto out_type = GetQuantizeOutputType(param); if (out_type == mshadow::kUint8 && std::is_same::value) { LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " "please switch to the context of CPU or int8 data type for GPU."; diff --git a/src/operator/quantization/quantized_conv.cu 
b/src/operator/quantization/quantized_conv.cu index ee688c0648c8..23c41a17ef4a 100644 --- a/src/operator/quantization/quantized_conv.cu +++ b/src/operator/quantization/quantized_conv.cu @@ -174,7 +174,7 @@ class QuantizedCuDNNConvOp { // of in_data[0] and in_data[1]. Need to rescale the min/max range of out_data // based on the min/max ranges of in_data[0] and in_data[1]. const size_t num_inputs = param_.no_bias ? 2 : 3; - mxnet_op::Kernel::Launch(s, 1, + mxnet_op::Kernel::Launch(s, 1, out_data[1].dptr(), out_data[2].dptr(), in_data[num_inputs].dptr(), in_data[num_inputs+1].dptr(), in_data[num_inputs+2].dptr(), in_data[num_inputs+3].dptr()); diff --git a/src/operator/quantization/quantized_elemwise_add-inl.h b/src/operator/quantization/quantized_elemwise_add-inl.h new file mode 100644 index 000000000000..673b281d6cdc --- /dev/null +++ b/src/operator/quantization/quantized_elemwise_add-inl.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file quantized_elemwise_add-inl.h + * \brief + * \author Rong Zhang + */ + +#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_ + +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { +/* These structure is used for requantization only when fusion */ +struct QuantizeElemwiseAddParam : public dmlc::Parameter { + dmlc::optional min_calib_range; + dmlc::optional max_calib_range; + DMLC_DECLARE_PARAMETER(QuantizeElemwiseAddParam) { + DMLC_DECLARE_FIELD(min_calib_range) + .set_default(dmlc::optional()) + .describe("The minimum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to requantize the " + "int8 output data."); + DMLC_DECLARE_FIELD(max_calib_range) + .set_default(dmlc::optional()) + .describe("The maximum scalar value in the form of float32 obtained " + "through calibration. If present, it will be used to requantize the " + "int8 output data."); + } +}; + +namespace quantized_elemwise_add_enum { +enum QuantizedElemwiseAddOutputs { kOut, kMin, kMax }; +enum QuantizedElemwiseAddInputs { kDataA, kDataB, kAMin, kAMax, kBMin, kBMax}; +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_ diff --git a/src/operator/quantization/quantized_elemwise_add.cc b/src/operator/quantization/quantized_elemwise_add.cc new file mode 100644 index 000000000000..f821e6598192 --- /dev/null +++ b/src/operator/quantization/quantized_elemwise_add.cc @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file quantized_elemwise_add.cc + * \brief +*/ +#include "../tensor/elemwise_unary_op.h" +#include "./quantized_elemwise_add-inl.h" + +namespace mxnet { +namespace op { + +static bool ElemwiseAddShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + // A, B, A_min, A_max, B_min, B_max + CHECK_EQ(in_shape->size(), 6U); + // C, C_min, C_max + CHECK_EQ(out_shape->size(), 3U); + CHECK_EQ((*in_shape)[0], (*in_shape)[1]); + + + SHAPE_ASSIGN_CHECK(*in_shape, 2, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_shape, 3, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_shape, 4, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_shape, 5, TShape{1}); + + SHAPE_ASSIGN_CHECK(*out_shape, 0, (*in_shape)[0]); + SHAPE_ASSIGN_CHECK(*out_shape, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_shape, 2, TShape{1}); + return true; +} + +static bool ElemwiseAddType(const nnvm::NodeAttrs& attrs, + std::vector<int>* in_type, + std::vector<int>* out_type) { + // A, B, A_min, A_max, B_min, B_max + CHECK_EQ(in_type->size(), 6U); + // C, C_min, C_max + CHECK_EQ(out_type->size(), 3U); + + // A, B + const int elem_add_num = 2; + for (int i = 0; i < elem_add_num; ++i) { + if (in_type->at(i) == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8); + } else { + TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kUint8); + } + } + // C + int dtype = mshadow::kInt32; + const QuantizeElemwiseAddParam& params = nnvm::get<QuantizeElemwiseAddParam>(attrs.parsed); + if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) { + dtype = (in_type->at(0) == in_type->at(1)) ? in_type->at(0) : mshadow::kInt8; + } + TYPE_ASSIGN_CHECK(*out_type, 0, dtype); + // C_min + TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32); + // C_max + TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32); + + return true; +} + +void QuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector<TBlob> &in_data, + const std::vector<OpReqType> &req, + const std::vector<TBlob> &out_data) { + LOG(FATAL) << "Not supported for MXNet built without MKLDNN. " + "Please install MKLDNN enabled MXNet."; +} + +NNVM_REGISTER_OP(_contrib_quantized_elemwise_add) +.describe(R"code(elemwise_add operator for inputs dataA and dataB of data type int8, +accumulating into an int32 output. For each argument, two more arguments of type +float32 must be provided, representing the thresholds used to quantize that argument from +float32 to int8. The final outputs contain the int32 result together with the min +and max thresholds used to quantize the float32 output into int32. + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training.
+ +)code") +.set_num_inputs([](const NodeAttrs& attrs) { +// A, B, A_min, A_max, B_min, B_max + return 6; +}) +// C, C_min, C_max +.set_num_outputs(3) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs", "lhs_min", "lhs_max", "rhs_min", "rhs_max"}; \ +}) +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; +}) +.set_attr("FInferType", ElemwiseAddType) +.set_attr("FInferShape", ElemwiseAddShape) +.set_attr("FCompute", QuantizedElemwiseAddForward) +.set_attr("FNeedRequantize", [](const NodeAttrs& attrs) { return true; }) +.add_argument("lhs", "NDArray-or-Symbol", "first input") +.add_argument("rhs", "NDArray-or-Symbol", "second input") +.add_argument("lhs_min", "NDArray-or-Symbol", "3rd input") +.add_argument("lhs_max", "NDArray-or-Symbol", "4th input") +.add_argument("rhs_min", "NDArray-or-Symbol", "5th input") +.add_argument("rhs_max", "NDArray-or-Symbol", "6th input"); + + +NNVM_REGISTER_OP(elemwise_add) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = Op::Get("_contrib_quantized_elemwise_add"); + node->attrs.name = "quantized_" + attrs.name; + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; +}); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc index e42ea3020352..ceac0b6ec9a0 100644 --- a/src/operator/quantization/quantized_fully_connected.cc +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -233,7 +233,7 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs, Tensor max_weight = in_data[num_inputs + quantized_fullc::kWeightMax].get(s); - Kernel::Launch(s, 1, min_output.dptr_, + Kernel::Launch(s, 1, min_output.dptr_, max_output.dptr_, min_data.dptr_, max_data.dptr_, min_weight.dptr_, max_weight.dptr_); if (!param.no_bias) { Tensor bias = in_data[fullc::kBias].get_with_shape( diff --git a/src/operator/quantization/quantized_fully_connected.cu b/src/operator/quantization/quantized_fully_connected.cu index d1cbdc98d535..04680c8c2b78 100644 --- a/src/operator/quantization/quantized_fully_connected.cu +++ b/src/operator/quantization/quantized_fully_connected.cu @@ -109,7 +109,7 @@ void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs, cmp_type, CUBLAS_GEMM_DFALT)); - Kernel::Launch(s, 1, + Kernel::Launch(s, 1, outputs[1].dptr(), outputs[2].dptr(), inputs[num_inputs].dptr(), inputs[num_inputs+1].dptr(), inputs[num_inputs+2].dptr(), inputs[num_inputs+3].dptr()); diff --git a/src/operator/quantization/requantize-inl.h b/src/operator/quantization/requantize-inl.h index 9106c7fe4716..2bdc3a712961 100644 --- a/src/operator/quantization/requantize-inl.h +++ b/src/operator/quantization/requantize-inl.h @@ -38,9 +38,17 @@ namespace mxnet { namespace op { struct RequantizeParam : public dmlc::Parameter { + int out_type; dmlc::optional min_calib_range; // min float value calculated from calibration dataset dmlc::optional max_calib_range; // max float value calculated from calibration dataset DMLC_DECLARE_PARAMETER(RequantizeParam) { + DMLC_DECLARE_FIELD(out_type) + .add_enum("auto", QuantizeOutType::kAuto) + .add_enum("int8", QuantizeOutType::kInt8) + .add_enum("uint8", QuantizeOutType::kUint8) + .set_default(QuantizeOutType::kInt8) + .describe("Output data type. 
`auto` can be specified to automatically determine output type " + "according to min_calib_range."); DMLC_DECLARE_FIELD(min_calib_range) .set_default(dmlc::optional()) .describe("The minimum scalar value in the form of float32 obtained " @@ -59,10 +67,18 @@ inline bool RequantizeType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 3U); + const RequantizeParam ¶m = nnvm::get(attrs.parsed); TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt32); TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); + } else if (out_type == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "requantize op only supports int8 and uint8 as output type"; + } TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); return (*in_attrs)[0] != -1; @@ -100,6 +116,11 @@ void RequantizeForward(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); const RequantizeParam& param = nnvm::get(attrs.parsed); + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8 && std::is_same::value) { + LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " + "please switch to the context of CPU or int8 data type for GPU."; + } if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { Kernel::Launch(s, inputs[0].Size(), diff --git a/src/operator/random/sample_multinomial_op.h b/src/operator/random/sample_multinomial_op.h index b38aefbc1634..377df4f313da 100644 --- a/src/operator/random/sample_multinomial_op.h +++ b/src/operator/random/sample_multinomial_op.h @@ -68,7 +68,7 @@ inline bool SampleMultinomialOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), param.get_prob ? 2U : 1U); const mxnet::TShape& ishape = (*in_attrs)[0]; - if (!shape_is_known(ishape)) return false; + if (!ndim_is_known(ishape)) return false; MSHADOW_TYPE_SWITCH(param.dtype, DType, { CHECK_LE(ishape[ishape.ndim() - 1], mxnet::common::MaxIntegerValue()) @@ -95,7 +95,10 @@ inline bool SampleMultinomialOpShape(const nnvm::NodeAttrs& attrs, } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); if (param.get_prob) SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape); - return shape_is_known(out_attrs->at(0)) && shape_is_known(out_attrs->at(1)); + for (const auto& out_shape : *out_attrs) { + if (!shape_is_known(out_shape)) return false; + } + return true; } diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h index 03210d325699..8e2362f76dd2 100644 --- a/src/operator/sequence_reverse-inl.h +++ b/src/operator/sequence_reverse-inl.h @@ -64,40 +64,37 @@ struct SequenceReverseParam : public dmlc::Parameter { } }; +template struct ReverseKernel { template MSHADOW_XINLINE static void Map(const int i, DType *const out_data, const DType *const in_data, - const OpReqType req, const index_t max_seq_len, const index_t batch_size, const index_t other_dim, const index_t numel, const IType *const indices) { - for (index_t batch = 0; batch < batch_size; ++batch) { - const index_t num_seq = - indices ? 
static_cast(indices[batch]) : max_seq_len; - const index_t padded_periods = max_seq_len - num_seq; - // padded part - if (padded_periods > 0 && i < static_cast(padded_periods)) { - const int padded_in_offset = - (i + num_seq) * batch_size * other_dim + batch * other_dim; - - for (index_t j = 0; j < other_dim; ++j) { - KERNEL_ASSIGN(out_data[padded_in_offset + j], req, - in_data[padded_in_offset + j]); - } - } - // unpadded part - if (i < static_cast(num_seq)) { - const int in_offset = i * batch_size * other_dim + batch * other_dim; - const int out_offset = - numel - (i + 1 + padded_periods) * batch_size * other_dim + - batch * other_dim; - - for (index_t j = 0; j < other_dim; ++j) { - KERNEL_ASSIGN(out_data[out_offset + j], req, in_data[in_offset + j]); - } - } + const index_t batch = i / (max_seq_len * other_dim); + const int id = (i / other_dim) % max_seq_len; + const index_t j = i % other_dim; + const index_t num_seq = + indices ? static_cast(indices[batch]) : max_seq_len; + const index_t padded_periods = max_seq_len - num_seq; + // padded part + if (padded_periods > 0 && id < static_cast(padded_periods)) { + const int padded_in_offset = + (id + num_seq) * batch_size * other_dim + batch * other_dim; + + KERNEL_ASSIGN(out_data[padded_in_offset + j], req, + in_data[padded_in_offset + j]); + } + // unpadded part + if (id < static_cast(num_seq)) { + const int in_offset = id * batch_size * other_dim + batch * other_dim; + const int out_offset = + numel - (id + 1 + padded_periods) * batch_size * other_dim + + batch * other_dim; + + KERNEL_ASSIGN(out_data[out_offset + j], req, in_data[in_offset + j]); } } }; @@ -118,9 +115,11 @@ class SequenceReverseOp : public Operator { const index_t other_dim = data.size(2); const index_t tensor_numel = data.shape_.Size(); - mxnet_op::Kernel::Launch( - s, max_seq_len, out.dptr_, data.dptr_, req, max_seq_len, batch_size, - other_dim, tensor_numel, indices); + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + mxnet_op::Kernel, xpu>::Launch( + s, max_seq_len * batch_size * other_dim, out.dptr_, data.dptr_, + max_seq_len, batch_size, other_dim, tensor_numel, indices); + }); } virtual void Forward(const OpContext &ctx, const std::vector &in_data, diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index 32ea341d0834..28b89613ee86 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -509,9 +509,9 @@ void FindOutputEntries(nnvm::Graph* g, void CutGraphInputs(const std::vector &input_entries, std::vector *orig_entries, const bool skip_var = false) { - orig_entries->resize(input_entries.size()); + orig_entries->reserve(input_entries.size()); // map for creating unique var nodes for deduplicating entries from the same node - std::unordered_map name_count_map; + std::unordered_map new_node_map; for (size_t i = 0; i < input_entries.size(); ++i) { nnvm::NodeEntry *e = input_entries[i]; // If the node is a variable itself, we may want to skip the node. 
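The ReverseKernel change in sequence_reverse-inl.h above replaces the per-time-step loop over batches with one thread per element: the launch size becomes max_seq_len * batch_size * other_dim and each flat index is decoded into (batch, time step, feature). A minimal standalone sketch of that decode, using made-up sizes purely for illustration:

#include <cstdio>

int main() {
  // Hypothetical sizes; the real kernel takes them from the input tensor shape.
  const int max_seq_len = 4, batch_size = 2, other_dim = 3;
  for (int i = 0; i < max_seq_len * batch_size * other_dim; ++i) {
    const int batch = i / (max_seq_len * other_dim);  // which sequence in the batch
    const int id = (i / other_dim) % max_seq_len;     // time step within the sequence
    const int j = i % other_dim;                      // position along the remaining dims
    std::printf("i=%2d -> batch=%d, id=%d, j=%d\n", i, batch, id, j);
  }
  return 0;
}

Hoisting the request type out of the kernel (the MXNET_ASSIGN_REQ_SWITCH at the launch site) also turns OpReqType into a template parameter of ReverseKernel instead of a per-thread runtime argument.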
@@ -519,19 +519,17 @@ void CutGraphInputs(const std::vector &input_entries, continue; } - orig_entries->at(i) = *e; nnvm::Symbol sym; sym.outputs.push_back(*e); const auto output_names = sym.ListOutputNames(); CHECK_EQ(output_names.size(), 1U); const std::string& var_name = output_names[0]; - auto it = name_count_map.find(var_name); - if (name_count_map.end() == it) { - name_count_map.emplace(var_name, 0); - } else { - ++(it->second); + auto it = new_node_map.find(var_name); + if (it == new_node_map.end()) { + orig_entries->push_back(*e); + new_node_map[var_name] = nnvm::CreateVariableNode(var_name); } - nnvm::NodePtr n = nnvm::CreateVariableNode(var_name + std::to_string(name_count_map[var_name])); + nnvm::NodePtr n = new_node_map[var_name]; *e = nnvm::NodeEntry{n, 0, 0}; } } diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc index e142fae90e97..2c05fda9a879 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc @@ -31,6 +31,9 @@ namespace mxnet { namespace op { +using red::limits::MaxValue; +using red::limits::MinValue; + template static void UpdateConvWeightBias(NDArray *weight, NDArray *bias, bool no_bias, const NDArray &gamma, const NDArray &beta, @@ -78,8 +81,6 @@ static inline size_t GetInSumIndex(const MKLDNNConvFusionParam ¶m) { template static std::vector GetWeightScales(const NDArray &weight, bool weight_channelwise_scale) { - using red::limits::MaxValue; - using red::limits::MinValue; std::vector weight_scales; const DType *weight_ptr = weight.data().dptr(); size_t channel = weight.shape()[0]; @@ -111,9 +112,11 @@ static std::vector GetWeightScales(const NDArray &weight, bool weight_cha if (total_min > weight_c_min[c]) total_min = weight_c_min[c]; if (total_max < weight_c_max[c]) total_max = weight_c_max[c]; } - weight_scales.resize(1); + weight_scales.resize(3); DType weight_range = MaxAbs(total_min, total_max); weight_scales[0] = kInt8Range / weight_range; + weight_scales[1] = total_min; + weight_scales[2] = total_max; } return weight_scales; } @@ -247,11 +250,24 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, if (!inplace_) { auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); + if (outputs[kOut].dtype() == mshadow::kInt32) { + auto mem_desc = in_mkl_mem->get_primitive_desc().desc(); + auto this_dtype = get_mkldnn_type(mshadow::kInt32); + mkldnn::memory::desc omd( + mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims), + this_dtype, static_cast(mem_desc.data.format)); + mkldnn::memory::primitive_desc opd(omd, CpuEngine::Get()->get_engine()); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(opd, out_mkl_mem->get_data_handle())); + MKLDNNStream::Get()->RegisterMem(tmp_mem); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*in_mkl_mem, *tmp_mem)); + output = NDArray(tmp_mem); + } else { mkldnn_mem_ptr tmp_mem( new mkldnn::memory(in_mkl_mem->get_primitive_desc(), out_mkl_mem->get_data_handle())); MKLDNNStream::Get()->RegisterMem(tmp_mem); mxnet::MKLDNNCopy(*in_mkl_mem, tmp_mem.get()); output = NDArray(tmp_mem); + } } } @@ -327,7 +343,8 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, float quantized_out_range; float output_scale; if (mkldnn_param.with_sum) { - auto quantized_sum_range = cached_sum_min_ < 0 ? kInt8Range : kUint8Range; + auto quantized_sum_range = + (inputs[in_sum].dtype() == mshadow::kInt8) ? 
kInt8Range : kUint8Range; sum_in_scale = quantized_sum_range / MaxAbs(cached_sum_min_, cached_sum_max_); } if (post_requantize_) { @@ -339,11 +356,23 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, full_conv_param.requantize_scales[c] = output_scale / data_scale_ / weight_scales_[c]; } } else { + Stream *s = ctx.get_stream(); + if (data.dtype() == mshadow::kInt8) { + mxnet_op::Kernel::Launch( + s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1], + &weight_scales_[2], &cached_data_min_, &cached_data_max_); + } else { + mxnet_op::Kernel::Launch( + s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1], + &weight_scales_[2], &cached_data_min_, &cached_data_max_); + } + weight_scales_.resize(1); output_scale = data_scale_ * weight_scales_[0]; full_conv_param.requantize_scales.resize(0); } - if (mkldnn_param.with_sum) + if (mkldnn_param.with_sum) { full_conv_param.sum_scale = output_scale / sum_in_scale; + } } fwd_.reset(new MKLDNNConvForward( full_conv_param, ctx.is_train, data, cached_weight_, @@ -375,11 +404,10 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, MKLDNNConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), new_inputs, new_req, {output}); } - if (post_requantize_) { - float *out_min_ptr = outputs[kMin].data().dptr(); - float *out_max_ptr = outputs[kMax].data().dptr(); - *out_min_ptr = cached_output_min_; - *out_max_ptr = cached_output_max_; + + if (mkldnn_param.quantized) { + *outputs[kMin].data().dptr() = cached_output_min_; + *outputs[kMax].data().dptr() = cached_output_max_; } if (mkldnn_param.with_sum) { auto out = const_cast(outputs[kOut]); diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc index 857a27d9a134..f345a18c18a6 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc @@ -174,7 +174,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale; } else { Stream *s = ctx.get_stream(); - mxnet_op::Kernel::Launch( + mxnet_op::Kernel::Launch( s, 1, &cached_min_output_, &cached_max_output_, &min_data, &max_data, &min_weight, &max_weight); } diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv_post_quantize_property.h b/src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h similarity index 68% rename from src/operator/subgraph/mkldnn/mkldnn_conv_post_quantize_property.h rename to src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h index f9033f48d413..b61a303757b3 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv_post_quantize_property.h +++ b/src/operator/subgraph/mkldnn/mkldnn_post_quantize_property.h @@ -16,12 +16,13 @@ * specific language governing permissions and limitations * under the License. */ -#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_POST_QUANTIZE_PROPERTY_H_ -#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_POST_QUANTIZE_PROPERTY_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ +#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ #if MXNET_USE_MKLDNN == 1 #include #include +#include #include "../common.h" #include "../subgraph_property.h" #include "../../nn/mkldnn/mkldnn_convolution-inl.h" @@ -31,7 +32,7 @@ namespace mxnet { namespace op { -class SgMKLDNNConvPostQuantizeSelector : public SubgraphSelector { +class SgMKLDNNPostQuantizeSelector : public SubgraphSelector { public: /*! 
\brief pattern match status */ enum SelectStatus { @@ -43,14 +44,25 @@ class SgMKLDNNConvPostQuantizeSelector : public SubgraphSelector { private: SelectStatus status; std::vector matched_list; + std::set support_requantize_fusion_op_name; public: - SgMKLDNNConvPostQuantizeSelector() {} + SgMKLDNNPostQuantizeSelector() { + support_requantize_fusion_op_name.insert("_sg_mkldnn_conv"); + support_requantize_fusion_op_name.insert("_contrib_quantized_elemwise_add"); + } bool Select(const nnvm::Node &n) override { - if (n.op() && n.op()->name == "_sg_mkldnn_conv") { - auto const ¶m = nnvm::get(n.attrs.parsed); - if (param.full_conv_param.mkldnn_param.quantized) { + if (n.op() && support_requantize_fusion_op_name.count(n.op()->name)) { + if (n.op()->name == "_sg_mkldnn_conv") { + auto const ¶m = nnvm::get(n.attrs.parsed); + if (param.full_conv_param.mkldnn_param.quantized) { + status = kStart; + matched_list.clear(); + matched_list.push_back(&n); + return true; + } + } else if (n.op()->name == "_contrib_quantized_elemwise_add") { status = kStart; matched_list.clear(); matched_list.push_back(&n); @@ -97,47 +109,48 @@ class SgMKLDNNConvPostQuantizeSelector : public SubgraphSelector { } }; -class SgMKLDNNConvPostQuantizeProperty : public SubgraphProperty { +class SgMKLDNNPostQuantizeProperty : public SubgraphProperty { public: - SgMKLDNNConvPostQuantizeProperty() {} - + SgMKLDNNPostQuantizeProperty() { + support_requantize_fusion_op_name.insert("_sg_mkldnn_conv"); + support_requantize_fusion_op_name.insert("_contrib_quantized_elemwise_add"); + } static SubgraphPropertyPtr Create() { - static const std::string &name = "MKLDNN Convolution post-quantization optimization pass"; - auto property = std::make_shared(); + static const std::string &name = "MKLDNN post-quantization optimization pass"; + auto property = std::make_shared(); property->SetAttr("property_name", name); property->SetAttr("inference_only", true); return property; } - nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym, const int subgraph_id = 0) const override { - nnvm::NodePtr conv_node = nullptr; + nnvm::NodePtr fuse_node = nullptr; nnvm::NodePtr requantize_node = nullptr; DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) { if (node->is_variable()) return; auto &op_name = node->op()->name; - if (op_name == "_sg_mkldnn_conv") { - conv_node = node; + if (support_requantize_fusion_op_name.count(op_name)) { + fuse_node = node; } else if (op_name == "_contrib_requantize") { requantize_node = node; } }); - CHECK_NOTNULL(conv_node); + CHECK_NOTNULL(fuse_node); CHECK_NOTNULL(requantize_node); auto const &requantize_param = nnvm::get(requantize_node->attrs.parsed); CHECK(requantize_param.min_calib_range.has_value()); CHECK(requantize_param.max_calib_range.has_value()); - conv_node->attrs.dict["min_calib_range"] = + fuse_node->attrs.dict["min_calib_range"] = std::to_string(requantize_param.min_calib_range.value()); - conv_node->attrs.dict["max_calib_range"] = + fuse_node->attrs.dict["max_calib_range"] = std::to_string(requantize_param.max_calib_range.value()); - conv_node->op()->attr_parser(&(conv_node->attrs)); - return conv_node; + fuse_node->op()->attr_parser(&(fuse_node->attrs)); + return fuse_node; } SubgraphSelectorPtr CreateSubgraphSelector() const override { - auto selector = std::make_shared(); + auto selector = std::make_shared(); return selector; } @@ -149,10 +162,12 @@ class SgMKLDNNConvPostQuantizeProperty : public SubgraphProperty { *entry_ptr = nnvm::NodeEntry{n, entry_ptr->index, 0}; } } -}; + private: + std::set 
support_requantize_fusion_op_name; +}; } // namespace op } // namespace mxnet #endif // if MXNET_USE_MKLDNN == 1 -#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_CONV_POST_QUANTIZE_PROPERTY_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_POST_QUANTIZE_PROPERTY_H_ diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc index 26aa3b5b8e9a..4fc2d2c024bf 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc @@ -21,7 +21,7 @@ #include "mkldnn_conv_property.h" #include "mkldnn_fc_property.h" -#include "mkldnn_conv_post_quantize_property.h" +#include "mkldnn_post_quantize_property.h" #include "mkldnn_fc_post_quantize_property.h" #include "mkldnn_post_quantize_align_scale_property.h" @@ -30,7 +30,7 @@ namespace op { MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNConvProperty); MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNFCProperty); -MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_POST_QUANTIZE, SgMKLDNNConvPostQuantizeProperty); +MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_POST_QUANTIZE, SgMKLDNNPostQuantizeProperty); MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_POST_QUANTIZE, SgMKLDNNFCPostQuantizeProperty); MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_POST_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty); diff --git a/src/operator/contrib/nnvm_to_onnx-inl.h b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h similarity index 61% rename from src/operator/contrib/nnvm_to_onnx-inl.h rename to src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h index 052948521ba8..4a88aee886db 100644 --- a/src/operator/contrib/nnvm_to_onnx-inl.h +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h @@ -1,5 +1,5 @@ -#ifndef MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ -#define MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -20,76 +20,23 @@ */ /*! 
- * Copyright (c) 2018 by Contributors - * \file tensorrt-inl.h - * \brief TensorRT Operator + * Copyright (c) 2019 by Contributors + * \file nnvm_to_onnx-inl.h + * \brief Conversion from NNVM to ONNX for TensorRT * \author Marek Kolodziej, Clement Fuji Tsang */ #if MXNET_USE_TENSORRT -#include -#include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include "../operator_common.h" -#include "../../common/utils.h" -#include "../../common/serialization.h" - namespace mxnet { namespace op { - -namespace nnvm_to_onnx { - enum class TypeIO { Inputs = 0, Outputs = 1 }; - using NameToIdx_t = std::map; - using InferenceTuple_t = std::tuple; - using InferenceMap_t = std::map; -} // namespace nnvm_to_onnx - -struct ONNXParam : public dmlc::Parameter { - std::string serialized_onnx_graph; - std::string serialized_input_map; - std::string serialized_output_map; - nnvm_to_onnx::NameToIdx_t input_map; - nnvm_to_onnx::InferenceMap_t output_map; - ::onnx::ModelProto onnx_pb_graph; - - ONNXParam() = default; - - ONNXParam(const ::onnx::ModelProto& onnx_graph, - const nnvm_to_onnx::InferenceMap_t& input_map, - const nnvm_to_onnx::NameToIdx_t& output_map) { - common::Serialize(input_map, &serialized_input_map); - common::Serialize(output_map, &serialized_output_map); - onnx_graph.SerializeToString(&serialized_onnx_graph); - } - -DMLC_DECLARE_PARAMETER(ONNXParam) { - DMLC_DECLARE_FIELD(serialized_onnx_graph) - .describe("Serialized ONNX graph"); - DMLC_DECLARE_FIELD(serialized_input_map) - .describe("Map from inputs to topological order as input."); - DMLC_DECLARE_FIELD(serialized_output_map) - .describe("Map from outputs to order in g.outputs."); - } -}; - namespace nnvm_to_onnx { using namespace nnvm; @@ -99,24 +46,26 @@ using int64 = ::google::protobuf::int64; std::unordered_map GetPlaceholderShapes(const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig); +std::unordered_map GetPlaceholderDTypes(const DTypeVector& +dtype_inputs, + const nnvm::IndexedGraph& ig); + std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); void ConvertPlaceholder( const std::string& node_name, - const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_dtypes, GraphProto* graph_proto); void ConvertConstant(GraphProto* graph_proto, const std::string& node_name, - std::unordered_map* shared_buffer); + const std::unordered_map* const params_map); -void ConvertOutput(op::nnvm_to_onnx::InferenceMap_t* trt_output_map, - GraphProto* graph_proto, +void ConvertOutput(GraphProto* graph_proto, const std::unordered_map::iterator& out_iter, - const std::string& node_name, - const nnvm::Graph& g, - const StorageTypeVector& storage_types, - const DTypeVector& dtypes); + const std::string& node_name, const ShapeVector& shapes, + const DTypeVector& dtypes, const nnvm::IndexedGraph &ig); typedef void (*ConverterFunction)(NodeProto *node_proto, const NodeAttrs &attrs, @@ -137,6 +86,11 @@ void ConvertPooling(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +void ConvertRelu(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertActivation(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -157,6 +111,11 @@ void ConvertFlatten(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +void 
ConvertDropout(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertBatchNorm(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -167,19 +126,39 @@ void ConvertElementwiseAdd(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); -ONNXParam ConvertNnvmGraphToOnnx( - const nnvm::Graph &g, - std::unordered_map* shared_buffer); +void ConvertConcatenate(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertClip(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertPad(NodeProto* node_proto, + const NodeAttrs & attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +std::string ConvertNnvmGraphToOnnx(const nnvm::Graph &g, + const std::unordered_map* const params_map); static const std::unordered_map converter_map = { - {"Convolution", ConvertConvolution}, - {"Pooling", ConvertPooling}, {"Activation", ConvertActivation}, - {"FullyConnected", ConvertFullyConnected}, - {"SoftmaxOutput", ConvertSoftmaxOutput}, - {"Flatten", ConvertFlatten}, {"BatchNorm", ConvertBatchNorm}, - {"elemwise_add", ConvertElementwiseAdd}}; + {"clip", ConvertClip}, + {"Convolution", ConvertConvolution}, + {"Concat", ConvertConcatenate}, + {"Dropout", ConvertDropout}, + {"elemwise_add", ConvertElementwiseAdd}, + {"Flatten", ConvertFlatten}, + {"FullyConnected", ConvertFullyConnected}, + {"Pad", ConvertPad}, + {"Pooling", ConvertPooling}, + {"relu", ConvertRelu}, + {"SoftmaxOutput", ConvertSoftmaxOutput} +}; } // namespace nnvm_to_onnx } // namespace op @@ -187,4 +166,4 @@ static const std::unordered_map converter_map = #endif // MXNET_USE_TENSORRT -#endif // MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ diff --git a/src/operator/contrib/nnvm_to_onnx.cc b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc similarity index 68% rename from src/operator/contrib/nnvm_to_onnx.cc rename to src/operator/subgraph/tensorrt/nnvm_to_onnx.cc index 0c8bd79490e3..da89c2b476ee 100644 --- a/src/operator/contrib/nnvm_to_onnx.cc +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc @@ -18,9 +18,9 @@ */ /*! 
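// The converter_map above is the extension point of the NNVM-to-ONNX pass: every MXNet op that
// TensorRT should ingest maps to a ConverterFunction that fills in the corresponding NodeProto.
// A minimal sketch of what one more attribute-free converter could look like (the op and the
// ONNX type name here are illustrative, not part of this patch):
void ConvertIdentity(NodeProto* node_proto, const NodeAttrs& /*attrs*/,
                     const nnvm::IndexedGraph& /*ig*/,
                     const array_view<IndexedGraph::NodeEntry>& /*inputs*/) {
  // Ops without attributes only need the ONNX op type; names, inputs and outputs are wired up
  // by ConvertNnvmGraphToOnnx itself.
  node_proto->set_op_type("Identity");
}
// ...plus a matching {"identity", ConvertIdentity} entry in converter_map.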
- * Copyright (c) 2018 by Contributors - * \file trt.cc - * \brief TensorRT operation registration + * Copyright (c) 2019 by Contributors + * \file nnvm_to_onnx.cc + * \brief Conversion from NNVM to ONNX for TensorRT * \author Marek Kolodziej, Clement Fuji Tsang */ @@ -32,21 +32,17 @@ #include #include -#include -#include -#include -#include -#include - -#include "../../common/serialization.h" -#include "../../common/utils.h" -#include "../../ndarray/ndarray_function.h" -#include "../../operator/nn/activation-inl.h" -#include "../../operator/nn/batch_norm-inl.h" -#include "../../operator/nn/convolution-inl.h" -#include "../../operator/nn/fully_connected-inl.h" -#include "../../operator/nn/pooling-inl.h" -#include "../../operator/softmax_output-inl.h" +#include "../../../common/utils.h" +#include "../../../ndarray/ndarray_function.h" +#include "../../pad-inl.h" +#include "../../nn/activation-inl.h" +#include "../../nn/batch_norm-inl.h" +#include "../../nn/convolution-inl.h" +#include "../../nn/fully_connected-inl.h" +#include "../../nn/pooling-inl.h" +#include "../../nn/concat-inl.h" +#include "../../softmax_output-inl.h" +#include "../../tensor/matrix_op-inl.h" #if MXNET_USE_TENSORRT_ONNX_CHECKER #include @@ -54,36 +50,21 @@ namespace mxnet { namespace op { - -DMLC_REGISTER_PARAMETER(ONNXParam); - namespace nnvm_to_onnx { -op::ONNXParam ConvertNnvmGraphToOnnx( +std::string ConvertNnvmGraphToOnnx( const nnvm::Graph& g, - std::unordered_map* const shared_buffer) { + const std::unordered_map* const params_map) { static std::atomic_ulong subgraph_count = { 0 }; - op::ONNXParam onnx_param; - op::nnvm_to_onnx::NameToIdx_t onnx_input_map; - op::nnvm_to_onnx::InferenceMap_t onnx_output_map; + std::string serialized_onnx_graph; const nnvm::IndexedGraph& ig = g.indexed_graph(); - const auto& storage_types = g.GetAttr("storage_type"); const auto& dtypes = g.GetAttr("dtype"); - const auto& shape_inputs = g.GetAttr("shape_inputs"); - - // TODO(kellens): At the moment this check always passes no matter the weight dtypes used in your - // graph. We should first iterate over datatypes by name and ensure they're valid types - // (fp16 or fp32) and that they're uniform. Then ensure later conversions set tensor types - // correctly in ONNX. - for (auto& e : storage_types) { - if (e != mshadow::kFloat32) { - LOG(FATAL) << "ONNX converter does not support types other than float32 " - "right now."; - } - } + const auto& shapes = g.GetAttr("shape"); + const auto& dtype_inputs = g.GetAttr("dtype_inputs"); + const auto& shape_inputs = g.GetAttr("shape_inputs"); ModelProto model_proto; @@ -104,9 +85,9 @@ op::ONNXParam ConvertNnvmGraphToOnnx( auto subgraph_name_id = subgraph_count.fetch_add(1); graph_proto->set_name("MXNetTRTSubgraph" + std::to_string(subgraph_name_id)); - std::unordered_map placeholder_shapes = - GetPlaceholderShapes(shape_inputs, ig); - std::unordered_map output_lookup = GetOutputLookup(ig); + auto placeholder_shapes = GetPlaceholderShapes(shape_inputs, ig); + auto placeholder_dtypes = GetPlaceholderDTypes(dtype_inputs, ig); + auto output_lookup = GetOutputLookup(ig); uint32_t current_input = 0; // Can't do a foreach over IndexedGraph since it doesn't implement begin(), etc. @@ -121,18 +102,17 @@ op::ONNXParam ConvertNnvmGraphToOnnx( // placeholder if (source->is_variable()) { // Is this a placeholder? - if (shared_buffer->count(node_name) == 0) { + if (params_map->count(node_name) == 0) { // This fixes the problem with a SoftmaxOutput node during inference, but it's hacky. 
// Need to figure out how to properly fix it. if (node_name.find("label") != std::string::npos) { current_input++; continue; } - onnx_input_map.emplace(node_name, current_input++); - ConvertPlaceholder(node_name, placeholder_shapes, graph_proto); + ConvertPlaceholder(node_name, placeholder_shapes, placeholder_dtypes, graph_proto); } else { // If it's not a placeholder, then by exclusion it's a constant. - ConvertConstant(graph_proto, node_name, shared_buffer); + ConvertConstant(graph_proto, node_name, params_map); } // is_placeholder } else { // It's an op, rather than a "variable" (constant or placeholder) @@ -163,23 +143,18 @@ op::ONNXParam ConvertNnvmGraphToOnnx( auto out_iter = output_lookup.find(node_name); // We found an output if (out_iter != output_lookup.end()) { - ConvertOutput(&onnx_output_map, graph_proto, out_iter, node_name, g, - storage_types, dtypes); + ConvertOutput(graph_proto, out_iter, node_name, shapes, dtypes, ig); } // output found } // conversion function exists } // loop over i from 0 to num_nodes - model_proto.SerializeToString(&onnx_param.serialized_onnx_graph); - common::Serialize(onnx_input_map, - &onnx_param.serialized_input_map); - common::Serialize(onnx_output_map, - &onnx_param.serialized_output_map); + model_proto.SerializeToString(&serialized_onnx_graph); #if MXNET_USE_TENSORRT_ONNX_CHECKER onnx::checker::check_model(model_proto); #endif // MXNET_USE_TENSORRT_ONNX_CHECKER - return onnx_param; + return serialized_onnx_graph; } void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, @@ -225,9 +200,10 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, pads->set_name("pads"); pads->set_type(AttributeProto::INTS); - for (const dim_t kval : pad) { - pads->add_ints(static_cast(kval)); - pads->add_ints(static_cast(kval)); + for (int i =0; i < 2; i++) { + for (dim_t kval : pad) { + pads->add_ints(static_cast(kval)); + } } // strides @@ -295,6 +271,12 @@ void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, // not global pooling } // end ConvertPooling +void ConvertRelu(NodeProto* node_proto, const NodeAttrs& /*attrs*/, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Relu"); +} + void ConvertActivation(NodeProto* node_proto, const NodeAttrs& attrs, const nnvm::IndexedGraph& /*ig*/, const array_view& /*inputs*/) { @@ -411,7 +393,41 @@ void ConvertElementwiseAdd(NodeProto* node_proto, const NodeAttrs& /*attrs*/, node_proto->set_op_type("Add"); } -std::unordered_map GetPlaceholderShapes( +void ConvertConcatenate(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& _param = nnvm::get(attrs.parsed); + node_proto->set_op_type("Concat"); + node_proto->set_name(attrs.name); + // axis + AttributeProto* const axis = node_proto->add_attribute(); + axis->set_name("axis"); + axis->set_type(AttributeProto::INT); + axis->set_i(static_cast(_param.dim)); +} + +inline TensorProto_DataType ConvertDType(int dtype) { + switch (dtype) { + case mshadow::kFloat64: + return TensorProto_DataType_DOUBLE; + case mshadow::kFloat32: + return TensorProto_DataType_FLOAT; + case mshadow::kFloat16: + return TensorProto_DataType_FLOAT16; + case mshadow::kUint8: + return TensorProto_DataType_UINT8; + case mshadow::kInt32: + return TensorProto_DataType_INT32; + case mshadow::kInt8: + return TensorProto_DataType_INT8; + case mshadow::kInt64: + return TensorProto_DataType_INT64; + default: + return 
TensorProto_DataType_UNDEFINED; + } +} + +std::unordered_map GetPlaceholderShapes( const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig) { std::unordered_map placeholder_shapes; for (uint32_t i = 0; i < shape_inputs.size(); ++i) { @@ -425,6 +441,17 @@ std::unordered_map GetPlaceholderShapes( return placeholder_shapes; } +std::unordered_map GetPlaceholderDTypes( + const DTypeVector& dtype_inputs, const nnvm::IndexedGraph& ig) { + std::unordered_map placeholder_dtypes; + for (uint32_t i = 0; i < dtype_inputs.size(); ++i) { + std::string name = ig[ig.input_nodes()[i]].source->attrs.name; + int dtype = dtype_inputs[i]; + placeholder_dtypes.emplace(name, dtype); + } + return placeholder_dtypes; +} + std::unordered_map GetOutputLookup( const nnvm::IndexedGraph& ig) { std::unordered_map output_lookup; @@ -442,17 +469,17 @@ std::unordered_map GetOutputLookup( void ConvertPlaceholder( const std::string& node_name, - const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_dtypes, GraphProto* const graph_proto) { auto val_info_proto = graph_proto->add_input(); auto type_proto = val_info_proto->mutable_type()->mutable_tensor_type(); auto shape_proto = type_proto->mutable_shape(); val_info_proto->set_name(node_name); - // Will support fp16, etc. in the near future - type_proto->set_elem_type(TensorProto_DataType_FLOAT); auto entry_shape = placeholder_shapes.find(node_name)->second; - + auto entry_dtype = placeholder_dtypes.find(node_name)->second; + type_proto->set_elem_type(ConvertDType(entry_dtype)); for (const auto& elem : entry_shape) { TensorShapeProto_Dimension* const tsp_dim = shape_proto->add_dim(); tsp_dim->set_dim_value(static_cast(elem)); @@ -461,38 +488,49 @@ void ConvertPlaceholder( void ConvertConstant( GraphProto* const graph_proto, const std::string& node_name, - std::unordered_map* const shared_buffer) { - TensorProto* const initializer_proto = graph_proto->add_initializer(); + const std::unordered_map* const params_map) { + TensorProto* const initializer_proto = graph_proto->add_initializer(); // Create initializer for constants initializer_proto->set_name(node_name); - // TODO(kellens): convert to fp16 if needed. - initializer_proto->set_data_type(TensorProto_DataType_FLOAT); - const NDArray nd = shared_buffer->find(node_name)->second; + const NDArray nd = params_map->find(node_name)->second; const TBlob& blob = nd.data(); - const mxnet::TShape shape = blob.shape_; + const TShape shape = blob.shape_; + const auto dtype = ConvertDType(nd.dtype()); + initializer_proto->set_data_type(dtype); for (auto& dim : shape) { initializer_proto->add_dims(static_cast(dim)); } auto size = shape.Size(); - // TODO(kellens): Note hard coded float32 size assumed. 
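// ONNX stores FLOAT16 initializers in the int32_data field as raw 16-bit patterns (one value per
// entry), so the fp16 branch introduced below copies bit patterns instead of converting values.
// Per element, the packing amounts to (sketch, illustrative only):
//   uint16_t bits;
//   std::memcpy(&bits, &half_value, sizeof(bits));
//   initializer_proto->add_int32_data(static_cast<int32_t>(bits));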
- std::shared_ptr shared_data_ptr(new float[size]); - float* const data_ptr = shared_data_ptr.get(); - nd.SyncCopyToCPU(static_cast(data_ptr), size); - for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { - initializer_proto->add_float_data(data_ptr[blob_idx]); + if (dtype == TensorProto_DataType_FLOAT) { + std::shared_ptr shared_data_ptr(new float[size]); + float* const data_ptr = shared_data_ptr.get(); + nd.SyncCopyToCPU(static_cast(data_ptr), size); + + for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { + initializer_proto->add_float_data(data_ptr[blob_idx]); + } + } else if (dtype == TensorProto_DataType_FLOAT16) { + std::shared_ptr shared_data_ptr(new uint16_t[size]); + uint16_t* const data_ptr = shared_data_ptr.get(); + nd.SyncCopyToCPU(static_cast(data_ptr), size); + for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { + initializer_proto->add_int32_data( + reinterpret_cast(data_ptr)[blob_idx]); + } + } else { + LOG(FATAL) << "dtype not supported for variables: " << node_name; } // Create inputs for constants. ValueInfoProto* const input_proto = graph_proto->add_input(); input_proto->set_name(node_name); - // TODO(kellens): (fp16 support) - input_proto->mutable_type()->mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + input_proto->mutable_type()->mutable_tensor_type()->set_elem_type(dtype); for (auto& dim : shape) { auto new_dim = input_proto->mutable_type()->mutable_tensor_type()->mutable_shape()->add_dim(); new_dim->set_dim_value(static_cast(dim)); @@ -500,37 +538,98 @@ void ConvertConstant( } void ConvertOutput( - op::nnvm_to_onnx::InferenceMap_t* const output_map, GraphProto* const graph_proto, const std::unordered_map::iterator& out_iter, - const std::string& node_name, const nnvm::Graph& g, - const StorageTypeVector& storage_types, const DTypeVector& dtypes) { - const nnvm::IndexedGraph& ig = g.indexed_graph(); + const std::string& node_name, const ShapeVector& shapes, + const DTypeVector& dtypes, const nnvm::IndexedGraph &ig) { uint32_t out_idx = ig.entry_id(ig.outputs()[out_iter->second]); - mxnet::TShape out_shape = g.GetAttr("shape")[out_idx]; - int storage_type = storage_types[out_idx]; int dtype = dtypes[out_idx]; - - // This should work with fp16 as well - op::nnvm_to_onnx::InferenceTuple_t out_tuple{out_iter->second, out_shape, storage_type, - dtype}; - - output_map->emplace(node_name, out_tuple); - auto graph_out = graph_proto->add_output(); auto tensor_type = graph_out->mutable_type()->mutable_tensor_type(); auto tensor_shape_proto = tensor_type->mutable_shape(); graph_out->set_name(node_name); // Also support fp16. 
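// With ConvertDType available, graph outputs keep their real precision instead of a hard-coded
// float32: the change below becomes tensor_type->set_elem_type(ConvertDType(dtype)), so an fp16
// subgraph also advertises fp16 outputs to TensorRT.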
- tensor_type->set_elem_type(TensorProto_DataType_FLOAT); + tensor_type->set_elem_type(ConvertDType(dtype)); - for (int64_t dim_shp : out_shape) { + for (int64_t dim_shp : shapes[out_idx]) { TensorShapeProto_Dimension* const tsp_dim = tensor_shape_proto->add_dim(); tsp_dim->set_dim_value(static_cast(dim_shp)); } } +void ConvertClip(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto param = nnvm::get(attrs.parsed); + + node_proto->set_op_type("Clip"); + + // max + AttributeProto* const a_max = node_proto->add_attribute(); + a_max->set_name("max"); + a_max->set_type(AttributeProto::FLOAT); + a_max->set_f(static_cast(param.a_max)); + + // min + AttributeProto* const a_min = node_proto->add_attribute(); + a_min->set_name("min"); + a_min->set_type(AttributeProto::FLOAT); + a_min->set_f(static_cast(param.a_min)); +} + +void ConvertPad(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto param = nnvm::get(attrs.parsed); + + node_proto->set_op_type("Pad"); + + // mode + AttributeProto* const mode = node_proto->add_attribute(); + mode->set_name("mode"); + mode->set_type(AttributeProto::STRING); + switch (param.mode) { + case op::pad_enum::kConstant: + mode->set_s("constant"); + break; + case op::pad_enum::kEdge: + mode->set_s("edge"); + break; + case op::pad_enum::kReflect: + mode->set_s("reflect"); + break; + default: + throw dmlc::Error("Such mode of padding doesn't exist doesn't exist"); + } + + // pads + AttributeProto* const pads = node_proto->add_attribute(); + pads->set_name("pads"); + pads->set_type(AttributeProto::INTS); + + std::vector pad_begin; + std::vector pad_end; + for (int st = 0; st < 2; ++st) { + for (auto it = param.pad_width.begin() + st; + it != param.pad_width.end(); it += 2) { + pads->add_ints(static_cast(*it)); + } + } + + // value + AttributeProto* const value = node_proto->add_attribute(); + value->set_name("value"); + value->set_type(AttributeProto::FLOAT); + value->set_f(param.constant_value); +} + +void ConvertDropout(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Dropout"); +} + } // namespace nnvm_to_onnx } // namespace op } // namespace mxnet diff --git a/src/executor/onnx_to_tensorrt.cc b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc similarity index 89% rename from src/executor/onnx_to_tensorrt.cc rename to src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc index f7fbc8f81359..7dbc54bc1a63 100644 --- a/src/executor/onnx_to_tensorrt.cc +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc @@ -18,7 +18,7 @@ */ /*! 
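// Note on the ConvertPad converter above: MXNet's pad_width is laid out per axis as
// (before_0, after_0, before_1, after_1, ...), whereas ONNX "pads" expects all begin values
// followed by all end values; the double loop performs that regrouping. Worked example
// (illustrative NCHW values):
//   pad_width = (0, 0, 0, 0, 1, 2, 3, 4)   // pad H by 1/2 and W by 3/4
//   pads      = [0, 0, 1, 3, 0, 0, 2, 4]   // begins for N,C,H,W then ends for N,C,H,W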
- * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file onnx_to_tensorrt.cc * \brief TensorRT integration with the MXNet executor * \author Marek Kolodziej, Clement Fuji Tsang @@ -36,6 +36,9 @@ #include #include #include +#include +#include + #include #include @@ -80,7 +83,7 @@ void PrintVersion() { << NV_TENSORRT_PATCH << endl; } -nvinfer1::ICudaEngine* onnxToTrtCtx( +std::tuple onnxToTrtCtx( const std::string& onnx_model, int32_t max_batch_size, size_t max_workspace_size, @@ -91,14 +94,13 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( TRT_Logger trt_logger(verbosity); auto trt_builder = InferObject(nvinfer1::createInferBuilder(trt_logger)); auto trt_network = InferObject(trt_builder->createNetwork()); - auto trt_parser = InferObject(nvonnxparser::createParser(trt_network.get(), trt_logger)); + auto trt_parser = nvonnxparser::createParser(trt_network.get(), trt_logger); ::ONNX_NAMESPACE::ModelProto parsed_model; // We check for a valid parse, but the main effect is the side effect // of populating parsed_model if (!parsed_model.ParseFromString(onnx_model)) { throw dmlc::Error("Could not parse ONNX from string"); } - if ( !trt_parser->parse(onnx_model.c_str(), onnx_model.size()) ) { size_t nerror = trt_parser->getNbErrors(); for ( size_t i=0; i < nerror; ++i ) { @@ -127,19 +129,18 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( } throw dmlc::Error("Cannot parse ONNX into TensorRT Engine"); } - - bool fp16 = trt_builder->platformHasFastFp16(); - + if (dmlc::GetEnv("MXNET_TENSORRT_USE_FP16", true)) { + if (trt_builder->platformHasFastFp16()) { + trt_builder->setFp16Mode(true); + } else { + LOG(WARNING) << "TensorRT can't use fp16 on this platform"; + } + } trt_builder->setMaxBatchSize(max_batch_size); trt_builder->setMaxWorkspaceSize(max_workspace_size); - if ( fp16 && dmlc::GetEnv("MXNET_TENSORRT_USE_FP16_FOR_FP32", false) ) { - LOG(INFO) << "WARNING: TensorRT using fp16 given original MXNet graph in fp32 !!!"; - trt_builder->setHalf2Mode(true); - } - trt_builder->setDebugSync(debug_builder); nvinfer1::ICudaEngine* trt_engine = trt_builder->buildCudaEngine(*trt_network.get()); - return trt_engine; + return std::make_tuple(trt_engine, trt_parser); } } // namespace onnx_to_tensorrt diff --git a/src/executor/onnx_to_tensorrt.h b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h similarity index 88% rename from src/executor/onnx_to_tensorrt.h rename to src/operator/subgraph/tensorrt/onnx_to_tensorrt.h index 259cfce7c332..3e8ea1bf9ee1 100644 --- a/src/executor/onnx_to_tensorrt.h +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h @@ -1,5 +1,5 @@ -#ifndef MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ -#define MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -20,7 +20,7 @@ */ /*! 
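// onnxToTrtCtx now returns the parser alongside the engine so the caller can manage its
// lifetime explicitly (TRTEngineParam, for instance, destroys the parser together with its
// execution context). A minimal calling sketch, assuming a serialized ONNX model string
// (illustrative only):
nvinfer1::ICudaEngine* engine;
nvonnxparser::IParser* parser;
std::tie(engine, parser) = onnx_to_tensorrt::onnxToTrtCtx(serialized_onnx_graph,
                                                          /*max_batch_size=*/32,
                                                          /*max_workspace_size=*/1L << 30);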
- * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file onnx_to_tensorrt.h * \brief TensorRT integration with the MXNet executor * \author Marek Kolodziej, Clement Fuji Tsang @@ -28,13 +28,15 @@ #if MXNET_USE_TENSORRT +#include +#include + #include #include -#include #include #include - -#include "../operator/contrib/tensorrt-inl.h" +#include +#include namespace onnx_to_tensorrt { @@ -64,7 +66,7 @@ class TRT_Logger : public nvinfer1::ILogger { } }; -nvinfer1::ICudaEngine* onnxToTrtCtx( +std::tuple onnxToTrtCtx( const std::string& onnx_model, int32_t max_batch_size = 32, size_t max_workspace_size = 1L << 30, @@ -74,4 +76,4 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( #endif // MXNET_USE_TENSORRT -#endif // MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h new file mode 100644 index 000000000000..e258d892aaba --- /dev/null +++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h @@ -0,0 +1,240 @@ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file tensorrt-inl.h + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include + +#include +#include +#include + +#include "../common.h" +#include "../subgraph_property.h" +#include "nnvm_to_onnx-inl.h" +#include "./onnx_to_tensorrt.h" + +namespace mxnet { +namespace op { + +using int64 = ::google::protobuf::int64; + +struct TRTParam { + std::unordered_map inputs_to_idx; + std::unordered_map outputs_to_idx; + std::unordered_map params_map; +}; + +struct TRTEngineParam { + TRTEngineParam(nvinfer1::ICudaEngine* trt_engine, + nvonnxparser::IParser* _parser, + const std::unordered_map input_map, + const std::unordered_map output_map) { + binding_order = std::make_shared > >(); + bindings = std::make_shared >(); + binding_order->reserve(trt_engine->getNbBindings()); + bindings->resize(trt_engine->getNbBindings()); + for (int b = 0; b < trt_engine->getNbBindings(); ++b) { + const std::string& binding_name = trt_engine->getBindingName(b); + if (trt_engine->bindingIsInput(b)) { + binding_order->emplace_back(input_map.at(binding_name), true); + } else { + binding_order->emplace_back(output_map.at(binding_name), false); + } + } + trt_executor = trt_engine->createExecutionContext(); + trt_parser = _parser; + } + + ~TRTEngineParam() { + trt_parser->destroy(); + trt_executor->destroy(); + } + nvinfer1::IExecutionContext* trt_executor; + nvonnxparser::IParser* trt_parser; + std::shared_ptr > > binding_order; + std::shared_ptr > bindings; +}; + +class TensorrtSelector : public SubgraphSelector { + public: + const std::unordered_set unconditionalTRTops = { + "BatchNorm", + "clip", + "Concat", + "Convolution", + "Dropout", + "elemwise_add", + "elemwise_sub", + "elemwise_mul", + "Flatten", + "FullyConnected", + "mean", + "Pad", + "relu", + "rsqrt", + "SoftmaxOutput" + }; + + const std::unordered_set withWeightsOps = { + "BatchNorm", + "Convolution", + "FullyConnected" + }; + + bool isTRTCompatible(const nnvm::Node &n) { + const std::string op_name = n.op()->name; + if (op_name == "Pooling") { + return (n.attrs.dict.at("pool_type") == "avg" || + n.attrs.dict.at("pool_type") == "max"); + } + + if (unconditionalTRTops.count(op_name)) { + return true; + } + + if (op_name == "Activation") { + return n.attrs.dict.at("act_type") == "relu" || + n.attrs.dict.at("act_type") == "tanh" || + n.attrs.dict.at("act_type") == "sigmoid"; + } + + return false; + } + + bool Select(const nnvm::Node &n) override { + return !n.is_variable() && isTRTCompatible(n); + } + + bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override { + if (new_node.is_variable()) { + if (withWeightsOps.count(n.op()->name)) { + return n.inputs[0].node->attrs.name != new_node.attrs.name; + } else { + return false; + } + } + return isTRTCompatible(new_node); + } + + bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override { + return isTRTCompatible(new_node); + } + + std::vector Filter(const std::vector& candidates) override { + bool found_one = false; + // TensorRT is interesting with at least 2 operations + for (auto& n : candidates) { + if (!n->is_variable()) { + if (found_one) { + return candidates; + } else { + found_one = true; + } + } + } + return std::vector(); + } +}; + +class TensorrtProperty : public SubgraphProperty { + public: + static SubgraphPropertyPtr Create() { + return std::make_shared(); + } + + nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym, + const int subgraph_id) const 
override { + nnvm::NodePtr n = nnvm::Node::Create(); + nnvm::Symbol new_sym; + std::unique_copy(sym.outputs.begin(), sym.outputs.end(), + std::back_inserter(new_sym.outputs), []( + nnvm::NodeEntry lhs, nnvm::NodeEntry rhs) { + return lhs.index == rhs.index && lhs.node.get() == rhs.node.get(); + }); + n->attrs.name = "TensorRT" + std::to_string(subgraph_id); + n->attrs.op = Op::Get("_TensorRT"); + CHECK(n->attrs.op); + n->attrs.subgraphs.emplace_back(std::make_shared(new_sym)); + std::ostringstream params_oss; + for (auto &e : new_sym.ListInputNames(nnvm::Symbol::kAll)) { + params_oss << e << ";"; + } + auto tensorrt_params_names = params_oss.str(); + tensorrt_params_names.pop_back(); + n->attrs.dict["subgraph_params_names"] = tensorrt_params_names; + TRTParam param; + n->attrs.parsed = param; + n->op()->attr_parser(&(n->attrs)); + return n; + } + + SubgraphSelectorPtr CreateSubgraphSelector() const override { + return std::make_shared(); + } + + void ConnectSubgraphOutputs(const nnvm::NodePtr subgraph_node, \ + std::vector* output_entries) const override { + std::vector& outputs = subgraph_node->attrs.subgraphs[0]->outputs; + TRTParam& _params = nnvm::get(subgraph_node->attrs.parsed); + for (size_t i = 0; i < outputs.size(); i++) { + auto& o = outputs[i]; + for (auto& e : *output_entries) { + if (o.index == e->index && o.node.get() == e->node.get()) { + e->index = i; + e->node = subgraph_node; + // TODO(cfujitsang): For future support this would fail + // if the node have multiple outputs + _params.outputs_to_idx[o.node->attrs.name] = i; + } + } + } + subgraph_node->attrs.parsed = std::move(_params); + } + + void ConnectSubgraphInputs(const nnvm::NodePtr subgraph_node, + std::vector* input_entries, + std::vector* orig_input_entries) const override { + TRTParam& _params = nnvm::get(subgraph_node->attrs.parsed); + subgraph_node->inputs.clear(); + subgraph_node->inputs.resize(orig_input_entries->size()); + for (size_t i = 0; i < orig_input_entries->size(); ++i) { + subgraph_node->inputs[i] = orig_input_entries->at(i); + _params.inputs_to_idx[input_entries->at(i)->node->attrs.name] = i; + } + subgraph_node->attrs.parsed = std::move(_params); + } +}; + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ diff --git a/src/operator/subgraph/tensorrt/tensorrt.cc b/src/operator/subgraph/tensorrt/tensorrt.cc new file mode 100644 index 000000000000..30fcee007cfc --- /dev/null +++ b/src/operator/subgraph/tensorrt/tensorrt.cc @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
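// TensorrtProperty is registered under the backend name "TensorRT" (see
// MXNET_REGISTER_SUBGRAPH_PROPERTY further down), and TensorrtSelector::Filter only keeps
// candidate subgraphs containing at least two non-variable nodes, mirroring the comment that
// TensorRT is interesting only from two operations up. A frontend would then request this
// backend by name when partitioning, e.g. something along the lines of
// sym.get_backend_symbol("TensorRT") in Python (assumed usage, not shown in this patch).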
+ * Copyright (c) 2019 by Contributors + * \file tensorrt.cc + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include "./tensorrt-inl.h" + +namespace mxnet { +namespace op { + +inline uint32_t TRTNumInputs(const nnvm::NodeAttrs& attrs) { + const TRTParam& param = nnvm::get(attrs.parsed); + const auto inputs_to_idx = param.inputs_to_idx; + return inputs_to_idx.size(); +} + +inline std::vector TRTListInputNames(const nnvm::NodeAttrs& attrs) { + std::vector outputs; + const TRTParam& param = nnvm::get(attrs.parsed); + const auto inputs_to_idx = param.inputs_to_idx; + for (auto& p : inputs_to_idx) { + outputs[p.second] = p.first; + } + return outputs; +} + +inline bool TRTInferShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shapes, + std::vector *out_shapes) { + using namespace exec; + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_shapes->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_shapes->size()); + + // Put the input and output shapes to the shape vector. + mxnet::ShapeVector shapes(idx_g.num_node_entries()); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_shapes->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + shapes[eid] = it_params->second.shape(); + } else if (it_inputs != inputs_to_idx.end()) { + shapes[eid] = in_shapes->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name << " shape information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_shapes->size()); + for (size_t i = 0; i < out_shapes->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + shapes[eid] = out_shapes->at(i); + } + + // Infer shape of the graph. + g.attrs["shape"] = std::make_shared(std::move(shapes)); + g = exec::InferShape(std::move(g)); + // Copy the inferred shape back to the input shapes and the output shapes. + shapes = g.GetAttr("shape"); + // assign to in_shapes + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + SHAPE_ASSIGN_CHECK(*in_shapes, it->second, shapes[eid]); + } + } + // assign to out_shapes + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + SHAPE_ASSIGN_CHECK(*out_shapes, i, shapes[eid]); + } + // Check if we have inferred the shapes correctly. 
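// The three inference helpers in this file share one seed/infer/copy-back pattern: build a
// throwaway nnvm::Graph from the cached subgraph symbol, seed its attribute vector from
// params_map plus the caller-provided inputs and outputs, run the regular exec::Infer* pass,
// then write the results back through inputs_to_idx. TRTInferType and TRTInferStorageType
// below repeat these steps for dtypes and storage types.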
+ return g.GetAttr("shape_num_unknown_nodes") == 0; +} + +inline bool TRTInferType(const nnvm::NodeAttrs& attrs, + std::vector *in_types, + std::vector *out_types) { + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_types->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_types->size()); + + // Put the input and output data types to the dtype vector. + nnvm::DTypeVector types(idx_g.num_node_entries(), -1); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_types->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + types[eid] = it_params->second.dtype(); + } else if (it_inputs != inputs_to_idx.end()) { + types[eid] = in_types->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name + << " dtype information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_types->size()); + for (size_t i = 0; i < out_types->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + types[eid] = out_types->at(i); + } + + // Infer data type of the graph. + g.attrs["dtype"] = std::make_shared(std::move(types)); + g = exec::InferType(std::move(g)); + + types = g.GetAttr("dtype"); + // assign to in_types + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + TYPE_ASSIGN_CHECK(*in_types, it->second, types[eid]); + } + } + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + TYPE_ASSIGN_CHECK(*out_types, i, types[eid]); + } + + // Check if we have inferred the dtypes correctly. + return g.GetAttr("dtype_num_unknown_nodes") == 0; +} + +inline bool TRTInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_stypes, + std::vector* out_stypes) { + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_stypes->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_stypes->size()); + exec::DevMaskVector dev_masks(idx_g.num_node_entries(), dev_mask); + + // Put the input and output storages to the storage vector. 
+ StorageTypeVector stypes(idx_g.num_node_entries(), kUndefinedStorage); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_stypes->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + stypes[eid] = it_params->second.storage_type(); + } else if (it_inputs != inputs_to_idx.end()) { + stypes[eid] = in_stypes->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name + << " storage type information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_stypes->size()); + for (size_t i = 0; i < out_stypes->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + stypes[eid] = out_stypes->at(i); + } + + // Infer storage type of the graph. + bool dev_match = g.attrs.count("dev_mask") && + g.GetAttr("dev_mask") == dev_masks; + if (!dev_match) { + g.attrs["dev_mask"] = std::make_shared(std::move(dev_masks)); + } + g.attrs["storage_type"] = std::make_shared(std::move(stypes)); + g = exec::InferStorageType(std::move(g)); + + stypes = g.GetAttr("storage_type"); + // assign to in_types + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + STORAGE_TYPE_ASSIGN_CHECK(*in_stypes, it->second, stypes[eid]); + } + } + + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + STORAGE_TYPE_ASSIGN_CHECK(*out_stypes, i, stypes[eid]); + } + // Check if we have inferred the storages correctly. 
+ return g.GetAttr("storage_type_num_unknown_nodes") == 0; +} + +void TRTParamParser(nnvm::NodeAttrs* attrs) { + TRTParam& _param = nnvm::get(attrs->parsed); + std::string prefix = "subgraph_param_"; + std::string str_dtype, str_shape, str_pointer, _tmp; + for (auto it = attrs->dict.begin(); it != attrs->dict.end();) { + std::string attrs_name = it->first; + if (std::equal(prefix.begin(), prefix.end(), attrs_name.begin())) { + std::string param_name = attrs_name.substr(prefix.size(), + attrs_name.size() - prefix.size()); + // TODO(cfujitsang): find a less dirty way to give weights + NDArray *cache = reinterpret_cast(stol(it->second)); + _param.params_map.emplace(param_name, cache->Copy(Context())); + _param.params_map[param_name].WaitToRead(); + it = attrs->dict.erase(it); + } else { + ++it; + } + } + attrs->parsed = std::move(_param); +} + +OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context ctx, + const std::vector& in_shape, + const std::vector& in_type) { + const auto& node_param = nnvm::get(attrs.parsed); + nnvm::Graph graph; + graph.outputs = attrs.subgraphs[0]->outputs; + uint32_t max_batch_size = dmlc::GetEnv("MXNET_TENSORRT_MAX_BATCH_SIZE", in_shape[0][0]); + if (max_batch_size < in_shape[0][0]) { + LOG(INFO) << "Warning: max batch size changed to be is: " << in_shape[0][0] + << " instead of: " << max_batch_size; + max_batch_size = in_shape[0][0]; + } + const auto& params_map = node_param.params_map; + const auto& inputs_to_idx = node_param.inputs_to_idx; + const auto& outputs_to_idx = node_param.outputs_to_idx; + const auto& idx_g = graph.indexed_graph(); + const auto& input_nids = idx_g.input_nodes(); + mxnet::ShapeVector shape_inputs(input_nids.size()); + nnvm::DTypeVector dtype_inputs(input_nids.size()); + for (int i = 0; i < input_nids.size(); ++i) { + auto node = idx_g[input_nids[i]].source; + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + shape_inputs[i] = it_params->second.shape(); + dtype_inputs[i] = it_params->second.dtype(); + } else if (it_inputs != inputs_to_idx.end()) { + shape_inputs[i] = in_shape[it_inputs->second]; + dtype_inputs[i] = in_type[it_inputs->second]; + } else { + LOG(FATAL) << node->attrs.name << " attribute is missing for attributes inference"; + } + } + mxnet::ShapeVector out_shape(graph.outputs.size()); + nnvm::DTypeVector out_type(graph.outputs.size(), -1); + mxnet::ShapeVector _in_shape(in_shape.begin(), in_shape.end()); + nnvm::DTypeVector _in_type(in_type.begin(), in_type.end()); + TRTInferShape(attrs, &_in_shape, &out_shape); + TRTInferType(attrs, &_in_type, &out_type); + nnvm::DTypeVector dtypes(idx_g.num_node_entries()); + mxnet::ShapeVector shapes(idx_g.num_node_entries()); + for (int i = 0; i < graph.outputs.size(); ++i) { + auto eid = idx_g.entry_id(graph.outputs[i]); + dtypes[eid] = out_type[i]; + shapes[eid] = out_shape[i]; + } + graph.attrs["dtype_inputs"] = std::make_shared(std::move(dtype_inputs)); + graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); + graph.attrs["dtype"] = std::make_shared(std::move(dtypes)); + graph.attrs["shape"] = std::make_shared(std::move(shapes)); + auto onnx_graph = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(graph, ¶ms_map); + auto trt_tuple = ::onnx_to_tensorrt::onnxToTrtCtx(onnx_graph, max_batch_size, 1 << 30); + return OpStatePtr::Create(std::get<0>(trt_tuple), std::get<1>(trt_tuple), + inputs_to_idx, outputs_to_idx); +} + +NNVM_REGISTER_OP(_TensorRT) + .describe(R"code(TRT 
operation (one engine) +)code" ADD_FILELINE) + .set_num_inputs(TRTNumInputs) + .set_num_outputs(DefaultSubgraphOpNumOutputs) + .set_attr_parser(TRTParamParser) + .set_attr("FInferShape", TRTInferShape) + .set_attr("FInferType", TRTInferType) + .set_attr("FListInputNames", TRTListInputNames) + .set_attr("FListOutputNames", DefaultSubgraphOpListOutputs) + .set_attr("FCreateOpState", TRTCreateState) + .set_attr("FInferStorageType", TRTInferStorageType); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(TensorRT, TensorrtProperty); +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/operator/contrib/tensorrt.cu b/src/operator/subgraph/tensorrt/tensorrt.cu similarity index 69% rename from src/operator/contrib/tensorrt.cu rename to src/operator/subgraph/tensorrt/tensorrt.cu index 9a9c3c024366..4a5b23b3a9f7 100644 --- a/src/operator/contrib/tensorrt.cu +++ b/src/operator/subgraph/tensorrt/tensorrt.cu @@ -19,8 +19,8 @@ /*! * Copyright (c) 2018 by Contributors - * \file trt.cu - * \brief TensorRT GPU operation + * \file tensorrt.cu + * \brief TensorRT GPU operation registration * \author Marek Kolodziej, Clement Fuji Tsang */ @@ -41,30 +41,26 @@ namespace op { } while (0) void TRTCompute(const OpStatePtr& state, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mshadow::expr; - - Stream* s = ctx.get_stream(); - cudaStream_t cuda_s = Stream::GetStream(s); + cudaStream_t cuda_s = Stream::GetStream(ctx.get_stream()); const auto& param = state.get_state(); - std::vector bindings; - bindings.reserve(param.binding_map.size()); - for (auto& p : param.binding_map) { - if (p.second == nnvm_to_onnx::TypeIO::Inputs) { - bindings.emplace_back(inputs[p.first].dptr_); + for (size_t i = 0; i < param.binding_order->size(); ++i) { + auto& p = param.binding_order->at(i); + if (p.second == true) { + param.bindings->at(i) = inputs[p.first].dptr_; } else { - bindings.emplace_back(outputs[p.first].dptr_); + param.bindings->at(i) = outputs[p.first].dptr_; } } - const int batch_size = static_cast(inputs[0].shape_[0]); - param.trt_executor->enqueue(batch_size, bindings.data(), cuda_s, nullptr); - CHECK_CUDART(cudaStreamSynchronize(cuda_s)); + param.trt_executor->enqueue(batch_size, param.bindings->data(), cuda_s, nullptr); } -NNVM_REGISTER_OP(_trt_op) +NNVM_REGISTER_OP(_TensorRT) .set_attr("FStatefulCompute", TRTCompute); } // namespace op diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 318254b26b9f..f81eb9c04f3a 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -38,9 +38,7 @@ #ifdef __CUDACC__ #include "./dot-inl.cuh" #endif // __CUDACC__ -#if (MSHADOW_USE_MKL == 1) -#include "sparse_matrix.h" -#endif + namespace mxnet { namespace op { @@ -777,35 +775,13 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, } using nnvm::dim_t; -#if (MSHADOW_USE_MKL == 1) - TShape lhs_shape = lhs.shape(); - TShape rhs_shape = rhs.shape_; -#endif + const TBlob data_l = lhs.data(); const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); const TBlob col_idx_l = lhs.aux_data(csr::kIdx); const TBlob& data_r = rhs; const TBlob data_out = *ret; -#if (MSHADOW_USE_MKL == 1) - if (data_l.type_flag_ == mshadow::kFloat32 - && indptr_l.type_flag_ == mshadow::kInt64 - && col_idx_l.type_flag_ == mshadow::kInt64 - && !trans_lhs) { - bool ret = mkl_DotCsrDnsDns(static_cast(indptr_l.dptr_), - 
static_cast(col_idx_l.dptr_), - data_l.dptr(), - data_r.dptr(), - data_out.dptr(), - lhs_shape[0], - lhs_shape[1], - rhs_shape[1]); - if (ret) { - return; - } - } -#endif - MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index f7599adcdb63..0ff73f4251cd 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -85,6 +85,9 @@ MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) return std::vector{ResourceRequest::kTempSpace};}) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add) .add_alias("_add").add_alias("_plus").add_alias("_Plus") +.set_attr("FListOutputNames", [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) .describe(R"code(Adds arguments element-wise. The storage type of ``elemwise_add`` output depends on storage types of inputs diff --git a/src/operator/tensor/la_op-inl.h b/src/operator/tensor/la_op-inl.h index e89a0824a948..bda8137675a8 100644 --- a/src/operator/tensor/la_op-inl.h +++ b/src/operator/tensor/la_op-inl.h @@ -229,6 +229,100 @@ struct sumlogdiag { } }; +template +struct CopyDiag { + template + MSHADOW_XINLINE static void Map(int i, int k, int n, DType* A, DType* B) { + // Index of the matrix from which the diagonal should be extracted. + const int matrix(i / (n-abs(k))); + // Index of the diagonal element that should be extracted. + const int index(i % (n-abs(k))); + // row/col that must be looked up. + const int row(index-(k < 0 ? k : 0)), col(index+(k > 0 ? k :0)); + if (forward) { + B[i] = A[(matrix*n+row)*n+col]; + } else { + B[(matrix*n+row)*n+col] = A[i]; + } + } +}; + +struct copydiag { + // Extracts diagonal from matrix. + template + static void op(const Tensor& A, const Tensor& B, + const OpContext& ctx, const nnvm::NodeAttrs& attrs) { + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const LaDiagParam& param = nnvm::get(attrs.parsed); + Kernel, xpu>::Launch(s, B.MSize(), param.offset, A.size(1), A.dptr_, B.dptr_); + } + // Sets diagonal in matrix. + template + static void op(const Tensor& A, const Tensor& B, + const OpContext& ctx, const nnvm::NodeAttrs& attrs) { + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const LaDiagParam& param = nnvm::get(attrs.parsed); + Kernel::Launch(s, B.MSize(), B.dptr_); + Kernel, xpu>::Launch(s, A.MSize(), param.offset, B.size(1), A.dptr_, B.dptr_); + } +}; + +template +struct CopyTrian { + template + MSHADOW_XINLINE static void Map(int i, bool lower, int k, int n, DType* A, DType* B) { + // Matrix that this index belongs to. + const int matrix(i/(n*n)); + // Row/Col that this index represents. + int row((i/n)%n), col(i%n); + if ((k > 0) || ((k == 0) && !lower)) { + // When working on upper triangle we switch to transposed coordinates for indexing. + int tmp(row); + row = col; + col = tmp; + } + // Actual row inside the lower triangular matrix after offset adjustment. + row -= abs(k); + if (row >= col) { + // Index in the 1-dimensional array that holds the values of the triangle. + const int index((row*(row+1))/2+col); + // Total number of entries in the triangle. + const int m(((n-abs(k))*(n-abs(k)+1))/2); + if (forward) { + B[m*matrix+index] = A[i]; + } else { + B[i] = A[m*matrix+index]; + } + } + } +}; + +struct copytrian { + // Extracts triangle from matrix. 
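// The packed layout used by CopyTrian stores, per matrix, the (n-|k|)*(n-|k|+1)/2 triangle
// entries row-major, element (row, col) of the triangle living at index row*(row+1)/2 + col.
// Worked example for a single 2x2 matrix with offset 0 and lower=true:
//   A = [[1, 2],
//        [3, 4]]   ->   packed form [1, 3, 4]
// which matches the linalg_extracttrian examples further down in la_op.cc.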
+ template + static void op(const Tensor& A, const Tensor& B, + const OpContext& ctx, const nnvm::NodeAttrs& attrs) { + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const LaTrianParam& param = nnvm::get(attrs.parsed); + Kernel, xpu>::Launch(s, A.MSize(), param.lower, param.offset, + A.size(1), A.dptr_, B.dptr_); + } + // Sets triangle in matrix. + template + static void op(const Tensor& A, const Tensor& B, + const OpContext& ctx, const nnvm::NodeAttrs& attrs) { + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const LaTrianParam& param = nnvm::get(attrs.parsed); + Kernel::Launch(s, B.MSize(), B.dptr_); + Kernel, xpu>::Launch(s, B.MSize(), param.lower, param.offset, + B.size(1), A.dptr_, B.dptr_); + } +}; + // B = syrk(A) struct syrk { template diff --git a/src/operator/tensor/la_op.cc b/src/operator/tensor/la_op.cc index 12cea91f5800..d6e64c4f78cd 100644 --- a/src/operator/tensor/la_op.cc +++ b/src/operator/tensor/la_op.cc @@ -33,6 +33,8 @@ DMLC_REGISTER_PARAMETER(LaMatrixMacParam); DMLC_REGISTER_PARAMETER(LaMatrixMultParam); DMLC_REGISTER_PARAMETER(LaCholeskyParam); DMLC_REGISTER_PARAMETER(LaTriangMatrixMultParam); +DMLC_REGISTER_PARAMETER(LaDiagParam); +DMLC_REGISTER_PARAMETER(LaTrianParam); DMLC_REGISTER_PARAMETER(LaSyrkParam); NNVM_REGISTER_OP(_linalg_gemm) @@ -461,6 +463,235 @@ NNVM_REGISTER_OP(_backward_linalg_sumlogdiag) .set_attr("TIsBackward", true) .set_attr("FCompute", LaOpBackward); +NNVM_REGISTER_OP(_linalg_extractdiag) +.add_alias("linalg_extractdiag") +.describe(R"code(Extracts the diagonal entries of a square matrix. +Input is a tensor *A* of dimension *n >= 2*. + +If *n=2*, then *A* represents a single square matrix which diagonal elements get extracted as a 1-dimensional tensor. + +If *n>2*, then *A* represents a batch of square matrices on the trailing two dimensions. The extracted diagonals are returned as an *n-1*-dimensional tensor. + +.. note:: The operator supports float32 and float64 data types only. + +Examples:: + + // Single matrix diagonal extraction + A = [[1.0, 2.0], + [3.0, 4.0]] + + extractdiag(A) = [1.0, 4.0] + + extractdiag(A, 1) = [2.0] + + // Batch matrix diagonal extraction + A = [[[1.0, 2.0], + [3.0, 4.0]], + [[5.0, 6.0], + [7.0, 8.0]]] + + extractdiag(A) = [[1.0, 4.0], + [5.0, 8.0]] +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) + { return std::vector{"A"}; } ) +.set_attr("FInferShape", LaDiagTrianShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", LaOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_linalg_extractdiag"}) +.add_argument("A", "NDArray-or-Symbol", "Tensor of square matrices") +.add_arguments(LaDiagParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_linalg_extractdiag) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) + { return std::vector{ResourceRequest::kTempSpace}; }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_makediag) +.add_alias("linalg_makediag") +.describe(R"code(Constructs a square matrix with the input as diagonal. +Input is a tensor *A* of dimension *n >= 1*. + +If *n=1*, then *A* represents the diagonal entries of a single square matrix. This matrix will be returned as a 2-dimensional tensor. +If *n>1*, then *A* represents a batch of diagonals of square matrices. 
The batch of diagonal matrices will be returned as an *n+1*-dimensional tensor. + +.. note:: The operator supports float32 and float64 data types only. + +Examples:: + + // Single diagonal matrix construction + A = [1.0, 2.0] + + makediag(A) = [[1.0, 0.0], + [0.0, 2.0]] + + makediag(A, 1) = [[0.0, 1.0, 0.0], + [0.0, 0.0, 2.0], + [0.0, 0.0, 0.0]] + + // Batch diagonal matrix construction + A = [[1.0, 2.0], + [3.0, 4.0]] + + makediag(A) = [[[1.0, 0.0], + [0.0, 2.0]], + [[3.0, 0.0], + [0.0, 4.0]]] +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) + { return std::vector{"A"}; } ) +.set_attr("FInferShape", LaDiagTrianShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", LaOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_linalg_makediag"}) +.add_argument("A", "NDArray-or-Symbol", "Tensor of diagonal entries") +.add_arguments(LaDiagParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_linalg_makediag) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) + { return std::vector{ResourceRequest::kTempSpace}; }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_extracttrian) +.add_alias("linalg_extracttrian") +.describe(R"code(Extracts a triangular sub-matrix from a square matrix. +Input is a tensor *A* of dimension *n >= 2*. + +If *n=2*, then *A* represents a single square matrix from which a triangular sub-matrix is extracted as a 1-dimensional tensor. + +If *n>2*, then *A* represents a batch of square matrices on the trailing two dimensions. The extracted triangular sub-matrices are returned as an *n-1*-dimensional tensor. + +The *offset* and *lower* parameters determine the triangle to be extracted: + +- When *offset = 0* either the lower or upper triangle with respect to the main diagonal is extracted depending on the value of parameter *lower*. +- When *offset = k > 0* the upper triangle with respect to the k-th diagonal above the main diagonal is extracted. +- When *offset = k < 0* the lower triangle with respect to the k-th diagonal below the main diagonal is extracted. + +.. note:: The operator supports float32 and float64 data types only. 
+ +Examples:: + + // Single triagonal extraction + A = [[1.0, 2.0], + [3.0, 4.0]] + + extracttrian(A) = [1.0, 3.0, 4.0] + extracttrian(A, lower=False) = [1.0, 2.0, 4.0] + extracttrian(A, 1) = [2.0] + extracttrian(A, -1) = [3.0] + + // Batch triagonal extraction + A = [[[1.0, 2.0], + [3.0, 4.0]], + [[5.0, 6.0], + [7.0, 8.0]]] + + extracttrian(A) = [[1.0, 3.0, 4.0], + [5.0, 7.0, 8.0]] +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) + { return std::vector{"A"}; } ) +.set_attr("FInferShape", LaDiagTrianShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", LaOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_linalg_extracttrian"}) +.add_argument("A", "NDArray-or-Symbol", "Tensor of square matrices") +.add_arguments(LaTrianParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_linalg_extracttrian) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) + { return std::vector{ResourceRequest::kTempSpace}; }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_maketrian) +.add_alias("linalg_maketrian") +.describe(R"code(Constructs a square matrix with the input representing a specific triangular sub-matrix. +This is basically the inverse of *linalg.extracttrian*. Input is a tensor *A* of dimension *n >= 1*. + +If *n=1*, then *A* represents the entries of a triangular matrix which is lower triangular if *offset<0* or *offset=0*, *lower=true*. The resulting matrix is derived by first constructing the square +matrix with the entries outside the triangle set to zero and then adding *offset*-times an additional +diagonal with zero entries to the square matrix. + +If *n>1*, then *A* represents a batch of triangular sub-matrices. The batch of corresponding square matrices is returned as an *n+1*-dimensional tensor. + +.. note:: The operator supports float32 and float64 data types only. 
+ +Examples:: + + // Single matrix construction + A = [1.0, 2.0, 3.0] + + maketrian(A) = [[1.0, 0.0], + [2.0, 3.0]] + + maketrian(A, lower=false) = [[1.0, 2.0], + [0.0, 3.0]] + + maketrian(A, offset=1) = [[0.0, 1.0, 2.0], + [0.0, 0.0, 3.0], + [0.0, 0.0, 0.0]] + maketrian(A, offset=-1) = [[0.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [2.0, 3.0, 0.0]] + + // Batch matrix construction + A = [[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0]] + + maketrian(A) = [[[1.0, 0.0], + [2.0, 3.0]], + [[4.0, 0.0], + [5.0, 6.0]]] + + maketrian(A, offset=1) = [[[0.0, 1.0, 2.0], + [0.0, 0.0, 3.0], + [0.0, 0.0, 0.0]], + [[0.0, 4.0, 5.0], + [0.0, 0.0, 6.0], + [0.0, 0.0, 0.0]]] +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) + { return std::vector{"A"}; } ) +.set_attr("FInferShape", LaDiagTrianShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", LaOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_linalg_maketrian"}) +.add_argument("A", "NDArray-or-Symbol", "Tensor of triangular matrices stored as vectors") +.add_arguments(LaTrianParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_linalg_maketrian) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) + { return std::vector{ResourceRequest::kTempSpace}; }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", LaOpBackward); + NNVM_REGISTER_OP(_linalg_syrk) .add_alias("linalg_syrk") .describe(R"code(Multiplication of matrix with its transpose. diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu index 29a48466313c..ec310fe76fcd 100644 --- a/src/operator/tensor/la_op.cu +++ b/src/operator/tensor/la_op.cu @@ -63,6 +63,30 @@ NNVM_REGISTER_OP(_linalg_sumlogdiag) NNVM_REGISTER_OP(_backward_linalg_sumlogdiag) .set_attr("FCompute", LaOpBackward); +NNVM_REGISTER_OP(_linalg_extractdiag) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_extractdiag) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_makediag) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_makediag) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_extracttrian) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_extracttrian) +.set_attr("FCompute", LaOpBackward); + +NNVM_REGISTER_OP(_linalg_maketrian) +.set_attr("FCompute", LaOpForward); + +NNVM_REGISTER_OP(_backward_linalg_maketrian) +.set_attr("FCompute", LaOpBackward); + NNVM_REGISTER_OP(_linalg_potri) .set_attr("FCompute", LaOpForward); diff --git a/src/operator/tensor/la_op.h b/src/operator/tensor/la_op.h index db4607fe9262..3b36f7c23a55 100644 --- a/src/operator/tensor/la_op.h +++ b/src/operator/tensor/la_op.h @@ -129,6 +129,33 @@ struct LaSyrkParam : public dmlc::Parameter { } }; +// Parameters for diag extraction/creation. +struct LaDiagParam : public dmlc::Parameter { + int offset; + DMLC_DECLARE_PARAMETER(LaDiagParam) { + DMLC_DECLARE_FIELD(offset) + .set_default(0) + .describe("Offset of the diagonal versus the main diagonal. 0 corresponds to the main " + "diagonal, a negative/positive value to diagonals below/above the main diagonal."); + } +}; + +// Parameters for trian extraction/creation. +struct LaTrianParam : public dmlc::Parameter { + int offset; + bool lower; + DMLC_DECLARE_PARAMETER(LaTrianParam) { + DMLC_DECLARE_FIELD(offset) + .set_default(0) + .describe("Offset of the diagonal versus the main diagonal. 
0 corresponds to the main " + "diagonal, a negative/positive value to diagonals below/above the main diagonal."); + DMLC_DECLARE_FIELD(lower) + .set_default(true) + .describe("Refer to the lower triangular matrix if lower=true, refer to the upper otherwise." + " Only relevant when offset=0"); + } +}; + // Common function for shape inference for matrix mult and matrix mac. inline bool LaMatrixMultMacOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_attrs, @@ -262,6 +289,47 @@ inline bool LaReduceShape(const nnvm::NodeAttrs& attrs, return true; } +template +inline bool LaDiagTrianShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const int ndim((*in_attrs)[0].ndim()); + // Only infer in forward direction + if (ndim == 0) { + return false; + } + const int offset = (diag ? nnvm::get(attrs.parsed).offset + : nnvm::get(attrs.parsed).offset); + std::vector oshape(extract ? ndim-1 : ndim+1); + for (int i = 0; i < ndim-1; ++i) { + oshape[i] = (*in_attrs)[0][i]; + } + if (extract) { + CHECK_GE(ndim, 2) + << "Input operand must be a tensor of matrices"; + CHECK_EQ((*in_attrs)[0][ndim-2], (*in_attrs)[0][ndim-1]) + << "Input operand must be a tensor of square matrices"; + const int n((*in_attrs)[0][ndim-1]-abs(offset)); + CHECK_GT(n, 0) + << "Illegal offset " << offset << " for diag/trian extraction of matrix with dimension " + << ndim; + oshape[ndim-2] = (diag ? n : (n*(n+1))/2); + } else if (diag) { + oshape[ndim] = oshape[ndim-1] = (*in_attrs)[0][ndim-1]+abs(offset); + } else { + const int n((*in_attrs)[0][ndim-1]); + const int m(std::floor(0.5+(std::sqrt(8*n+1)-1.0)*0.5)); + CHECK_EQ((m*(m+1))/2, n) + << "Input tensor of maketrian has an invalid dimension for the last axis."; + oshape[ndim] = oshape[ndim-1] = m+abs(offset); + } + mxnet::TShape tshape(oshape.begin(), oshape.end()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); + return true; +} + // Shape inference function for linalg_syrk inline bool LaSyrkShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_attrs, diff --git a/src/storage/storage.cc b/src/storage/storage.cc index 4f15351a594a..0ca5ef7fa30c 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -26,6 +26,7 @@ #include "./pooled_storage_manager.h" #include "./cpu_shared_storage_manager.h" #include "./cpu_device_storage.h" +#include "./gpu_device_storage.h" #include "./pinned_memory_storage.h" #include "../common/lazy_alloc_array.h" #include "../profiler/storage_profiler.h" @@ -106,11 +107,12 @@ void StorageImpl::Alloc(Storage::Handle* handle) { if (strategy == "Round") { ptr = new storage::GPUPooledRoundedStorageManager(handle->ctx); LOG(INFO) << "Using GPUPooledRoundedStorageManager."; - } else { - if (strategy != "Naive") { - LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; - } + } else if (strategy == "Naive") { ptr = new storage::GPUPooledStorageManager(handle->ctx); + } else if (strategy == "Unpooled") { + ptr = new storage::NaiveStorageManager(); + } else { + LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; } #else LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; diff --git a/tests/cpp/misc/serialization.cc b/tests/cpp/misc/serialization.cc deleted file mode 100644 index 2509a43c27ee..000000000000 --- a/tests/cpp/misc/serialization.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include <../../../src/common/serialization.h> - -using namespace mxnet; -using namespace std; - -/* - * Test that used datastruct are properly serialized and deserialized - */ - -TEST(SerializerTest, InputMapCorrect) { - std::map input_map; - input_map.emplace("input_0", 2); - input_map.emplace("another_input", 0); - input_map.emplace("last_input", 1); - std::string serialized_data; - common::Serialize(input_map, &serialized_data); - std::map deserialized_input_map; - common::Deserialize(&deserialized_input_map, serialized_data); - ASSERT_EQ(input_map.size(), deserialized_input_map.size()); - for (auto& p : input_map) { - auto it = deserialized_input_map.find(p.first); - ASSERT_NE(it, deserialized_input_map.end()); - ASSERT_EQ(it->second, p.second); - } -} - -TEST(SerializerTest, OutputMapCorrect) { - std::map > output_map; - output_map.emplace("output_0", std::make_tuple(1, mxnet::TShape({23, 12, 63, 432}), 0, 1)); - output_map.emplace("another_output", std::make_tuple(2, mxnet::TShape({23, 123}), 14, -23)); - output_map.emplace("last_output", std::make_tuple(0, mxnet::TShape(1, 0), -1, 0)); - std::string serialized_data; - common::Serialize(output_map, &serialized_data); - std::map > deserialized_output_map; - common::Deserialize(&deserialized_output_map, serialized_data); - ASSERT_EQ(output_map.size(), deserialized_output_map.size()); - for (auto& p : output_map) { - auto it = deserialized_output_map.find(p.first); - ASSERT_NE(it, deserialized_output_map.end()); - auto lhs = it->second; - auto rhs = p.second; - ASSERT_EQ(std::get<0>(lhs), std::get<0>(rhs)); - ASSERT_EQ(std::get<1>(lhs), std::get<1>(rhs)); - ASSERT_EQ(std::get<2>(lhs), std::get<2>(rhs)); - ASSERT_EQ(std::get<3>(lhs), std::get<3>(rhs)); - } -} - diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 13bb50e0e484..6545d59f0e09 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -46,7 +46,7 @@ core_logic: { ws('workspace/build-cpu-int64') { utils.init_git() utils.docker_run('ubuntu_nightly_cpu', 'build_ubuntu_cpu_large_tensor', false) - utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true) + utils.pack_lib('cpu_int64', mx_cmake_lib, true) } } }, @@ -55,7 +55,7 @@ core_logic: { ws('workspace/build-gpu-int64') { utils.init_git() utils.docker_run('ubuntu_nightly_gpu', 'build_ubuntu_gpu_large_tensor', true) - utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true) + utils.pack_lib('gpu_int64', mx_cmake_lib, true) } } } diff --git a/tests/python/gpu/test_gluon_contrib_gpu.py b/tests/python/gpu/test_gluon_contrib_gpu.py new file mode 100644 index 000000000000..1d19d850dd8e --- /dev/null +++ b/tests/python/gpu/test_gluon_contrib_gpu.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests of the contrib APIs in Gluon only with gpu""" + +from __future__ import print_function +import mxnet as mx +from mxnet.gluon import nn +from mxnet.gluon import contrib +from mxnet.gluon.contrib.cnn import DeformableConvolution + + +def test_DeformableConvolution(): + """test of the deformable convolution layer with possible combinations of arguments, + currently this layer only supports gpu + """ + net = nn.HybridSequential() + net.add( + DeformableConvolution(10, kernel_size=(3, 3), strides=1, padding=0), + DeformableConvolution(10, kernel_size=(3, 2), strides=1, padding=0, activation='relu', + offset_use_bias=False, use_bias=False), + DeformableConvolution(10, kernel_size=(3, 2), strides=1, padding=0, activation='relu', + offset_use_bias=False), + DeformableConvolution(10, kernel_size=(3, 2), strides=1, padding=0, activation='relu', + use_bias=False), + DeformableConvolution(10, kernel_size=(3, 2), strides=1, padding=0, offset_use_bias=False, use_bias=False), + DeformableConvolution(10, kernel_size=(3, 2), strides=1, padding=0, offset_use_bias=False), + DeformableConvolution(12, kernel_size=(3, 2), strides=1, padding=0, use_bias=False), + DeformableConvolution(12, kernel_size=(3, 2), strides=1, padding=0, use_bias=False, num_deformable_group=4), + ) + + try: + ctx = mx.gpu() + _ = mx.nd.array([0], ctx=ctx) + except mx.base.MXNetError: + print("deformable_convolution only supports GPU") + return + + net.initialize(force_reinit=True, ctx=ctx) + net.hybridize() + + x = mx.nd.random.uniform(shape=(8, 5, 30, 31), ctx=ctx) + with mx.autograd.record(): + y = net(x) + y.backward() + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 3c8cc4234e54..ce93f9821b9d 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -85,9 +85,9 @@ def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_resul sym_data = mx.sym.Variable('data') sym_min_range = mx.sym.Variable('min_range') sym_max_range = mx.sym.Variable('max_range') - dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range, + dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range, sym_max_range, out_type='float32') - out = dequant.bind(ctx=mx.current_context(), + out = dequant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, 'max_range':max_range}) data = out.forward()[0] assert data.dtype == np.float32 @@ -141,7 +141,8 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range) else: qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range, - 
min_calib_range, max_calib_range) + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), max_range.asscalar(), @@ -150,7 +151,7 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) - + def check_requantize_with_symbol(shape, min_calib_range=None, max_calib_range=None): qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') min_range = mx.nd.array([-1010.0]) @@ -160,17 +161,18 @@ def check_requantize_with_symbol(shape, min_calib_range=None, max_calib_range=No sym_max_range = mx.sym.Variable('max_range') if min_calib_range is None or max_calib_range is None: requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range) - out = requant.bind(ctx=mx.current_context(), - args={'data':qdata, 'min_range':min_range, - 'max_range':max_range}) + out = requant.bind(ctx=mx.current_context(), + args={'data':qdata, 'min_range':min_range, + 'max_range':max_range}) qdata_int8, min_output, max_output = out.forward() else: - requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range, - min_calib_range, max_calib_range) - out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, - 'max_range':max_range}) - qdata_int8, min_output, max_output = out.forward() - + requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range, + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) + out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, + 'max_range':max_range}) + qdata_int8, min_output, max_output = out.forward() + qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), max_range.asscalar(), min_calib_range=min_calib_range, @@ -273,6 +275,71 @@ def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias, q check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True, qdtype) check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False, qdtype) + +@with_seed() +def test_quantized_elemwise_add(): + def check_quantized_elemwise_add(data_shape, qtype): + if is_test_for_native_cpu(): + print('skipped testing quantized_elemwise_add for native cpu since it is not supported yet') + return + elif qtype != 'uint8' and qtype != 'int8': + print('skipped testing quantized_elemwise_add for not supported data type') + return + elif is_test_for_gpu(): + print('skipped testing quantized_elemwise_add for gpu since it is not supported yet') + return + + dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32') + dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32') + elemwise_add_fp32 = mx.sym.elemwise_add(dataA, dataB) + arg_names = elemwise_add_fp32.list_arguments() + elemwise_add_fp32_exe = elemwise_add_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') + if qtype == 'uint8': + data_low = 0.0 + data_high = 255.0 + else: + data_low = -127.0 + data_high = 127.0 + + dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') + dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') + elemwise_add_fp32_exe.arg_dict[arg_names[0]][:] = 
dataA_val + + elemwise_add_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val + + output = elemwise_add_fp32_exe.forward()[0] + + qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype) + qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype) + min_dataA = mx.sym.Variable(name='min_dataA') + max_dataA = mx.sym.Variable(name='max_dataA') + min_dataB = mx.sym.Variable(name='min_dataB') + max_dataB = mx.sym.Variable(name='max_dataB') + quantized_elemwise_add = mx.sym.contrib.quantized_elemwise_add(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB) + elemwise_add_int8_exe = quantized_elemwise_add.simple_bind(ctx=mx.current_context(), grad_req='null') + qarg_names = quantized_elemwise_add.list_arguments() + elemwise_add_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_add_fp32_exe.arg_dict[arg_names[0]].astype(qtype) + elemwise_add_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_add_fp32_exe.arg_dict[arg_names[1]].astype(qtype) + quantized_range = 127.0 + elemwise_add_int8_exe.arg_dict[qarg_names[2]][:] = data_low + elemwise_add_int8_exe.arg_dict[qarg_names[3]][:] = data_high + elemwise_add_int8_exe.arg_dict[qarg_names[4]][:] = data_low + elemwise_add_int8_exe.arg_dict[qarg_names[5]][:] = data_high + qoutput, min_range, max_range = elemwise_add_int8_exe.forward() + min_val = min_range.asnumpy().tolist()[0] + max_val = max_range.asnumpy().tolist()[0] + + fp32_rslt = output.asnumpy() + int8_rslt = qoutput.asnumpy()*max_val/0x7fffffff + assert_almost_equal(int8_rslt, int8_rslt, atol = 1e-4) + + for qtype in ['int8', 'uint8']: + check_quantized_elemwise_add((4, 6), qtype) + check_quantized_elemwise_add((13, 74, 52), qtype) + check_quantized_elemwise_add((3, 4, 56, 56), qtype) + check_quantized_elemwise_add((32, 56, 64, 11), qtype) + + @with_seed() def test_quantized_pooling(): def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype, convention='valid'): @@ -564,7 +631,8 @@ def get_fp32_residual(): conv0 = mx.sym.Convolution(data=data, num_filter=4, kernel=(1,1), pad=(0,0), no_bias=True, name='conv0') bn = mx.sym.BatchNorm(data=conv0, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn') - act0 = mx.sym.Activation(data=bn + data, act_type='relu', name='relu0') + sum0 = mx.sym.elemwise_add(bn, data, name='sum0') + act0 = mx.sym.Activation(data=sum0, act_type='relu', name='relu0') pool0 = mx.sym.Pooling(act0, kernel=(4, 4), pool_type='avg', name='pool0') conv1 = mx.sym.Convolution(data=pool0, num_filter=4, kernel=(1,1), pad=(0,0), no_bias=False, name='conv1') @@ -747,7 +815,7 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape): if mx.current_context() == mx.cpu(): excluded_names += ['fc', 'conv1'] if mx.current_context() == mx.gpu(): - excluded_names += ['relu0', 'relu1'] + excluded_names += ['sum0', 'relu0', 'relu1'] excluded_names += ['concat'] optional_names = ['pool0'] diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py index 8edd9abf70e7..e679c05894a3 100644 --- a/tests/python/tensorrt/lenet5_train.py +++ b/tests/python/tensorrt/lenet5_train.py @@ -24,6 +24,7 @@ def lenet5(): """LeNet-5 Symbol""" #pylint: disable=no-member data = mx.sym.Variable('data') + data = mx.sym.Cast(data, 'float16') conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", @@ -39,6 +40,7 @@ def lenet5(): tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") # second fullc 
fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) + fc2 = mx.sym.Cast(fc2, 'float32') # loss lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') #pylint: enable=no-member diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py index 4fdd522341bc..9282bc6f0de6 100644 --- a/tests/python/tensorrt/test_cvnets.py +++ b/tests/python/tensorrt/test_cvnets.py @@ -27,28 +27,22 @@ def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) + mx.contrib.tensorrt.set_use_fp16(False) h, w = 32, 32 net = gluoncv.model_zoo.get_model(model_name, pretrained=True) - data = mx.sym.var('data') - + net.hybridize() + net.forward(mx.nd.zeros((batch_size, 3, h, w))) + net.export(model_name) + _sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) if use_tensorrt: - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - all_params = dict([(k, v.data()) for k, v in net.collect_params().items()]) - executor = mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, - data=(batch_size,3, h, w), - softmax_label=(batch_size,), grad_req='null', - force_rebind=True) + sym = _sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(sym, arg_params, aux_params) else: - # Convert gluon model to Symbolic - net.hybridize() - net.forward(mx.ndarray.zeros((batch_size, 3, h, w))) - net.export(model_name) - symbol, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) - executor = symbol.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), - softmax_label=(batch_size,)) - executor.copy_params_from(arg_params, aux_params) + sym = _sym + executor = sym.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), + softmax_label=(batch_size,), + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) return executor @@ -126,7 +120,7 @@ def run_experiment_for(model_name, batch_size, num_workers): def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): - original_try_value = mx.contrib.tensorrt.get_use_tensorrt() + original_use_fp16 = mx.contrib.tensorrt.get_use_fp16() try: models = [ 'cifar_resnet20_v1', @@ -170,7 +164,7 @@ def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): print("Test duration: %.2f seconds" % test_duration) finally: - mx.contrib.tensorrt.set_use_tensorrt(original_try_value) + mx.contrib.tensorrt.set_use_fp16(original_use_fp16) if __name__ == '__main__': diff --git a/tests/python/tensorrt/test_cycle.py b/tests/python/tensorrt/test_cycle.py deleted file mode 100644 index 25f515a106a6..000000000000 --- a/tests/python/tensorrt/test_cycle.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -from common import * - - -def detect_cycle_from(sym, visited, stack): - visited.add(sym.handle.value) - stack.add(sym.handle.value) - for s in sym.get_children(): - if s.handle.value not in visited: - if detect_cycle_from(sym, visited, stack): - return True - elif s.handle.value in stack: - return True - stack.remove(sym.handle.value) - return False - - -def has_no_cycle(sym): - visited = set() - stack = set() - all_nodes = sym.get_internals() - for s in all_nodes: - if s.handle.value in visited: - if detect_cycle_from(s, visited, stack): - return False - return True - - -def test_simple_cycle(): - inp = mx.sym.Variable('input', shape=[1,10]) - A = mx.sym.FullyConnected(data=inp, num_hidden=10, no_bias=False, name='A') - B = mx.sym.FullyConnected(data=A, num_hidden=10, no_bias=False, name='B') - D = mx.sym.sin(data=A, name='D') - C = mx.sym.elemwise_add(lhs=B, rhs=D, name='C') - arg_params = { - 'I_weight': mx.nd.zeros([10,10]), - 'I_bias': mx.nd.zeros([10]), - 'A_weight': mx.nd.zeros([10,10]), - 'A_bias': mx.nd.zeros([10]), - 'B_weight': mx.nd.zeros([10,10]), - 'B_bias': mx.nd.zeros([10]), - } - - executor = C.simple_bind(ctx=mx.gpu(0), data=(1,10), softmax_label=(1,), - shared_buffer=arg_params, grad_req='null', force_rebind=True) - optimized_graph = mx.contrib.tensorrt.get_optimized_symbol(executor) - assert has_no_cycle(optimized_graph), "The graph optimized by TRT contains a cycle" - - -if __name__ == '__main__': - import nose - nose.runmodule() diff --git a/tests/python/tensorrt/test_resnet18.py b/tests/python/tensorrt/test_resnet18.py index fff3ac5dd768..36a7f33fe6a0 100644 --- a/tests/python/tensorrt/test_resnet18.py +++ b/tests/python/tensorrt/test_resnet18.py @@ -25,7 +25,6 @@ url = 'https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true' model_file_name = 'resnet18_v2_trt_test' - def get_image(image_url): fname = mx.test_utils.download(image_url, fname=image_url.split('/')[-1].split('?')[0]) img = mx.image.imread(fname) @@ -33,8 +32,7 @@ def get_image(image_url): img = img.transpose((2, 0, 1)) # Channel first img = img.expand_dims(axis=0) # Batchify img = mx.nd.cast(img, dtype=np.float32) - return img/255.0 - + return img / 255.0 def test_tensorrt_resnet18_feature_vect(): print("downloading sample input") @@ -45,24 +43,32 @@ def test_tensorrt_resnet18_feature_vect(): gluon_resnet18.export(model_file_name) sym, arg_params, aux_params = mx.model.load_checkpoint(model_file_name, 0) - os.environ['MXNET_USE_TENSORRT'] = '0' - executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, grad_req='null', force_rebind=True) + executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) executor.copy_params_from(arg_params, aux_params) y = executor.forward(is_train=False, data=input_data) - - os.environ['MXNET_USE_TENSORRT'] = '1' - all_params = arg_params - all_params.update(aux_params) - executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(), all_params=all_params, data=batch_shape, - grad_req='null', force_rebind=True) - y_trt = executor.forward(is_train=False, data=input_data) - - no_trt_output = y[0].asnumpy()[0] - trt_output = y_trt[0].asnumpy()[0] - assert_almost_equal(no_trt_output, trt_output, 1e-4, 1e-4) - + trt_sym = sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) + original_precision_value = mx.contrib.tensorrt.get_use_fp16() + try: + mx.contrib.tensorrt.set_use_fp16(True) + executor = 
trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + y_trt = executor.forward(is_train=False, data=input_data) + mx.contrib.tensorrt.set_use_fp16(False) + executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + y_trt_fp32 = executor.forward(is_train=False, data=input_data) + no_trt_output = y[0].asnumpy()[0] + trt_output = y_trt[0].asnumpy()[0] + trt_fp32_output = y_trt_fp32[0].asnumpy()[0] + assert_almost_equal(no_trt_output, trt_output, 1e-1, 1e-2) + assert_almost_equal(no_trt_output, trt_fp32_output, 1e-4, 1e-4) + finally: + mx.contrib.tensorrt.set_use_fp16(original_precision_value) if __name__ == '__main__': import nose - nose.runmodule() diff --git a/tests/python/tensorrt/test_tensorrt_lenet5.py b/tests/python/tensorrt/test_tensorrt_lenet5.py index 258686428a45..bdc306c0b297 100644 --- a/tests/python/tensorrt/test_tensorrt_lenet5.py +++ b/tests/python/tensorrt/test_tensorrt_lenet5.py @@ -24,24 +24,25 @@ def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_size, use_tensorrt): """Run inference with either MXNet or TensorRT""" - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) data_size = (batch_size,) + mnist['test_data'].shape[1:] + type_dict = {'data': 'float32', 'softmax_label': 'float32'} if use_tensorrt: - all_params = merge_dicts(arg_params, aux_params) - executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(0), all_params=all_params, - data=data_size, - softmax_label=(batch_size,), - grad_req='null', - force_rebind=True) + _sym = sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(_sym, arg_params, aux_params) else: - executor = sym.simple_bind(ctx=mx.gpu(0), - data=data_size, - softmax_label=(batch_size,), - grad_req='null', - force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - + _sym = sym + for k, v in arg_params.items(): + type_dict[k] = v.dtype + for k, v in aux_params.items(): + type_dict[k] = v.dtype + executor = _sym.simple_bind(ctx=mx.gpu(0), + type_dict=type_dict, + data=data_size, + softmax_label=(batch_size,), + grad_req='null', + force_rebind=True) + executor.copy_params_from(arg_params, aux_params) # Get this value from all_test_labels # Also get classes from the dataset num_ex = 10000 @@ -68,39 +69,35 @@ def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_siz def test_tensorrt_inference(): """Run LeNet-5 inference comparison between MXNet and TensorRT.""" - original_try_value = mx.contrib.tensorrt.get_use_tensorrt() - try: - check_tensorrt_installation() - mnist = mx.test_utils.get_mnist() - num_epochs = 10 - batch_size = 128 - model_name = 'lenet5' - model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") - model_file = '%s/%s-symbol.json' % (model_dir, model_name) - params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) - - _, _, _, all_test_labels = get_iters(mnist, batch_size) - - # Load serialized MXNet model (model-symbol.json + model-epoch.params) - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) - - print("LeNet-5 test") - print("Running inference in MXNet") - mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - batch_size=batch_size, use_tensorrt=False) - - print("Running inference in MXNet-TensorRT") - trt_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - 
batch_size=batch_size, use_tensorrt=True) - - print("MXNet accuracy: %f" % mx_pct) - print("MXNet-TensorRT accuracy: %f" % trt_pct) - - assert abs(mx_pct - trt_pct) < 1e-2, \ - """Diff. between MXNet & TensorRT accuracy too high: - MXNet = %f, TensorRT = %f""" % (mx_pct, trt_pct) - finally: - mx.contrib.tensorrt.set_use_tensorrt(original_try_value) + check_tensorrt_installation() + mnist = mx.test_utils.get_mnist() + num_epochs = 10 + batch_size = 128 + model_name = 'lenet5' + model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") + model_file = '%s/%s-symbol.json' % (model_dir, model_name) + params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) + + _, _, _, all_test_labels = get_iters(mnist, batch_size) + + # Load serialized MXNet model (model-symbol.json + model-epoch.params) + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) + + print("LeNet-5 test") + print("Running inference in MXNet") + mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=False) + + print("Running inference in MXNet-TensorRT") + trt_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=True) + + print("MXNet accuracy: %f" % mx_pct) + print("MXNet-TensorRT accuracy: %f" % trt_pct) + + assert abs(mx_pct - trt_pct) < 1e-2, \ + """Diff. between MXNet & TensorRT accuracy too high: + MXNet = %f, TensorRT = %f""" % (mx_pct, trt_pct) if __name__ == '__main__': diff --git a/tests/python/tensorrt/test_training_warning.py b/tests/python/tensorrt/test_training_warning.py deleted file mode 100644 index fdac859aef6f..000000000000 --- a/tests/python/tensorrt/test_training_warning.py +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import gluoncv -import mxnet as mx - -from tests.python.unittest.common import assertRaises - - -def test_training_without_trt(): - run_resnet(is_train=True, use_tensorrt=False) - - -def test_inference_without_trt(): - run_resnet(is_train=False, use_tensorrt=False) - - -def test_training_with_trt(): - assertRaises(RuntimeError, run_resnet, is_train=True, use_tensorrt=True) - - -def test_inference_with_trt(): - run_resnet(is_train=False, use_tensorrt=True) - - -def run_resnet(is_train, use_tensorrt): - original_trt_value = mx.contrib.tensorrt.get_use_tensorrt() - try: - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) - ctx = mx.gpu(0) - batch_size = 1 - h = 32 - w = 32 - model_name = 'cifar_resnet20_v1' - resnet = gluoncv.model_zoo.get_model(model_name, pretrained=True) - data = mx.sym.var('data') - out = resnet(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - if is_train: - grad_req = 'write' - else: - grad_req = 'null' - if use_tensorrt: - all_params = dict([(k, v.data()) for k, v in resnet.collect_params().items()]) - mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, - data=(batch_size, 3, h, w), softmax_label=(batch_size,), - force_rebind=True, grad_req=grad_req) - else: - softmax.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), softmax_label=(batch_size,), - force_rebind=True, grad_req=grad_req) - finally: - mx.contrib.tensorrt.set_use_tensorrt(original_trt_value) - - -if __name__ == '__main__': - import nose - nose.runmodule() diff --git a/tests/python/train/test_dtype.py b/tests/python/train/test_dtype.py index 39bfbcdeeafe..47b785cbc0cf 100644 --- a/tests/python/train/test_dtype.py +++ b/tests/python/train/test_dtype.py @@ -65,6 +65,34 @@ def get_iterator_uint8(kv): return (train, val) +def get_iterator_uint8_with_param(kv, ctx): + data_shape = (3, 28, 28) + + train = mx.io.ImageRecordIter( + path_imgrec = "data/cifar/train.rec", + data_shape = data_shape, + batch_size = batch_size, + rand_crop = True, + rand_mirror = True, + num_parts = kv.num_workers, + part_index = kv.rank, + dtype ='uint8', + ctx = ctx) + train = mx.io.PrefetchingIter(train) + + val = mx.io.ImageRecordIter( + path_imgrec = "data/cifar/test.rec", + rand_crop = False, + rand_mirror = False, + data_shape = data_shape, + batch_size = batch_size, + num_parts = kv.num_workers, + part_index = kv.rank, + dtype ='uint8', + ctx = ctx) + + return (train, val) + def get_iterator_int8(kv): data_shape = (3, 28, 28) @@ -89,6 +117,34 @@ def get_iterator_int8(kv): return (train, val) +def get_iterator_int8_with_param(kv, ctx): + data_shape = (3, 28, 28) + + train = mx.io.ImageRecordIter( + path_imgrec = "data/cifar/train.rec", + data_shape = data_shape, + batch_size = batch_size, + rand_crop = True, + rand_mirror = True, + num_parts = kv.num_workers, + part_index = kv.rank, + dtype ='int8', + ctx = ctx) + train = mx.io.PrefetchingIter(train) + + val = mx.io.ImageRecordIter( + path_imgrec = "data/cifar/test.rec", + rand_crop = False, + rand_mirror = False, + data_shape = data_shape, + batch_size = batch_size, + num_parts = kv.num_workers, + part_index = kv.rank, + dtype = 'int8', + ctx = ctx) + + return (train, val) + def get_iterator_float32(kv): data_shape = (3, 28, 28) @@ -214,10 +270,20 @@ def test_cifar10(): run_cifar10(train, val, use_module=False) run_cifar10(train, val, use_module=True) + for ctx in ("gpu", "cpu"): + (train, val) = get_iterator_uint8_with_param(kv, ctx) + run_cifar10(train, val, use_module=False) + run_cifar10(train, val, use_module=True) + # test int8 input 
(train, val) = get_iterator_int8(kv) run_cifar10(train, val, use_module=False) run_cifar10(train, val, use_module=True) + for ctx in ("gpu", "cpu"): + (train, val) = get_iterator_int8_with_param(kv, ctx) + run_cifar10(train, val, use_module=False) + run_cifar10(train, val, use_module=True) + if __name__ == "__main__": test_cifar10() diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 94777677354d..374050668612 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1653,6 +1653,37 @@ def test_ndarray_nan_comparison(): for i in (np.isnan(data1_grad))[1][0].flatten(): assert i == True + +def test_zero_from_numpy(): + # Test zero_copy + arrays = [ + # ordinary numpy array + np.array([[1, 2], [3, 4], [5, 6]], dtype="float32"), + # 0-dim + np.array((1, )).reshape(()), + # 0-size + np.array(()).reshape((1, 0, 2)), + ] + for zero_copy in [False, True]: + for np_array in arrays: + mx_array = mx.nd.from_numpy(np_array, zero_copy=zero_copy) + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + np_array = arrays[0] + mx_array = mx.nd.from_numpy(np_array) + np_array[2, 1] = 0 + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + mx_array[2, 1] = 100 + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + np_array = np.array([[1, 2], [3, 4], [5, 6]]).transpose() + assert not np_array.flags["C_CONTIGUOUS"] + try: + mx_array = mx.nd.from_numpy(np_array) + except ValueError: + pass + else: + assert False + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ddcc881939ad..e8bfaba4736d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6296,6 +6296,51 @@ def test_laop_4(): #print('float32') check_fw(test_syevd, [a_np], [u_np, l_np], np.float32) +def test_laop_5(): + # tests for diagonal and triangular matrix extraction and generation + data = mx.symbol.Variable('data') + # test complete range of small matrices to cover corner cases + for n in range(1, 10): + # test batched and non-batched processing + for b in range(3): + shape = (n, n) if b == 0 else (b, n, n) + data_in = np.random.uniform(1, 10, shape) + # test all legal offsets of the diagonal + for offs in range(1-n, n): + # test extraction of diagonal + test_diag = mx.sym.linalg.extractdiag(data, offset=offs) + res_diag = np.diagonal(data_in, offset=offs) if b==0 else np.diagonal(data_in, axis1=1, axis2=2, offset=offs) + check_symbolic_forward(test_diag, [data_in], [res_diag]) + check_numeric_gradient(test_diag, [data_in]) + # test generation of diagonal matrix + test_diag2 = mx.sym.linalg.makediag(data, offset=offs) + res_diag2 = None + if b == 0: + res_diag2 = np.diagflat(res_diag, k=offs) + else: + for i in range(b): + res = np.reshape(np.diagflat(res_diag[i], k=offs), (1, n, n)) + res_diag2 = res if res_diag2 is None else np.concatenate((res_diag2, res), axis=0) + check_symbolic_forward(test_diag2, [res_diag], [res_diag2]) + check_numeric_gradient(test_diag2, [res_diag]) + # check both settings for parameter "lower" in case of zero offset + lower_vals = [True] if offs != 0 else [True, False] + for lower in lower_vals: + # test extraction of triangle by doing a full roundtrip as the intermediate extracted + # triangle has different orderings than numpy. 
+ test_trian = mx.sym.linalg.extracttrian(data, offset=offs, lower=lower) + test_trian = mx.sym.linalg.maketrian(test_trian, offset=offs, lower=lower) + extracts_lower = (offs < 0) or ((offs == 0) and lower) + res_trian = None + if b == 0: + res_trian = np.tril(data_in, offs) if extracts_lower else np.triu(data_in, offs) + else: + for i in range(b): + res = np.tril(data_in[i], offs) if extracts_lower else np.triu(data_in[i], offs) + res = np.reshape(res, (1, n, n)) + res_trian = res if res_trian is None else np.concatenate((res_trian, res), axis=0) + check_symbolic_forward(test_trian, [data_in], [res_trian]) + check_numeric_gradient(test_trian, [data_in]) @with_seed() def test_stack(): diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index 8fbd97d8a162..5e809d383cdf 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -916,6 +916,18 @@ def test_randint_without_dtype(): a = mx.nd.random.randint(low=50000000, high=50000010, ctx=mx.context.current_context()) assert a.dtype == np.int32 + +@with_seed() +def test_sample_multinomial_num_outputs(): + ctx = mx.context.current_context() + probs = [[0.125, 0.25, 0.25], [0.0625, 0.125, 0.1875]] + out = mx.nd.random.multinomial(data=mx.nd.array(probs, ctx=ctx), shape=10000, get_prob=False) + assert isinstance(out, mx.nd.NDArray) + out = mx.nd.random.multinomial(data=mx.nd.array(probs, ctx=ctx), shape=10000, get_prob=True) + assert isinstance(out, list) + assert len(out) == 2 + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index b290ff344227..2dfe3e44eedb 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -367,6 +367,11 @@ def test_simple_bind_gradient_graph_possible_with_cycle(): res = data + data + data + data + data + data + data + data res.simple_bind(ctx=mx.cpu(), data=(1,)) +def test_children_same_name(): + a = mx.sym.Variable('data') + b = a + a + for c in b.get_children(): + pass if __name__ == '__main__': import nose diff --git a/tools/caffe_converter/compare_layers.py b/tools/caffe_converter/compare_layers.py index ed73ee991c81..8d6598c8b781 100644 --- a/tools/caffe_converter/compare_layers.py +++ b/tools/caffe_converter/compare_layers.py @@ -143,8 +143,6 @@ def convert_and_compare_caffe_to_mxnet(image_url, gpu, caffe_prototxt_path, caff compare_layers_from_nets(caffe_net, arg_params, aux_params, exe, layer_name_to_record, top_to_layers, mean_diff_allowed, max_diff_allowed) - return - def _bfs(root_node, process_node): """ @@ -280,7 +278,6 @@ def _process_layer_parameters(layer): warnings.warn('No handling for layer %s of type %s, should we ignore it?', layer.name, layer.type) - return def _process_layer_output(caffe_blob_name): @@ -332,8 +329,6 @@ def _process_layer_output(caffe_blob_name): for caffe_blob_name in caffe_net.blobs.keys(): _process_layer_output(caffe_blob_name) - return - def main(): """Entrypoint for compare_layers""" diff --git a/tools/caffe_converter/test_converter.py b/tools/caffe_converter/test_converter.py index 7b47278f51f9..3c325d6bdd63 100644 --- a/tools/caffe_converter/test_converter.py +++ b/tools/caffe_converter/test_converter.py @@ -76,8 +76,6 @@ def test_model_weights_and_outputs(model_name, image_url, gpu): convert_and_compare_caffe_to_mxnet(image_url, gpu, prototxt, caffemodel, mean, mean_diff_allowed=1e-03, max_diff_allowed=1e-01) - return - def main(): """Entrypoint for test_converter"""
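Illustrative usage sketch (not part of the patch): the snippet below exercises the new diag/trian linalg operators registered in src/operator/tensor/la_op.cc above, assuming an MXNet build that includes this change. The expected values and the n = m*(m+1)/2 packing rule are taken from the operator docstrings, test_laop_5, and LaDiagTrianShape in this diff::

    import math
    import numpy as np
    import mxnet as mx

    A = mx.nd.array([[1.0, 2.0],
                     [3.0, 4.0]])

    # Diagonal round trip: extractdiag returns the selected diagonal as a vector,
    # makediag embeds it back into a square matrix of size n + |offset|.
    d = mx.nd.linalg.extractdiag(A)           # [1. 4.]
    D = mx.nd.linalg.makediag(d, offset=1)    # 3x3, with [1, 4] on the first superdiagonal
    assert D.shape == (3, 3)

    # Triangle round trip: extracttrian packs the lower triangle into a vector,
    # maketrian unpacks it; entries outside the triangle come back as zeros.
    t = mx.nd.linalg.extracttrian(A)          # [1. 3. 4.]
    L = mx.nd.linalg.maketrian(t)
    np.testing.assert_allclose(L.asnumpy(), np.tril(A.asnumpy()))

    # Size relation enforced by LaDiagTrianShape: a packed triangle of length n
    # corresponds to an m x m matrix with n = m*(m+1)/2; the inverse below is the
    # same floor(0.5 + (sqrt(8n+1)-1)/2) formula used by the shape inference.
    n = t.shape[-1]
    m = int(math.floor(0.5 + (math.sqrt(8 * n + 1) - 1.0) * 0.5))
    assert m * (m + 1) // 2 == n and L.shape == (m, m)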