Intel GPU support via target_device parameter #74

Open · wants to merge 12 commits into base: main
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -210,7 +210,6 @@ else()
COMMAND rm -fr openvino
COMMAND docker cp openvino_backend_ov:/opt/openvino openvino
COMMAND docker rm openvino_backend_ov
COMMAND echo '<ie><plugins><plugin name=\"CPU\" location=\"libopenvino_intel_cpu_plugin.so\"></plugin></plugins></ie>' >> openvino/lib/plugins.xml
COMMENT "Building OpenVino"
)
endif() # WIN32
41 changes: 41 additions & 0 deletions README.md
@@ -71,6 +71,16 @@ but the listed CMake argument can be used to override.
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]

## Build a complete image with the OpenVINO backend including Intel GPU drivers

Build a custom Triton image with the required runtime drivers using the [build.py](https://github.com/dtrawins/server/blob/igpu/build.py) script.

```
python3 build.py --target-platform linux --enable-logging --enable-stats --enable-metrics --enable-cpu-metrics --endpoint grpc --endpoint http --filesystem s3 \
--backend openvino
```


## Using the OpenVINO Backend

### Parameters
@@ -88,6 +98,7 @@ to skip the dynamic batch sizes in backend.
* `ENABLE_BATCH_PADDING`: By default an error will be generated if the backend receives a request with a batch size smaller than the max_batch_size specified in the configuration. This error can be avoided at the cost of performance by setting the `ENABLE_BATCH_PADDING` parameter to `YES`.
* `RESHAPE_IO_LAYERS`: By setting this parameter to `YES`, the IO layers are reshaped to the dimensions provided in
the model configuration. By default, the dimensions in the model are used.
* `TARGET_DEVICE`: Choose the OpenVINO device for running inference. It can be CPU (the default), GPU, or any of the virtual devices such as AUTO, MULTI, or HETERO. Note: using an Intel GPU is possible only if `--device /dev/dri` is passed to the container, and it is supported only on Linux with the x86_64 architecture.
Contributor:

Does OpenVINO support models whose computation is spread across CPU and GPU cores?

Collaborator Author:

Yes, that is possible by using the virtual target device MULTI, which load-balances requests, or the HETERO target device, which spreads a single inference across devices.

Contributor:

I don't think it is a good idea to introduce a model-level TARGET_DEVICE parameter in the model config. Triton allows you to have multiple model instances, and each instance can specify which device to use for inference.
So they can have model A, which can specify something like the following:

```
  instance_group [
    {
      count: 1
      kind: KIND_GPU
    },
    {
      count: 1
      kind: KIND_CPU
    }
  ]
```

You can rely on the TRITONBACKEND_ModelInstanceKind API call to determine the kind of the instance.
If the kind is CPU and the model is not loaded (within model_state_), then load the model on CPU and use it.
If the kind is GPU and the model is not loaded (within model_state_), then load the model on GPU and use it.
This allows sharing of the model across the model instances.
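For illustration, a minimal sketch of this proposal (not the backend's actual code); the helper name `DeviceFromInstanceKind` is made up for the example, and the caller would be expected to compile and cache one model per returned device string inside `model_state_`:

```
// Sketch only: map the Triton instance kind to an OpenVINO device string.
// Error handling is inlined so the snippet does not depend on backend macros.
#include <string>

#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
DeviceFromInstanceKind(TRITONBACKEND_ModelInstance* instance, std::string* device)
{
  TRITONSERVER_InstanceGroupKind kind;
  TRITONSERVER_Error* err = TRITONBACKEND_ModelInstanceKind(instance, &kind);
  if (err != nullptr) {
    return err;
  }
  // KIND_GPU selects the GPU plugin; everything else falls back to CPU.
  *device = (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ? "GPU" : "CPU";
  return nullptr;  // success
}
```

Instances of the same kind could then share a single compiled model cached in the model state.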

Contributor:

Even if the use case of loading two separate kinds of model instances is not required, I would still prefer using the TRITONBACKEND_ModelInstanceKind API instead of another config parameter.

Collaborator Author:

In OpenVINO, the target device offers more options than just a choice between CPU and GPU. It can also be set to a virtual device like MULTI, HETERO, AUTO, or BATCH, and it can include extra options such as a priority list, e.g. AUTO:GPU,CPU. My understanding is that setting the kind to GPU validates that a CUDA GPU is present, so it could not be used with other types of devices like an iGPU. I couldn't find a way to use KIND as the target device without changes outside of the OpenVINO backend code.
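For context, a small standalone OpenVINO 2.x sketch (separate from this backend's code) showing that the same `compile_model()` call accepts physical devices, virtual devices, and prioritized lists; the `model.xml` path is just a placeholder:

```
#include <openvino/openvino.hpp>

int main() {
  ov::Core core;
  auto model = core.read_model("model.xml");  // placeholder model path

  auto cpu = core.compile_model(model, "CPU");
  auto gpu = core.compile_model(model, "GPU");                // Intel GPU plugin
  auto multi = core.compile_model(model, "MULTI:GPU,CPU");    // load-balances requests
  auto hetero = core.compile_model(model, "HETERO:GPU,CPU");  // splits a single inference
  auto autodev = core.compile_model(model, "AUTO:GPU,CPU");   // picks a device by priority
  return 0;
}
```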

@tanmayv25 do you think we can move forward? Do you need more input from @dtrawins?

Contributor:

@ryanloney - proposal in works - will update

Contributor @tanmayv25 (May 28, 2024):

We are in the middle of discussing how best to support new hardware platforms. Directing this to @nnshah1.

Contributor:

I think we can update the model configuration to accommodate these fields. It would help keep the device specification in a single place. The benefit is that in the future we might get other backends targeting these devices besides the current OpenVINO one.

@@ -231,6 +242,36 @@ string_value:"yes"
}
}
```
### Running the models on Intel GPU

Add a `TARGET_DEVICE` parameter to your config.pbtxt:
```
parameters: [
{
key: "NUM_STREAMS"
value: {
string_value: "1"
}
},
{
key: "PERFORMANCE_HINT"
value: {
string_value: "THROUGHPUT"
}
},
{
key: "TARGET_DEVICE"
value: {
string_value: "GPU"
}
}
]
```

Start the container with an extra parameter that passes the device `/dev/dri`:
```
docker run -it --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* ) tritonserver:latest
```

## Known Issues

96 changes: 63 additions & 33 deletions src/openvino.cc
@@ -84,6 +84,9 @@ class ModelState : public BackendModel {
TRITONSERVER_Error* ParseParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
std::vector<std::pair<std::string, ov::Any>>* device_config);
TRITONSERVER_Error* ParseStringParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
std::string* value);
TRITONSERVER_Error* ParseParameterHelper(
const std::string& mkey, std::string* value,
std::pair<std::string, ov::Any>* ov_property);
@@ -118,6 +121,7 @@ class ModelState : public BackendModel {

bool SkipDynamicBatchSize() { return skip_dynamic_batchsize_; }
bool EnableBatchPadding() { return enable_padding_; }
std::string TargetDevice() { return target_device_; }

private:
ModelState(TRITONBACKEND_Model* triton_model);
@@ -140,6 +144,7 @@
bool skip_dynamic_batchsize_;
bool enable_padding_;
bool reshape_io_layers_;
std::string target_device_;
};

TRITONSERVER_Error*
@@ -179,7 +184,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
ModelState::ModelState(TRITONBACKEND_Model* triton_model)
: BackendModel(triton_model), model_read_(false),
skip_dynamic_batchsize_(false), enable_padding_(false),
reshape_io_layers_(false)
reshape_io_layers_(false), target_device_("CPU")
{
}

@@ -238,12 +243,11 @@ ModelState::ParseParameters()
bool status = model_config_.Find("parameters", &params);
if (status) {
RETURN_IF_ERROR(LoadCpuExtensions(params));
RETURN_IF_ERROR(ParseBoolParameter(
"SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_));
RETURN_IF_ERROR(
ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_));
RETURN_IF_ERROR(
ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_));
ParseBoolParameter(
"SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_);
ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_);
ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_);
ParseStringParameter("TARGET_DEVICE", params, &target_device_);
}

return nullptr;
@@ -256,18 +260,13 @@ ModelState::ParseParameters(const std::string& device)
triton::common::TritonJson::Value params;
bool status = model_config_.Find("parameters", &params);
if (status) {
if (device == "CPU") {
config_[device] = {};
auto& device_config = config_.at(device);
RETURN_IF_ERROR(
ParseParameter("INFERENCE_NUM_THREADS", params, &device_config));
RETURN_IF_ERROR(
ParseParameter("COMPILATION_NUM_THREADS", params, &device_config));
RETURN_IF_ERROR(ParseParameter("HINT_BF16", params, &device_config));
RETURN_IF_ERROR(ParseParameter("NUM_STREAMS", params, &device_config));
RETURN_IF_ERROR(
ParseParameter("PERFORMANCE_HINT", params, &device_config));
}
config_[device] = {};
auto& device_config = config_.at(device);
ParseParameter("INFERENCE_NUM_THREADS", params, &device_config);
ParseParameter("COMPILATION_NUM_THREADS", params, &device_config);
ParseParameter("HINT_BF16", params, &device_config);
ParseParameter("NUM_STREAMS", params, &device_config);
ParseParameter("PERFORMANCE_HINT", params, &device_config);
}

return nullptr;
@@ -277,9 +276,7 @@ TRITONSERVER_Error*
ModelState::LoadCpuExtensions(triton::common::TritonJson::Value& params)
{
std::string cpu_ext_path;
LOG_IF_ERROR(
ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path)),
"error when reading parameters");
ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path));
Contributor:

What is the reason for removing the error checking on reading and parsing the parameters, here and everywhere?

Collaborator Author:

That is not related to the GPU support, but those error messages were misleading. The error suggests that something is wrong with the setup, while it is perfectly fine to skip those parameters.

Contributor @nnshah1 (May 21, 2024):

Suggestion: update ReadParameter to take a value indicating whether a missing value is OK, or to provide a default value:

RETURN_IF_ERROR(ReadParameter(params, "OPTIONAL_KEY", &(cpu_ext_path), /*default_value=*/"foo"));

That way the intent is a little clearer.

Note: suggestion only.
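A possible shape for that overload, sketched under the assumption that the backend's existing strict `ReadParameter(params, key, value)` helper keeps its current signature; the default-value overload below is hypothetical:

```
// Hypothetical overload: a missing key is not an error, it just yields the default.
// Assumes the backend's existing TritonJson and ReadParameter declarations are in scope.
TRITONSERVER_Error*
ReadParameter(
    triton::common::TritonJson::Value& params, const std::string& key,
    std::string* value, const std::string& default_value)
{
  triton::common::TritonJson::Value found;
  if (!params.Find(key.c_str(), &found)) {
    *value = default_value;
    return nullptr;  // success: fall back to the default
  }
  return ReadParameter(params, key, value);  // delegate to the existing strict overload
}
```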

if (!cpu_ext_path.empty()) {
// CPU (MKLDNN) extensions is loaded as a shared library and passed as a
// pointer to base extension
@@ -301,8 +298,7 @@ ModelState::ParseBoolParameter(
bool* setting)
{
std::string value;
LOG_IF_ERROR(
ReadParameter(params, mkey, &(value)), "error when reading parameters");
RETURN_IF_ERROR(ReadParameter(params, mkey, &(value)));
std::transform(
value.begin(), value.end(), value.begin(),
[](unsigned char c) { return std::tolower(c); });
@@ -313,14 +309,30 @@
return nullptr;
}

TRITONSERVER_Error*
ModelState::ParseStringParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
std::string* setting)
{
std::string value;
RETURN_IF_ERROR(ReadParameter(params, mkey, &(value)));
std::transform(
value.begin(), value.end(), value.begin(),
[](unsigned char c) { return std::toupper(c); });
if (value.length() > 0) {
*setting = value;
}

return nullptr;
}

TRITONSERVER_Error*
ModelState::ParseParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
std::vector<std::pair<std::string, ov::Any>>* device_config)
{
std::string value;
LOG_IF_ERROR(
ReadParameter(params, mkey, &(value)), "error when reading parameters");
RETURN_IF_ERROR(ReadParameter(params, mkey, &(value)));
if (!value.empty()) {
std::pair<std::string, ov::Any> ov_property;
RETURN_IF_ERROR(ParseParameterHelper(mkey, &value, &ov_property));
@@ -410,6 +422,16 @@ ModelState::ParseParameterHelper(
TRITONSERVER_Error*
ModelState::ConfigureOpenvinoCore()
{
auto availableDevices = ov_core_.get_available_devices();
std::stringstream list_of_devices;

for (auto& element : availableDevices) {
list_of_devices << element << ",";
}
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Available OpenVINO devices: " + list_of_devices.str()))
.c_str());
for (auto&& item : config_) {
std::string device_name = item.first;
std::vector<std::pair<std::string, ov::Any>> properties = item.second;
@@ -438,9 +460,10 @@ ModelState::LoadModel(
std::to_string(OPENVINO_VERSION_MINOR) + "." +
std::to_string(OPENVINO_VERSION_PATCH))
.c_str());

LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Device info: \n") +
(std::string("Device info: ") +
ConvertVersionMapToString(ov_core_.get_versions(device)))
.c_str());

@@ -932,19 +955,27 @@ ModelInstanceState::Create(
ModelInstanceState::ModelInstanceState(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance),
model_state_(model_state), device_("CPU"), batch_pad_size_(0)
model_state_(model_state), device_(model_state->TargetDevice()),
batch_pad_size_(0)
{
if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_CPU) {
if ((Kind() != TRITONSERVER_INSTANCEGROUPKIND_CPU) &&
Contributor:

I believe this check needs to be updated. From what I see, we support the following kinds:

  1. CPU
  2. GPU
  3. AUTO: If GPU cores are available and the model supports GPU deployment, then use GPU; otherwise use CPU.

(Kind() != TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
throw triton::backend::BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unable to load model '") + model_state_->Name() +
"', Triton openVINO backend supports only CPU device")
"', Triton OpenVINO backend supports only Kind CPU and AUTO")
.c_str()));
}

if (model_state_->ModelNotRead()) {
std::string model_path;
THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ParseParameters());
device_ = model_state->TargetDevice();
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("Target device " + device_)).c_str());


THROW_IF_BACKEND_INSTANCE_ERROR(
model_state_->ReadModel(ArtifactFilename(), &model_path));
THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ValidateConfigureModel());
@@ -1519,8 +1550,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" +
TRITONSERVER_InstanceGroupKindString(kind) + " device " +
std::to_string(device_id) + ")")
TRITONSERVER_InstanceGroupKindString(kind) + ")")
.c_str());

// Get the model state associated with this instance's model.
@@ -1608,7 +1638,7 @@ TRITONBACKEND_GetBackendAttribute(
TRITONSERVER_LOG_VERBOSE,
"TRITONBACKEND_GetBackendAttribute: setting attributes");
RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0));
backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_AUTO, 0, nullptr, 0));

return nullptr;
}
12 changes: 11 additions & 1 deletion tools/gen_openvino_dockerfile.py
@@ -76,6 +76,15 @@ def dockerfile_for_linux(output_file):
# pre-build archive.
# TODO: Unify build steps between linux and windows.

# Get intel GPU drivers
WORKDIR /drv
RUN curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15468.11/intel-igc-core_1.0.15468.11_amd64.deb ; \
curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15468.11/intel-igc-opencl_1.0.15468.11_amd64.deb ; \
curl -L -O https://github.com/intel/compute-runtime/releases/download/23.43.27642.18/intel-opencl-icd_23.43.27642.18_amd64.deb ; \
curl -L -O https://github.com/intel/compute-runtime/releases/download/23.43.27642.18/libigdgmm12_22.3.11_amd64.deb ; \
apt-get download ocl-icd-libopencl1 ; \
find . -iname '*.deb' -exec dpkg-deb -X {} . \;

ARG OPENVINO_VERSION
ARG OPENVINO_BUILD_TYPE
WORKDIR /workspace
@@ -106,7 +115,8 @@ def dockerfile_for_linux(output_file):
cp -r /workspace/install/runtime/include/* include/.
RUN mkdir -p lib && \
cp -P /workspace/install/runtime/lib/intel64/*.so* lib/. && \
cp -P /workspace/install/runtime/3rdparty/tbb/lib/libtbb.so* lib/. \
cp -P /workspace/install/runtime/lib/intel64/libopenvino*.so* lib/. && \
find /drv/usr/ -iname '*.so*' -exec cp -P {} lib/. \;
"""

df += """