diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index abb5b31b76e44..eed08ee673e49 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -21,6 +21,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h"
 
 #include "../../framework/tensorprotoutils.h"
@@ -157,40 +158,11 @@ BackendManager::BackendManager(SessionContext& session_context,
     subgraph_context_.has_dynamic_input_shape = false;
 
     // OV NPU plugin is supported with fallback to OV CPU upon compilation failures.
-    try {
-      concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                      session_context_,
-                                                      subgraph_context_,
-                                                      *shared_context_,
-                                                      model_stream);
-    } catch (const OnnxRuntimeException& ex) {
-      std::string exception_str = ex.what();
-
-      if (session_context_.device_type.find("NPU") != std::string::npos &&
-          exception_str.find("intel_npu") != std::string::npos) {
-        // Handle NPU device related errors
-#ifndef NDEBUG
-        std::string suffix = session_context_.so_disable_cpu_ep_fallback ? "\nModel failed to compile on NPU. Enable CPU fallback or try another device.\n" : "\nModel needs to be recompiled\n";
-        ORT_THROW(exception_str + suffix);
-#else
-        std::string error_message = "UNKNOWN NPU ERROR";
-        std::string error_code = "code 0x0";
-        std::regex error_message_pattern(R"(\bZE_\w*\b)");
-        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-        std::smatch matches;
-        if (std::regex_search(exception_str, matches, error_message_pattern)) {
-          error_message = matches[0];
-        }
-        if (std::regex_search(exception_str, matches, error_code_pattern)) {
-          error_code = matches[0];
-        }
-        std::string suffix = session_context_.so_disable_cpu_ep_fallback ? "\nModel failed to compile on NPU. Enable CPU fallback or try another device.\n" : "\nModel needs to be recompiled\n";
-        throw std::runtime_error(error_message + ", " + error_code + suffix);
-#endif
-      } else {
-        ORT_THROW(exception_str);
-      }
-    }
+    concrete_backend_ = BackendFactory::MakeBackend(model_proto,
+                                                    session_context_,
+                                                    subgraph_context_,
+                                                    *shared_context_,
+                                                    model_stream);
   }
 
   if (ShouldExportEpContext(session_context_, subgraph_context_)) {
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..140ab1ac688ba
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,88 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::exception& ex, enum class type exception_type)
+      : message_{ex.what()},
+        type_{exception_type},
+        error_code_{ze_result_code_from_string(message_)},
+        error_name_{ze_result_name_from_string(message_)} {}
+
+  ovep_exception(const std::string& message, enum class type exception_type)
+      : message_{message},
+        type_{exception_type},
+        error_code_{ze_result_code_from_string(message)},
+        error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::EP_FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+}  // namespace openvino_ep
+}  // namespace onnxruntime
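The new `ovep_exception` centralizes the Level Zero string parsing that previously lived inline in `BackendManager`. A minimal standalone sketch of what the two regex helpers recover, assuming a hypothetical NPU failure message (illustration only, not code from this PR):

```cpp
// Sketch: mirrors ze_result_name_from_string() / ze_result_code_from_string().
// The failure string below is a made-up example of an intel_npu plugin message.
#include <charconv>
#include <cstdint>
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string msg = "intel_npu: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f";

  std::smatch m;
  std::string name = "UNKNOWN NPU ERROR";
  if (std::regex_search(msg, m, std::regex(R"(\bZE_\w*\b)"))) name = m[0];  // error name

  uint32_t code = 0;
  if (std::regex_search(msg, m, std::regex("code 0x([0-9a-fA-F]+)"))) {
    // Parse the hex digits captured after "code 0x".
    std::from_chars(&(*m[1].first), &(*m[1].second), code, 16);
  }

  // 0x7800000f on an import is mapped to INVALID_GRAPH ("Model needs to be recompiled").
  std::cout << name << ", code 0x" << std::hex << code << '\n';
}
```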
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index f9c9fa2ea6f48..6dc7328d696da 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "openvino/core/version.hpp"
 #ifdef USE_OVEP_NPU_MEMORY
@@ -103,107 +104,111 @@ common::Status OpenVINOExecutionProvider::Compile(
   auto& logger = *GetLogger();
   Status status = Status::OK();
 
-  if (session_context_.so_context_enable && session_context_.so_context_embed_mode && session_context_.so_share_ep_contexts) {
-    return Status(common::StatusCategory::ONNXRUNTIME, common::EP_FAIL,
-                  std::string("Invalid EP context configuration: ") + kOrtSessionOptionEpContextEmbedMode + " must be 0 if " + kOrtSessionOptionShareEpContexts + " is 1.");
-  }
+  try {
+    if (session_context_.so_context_enable && session_context_.so_context_embed_mode && session_context_.so_share_ep_contexts) {
+      return Status(common::StatusCategory::ONNXRUNTIME, common::EP_FAIL,
+                    std::string("Invalid EP context configuration: ") + kOrtSessionOptionEpContextEmbedMode + " must be 0 if " + kOrtSessionOptionShareEpContexts + " is 1.");
+    }
 
-  bool is_epctx_model = false;
-  if (!fused_nodes.empty()) {
-    // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
-    const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
-    session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
-    session_context_.onnx_opset_version =
-        graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
-
-    // OVIR wrapped in epctx should be treated as source but this code does not
-    // This corner case is not in use and will be addressed in a future commit
-    is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0);
-  }
+    bool is_epctx_model = false;
+    if (!fused_nodes.empty()) {
+      // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
+      const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
+      session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
+      session_context_.onnx_opset_version =
+          graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
+
+      // OVIR wrapped in epctx should be treated as source but this code does not
+      // This corner case is not in use and will be addressed in a future commit
+      is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0);
+    }
 
-  if (is_epctx_model) {
-    ep_ctx_handle_.Initialize(fused_nodes, session_context_.GetOutputBinPath().parent_path());
-  }
+    if (is_epctx_model) {
+      ep_ctx_handle_.Initialize(fused_nodes, session_context_.GetOutputBinPath().parent_path());
+    }
 
-  struct OpenVINOEPFunctionState {
-    AllocateFunc allocate_func = nullptr;
-    DestroyFunc destroy_func = nullptr;
-    AllocatorHandle allocator_handle = nullptr;
-    BackendManager& backend_manager;
-  };
-
-  for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
-    const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
-    const Node& fused_node = fused_node_graph.fused_node;
-
-    NodeComputeInfo compute_info;
-
-    // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
-    // For precompiled blob, directly load the model instead of compiling the model
-    // For original model, check if the user wants to export a model with pre-compiled blob
-
-    auto& backend_manager = backend_managers_.emplace_back(session_context_,
-                                                           *shared_context_manager_,
-                                                           fused_node,
-                                                           graph_body_viewer,
-                                                           logger,
-                                                           ep_ctx_handle_);
-    compute_info.create_state_func =
-        [&backend_manager](ComputeContext* context, FunctionState* state) {
-          OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{
-              .allocate_func = context->allocate_func,
-              .destroy_func = context->release_func,
-              .allocator_handle = context->allocator_handle,
-              .backend_manager = backend_manager};
-          *state = static_cast<FunctionState>(p);
-          return 0;
-        };
-
-    compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
-      auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
-      try {
-        function_state->backend_manager.Compute(context);
-      } catch (const std::exception& ex) {
-        return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
-      }
-      return Status::OK();
+    struct OpenVINOEPFunctionState {
+      AllocateFunc allocate_func = nullptr;
+      DestroyFunc destroy_func = nullptr;
+      AllocatorHandle allocator_handle = nullptr;
+      BackendManager& backend_manager;
     };
 
-    compute_info.release_state_func =
-        [](FunctionState state) {
-          if (state) {
-            OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
-            delete function_state;
-          }
-        };
+    for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
+      const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
+      const Node& fused_node = fused_node_graph.fused_node;
+
+      NodeComputeInfo compute_info;
+
+      // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
+      // For precompiled blob, directly load the model instead of compiling the model
+      // For original model, check if the user wants to export a model with pre-compiled blob
+
+      auto& backend_manager = backend_managers_.emplace_back(session_context_,
+                                                             *shared_context_manager_,
+                                                             fused_node,
+                                                             graph_body_viewer,
+                                                             logger,
+                                                             ep_ctx_handle_);
+      compute_info.create_state_func =
+          [&backend_manager](ComputeContext* context, FunctionState* state) {
+            OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{
+                .allocate_func = context->allocate_func,
+                .destroy_func = context->release_func,
+                .allocator_handle = context->allocator_handle,
+                .backend_manager = backend_manager};
+            *state = static_cast<FunctionState>(p);
+            return 0;
+          };
+
+      compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
+        auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
+        try {
+          function_state->backend_manager.Compute(context);
+        } catch (const std::exception& ex) {
+          return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
+        }
+        return Status::OK();
+      };
 
-    node_compute_funcs.push_back(std::move(compute_info));
-  }
+      compute_info.release_state_func =
+          [](FunctionState state) {
+            if (state) {
+              OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
+              delete function_state;
+            }
+          };
 
-  // Export compiled blobs as EPContext nodes if context enable is set
-  if (session_context_.so_context_enable) {
-    auto backend_it = backend_managers_.begin();
-    bool is_first = true;
+      node_compute_funcs.push_back(std::move(compute_info));
+    }
 
-    for (const auto& fused_node_graph : fused_nodes) {
-      const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
+    // Export compiled blobs as EPContext nodes if context enable is set
+    if (session_context_.so_context_enable) {
+      auto backend_it = backend_managers_.begin();
+      bool is_first = true;
 
-      // Set include_embed_data to true only for the first backend manager
-      backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first);
+      for (const auto& fused_node_graph : fused_nodes) {
+        const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
 
-      is_first = false;
-      ++backend_it;
-    }
+        // Set include_embed_data to true only for the first backend manager
+        backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first);
+
+        is_first = false;
+        ++backend_it;
+      }
 
-    // bit clunky ideally we should try to fold this into ep context handler
-    if (!session_context_.so_context_embed_mode) {
-      auto shared_context = shared_context_manager_->GetOrCreateActiveSharedContext(session_context_.GetOutputBinPath());
-      shared_context->Serialize();
-      if (session_context_.so_stop_share_ep_contexts) {
-        shared_context_manager_->ClearActiveSharedContext();
-        shared_context->Clear();
+      // bit clunky ideally we should try to fold this into ep context handler
+      if (!session_context_.so_context_embed_mode) {
+        auto shared_context = shared_context_manager_->GetOrCreateActiveSharedContext(session_context_.GetOutputBinPath());
+        shared_context->Serialize();
+        if (session_context_.so_stop_share_ep_contexts) {
+          shared_context_manager_->ClearActiveSharedContext();
+          shared_context->Clear();
+        }
       }
     }
+  } catch (const ovep_exception& ex) {
+    status = ex;
   }
 
   return status;
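`Compile()` now funnels every failure through a single `catch` site, and `status = ex;` compiles because `ovep_exception` defines an implicit `operator common::Status()`. A reduced sketch of that pattern with simplified stand-in types (not the actual ORT classes):

```cpp
// Sketch: an exception type that converts implicitly to a status object,
// so a catch block can simply assign it. Types here are stand-ins.
#include <iostream>
#include <string>

struct Status {
  int code = 0;  // 0 == OK
  std::string message;
};

struct typed_exception {
  std::string message;
  operator Status() const { return {1, message}; }  // like ovep_exception::operator common::Status()
};

Status CompileLike() {
  Status status{};  // Status::OK() equivalent
  try {
    throw typed_exception{"ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f"};
  } catch (const typed_exception& ex) {
    status = ex;  // implicit conversion, same shape as the PR's catch block
  }
  return status;
}

int main() { std::cout << CompileLike().message << '\n'; }
```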
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 85fc4d93d6243..446ed098521cb 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -12,16 +12,21 @@
 #include "core/providers/openvino/backends/basic_backend.h"
 #include "core/providers/openvino/ov_stateful_patch_utils.h"
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
+#include "core/providers/openvino/exceptions.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
 
-template <typename Func, typename... Args>
+template <bool typed, typename Func, typename... Args>
 inline auto OvExceptionBoundary(Func&& func, std::format_string<Args...>&& fmt, Args&&... args) {
   try {
     return func();
   } catch (const ov::Exception& e) {
-    ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what()));
+    if constexpr (typed) {
+      throw ovep_exception(e, ovep_exception::type::import_model);
+    } else {
+      ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what()));
+    }
   } catch (...) {
     ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)));
   }
@@ -70,7 +75,7 @@ std::optional<bool> queryOVProperty(const std::string& property, const std::string& device) {
 }
 
 std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::string& model_path) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     std::istringstream modelStringStream(std::move(model));
     std::istream& modelStream = modelStringStream;
     // Try to load with FrontEndManager
@@ -88,7 +93,7 @@ std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::string& model_path) {
       ORT_THROW(log_tag + "Unknown exception while Reading network");
     }
   },
-                             "Exception while Reading network");
+                                    "Exception while Reading network");
 }
 
 OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
@@ -156,7 +161,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
                                   ov::AnyMap& device_config,
                                   bool enable_causallm,
                                   const std::string& name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     OVExeNetwork exe;
     if (enable_causallm) {
       auto mutable_model = ie_cnn_network->clone();
@@ -172,14 +177,14 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
 
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                    "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
                                   std::string& hw_target,
                                   ov::AnyMap& device_config,
                                   const std::string& name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     ov::CompiledModel obj;
     obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config);
@@ -189,14 +194,14 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
     OVExeNetwork exe(obj, hw_target);
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                    "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
                                  std::string hw_target,
                                  const ov::AnyMap& device_config,
                                  std::string name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<true>([&]() {
     ov::CompiledModel obj;
 #if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3))
     if (model_blob.tensor_) {
@@ -205,7 +210,7 @@ OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
       obj = core.import_model(*model_blob.stream_, hw_target, device_config);
     }
 #else
-  obj = core.import_model(*model_blob.stream_, hw_target, device_config);
+    obj = core.import_model(*model_blob.stream_, hw_target, device_config);
 #endif
 
     OVExeNetwork exe(obj, hw_target);
@@ -214,7 +219,7 @@ OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
 #endif
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                   "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
@@ -222,7 +227,7 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
                                                   const ov::AnyMap& device_config,
                                                   bool enable_causallm,
                                                   std::filesystem::path model_file_path) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<true>([&]() {
     OVExeNetwork exe;
 
     bool isXML = backend_utils::IsModelStreamXML(model_stream);
@@ -267,7 +272,11 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
 #endif
     return exe;
   },
-                             "Exception while Loading Network from OVIR model file: {}", model_file_path.string());
+                                  "Exception while Loading Network from OVIR model file: {}", model_file_path.string());
+}
+
+void OVCore::SetCache(const std::string& cache_dir_path) {
+  core.set_property(ov::cache_dir(cache_dir_path));
 }
 
 std::vector<std::string> OVCore::GetAvailableDevices() const {
@@ -308,8 +317,12 @@ std::vector<std::string> OVCore::GetAvailableDevices(const std::string& device_type) const {
   return available_devices;
 }
 
+void OVCore::SetStreams(const std::string& device_type, int num_streams) {
+  core.set_property(device_type, {ov::num_streams(num_streams)});
+}
+
 std::shared_ptr<OVInferRequest> OVExeNetwork::CreateInferRequest() {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     auto infReq = compiled_model_obj.create_infer_request();
     std::shared_ptr<OVInferRequest> ovInfReq;
     if (is_stateful_causallm) {
@@ -320,31 +333,31 @@ std::shared_ptr<OVInferRequest> OVExeNetwork::CreateInferRequest() {
 
     return ovInfReq;
   },
-                             "Exception while creating InferRequest object");
+                                    "Exception while creating InferRequest object");
 }
 
 OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     auto tobj = ovInfReq.get_tensor(input_name);
     OVTensorPtr blob = std::make_shared<OVTensor>(tobj);
     return blob;
   },
-                             " Cannot access IE Blob for input: {}", input_name);
+                                    " Cannot access IE Blob for input: {}", input_name);
 }
 
 std::string OVInferRequest::GetInputTensorName(uint32_t index) {
-  return OvExceptionBoundary([&]() -> const std::string& {
+  return OvExceptionBoundary<false>([&]() {
     const auto& model = ovInfReq.get_compiled_model();
     return *model.input(index).get_names().begin();
   },
-                             " Cannot access IE Blob for input number: {}", index);
+                                    " Cannot access IE Blob for input number: {}", index);
 }
 
 void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) {
-  OvExceptionBoundary([&]() {
+  OvExceptionBoundary<false>([&]() {
     ovInfReq.set_tensor(name, *(blob.get()));
   },
-                      " Cannot set Remote Blob for output: {}", name);
+                             " Cannot set Remote Blob for output: {}", name);
 }
 
 uint32_t OVInferRequest::GetNumInputs() {
@@ -352,20 +365,16 @@ uint32_t OVInferRequest::GetNumInputs() {
 }
 
 void OVInferRequest::Infer() {
-  OvExceptionBoundary([&]() {
+  OvExceptionBoundary<false>([&]() {
     ovInfReq.infer();
   },
-                      "In Error Couldn't start Inference");
+                             "In Error Couldn't start Inference");
 }
 
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-
-  // check if there is input_ids tensors and if the tensor type is int64,
-  // because logic prefill_use_full_chat_history is only for specific inputs and data type
-  auto input_ids_opt = FindTensor("input_ids");
-  if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) {
+  if (gpu_or_npu) {
     prefill_use_full_chat_history = true;
   }
 }
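`OvExceptionBoundary` now takes a `bool typed` template parameter: the import paths opt into throwing the typed `ovep_exception` (so the driver's invalid-native-binary error can be classified upstream), while the remaining call sites keep the legacy `ORT_THROW` text. A self-contained sketch of that `if constexpr` dispatch, with a simplified signature and no message formatting (assumptions, not the real helper):

```cpp
// Sketch: compile-time choice between propagating a typed exception and
// wrapping the message, as OvExceptionBoundary<true/false> does.
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

template <bool typed, typename Func>
auto Boundary(Func&& func) {
  try {
    return std::forward<Func>(func)();
  } catch (const std::exception& e) {
    if constexpr (typed) {
      throw;  // stand-in for: throw ovep_exception(e, ovep_exception::type::import_model);
    } else {
      throw std::runtime_error(std::string("[OpenVINO-EP] ") + e.what());  // stand-in for ORT_THROW
    }
  }
}

int main() {
  try {
    Boundary<false>([] { throw std::runtime_error("boom"); return 0; });
  } catch (const std::exception& e) {
    std::cout << e.what() << '\n';  // prints "[OpenVINO-EP] boom"
  }
}
```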
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8765cd040d098..8a55fdcbd4fb4 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -81,6 +81,8 @@ struct OVCore : WeakSingleton<OVCore> {
 
   std::vector<std::string> GetAvailableDevices() const;
   std::vector<std::string> GetAvailableDevices(const std::string& device_type) const;
+  void SetCache(const std::string& cache_dir_path);
+  void SetStreams(const std::string& device_type, int num_streams);
 };
 
 class OVExeNetwork {
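The two helpers added to `OVCore` are thin wrappers over standard OpenVINO properties. Equivalent plain-OpenVINO usage (assumes an OpenVINO development environment; the cache path and device name are arbitrary examples):

```cpp
// Sketch: what OVCore::SetCache / OVCore::SetStreams forward to.
#include <openvino/openvino.hpp>

int main() {
  ov::Core core;

  // OVCore::SetCache(cache_dir_path):
  core.set_property(ov::cache_dir("/tmp/ov_cache"));

  // OVCore::SetStreams(device_type, num_streams):
  core.set_property("CPU", {ov::num_streams(4)});
}
```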