diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index abb5b31b76e44..eed08ee673e49 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -21,6 +21,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h"
 
 #include "../../framework/tensorprotoutils.h"
@@ -157,40 +158,11 @@ BackendManager::BackendManager(SessionContext& session_context,
     subgraph_context_.has_dynamic_input_shape = false;
 
     // OV NPU plugin is supported with fallback to OV CPU upon compilation failures.
-    try {
-      concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                      session_context_,
-                                                      subgraph_context_,
-                                                      *shared_context_,
-                                                      model_stream);
-    } catch (const OnnxRuntimeException& ex) {
-      std::string exception_str = ex.what();
-
-      if (session_context_.device_type.find("NPU") != std::string::npos &&
-          exception_str.find("intel_npu") != std::string::npos) {
-        // Handle NPU device related errors
-#ifndef NDEBUG
-        std::string suffix = session_context_.so_disable_cpu_ep_fallback ? "\nModel failed to compile on NPU. Enable CPU fallback or try another device.\n" : "\nModel needs to be recompiled\n";
-        ORT_THROW(exception_str + suffix);
-#else
-        std::string error_message = "UNKNOWN NPU ERROR";
-        std::string error_code = "code 0x0";
-        std::regex error_message_pattern(R"(\bZE_\w*\b)");
-        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-        std::smatch matches;
-        if (std::regex_search(exception_str, matches, error_message_pattern)) {
-          error_message = matches[0];
-        }
-        if (std::regex_search(exception_str, matches, error_code_pattern)) {
-          error_code = matches[0];
-        }
-        std::string suffix = session_context_.so_disable_cpu_ep_fallback ? "\nModel failed to compile on NPU. Enable CPU fallback or try another device.\n" : "\nModel needs to be recompiled\n";
-        throw std::runtime_error(error_message + ", " + error_code + suffix);
-#endif
-      } else {
-        ORT_THROW(exception_str);
-      }
-    }
+    concrete_backend_ = BackendFactory::MakeBackend(model_proto,
+                                                    session_context_,
+                                                    subgraph_context_,
+                                                    *shared_context_,
+                                                    model_stream);
   }
 
   if (ShouldExportEpContext(session_context_, subgraph_context_)) {
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..140ab1ac688ba
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,88 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::exception& ex, enum class type exception_type)
+      : message_{ex.what()},
+        type_{exception_type},
+        error_code_{ze_result_code_from_string(message_)},
+        error_name_{ze_result_name_from_string(message_)} {}
+
+  ovep_exception(const std::string& message, enum class type exception_type)
+      : message_{message},
+        type_{exception_type},
+        error_code_{ze_result_code_from_string(message)},
+        error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::EP_FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+}  // namespace openvino_ep
+}  // namespace onnxruntime
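The new `ovep_exception` centralizes the Level Zero string parsing that previously lived inline in `BackendManager`. A minimal standalone sketch of what the two regex helpers recover, assuming a hypothetical NPU failure message (illustration only, not code from this PR):

```cpp
// Sketch: mirrors ze_result_name_from_string() / ze_result_code_from_string().
// The failure string below is a made-up example of an intel_npu plugin message.
#include <charconv>
#include <cstdint>
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string msg = "intel_npu: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f";

  std::smatch m;
  std::string name = "UNKNOWN NPU ERROR";
  if (std::regex_search(msg, m, std::regex(R"(\bZE_\w*\b)"))) name = m[0];  // error name

  uint32_t code = 0;
  if (std::regex_search(msg, m, std::regex("code 0x([0-9a-fA-F]+)"))) {
    // Parse the hex digits captured after "code 0x".
    std::from_chars(&(*m[1].first), &(*m[1].second), code, 16);
  }

  // 0x7800000f on an import is mapped to INVALID_GRAPH ("Model needs to be recompiled").
  std::cout << name << ", code 0x" << std::hex << code << '\n';
}
```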
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index f9c9fa2ea6f48..6dc7328d696da 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/exceptions.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "openvino/core/version.hpp"
 #ifdef USE_OVEP_NPU_MEMORY
@@ -103,107 +104,111 @@ common::Status OpenVINOExecutionProvider::Compile(
   auto& logger = *GetLogger();
   Status status = Status::OK();
 
-  if (session_context_.so_context_enable && session_context_.so_context_embed_mode && session_context_.so_share_ep_contexts) {
-    return Status(common::StatusCategory::ONNXRUNTIME, common::EP_FAIL,
-                  std::string("Invalid EP context configuration: ") + kOrtSessionOptionEpContextEmbedMode + " must be 0 if " + kOrtSessionOptionShareEpContexts + " is 1.");
-  }
+  try {
+    if (session_context_.so_context_enable && session_context_.so_context_embed_mode && session_context_.so_share_ep_contexts) {
+      return Status(common::StatusCategory::ONNXRUNTIME, common::EP_FAIL,
+                    std::string("Invalid EP context configuration: ") + kOrtSessionOptionEpContextEmbedMode + " must be 0 if " + kOrtSessionOptionShareEpContexts + " is 1.");
+    }
 
-  bool is_epctx_model = false;
-  if (!fused_nodes.empty()) {
-    // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
-    const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
-    session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
-    session_context_.onnx_opset_version =
-        graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
-
-    // OVIR wrapped in epctx should be treated as source but this code does not
-    // This corner case is not in use and will be addressed in a future commit
-    is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0);
-  }
+    bool is_epctx_model = false;
+    if (!fused_nodes.empty()) {
+      // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext
+      const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get();
+      session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string();
+      session_context_.onnx_opset_version =
+          graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain);
+
+      // OVIR wrapped in epctx should be treated as source but this code does not
+      // This corner case is not in use and will be addressed in a future commit
+      is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0);
+    }
 
-  if (is_epctx_model) {
-    ep_ctx_handle_.Initialize(fused_nodes, session_context_.GetOutputBinPath().parent_path());
-  }
+    if (is_epctx_model) {
+      ep_ctx_handle_.Initialize(fused_nodes, session_context_.GetOutputBinPath().parent_path());
+    }
 
-  struct OpenVINOEPFunctionState {
-    AllocateFunc allocate_func = nullptr;
-    DestroyFunc destroy_func = nullptr;
-    AllocatorHandle allocator_handle = nullptr;
-    BackendManager& backend_manager;
-  };
-
-  for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
-    const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
-    const Node& fused_node = fused_node_graph.fused_node;
-
-    NodeComputeInfo compute_info;
-
-    // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
-    // For precompiled blob, directly load the model instead of compiling the model
-    // For original model, check if the user wants to export a model with pre-compiled blob
-
-    auto& backend_manager = backend_managers_.emplace_back(session_context_,
-                                                           *shared_context_manager_,
-                                                           fused_node,
-                                                           graph_body_viewer,
-                                                           logger,
-                                                           ep_ctx_handle_);
-    compute_info.create_state_func =
-        [&backend_manager](ComputeContext* context, FunctionState* state) {
-          OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{
-              .allocate_func = context->allocate_func,
-              .destroy_func = context->release_func,
-              .allocator_handle = context->allocator_handle,
-              .backend_manager = backend_manager};
-          *state = static_cast<FunctionState>(p);
-          return 0;
-        };
-
-    compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
-      auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
-      try {
-        function_state->backend_manager.Compute(context);
-      } catch (const std::exception& ex) {
-        return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
-      }
-      return Status::OK();
+    struct OpenVINOEPFunctionState {
+      AllocateFunc allocate_func = nullptr;
+      DestroyFunc destroy_func = nullptr;
+      AllocatorHandle allocator_handle = nullptr;
+      BackendManager& backend_manager;
     };
 
-    compute_info.release_state_func =
-        [](FunctionState state) {
-          if (state) {
-            OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
-            delete function_state;
-          }
-        };
+    for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
+      const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
+      const Node& fused_node = fused_node_graph.fused_node;
+
+      NodeComputeInfo compute_info;
+
+      // During backend creation, we check if user wants to use precompiled blob onnx model or the original model
+      // For precompiled blob, directly load the model instead of compiling the model
+      // For original model, check if the user wants to export a model with pre-compiled blob
+
+      auto& backend_manager = backend_managers_.emplace_back(session_context_,
+                                                             *shared_context_manager_,
+                                                             fused_node,
+                                                             graph_body_viewer,
+                                                             logger,
+                                                             ep_ctx_handle_);
+      compute_info.create_state_func =
+          [&backend_manager](ComputeContext* context, FunctionState* state) {
+            OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{
+                .allocate_func = context->allocate_func,
+                .destroy_func = context->release_func,
+                .allocator_handle = context->allocator_handle,
+                .backend_manager = backend_manager};
+            *state = static_cast<FunctionState>(p);
+            return 0;
+          };
+
+      compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
+        auto function_state = static_cast<OpenVINOEPFunctionState*>(state);
+        try {
+          function_state->backend_manager.Compute(context);
+        } catch (const std::exception& ex) {
+          return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what());
+        }
+        return Status::OK();
+      };
 
-    node_compute_funcs.push_back(std::move(compute_info));
-  }
+      compute_info.release_state_func =
+          [](FunctionState state) {
+            if (state) {
+              OpenVINOEPFunctionState* function_state = static_cast<OpenVINOEPFunctionState*>(state);
+              delete function_state;
+            }
+          };
 
-  // Export compiled blobs as EPContext nodes if context enable is set
-  if (session_context_.so_context_enable) {
-    auto backend_it = backend_managers_.begin();
-    bool is_first = true;
+      node_compute_funcs.push_back(std::move(compute_info));
+    }
 
-    for (const auto& fused_node_graph : fused_nodes) {
-      const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
+    // Export compiled blobs as EPContext nodes if context enable is set
+    if (session_context_.so_context_enable) {
+      auto backend_it = backend_managers_.begin();
+      bool is_first = true;
 
-      // Set include_embed_data to true only for the first backend manager
-      backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first);
+      for (const auto& fused_node_graph : fused_nodes) {
+        const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
 
-      is_first = false;
-      ++backend_it;
-    }
+        // Set include_embed_data to true only for the first backend manager
+        backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first);
+
+        is_first = false;
+        ++backend_it;
+      }
 
-    // bit clunky ideally we should try to fold this into ep context handler
-    if (!session_context_.so_context_embed_mode) {
-      auto shared_context = shared_context_manager_->GetOrCreateActiveSharedContext(session_context_.GetOutputBinPath());
-      shared_context->Serialize();
-      if (session_context_.so_stop_share_ep_contexts) {
-        shared_context_manager_->ClearActiveSharedContext();
-        shared_context->Clear();
+      // bit clunky ideally we should try to fold this into ep context handler
+      if (!session_context_.so_context_embed_mode) {
+        auto shared_context = shared_context_manager_->GetOrCreateActiveSharedContext(session_context_.GetOutputBinPath());
+        shared_context->Serialize();
+        if (session_context_.so_stop_share_ep_contexts) {
+          shared_context_manager_->ClearActiveSharedContext();
+          shared_context->Clear();
+        }
       }
     }
+  } catch (const ovep_exception& ex) {
+    status = ex;
   }
 
   return status;
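`Compile()` now funnels every failure through a single `catch` site, and `status = ex;` compiles because `ovep_exception` defines an implicit `operator common::Status()`. A reduced sketch of that pattern with simplified stand-in types (not the actual ORT classes):

```cpp
// Sketch: an exception type that converts implicitly to a status object,
// so a catch block can simply assign it. Types here are stand-ins.
#include <iostream>
#include <string>

struct Status {
  int code = 0;  // 0 == OK
  std::string message;
};

struct typed_exception {
  std::string message;
  operator Status() const { return {1, message}; }  // like ovep_exception::operator common::Status()
};

Status CompileLike() {
  Status status{};  // Status::OK() equivalent
  try {
    throw typed_exception{"ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f"};
  } catch (const typed_exception& ex) {
    status = ex;  // implicit conversion, same shape as the PR's catch block
  }
  return status;
}

int main() { std::cout << CompileLike().message << '\n'; }
```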
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 85fc4d93d6243..446ed098521cb 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -12,16 +12,21 @@
 #include "core/providers/openvino/backends/basic_backend.h"
 #include "core/providers/openvino/ov_stateful_patch_utils.h"
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
+#include "core/providers/openvino/exceptions.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
 
-template <typename Func, typename... Args>
+template <bool typed, typename Func, typename... Args>
 inline auto OvExceptionBoundary(Func&& func, std::format_string<Args...>&& fmt, Args&&... args) {
   try {
     return func();
   } catch (const ov::Exception& e) {
-    ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what()));
+    if constexpr (typed) {
+      throw ovep_exception(e, ovep_exception::type::import_model);
+    } else {
+      ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what()));
+    }
   } catch (...) {
     ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)));
   }
@@ -70,7 +75,7 @@ std::optional<bool> queryOVProperty(const std::string& property, const std::string& device) {
 }
 
 std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::string& model_path) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     std::istringstream modelStringStream(std::move(model));
     std::istream& modelStream = modelStringStream;
     // Try to load with FrontEndManager
@@ -88,7 +93,7 @@ std::shared_ptr<OVNetwork> OVCore::ReadModel(std::string&& model, const std::string& model_path) {
       ORT_THROW(log_tag + "Unknown exception while Reading network");
     }
   },
-                             "Exception while Reading network");
+                                    "Exception while Reading network");
 }
 
 OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
@@ -156,7 +161,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
                                   ov::AnyMap& device_config,
                                   bool enable_causallm,
                                   const std::string& name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     OVExeNetwork exe;
     if (enable_causallm) {
       auto mutable_model = ie_cnn_network->clone();
@@ -172,14 +177,14 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_network,
 
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                    "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
                                   std::string& hw_target,
                                   ov::AnyMap& device_config,
                                   const std::string& name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     ov::CompiledModel obj;
     obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config);
@@ -189,14 +194,14 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
     OVExeNetwork exe(obj, hw_target);
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                    "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
                                  std::string hw_target,
                                  const ov::AnyMap& device_config,
                                  std::string name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<true>([&]() {
     ov::CompiledModel obj;
 #if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3))
     if (model_blob.tensor_) {
@@ -205,7 +210,7 @@ OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
       obj = core.import_model(*model_blob.stream_, hw_target, device_config);
     }
 #else
-  obj = core.import_model(*model_blob.stream_, hw_target, device_config);
+    obj = core.import_model(*model_blob.stream_, hw_target, device_config);
 #endif
 
     OVExeNetwork exe(obj, hw_target);
@@ -214,7 +219,7 @@ OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob,
 #endif
     return exe;
   },
-                             "Exception while Loading Network for graph {}", name);
+                                   "Exception while Loading Network for graph {}", name);
 }
 
 OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
@@ -222,7 +227,7 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
                                                   const ov::AnyMap& device_config,
                                                   bool enable_causallm,
                                                   std::filesystem::path model_file_path) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<true>([&]() {
     OVExeNetwork exe;
 
     bool isXML = backend_utils::IsModelStreamXML(model_stream);
@@ -267,7 +272,11 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream,
 #endif
     return exe;
   },
-                             "Exception while Loading Network from OVIR model file: {}", model_file_path.string());
+                                  "Exception while Loading Network from OVIR model file: {}", model_file_path.string());
+}
+
+void OVCore::SetCache(const std::string& cache_dir_path) {
+  core.set_property(ov::cache_dir(cache_dir_path));
 }
 
 std::vector<std::string> OVCore::GetAvailableDevices() const {
@@ -308,8 +317,12 @@ std::vector<std::string> OVCore::GetAvailableDevices(const std::string& device_type) const {
   return available_devices;
 }
 
+void OVCore::SetStreams(const std::string& device_type, int num_streams) {
+  core.set_property(device_type, {ov::num_streams(num_streams)});
+}
+
 std::shared_ptr<OVInferRequest> OVExeNetwork::CreateInferRequest() {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     auto infReq = compiled_model_obj.create_infer_request();
     std::shared_ptr<OVInferRequest> ovInfReq;
     if (is_stateful_causallm) {
@@ -320,31 +333,31 @@ std::shared_ptr<OVInferRequest> OVExeNetwork::CreateInferRequest() {
 
     return ovInfReq;
   },
-                             "Exception while creating InferRequest object");
+                                    "Exception while creating InferRequest object");
 }
 
 OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) {
-  return OvExceptionBoundary([&]() {
+  return OvExceptionBoundary<false>([&]() {
     auto tobj = ovInfReq.get_tensor(input_name);
     OVTensorPtr blob = std::make_shared<OVTensor>(tobj);
     return blob;
   },
-                             " Cannot access IE Blob for input: {}", input_name);
+                                    " Cannot access IE Blob for input: {}", input_name);
 }
 
 std::string OVInferRequest::GetInputTensorName(uint32_t index) {
-  return OvExceptionBoundary([&]() -> const std::string& {
+  return OvExceptionBoundary<false>([&]() {
     const auto& model = ovInfReq.get_compiled_model();
     return *model.input(index).get_names().begin();
   },
-                             " Cannot access IE Blob for input number: {}", index);
+                                    " Cannot access IE Blob for input number: {}", index);
 }
 
 void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) {
-  OvExceptionBoundary([&]() {
+  OvExceptionBoundary<false>([&]() {
     ovInfReq.set_tensor(name, *(blob.get()));
   },
-                      " Cannot set Remote Blob for output: {}", name);
+                             " Cannot set Remote Blob for output: {}", name);
 }
 
 uint32_t OVInferRequest::GetNumInputs() {
@@ -352,20 +365,16 @@ uint32_t OVInferRequest::GetNumInputs() {
 }
 
 void OVInferRequest::Infer() {
-  OvExceptionBoundary([&]() {
+  OvExceptionBoundary<false>([&]() {
     ovInfReq.infer();
   },
-                      "In Error Couldn't start Inference");
+                             "In Error Couldn't start Inference");
 }
 
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-
-  // check if there is input_ids tensors and if the tensor type is int64,
-  // because logic prefill_use_full_chat_history is only for specific inputs and data type
-  auto input_ids_opt = FindTensor("input_ids");
-  if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) {
+  if (gpu_or_npu) {
     prefill_use_full_chat_history = true;
   }
 }
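`OvExceptionBoundary` now takes a `bool typed` template parameter: the import paths opt into throwing the typed `ovep_exception` (so the driver's invalid-native-binary error can be classified upstream), while the remaining call sites keep the legacy `ORT_THROW` text. A self-contained sketch of that `if constexpr` dispatch, with a simplified signature and no message formatting (assumptions, not the real helper):

```cpp
// Sketch: compile-time choice between propagating a typed exception and
// wrapping the message, as OvExceptionBoundary<true/false> does.
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

template <bool typed, typename Func>
auto Boundary(Func&& func) {
  try {
    return std::forward<Func>(func)();
  } catch (const std::exception& e) {
    if constexpr (typed) {
      throw;  // stand-in for: throw ovep_exception(e, ovep_exception::type::import_model);
    } else {
      throw std::runtime_error(std::string("[OpenVINO-EP] ") + e.what());  // stand-in for ORT_THROW
    }
  }
}

int main() {
  try {
    Boundary<false>([] { throw std::runtime_error("boom"); return 0; });
  } catch (const std::exception& e) {
    std::cout << e.what() << '\n';  // prints "[OpenVINO-EP] boom"
  }
}
```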
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8765cd040d098..8a55fdcbd4fb4 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -81,6 +81,8 @@ struct OVCore : WeakSingleton<OVCore> {
 
   std::vector<std::string> GetAvailableDevices() const;
   std::vector<std::string> GetAvailableDevices(const std::string& device_type) const;
+  void SetCache(const std::string& cache_dir_path);
+  void SetStreams(const std::string& device_type, int num_streams);
 };
 
 class OVExeNetwork {
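The two helpers added to `OVCore` are thin wrappers over standard OpenVINO properties. Equivalent plain-OpenVINO usage (assumes an OpenVINO development environment; the cache path and device name are arbitrary examples):

```cpp
// Sketch: what OVCore::SetCache / OVCore::SetStreams forward to.
#include <openvino/openvino.hpp>

int main() {
  ov::Core core;

  // OVCore::SetCache(cache_dir_path):
  core.set_property(ov::cache_dir("/tmp/ov_cache"));

  // OVCore::SetStreams(device_type, num_streams):
  core.set_property("CPU", {ov::num_streams(4)});
}
```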