diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h index 00d5033ef2df4..796c330974e86 100644 --- a/include/onnxruntime/core/common/gpu_profiler_common.h +++ b/include/onnxruntime/core/common/gpu_profiler_common.h @@ -443,6 +443,8 @@ class GPUProfilerBase : public EpProfiler { return true; } + virtual void StartRunProfiling() override {} + virtual void EndProfiling(TimePoint start_time, Events& events) override { auto& manager = TManager::GetInstance(); std::map event_map; diff --git a/include/onnxruntime/core/common/profiler_common.h b/include/onnxruntime/core/common/profiler_common.h index ab973256fe5f1..557e88870f6d7 100644 --- a/include/onnxruntime/core/common/profiler_common.h +++ b/include/onnxruntime/core/common/profiler_common.h @@ -83,6 +83,14 @@ class EpProfiler { virtual void EndProfiling(TimePoint start_time, Events& events) = 0; // called when profiling ends, save all captures numbers to "events" virtual void Start(uint64_t) {} // called before op start, accept an id as argument to identify the op virtual void Stop(uint64_t) {} // called after op stop, accept an id as argument to identify the op + + // Run-level profiling support + virtual void StartRunProfiling() {} + virtual void EndRunProfiling(TimePoint start_time, Events& events) { + ORT_UNUSED_PARAMETER(start_time); + ORT_UNUSED_PARAMETER(events); + } + virtual bool IsRunProfilingEnabled() const { return false; } // check if run-level profiling is enabled for current thread }; // Demangle C++ symbols diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index e63ab044834f5..8eef7d1355099 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -35,6 +35,15 @@ struct OrtRunOptions { // So it is possible that only some of the nodes are executed. bool only_execute_path_to_fetches = false; + // Set to 'true' to enable profiling for this run. + // Profiling result will be written to the file with prefix specified by profile_file_prefix. + bool enable_profiling = false; + + // File prefix for profiling result for this run. + // The actual filename will be: _.json + // Only used when enable_profiling is true. + std::string profile_file_prefix = "onnxruntime_profile"; + #ifdef ENABLE_TRAINING // Used by onnxruntime::training::TrainingSession. This class is now deprecated. // Delete training_mode when TrainingSession is deleted. diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 8562e5524af74..78b3e48420ad2 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -20,7 +20,8 @@ profiling::Profiler::~Profiler() {} #endif ::onnxruntime::TimePoint profiling::Profiler::Start() { - ORT_ENFORCE(enabled_); + // Allow Start() when either session-level or run-level profiling is enabled + ORT_ENFORCE(enabled_ || run_level_state_.enabled); auto start_time = std::chrono::high_resolution_clock::now(); auto ts = TimeDiffMicroSeconds(profiling_start_time_, start_time); for (const auto& ep_profiler : ep_profilers_) { @@ -81,13 +82,15 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category, EventRecord event(category, logging::GetProcessId(), logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()}); + + // Session level profiling if (profile_with_logger_) { custom_logger_->SendProfileEvent(event); - } else { + } else if (enabled_) { // TODO: sync_gpu if needed. std::lock_guard lock(mutex_); if (events_.size() < max_num_events_) { - events_.emplace_back(std::move(event)); + events_.emplace_back(event); // copy to session events } else { if (session_logger_ && !max_events_reached) { LOGS(*session_logger_, ERROR) @@ -97,6 +100,15 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category, } } + // Run level profiling (TLS, no lock needed) + if (run_level_state_.enabled) { + // Recalculate ts relative to run start time + long long run_ts = TimeDiffMicroSeconds(run_level_state_.start_time, start_time); + EventRecord run_event(category, logging::GetProcessId(), + logging::GetThreadId(), event_name, run_ts, dur, {event_args.begin(), event_args.end()}); + run_level_state_.events.emplace_back(std::move(run_event)); + } + for (const auto& ep_profiler : ep_profilers_) { ep_profiler->Stop(ts); } @@ -157,5 +169,79 @@ std::string Profiler::EndProfiling() { return profile_stream_file_; } +thread_local Profiler::RunLevelState Profiler::run_level_state_; + +void Profiler::StartRunLevelProfiling() { + run_level_state_.enabled = true; + run_level_state_.events.clear(); + run_level_state_.start_time = std::chrono::high_resolution_clock::now(); + + // Notify EP profilers about run-level profiling start + for (const auto& ep_profiler : ep_profilers_) { + ep_profiler->StartRunProfiling(); + } +} + +bool Profiler::IsRunLevelProfilingEnabled() const { + return run_level_state_.enabled; +} + +std::string Profiler::EndRunLevelProfiling(const std::string& file_path) { + if (!run_level_state_.enabled) { + return std::string(); + } + + // Collect EP profiler events for this run + for (const auto& ep_profiler : ep_profilers_) { + ep_profiler->EndRunProfiling(run_level_state_.start_time, run_level_state_.events); + } + + run_level_state_.enabled = false; + + if (file_path.empty()) { + run_level_state_.events.clear(); + return std::string(); + } + + // Write to file + std::ofstream out(file_path, std::ios::out | std::ios::trunc); + if (!out.is_open()) { + run_level_state_.events.clear(); + return std::string(); + } + + out << "[\n"; + + for (size_t i = 0; i < run_level_state_.events.size(); ++i) { + auto& rec = run_level_state_.events[i]; + out << R"({"cat" : ")" << event_category_names_[rec.cat] << "\","; + out << "\"pid\" :" << rec.pid << ","; + out << "\"tid\" :" << rec.tid << ","; + out << "\"dur\" :" << rec.dur << ","; + out << "\"ts\" :" << rec.ts << ","; + out << R"("ph" : "X",)"; + out << R"("name" :")" << rec.name << "\","; + out << "\"args\" : {"; + bool is_first_arg = true; + for (const auto& event_arg : rec.args) { + if (!is_first_arg) out << ","; + if (!event_arg.second.empty() && (event_arg.second[0] == '{' || event_arg.second[0] == '[')) { + out << "\"" << event_arg.first << "\" : " << event_arg.second; + } else { + out << "\"" << event_arg.first << "\" : \"" << event_arg.second << "\""; + } + is_first_arg = false; + } + out << "}"; + out << (i == run_level_state_.events.size() - 1 ? "}\n" : "},\n"); + } + + out << "]\n"; + out.close(); + + run_level_state_.events.clear(); + return file_path; +} + } // namespace profiling } // namespace onnxruntime diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index 0103d8abb151f..29b6154817200 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -119,6 +119,22 @@ class Profiler { } } + /* + Start run-level profiling for the current thread. + */ + void StartRunLevelProfiling(); + + /* + End run-level profiling for the current thread and write to file. + Returns the file path if successful, empty string otherwise. + */ + std::string EndRunLevelProfiling(const std::string& file_path); + + /* + Whether run-level profiling is enabled for the current thread. + */ + bool IsRunLevelProfilingEnabled() const; + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler); @@ -156,6 +172,14 @@ class Profiler { #endif std::vector> ep_profilers_; + + // TLS: Run level profiling state + struct RunLevelState { + bool enabled = false; + Events events; + TimePoint start_time; + }; + static thread_local RunLevelState run_level_state_; }; } // namespace profiling diff --git a/onnxruntime/core/common/profiler_common.cc b/onnxruntime/core/common/profiler_common.cc index 03a06adcfff2f..e12328d6cb275 100644 --- a/onnxruntime/core/common/profiler_common.cc +++ b/onnxruntime/core/common/profiler_common.cc @@ -12,6 +12,8 @@ namespace onnxruntime { namespace profiling { + + static constexpr int kMaxSymbolSize = 1024; std::string demangle(const char* name) { diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index 7180d976c1d3c..49debcb0beccb 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -177,7 +177,8 @@ class SessionScope { session_state_.GetGraphExecutionCounter(), 0} #endif { - if (session_state_.Profiler().IsEnabled()) { + // Check both session-level and run-level profiling + if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) { session_start_ = session_state.Profiler().Start(); } @@ -225,7 +226,8 @@ class SessionScope { } #endif - if (session_state_.Profiler().IsEnabled()) { + // Check both session-level and run-level profiling + if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) { session_state_.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_); } #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) @@ -344,7 +346,8 @@ class KernelScope { node_compute_range_.Begin(); #endif - if (session_state_.Profiler().IsEnabled()) { + // Check both session-level and run-level profiling + if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) { auto& node = kernel.Node(); node_name_ = node.Name().empty() ? MakeString(node.OpType(), "_", node.Index()) : node.Name(); concurrency::ThreadPool::StartProfiling(session_state_.GetThreadPool()); @@ -363,7 +366,8 @@ class KernelScope { node_compute_range_.End(); #endif - if (session_state_.Profiler().IsEnabled()) { + // Check both session-level and run-level profiling + if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) { auto& profiler = session_state_.Profiler(); std::string output_type_shape_; CalculateTotalOutputSizes(&kernel_context_, total_output_sizes_, node_name_, output_type_shape_); diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.cc b/onnxruntime/core/providers/cuda/cuda_profiler.cc index d220552204cef..0f184a8bf519e 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.cc +++ b/onnxruntime/core/providers/cuda/cuda_profiler.cc @@ -6,6 +6,7 @@ #include #include "cuda_profiler.h" +#include "core/common/logging/logging.h" namespace onnxruntime { namespace profiling { @@ -22,6 +23,16 @@ CudaProfiler::~CudaProfiler() { manager.DeregisterClient(client_handle_); } +void CudaProfiler::StartRunProfiling() { + LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured."; +} + +#else + +void CudaProfiler::StartRunProfiling() { + LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured."; +} + #endif /* #if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ } // namespace profiling diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index 1d8ecddce4c79..44faf4cd6d3fd 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -23,6 +23,7 @@ class CudaProfiler final : public GPUProfilerBase { CudaProfiler(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); ~CudaProfiler(); + void StartRunProfiling() override; }; #else /* #if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */ @@ -34,6 +35,7 @@ class CudaProfiler final : public EpProfiler { ~CudaProfiler() {} bool StartProfiling(TimePoint) override { return true; } void EndProfiling(TimePoint, Events&) override {} + void StartRunProfiling() override; void Start(uint64_t) override {} void Stop(uint64_t) override {} }; diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc index d84507ec6ad02..4307f52178e2f 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc @@ -12,6 +12,10 @@ bool VitisaiProfiler::StartProfiling(TimePoint tp) { return true; } +void VitisaiProfiler::StartRunProfiling() { + LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured."; +} + void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) { auto time_point = std::chrono::duration_cast(tp.time_since_epoch()).count(); diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h index afe4058f7290a..d7e532d6ba452 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_profiler.h +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h @@ -14,6 +14,7 @@ class VitisaiProfiler final : public EpProfiler { ~VitisaiProfiler() {} bool StartProfiling(TimePoint) override; void EndProfiling(TimePoint, Events&) override; + void StartRunProfiling() override; void Start(uint64_t) override {} void Stop(uint64_t) override {} }; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 15263a87a17b6..473622e4fad86 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -978,7 +978,8 @@ Status WebGpuExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_op context_.PushErrorScope(); } - if (profiler_->Enabled()) { + // Start GPU profiling if session-level OR run-level profiling is enabled + if (profiler_->Enabled() || run_options.enable_profiling) { context_.StartProfiling(); } @@ -1000,7 +1001,7 @@ Status WebGpuExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_op return Status::OK(); } -Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxruntime::RunOptions& /* run_options */) { +Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxruntime::RunOptions& run_options) { context_.Flush(BufferManager()); if (IsGraphCaptureEnabled() && !IsGraphCaptured(m_current_graph_annotation_id)) { @@ -1013,8 +1014,12 @@ Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxrunti } } - if (profiler_->Enabled()) { - context_.CollectProfilingData(profiler_->Events()); + // Collect GPU profiling data if session-level OR run-level profiling is enabled + if (profiler_->Enabled() || run_options.enable_profiling) { + onnxruntime::profiling::Events gpu_events; + context_.CollectProfilingData(gpu_events); + // Dispatch to appropriate containers (session-level, run-level, or both) + profiler_->DispatchGpuEvents(gpu_events); } #if defined(ENABLE_PIX_FOR_WEBGPU_EP) diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc index ce973987e593a..e007d8018142e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc @@ -7,6 +7,9 @@ namespace onnxruntime { namespace webgpu { +// TLS definition for run-level profiling +thread_local WebGpuProfiler::RunLevelState WebGpuProfiler::run_level_state_; + WebGpuProfiler::WebGpuProfiler(WebGpuContext& context) : context_{context} {} bool WebGpuProfiler::StartProfiling(TimePoint) { @@ -19,5 +22,44 @@ void WebGpuProfiler::EndProfiling(TimePoint tp, onnxruntime::profiling::Events& enabled_ = false; } +void WebGpuProfiler::StartRunProfiling() { + run_level_state_.enabled = true; + run_level_state_.events.clear(); +} + +void WebGpuProfiler::EndRunProfiling(TimePoint /* start_time */, onnxruntime::profiling::Events& events) { + if (!run_level_state_.enabled) { + return; + } + + // Move collected GPU events to the output + for (auto& event : run_level_state_.events) { + events.emplace_back(std::move(event)); + } + + run_level_state_.events.clear(); + run_level_state_.enabled = false; +} + +bool WebGpuProfiler::IsRunProfilingEnabled() const { + return run_level_state_.enabled; +} + +void WebGpuProfiler::DispatchGpuEvents(onnxruntime::profiling::Events& gpu_events) { + // If session-level profiling is enabled, copy to session events + if (enabled_) { + for (const auto& event : gpu_events) { + events_.emplace_back(event); // copy + } + } + + // If run-level profiling is enabled, move to run-level events + if (run_level_state_.enabled) { + for (auto& event : gpu_events) { + run_level_state_.events.emplace_back(std::move(event)); // move + } + } +} + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.h b/onnxruntime/core/providers/webgpu/webgpu_profiler.h index d826d295a3842..af8b2ce297871 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_profiler.h +++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.h @@ -21,13 +21,33 @@ class WebGpuProfiler final : public onnxruntime::profiling::EpProfiler { } void Stop(uint64_t) override { } + + // Run-level profiling support + void StartRunProfiling() override; + void EndRunProfiling(TimePoint start_time, onnxruntime::profiling::Events& events) override; + bool IsRunProfilingEnabled() const override; + + // Check if session-level profiling is enabled (original semantics) inline bool Enabled() const { return enabled_; } + + // Get session-level events container inline onnxruntime::profiling::Events& Events() { return events_; } + // Dispatch GPU events to appropriate containers based on profiling mode + // Call this when GPU events are collected + void DispatchGpuEvents(onnxruntime::profiling::Events& gpu_events); + private: WebGpuContext& context_; bool enabled_{false}; - onnxruntime::profiling::Events events_; // cached GPU events + onnxruntime::profiling::Events events_; // cached GPU events for session-level + + // TLS for run-level profiling + struct RunLevelState { + bool enabled = false; + onnxruntime::profiling::Events events; + }; + static thread_local RunLevelState run_level_state_; }; } // namespace webgpu diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e5523dc78b5d2..462658d209f83 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2930,6 +2930,11 @@ Status InferenceSession::Run(const RunOptions& run_options, tp = session_profiler_.Start(); } + // Start run-level profiling if enabled + if (run_options.enable_profiling) { + session_profiler_.StartRunLevelProfiling(); + } + #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT TraceLoggingActivity ortrun_activity; ortrun_activity.SetRelatedActivity(session_activity); @@ -3138,6 +3143,14 @@ Status InferenceSession::Run(const RunOptions& run_options, if (session_profiler_.IsEnabled()) { session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, "model_run", tp); } + + // End run-level profiling and write to file if enabled + if (run_options.enable_profiling) { + std::ostringstream profile_ss; + profile_ss << run_options.profile_file_prefix << "_" << GetCurrentTimeString() << ".json"; + session_profiler_.EndRunLevelProfiling(profile_ss.str()); + } + #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT TraceLoggingWriteStop(ortrun_activity, "OrtRun"); #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index f0d8906d99c14..957fbd4191dfc 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -2361,6 +2361,10 @@ RunOptions instance. The individual calls will exit gracefully and return an err #endif .def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches, R"pbdoc(Only execute the nodes needed by fetch list)pbdoc") + .def_readwrite("enable_profiling", &RunOptions::enable_profiling, + R"pbdoc(Enable profiling for this run.)pbdoc") + .def_readwrite("profile_file_prefix", &RunOptions::profile_file_prefix, + R"pbdoc(File prefix for profiling result. The actual filename will be: _.json)pbdoc") .def( "add_run_config_entry", [](RunOptions* options, const char* config_key, const char* config_value) -> void {