diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h
index 00d5033ef2df4..796c330974e86 100644
--- a/include/onnxruntime/core/common/gpu_profiler_common.h
+++ b/include/onnxruntime/core/common/gpu_profiler_common.h
@@ -443,6 +443,8 @@ class GPUProfilerBase : public EpProfiler {
     return true;
   }
 
+  virtual void StartRunProfiling() override {}
+
   virtual void EndProfiling(TimePoint start_time, Events& events) override {
     auto& manager = TManager::GetInstance();
     std::map<uint64_t, Events> event_map;
diff --git a/include/onnxruntime/core/common/profiler_common.h b/include/onnxruntime/core/common/profiler_common.h
index ab973256fe5f1..557e88870f6d7 100644
--- a/include/onnxruntime/core/common/profiler_common.h
+++ b/include/onnxruntime/core/common/profiler_common.h
@@ -83,6 +83,14 @@ class EpProfiler {
   virtual void EndProfiling(TimePoint start_time, Events& events) = 0;  // called when profiling ends, save all captures numbers to "events"
   virtual void Start(uint64_t) {}                                       // called before op start, accept an id as argument to identify the op
   virtual void Stop(uint64_t) {}                                        // called after op stop, accept an id as argument to identify the op
+
+  // Run-level profiling support
+  virtual void StartRunProfiling() {}
+  virtual void EndRunProfiling(TimePoint start_time, Events& events) {
+    ORT_UNUSED_PARAMETER(start_time);
+    ORT_UNUSED_PARAMETER(events);
+  }
+  virtual bool IsRunProfilingEnabled() const { return false; }  // check if run-level profiling is enabled for current thread
 };
 
 // Demangle C++ symbols
diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h
index e63ab044834f5..8eef7d1355099 100644
--- a/include/onnxruntime/core/framework/run_options.h
+++ b/include/onnxruntime/core/framework/run_options.h
@@ -35,6 +35,15 @@ struct OrtRunOptions {
   // So it is possible that only some of the nodes are executed.
   bool only_execute_path_to_fetches = false;
 
+  // Set to 'true' to enable profiling for this run.
+  // Profiling result will be written to the file with prefix specified by profile_file_prefix.
+  bool enable_profiling = false;
+
+  // File prefix for profiling result for this run.
+  // The actual filename will be: <profile_file_prefix>_<timestamp>.json
+  // Only used when enable_profiling is true.
+  std::string profile_file_prefix = "onnxruntime_profile";
+
 #ifdef ENABLE_TRAINING
   // Used by onnxruntime::training::TrainingSession. This class is now deprecated.
   // Delete training_mode when TrainingSession is deleted.
diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc
index 8562e5524af74..78b3e48420ad2 100644
--- a/onnxruntime/core/common/profiler.cc
+++ b/onnxruntime/core/common/profiler.cc
@@ -20,7 +20,8 @@ profiling::Profiler::~Profiler() {}
 #endif
 
 ::onnxruntime::TimePoint profiling::Profiler::Start() {
-  ORT_ENFORCE(enabled_);
+  // Allow Start() when either session-level or run-level profiling is enabled
+  ORT_ENFORCE(enabled_ || run_level_state_.enabled);
   auto start_time = std::chrono::high_resolution_clock::now();
   auto ts = TimeDiffMicroSeconds(profiling_start_time_, start_time);
   for (const auto& ep_profiler : ep_profilers_) {
@@ -81,13 +82,15 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category,
 
   EventRecord event(category, logging::GetProcessId(),
                     logging::GetThreadId(), event_name, ts, dur, {event_args.begin(), event_args.end()});
+
+  // Session level profiling
   if (profile_with_logger_) {
     custom_logger_->SendProfileEvent(event);
-  } else {
+  } else if (enabled_) {
     // TODO: sync_gpu if needed.
     std::lock_guard<std::mutex> lock(mutex_);
     if (events_.size() < max_num_events_) {
-      events_.emplace_back(std::move(event));
+      events_.emplace_back(event);  // copy to session events
     } else {
       if (session_logger_ && !max_events_reached) {
         LOGS(*session_logger_, ERROR)
@@ -97,6 +100,15 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category,
     }
   }
 
+  // Run level profiling (TLS, no lock needed)
+  if (run_level_state_.enabled) {
+    // Recalculate ts relative to run start time
+    long long run_ts = TimeDiffMicroSeconds(run_level_state_.start_time, start_time);
+    EventRecord run_event(category, logging::GetProcessId(),
+                          logging::GetThreadId(), event_name, run_ts, dur, {event_args.begin(), event_args.end()});
+    run_level_state_.events.emplace_back(std::move(run_event));
+  }
+
   for (const auto& ep_profiler : ep_profilers_) {
     ep_profiler->Stop(ts);
   }
@@ -157,5 +169,79 @@ std::string Profiler::EndProfiling() {
   return profile_stream_file_;
 }
 
+thread_local Profiler::RunLevelState Profiler::run_level_state_;
+
+void Profiler::StartRunLevelProfiling() {
+  run_level_state_.enabled = true;
+  run_level_state_.events.clear();
+  run_level_state_.start_time = std::chrono::high_resolution_clock::now();
+
+  // Notify EP profilers about run-level profiling start
+  for (const auto& ep_profiler : ep_profilers_) {
+    ep_profiler->StartRunProfiling();
+  }
+}
+
+bool Profiler::IsRunLevelProfilingEnabled() const {
+  return run_level_state_.enabled;
+}
+
+std::string Profiler::EndRunLevelProfiling(const std::string& file_path) {
+  if (!run_level_state_.enabled) {
+    return std::string();
+  }
+
+  // Collect EP profiler events for this run
+  for (const auto& ep_profiler : ep_profilers_) {
+    ep_profiler->EndRunProfiling(run_level_state_.start_time, run_level_state_.events);
+  }
+
+  run_level_state_.enabled = false;
+
+  if (file_path.empty()) {
+    run_level_state_.events.clear();
+    return std::string();
+  }
+
+  // Write to file
+  std::ofstream out(file_path, std::ios::out | std::ios::trunc);
+  if (!out.is_open()) {
+    run_level_state_.events.clear();
+    return std::string();
+  }
+
+  out << "[\n";
+
+  for (size_t i = 0; i < run_level_state_.events.size(); ++i) {
+    auto& rec = run_level_state_.events[i];
+    out << R"({"cat" : ")" << event_category_names_[rec.cat] << "\",";
+    out << "\"pid\" :" << rec.pid << ",";
+    out << "\"tid\" :" << rec.tid << ",";
+    out << "\"dur\" :" << rec.dur << ",";
+    out << "\"ts\" :" << rec.ts << ",";
+    out << R"("ph" : "X",)";
+    out << R"("name" :")" << rec.name << "\",";
+    out << "\"args\" : {";
+    bool is_first_arg = true;
+    for (const auto& event_arg : rec.args) {
+      if (!is_first_arg) out << ",";
+      if (!event_arg.second.empty() && (event_arg.second[0] == '{' || event_arg.second[0] == '[')) {
+        out << "\"" << event_arg.first << "\" : " << event_arg.second;
+      } else {
+        out << "\"" << event_arg.first << "\" : \"" << event_arg.second << "\"";
+      }
+      is_first_arg = false;
+    }
+    out << "}";
+    out << (i == run_level_state_.events.size() - 1 ? "}\n" : "},\n");
+  }
+
+  out << "]\n";
+  out.close();
+
+  run_level_state_.events.clear();
+  return file_path;
+}
+
 }  // namespace profiling
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h
index 0103d8abb151f..29b6154817200 100644
--- a/onnxruntime/core/common/profiler.h
+++ b/onnxruntime/core/common/profiler.h
@@ -119,6 +119,22 @@ class Profiler {
     }
   }
 
+  /*
+  Start run-level profiling for the current thread.
+  */
+  void StartRunLevelProfiling();
+
+  /*
+  End run-level profiling for the current thread and write to file.
+  Returns the file path if successful, empty string otherwise.
+  */
+  std::string EndRunLevelProfiling(const std::string& file_path);
+
+  /*
+  Whether run-level profiling is enabled for the current thread.
+  */
+  bool IsRunLevelProfilingEnabled() const;
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Profiler);
 
@@ -156,6 +172,14 @@ class Profiler {
 #endif
 
   std::vector<std::unique_ptr<EpProfiler>> ep_profilers_;
+
+  // TLS: Run level profiling state
+  struct RunLevelState {
+    bool enabled = false;
+    Events events;
+    TimePoint start_time;
+  };
+  static thread_local RunLevelState run_level_state_;
 };
 
 }  // namespace profiling
diff --git a/onnxruntime/core/common/profiler_common.cc b/onnxruntime/core/common/profiler_common.cc
index 03a06adcfff2f..e12328d6cb275 100644
--- a/onnxruntime/core/common/profiler_common.cc
+++ b/onnxruntime/core/common/profiler_common.cc
@@ -12,6 +12,8 @@
 namespace onnxruntime {
 namespace profiling {
 
+
+
 static constexpr int kMaxSymbolSize = 1024;
 
 std::string demangle(const char* name) {
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 7180d976c1d3c..49debcb0beccb 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -177,7 +177,8 @@ class SessionScope {
             session_state_.GetGraphExecutionCounter(), 0}
 #endif
   {
-    if (session_state_.Profiler().IsEnabled()) {
+    // Check both session-level and run-level profiling
+    if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) {
       session_start_ = session_state.Profiler().Start();
     }
 
@@ -225,7 +226,8 @@ class SessionScope {
     }
 #endif
 
-    if (session_state_.Profiler().IsEnabled()) {
+    // Check both session-level and run-level profiling
+    if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) {
       session_state_.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", session_start_);
     }
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
@@ -344,7 +346,8 @@ class KernelScope {
     node_compute_range_.Begin();
 #endif
 
-    if (session_state_.Profiler().IsEnabled()) {
+    // Check both session-level and run-level profiling
+    if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) {
       auto& node = kernel.Node();
       node_name_ = node.Name().empty() ? MakeString(node.OpType(), "_", node.Index()) : node.Name();
       concurrency::ThreadPool::StartProfiling(session_state_.GetThreadPool());
@@ -363,7 +366,8 @@ class KernelScope {
     node_compute_range_.End();
 #endif
 
-    if (session_state_.Profiler().IsEnabled()) {
+    // Check both session-level and run-level profiling
+    if (session_state_.Profiler().IsEnabled() || session_state_.Profiler().IsRunLevelProfilingEnabled()) {
       auto& profiler = session_state_.Profiler();
       std::string output_type_shape_;
       CalculateTotalOutputSizes(&kernel_context_, total_output_sizes_, node_name_, output_type_shape_);
diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.cc b/onnxruntime/core/providers/cuda/cuda_profiler.cc
index d220552204cef..0f184a8bf519e 100644
--- a/onnxruntime/core/providers/cuda/cuda_profiler.cc
+++ b/onnxruntime/core/providers/cuda/cuda_profiler.cc
@@ -6,6 +6,7 @@
 #include <iostream>
 
 #include "cuda_profiler.h"
+#include "core/common/logging/logging.h"
 
 namespace onnxruntime {
 namespace profiling {
@@ -22,6 +23,16 @@ CudaProfiler::~CudaProfiler() {
   manager.DeregisterClient(client_handle_);
 }
 
+void CudaProfiler::StartRunProfiling() {
+  LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured.";
+}
+
+#else
+
+void CudaProfiler::StartRunProfiling() {
+  LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured.";
+}
+
 #endif /* #if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
 
 }  // namespace profiling
diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h
index 1d8ecddce4c79..44faf4cd6d3fd 100644
--- a/onnxruntime/core/providers/cuda/cuda_profiler.h
+++ b/onnxruntime/core/providers/cuda/cuda_profiler.h
@@ -23,6 +23,7 @@ class CudaProfiler final : public GPUProfilerBase<CUPTIManager> {
   CudaProfiler();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler);
   ~CudaProfiler();
+  void StartRunProfiling() override;
 };
 
 #else /* #if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING) */
@@ -34,6 +35,7 @@ class CudaProfiler final : public EpProfiler {
   ~CudaProfiler() {}
   bool StartProfiling(TimePoint) override { return true; }
   void EndProfiling(TimePoint, Events&) override {}
+  void StartRunProfiling() override;
   void Start(uint64_t) override {}
   void Stop(uint64_t) override {}
 };
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
index d84507ec6ad02..4307f52178e2f 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
@@ -12,6 +12,10 @@ bool VitisaiProfiler::StartProfiling(TimePoint tp) {
   return true;
 }
 
+void VitisaiProfiler::StartRunProfiling() {
+  LOGS_DEFAULT(WARNING) << "Run-level profiling is not implemented for this EP profiler, so GPU kernel events for this EP cannot be captured.";
+}
+
 void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) {
   auto time_point =
       std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch()).count();
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
index afe4058f7290a..d7e532d6ba452 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_profiler.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
@@ -14,6 +14,7 @@ class VitisaiProfiler final : public EpProfiler {
   ~VitisaiProfiler() {}
   bool StartProfiling(TimePoint) override;
   void EndProfiling(TimePoint, Events&) override;
+  void StartRunProfiling() override;
   void Start(uint64_t) override {}
   void Stop(uint64_t) override {}
 };
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index 15263a87a17b6..473622e4fad86 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -978,7 +978,8 @@ Status WebGpuExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_op
     context_.PushErrorScope();
   }
 
-  if (profiler_->Enabled()) {
+  // Start GPU profiling if session-level OR run-level profiling is enabled
+  if (profiler_->Enabled() || run_options.enable_profiling) {
     context_.StartProfiling();
   }
 
@@ -1000,7 +1001,7 @@ Status WebGpuExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_op
   return Status::OK();
 }
 
-Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxruntime::RunOptions& /* run_options */) {
+Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxruntime::RunOptions& run_options) {
   context_.Flush(BufferManager());
 
   if (IsGraphCaptureEnabled() && !IsGraphCaptured(m_current_graph_annotation_id)) {
@@ -1013,8 +1014,12 @@ Status WebGpuExecutionProvider::OnRunEnd(bool /* sync_stream */, const onnxrunti
     }
   }
 
-  if (profiler_->Enabled()) {
-    context_.CollectProfilingData(profiler_->Events());
+  // Collect GPU profiling data if session-level OR run-level profiling is enabled
+  if (profiler_->Enabled() || run_options.enable_profiling) {
+    onnxruntime::profiling::Events gpu_events;
+    context_.CollectProfilingData(gpu_events);
+    // Dispatch to appropriate containers (session-level, run-level, or both)
+    profiler_->DispatchGpuEvents(gpu_events);
   }
 
 #if defined(ENABLE_PIX_FOR_WEBGPU_EP)
diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc
index ce973987e593a..e007d8018142e 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_profiler.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.cc
@@ -7,6 +7,9 @@
 namespace onnxruntime {
 namespace webgpu {
 
+// TLS definition for run-level profiling
+thread_local WebGpuProfiler::RunLevelState WebGpuProfiler::run_level_state_;
+
 WebGpuProfiler::WebGpuProfiler(WebGpuContext& context) : context_{context} {}
 
 bool WebGpuProfiler::StartProfiling(TimePoint) {
@@ -19,5 +22,44 @@ void WebGpuProfiler::EndProfiling(TimePoint tp, onnxruntime::profiling::Events&
   enabled_ = false;
 }
 
+void WebGpuProfiler::StartRunProfiling() {
+  run_level_state_.enabled = true;
+  run_level_state_.events.clear();
+}
+
+void WebGpuProfiler::EndRunProfiling(TimePoint /* start_time */, onnxruntime::profiling::Events& events) {
+  if (!run_level_state_.enabled) {
+    return;
+  }
+
+  // Move collected GPU events to the output
+  for (auto& event : run_level_state_.events) {
+    events.emplace_back(std::move(event));
+  }
+
+  run_level_state_.events.clear();
+  run_level_state_.enabled = false;
+}
+
+bool WebGpuProfiler::IsRunProfilingEnabled() const {
+  return run_level_state_.enabled;
+}
+
+void WebGpuProfiler::DispatchGpuEvents(onnxruntime::profiling::Events& gpu_events) {
+  // If session-level profiling is enabled, copy to session events
+  if (enabled_) {
+    for (const auto& event : gpu_events) {
+      events_.emplace_back(event);  // copy
+    }
+  }
+
+  // If run-level profiling is enabled, move to run-level events
+  if (run_level_state_.enabled) {
+    for (auto& event : gpu_events) {
+      run_level_state_.events.emplace_back(std::move(event));  // move
+    }
+  }
+}
+
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_profiler.h b/onnxruntime/core/providers/webgpu/webgpu_profiler.h
index d826d295a3842..af8b2ce297871 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_profiler.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_profiler.h
@@ -21,13 +21,33 @@ class WebGpuProfiler final : public onnxruntime::profiling::EpProfiler {
   }
   void Stop(uint64_t) override {
   }
+
+  // Run-level profiling support
+  void StartRunProfiling() override;
+  void EndRunProfiling(TimePoint start_time, onnxruntime::profiling::Events& events) override;
+  bool IsRunProfilingEnabled() const override;
+
+  // Check if session-level profiling is enabled (original semantics)
   inline bool Enabled() const { return enabled_; }
+
+  // Get session-level events container
   inline onnxruntime::profiling::Events& Events() { return events_; }
 
+  // Dispatch GPU events to appropriate containers based on profiling mode
+  // Call this when GPU events are collected
+  void DispatchGpuEvents(onnxruntime::profiling::Events& gpu_events);
+
  private:
   WebGpuContext& context_;
   bool enabled_{false};
-  onnxruntime::profiling::Events events_;  // cached GPU events
+  onnxruntime::profiling::Events events_;  // cached GPU events for session-level
+
+  // TLS for run-level profiling
+  struct RunLevelState {
+    bool enabled = false;
+    onnxruntime::profiling::Events events;
+  };
+  static thread_local RunLevelState run_level_state_;
 };
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e5523dc78b5d2..462658d209f83 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -2930,6 +2930,11 @@ Status InferenceSession::Run(const RunOptions& run_options,
     tp = session_profiler_.Start();
   }
 
+  // Start run-level profiling if enabled
+  if (run_options.enable_profiling) {
+    session_profiler_.StartRunLevelProfiling();
+  }
+
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
   TraceLoggingActivity<telemetry_provider_handle> ortrun_activity;
   ortrun_activity.SetRelatedActivity(session_activity);
@@ -3138,6 +3143,14 @@ Status InferenceSession::Run(const RunOptions& run_options,
   if (session_profiler_.IsEnabled()) {
     session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, "model_run", tp);
   }
+
+  // End run-level profiling and write to file if enabled
+  if (run_options.enable_profiling) {
+    std::ostringstream profile_ss;
+    profile_ss << run_options.profile_file_prefix << "_" << GetCurrentTimeString<char>() << ".json";
+    session_profiler_.EndRunLevelProfiling(profile_ss.str());
+  }
+
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
   TraceLoggingWriteStop(ortrun_activity, "OrtRun");
 #endif
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index f0d8906d99c14..957fbd4191dfc 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -2361,6 +2361,10 @@ RunOptions instance. The individual calls will exit gracefully and return an err
 #endif
       .def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches,
                      R"pbdoc(Only execute the nodes needed by fetch list)pbdoc")
+      .def_readwrite("enable_profiling", &RunOptions::enable_profiling,
+                     R"pbdoc(Enable profiling for this run.)pbdoc")
+      .def_readwrite("profile_file_prefix", &RunOptions::profile_file_prefix,
+                     R"pbdoc(File prefix for profiling result. The actual filename will be: <prefix>_<timestamp>.json)pbdoc")
       .def(
           "add_run_config_entry",
           [](RunOptions* options, const char* config_key, const char* config_value) -> void {