@@ -24,6 +24,9 @@ use serde::{ser::SerializeMap, Serialize};
2424use serde_json:: { json, value:: RawValue } ;
2525use sqlx:: { types:: Json , Pool , Postgres , Transaction } ;
2626use sqlx:: { Encode , PgExecutor } ;
27+ use tokio:: sync:: mpsc:: Sender ;
28+ use tokio:: sync:: oneshot;
29+ use tokio:: task:: JoinHandle ;
2730use tokio:: { sync:: RwLock , time:: sleep} ;
2831use ulid:: Ulid ;
2932use uuid:: Uuid ;
@@ -137,7 +140,7 @@ pub struct CanceledBy {
137140 pub reason : Option < String > ,
138141}
139142
140- #[ derive( Debug , Clone , Serialize , Deserialize ) ]
143+ #[ derive( Debug , Serialize , Deserialize ) ]
141144pub struct JobCompleted {
142145 pub job : MiniCompletedJob ,
143146 pub preprocessed_args : Option < HashMap < String , Box < RawValue > > > ,
@@ -151,6 +154,10 @@ pub struct JobCompleted {
151154 pub duration : Option < i64 > ,
152155 pub has_stream : Option < bool > ,
153156 pub from_cache : Option < bool > ,
157+ #[ serde( skip) ]
158+ pub flow_runners : Option < Arc < FlowRunners > > ,
159+ #[ serde( skip) ]
160+ pub done_tx : Option < oneshot:: Sender < ( ) > > ,
154161}
155162
156163pub async fn cancel_single_job < ' c > (
@@ -2271,6 +2278,8 @@ pub struct JobAndPerms {
22712278 pub parent_runnable_path : Option < String > ,
22722279 pub token : String ,
22732280 pub precomputed_agent_info : Option < PrecomputedAgentInfo > ,
2281+ #[ serde( skip) ]
2282+ pub flow_runners : Option < Arc < FlowRunners > > ,
22742283}
22752284impl PulledJob {
22762285 pub async fn get_job_and_perms ( self , db : & DB ) -> JobAndPerms {
@@ -2302,6 +2311,7 @@ impl PulledJob {
23022311 parent_runnable_path : self . parent_runnable_path ,
23032312 token,
23042313 precomputed_agent_info : None ,
2314+ flow_runners : None ,
23052315 }
23062316 }
23072317}
@@ -2492,6 +2502,8 @@ impl PulledJobResult {
24922502 duration : None ,
24932503 has_stream : Some ( false ) ,
24942504 from_cache : None ,
2505+ flow_runners : None ,
2506+ done_tx : None ,
24952507 } ) ,
24962508 ) ,
24972509 PulledJobResult { job : Some ( job) , error_while_preprocessing : Some ( e) , .. } => Err (
@@ -2511,6 +2523,8 @@ impl PulledJobResult {
25112523 duration : None ,
25122524 has_stream : Some ( false ) ,
25132525 from_cache : None ,
2526+ flow_runners : None ,
2527+ done_tx : None ,
25142528 } ) ,
25152529 ) ,
25162530 PulledJobResult { job, .. } => Ok ( job) ,
@@ -5715,10 +5729,74 @@ async fn restarted_flows_resolution(
57155729 ) )
57165730}
57175731
5732+
5733+
5734+ // Wrapper struct to send both job and optional flow_runners to dedicated workers
5735+ pub struct DedicatedWorkerJob {
5736+ pub job : Arc < MiniPulledJob > ,
5737+ pub flow_runners : Option < Arc < FlowRunners > > ,
5738+ pub done_tx : Option < oneshot:: Sender < ( ) > > ,
5739+ }
5740+
5741+ #[ derive( Debug ) ]
5742+ pub struct FlowRunners {
5743+ pub runners : HashMap < String , Sender < DedicatedWorkerJob > > ,
5744+ pub handles : Vec < JoinHandle < ( ) > > ,
5745+ pub job_id : Uuid ,
5746+ }
5747+
5748+ impl Drop for FlowRunners {
5749+ fn drop ( & mut self ) {
5750+ let total_runners = self . handles . len ( ) ;
5751+ tracing:: info!( "dropping {} flow runners for job {}" , total_runners, self . job_id) ;
5752+
5753+ // First, drop all senders to signal workers to stop gracefully
5754+ self . runners . clear ( ) ;
5755+
5756+ // Spawn a background task to wait with timeout and abort if needed
5757+ let handles = std:: mem:: take ( & mut self . handles ) ;
5758+ let job_id = self . job_id ;
5759+
5760+ tokio:: spawn ( async move {
5761+ // Extract abort handles before consuming the join handles
5762+ let abort_handles: Vec < _ > = handles. iter ( ) . map ( |h| h. abort_handle ( ) ) . collect ( ) ;
5763+
5764+ // Wait up to 5 seconds for natural termination
5765+ let timeout_result = tokio:: time:: timeout (
5766+ tokio:: time:: Duration :: from_secs ( 5 ) ,
5767+ futures:: future:: join_all ( handles)
5768+ ) . await ;
5769+
5770+ match timeout_result {
5771+ Ok ( _) => {
5772+ tracing:: info!( "all {} flow runners for job {} terminated gracefully" , total_runners, job_id) ;
5773+ }
5774+ Err ( _) => {
5775+ // Timeout reached, abort only the handles that haven't finished
5776+ let mut aborted = 0 ;
5777+ for abort_handle in abort_handles {
5778+ if !abort_handle. is_finished ( ) {
5779+ abort_handle. abort ( ) ;
5780+ aborted += 1 ;
5781+ }
5782+ }
5783+ let graceful = total_runners - aborted;
5784+ tracing:: warn!(
5785+ "flow runners for job {}: {} terminated gracefully, {} aborted after 5s timeout" ,
5786+ job_id, graceful, aborted
5787+ ) ;
5788+ }
5789+ }
5790+ } ) ;
5791+ }
5792+ }
5793+
57185794#[ derive( Debug , Serialize , Deserialize ) ]
57195795pub struct SameWorkerPayload {
57205796 pub job_id : Uuid ,
57215797 pub recoverable : bool ,
5798+ #[ serde( skip) ]
5799+ pub flow_runners : Option < Arc < FlowRunners > > ,
57225800}
57235801
57245802pub async fn get_same_worker_job (
0 commit comments