@@ -38,8 +38,32 @@ impl Flusher {
3838 }
3939 }
4040
41- pub async fn flush ( & mut self ) {
42- let ( all_series, all_distributions) = {
41+ pub async fn flush (
42+ & mut self ,
43+ ) -> Option < (
44+ Vec < crate :: datadog:: Series > ,
45+ Vec < datadog_protos:: metrics:: SketchPayload > ,
46+ ) > {
47+ self . flush_with_retries ( None , None ) . await
48+ }
49+
50+ pub async fn flush_with_retries (
51+ & mut self ,
52+ retry_series : Option < Vec < crate :: datadog:: Series > > ,
53+ retry_sketches : Option < Vec < datadog_protos:: metrics:: SketchPayload > > ,
54+ ) -> Option < (
55+ Vec < crate :: datadog:: Series > ,
56+ Vec < datadog_protos:: metrics:: SketchPayload > ,
57+ ) > {
58+ let ( all_series, all_distributions) = if retry_series. is_some ( ) || retry_sketches. is_some ( )
59+ {
60+ // Use the provided metrics for retry
61+ (
62+ retry_series. unwrap_or_default ( ) ,
63+ retry_sketches. unwrap_or_default ( ) ,
64+ )
65+ } else {
66+ // Collect new metrics from the aggregator
4367 #[ allow( clippy:: expect_used) ]
4468 let mut aggregator = self . aggregator . lock ( ) . expect ( "lock poisoned" ) ;
4569 (
@@ -53,35 +77,68 @@ impl Flusher {
5377
5478 debug ! ( "Flushing {n_series} series and {n_distributions} distributions" ) ;
5579
80+ // Save copies for potential error returns
81+ let all_series_copy = all_series. clone ( ) ;
82+ let all_distributions_copy = all_distributions. clone ( ) ;
83+
5684 let dd_api_clone = self . dd_api . clone ( ) ;
5785 let series_handle = tokio:: spawn ( async move {
86+ let mut failed = Vec :: new ( ) ;
87+ let mut had_shipping_error = false ;
5888 for a_batch in all_series {
59- let continue_shipping =
89+ let ( continue_shipping, should_retry ) =
6090 should_try_next_batch ( dd_api_clone. ship_series ( & a_batch) . await ) . await ;
91+ if should_retry {
92+ failed. push ( a_batch) ;
93+ had_shipping_error = true ;
94+ }
6195 if !continue_shipping {
6296 break ;
6397 }
6498 }
99+ ( failed, had_shipping_error)
65100 } ) ;
101+
66102 let dd_api_clone = self . dd_api . clone ( ) ;
67103 let distributions_handle = tokio:: spawn ( async move {
104+ let mut failed = Vec :: new ( ) ;
105+ let mut had_shipping_error = false ;
68106 for a_batch in all_distributions {
69- let continue_shipping =
107+ let ( continue_shipping, should_retry ) =
70108 should_try_next_batch ( dd_api_clone. ship_distributions ( & a_batch) . await ) . await ;
109+ if should_retry {
110+ failed. push ( a_batch) ;
111+ had_shipping_error = true ;
112+ }
71113 if !continue_shipping {
72114 break ;
73115 }
74116 }
117+ ( failed, had_shipping_error)
75118 } ) ;
76119
77120 match tokio:: try_join!( series_handle, distributions_handle) {
78- Ok ( _) => {
79- debug ! ( "Successfully flushed {n_series} series and {n_distributions} distributions" )
121+ Ok ( ( ( series_failed, series_had_error) , ( sketches_failed, sketches_had_error) ) ) => {
122+ if series_failed. is_empty ( ) && sketches_failed. is_empty ( ) {
123+ debug ! ( "Successfully flushed {n_series} series and {n_distributions} distributions" ) ;
124+ None // Return None to indicate success
125+ } else if series_had_error || sketches_had_error {
126+ // Only return the metrics if there was an actual shipping error
127+ error ! ( "Failed to flush some metrics due to shipping errors: {} series and {} sketches" ,
128+ series_failed. len( ) , sketches_failed. len( ) ) ;
129+ // Return the failed metrics for potential retry
130+ Some ( ( series_failed, sketches_failed) )
131+ } else {
132+ debug ! ( "Some metrics were not sent but no errors occurred" ) ;
133+ None // No shipping errors, so don't return metrics for retry
134+ }
80135 }
81136 Err ( err) => {
82- error ! ( "Failed to flush metrics{err}" )
137+ error ! ( "Failed to flush metrics: {err}" ) ;
138+ // Return all metrics in case of join error for potential retry
139+ Some ( ( all_series_copy, all_distributions_copy) )
83140 }
84- } ;
141+ }
85142 }
86143}
87144
@@ -90,26 +147,45 @@ pub enum ShippingError {
90147 Destination ( Option < StatusCode > , String ) ,
91148}
92149
93- async fn should_try_next_batch ( resp : Result < Response , ShippingError > ) -> bool {
150+ /// Returns a tuple (continue_to_next_batch, should_retry_this_batch)
151+ async fn should_try_next_batch ( resp : Result < Response , ShippingError > ) -> ( bool , bool ) {
94152 match resp {
95153 Ok ( resp_payload) => match resp_payload. status ( ) {
96- StatusCode :: ACCEPTED => true ,
154+ StatusCode :: ACCEPTED => ( true , false ) , // Success, continue to next batch, no need to retry
97155 unexpected_status_code => {
156+ // Check if the status code indicates a permanent error (4xx) or a temporary error (5xx)
157+ let is_permanent_error =
158+ unexpected_status_code. as_u16 ( ) >= 400 && unexpected_status_code. as_u16 ( ) < 500 ;
159+
98160 error ! (
99161 "{}: Failed to push to API: {:?}" ,
100162 unexpected_status_code,
101163 resp_payload. text( ) . await . unwrap_or_default( )
102164 ) ;
103- true
165+
166+ if is_permanent_error {
167+ ( true , false ) // Permanent error, continue to next batch but don't retry
168+ } else {
169+ ( false , true ) // Temporary error, don't continue to next batch and mark for retry
170+ }
104171 }
105172 } ,
106173 Err ( ShippingError :: Payload ( msg) ) => {
107174 error ! ( "Failed to prepare payload. Data dropped: {}" , msg) ;
108- true
175+ ( true , false ) // Payload error, continue to next batch but don't retry (data is malformed)
109176 }
110177 Err ( ShippingError :: Destination ( sc, msg) ) => {
178+ // Check if status code indicates a permanent error
179+ let is_permanent_error =
180+ sc. map_or ( false , |code| code. as_u16 ( ) >= 400 && code. as_u16 ( ) < 500 ) ;
181+
111182 error ! ( "Error shipping data: {:?} {}" , sc, msg) ;
112- false
183+
184+ if is_permanent_error {
185+ ( false , false ) // Permanent destination error, don't continue and don't retry
186+ } else {
187+ ( false , true ) // Temporary error, don't continue and mark for retry
188+ }
113189 }
114190 }
115191}
0 commit comments