14
14
import org .apache .flink .api .common .eventtime .WatermarkStrategy ;
15
15
import org .apache .flink .util .ParameterTool ;
16
16
import org .apache .flink .configuration .Configuration ;
17
+ import org .apache .flink .configuration .ExternalizedCheckpointRetention ;
17
18
import org .apache .flink .configuration .RestartStrategyOptions ;
18
19
import org .apache .flink .streaming .api .environment .StreamExecutionEnvironment ;
19
20
import org .apache .flink .streaming .api .functions .ProcessFunction ;
20
21
21
22
import java .util .Map ;
22
23
import java .util .Random ;
24
+ import org .slf4j .Logger ;
25
+ import org .slf4j .LoggerFactory ;
23
26
24
27
/**
25
28
* <p>Main abstract class used by all the jobs executed by Metis.</p>
32
35
*/
33
36
public abstract class MetisJob {
34
37
35
- private static final int RESTART_ATTEMPTS = 3 ;
36
- private static final int RESTART_DELAY_IN_SECONDS = 10 ;
38
+ private static final Logger LOGGER = LoggerFactory .getLogger (MetisJob .class );
37
39
40
+ ////////////////////////Failover strategy configuration/////////////////////////////////////////
41
+ //All these gives us over 30 minutes of restarting in case of total infrastructure error. This
42
+ // time is a bit random and depends on jitter and time of task starting.
43
+ private static final Duration INITIAL_RESTART_DELAY_IN_SECONDS = Duration .ofSeconds (10 );
44
+ private static final Duration MAX_RESTART_DELAY_IN_SECONDS = Duration .ofMinutes (2 );
45
+ public static final double BACK_OFF_MULTIPLIER = 2.0 ;
46
+ private static final int ATTEMPTS = 20 ;
47
+ public static final double JITTER_FACTOR = 0.1 ;
48
+ //After job works fine for configured time restart counter is reset.
49
+ private static final Duration RESET_BACKOFF_THRESHOLD = Duration .ofMinutes (10 );
50
+
51
+ ////////////////////////Checkpointing configuration/////////////////////////////////////////////
38
52
private static final long CHECKPOINT_INTERVAL_IN_MILLIS = 2000 ;
39
53
private static final long MIN_PAUSE_BETWEEN_CHECKPOINTS = 1000 ;
40
54
@@ -59,9 +73,13 @@ protected MetisJob(String[] args, String jobName) {
59
73
protected StreamExecutionEnvironment prepareEnvironment () {
60
74
61
75
Configuration config = new Configuration ();
62
- config .set (RestartStrategyOptions .RESTART_STRATEGY , "fixed-delay" );
63
- config .set (RestartStrategyOptions .RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS , RESTART_ATTEMPTS );
64
- config .set (RestartStrategyOptions .RESTART_STRATEGY_FIXED_DELAY_DELAY , Duration .ofSeconds (RESTART_DELAY_IN_SECONDS ));
76
+ config .set (RestartStrategyOptions .RESTART_STRATEGY , "exponential-delay" );
77
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_INITIAL_BACKOFF , INITIAL_RESTART_DELAY_IN_SECONDS );
78
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_MAX_BACKOFF , MAX_RESTART_DELAY_IN_SECONDS );
79
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_BACKOFF_MULTIPLIER , BACK_OFF_MULTIPLIER );
80
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_ATTEMPTS , ATTEMPTS );
81
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_RESET_BACKOFF_THRESHOLD , RESET_BACKOFF_THRESHOLD );
82
+ config .set (RestartStrategyOptions .RESTART_STRATEGY_EXPONENTIAL_DELAY_JITTER_FACTOR , JITTER_FACTOR );
65
83
66
84
final StreamExecutionEnvironment env =
67
85
StreamExecutionEnvironment .getExecutionEnvironment (config );
@@ -71,6 +89,7 @@ protected StreamExecutionEnvironment prepareEnvironment() {
71
89
env .getConfig ().setGlobalJobParameters (tool );
72
90
env .enableCheckpointing (CHECKPOINT_INTERVAL_IN_MILLIS );
73
91
env .getCheckpointConfig ().setMinPauseBetweenCheckpoints (MIN_PAUSE_BETWEEN_CHECKPOINTS );
92
+ env .getCheckpointConfig ().setExternalizedCheckpointRetention (ExternalizedCheckpointRetention .RETAIN_ON_CANCELLATION );
74
93
return env ;
75
94
}
76
95
@@ -113,6 +132,7 @@ protected void prepareJob() {
113
132
*/
114
133
public void execute () throws Exception {
115
134
prepareJob ();
135
+ LOGGER .info ("Execution plan: {}" , flinkEnvironment .getExecutionPlan ());
116
136
flinkEnvironment .execute (enrichedJobName ());
117
137
}
118
138
0 commit comments