@@ -11,35 +11,18 @@ defmodule WalEx.Replication.Server do
11
11
alias WalEx.Decoder
12
12
alias WalEx.Replication.QueryBuilder
13
13
14
+ require Logger
15
+
16
+ @ max_retries 10
17
+ @ initial_backoff 1000
18
+
14
19
def start_link ( opts ) do
15
20
app_name = Keyword . get ( opts , :app_name )
16
21
opts = set_pgx_replication_conn_opts ( app_name )
17
22
18
23
Postgrex.ReplicationConnection . start_link ( __MODULE__ , [ app_name: app_name ] , opts )
19
24
end
20
25
21
- defp set_pgx_replication_conn_opts ( app_name ) do
22
- database_configs_keys = [
23
- :hostname ,
24
- :username ,
25
- :password ,
26
- :port ,
27
- :database ,
28
- :ssl ,
29
- :ssl_opts ,
30
- :socket_options
31
- ]
32
-
33
- extra_opts = [ auto_reconnect: true ]
34
- database_configs = WalEx.Config . get_configs ( app_name , database_configs_keys )
35
-
36
- replications_name = [
37
- name: WalExRegistry . set_name ( :set_gen_server , __MODULE__ , app_name )
38
- ]
39
-
40
- extra_opts ++ database_configs ++ replications_name
41
- end
42
-
43
26
@ impl true
44
27
def init ( opts ) do
45
28
app_name = Keyword . get ( opts , :app_name )
@@ -90,7 +73,7 @@ defmodule WalEx.Replication.Server do
90
73
def handle_result ( results , % { step: :publication_exists } = state ) do
91
74
case results do
92
75
[ % Postgrex.Result { num_rows: 0 } ] ->
93
- raise "Publication doesn't exists . publication: #{ inspect ( state . publication ) } "
76
+ raise "Publication doesn't exist . publication: #{ inspect ( state . publication ) } "
94
77
95
78
_ ->
96
79
raise "Unexpected result when checking if publication exists. #{ inspect ( results ) } "
@@ -110,11 +93,17 @@ defmodule WalEx.Replication.Server do
110
93
) do
111
94
case active do
112
95
"f" ->
113
- query = QueryBuilder . start_replication_slot ( state )
114
- { :stream , query , [ ] , % { state | step: :streaming } }
96
+ Logger . info ( "Activating inactive replication slot: #{ state . slot_name } " )
97
+ start_replication_with_retry ( state , 0 , @ initial_backoff )
115
98
116
99
"t" ->
117
- raise "Durable slot already active"
100
+ Logger . info (
101
+ "Replication slot #{ state . slot_name } is active. Waiting for it to become inactive."
102
+ )
103
+
104
+ schedule_slot_check ( )
105
+
106
+ { :noreply , state }
118
107
end
119
108
end
120
109
@@ -125,8 +114,7 @@ defmodule WalEx.Replication.Server do
125
114
126
115
@ impl true
127
116
def handle_result ( [ % Postgrex.Result { } | _results ] , state = % { step: :create_slot } ) do
128
- query = QueryBuilder . start_replication_slot ( state )
129
- { :stream , query , [ ] , % { state | step: :streaming } }
117
+ start_replication_with_retry ( state , 0 , @ initial_backoff )
130
118
end
131
119
132
120
@ impl true
@@ -136,7 +124,22 @@ defmodule WalEx.Replication.Server do
136
124
end
137
125
138
126
@ impl true
139
- # https://www.postgresql.org/docs/14/protocol-replication.html
127
+ def handle_result (
128
+ % Postgrex.Error { postgres: % { code: :object_in_use } } ,
129
+ state = % { step: { :start_replication , retry_count , backoff } }
130
+ ) do
131
+ Logger . warning ( "Replication slot in use, retrying... (attempt #{ retry_count + 1 } )" )
132
+ Process . sleep ( backoff )
133
+ start_replication_with_retry ( state , retry_count + 1 , backoff * 2 )
134
+ end
135
+
136
+ @ impl true
137
+ def handle_result ( _ , state = % { step: { :start_replication , _retry_count , _backoff } } ) do
138
+ Logger . info ( "Successfully started replication slot: #{ state . slot_name } " )
139
+ { :noreply , % { state | step: :streaming } }
140
+ end
141
+
142
+ @ impl true
140
143
def handle_data ( << ?w , _wal_start :: 64 , _wal_end :: 64 , _clock :: 64 , rest :: binary >> , state ) do
141
144
rest
142
145
|> Decoder . decode_message ( )
@@ -145,6 +148,7 @@ defmodule WalEx.Replication.Server do
145
148
{ :noreply , state }
146
149
end
147
150
151
+ @ impl true
148
152
def handle_data ( << ?k , wal_end :: 64 , _clock :: 64 , reply >> , state ) do
149
153
messages =
150
154
case reply do
@@ -155,6 +159,55 @@ defmodule WalEx.Replication.Server do
155
159
{ :noreply , messages , state }
156
160
end
157
161
162
+ @ impl true
163
+ def handle_info ( :check_slot_status , state ) do
164
+ query = QueryBuilder . slot_exists ( state )
165
+ { :query , query , % { state | step: :slot_exists } }
166
+ end
167
+
168
+ defp set_pgx_replication_conn_opts ( app_name ) do
169
+ database_configs_keys = [
170
+ :hostname ,
171
+ :username ,
172
+ :password ,
173
+ :port ,
174
+ :database ,
175
+ :ssl ,
176
+ :ssl_opts ,
177
+ :socket_options
178
+ ]
179
+
180
+ extra_opts = [ auto_reconnect: true ]
181
+ database_configs = WalEx.Config . get_configs ( app_name , database_configs_keys )
182
+
183
+ replications_name = [
184
+ name: WalExRegistry . set_name ( :set_gen_server , __MODULE__ , app_name )
185
+ ]
186
+
187
+ extra_opts ++ database_configs ++ replications_name
188
+ end
189
+
190
+ defp start_replication_with_retry ( state , retry_count , backoff )
191
+ when retry_count < @ max_retries do
192
+ query = QueryBuilder . start_replication_slot ( state )
193
+ { :stream , query , [ ] , % { state | step: { :start_replication , retry_count , backoff } } }
194
+ end
195
+
196
+ defp start_replication_with_retry ( state , _retry_count , _backoff ) do
197
+ Logger . warning (
198
+ "Failed to start replication slot after maximum retries. Scheduling another check."
199
+ )
200
+
201
+ schedule_slot_check ( )
202
+
203
+ { :noreply , state }
204
+ end
205
+
206
+ defp schedule_slot_check ( ) do
207
+ # Check again after 5 seconds
208
+ Process . send_after ( self ( ) , :check_slot_status , 5000 )
209
+ end
210
+
158
211
@ epoch DateTime . to_unix ( ~U[ 2000-01-01 00:00:00Z] , :microsecond )
159
212
defp current_time , do: System . os_time ( :microsecond ) - @ epoch
160
213
end
0 commit comments