Commit 4c0f4c9

High availability via jgroups-raft
1 parent a9bc8a8 commit 4c0f4c9

12 files changed: +224 -134 lines

docs/docs/config.md (+8 -1)

```diff
@@ -164,7 +164,14 @@ filter | STRING | filter rules, eg `exclude:
 option | argument | description | default
 -------------------------------|-------------------------------------| --------------------------------------------------- | -------
 encrypt | [ none | data | all ] | encrypt mode: none = no encryption. "data": encrypt the `data` field only. `all`: encrypt entire maxwell message | none
-secret_key | STRING | specify the encryption key to be used | null
+secret_key | string | specify the encryption key to be used | null
+
+# high availability
+option | argument | description | default
+-------------------------------|-------------------------------------| --------------------------------------------------- | -------
+ha | | enable maxwell client HA |
+jgroups_config | string | location of xml configuration file for jGroups | $PWD/raft.xml
+raft_member_id | string | uniquely identify this node within jgroups-raft cluster |
 
 # monitoring / metrics
 option | argument | description | default
```

docs/docs/high_availability.md (new file, +60)

````diff
@@ -0,0 +1,60 @@
+# High Availability
+
+As of v1.29.0, Maxwell contains experimental (alpha quality) client-side HA.
+Leader election is handled via
+[jgroups-raft](https://github.com/belaban/jgroups-raft).
+
+## Getting started
+
+First, copy `raft.xml.example` to `raft.xml`. Edit to your liking, paying attention to:
+
+```
+<raft.RAFT members="A,B,C" raft_id="${raft_id:undefined}"/>
+```
+
+Note that because we are using a RAFT-based leader election, we will have to spin up at least
+3 maxwell client nodes.
+
+Now start each of your HA maxwell nodes like this:
+
+```bash
+host1: $ bin/maxwell --ha --raft_member_id=A
+host2: $ bin/maxwell --ha --raft_member_id=B
+host3: $ bin/maxwell --ha --raft_member_id=C
+```
+
+If all goes well, the 3 nodes will communicate via multicast/UDP, elect one to
+be the cluster leader, and away you go. If one node is terminated or
+partitioned, a new election will be held to replace it.
+
+## Getting deeper
+
+More advanced (especially inter-DC) configurations may be implemented by
+editing `raft.xml`; you'll probably need to get the nodes to communicate with
+each other via TCP instead of UDP, and maybe tunnel through a firewall or two,
+good stuff like that. That's out of scope for this document, so
+[check out the jgroups
+documentation](http://www.jgroups.org/manual/html/user-advanced.html), but if
+you come up with something good drop me a line.
+
+## Common problems
+
+Something I encountered right out of the gate was this:
+
+```
+12:37:53,135 WARN UDP - failed to join /224.0.75.75:7500 on utun0: java.net.SocketException: Can't assign requested address
+```
+
+which can be worked around by forcing the JVM onto an IPv4 stack:
+
+```
+JAVA_OPTS="-Djava.net.preferIPv4Stack=true" bin/maxwell --ha --raft_member_id=B
+```
````
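
The doc above leans entirely on jgroups-raft for the election itself. For readers who want to watch the election mechanics outside of Maxwell, here is a minimal, hypothetical observer built on the same APIs this commit uses (JChannel, RaftHandle, a role listener). The class name, cluster name, and no-op state machine are illustrative only and are not part of this commit:

```java
import java.io.DataInput;
import java.io.DataOutput;

import org.jgroups.JChannel;
import org.jgroups.protocols.raft.Role;
import org.jgroups.protocols.raft.StateMachine;
import org.jgroups.raft.RaftHandle;

// Hypothetical stand-alone demo: joins the raft cluster defined in raft.xml and
// prints every role change, so you can watch elections as nodes come and go.
public class ElectionWatcher {
	public static void main(String[] args) throws Exception {
		String memberId = args[0]; // "A", "B" or "C", matching raft.RAFT members in raft.xml

		JChannel ch = new JChannel("raft.xml");
		RaftHandle raft = new RaftHandle(ch, new NoOpStateMachine());
		raft.raftId(memberId);
		raft.addRoleListener(role ->
			System.out.println(memberId + " is now " + role + ", leader=" + raft.leader()));

		ch.connect("election-demo"); // any cluster name; all watchers must use the same one
		Thread.sleep(Long.MAX_VALUE);
	}

	// RAFT requires a state machine; this one simply ignores all log entries.
	static class NoOpStateMachine implements StateMachine {
		public byte[] apply(byte[] data, int offset, int length) throws Exception { return new byte[0]; }
		public void readContentFrom(DataInput in) throws Exception {}
		public void writeContentTo(DataOutput out) throws Exception {}
	}
}
```

Run three copies with member IDs A, B, and C, then kill whichever one reports itself as leader to watch a re-election.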

docs/mkdocs.yml (+1)

```diff
@@ -14,6 +14,7 @@ nav:
   - 'Data Format': 'dataformat.md'
   - 'Encryption': 'encryption.md'
   - 'Monitoring': 'monitoring.md'
+  - 'High Availability': 'high_availability.md'
   - 'Embedding': 'embedding.md'
   - 'Internals': 'schemas.md'
   - 'Compatibility': 'compat.md'
```

pom.xml (+1 -1)

```diff
@@ -318,7 +318,7 @@
     <dependency>
       <groupId>org.jgroups</groupId>
       <artifactId>jgroups-raft</artifactId>
-      <version>0.5.3-SNAPSHOT</version>
+      <version>1.0.0.Final</version>
     </dependency>
   </dependencies>
 
```

raft.xml.example (new file, +22)

```diff
@@ -0,0 +1,22 @@
+<?xml version='1.0' encoding='utf-8'?>
+<config xmlns="urn:org:jgroups"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/jgroups.xsd">
+  <UDP mcast_addr="228.8.8.8" mcast_port="${jgroups.udp.mcast_port:45588}"/>
+  <PING />
+  <MERGE3 />
+  <FD_SOCK/>
+  <FD_ALL/>
+  <VERIFY_SUSPECT timeout="1500"/>
+  <pbcast.NAKACK2 xmit_interval="500"/>
+  <UNICAST3 xmit_interval="500"/>
+  <pbcast.STABLE desired_avg_gossip="50000" max_bytes="4M"/>
+  <raft.NO_DUPES/>
+  <pbcast.GMS print_local_addr="true" join_timeout="2000"/>
+  <UFC max_credits="2M" min_threshold="0.4"/>
+  <MFC max_credits="2M" min_threshold="0.4"/>
+  <FRAG2 frag_size="60K"/>
+  <raft.ELECTION election_min_interval="500" election_max_interval="1000" heartbeat_interval="250"/>
+  <raft.RAFT members="A,B,C" raft_id="${raft_id:undefined}"/>
+  <raft.REDIRECT/>
+</config>
```

src/main/java/com/zendesk/maxwell/Maxwell.java (+11 -99)

```diff
@@ -13,26 +13,14 @@
 import com.zendesk.maxwell.schema.*;
 import com.zendesk.maxwell.schema.columndef.ColumnDefCastException;
 import com.zendesk.maxwell.util.Logging;
-import org.jgroups.JChannel;
-import org.jgroups.protocols.raft.RaftLeaderException;
-import org.jgroups.protocols.raft.Role;
-import org.jgroups.protocols.raft.Settable;
-import org.jgroups.protocols.raft.StateMachine;
-import org.jgroups.raft.RaftHandle;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.DataInput;
-import java.io.DataOutput;
 import java.net.URISyntaxException;
 import java.sql.Connection;
 import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 public class Maxwell implements Runnable {
 	protected MaxwellConfig config;
@@ -58,13 +46,22 @@ public void run() {
 		}
 	}
 
+	public void restart() {
+		try {
+			this.context = new MaxwellContext(config);
+		} catch ( Exception e ) {
+			throw new RuntimeException(e);
+		}
+
+		run();
+	}
+
 	public void terminate() {
 		Thread terminationThread = this.context.terminate();
 		if (terminationThread != null) {
 			try {
 				terminationThread.join();
 			} catch (InterruptedException e) {
-				// ignore
 			}
 		}
 	}
@@ -182,91 +179,6 @@ private void logBanner(AbstractProducer producer, Position initialPosition) {
 	protected void onReplicatorStart() {}
 	protected void onReplicatorEnd() {}
 
-	private AtomicBoolean isLeader = new AtomicBoolean(false);
-
-	private void startHA() throws Exception {
-		JChannel ch = new JChannel(this.config.jgroupsConf);
-		StateMachine s = new StateMachine() {
-			@Override
-			public byte[] apply(byte[] bytes, int i, int i1) throws Exception {
-				return new byte[0];
-			}
-
-			@Override
-			public void readContentFrom(DataInput dataInput) throws Exception {
-			}
-
-			@Override
-			public void writeContentTo(DataOutput dataOutput) throws Exception {
-			}
-		};
-
-		RaftHandle handle = new RaftHandle(ch, s);
-		handle.raftId(this.config.raftMemberID);
-
-		handle.addRoleListener(role -> {
-			if(role == Role.Leader) {
-				LOGGER.info("won HA election, starting maxwell");
-				try {
-					isLeader.set(true);
-					this.start();
-				} catch ( Exception e ) {
-				} finally {
-					isLeader.set(false);
-				}
-			} else
-				LOGGER.info("lost HA election, current leader: " + handle.leader());
-				// stop singleton services
-		});
-
-		ch.connect(this.config.clientID);
-		LOGGER.info("enter HA group, current leader: " + handle.leader());
-
-		new Thread(() -> {
-			int exceptionCount = 0;
-			while ( true ) {
-				byte[] b = new byte[] { (byte) 0x1 };
-				try {
-					handle.set(b, 0, 1, 5000, TimeUnit.MILLISECONDS);
-					LOGGER.debug("RAFT-heartbeat successful");
-					exceptionCount = 0;
-					if ( handle.isLeader() && !this.isLeader.get() ) {
-						LOGGER.info("RAFT-consensus available, restarting maxwell...");
-						try {
-							isLeader.set(true);
-							this.start();
-						} catch ( Exception e ) {
-						} finally {
-							isLeader.set(false);
-						}
-					}
-				} catch ( RaftLeaderException e ) {
-					LOGGER.warn("RAFT leader unavailable: " + e.getMessage());
-					exceptionCount++;
-				} catch ( TimeoutException e ) {
-					exceptionCount++;
-					LOGGER.warn("RAFT-heartbeat timed out. Exception Count:" + exceptionCount);
-				} catch ( Exception e ) {
-					LOGGER.error("unexpected exception in RAFT-heartbeat", e);
-				}
-
-				if ( exceptionCount > 1 && isLeader.get() ) {
-					LOGGER.warn("RAFT consensus unavailable after " + exceptionCount + " tries, stopping maxwell");
-					this.context.shutdown(new AtomicBoolean(false));
-				}
-
-				try {
-					Thread.sleep(1000);
-				} catch (InterruptedException e) { }
-			}
-		}).start();
-
-		Thread.sleep(Long.MAX_VALUE);
-	}
 
 	private void start() throws Exception {
 		try {
@@ -368,7 +280,7 @@ public void run() {
 		});
 
 		if ( config.haMode ) {
-			maxwell.startHA();
+			new MaxwellHA(maxwell, config.jgroupsConf, config.raftMemberID, config.clientID).startHA();
 		} else {
 			maxwell.start();
 		}
```
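
The in-class `startHA()` removed above is replaced at the call site by a new `MaxwellHA` helper, which is not among the diffs shown here. Below is a hedged sketch of what that class plausibly looks like, inferred from the removed code and from the constructor arguments at the call site; the field names, the no-op state machine, and the restart-on-re-election and terminate-on-demotion logic are assumptions, not the committed implementation:

```java
package com.zendesk.maxwell; // sketch: placed here only so it can see the Maxwell class

import java.io.DataInput;
import java.io.DataOutput;

import org.jgroups.JChannel;
import org.jgroups.protocols.raft.Role;
import org.jgroups.protocols.raft.StateMachine;
import org.jgroups.raft.RaftHandle;

// Sketch only: constructor arguments mirror the call site
// new MaxwellHA(maxwell, config.jgroupsConf, config.raftMemberID, config.clientID).startHA()
public class MaxwellHA {
	private final Maxwell maxwell;
	private final String jgroupsConf, raftMemberID, clientID;
	private boolean hasRun = false;

	public MaxwellHA(Maxwell maxwell, String jgroupsConf, String raftMemberID, String clientID) {
		this.maxwell = maxwell;
		this.jgroupsConf = jgroupsConf;
		this.raftMemberID = raftMemberID;
		this.clientID = clientID;
	}

	public void startHA() throws Exception {
		JChannel ch = new JChannel(jgroupsConf);
		// RAFT needs a state machine even though maxwell only uses the election
		RaftHandle handle = new RaftHandle(ch, new StateMachine() {
			public byte[] apply(byte[] bytes, int offset, int length) throws Exception { return new byte[0]; }
			public void readContentFrom(DataInput in) throws Exception {}
			public void writeContentTo(DataOutput out) throws Exception {}
		});
		if ( raftMemberID != null )
			handle.raftId(raftMemberID);

		handle.addRoleListener(role -> {
			if ( role == Role.Leader ) {
				// won the election: run maxwell; on a later re-election, rebuild the
				// context first via restart() (assumption, based on the new restart() method)
				if ( hasRun )
					maxwell.restart();
				else {
					hasRun = true;
					maxwell.run();
				}
			} else if ( hasRun ) {
				// lost leadership: stop replicating so only the current leader writes (assumption)
				maxwell.terminate();
			}
		});

		ch.connect(clientID);
		Thread.sleep(Long.MAX_VALUE); // keep the process alive; the role listener does the work
	}
}
```

Whether the real class also keeps the RAFT heartbeat/`set()` loop from the removed code cannot be told from this excerpt.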

src/main/java/com/zendesk/maxwell/MaxwellConfig.java (+2 -7)

```diff
@@ -86,9 +86,6 @@ public class MaxwellConfig extends AbstractConfig {
 	public MaxwellOutputConfig outputConfig;
 	public String log_level;
 
-	public MetricRegistry metricRegistry;
-	public HealthCheckRegistry healthCheckRegistry;
-
 	public int httpPort;
 	public String httpBindAddress;
 	public String httpPathPrefix;
@@ -162,8 +159,6 @@ public MaxwellConfig() { // argv is only null in tests
 		this.masterRecovery = false;
 		this.gtidMode = false;
 		this.bufferedProducerSize = 200;
-		this.metricRegistry = new MetricRegistry();
-		this.healthCheckRegistry = new HealthCheckRegistry();
 		this.outputConfig = new MaxwellOutputConfig();
 		setup(null, null); // setup defaults
 	}
@@ -224,9 +219,9 @@ protected MaxwellOptionParser buildOptionParser() {
 		parser.accepts( "max_schemas", "Maximum schemas to keep before triggering a compaction operation. Default: unlimited" ).withRequiredArg();
 		parser.section("operation");
 
-		parser.accepts( "ha", "enable high-availability mode via jgroups-raft" ).withOptionalArg();
+		parser.accepts( "ha", "enable high-availability mode via jgroups-raft" );
 		parser.accepts( "jgroups_config", "location of jgroups xml configuration file" ).withRequiredArg();
-		parser.accepts( "raft_member_id", "raft memberID" ).withRequiredArg();
+		parser.accepts( "raft_member_id", "raft memberID. (may also be specified in raft.xml)" ).withRequiredArg();
 
 		parser.accepts( "bootstrapper", "bootstrapper type: async|sync|none. default: async" ).withRequiredArg();
 		parser.accepts( "init_position", "initial binlog position, given as BINLOG_FILE:POSITION[:HEARTBEAT]" ).withRequiredArg();
```

src/main/java/com/zendesk/maxwell/MaxwellContext.java (+21 -6)

```diff
@@ -1,5 +1,7 @@
 package com.zendesk.maxwell;
 
+import com.codahale.metrics.MetricRegistry;
+import com.codahale.metrics.health.HealthCheckRegistry;
 import com.zendesk.maxwell.bootstrap.BootstrapController;
 import com.zendesk.maxwell.bootstrap.SynchronousBootstrapper;
 import com.zendesk.maxwell.filtering.Filter;
@@ -33,9 +35,9 @@
 public class MaxwellContext {
 	static final Logger LOGGER = LoggerFactory.getLogger(MaxwellContext.class);
 
-	private final ConnectionPool replicationConnectionPool;
-	private final ConnectionPool maxwellConnectionPool;
-	private final ConnectionPool rawMaxwellConnectionPool;
+	private ConnectionPool replicationConnectionPool;
+	private ConnectionPool maxwellConnectionPool;
+	private ConnectionPool rawMaxwellConnectionPool;
 	private final ConnectionPool schemaConnectionPool;
 	private final MaxwellConfig config;
 	private final MaxwellMetrics metrics;
@@ -58,11 +60,16 @@ public class MaxwellContext {
 	private BootstrapController bootstrapController;
 	private Thread bootstrapControllerThread;
 
+	public MetricRegistry metricRegistry;
+	public HealthCheckRegistry healthCheckRegistry;
+
 	public MaxwellContext(MaxwellConfig config) throws SQLException, URISyntaxException {
 		this.config = config;
 		this.config.validate();
 		this.taskManager = new TaskManager();
-		this.metrics = new MaxwellMetrics(config);
+
+		this.metricRegistry = new MetricRegistry();
+		this.metrics = new MaxwellMetrics(config, this.metricRegistry);
 
 		this.replicationConnectionPool = new C3P0ConnectionPool(
 			config.replicationMysql.getConnectionURI(false),
@@ -95,8 +102,6 @@ public MaxwellContext(MaxwellConfig config) throws SQLException, URISyntaxExcept
 			config.maxwellMysql.user,
 			config.maxwellMysql.password
 		);
-		// do NOT probe the regular maxwell connection pool; the database might not be created yet.
-
 		if ( this.config.initPosition != null )
 			this.initialPosition = this.config.initPosition;
 
@@ -109,6 +114,8 @@ public MaxwellContext(MaxwellConfig config) throws SQLException, URISyntaxExcept
 		this.heartbeatNotifier = new HeartbeatNotifier();
 		List<MaxwellDiagnostic> diagnostics = new ArrayList<>(Collections.singletonList(new BinlogConnectorDiagnostic(this)));
 		this.diagnosticContext = new MaxwellDiagnosticContext(config.diagnosticConfig, diagnostics);
+
+		this.healthCheckRegistry = new HealthCheckRegistry();
 	}
 
 	public MaxwellConfig getConfig() {
@@ -177,9 +184,17 @@ private void sendFinalHeartbeat() {
 	public void shutdown(AtomicBoolean complete) {
 		try {
 			taskManager.stop(this.error);
+			this.metrics.stop();
+
 			this.replicationConnectionPool.release();
+			this.replicationConnectionPool = null;
+
 			this.maxwellConnectionPool.release();
+			this.maxwellConnectionPool = null;
+
 			this.rawMaxwellConnectionPool.release();
+			this.rawMaxwellConnectionPool = null;
+
 			complete.set(true);
 		} catch (Exception e) {
 			LOGGER.error("Exception occurred during shutdown:", e);
```
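
The MetricRegistry and HealthCheckRegistry move from MaxwellConfig into MaxwellContext and are now built per context. One plausible motivation, given the new `restart()` path that constructs a fresh MaxwellContext: a dropwizard MetricRegistry rejects a second registration under the same name, so a registry that outlived the context would fail when a restarted Maxwell re-registered its metrics. A small sketch of that behavior; the class and metric names here are made up:

```java
import com.codahale.metrics.Counter;
import com.codahale.metrics.MetricRegistry;

public class RegistryRestartSketch {
	public static void main(String[] args) {
		MetricRegistry shared = new MetricRegistry();
		shared.register("maxwell.row.count", new Counter());

		try {
			// a second registration of the same name on the same registry is rejected,
			// which is what a long-lived, config-owned registry would hit on restart()
			shared.register("maxwell.row.count", new Counter());
		} catch (IllegalArgumentException e) {
			System.out.println("duplicate registration: " + e.getMessage());
		}

		// a registry created per MaxwellContext starts empty each time, so a
		// restarted Maxwell can re-register its metrics cleanly
		MetricRegistry fresh = new MetricRegistry();
		fresh.register("maxwell.row.count", new Counter());
	}
}
```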
