Skip to content

Commit

Permalink
feat: Zero-ETL for PG (#199)
Browse files Browse the repository at this point in the history
  • Loading branch information
TianyuZhang1214 authored Nov 26, 2024
1 parent 0b03960 commit 95af164
Show file tree
Hide file tree
Showing 14 changed files with 566 additions and 129 deletions.
19 changes: 8 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,27 +108,24 @@ psql -h 127.0.0.1 -p 15432 -U postgres

We have integrated a setup tool in the Docker image that helps replicate data from your primary (MySQL|Postgres) server to MyDuck Server. The tool is available via the `SETUP_MODE` environment variable. In `REPLICA` mode, the container will start MyDuck Server, dump a snapshot of your primary (MySQL|Postgres) server, and start replicating data in real-time.

#### MySQL Replica Setup

```bash
docker run \
--network=host \
-p 13306:3306 \
-p 15432:5432 \
--privileged \
--workdir=/home/admin \
--env=SETUP_MODE=REPLICA \
--env=MYSQL_HOST=<mysql_host> \
--env=MYSQL_PORT=<mysql_port> \
--env=MYSQL_USER=<mysql_user> \
--env=MYSQL_PASSWORD=<mysql_password> \
--env=SOURCE_DSN="<postgresql|mysql>://<user>:<password>@<host>:<port>/<dbname>"
--detach=true \
apecloud/myduckserver:latest
```
`SOURCE_DSN` specifies the connection string to the primary database server, which can be either MySQL or PostgreSQL.

#### PostgreSQL Replica Setup
- **MySQL Primary:** Use the MySQL URI scheme, e.g.,
`--env=SOURCE_DSN=mysql://root:[email protected]:3306`

```bash
TODO
```
- **PostgreSQL Primary:** Use the PostgreSQL URI scheme, e.g.,
`--env=SOURCE_DSN=postgres://postgres:[email protected]:5432`

### Connecting to Cloud MySQL & Postgres

Expand Down
2 changes: 2 additions & 0 deletions catalog/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ func NewDBProvider(dataDir, dbFile string) (*DatabaseProvider, error) {
bootQueries := []string{
"INSTALL arrow",
"LOAD arrow",
"INSTALL postgres_scanner",
"LOAD postgres_scanner",
}
for _, q := range bootQueries {
if _, err := storage.ExecContext(context.Background(), q); err != nil {
Expand Down
20 changes: 10 additions & 10 deletions devtools/replica-setup-mysql/checker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ check_server_params() {
echo "Checking MySQL server parameters..."

# Retrieve the required MySQL server variables using mysqlsh
result=$(mysqlsh --host="$MYSQL_HOST" --port="$MYSQL_PORT" --user="$MYSQL_USER" --password="$MYSQL_PASSWORD" --sql -e "
result=$(mysqlsh --uri="$SOURCE_DSN" --sql -e "
SHOW VARIABLES WHERE variable_name IN ('binlog_format', 'enforce_gtid_consistency', 'gtid_mode', 'gtid_strict_mode', 'log_bin');
")

Expand Down Expand Up @@ -62,22 +62,22 @@ check_server_params() {

# Function to check MySQL current user privileges
check_user_privileges() {
echo "Checking privileges for the current user '$MYSQL_USER'..."
echo "Checking privileges for the current user '$SOURCE_USER'..."

# Check the user grants for the currently authenticated user using mysqlsh
result=$(mysqlsh --host="$MYSQL_HOST" --port="$MYSQL_PORT" --user="$MYSQL_USER" --password="$MYSQL_PASSWORD" --sql -e "
result=$(mysqlsh --host="$SOURCE_HOST" --port="$SOURCE_PORT" --user="$SOURCE_USER" --password="$SOURCE_PASSWORD" --sql -e "
SHOW GRANTS FOR CURRENT_USER();
")

check_command "retrieving user grants"

# Check if the required privileges are granted or if GRANT ALL is present
if echo "$result" | grep -q -E "GRANT (SELECT|RELOAD|REPLICATION CLIENT|REPLICATION SLAVE|SHOW VIEW|EVENT)"; then
echo "Current user '$MYSQL_USER' has all required privileges."
echo "Current user '$SOURCE_USER' has all required privileges."
elif echo "$result" | grep -q "GRANT ALL"; then
echo "Current user '$MYSQL_USER' has 'GRANT ALL' privileges."
echo "Current user '$SOURCE_USER' has 'GRANT ALL' privileges."
else
echo "Error: Current user '$MYSQL_USER' is missing some required privileges."
echo "Error: Current user '$SOURCE_USER' is missing some required privileges."
return 1
fi

Expand All @@ -98,7 +98,7 @@ check_mysql_config() {
# Function to check if source MySQL server is empty
check_if_source_mysql_is_empty() {
# Run the query using mysqlsh and capture the output
OUTPUT=$(mysqlsh --uri "$MYSQL_USER:$MYSQL_PASSWORD@$MYSQL_HOST:$MYSQL_PORT" --sql -e "SHOW DATABASES;" 2>/dev/null)
OUTPUT=$(mysqlsh --uri "$SOURCE_DSN" --sql -e "SHOW DATABASES;" 2>/dev/null)

check_command "retrieving database list"

Expand All @@ -117,11 +117,11 @@ check_if_myduck_has_replica() {
REPLICA_STATUS=$(mysqlsh --sql --host=$MYDUCK_HOST --port=$MYDUCK_PORT --user=root --password='' -e "SHOW REPLICA STATUS\G")
check_command "retrieving replica status"

SOURCE_HOST=$(echo "$REPLICA_STATUS" | awk '/Source_Host/ {print $2}')
SOURCE_HOST_EXISTS=$(echo "$REPLICA_STATUS" | awk '/Source_Host/ {print $2}')

# Check if Source_Host is not null or empty
if [[ -n "$SOURCE_HOST" ]]; then
echo "Replication has already been started. Source Host: $SOURCE_HOST"
if [[ -n "$SOURCE_HOST_EXISTS" ]]; then
echo "Replication has already been started. Source Host: $SOURCE_HOST_EXISTS"
return 1
else
return 0
Expand Down
16 changes: 8 additions & 8 deletions devtools/replica-setup-mysql/replica_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,19 @@ GTID_MODE="ON"
while [[ $# -gt 0 ]]; do
case $1 in
--mysql_host)
MYSQL_HOST="$2"
SOURCE_HOST="$2"
shift 2
;;
--mysql_port)
MYSQL_PORT="$2"
SOURCE_PORT="$2"
shift 2
;;
--mysql_user)
MYSQL_USER="$2"
SOURCE_USER="$2"
shift 2
;;
--mysql_password)
MYSQL_PASSWORD="$2"
SOURCE_PASSWORD="$2"
shift 2
;;
--myduck_host)
Expand Down Expand Up @@ -62,14 +62,14 @@ while [[ $# -gt 0 ]]; do
esac
done

source checker.sh

# Check if all parameters are set
if [[ -z "$MYSQL_HOST" || -z "$MYSQL_PORT" || -z "$MYSQL_USER" ]]; then
echo "Error: Missing required MySQL connection variables: MYSQL_HOST, MYSQL_PORT, MYSQL_USER."
if [[ -z "$SOURCE_HOST" || -z "$SOURCE_PORT" || -z "$SOURCE_USER" ]]; then
echo "Error: Missing required MySQL connection variables: SOURCE_HOST, SOURCE_PORT, SOURCE_USER."
usage
fi

source checker.sh

# Step 1: Check if mysqlsh exists, if not, install it
if ! command -v mysqlsh &> /dev/null; then
echo "mysqlsh not found, attempting to install..."
Expand Down
2 changes: 1 addition & 1 deletion devtools/replica-setup-mysql/snapshot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ echo "Thread count set to: $THREAD_COUNT"

echo "Copying data from MySQL to MyDuck..."
# Run mysqlsh command and capture the output
output=$(mysqlsh --host=${MYSQL_HOST} --port=${MYSQL_PORT} --user=${MYSQL_USER} --password=${MYSQL_PASSWORD} -- util copy-instance "mysql://${MYDUCK_USER}:${MYDUCK_PASSWORD}@${MYDUCK_HOST}:${MYDUCK_PORT}" --users false --consistent false --ignore-existing-objects true --handle-grant-errors ignore --threads $THREAD_COUNT --bytesPerChunk 256M --ignore-version true)
output=$(mysqlsh --host=${SOURCE_HOST} --port=${SOURCE_PORT} --user=${SOURCE_USER} --password=${SOURCE_PASSWORD} -- util copy-instance "mysql://${MYDUCK_USER}:${MYDUCK_PASSWORD}@${MYDUCK_HOST}:${MYDUCK_PORT}" --users false --consistent false --ignore-existing-objects true --handle-grant-errors ignore --threads $THREAD_COUNT --bytesPerChunk 256M --ignore-version true)

if [[ $GTID_MODE == "ON" ]]; then
# Extract the EXECUTED_GTID_SET from this output:
Expand Down
19 changes: 10 additions & 9 deletions devtools/replica-setup-mysql/start_replication.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@
OS=$(uname -s)

# if [[ $SOURCE_IS_EMPTY -eq 0 ]]; then
# EXECUTED_GTID_SET=$(mysqlsh --host="$MYSQL_HOST" --user="$MYSQL_USER" --password="$MYSQL_PASSWORD" --sql -e "SHOW BINARY LOG STATUS\G" | grep -i "Executed_Gtid_Set" | awk -F': ' '{print $2}')
# EXECUTED_GTID_SET=$(mysqlsh --host="$SOURCE_HOST" --user="$SOURCE_USER" --password="$SOURCE_PASSWORD" --sql -e "SHOW BINARY LOG STATUS\G" | grep -i "Executed_Gtid_Set" | awk -F': ' '{print $2}')
# if [[ -z "$EXECUTED_GTID_SET" ]]; then
# echo "Failed to get executed GTID set by statement 'SHOW BINARY LOG STATUS\G'. Trying to get it by statement 'SHOW MASTER STATUS\G'..."
# EXECUTED_GTID_SET=$(mysqlsh --host="$MYSQL_HOST" --user="$MYSQL_USER" --password="$MYSQL_PASSWORD" --sql -e "SHOW MASTER STATUS\G" | grep -i "Executed_Gtid_Set" | awk -F': ' '{print $2}')
# EXECUTED_GTID_SET=$(mysqlsh --host="$SOURCE_HOST" --user="$SOURCE_USER" --password="$SOURCE_PASSWORD" --sql -e "SHOW MASTER STATUS\G" | grep -i "Executed_Gtid_Set" | awk -F': ' '{print $2}')
# fi
# fi

if [[ "${MYDUCK_IN_DOCKER}" =~ "true" ]] && [[ "$OS" == "Darwin" ]] && ([[ "${MYSQL_HOST}" == "127.0.0.1" ]] || [[ "${MYSQL_HOST}" == "localhost" ]] || [[ "${MYSQL_HOST}" == "0.0.0.0" ]]); then
MYSQL_HOST_FOR_REPLICA="host.docker.internal"
if [[ "${MYDUCK_IN_DOCKER}" == "true" && "$OS" == "Darwin" &&
("${SOURCE_HOST}" == "127.0.0.1" || "${SOURCE_HOST}" == "localhost" || "${SOURCE_HOST}" == "0.0.0.0") ]]; then
SOURCE_HOST_FOR_REPLICA="host.docker.internal"
else
MYSQL_HOST_FOR_REPLICA="${MYSQL_HOST}"
SOURCE_HOST_FOR_REPLICA="${SOURCE_HOST}"
fi

# Use the EXECUTED_GTID_SET variable from the previous steps
Expand All @@ -26,10 +27,10 @@ fi

# Connect to MySQL and execute the replication configuration commands
REPLICATION_CMD="CHANGE REPLICATION SOURCE TO \
SOURCE_HOST='${MYSQL_HOST_FOR_REPLICA}', \
SOURCE_PORT=${MYSQL_PORT}, \
SOURCE_USER='${MYSQL_USER}', \
SOURCE_PASSWORD='${MYSQL_PASSWORD}'"
SOURCE_HOST='${SOURCE_HOST_FOR_REPLICA}', \
SOURCE_PORT=${SOURCE_PORT}, \
SOURCE_USER='${SOURCE_USER}', \
SOURCE_PASSWORD='${SOURCE_PASSWORD}'"

if [ $GTID_MODE == "OFF" ]; then
REPLICATION_CMD="${REPLICATION_CMD}, \
Expand Down
92 changes: 92 additions & 0 deletions devtools/replica-setup-postgres/replica_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/bin/bash

usage() {
echo "Usage: $0 --postgres_host <host> --postgres_port <port> --postgres_user <user> --postgres_password <password> [--myduck_host <host>] [--myduck_port <port>] [--myduck_user <user>] [--myduck_password <password>] [--myduck_in_docker <true|false>]"
exit 1
}

MYDUCK_HOST=${MYDUCK_HOST:-127.0.0.1}
MYDUCK_PORT=${MYDUCK_PORT:-5432}
MYDUCK_USER=${MYDUCK_USER:-mysql}
MYDUCK_PASSWORD=${MYDUCK_PASSWORD:-}
MYDUCK_SERVER_ID=${MYDUCK_SERVER_ID:-2}
MYDUCK_IN_DOCKER=${MYDUCK_IN_DOCKER:-false}

while [[ $# -gt 0 ]]; do
case $1 in
--postgres_host)
SOURCE_HOST="$2"
shift 2
;;
--postgres_port)
SOURCE_PORT="$2"
shift 2
;;
--postgres_user)
SOURCE_USER="$2"
shift 2
;;
--postgres_password)
SOURCE_PASSWORD="$2"
shift 2
;;
--myduck_host)
MYDUCK_HOST="$2"
shift 2
;;
--myduck_port)
MYDUCK_PORT="$2"
shift 2
;;
--myduck_user)
MYDUCK_USER="$2"
shift 2
;;
--myduck_password)
MYDUCK_PASSWORD="$2"
shift 2
;;
--myduck_server_id)
MYDUCK_SERVER_ID="$2"
shift 2
;;
--myduck_in_docker)
MYDUCK_IN_DOCKER="$2"
shift 2
;;
*)
echo "Unknown parameter: $1"
usage
;;
esac
done

# Check if all parameters are set
if [[ -z "$SOURCE_HOST" || -z "$SOURCE_PORT" || -z "$SOURCE_USER" ]]; then
echo "Error: Missing required Postgres connection variables: SOURCE_HOST, SOURCE_PORT, SOURCE_USER."
usage
fi

# Step 1: Check Postgres configuration
echo "Checking Postgres configuration..."
# TODO(neo.zty): add check for Postgres configuration

# Step 2: Establish replication
echo "Starting replication..."
export PUBLICATION_NAME="myduck_publication"
export SUBSCRIPTION_NAME="myduck_subscription"

CREATE_SUBSCRIPTION_SQL="CREATE SUBSCRIPTION ${SUBSCRIPTION_NAME} \
CONNECTION 'dbname=${SOURCE_DATABASE} host=${SOURCE_HOST} port=${SOURCE_PORT} user=${SOURCE_USER} password=${SOURCE_PASSWORD}' \
PUBLICATION ${PUBLICATION_NAME};"

psql -h $MYDUCK_HOST -p $MYDUCK_PORT -U $MYDUCK_USER <<EOF
${CREATE_SUBSCRIPTION_SQL}
EOF

if [[ -n "$?" ]]; then
echo "SQL executed successfully."
else
echo "SQL execution failed. Check the error message above."
exit 1
fi
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ WORKDIR /home/admin
COPY --from=builder /myduckserver /usr/local/bin/myduckserver
COPY --chown=admin:admin --chmod=755 docker/*.sh .
COPY --chown=admin:admin --chmod=755 devtools/replica-setup-mysql ./replica-setup-mysql
COPY --chown=admin:admin --chmod=755 devtools/replica-setup-postgres ./replica-setup-postgres

# ENV LC_CTYPE="en_US.UTF-8"
# ENV LANG="en_US.UTF-8"
Expand Down
Loading

0 comments on commit 95af164

Please sign in to comment.