diff --git a/.github/workflows/manual-build.yml b/.github/workflows/manual-build.yml
index 944f903..24125d7 100644
--- a/.github/workflows/manual-build.yml
+++ b/.github/workflows/manual-build.yml
@@ -1,11 +1,34 @@
+#---
+#name: Manual Build & Push
+#on:
+#  workflow_dispatch:
+#    inputs:
+#      platforms:
+#        description: 'The platforms for which the Docker image should be built. If not specified, defaults to linux/amd64.'
+#        required: false
+#        default: 'linux/amd64,linux/arm64/v8,linux/riscv64'
+#jobs:
+#  build-push:
+#    uses: kbase/.github/.github/workflows/reusable_build-push.yml@develop
+#    with:
+#      name: '${{ github.event.repository.name }}-develop'
+#      tags: br-${{ github.ref_name }}
+#      platforms: ${{ github.event.inputs.platforms }}
+#    secrets: inherit
 ---
 name: Manual Build & Push
 on:
-  workflow_dispatch:
+  workflow_dispatch:
+    inputs:
+      platforms:
+        description: 'The platforms for which the Docker image should be built. If not specified, defaults to linux/amd64,linux/arm64/v8.'
+        required: false
+        default: 'linux/amd64,linux/arm64/v8'
 jobs:
   build-push:
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@develop
     with:
       name: '${{ github.event.repository.name }}-develop'
       tags: br-${{ github.ref_name }}
-    secrets: inherit
+      platforms: ${{ github.event.inputs.platforms }}
+    secrets: inherit
diff --git a/Dockerfile b/Dockerfile
index 7ec1da0..8a11b48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,7 @@
-FROM alpine:latest
+# Builder stage: only used to obtain the rclone binary
+FROM alpine:3.19 AS builder
 
-# Update and install necessary packages
-RUN apk update && \
-    apk add p7zip rclone
+RUN apk add --no-cache rclone
 
 # Create config directory
 RUN mkdir -p /root/.config/rclone/
@@ -11,6 +10,16 @@ RUN mkdir -p /root/.config/rclone/
 COPY rclone.conf /root/.config/rclone/rclone.conf
 COPY app/ /app/
 
+# Final stage
+FROM alpine:3.19
+
+RUN apk add --no-cache curl p7zip
+
+# Copy necessary binaries and files from builder stage
+COPY --from=builder /usr/bin/rclone /usr/bin/rclone
+COPY --from=builder /root/.config/rclone/rclone.conf /root/.config/rclone/rclone.conf
+COPY --from=builder /app/ /app/
+
 WORKDIR /app
 
-ENTRYPOINT /app/zip2cloud
\ No newline at end of file
+ENTRYPOINT ["/app/zip2cloud"]
diff --git a/app/README.md b/app/README.md
new file mode 100644
index 0000000..4002b0f
--- /dev/null
+++ b/app/README.md
@@ -0,0 +1,48 @@
+
+## Zip2Cloud
+
+A robust zip & upload utility for sending archives to a remote location.
+
+### Features
+
+- Intelligently compares local & remote files with md5 sums
+- Only uploads _completed_ archives
+- Only deletes local files once they have been successfully uploaded
+- Allows keeping an arbitrary number of zipped & unzipped backups locally for faster restore
+  - Script only zips & uploads files that are missing from the remote location
+- Allows mixing backup files with other data
+  - Only zips folders under the `$DUMP_BASE` directory with a date-based name, e.g. `2024-04-01`
+- Notifies on completion or error via Slack
+
+### Operation of `zip2cloud`
+
+- Uses `rclone` to create a list of `.7z` & `.md5` files from the remote location defined with the `REMOTE` environment variable
+- For each file in the list, pulls the remote md5 sum into a temporary `tmp_md5` directory under `$ZIP_DIR`
+- Compares file names & md5 sums between local & remote locations prior to read/write operations
+  - Uploads any `.7z` files that are missing from the remote location
+  - Files with mismatched md5 sums are uploaded with alternate filenames
+  - Only deletes files locally once they have been successfully uploaded & md5 sums confirmed
+- Allows multiple unzipped local backups to remain, without re-zipping & uploading
+  - This allows for faster restores, as we can avoid downloading the most recent archives
+
+1. Creates 7zip archives of any directories under the `$DUMP_BASE` with a date-based name that contain a `dump_complete.txt` marker file
+   - For example, if `$DUMP_BASE` is `/dump/full_backup`, the directory `2024-04-01` will be archived as `$ZIP_DIR/${ZIP_BASE}_2024-04-01.7z`
+2. Syncs the archives to a remote location using rclone
+
+### Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `BUCKET` | The bucket to store the backups | |
+| `BUCKETPATH` | The path within the bucket to store the backups | |
+| `COMPRESSION_LEVEL` | The 7zip compression level (`-mx`); use `0` if the dumps are already compressed | `0` |
+| `DELETE_DUMP` | When non-empty, logs that dumps under `/dump/` will be deleted after compressing | |
+| `DUMP_BASE` | The base directory for backup dumps | `/dump/full_backup` |
+| `DUMP_RETENTION` | The number of most recent uncompressed backups to keep locally | `3` |
+| `REMOTE` | The remote location to sync backups to | `remote:${BUCKET}/${BUCKETPATH}` |
+| `SECRET` | The encryption key for 7zip | contents of `/run/secrets/encryption_key` |
+| `SLACK_CHANNEL` | The slack channel to send notifications to | |
+| `SLACK_WEBHOOK` | The webhook URL for slack notifications | |
+| `ZIP_BASE` | The base filename, minus date, for the compressed backups | `backup_full` |
+| `ZIP_DIR` | The directory to store all compressed backups | `/zip` |
+| `ZIP_RETENTION` | The number of most recent compressed backups to keep locally | `4` |
diff --git a/app/zip2cloud b/app/zip2cloud
index 5ce5fc2..591739f 100755
--- a/app/zip2cloud
+++ b/app/zip2cloud
@@ -1,27 +1,19 @@
 #!/bin/sh
 
-# Script to compress and encrypt mongodb backup directories and then sync them against a
-# cloud S3 bucket
-#
-# Depends on 7zip and rclone
-#
-# sychan@lbl.gov
-# 5/21/2021
+## Variables
+COMPRESSION_LEVEL=${COMPRESSION_LEVEL:-0} # Set to 0 if the db dumps are already compressed
+DELETE_DUMP=${DELETE_DUMP:-''}
+DUMP_BASE=${DUMP_BASE:-/dump/full_backup}
+DUMP_RETENTION=${DUMP_RETENTION:-3}
+REMOTE=${REMOTE:-remote:${BUCKET}/${BUCKETPATH}}
+SECRET=${SECRET:-`cat /run/secrets/encryption_key`}
+SLACK_CHANNEL=${SLACK_CHANNEL:-''}
+SLACK_WEBHOOK=${SLACK_WEBHOOK:-''}
+ZIP_BASE=${ZIP_BASE:-backup_full}
+ZIP_DIR=${ZIP_DIR:-/zip}
+ZIP_RETENTION=${ZIP_RETENTION:-4}
 
-# Directory containing db dumps to be archived/compressed/copied
-#DUMP_BASE=/Users/jsfillman/Documents/repos/jsfillman-github/tmp-backup-test
-DUMP_BASE=/dump/full_backup
-
-# Directory to put the zipped backups
-#ZIP_DIR=/Users/jsfillman/Documents/repos/jsfillman-github/tmp-backup-zip
-ZIP_DIR=/zip
-
-NOW=$(/bin/date +"%Y%m%d%H%M")
-
-# Name of the zip'ed db backup. The .7z extension wil be added by the 7zip program
-
-ZIP_BASE=backup_full
-#ZIP_NAME=${ZIP_BASE}${NOW}
+### Cleanup
 
 [ -r /run/secrets/encryption_key ] || { echo "Encryption key not readable in /run/secrets/encryption_key" ; exit 1; }
 [ -r /run/secrets/gcp_backup_creds ] || { echo "Google cloud service credentials not found in /run/secrets/gcp_back_creds" ; exit 1; }
@@ -29,37 +21,135 @@ ZIP_BASE=backup_full
 [ -z "${BUCKETPATH}" ] && { echo "Path within S3 bucket not set in BUCKETPATH environment variable" ; exit 1; }
 [ -z "${DELETE_DUMP}" ] || echo "DELETE_DUMP set, will delete files/directories under /dump/ when done compressing"
 
-## This is the password used to generate the AES256 encryption key
-#SECRET=tempsecret
-SECRET=`cat /run/secrets/encryption_key`
-#
-## This is the Google Cloud Storage path, note that it depends on rclone being preconfigured
-## for "remote" using the runtime creds, check rclone config in /root/.config/rclone/rclone.conf
-REMOTE=remote:${BUCKET}/${BUCKETPATH}
 
-# Delete any files older than 30 days in the zip directory
-echo "Deleting database archives older than 30 days"
-/usr/bin/find ${ZIP_DIR} -mtime +30 -type f -name "${ZIP_BASE}*" -print -exec rm {} \;
+# Delete all old zip files, except the last N, as defined by $ZIP_RETENTION
+# (`tail -n +K` prints from line K onward, so K must be N+1 to keep N files)
+rm -rf ${ZIP_DIR}/tmp_md5
+ls -t ${ZIP_DIR}/*.7z 2>/dev/null | tail -n +$((ZIP_RETENTION + 1)) | xargs -r rm -f
+ls -t ${ZIP_DIR}/*.md5 2>/dev/null | tail -n +$((ZIP_RETENTION + 1)) | xargs -r rm -f
+
+# Delete all old backup dumps, except the last N, as defined by $DUMP_RETENTION
+find ${DUMP_BASE} -type d -name "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]" -print0 | xargs -0 -r ls -td | tail -n +$((DUMP_RETENTION + 1)) | xargs -r -I {} rm -rf {}
+
+### End Cleanup
+
+
+# Get list of remote backups
+remote_files=$(rclone ls ${REMOTE} | grep 7z | awk '{print $2}' | rev | cut -d. -f2- | rev)
+# Pull remote md5 sums for each remote backup into `tmp_md5` directory
+# (`$_` is not defined in POSIX sh, so the path is spelled out)
+mkdir -p ${ZIP_DIR}/tmp_md5 && cd ${ZIP_DIR}/tmp_md5
+for file in $remote_files; do
+    rclone md5sum ${REMOTE}/$file.7z | awk '{print $1}' > ${ZIP_DIR}/tmp_md5/$file.md5
+done
 
-echo "Zipping ${DUMP_BASE}/${DUMP_DIR} to ${ZIP_DIR}/${ZIP_NAME}"
 
-# Get all directories in DUMP_BASE
+# Get all exports from DUMP_BASE
 for DUMP_DIR in $(ls -d ${DUMP_BASE}/*/); do
+    # Check if the dump is complete
+    echo "Checking export for ${DUMP_DIR}"
+    if [ ! -f "${DUMP_DIR}/dump_complete.txt" ]; then
+        echo "dump_complete.txt not found in ${DUMP_DIR}, skipping"
+        continue
+    fi
     # Remove trailing slash and get the base name of the directory
     DIR_NAME=$(basename ${DUMP_DIR%/})
     ZIP_NAME=${ZIP_DIR}/${ZIP_BASE}_${DIR_NAME}
+    echo $DIR_NAME
+    # Check if the corresponding md5 file exists, if not, zip it
+    if [ ! -f "${ZIP_DIR}/tmp_md5/${ZIP_BASE}_${DIR_NAME}.md5" ]; then
+        echo "No remote exists for ${DIR_NAME}, zipping"
+        /usr/bin/7z a "-p${SECRET}" ${ZIP_NAME} -mx=${COMPRESSION_LEVEL} -mhe -t7z ${DUMP_DIR} || { echo "Could not zip ${DUMP_DIR} into ${ZIP_NAME}" ; exit 1; }
+    else
+        echo "Remote exists for ${DIR_NAME}, skipping"
+    fi
+done
 
-    echo "Zipping ${DUMP_DIR} to ${ZIP_NAME}"
-    /usr/bin/7za a -p${SECRET} ${ZIP_NAME} -mx=0 -mhe -t7z ${DUMP_DIR} || { echo "Could not zip ${DUMP_DIR} into ${ZIP_NAME}" ; exit 1; }
+# Compare checksums of local 7z files against all remotes' md5's. Add to upload list if not found
+uploads=""
+cd ${ZIP_DIR} || exit
+for file in ${ZIP_DIR}/*.7z; do
+    # Skip the literal pattern when no archives exist
+    [ -f "$file" ] || continue
+    # Get the base name of the file without extension
+    base_name=$(basename "$file" .7z)
+    local_md5=$(md5sum "$file" | awk '{print $1}')
+    echo $local_md5 > "${ZIP_DIR}/${base_name}.md5"
+    # Now compare this file with the remote md5s
+    match_found=0
+    for remote_md5_file in ${ZIP_DIR}/tmp_md5/*.md5; do
+        remote_md5=$(cat "$remote_md5_file")
+        if [ "$local_md5" = "$remote_md5" ]; then
+            match_found=1
+            break
+        fi
+    done
+    if [ $match_found -eq 0 ]; then
+        echo "Adding $file to uploads list"
+        uploads="$uploads $file"
+    fi
 done
 
-## Sync All Resulting Files
-cd ${ZIP_DIR}
-for file in ${ZIP_DIR}/*; do
-    echo "RClone-ing ${file} to GCP ${GCP_DEST}"
-    /bin/rclone sync -v "$file" ${REMOTE}/
+echo "Current uploads candidates are: $uploads"
+
+## Verify & update list of files to upload
+final_uploads=""
+cd ${ZIP_DIR} || exit
+for file in ${uploads}; do
+    # Get the base name of the file without extension
+    base_name=$(basename "$file" .7z)
+    # No remote md5 under this name: the archive is new, upload it unchanged
+    if [ ! -f "${ZIP_DIR}/tmp_md5/${base_name}.md5" ]; then
+        echo "Remote does not exist for $file, adding $file to uploads list"
+        final_uploads="$final_uploads $file"
+        continue
+    fi
+    # Compare local and remote md5
+    remote_md5=$(cat "${ZIP_DIR}/tmp_md5/${base_name}.md5")
+    local_md5=$(cat "${ZIP_DIR}/${base_name}.md5")
+    if [ "$local_md5" != "$remote_md5" ]; then
+        echo "MD5 mismatch for file $file. Incrementing filename and adding to uploads list."
+        # Pick the alternate name with POSIX `case` (no bash-only `[[ ]]` / `${var: -1}`)
+        case "$base_name" in
+            *[a-y])
+                # Last character is a letter before 'z': increment it
+                last_char=${base_name#"${base_name%?}"}
+                next_char=$(echo "$last_char" | tr "a-y" "b-z")
+                new_base_name=${base_name%?}$next_char
+                ;;
+            *z)
+                # If it's 'z', replace it with 'a' and append 'a'
+                new_base_name=${base_name%?}aa
+                ;;
+            *)
+                # If it's not a letter, append 'a'
+                new_base_name=${base_name}a
+                ;;
+        esac
+        # Rename the file
+        mv "$file" "${ZIP_DIR}/${new_base_name}.7z"
+        # Add the renamed file to the uploads list
+        final_uploads="$final_uploads ${ZIP_DIR}/${new_base_name}.7z"
+    fi
 done
 
-## Create a block that, upon success of rclone above, delete _only_ files that were uploaded
-## For each $FILE.7z in $ZIP_DIR, do a "rm -rf $DUMP_BASE/$FILE" to remove the original dump
-#[ -z "${DELETE_DUMP}" ] || { echo "Clearing contents of /dump/"; cd /dump/; rm -rf *; }
+echo "Final uploads: $final_uploads"
+
+
+# Before running rclone
+#for file in "${uploads[@]}"; do
+for file in ${final_uploads}; do
+    ls $file
+    if [ ! -f "$file" ]; then
+        echo "File does not exist: $file"
+    fi
+done
+
+
+
+## Sync All Resulting Files (in list!)
+cd ${ZIP_DIR} || exit
+for file in ${final_uploads}; do
+    echo "RClone-ing ${file} to GCP ${REMOTE}"
+    /usr/bin/rclone sync -v "$file" ${REMOTE}/ || { echo "Could not upload ${file} to ${REMOTE}" ; exit 1; }
+done