diff --git a/.github/workflows/manual-build.yml b/.github/workflows/manual-build.yml
index 944f903..5eb59c2 100644
--- a/.github/workflows/manual-build.yml
+++ b/.github/workflows/manual-build.yml
@@ -1,11 +1,17 @@
 ---
 name: Manual Build & Push
 on:
-  workflow_dispatch:
+  workflow_dispatch:
+    inputs:
+      platforms:
+        description: 'The platforms for which the Docker image should be built. If not specified, defaults to linux/amd64,linux/arm64/v8.'
+        required: false
+        default: 'linux/amd64,linux/arm64/v8'
 jobs:
   build-push:
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@multi-target
     with:
       name: '${{ github.event.repository.name }}-develop'
       tags: br-${{ github.ref_name }}
+      platforms: ${{ github.event.inputs.platforms }}
     secrets: inherit
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml
index 0fa1c46..5d07745 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/pr_build.yml
@@ -11,33 +11,53 @@ on:
     - reopened
     - synchronize
     - closed
+# Defaults to building for linux/amd64,linux/arm64/v8. Can be modified by updating the PLATFORMS variable, e.g. PLATFORMS: 'linux/amd64'
+env:
+  PLATFORMS: 'linux/amd64,linux/arm64/v8'
 jobs:
+  set-platforms:
+    runs-on: ubuntu-latest
+    outputs:
+      platforms: ${{ steps.set-platforms.outputs.platforms }}
+    steps:
+      - name: Set platforms
+        id: set-platforms
+        run: echo "platforms=${PLATFORMS}" >> "$GITHUB_OUTPUT"
   build-develop-open:
     if: github.base_ref == 'develop' && github.event.pull_request.merged == false
-    uses: kbase/.github/.github/workflows/reusable_build.yml@main
+    needs: set-platforms
+    uses: kbase/.github/.github/workflows/reusable_build.yml@multi-target
+    with:
+      platforms: ${{ needs.set-platforms.outputs.platforms }}
     secrets: inherit
   build-develop-merge:
     if: github.base_ref == 'develop' && github.event.pull_request.merged == true
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    needs: set-platforms
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@multi-target
     with:
       name: '${{ github.event.repository.name }}-develop'
       tags: pr-${{ github.event.number }},latest
+      platforms: ${{ needs.set-platforms.outputs.platforms }}
     secrets: inherit
   build-main-open:
     if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == false
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    needs: set-platforms
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@multi-target
     with:
       name: '${{ github.event.repository.name }}'
      tags: pr-${{ github.event.number }}
+      platforms: ${{ needs.set-platforms.outputs.platforms }}
     secrets: inherit
   build-main-merge:
     if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == true
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    needs: set-platforms
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@multi-target
     with:
       name: '${{ github.event.repository.name }}'
       tags: pr-${{ github.event.number }},latest-rc
+      platforms: ${{ needs.set-platforms.outputs.platforms }}
     secrets: inherit
   trivy-scans:
     if: (github.base_ref == 'develop' || github.base_ref == 'main' || github.base_ref == 'master' ) && github.event.pull_request.merged == false
-    uses: kbase/.github/.github/workflows/reusable_trivy-scans.yml@main
+    uses: kbase/.github/.github/workflows/reusable_trivy-scans.yml@multi-target
     secrets: inherit
diff --git a/.github/workflows/release-main.yml b/.github/workflows/release-main.yml
index a254678..52f4517 100644
--- a/.github/workflows/release-main.yml
+++ b/.github/workflows/release-main.yml
@@ -7,19 +7,28 @@ on:
     - master
   types: [ published ]
 jobs:
+  set-platforms:
+    runs-on: ubuntu-latest
+    outputs:
+      platforms: ${{ steps.set-platforms.outputs.platforms }}
+    steps:
+      - name: Set platforms
+        id: set-platforms
+        run: echo "platforms=linux/amd64,linux/arm64/v8" >> "$GITHUB_OUTPUT"
   check-source-branch:
-    uses: kbase/.github/.github/workflows/reusable_validate-branch.yml@main
+    uses: kbase/.github/.github/workflows/reusable_validate-branch.yml@multi-target
     with:
       build_branch: '${{ github.event.release.target_commitish }}'
   validate-release-tag:
     needs: check-source-branch
-    uses: kbase/.github/.github/workflows/reusable_validate-release-tag.yml@main
+    uses: kbase/.github/.github/workflows/reusable_validate-release-tag.yml@multi-target
     with:
       release_tag: '${{ github.event.release.tag_name }}'
   build-push:
-    needs: validate-release-tag
-    uses: kbase/.github/.github/workflows/reusable_build-push.yml@main
+    needs: [ validate-release-tag, set-platforms ]
+    uses: kbase/.github/.github/workflows/reusable_build-push.yml@multi-target
     with:
       name: '${{ github.event.repository.name }}'
       tags: '${{ github.event.release.tag_name }},latest'
+      platforms: ${{ needs.set-platforms.outputs.platforms }}
     secrets: inherit
diff --git a/Dockerfile b/Dockerfile
index 5ad3780..8a11b48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,32 +1,27 @@
-FROM arangodb:3.5.3
+# Builder stage
+FROM alpine:latest AS builder
 
-# Build arguments passed into the docker command for image metadata
-ARG BUILD_DATE
-ARG COMMIT
-ARG BRANCH
-
-# RUN pip install requests docker python-json-logger structlog && \
 RUN apk update && \
-    apk add p7zip && \
-    cd /tmp && \
-    wget https://downloads.rclone.org/rclone-current-linux-amd64.zip && \
-    unzip rclone-current-linux-amd64.zip && \
-    mv rclone-v*-linux-amd64/rclone /bin/rclone && \
-    mkdir -p /root/.config/rclone/
+    apk add --no-cache curl p7zip rclone
+
+# Create config directory
+RUN mkdir -p /root/.config/rclone/
 
+# Copy necessary files
 COPY rclone.conf /root/.config/rclone/rclone.conf
 COPY app/ /app/
 
-LABEL org.label-schema.build-date=$BUILD_DATE \
-      org.label-schema.vcs-url="https://github.com/kbase/db_zip2cloud.git" \
-      org.label-schema.vcs-ref=$COMMIT \
-      org.label-schema.schema-version="1.0.0-rc1" \
-      us.kbase.vcs-branch=$BRANCH \
-      maintainer="Steve Chan sychan@lbl.gov" \
-      org.opencontainers.image.source="https://github.com/kbase/db_zip2cloud"
+# Final stage
+FROM alpine:latest
 
-WORKDIR /app
+RUN apk update && \
+    apk add --no-cache curl p7zip
 
-ENTRYPOINT /app/zip2cloud
+# Copy necessary binaries and files from builder stage
+COPY --from=builder /usr/bin/rclone /usr/bin/rclone
+COPY --from=builder /root/.config/rclone/rclone.conf /root/.config/rclone/rclone.conf
+COPY --from=builder /app/ /app/
+
+WORKDIR /app
+ENTRYPOINT ["/app/zip2cloud"]
\ No newline at end of file
diff --git a/app/README.md b/app/README.md
new file mode 100644
index 0000000..6b2372a
--- /dev/null
+++ b/app/README.md
@@ -0,0 +1,47 @@
+
+## Zip2Cloud
+
+A robust zip & upload utility for sending archives to a remote location.
+
+### Features
+
+- Intelligently compares local & remote files with md5 sums
+- Only uploads _completed_ archives
+- Only deletes local files once they have been successfully uploaded
+- Allows keeping an arbitrary number of zipped & unzipped backups locally for faster restore
+  - Script only zips & uploads files that are missing from the remote location
+
+[//]: # (- Allows mixing backup files with other data)
+
+[//]: # ( - Only zips folders under the `$DUMP_BASE` directory with a date-based name e.g. `2024-04-01`)
+
+[//]: # (- Notifies on completion or error via Slack)
+
+### Operation of `zip2cloud`
+
+1. Cleans up old zip files and backup dumps based on the retention periods set in the environment variables.
+2. Retrieves the list of remote backups and their MD5 checksums from the remote S3 bucket.
+3. Checks database dumps for completion by looking for a `dump_complete.txt` file in the dump's top-level directory.
+4. Compresses new database dumps that do not have a corresponding MD5 file in the remote S3 bucket.
+5. Compares the MD5 checksums of local and remote files.
+   1. If a local file does not have a matching MD5 checksum in the remote S3 bucket, it is added to the upload list.
+6. If there is an MD5 mismatch between a local file and the remote file of the same name, the script increments the filename of the local file (e.g. `backup_full_2024-04-01.7z` becomes `backup_full_2024-04-01a.7z`) and adds it to the upload list.
+7. Finally, it uploads all the files in the upload list to the remote S3 bucket using rclone.
+
+### Variables
+
+| Variable            | Description                                                        | Default             |
+|---------------------|--------------------------------------------------------------------|---------------------|
+| `BUCKET`            | The bucket to store the backups                                    |                     |
+| `BUCKETPATH`        | The path within the bucket to store the backups                    |                     |
+| `COMPRESSION_LEVEL` | The 7zip compression level; 0 if the dumps are already compressed  | `0`                 |
+| `DELETE_DUMP`       | If set, delete the uncompressed dumps once they are compressed     |                     |
+| `DUMP_BASE`         | The base directory for backup dumps                                | `/dump/full_backup` |
+| `DUMP_RETENTION`    | The number of uncompressed backup dumps to keep locally            | `3`                 |
+| `REMOTE`            | The remote location to sync backups to                             |                     |
+| `SECRET`            | The encryption key for 7zip                                        |                     |
+| `SLACK_CHANNEL`     | The slack channel to send notifications to                         |                     |
+| `SLACK_WEBHOOK`     | The webhook URL for slack notifications                            |                     |
+| `ZIP_BASE`          | The base filename, minus date, for the compressed backups          | `backup_full`       |
+| `ZIP_DIR`           | The directory to store all compressed backups                      | `/zip`              |
+| `ZIP_RETENTION`     | The number of compressed backups to keep locally                   | `4`                 |
\ No newline at end of file
diff --git a/app/zip2cloud b/app/zip2cloud
index 06df524..ad8c40c 100755
--- a/app/zip2cloud
+++ b/app/zip2cloud
@@ -1,25 +1,19 @@
 #!/bin/sh
-# Script to compress and encrypt mongodb backup directories and then sync them against a
-# cloud S3 bucket
-#
-# Depends on 7zip and rclone
-#
-# sychan@lbl.gov
-# 5/21/2021
-
+## Variables
+COMPRESSION_LEVEL=${COMPRESSION_LEVEL:-0} # Set to 0 if the db dumps are already compressed
+DELETE_DUMP=${DELETE_DUMP:-''}
+DUMP_BASE=${DUMP_BASE:-/dump/full_backup}
+DUMP_RETENTION=${DUMP_RETENTION:-3}
+REMOTE=${REMOTE:-remote:${BUCKET}/${BUCKETPATH}}
+SECRET=${SECRET:-$(cat /run/secrets/encryption_key)}
+SLACK_CHANNEL=${SLACK_CHANNEL:-''}
+SLACK_WEBHOOK=${SLACK_WEBHOOK:-''}
+ZIP_BASE=${ZIP_BASE:-backup_full}
+ZIP_DIR=${ZIP_DIR:-/zip}
+ZIP_RETENTION=${ZIP_RETENTION:-4}
+
 
-# Directory containing db dumps to be archived/compressed/copied
-DUMP_BASE=/dump/
-
-# Directory to put the zipped backups
-ZIP_DIR=/zip/
-
-NOW=$(/bin/date +"%Y%m%d%H%M")
-
-# Name of the zip'ed db backup.  The .7z extension wil be added by the 7zip program
-
-ZIP_BASE=backup_full_
-ZIP_NAME=${ZIP_BASE}${NOW}
-
+### Sanity Checks
+
 [ -r /run/secrets/encryption_key ] || { echo "Encryption key not readable in /run/secrets/encryption_key" ; exit 1; }
-[ -r /run/secrets/gcp_backup_creds ] || { echo "Google cloud service credentials not found in /run/secrets/gcp_back_creds" ; exit 1; }
+[ -r /run/secrets/gcp_backup_creds ] || { echo "Google cloud service credentials not found in /run/secrets/gcp_backup_creds" ; exit 1; }
@@ -27,22 +21,129 @@ ZIP_NAME=${ZIP_BASE}${NOW}
 [ -z "${BUCKET}" ] && { echo "S3 bucket not set in BUCKET environment variable" ; exit 1; }
 [ -z "${BUCKETPATH}" ] && { echo "Path within S3 bucket not set in BUCKETPATH environment variable" ; exit 1; }
 [ -z "${DELETE_DUMP}" ] || echo "DELETE_DUMP set, will delete files/directories under /dump/ when done compressing"
-# This is the password used to generate the AES256 encryption key
-SECRET=`cat /run/secrets/encryption_key`
-
-# This is the Google Cloud Storage path, note that it depends on rclone being preconfigured
-# for "remote" using the runtime creds, check rclone config in /root/.config/rclone/rclone.conf
-REMOTE=remote:${BUCKET}/${BUCKETPATH}
-
+### Cleanup
+
+# Delete all old zip files, except the last N, as defined by $ZIP_RETENTION
+# (ls -t lists newest first; tail -n +$((N + 1)) passes everything after the newest N to rm)
+rm -rf ${ZIP_DIR}/tmp_md5
+ls -t ${ZIP_DIR}/*.7z 2>/dev/null | tail -n +$((ZIP_RETENTION + 1)) | xargs -r rm -f
+ls -t ${ZIP_DIR}/*.md5 2>/dev/null | tail -n +$((ZIP_RETENTION + 1)) | xargs -r rm -f
+
+# Delete all old backup dumps, except the last N, as defined by $DUMP_RETENTION
+find ${DUMP_BASE} -type d -name "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]" -print0 | xargs -0 ls -td | tail -n +$((DUMP_RETENTION + 1)) | xargs -I {} rm -rf {}
+
+### End Cleanup
+
+
+# Get list of remote backups
+remote_files=$(rclone ls remote:${BUCKET}/${BUCKETPATH} | grep '\.7z$' | awk '{print $2}' | rev | cut -d. -f2- | rev)
+# Pull remote md5 sums for each remote backup into the `tmp_md5` directory
+mkdir -p ${ZIP_DIR}/tmp_md5 && cd ${ZIP_DIR}/tmp_md5
+for file in $remote_files; do
+    rclone md5sum remote:${BUCKET}/${BUCKETPATH}/$file.7z | awk '{print $1}' > ${ZIP_DIR}/tmp_md5/$file.md5
+done
+
+
+# Get all exports from DUMP_BASE
+for DUMP_DIR in $(ls -d ${DUMP_BASE}/*/); do
+    # Check if the dump is complete
+    echo "Checking export for ${DUMP_DIR}"
+    if [ ! -f "${DUMP_DIR}/dump_complete.txt" ]; then
+        echo "dump_complete.txt not found in ${DUMP_DIR}, skipping"
+        continue
+    fi
+    # Remove trailing slash and get the base name of the directory
+    DIR_NAME=$(basename ${DUMP_DIR%/})
+    ZIP_NAME=${ZIP_DIR}/${ZIP_BASE}_${DIR_NAME}
+    echo $DIR_NAME
+    # Check if the corresponding md5 file exists, if not, zip it
+    if [ ! -f "${ZIP_DIR}/tmp_md5/${ZIP_BASE}_${DIR_NAME}.md5" ]; then
+        echo "No remote exists for ${DIR_NAME}, zipping"
+        /usr/bin/7z a -p${SECRET} ${ZIP_NAME} -mx=${COMPRESSION_LEVEL} -mhe -t7z ${DUMP_DIR} || { echo "Could not zip ${DUMP_DIR} into ${ZIP_NAME}" ; exit 1; }
+    else
+        echo "Remote exists for ${DIR_NAME}, skipping"
+    fi
+done
+
+# Compare checksums of local 7z files against all remote md5 sums.  Add to upload list if not found
+uploads=""
+cd ${ZIP_DIR} || exit
+for file in ${ZIP_DIR}/*.7z; do
+    # Get the base name of the file without extension
+    base_name=$(basename "$file" .7z)
+    local_md5=$(md5sum "$file" | awk '{print $1}')
+    echo $local_md5 > "${ZIP_DIR}/${base_name}.md5"
+    # Now compare this file with the remote md5s
+    match_found=0
+    for remote_md5_file in ${ZIP_DIR}/tmp_md5/*.md5; do
+        remote_md5=$(cat "$remote_md5_file")
+        if [ "$local_md5" = "$remote_md5" ]; then
+            match_found=1
+            break
+        fi
+    done
+    if [ $match_found -eq 0 ]; then
+        echo "Adding $file to uploads list"
+        uploads="$uploads $file"
+    fi
+done
+
+echo "Current upload candidates are: $uploads"
+
+## Verify & update list of files to upload
+final_uploads=""
+cd ${ZIP_DIR} || exit
+for file in ${uploads}; do
+    # Get the base name of the file without extension
+    base_name=$(basename "$file" .7z)
+    # A new backup has no remote md5 to compare against; upload it as-is
+    if [ ! -f "${ZIP_DIR}/tmp_md5/${base_name}.md5" ]; then
+        final_uploads="$final_uploads $file"
+        continue
+    fi
+    # Compare local and remote md5
+    remote_md5=$(cat "${ZIP_DIR}/tmp_md5/${base_name}.md5")
+    local_md5=$(cat "${ZIP_DIR}/${base_name}.md5")
+    if [ "$local_md5" != "$remote_md5" ]; then
+        echo "MD5 mismatch for file $file. Incrementing filename and adding to uploads list."
+        # Extract the last character of the base name (${base_name: -1} is a bashism; this script runs under /bin/sh)
+        last_char=$(printf '%s' "$base_name" | tail -c 1)
+        case "$last_char" in
+            [a-y])
+                # If it's a letter a-y, increment it
+                next_char=$(printf '%s' "$last_char" | tr 'a-y' 'b-z')
+                new_base_name=${base_name%?}${next_char}
+                ;;
+            z)
+                # If it's 'z', replace it with 'a' and append 'a'
+                new_base_name=${base_name%?}aa
+                ;;
+            *)
+                # If it's not a letter, append 'a'
+                new_base_name=${base_name}a
+                ;;
+        esac
+        # Rename the archive and its md5 file to the incremented name
+        mv "$file" "${ZIP_DIR}/${new_base_name}.7z"
+        mv "${ZIP_DIR}/${base_name}.md5" "${ZIP_DIR}/${new_base_name}.md5"
+        # Add the renamed file to the uploads list
+        final_uploads="$final_uploads ${ZIP_DIR}/${new_base_name}.7z"
+    fi
+done
+
+echo "Final uploads: $final_uploads"
+
+
-# Delete any files older than 30 days in the zip directory
-echo "Deleting database archives older than 30 days"
-/usr/bin/find ${ZIP_DIR} -mtime +30 -type f -name "${ZIP_BASE}*" -print -exec rm {} \;
-
+# Sanity check before running rclone: every file in the upload list should still exist
+for file in ${final_uploads}; do
+    if [ ! -f "$file" ]; then
+        echo "File does not exist: $file"
+    fi
+done
-echo "Zipping ${DUMP_BASE}/${DUMP_DIR} to ${ZIP_DIR}/${ZIP_NAME}"
-cd /
-/usr/bin/7za a -p${SECRET} ${ZIP_DIR}/${ZIP_NAME} -mx=7 -mhe -t7z ${DUMP_BASE} || { echo "Could not zip ${DUMP_BASE} into ${ZIP_DIR}/${ZIP_NAME}" ; exit 1; }
-[ -z "${DELETE_DUMP}" ] || { echo "Clearing contents of /dump/"; cd /dump/; rm -rf *; }
-
-echo "RClone-ing ${ZIP_DIR} to GCP ${GCP_DEST}"
-/bin/rclone sync ${ZIP_DIR}/ ${REMOTE}
-
+## Sync All Resulting Files (in list!)
+cd ${ZIP_DIR} || exit
+for file in ${final_uploads}; do
+    echo "RClone-ing ${file} to GCP ${REMOTE}"
+    # Copy a single file rather than sync, so other remote archives are left untouched
+    /usr/bin/rclone copy -v "$file" ${REMOTE}/
+done
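
For reference, a minimal invocation sketch of the resulting image. The image tag and host paths below are assumptions for illustration only, not part of this PR; the environment variables, defaults, and secret paths come from `app/README.md` and the `zip2cloud` script above, and `rclone.conf` is baked into the image by the Dockerfile:

```sh
# Hypothetical local run of the backup container.
# - BUCKET and BUCKETPATH are required by the script's sanity checks.
# - /dump/full_backup is the DUMP_BASE default and expects date-named dump
#   directories (e.g. 2024-04-01), each containing a dump_complete.txt marker.
# - /zip is where the encrypted .7z archives and .md5 files are written.
docker run --rm \
    -e BUCKET=my-backup-bucket \
    -e BUCKETPATH=mongodb/prod \
    -e ZIP_RETENTION=4 \
    -e DUMP_RETENTION=3 \
    -v /data/dumps:/dump/full_backup \
    -v /data/zips:/zip \
    -v /etc/backup/encryption_key:/run/secrets/encryption_key:ro \
    -v /etc/backup/gcp_backup_creds:/run/secrets/gcp_backup_creds:ro \
    kbase/db_zip2cloud:latest
```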