#!/bin/bash -e
# This script should always run as if it were being called from
# the directory it lives in.
script_directory="$(
cd "$(dirname "$0")" || exit
pwd
)"
cd "$script_directory" || exit
print_description() {
# shellcheck disable=SC2016
echo 'This script can be used to deploy and update a `refine.bio` instance stack.'
echo 'It will create all of the AWS infrastructure (roles/instances/db/network/etc),'
echo 'open an ingress, kill all running Batch jobs, perform a database migration,'
echo 're-define and re-register Batch job specifications, and finally close the'
echo 'ingress. This can be run from a CI/CD machine or a local dev box.'
}
print_options() {
echo 'This script accepts the following arguments: -e, -d, -i, -v, -u, -r, and -h.'
echo '-h prints this help message and exits.'
echo '-e specifies the environment you would like to deploy to and is not optional. Its valid values are:'
echo ' "-e prod" will deploy the production stack. This should only be used from a CD machine.'
echo ' "-e staging" will deploy the staging stack. This should only be used from a CD machine.'
echo ' "-e dev" will deploy a dev stack which is appropriate for a single developer to use to test.'
echo '-r may be used to override the Dockerhub repo where the images will be pulled from.'
echo ' This may also be specified by setting the TF_VAR_dockerhub_repo environment variable.'
echo ' If unset, it defaults to "ccdlstaging" when the version ends with "-dev" and to "ccdl" otherwise.'
echo ' This option is useful for testing code changes. Images with the code to be tested can be pushed'
echo ' to your private Dockerhub repo and then the system will find them.'
echo '-i specifies whether On Demand EC2 instances should be used instead of Spot instances.'
echo ' It requires a value, which is passed through as TF_VAR_batch_use_on_demand_instances.'
echo '-v specifies the version of the system which is being deployed and is not optional.'
echo "-u specifies the username of the deployer. Should be the developer's name in development stacks."
echo ' This option may be omitted, in which case the TF_VAR_user variable MUST be set instead.'
echo '-d specifies the AWS region to deploy the stack to. Defaults to us-east-1.'
}
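# As a minimal illustration (the version and username below are hypothetical,
# not real deployment values), a developer stack might be deployed with:
#   ./deploy.sh -e dev -u jdoe -v v1.0.0-dev
# while a CI-driven production deploy might look like:
#   ./deploy.sh -e prod -u deployer -v v1.0.0 -d us-east-1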
while getopts ":e:d:i:v:u:r:h" opt; do
case $opt in
e)
export env=$OPTARG
export TF_VAR_stage=$OPTARG
;;
r)
export TF_VAR_dockerhub_repo=$OPTARG
;;
i)
export TF_VAR_batch_use_on_demand_instances=$OPTARG
;;
v)
export SYSTEM_VERSION=$OPTARG
export TF_VAR_system_version=$OPTARG
;;
u)
export TF_VAR_user=$OPTARG
;;
d)
export TF_VAR_region=$OPTARG
;;
h)
print_description
echo
print_options
exit 0
;;
\?)
echo "Invalid option: -$OPTARG" >&2
print_options >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
print_options >&2
exit 1
;;
esac
done
if [[ $env != "dev" && $env != "staging" && $env != "prod" ]]; then
echo 'Error: must specify environment as either "dev", "staging", or "prod" with -e.'
exit 1
fi
if [[ -z $TF_VAR_user ]]; then
echo 'Error: must specify the username by either providing the -u argument or setting TF_VAR_user.'
exit 1
fi
if [[ -z $SYSTEM_VERSION ]]; then
echo 'Error: must specify the system version with -v.'
exit 1
fi
if [[ -z $TF_VAR_dockerhub_repo ]]; then
if [[ $SYSTEM_VERSION == *"-dev" ]]; then
export TF_VAR_dockerhub_repo=ccdlstaging
else
export TF_VAR_dockerhub_repo=ccdl
fi
fi
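# For example (versions are hypothetical), "-v v1.0.0-dev" would fall back to
# the "ccdlstaging" repo above, while "-v v1.0.0" would fall back to "ccdl".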
if [[ -z $TF_VAR_region ]]; then
export TF_VAR_region=us-east-1
fi
# We have terraform emit the environment variables as a single JSON output
# variable, which we parse with the command line tool `jq` so that we can
# use them from bash.
format_environment_variables() {
echo "SYSTEM_VERSION=$SYSTEM_VERSION" >>prod_env
local IFS=$'\n'
for row in $(terraform output -json environment_variables | jq -c '.[]'); do
name=$(echo "$row" | jq -r ".name")
value=$(echo "$row" | jq -r ".value")
env_var_assignment="$name=$value"
# Exporting an expansion rather than a variable, which is exactly what we want to do.
# shellcheck disable=SC2163
export "${env_var_assignment?}"
echo "$env_var_assignment" >>prod_env
done
}
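# As an illustration (the value shown is hypothetical), a row emitted by
# `terraform output -json environment_variables | jq -c '.[]'` such as
#   {"name":"DATABASE_PORT","value":"5432"}
# would be exported into this shell and appended to prod_env as
#   DATABASE_PORT=5432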
# Load $ALL_IMAGES and helper functions.
. ../scripts/common.sh
# Make our IP address known to terraform.
TF_VAR_host_ip="$(dig +short myip.opendns.com @resolver1.opendns.com)"
export TF_VAR_host_ip
for IMAGE in $ALL_IMAGES; do
# For each image we need to set the env var that is used by our
# scripts and the env var that gets picked up by terraform because
# it is prefixed with TF_VAR_.
IMAGE_UPPER="$(echo "$IMAGE" | tr '[:lower:]' '[:upper:]')"
export "${IMAGE_UPPER}_DOCKER_IMAGE=dr_$IMAGE:$SYSTEM_VERSION"
export "TF_VAR_${IMAGE}_docker_image=dr_$IMAGE:$SYSTEM_VERSION"
done
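# For instance, if $ALL_IMAGES included "foreman" and SYSTEM_VERSION were the
# hypothetical "v1.0.0", the loop above would export
#   FOREMAN_DOCKER_IMAGE=dr_foreman:v1.0.0
#   TF_VAR_foreman_docker_image=dr_foreman:v1.0.0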
# Copy ingress config to top level so it can be applied.
cp deploy/ci_ingress.tf .
# Check if a new ccdl-ubuntu ami will be needed for this region
if [[ $(aws ec2 describe-images \
--region "$TF_VAR_region" --owners 589864003899 \
--filters 'Name=name,Values=ccdl-ubuntu-18.04-*' \
--query 'length(Images)') -eq 0 ]]; then
echo "No ccdl-ubuntu-18.04 AMI found for this region, creating a new one"
# Find most recent ccdl-ubuntu ami from us-east-1
template_ami_id=$(aws ec2 describe-images \
--region us-east-1 --owners 589864003899 \
--filters 'Name=name,Values=ccdl-ubuntu-18.04-*' \
--query 'sort_by(Images,&CreationDate)[-1].ImageId' \
--output text)
# Make a copy into this region
new_ami_name="ccdl-ubuntu-18.04-$(date "+%Y-%m-%dT%H.%M.%S")"
new_ami_id=$(aws ec2 copy-image \
--source-image-id "$template_ami_id" \
--source-region us-east-1 \
--region "$TF_VAR_region" \
--name "$new_ami_name" \
--output text)
echo "Created new AMI for $TF_VAR_region"
echo " name: $new_ami_name"
echo " id: $new_ami_id"
fi
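# Note that `aws ec2 copy-image` is asynchronous, so the copied AMI starts out
# in a "pending" state. If needed, its progress can be checked manually with
# something like:
#   aws ec2 describe-images --region "$TF_VAR_region" --image-ids "$new_ami_id" \
#       --query 'Images[0].State'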
# Always init terraform first, especially since we're using a remote backend.
./init_terraform.sh
# Terraform doesn't manage these well, so if they exist they need to be
# tainted to ensure they won't require manual intervention.
terraform taint module.batch.aws_launch_template.data_refinery_worker || true
terraform taint module.batch.aws_launch_template.data_refinery_compendia || true
if terraform state list | grep -q module.batch.aws_batch_job_queue.data_refinery_; then
terraform state list |
grep module.batch.aws_batch_job_queue.data_refinery_ |
xargs -L 1 terraform taint ||
true
fi
if terraform state list | grep -q module.batch.aws_batch_compute_environment.data_refinery_; then
terraform state list |
grep module.batch.aws_batch_compute_environment.data_refinery_ |
xargs -L 1 terraform taint ||
true
fi
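# As an illustration (the exact resource addresses depend on what is in the
# Terraform state), the pipelines above expand to commands such as
#   terraform taint module.batch.aws_batch_job_queue.data_refinery_workers_queue
# which marks the resource so the next apply destroys and recreates it.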
if terraform output | grep -q 'No outputs found'; then
ran_init_build=true
echo "No existing stack detected, applying initial terraform deployment."
# These files are inputs but are created by format_batch_with_env.sh
# based on outputs from terraform. It's a bit of a Catch-22, but we can
# get around it by providing dummy files to get bootstrapped.
touch api-configuration/environment
touch foreman-configuration/environment
# Output the plan for debugging deployments later.
# Until terraform plan supports -var-file the plan is wrong.
# terraform plan
terraform apply -var-file="environments/$env.tfvars" -auto-approve
fi
# We have to do this once before the initial deploy...
rm -f prod_env
format_environment_variables
../scripts/format_batch_with_env.sh -p api -e "$env" -o "$(pwd)/api-configuration/"
../scripts/format_batch_with_env.sh -p foreman -e "$env" -o "$(pwd)/foreman-configuration/"
if [[ -z $ran_init_build ]]; then
# Open up ingress to AWS for Circle, stop jobs, migrate DB.
echo "Deploying with ingress..."
# Output the plan for debugging deployments later.
# Until terraform plan supports -var-file the plan is wrong.
# terraform plan
terraform apply -var-file="environments/$env.tfvars" -auto-approve
fi
# Make sure that prod_env is empty since we are only appending to it.
# prod_env is a temporary file we use to pass environment variables to
# `docker run` commands when running migrations.
rm -f prod_env
# (cont'd) ...and once again after the update when this is re-run.
format_environment_variables
# Make sure to clear out any old batch job templates since we
# will register everything in this directory.
if [ -e batch-job-templates ]; then
rm -r batch-job-templates
fi
# Template the environment variables for production into the Batch Job
# definitions and API confs.
mkdir -p batch-job-templates
../scripts/format_batch_with_env.sh -p workers -e "$env" -o "$(pwd)/batch-job-templates"
../scripts/format_batch_with_env.sh -p surveyor -e "$env" -o "$(pwd)/batch-job-templates"
# API and foreman aren't run as Batch jobs, but the templater still works.
../scripts/format_batch_with_env.sh -p foreman -e "$env" -o "$(pwd)/foreman-configuration"
../scripts/format_batch_with_env.sh -p api -e "$env" -o "$(pwd)/api-configuration/"
# Remove all Batch jobs because it's the only way to be sure we don't
# have any old ones. Deleting the job queue is the easiest way to do
# this, and it will be recreated by the following run of terraform
# anyway.
python3 delete_batch_job_queue.py
# If we don't deregister these, they'll stick around and accumulate.
python3 deregister_batch_job_definitions.py
# Re-register Batch jobs (skip those that end in .tpl)
echo "Registering new job specifications..."
export AWS_DEFAULT_REGION=$AWS_REGION
# SC2010: Don't use ls | grep. Use a glob or a for loop with a condition to allow non-alphanumeric filenames.
# We are using a glob, but we want to limit it to a specific directory. This seems like an overly aggressive check.
# shellcheck disable=SC2010
for batch_job_template in $(ls -1 batch-job-templates/*.json | grep -v '\.tpl'); do
aws batch register-job-definition --cli-input-json file://"$batch_job_template" &
sleep 1
done
echo "Job registrations have been fired off."
# Get an image to run the migrations with.
docker pull --platform linux/amd64 "$DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE"
# Test that the pg_bouncer instance is up. 15 minutes should be more than enough.
start_time=$(date +%s)
diff=0
until pg_isready -d "$DATABASE_NAME" -h "$DATABASE_PUBLIC_HOST" -p "$DATABASE_PORT" -U "$DATABASE_USER" &>/dev/null || [ "$diff" -gt "900" ]; do
echo "Waiting for the pg_bouncer instance to come online..."
sleep 10
((diff = $(date +%s) - start_time))
done
if ! pg_isready -d "$DATABASE_NAME" -h "$DATABASE_PUBLIC_HOST" -p "$DATABASE_PORT" -U "$DATABASE_USER" &>/dev/null; then
echo "pg_bouncer instance failed to come up after 15 minutes."
exit 1
fi
# Migrate auth.
docker run \
--env DATABASE_HOST="$DATABASE_PUBLIC_HOST" \
--env RUNNING_IN_CLOUD=False \
--env-file prod_env \
--platform linux/amd64 \
"$DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE" \
python3 manage.py migrate auth
# Apply general migrations.
docker run \
--env DATABASE_HOST="$DATABASE_PUBLIC_HOST" \
--env RUNNING_IN_CLOUD=False \
--env-file prod_env \
--platform linux/amd64 \
"$DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE" \
python3 manage.py migrate
# Create the cache table if it does not already exist.
docker run \
--env DATABASE_HOST="$DATABASE_PUBLIC_HOST" \
--env RUNNING_IN_CLOUD=False \
--env-file prod_env \
--platform linux/amd64 \
"$DOCKERHUB_REPO/$FOREMAN_DOCKER_IMAGE" \
python3 manage.py createcachetable
# Terraform doesn't manage these well, so they need to be tainted to
# ensure they won't require manual intervention.
terraform taint module.batch.aws_launch_template.data_refinery_worker
terraform taint module.batch.aws_launch_template.data_refinery_compendia
terraform state list |
grep module.batch.aws_batch_job_queue.data_refinery_ |
xargs -L 1 terraform taint ||
true
# Ensure the latest image version is being used for the Foreman
terraform taint aws_instance.foreman_server_1
# Remove the ingress config so the next `terraform apply` will remove
# access for Circle. Leave this for staging and dev so end-to-end
# tests can run and have access to the database since they don't use
# test databases.
if [ "$env" = "prod" ]; then
echo "Removing ingress..."
rm ci_ingress.tf
fi
terraform apply -var-file="environments/$env.tfvars" -auto-approve
# We try to avoid rebuilding the API server because we can only run certbot
# 5 times a week. Therefore we pull the newest image and restart the API
# this way rather than by tainting the server like we do for foreman.
chmod 600 data-refinery-key.pem
API_IP_ADDRESS=$(terraform output -json api_server_1_ip | tr -d '"')
# Check SSH connection.
if ! ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "exit"; then
# Note: `$?` here reflects the negated `if !` test (i.e. 0), so exit with an
# explicit non-zero code instead.
echo "Could not SSH into the API server. Did you rotate the SSH keys recently?"
exit 1
fi
# To check whether the docker container needs to be stopped before
# it can be started, grep for the name of the container. However, if
# it's not found then grep will return a non-zero exit code, so in that
# case we fall back to an empty string.
container_running=$(ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "docker ps -a" 2>/dev/null | grep dr_api || echo "")
# If $container_running is empty, then it's because the container isn't running.
# If the container isn't running, then it's because the instance is spinning up.
# The container will be started by the API's init script, so no need to do anything more.
# However if $container_running isn't empty then we need to stop and restart it.
if [[ -n $container_running ]]; then
echo "Restarting API with latest image."
# shellcheck disable=SC2029
ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "docker pull $DOCKERHUB_REPO/$API_DOCKER_IMAGE"
ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "docker rm -f dr_api"
scp -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
api-configuration/environment "ubuntu@$API_IP_ADDRESS:/home/ubuntu/environment"
# Ensure the API's static file dir exists and is accessible.
ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "mkdir -m a+rwx -p /var/www/volumes_static"
# shellcheck disable=SC2029
ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "docker run \
--detach \
--env DATABASE_HOST=$DATABASE_HOST \
--env DATABASE_NAME=$DATABASE_NAME \
--env DATABASE_PASSWORD=$DATABASE_PASSWORD \
--env DATABASE_USER=$DATABASE_USER \
--env ELASTICSEARCH_HOST=$ELASTICSEARCH_HOST \
--env ELASTICSEARCH_PORT=$ELASTICSEARCH_PORT \
--env-file environment \
--interactive \
--log-driver=awslogs \
--log-opt awslogs-group=data-refinery-log-group-$USER-$STAGE \
--log-opt awslogs-region=$AWS_REGION \
--log-opt awslogs-stream=log-stream-api-$USER-$STAGE \
--name=dr_api \
--platform linux/amd64 \
--publish 8081:8081 \
--restart always \
--tty \
--volume /var/www/volumes_static:/tmp/www/static \
$DOCKERHUB_REPO/$API_DOCKER_IMAGE \
/bin/sh -c /home/user/collect_and_run_uwsgi.sh"
# Don't leave secrets lying around.
ssh -o StrictHostKeyChecking=no \
-o ServerAliveInterval=15 \
-o ConnectTimeout=5 \
-i data-refinery-key.pem \
"ubuntu@$API_IP_ADDRESS" "rm -f environment"
fi
echo "Deploy completed successfully."
# Remove Docker images created more than 30 days ago.
echo "Cleaning up Docker images."
docker image prune -a --force --filter "until=720h"
echo "Cleanup completed."