Merge pull request #163 from stanfordnmbl/dev

AWS scaling + any scaling

antoinefalisse authored Jun 25, 2024
2 parents c2ce1f7 + c6bce4d commit cc6caeb
Showing 26 changed files with 965 additions and 48 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/README.md
@@ -0,0 +1,4 @@
# Workflows

The actions only push the opencap (-dev) image to ECR, not the openpose (-dev) and mmpose (-dev) images.
The mmpose (-dev) image is too big and the action fails. To push those images to ECR, do it manually through the Makefile by running `make build` and then `make push`.
70 changes: 70 additions & 0 deletions .github/workflows/ecr-dev.yml
@@ -0,0 +1,70 @@
# This workflow will build and push a new container image to Amazon ECR,
# and then will deploy a new task definition to Amazon ECS, on every push
# to the master branch.
#
# To use this workflow, you will need to complete the following set-up steps:
#
# 1. Create an ECR repository to store your images.
# For example: `aws ecr create-repository --repository-name my-ecr-repo --region us-east-2`.
# Replace the value of `ECR_REPOSITORY` in the workflow below with your repository's name.
# Replace the value of `aws-region` in the workflow below with your repository's region.
#
# 2. Create an ECS task definition, an ECS cluster, and an ECS service.
# For example, follow the Getting Started guide on the ECS console:
# https://us-east-2.console.aws.amazon.com/ecs/home?region=us-east-2#/firstRun
# Replace the values for `service` and `cluster` in the workflow below with your service and cluster names.
#
# 3. Store your ECS task definition as a JSON file in your repository.
# The format should follow the output of `aws ecs register-task-definition --generate-cli-skeleton`.
# Replace the value of `task-definition` in the workflow below with your JSON file's name.
# Replace the value of `container-name` in the workflow below with the name of the container
# in the `containerDefinitions` section of the task definition.
#
# 4. Store an IAM user access key in GitHub Actions secrets named `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
# See the documentation for each action used below for the recommended IAM policies for this IAM user,
# and best practices on handling the access key credentials.

on:
  push:
    branches:
      - dev

name: Deploy to Amazon ECS

jobs:
  deploy:
    name: Deploy OpenCap
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v1

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: opencap/opencap-dev
          IMAGE_TAG: latest # ${{ github.sha }}
        run: |
          # Build a docker container and
          # push it to ECR so that it can
          # be deployed to ECS.
          docker build -f docker/Dockerfile -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
          echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"

      - name: Force deployment
        run: |
          aws ecs update-service --cluster opencap-processing-cluster-dev --service worker --force-new-deployment
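
The final "Force deployment" step makes ECS replace the service's running tasks so they pull the freshly pushed latest image. For reference, a minimal boto3 equivalent of that CLI call (a sketch; it assumes boto3 is installed and AWS credentials are configured in the environment):

```python
# Equivalent of: aws ecs update-service --cluster opencap-processing-cluster-dev
#                --service worker --force-new-deployment
import boto3

ecs = boto3.client("ecs", region_name="us-west-2")

# forceNewDeployment restarts the service's tasks without changing the task
# definition, so the new :latest image is pulled on startup.
ecs.update_service(
    cluster="opencap-processing-cluster-dev",
    service="worker",
    forceNewDeployment=True,
)
```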
17 changes: 11 additions & 6 deletions Examples/changeSessionMetadata.py
@@ -23,6 +23,8 @@
developer use.
The available options for metadata are:
    - scalingsetup: upright_standing_pose
                    any_pose
    - openSimModel: LaiUhlrich2022
                    LaiUhlrich2022_shoulder
    - posemodel: openpose
@@ -44,12 +46,15 @@

from utils import changeSessionMetadata

session_ids = ['0d46adef-62cb-455f-9ff3-8116717cc2fe']
session_ids = ["0d46adef-62cb-455f-9ff3-8116717cc2fe"]

# Dictionary of metadata fields to change (see sessionMetadata.yaml).
newMetadata = {'openSimModel':'LaiUhlrich2022_shoulder',
               'posemodel':'hrnet',
               'augmentermodel':'v0.3',
               'filterfrequency':15,
               'datasharing':'Share processed data and identified videos'}
newMetadata = {
    'openSimModel': 'LaiUhlrich2022_shoulder',
    'posemodel': 'hrnet',
    'augmentermodel': 'v0.3',
    'filterfrequency': 15,
    'datasharing': 'Share processed data and identified videos',
    'scalingsetup': 'upright_standing_pose'
}
changeSessionMetadata(session_ids,newMetadata)
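
Per the docstring above, `scalingsetup` also accepts `any_pose`, the new option introduced in this PR. Since the comment describes `newMetadata` as a dictionary of fields to change, switching a session to any-pose scaling should only need that one field (a sketch, assuming partial dictionaries are accepted):

```python
# Hypothetical minimal call: update only the scaling setup for these sessions.
newMetadata = {'scalingsetup': 'any_pose'}
changeSessionMetadata(session_ids, newMetadata)
```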
57 changes: 53 additions & 4 deletions app.py
@@ -8,29 +8,56 @@
import logging
import glob
import numpy as np
from utilsAPI import getAPIURL, getWorkerType
from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials
from utilsAuth import getToken
from utils import getDataDirectory, checkTime, checkResourceUsage, sendStatusEmail
from utils import (getDataDirectory, checkTime, checkResourceUsage,
sendStatusEmail, checkForTrialsWithStatus)

logging.basicConfig(level=logging.INFO)

API_TOKEN = getToken()
API_URL = getAPIURL()
workerType = getWorkerType()
autoScalingInstance = getASInstance()
logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}")

# if true, will delete entire data directory when finished with a trial
isDocker = True

# get start time
t = time.localtime()
initialStatusCheck = False
t = time.localtime()

# For removing AWS machine scale-in protection
t_lastTrial = time.localtime()
justProcessed = True
with_on_prem = True
minutesBeforeRemoveScaleInProtection = 2
max_on_prem_pending_trials = 5

while True:
    # Run test trial at a given frequency to check status of machine. Stop machine if it fails.
    if checkTime(t, minutesElapsed=30) or not initialStatusCheck:
        runTestSession(isDocker=isDocker)
        t = time.localtime()
        initialStatusCheck = True

    # When using autoscaling, if there are on-prem workers, then we will remove
    # the instance scale-in protection if the number of pending trials is below
    # a threshold so that the on-prem workers are prioritized.
    if with_on_prem:
        # Query the number of pending trials.
        if autoScalingInstance:
            pending_trials = get_number_of_pending_trials()
            logging.info(f"Number of pending trials: {pending_trials}")
            if pending_trials < max_on_prem_pending_trials:
                # Remove scale-in protection and sleep in the cycle so that the
                # ASG will remove that instance from the group.
                logging.info("Removing scale-in protection (out loop).")
                unprotect_current_instance()
                logging.info("Removed scale-in protection (out loop).")
                for i in range(3600):
                    time.sleep(1)

    # workerType = 'calibration' -> just processes calibration and neutral
    # workerType = 'all' -> processes all types of trials
@@ -47,6 +74,27 @@
    if r.status_code == 404:
        logging.info("...pulling " + workerType + " trials.")
        time.sleep(1)

        # When using autoscaling, we will remove the instance scale-in protection if it hasn't
        # pulled a trial recently and there are no actively recording trials
        if (autoScalingInstance and not justProcessed and
                checkTime(t_lastTrial, minutesElapsed=minutesBeforeRemoveScaleInProtection)):
            if checkForTrialsWithStatus('recording', hours=2/60) == 0:
                # Remove scale-in protection and sleep in the cycle so that the
                # ASG will remove that instance from the group.
                logging.info("Removing scale-in protection (in loop).")
                unprotect_current_instance()
                logging.info("Removed scale-in protection (in loop).")
                for i in range(3600):
                    time.sleep(1)
            else:
                t_lastTrial = time.localtime()

        # If a trial was just processed, reset the timer.
        if autoScalingInstance and justProcessed:
            justProcessed = False
            t_lastTrial = time.localtime()

        continue

if np.floor(r.status_code/100) == 5: # 5xx codes are server faults
@@ -88,12 +136,12 @@
logging.info("processTrial({},{},trial_type={})".format(trial["session"], trial["id"], trial_type))

try:
# trigger reset of timer for last processed trial
processTrial(trial["session"], trial["id"], trial_type=trial_type, isDocker=isDocker)
# note a result needs to be posted for the API to know we finished, but we are posting them
# automatically thru procesTrial now
r = requests.patch(trial_url, data={"status": "done"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})

logging.info('0.5s pause if need to restart.')
time.sleep(0.5)
except Exception as e:
@@ -106,6 +154,7 @@
message = "A backend OpenCap machine timed out during pose detection. It has been stopped."
sendStatusEmail(message=message)
raise Exception('Worker failed. Stopped.')
justProcessed = True

    # Clean data directory
    if isDocker:
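
The autoscaling helpers imported from utilsAPI (getASInstance, unprotect_current_instance, and get_number_of_pending_trials) are not shown in this diff. The first two map naturally onto the EC2 instance-metadata service and the boto3 autoscaling API; the sketch below is an assumption about their shape, not the repository's actual implementation (get_number_of_pending_trials presumably queries the OpenCap API and is omitted here):

```python
# Hypothetical sketch of the AWS-facing helpers used by app.py.
import boto3
import requests

INSTANCE_ID_URL = "http://169.254.169.254/latest/meta-data/instance-id"

def get_instance_id():
    # The instance-metadata endpoint is only reachable from inside EC2.
    return requests.get(INSTANCE_ID_URL, timeout=2).text

def getASInstance():
    # True iff this machine is an EC2 instance that belongs to an
    # auto-scaling group.
    try:
        instance_id = get_instance_id()
    except requests.exceptions.RequestException:
        return False  # not running on EC2
    asg = boto3.client("autoscaling", region_name="us-west-2")
    response = asg.describe_auto_scaling_instances(InstanceIds=[instance_id])
    return len(response["AutoScalingInstances"]) > 0

def unprotect_current_instance():
    # Clearing scale-in protection lets the auto-scaling group terminate
    # this instance the next time it scales in.
    instance_id = get_instance_id()
    asg = boto3.client("autoscaling", region_name="us-west-2")
    group_name = asg.describe_auto_scaling_instances(
        InstanceIds=[instance_id])["AutoScalingInstances"][0]["AutoScalingGroupName"]
    asg.set_instance_protection(
        InstanceIds=[instance_id],
        AutoScalingGroupName=group_name,
        ProtectedFromScaleIn=False,
    )
```

The hour-long sleep loops after unprotect_current_instance() then simply park the worker until the auto-scaling group reclaims the now-unprotected instance.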
33 changes: 27 additions & 6 deletions docker/Makefile
@@ -1,17 +1,38 @@
BASE_NAME := 660440363484.dkr.ecr.us-west-2.amazonaws.com
REPO_NAME := opencap
PROD_BRANCH := main

# Determine the branch name
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)

# Determine image tag based on branch
ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH))
OPENCAP_IMAGE_TAG := opencap
OPENPOSE_IMAGE_TAG := openpose
MMPOSE_IMAGE_TAG := mmpose
else
OPENCAP_IMAGE_TAG := opencap-dev
OPENPOSE_IMAGE_TAG := openpose-dev
MMPOSE_IMAGE_TAG := mmpose-dev
endif


.PHONY: build
build:
	wget -c -O ../mmpose/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth https://mc-opencap-public.s3.us-west-2.amazonaws.com/mmpose_pth/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
	wget -c -O ../mmpose/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth https://mc-opencap-public.s3.us-west-2.amazonaws.com/mmpose_pth/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth
	docker build -t 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/opencap .. -f Dockerfile
	docker build -t 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/openpose .. -f openpose/Dockerfile
	docker build -t 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/mmpose .. -f mmpose/Dockerfile

	docker build -t $(BASE_NAME)/$(REPO_NAME)/$(OPENCAP_IMAGE_TAG) .. -f Dockerfile
	docker build -t $(BASE_NAME)/$(REPO_NAME)/$(OPENPOSE_IMAGE_TAG) .. -f openpose/Dockerfile
	docker build -t $(BASE_NAME)/$(REPO_NAME)/$(MMPOSE_IMAGE_TAG) .. -f mmpose/Dockerfile

.PHONY: push
push:
	aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com
	docker push 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/opencap
	docker push 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/openpose
	docker push 660440363484.dkr.ecr.us-west-2.amazonaws.com/opencap/mmpose

	docker push $(BASE_NAME)/$(REPO_NAME)/$(OPENCAP_IMAGE_TAG)
	docker push $(BASE_NAME)/$(REPO_NAME)/$(OPENPOSE_IMAGE_TAG)
	docker push $(BASE_NAME)/$(REPO_NAME)/$(MMPOSE_IMAGE_TAG)

.PHONY: run
run:
39 changes: 28 additions & 11 deletions main.py
@@ -13,6 +13,9 @@
import yaml
import traceback

import logging
logging.basicConfig(level=logging.INFO)

from utils import importMetadata, loadCameraParameters, getVideoExtension
from utils import getDataDirectory, getOpenPoseDirectory, getMMposeDirectory
from utilsChecker import saveCameraParameters
@@ -37,7 +40,8 @@ def main(sessionName, trialName, trial_id, camerasToUse=['all'],
         scaleModel=False, bbox_thr=0.8, augmenter_model='v0.3',
         genericFolderNames=False, offset=True, benchmark=False,
         dataDir=None, overwriteAugmenterModel=False,
         filter_frequency='default', overwriteFilterFrequency=False):
         filter_frequency='default', overwriteFilterFrequency=False,
         scaling_setup='upright_standing_pose', overwriteScalingSetup=False):

    # %% High-level settings.
    # Camera calibration.
Expand Down Expand Up @@ -109,6 +113,14 @@ def main(sessionName, trialName, trial_id, camerasToUse=['all'],
    else:
        filtFreqs = {'gait':filterfrequency, 'default':filterfrequency}

    # If scaling setup defined through web app.
    # If overwriteScalingSetup is True, the scaling setup is the one
    # passed as an argument to main(). This is useful for local testing.
    if 'scalingsetup' in sessionMetadata and not overwriteScalingSetup:
        scalingSetup = sessionMetadata['scalingsetup']
    else:
        scalingSetup = scaling_setup

    # %% Paths to pose detector folder for local testing.
    if poseDetector == 'OpenPose':
        poseDetectorDirectory = getOpenPoseDirectory(isDocker)
@@ -152,7 +164,10 @@
                'poseDetector': poseDetector,
                'augmenter_model': augmenterModel,
                'imageUpsampleFactor': imageUpsampleFactor,
                'openSimModel': sessionMetadata['openSimModel']}
                'openSimModel': sessionMetadata['openSimModel'],
                'scalingSetup': scalingSetup,
                'filterFrequency': filterfrequency,
                }
    if poseDetector == 'OpenPose':
        settings['resolutionPoseDetection'] = resolutionPoseDetection
    elif poseDetector == 'mmpose':
@@ -191,16 +206,15 @@
        # Intrinsics and extrinsics already exist for this session.
        if os.path.exists(
                os.path.join(camDir,"cameraIntrinsicsExtrinsics.pickle")):
            print("Load extrinsics for {} - already existing".format(
            logging.info("Load extrinsics for {} - already existing".format(
                camName))
            CamParams = loadCameraParameters(
                os.path.join(camDir, "cameraIntrinsicsExtrinsics.pickle"))
            loadedCamParams[camName] = True

        # Extrinsics do not exist for this session.
        else:
            print("Compute extrinsics for {} - not yet existing".format(
                camName))
            logging.info("Compute extrinsics for {} - not yet existing".format(camName))
            # Intrinsics ##################################################
            # Intrinsics directories.
            intrinsicDir = os.path.join(baseDir, 'CameraIntrinsics',
@@ -396,7 +410,7 @@
    if runMarkerAugmentation:
        os.makedirs(postAugmentationDir, exist_ok=True)
        augmenterDir = os.path.join(baseDir, "MarkerAugmenter")
        print('Augmenting marker set')
        logging.info('Augmenting marker set')
        try:
            vertical_offset = augmentTRC(
                pathOutputFiles[trialName],sessionMetadata['mass_kg'],
@@ -441,8 +455,11 @@
    if scaleModel:
        os.makedirs(outputScaledModelDir, exist_ok=True)
        # Path setup file.
        genericSetupFile4ScalingName = (
            'Setup_scaling_RajagopalModified2016_withArms_KA.xml')
        if scalingSetup == 'any_pose':
            genericSetupFile4ScalingName = 'Setup_scaling_LaiUhlrich2022_any_pose.xml'
        else: # by default, use upright_standing_pose
            genericSetupFile4ScalingName = 'Setup_scaling_LaiUhlrich2022.xml'

        pathGenericSetupFile4Scaling = os.path.join(
            openSimPipelineDir, 'Scaling', genericSetupFile4ScalingName)
        # Path model file.
@@ -465,11 +482,11 @@
                    thresholdTime=0.1, removeRoot=True)
                success = True
            except Exception as e:
                print(f"Attempt with thresholdPosition {thresholdPosition} failed: {e}")
                logging.info(f"Attempt identifying scaling time range with thresholdPosition {thresholdPosition} failed: {e}")
                thresholdPosition += increment # Increase the threshold for the next iteration

        # Run scale tool.
        print('Running Scaling')
        logging.info('Running Scaling')
        pathScaledModel = runScaleTool(
            pathGenericSetupFile4Scaling, pathGenericModel4Scaling,
            sessionMetadata['mass_kg'], pathTRCFile4Scaling,
@@ -507,7 +524,7 @@
        # Path TRC file.
        pathTRCFile4IK = pathAugmentedOutputFiles[trialName]
        # Run IK tool.
        print('Running Inverse Kinematics')
        logging.info('Running Inverse Kinematics')
        try:
            pathOutputIK = runIKTool(
                pathGenericSetupFile4IK, pathScaledModel,
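
The scaling hunk above shows the tail of a retry pattern: if the static-pose time range cannot be identified at the current position threshold, the threshold is relaxed and the search repeats. A standalone sketch of that pattern (the function name getScaleTimeRange and the initial values are assumptions taken from context; only the except branch appears verbatim in the diff):

```python
# Sketch of the retry loop implied by the hunk above: relax the position
# threshold until a usable static-pose window is found.
thresholdPosition = 0.003   # assumed starting value
increment = 0.001           # assumed step
maxThreshold = 0.015        # assumed cap
success = False
while not success and thresholdPosition <= maxThreshold:
    try:
        timeRange4Scaling = getScaleTimeRange(
            pathOutputFiles[trialName],
            thresholdPosition=thresholdPosition,
            thresholdTime=0.1, removeRoot=True)
        success = True
    except Exception as e:
        logging.info(f"Attempt identifying scaling time range with "
                     f"thresholdPosition {thresholdPosition} failed: {e}")
        thresholdPosition += increment  # Increase the threshold for the next iteration
```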
1 change: 1 addition & 0 deletions mmpose/loop_mmpose.py
@@ -35,6 +35,7 @@ def checkCudaPyTorch():
if os.path.isfile(video_path):
    os.remove(video_path)

checkCudaPyTorch()
while True:
    if not os.path.isfile(video_path):
        time.sleep(0.1)
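
The new checkCudaPyTorch() call fails fast if the container cannot see a GPU before entering the video-processing loop. Its body sits above this hunk and is not shown; a plausible sketch (an assumption, not the repository's actual code):

```python
import torch

def checkCudaPyTorch():
    # Abort early if PyTorch cannot reach a CUDA device: mmpose inference
    # on CPU would be far too slow for the processing loop below.
    if not torch.cuda.is_available():
        raise Exception("No CUDA device is available to PyTorch.")
```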
(OpenSim scaling setup XML; file name not shown in this view)
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<OpenSimDocument Version="40000">
    <ScaleTool name="RajagopalModified2016_withArms">
    <ScaleTool name="LaiUhlrich2022">
        <!--Mass of the subject in kg. Subject-specific model generated by scaling step will have this total mass.-->
        <mass>75.337</mass>
        <!--Height of the subject in mm. For informational purposes only (not used by scaling).-->
(second OpenSim scaling setup XML; file name not shown in this view)
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<OpenSimDocument Version="40000">
    <ScaleTool name="RajagopalModified2016_withArms">
    <ScaleTool name="LaiUhlrich2022">
        <!--Mass of the subject in kg. Subject-specific model generated by scaling step will have this total mass.-->
        <mass>75.337</mass>
        <!--Height of the subject in mm. For informational purposes only (not used by scaling).-->