diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json
index e0a6cafb..35103551 100644
--- a/.aws/terraform-jupyterhub-provisioning-policies.json
+++ b/.aws/terraform-jupyterhub-provisioning-policies.json
@@ -4,69 +4,7 @@
     {
       "Effect": "Allow",
       "Action": [
-        "ec2:AllocateAddress",
-        "ec2:AssociateAddress",
-        "ec2:AssociateRouteTable",
-        "ec2:AssociateVpcCidrBlock",
-        "ec2:AttachInternetGateway",
-        "ec2:AttachNetworkInterface",
-        "ec2:AuthorizeSecurityGroupEgress",
-        "ec2:AuthorizeSecurityGroupIngress",
-        "ec2:CreateInternetGateway",
-        "ec2:CreateLaunchTemplate",
-        "ec2:CreateLaunchTemplateVersion",
-        "ec2:CreateNatGateway",
-        "ec2:CreateNetworkAcl",
-        "ec2:CreateNetworkAclEntry",
-        "ec2:CreateNetworkInterface",
-        "ec2:CreateNetworkInterfacePermission",
-        "ec2:CreateRoute",
-        "ec2:CreateRouteTable",
-        "ec2:CreateSecurityGroup",
-        "ec2:CreateSubnet",
-        "ec2:CreateTags",
-        "ec2:CreateVpc",
-        "ec2:DeleteInternetGateway",
-        "ec2:DeleteLaunchTemplate",
-        "ec2:DeleteLaunchTemplateVersions",
-        "ec2:DeleteNatGateway",
-        "ec2:DeleteNetworkAcl",
-        "ec2:DeleteNetworkAclEntry",
-        "ec2:DeleteNetworkInterface",
-        "ec2:DeleteRoute",
-        "ec2:DeleteRouteTable",
-        "ec2:DeleteSecurityGroup",
-        "ec2:DeleteSubnet",
-        "ec2:DeleteTags",
-        "ec2:DeleteVpc",
-        "ec2:DescribeAddresses",
-        "ec2:DescribeAddressesAttribute",
-        "ec2:DescribeAvailabilityZones",
-        "ec2:DescribeInternetGateways",
-        "ec2:DescribeLaunchTemplateVersions",
-        "ec2:DescribeLaunchTemplates",
-        "ec2:DescribeNatGateways",
-        "ec2:DescribeNetworkAcls",
-        "ec2:DescribeNetworkInterfacePermissions",
-        "ec2:DescribeNetworkInterfaces",
-        "ec2:DescribeRouteTables",
-        "ec2:DescribeSecurityGroupRules",
-        "ec2:DescribeSecurityGroups",
-        "ec2:DescribeSubnets",
-        "ec2:DescribeVpcAttribute",
-        "ec2:DescribeVpcs",
-        "ec2:DetachInternetGateway",
-        "ec2:DetachNetworkInterface",
-        "ec2:DisassociateAddress",
-        "ec2:DisassociateRouteTable",
-        "ec2:DisassociateVpcCidrBlock",
-        "ec2:ModifyNetworkInterfaceAttribute",
-        "ec2:ModifyVpcAttribute",
-        "ec2:ReleaseAddress",
-        "ec2:ReplaceRoute",
-        "ec2:RevokeSecurityGroupEgress",
-        "ec2:RevokeSecurityGroupIngress",
-        "ec2:RunInstances",
+        "ec2:*",
         "ecr-public:GetAuthorizationToken",
         "eks:*",
         "elasticfilesystem:CreateFileSystem",
diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
new file mode 100755
index 00000000..63cef5b1
--- /dev/null
+++ b/.github/scripts/calculate-directory-stats.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+import csv
+import json
+import sys
+import unittest
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Iterable, Tuple
+
+TOTALS_OUTPUT_FILE = "all_users_total.json"
+# Paths match create-file-index.py and the README workflow below
+OUTPUT_DIR = "/home/ec2-user/hub-user-reports/"
+INPUT_DIR = "/home/ec2-user/hub-user-indexes"
+
+
+csv.field_size_limit(sys.maxsize)
+
+
+class DirectoryStats(defaultdict):
+    COUNTED_FIELDS = [
+        "total_size",
+        "file_count",
+        "nwb_files",
+        "nwb_size",
+        "bids_datasets",
+        "zarr_files",
+        "zarr_size",
+        "user_cache_file_count",
+        "user_cache_size",
+    ]
+    root: str
+
+    def __init__(self, root):
+        super().__init__(lambda: Counter({key: 0 for key in self.COUNTED_FIELDS}))
+        self.root = root
+
+    def increment(self, path: str, field: str, amount: int = 1):
+        if field not in self.COUNTED_FIELDS:
+            raise KeyError(
+                f"Invalid field '{field}'. Allowed fields: {self.COUNTED_FIELDS}"
+            )
+        self[path][field] += amount
+
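+    # Illustrative usage (hypothetical values, not part of the original script):
+    #   stats = DirectoryStats(root="a")
+    #   stats.increment("a/b", "file_count")       # stats["a/b"]["file_count"] == 1
+    #   stats.increment("a/b", "total_size", 512)  # stats["a/b"]["total_size"] == 512
+    # Incrementing touches only that path's Counter; counts roll up to ancestor
+    # directories later via propagate_dir().
+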
+    def propagate_dir(self, current_parent: str, previous_parent: str):
+        """Propagate counts up the directory tree."""
+        assert os.path.isabs(current_parent) == os.path.isabs(
+            previous_parent
+        ), "Both must be absolute or both relative"
+
+        highest_common = os.path.commonpath([current_parent, previous_parent])
+        assert highest_common, "highest_common must either be a target directory or /"
+
+        path_to_propagate = os.path.relpath(previous_parent, highest_common)
+        nested_dir_list = path_to_propagate.split(os.sep)[:-1]  # Exclude last directory
+
+        while nested_dir_list:
+            working_dir = os.path.join(highest_common, *nested_dir_list)
+            for field in self.COUNTED_FIELDS:
+                self[working_dir][field] += self[previous_parent][field]
+            nested_dir_list.pop()
+            previous_parent = working_dir
+
+        # Final propagation to the common root
+        for field in self.COUNTED_FIELDS:
+            self[highest_common][field] += self[previous_parent][field]
+
+    def inc_if_bids(self, parent: str, path: str):
+        """Check if a file indicates a BIDS dataset and increment the count."""
+        if path.endswith("dataset_description.json"):
+            self.increment(parent, "bids_datasets")
+
+    def inc_if_usercache(self, parent: str, filepath: str, size: int):
+        if filepath.startswith(f"{self.root}/.cache"):
+            self.increment(parent, "user_cache_file_count")
+            self.increment(parent, "user_cache_size", size)
+
+    def inc_if_nwb(self, parent: str, path: str, size: int):
+        if path.lower().endswith(".nwb"):
+            self.increment(parent, "nwb_files")
+            self.increment(parent, "nwb_size", size)
+
+    def inc_if_zarr(self, parent: str, path: str, size: int):
+        if path.lower().endswith(".zarr"):
+            self.increment(parent, "zarr_files")
+            self.increment(parent, "zarr_size", size)
+
+    @classmethod
+    def from_index(cls, username, user_tsv_file):
+        """Separated from from_data for easier testing."""
+        data = cls._iter_file_metadata(user_tsv_file)
+        return cls.from_data(username, data)
+
+    @classmethod
+    def from_data(cls, root, data: Iterable[Tuple[str, str, str, str]]):
+        """
+        Build DirectoryStats from an iterable of (filepath, size, created, modified).
+        Assumes a depth-first listing.
+        """
+        instance = cls(root=root)
+        previous_parent = ""
+
+        for filepath, size, _, _ in data:
+            parent = os.path.dirname(filepath)
+
+            instance.increment(parent, "file_count")
+            instance.increment(parent, "total_size", int(size))
+            instance.inc_if_bids(parent, filepath)
+            instance.inc_if_nwb(parent, filepath, int(size))
+            instance.inc_if_zarr(parent, filepath, int(size))
+            instance.inc_if_usercache(parent, filepath, int(size))
+
+            if previous_parent == parent:
+                continue
+            # Going deeper
+            elif not previous_parent or os.path.dirname(parent) == previous_parent:
+                previous_parent = parent
+                continue
+            else:  # Done with this directory
+                instance.propagate_dir(parent, previous_parent)
+                previous_parent = parent
+
+        # Final propagation to ensure the root directory gets counts
+        if previous_parent:  # No previous_parent means no data
+            leading_dir = previous_parent.split(os.sep)[0] or "/"
+            instance.propagate_dir(leading_dir, previous_parent)
+
+        return instance
+
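+    # A minimal sketch of the expected input: _iter_file_metadata() below yields
+    # rows of an index TSV whose columns look like (illustrative values):
+    #   alice/data/scan.nwb<TAB>1234<TAB>Sun Dec  1 09:00:00 2024<TAB>Mon Dec  2 10:00:00 2024
+    # Comment lines beginning with '#' (the header and trailing metadata) are skipped.
+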
+    @staticmethod
+    def _iter_file_metadata(file_path):
+        """
+        Reads a tsv and returns an iterable that yields one row of file
+        metadata at a time, excluding comments.
+        """
+        file_path = Path(file_path)
+        with file_path.open(mode="r", newline="", encoding="utf-8") as file:
+            reader = csv.reader(file, delimiter="\t")
+            for row in reader:
+                # Skip empty lines or lines starting with '#'
+                if not row or row[0].startswith("#"):
+                    continue
+                yield row
+
+    @property
+    def summary(self):
+        return self[self.root]
+
+    def __repr__(self):
+        """Cleaner representation for debugging."""
+        return "\n".join([f"{path}: {dict(counts)}" for path, counts in self.items()])
+
+
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    pattern = f"{INPUT_DIR}/*-index.tsv"
+    outfile_path = Path(OUTPUT_DIR, TOTALS_OUTPUT_FILE)
+    output_stats = {}
+    for user_index_path in glob.iglob(pattern):
+        filename = os.path.basename(user_index_path)
+        username = filename.removesuffix("-index.tsv")
+        print(f"Starting {username}")
+        full_stats = DirectoryStats.from_index(username, user_index_path)
+        output_stats[username] = full_stats.summary
+
+    with outfile_path.open(mode="w", encoding="utf-8") as totals_file:
+        json.dump(output_stats, totals_file, indent=2)
+
+    print(f"Success: report written to {outfile_path}")
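+
+# The totals file written by main() maps each username to its summary counters,
+# e.g. (illustrative values only):
+#   {"alice": {"total_size": 123456, "file_count": 42, "nwb_files": 3, ...}}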
("a/.cache/nested/y", 3456, "2024-12-01", "2024-12-02"), + ("a/b/notcache", 3456, "2024-12-01", "2024-12-02"), + ] + stats = DirectoryStats.from_data("a", sample_data) + self.assertEqual(stats["a"]["user_cache_file_count"], 3) + self.assertEqual(stats["a"]["user_cache_size"], 3456 * 3) + self.assertEqual(stats["a/.cache"]["user_cache_file_count"], 3) + self.assertEqual(stats["a/.cache"]["user_cache_size"], 3456 * 3) + self.assertEqual(stats["a/b"]["user_cache_file_count"], 0) + self.assertEqual(stats["a/b"]["user_cache_size"], 0) + + def test_generate_statistics(self): + sample_data = [ + ("a/b/file3.txt", 3456, "2024-12-01", "2024-12-02"), + ("a/b/c/file1.txt", 1234, "2024-12-01", "2024-12-02"), + ("a/b/c/file2.txt", 2345, "2024-12-01", "2024-12-02"), + ("a/b/c/d/file4.txt", 4567, "2024-12-01", "2024-12-02"), + ("a/e/file3.txt", 5678, "2024-12-01", "2024-12-02"), + ("a/e/f/file1.txt", 6789, "2024-12-01", "2024-12-02"), + ("a/e/f/file2.txt", 7890, "2024-12-01", "2024-12-02"), + ("a/e/f/g/file4.txt", 8901, "2024-12-01", "2024-12-02"), + ] + stats = DirectoryStats.from_data("a", sample_data) + self.assertEqual(stats["a/b/c/d"]["file_count"], 1) + self.assertEqual(stats["a/b/c"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 4) + self.assertEqual(stats["a/e/f/g"]["file_count"], 1) + self.assertEqual(stats["a/e/f"]["file_count"], 3) + self.assertEqual(stats["a/e"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 8) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "test": + unittest.main( + argv=sys.argv[:1] + ) # Run tests if "test" is provided as an argument + else: + main() diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh new file mode 100755 index 00000000..326c0029 --- /dev/null +++ b/.github/scripts/cleanup-ec2.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -eu + +# Load environment variables from the file if they are not already set +ENV_FILE=".ec2-session.env" +if [ -f "$ENV_FILE" ]; then + echo "Loading environment variables from $ENV_FILE..." + source "$ENV_FILE" +else + echo "Warning: Environment file $ENV_FILE not found." +fi + +# Ensure required environment variables are set +if [ -z "$INSTANCE_ID" ]; then + echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +if [ -z "$ALLOC_ID" ]; then + echo "Error: ALLOC_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Terminate EC2 instance +echo "Terminating EC2 instance with ID: $INSTANCE_ID..." +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --no-cli-pager; then + echo "Instance termination initiated. Waiting for the instance to terminate..." + if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then + echo "Instance $INSTANCE_ID has been successfully terminated." + else + echo "Warning: Instance $INSTANCE_ID may not have terminated correctly." + fi +else + echo "Warning: Failed to terminate instance $INSTANCE_ID. It may already be terminated." +fi + +# Release Elastic IP +echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..." 
+
+# Release Elastic IP
+echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..."
+if aws ec2 release-address --allocation-id "$ALLOC_ID"; then
+    echo "Elastic IP with Allocation ID $ALLOC_ID has been successfully released."
+else
+    echo "Warning: Failed to release Elastic IP with Allocation ID $ALLOC_ID. It may already be released."
+fi
+
+# Cleanup environment file
+if [ -f "$ENV_FILE" ]; then
+    echo "Removing environment file $ENV_FILE..."
+    rm -f "$ENV_FILE"
+fi
+
+echo "Cleanup complete."
diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py
new file mode 100755
index 00000000..4410db2a
--- /dev/null
+++ b/.github/scripts/create-file-index.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+import os
+import csv
+import time
+import sys
+from datetime import datetime
+from pathlib import Path
+
+OUTPUT_DIR = "/home/ec2-user/hub-user-indexes"
+
+
+class MetadataWriter:
+    def __init__(self, output_path, error_path):
+        self.output_path = Path(output_path)
+        self.error_path = Path(error_path)
+        self.start_time = None
+        self.end_time = None
+        self.meta = {
+            "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "duration": None,
+            "total_files": 0,
+            "total_size": 0,
+        }
+        self.outfile = None
+        self.errfile = None
+        self.outwriter = None
+        self.errwriter = None
+
+    def start(self):
+        """Initialize the metadata and open the files for writing."""
+        self.start_time = time.time()
+        self.outfile = self.output_path.open(mode="w", newline="", encoding="utf-8")
+        self.errfile = self.error_path.open(mode="w", newline="", encoding="utf-8")
+        self.outwriter = csv.writer(self.outfile, delimiter="\t")
+        # Header matches the columns written by write_row()
+        self.outwriter.writerow(["#file_name", "file_size", "created", "modified"])
+        self.errwriter = csv.writer(self.errfile, delimiter="\t")
+
+    def write_row(self, file_name, file_size, created, modified, error):
+        """Write data for a file."""
+        if not (self.outwriter and self.errwriter):
+            raise RuntimeError("Writers not initialized.")
+        if error is not None:
+            self.errwriter.writerow([file_name, error])
+        else:
+            self.outwriter.writerow([file_name, file_size, created, modified])
+            self.meta["total_size"] += file_size
+
+        self.meta["total_files"] += 1
+
+    def finish(self):
+        """Finalize metadata, write it to the file, and close the files."""
+        if not (self.outwriter and self.errwriter):
+            raise RuntimeError("Writers not initialized.")
+        self.end_time = time.time()
+        self.meta["duration"] = self.end_time - self.start_time
+
+        self.outfile.write("\n# Execution Metadata\n")
+        for key, value in self.meta.items():
+            self.outfile.write(f"# {key}: {value}\n")
+
+        self.outfile.close()
+        self.errfile.close()
+        print(
+            f"Directory {self.output_path} complete, "
+            f"Duration: {self.meta['duration']:.2f}s, "
+            f"Total Files: {self.meta['total_files']}, "
+            f"Total Size: {self.meta['total_size']}"
+        )
+
+    def get_meta(self):
+        """Return the meta-metadata dictionary."""
+        return self.meta
+
+
+def directory_index(directory):
+    for root, dirs, files in os.walk(directory):
+        for name in files:
+            filepath = os.path.join(root, name)
+            try:
+                stat_result = os.stat(filepath, follow_symlinks=False)
+            except Exception as e:
+                size = modified = created = None
+                error = str(e)
+            else:
+                size = stat_result.st_size
+                modified = time.ctime(stat_result.st_mtime)
+                created = time.ctime(stat_result.st_ctime)
+                error = None
+            yield filepath, size, modified, created, error
+
+
+# Ensure the script is called with the required arguments
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <directory>")
+        sys.exit(1)
+
+    # We assume this directory is a user homedir
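+    # Illustrative invocation (hypothetical paths): running
+    #   ./create-file-index.py /mnt/efs/home/alice
+    # writes /home/ec2-user/hub-user-indexes/alice-index.tsv
+    # and /home/ec2-user/hub-user-indexes/alice-errors.tsv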
+    path_to_index = sys.argv[1]
+    # Use the final path component as the username (robust to a trailing slash)
+    username = os.path.basename(os.path.normpath(path_to_index))
+
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    output_file = f"{OUTPUT_DIR}/{username}-index.tsv"
+    error_file = f"{OUTPUT_DIR}/{username}-errors.tsv"
+
+    file_index = MetadataWriter(output_file, error_file)
+    file_index.start()
+
+    for filename, size, modified, created, error in directory_index(path_to_index):
+        relative_filename = f"{username}/{os.path.relpath(filename, path_to_index)}"
+        file_index.write_row(relative_filename, size, created, modified, error)
+
+    file_index.finish()
diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh
new file mode 100755
index 00000000..0f8615fa
--- /dev/null
+++ b/.github/scripts/launch-ec2.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+
+set -eu
+
+# Check for AWS CLI and credentials
+if ! command -v aws &>/dev/null; then
+    echo "Error: AWS CLI is not installed. Please install it and configure your credentials."
+    exit 1
+fi
+
+if ! aws sts get-caller-identity &>/dev/null; then
+    echo "Error: Unable to access AWS. Ensure your credentials are configured correctly."
+    exit 1
+fi
+
+# Set variables
+AWS_REGION="us-east-2"
+# TODO: document that this key needs to be created
+KEY_NAME="dandihub-gh-actions"
+# TODO: create if it does not exist
+# Allow gh-actions to ssh into the EC2 job instance from anywhere
+SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e"
+# TODO: retrieve the subnet ID (public, created by dandi-hub eks-dandihub-public-us-east-2a)
+SUBNET_ID="subnet-0f544cca61ccd2804"
+AMI_ID="ami-0c80e2b6ccb9ad6d1"
+EFS_ID="fs-02aac16c4c6c2dc27"
+LOCAL_SCRIPTS_DIR=".github/scripts"
+REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts"
+MOUNT_POINT="/mnt/efs"
+ENV_FILE=".ec2-session.env"
+
+# Ensure the environment file is writable
+echo "# Environment variables for EC2 session" > "$ENV_FILE"
+echo "# Auto-generated by launch script on $(date)" >> "$ENV_FILE"
+
+# Run EC2 instance
+echo "Launching EC2 instance..."
+export INSTANCE_ID=$(aws ec2 run-instances \
+    --image-id "$AMI_ID" \
+    --count 1 \
+    --instance-type t3.micro \
+    --key-name "$KEY_NAME" \
+    --security-group-ids "$SECURITY_GROUP_ID" \
+    --subnet-id "$SUBNET_ID" \
+    --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \
+    --query 'Instances[0].InstanceId' \
+    --output text)
+
+if [ -z "$INSTANCE_ID" ]; then
+    echo "Error: Failed to launch EC2 instance."
+    exit 1
+fi
+echo "Instance ID: $INSTANCE_ID"
+echo "export INSTANCE_ID=$INSTANCE_ID" >> "$ENV_FILE"
+
+# Wait for instance to initialize
+echo "Waiting for instance to reach status OK..."
+aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"
+
+# Allocate Elastic IP
+echo "Allocating Elastic IP..."
+export ALLOC_ID=$(aws ec2 allocate-address \
+    --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=dandihub-gh-actions-eip}]" \
+    --query 'AllocationId' \
+    --output text)
+
+if [ -z "$ALLOC_ID" ]; then
+    echo "Error: Failed to allocate Elastic IP."
+    exit 1
+fi
+echo "Elastic IP Allocation ID: $ALLOC_ID"
+echo "export ALLOC_ID=$ALLOC_ID" >> "$ENV_FILE"
+
+# Associate Elastic IP with instance
+echo "Associating Elastic IP with instance..."
+export EIP_ASSOC=$(aws ec2 associate-address \
+    --instance-id "$INSTANCE_ID" \
+    --allocation-id "$ALLOC_ID" \
+    --query 'AssociationId' \
+    --output text)
+
+if [ -z "$EIP_ASSOC" ]; then
+    echo "Error: Failed to associate Elastic IP."
+    exit 1
+fi
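+
+# Optional sanity check (illustrative): confirm the association took effect
+#   aws ec2 describe-addresses --allocation-ids "$ALLOC_ID" \
+#     --query 'Addresses[0].InstanceId' --output text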
+
+# Get Elastic IP address
+export PUBLIC_IP=$(aws ec2 describe-addresses \
+    --allocation-ids "$ALLOC_ID" \
+    --query 'Addresses[0].PublicIp' \
+    --output text)
+
+echo "Elastic IP Address: $PUBLIC_IP"
+echo "export PUBLIC_IP=$PUBLIC_IP" >> "$ENV_FILE"
+
+# Upload scripts to EC2 instance
+# (test the scp directly; under `set -e` a separate `$?` check is never reached)
+echo "Uploading scripts to EC2 instance..."
+if scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \
+    "$LOCAL_SCRIPTS_DIR/calculate-directory-stats.py" "$LOCAL_SCRIPTS_DIR/create-file-index.py" \
+    ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/"; then
+    echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance."
+else
+    echo "Error: Failed to upload scripts to the instance."
+    exit 1
+fi
+
+# TODO: automate
+# The eks-dandihub-efs sg is created by the dandi-hub install.
+# That sg needs to accept incoming 2049 from the sg created for this EC2 instance.
+# sg-061d875722e569724 - eks-dandihub-efs
+# aws ec2 authorize-security-group-ingress \
+#   --group-id sg-061d875722e569724 \
+#   --protocol tcp \
+#   --port 2049 \
+#   --source-group $SECURITY_GROUP_ID
+
+echo "Installing dependencies..."
+ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \
+    "sudo yum install -y amazon-efs-utils pip parallel && \
+     pip install con-duct"
+
+echo "Mounting EFS on the EC2 instance..."
+ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \
+    "sudo mkdir -p $MOUNT_POINT && \
+     sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \
+     echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \
+     echo 'EFS mounted at $MOUNT_POINT'"
+
+# Output SSH command for convenience
+echo "To connect to your instance, use:"
+echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP"
+
+echo "Environment variables saved to $ENV_FILE."
+echo "Run 'source $ENV_FILE' to restore the environment variables."
diff --git a/README.md b/README.md
index 5af6edf5..fd1c8bc4 100644
--- a/README.md
+++ b/README.md
@@ -397,3 +397,53 @@ Notable objects:
   When Jupyterhub user pods are scheduled and sufficient Nodes are not available, Karpenter creates a NodeClaim and then interacts with AWS to spin up machines.
   - `nodeclaims`: Create a node from one of the Karpenter Nodepools. (This is where spot/on-demand is configured for user-pods).
+
+## Monitoring Disk Usage
+
+DANDI Hub provides persistent storage to each user, but over time the stored data can become expensive.
+
+To run a job that gathers disk usage per user, start by configuring the `aws` CLI (make sure the `AWS_PROFILE` env var is set).
+You will also need to set `EC2_SSH_KEY` to the location of the PEM file for the dandihub-gh-actions keypair (see asmacdo).
+
+Launch an EC2 instance with the appropriate tools and access:
+
+```sh
+./.github/scripts/launch-ec2.sh
+```
+
+NOTE: If this does not succeed, the security group may have changed, and the extra rules necessary for this instance will need to be put back into place.
+  On the SG for `eks-dandihub-efs`, add an inbound rule for NFS pointing to the SG of the EC2 instance.
+
+When the script completes, it will print instructions to ssh into the instance.
+
+Once logged into the instance, it is recommended to start a screen session:
+
+```sh
+screen -S create-file-index
+```
+
+Next, navigate to the EFS directory that contains the user homedirs and create a file index for each user:
+
+```sh
+cd /mnt/efs/home/
+parallel -j 8 ~/scripts/create-file-index.py ::: *
+```
+
+Once finished, navigate to the output location of the file index script and generate the totals:
+
+```sh
+cd /home/ec2-user/hub-user-indexes
+~/scripts/calculate-directory-stats.py
+```
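+
+For a quick look at the heaviest users, a `jq` query like the following can sort the totals (assuming `jq` is available; illustrative only):
+
+```sh
+jq -r 'to_entries | sort_by(.value.total_size) | reverse | .[0:10][] | "\(.key)\t\(.value.total_size)"' all_users_total.json
+```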
+
+Log out of the EC2 instance, and pull the totals locally:
+
+```sh
+scp -i "$EC2_SSH_KEY" ec2-user@"$PUBLIC_IP":/home/ec2-user/hub-user-reports/all_users_total.json .
+```
+
+Finally, remove the EC2 instance:
+
+```sh
+./.github/scripts/cleanup-ec2.sh
+```
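+
+To confirm nothing was left running, a query like this (illustrative) should come back empty:
+
+```sh
+aws ec2 describe-instances \
+  --filters "Name=tag:Name,Values=dandihub-gh-actions" "Name=instance-state-name,Values=pending,running" \
+  --query 'Reservations[].Instances[].InstanceId' --output text
+```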