diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json
index e0a6cafb..35103551 100644
--- a/.aws/terraform-jupyterhub-provisioning-policies.json
+++ b/.aws/terraform-jupyterhub-provisioning-policies.json
@@ -4,69 +4,7 @@
     {
       "Effect": "Allow",
       "Action": [
-        "ec2:AllocateAddress",
-        "ec2:AssociateAddress",
-        "ec2:AssociateRouteTable",
-        "ec2:AssociateVpcCidrBlock",
-        "ec2:AttachInternetGateway",
-        "ec2:AttachNetworkInterface",
-        "ec2:AuthorizeSecurityGroupEgress",
-        "ec2:AuthorizeSecurityGroupIngress",
-        "ec2:CreateInternetGateway",
-        "ec2:CreateLaunchTemplate",
-        "ec2:CreateLaunchTemplateVersion",
-        "ec2:CreateNatGateway",
-        "ec2:CreateNetworkAcl",
-        "ec2:CreateNetworkAclEntry",
-        "ec2:CreateNetworkInterface",
-        "ec2:CreateNetworkInterfacePermission",
-        "ec2:CreateRoute",
-        "ec2:CreateRouteTable",
-        "ec2:CreateSecurityGroup",
-        "ec2:CreateSubnet",
-        "ec2:CreateTags",
-        "ec2:CreateVpc",
-        "ec2:DeleteInternetGateway",
-        "ec2:DeleteLaunchTemplate",
-        "ec2:DeleteLaunchTemplateVersions",
-        "ec2:DeleteNatGateway",
-        "ec2:DeleteNetworkAcl",
-        "ec2:DeleteNetworkAclEntry",
-        "ec2:DeleteNetworkInterface",
-        "ec2:DeleteRoute",
-        "ec2:DeleteRouteTable",
-        "ec2:DeleteSecurityGroup",
-        "ec2:DeleteSubnet",
-        "ec2:DeleteTags",
-        "ec2:DeleteVpc",
-        "ec2:DescribeAddresses",
-        "ec2:DescribeAddressesAttribute",
-        "ec2:DescribeAvailabilityZones",
-        "ec2:DescribeInternetGateways",
-        "ec2:DescribeLaunchTemplateVersions",
-        "ec2:DescribeLaunchTemplates",
-        "ec2:DescribeNatGateways",
-        "ec2:DescribeNetworkAcls",
-        "ec2:DescribeNetworkInterfacePermissions",
-        "ec2:DescribeNetworkInterfaces",
-        "ec2:DescribeRouteTables",
-        "ec2:DescribeSecurityGroupRules",
-        "ec2:DescribeSecurityGroups",
-        "ec2:DescribeSubnets",
-        "ec2:DescribeVpcAttribute",
-        "ec2:DescribeVpcs",
-        "ec2:DetachInternetGateway",
-        "ec2:DetachNetworkInterface",
-        "ec2:DisassociateAddress",
-        "ec2:DisassociateRouteTable",
-        "ec2:DisassociateVpcCidrBlock",
-        "ec2:ModifyNetworkInterfaceAttribute",
-        "ec2:ModifyVpcAttribute",
-        "ec2:ReleaseAddress",
-        "ec2:ReplaceRoute",
-        "ec2:RevokeSecurityGroupEgress",
-        "ec2:RevokeSecurityGroupIngress",
-        "ec2:RunInstances",
+        "ec2:*",
         "ecr-public:GetAuthorizationToken",
         "eks:*",
         "elasticfilesystem:CreateFileSystem",
diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
new file mode 100755
index 00000000..63cef5b1
--- /dev/null
+++ b/.github/scripts/calculate-directory-stats.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+import csv
+import json
+import sys
+import unittest
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Iterable, Tuple
+
+TOTALS_OUTPUT_FILE = "all_users_total.json"
+# Paths match create-file-index.py and the README workflow below
+OUTPUT_DIR = "/home/ec2-user/hub-user-reports/"
+INPUT_DIR = "/home/ec2-user/hub-user-indexes"
+
+
+csv.field_size_limit(sys.maxsize)
+
+
+class DirectoryStats(defaultdict):
+    COUNTED_FIELDS = [
+        "total_size",
+        "file_count",
+        "nwb_files",
+        "nwb_size",
+        "bids_datasets",
+        "zarr_files",
+        "zarr_size",
+        "user_cache_file_count",
+        "user_cache_size",
+    ]
+    root: str
+
+    def __init__(self, root):
+        super().__init__(lambda: Counter({key: 0 for key in self.COUNTED_FIELDS}))
+        self.root = root
+
+    def increment(self, path: str, field: str, amount: int = 1):
+        if field not in self.COUNTED_FIELDS:
+            raise KeyError(
+                f"Invalid field '{field}'. Allowed fields: {self.COUNTED_FIELDS}"
+            )
+        self[path][field] += amount
+
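+    # Illustrative usage (hypothetical values, not part of the original script):
+    #   stats = DirectoryStats(root="a")
+    #   stats.increment("a/b", "file_count")       # stats["a/b"]["file_count"] == 1
+    #   stats.increment("a/b", "total_size", 512)  # stats["a/b"]["total_size"] == 512
+    # Incrementing touches only that path's Counter; counts roll up to ancestor
+    # directories later via propagate_dir().
+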
+    def propagate_dir(self, current_parent: str, previous_parent: str):
+        """Propagate counts up the directory tree."""
+        assert os.path.isabs(current_parent) == os.path.isabs(
+            previous_parent
+        ), "Both must be absolute or both relative"
+
+        highest_common = os.path.commonpath([current_parent, previous_parent])
+        assert highest_common, "highest_common must either be a target directory or /"
+
+        path_to_propagate = os.path.relpath(previous_parent, highest_common)
+        nested_dir_list = path_to_propagate.split(os.sep)[:-1]  # Exclude last directory
+
+        while nested_dir_list:
+            working_dir = os.path.join(highest_common, *nested_dir_list)
+            for field in self.COUNTED_FIELDS:
+                self[working_dir][field] += self[previous_parent][field]
+            nested_dir_list.pop()
+            previous_parent = working_dir
+
+        # Final propagation to the common root
+        for field in self.COUNTED_FIELDS:
+            self[highest_common][field] += self[previous_parent][field]
+
+    def inc_if_bids(self, parent: str, path: str):
+        """Check if a file indicates a BIDS dataset and increment the count."""
+        if path.endswith("dataset_description.json"):
+            self.increment(parent, "bids_datasets")
+
+    def inc_if_usercache(self, parent: str, filepath: str, size: int):
+        if filepath.startswith(f"{self.root}/.cache"):
+            self.increment(parent, "user_cache_file_count")
+            self.increment(parent, "user_cache_size", size)
+
+    def inc_if_nwb(self, parent: str, path: str, size: int):
+        if path.lower().endswith(".nwb"):
+            self.increment(parent, "nwb_files")
+            self.increment(parent, "nwb_size", size)
+
+    def inc_if_zarr(self, parent: str, path: str, size: int):
+        if path.lower().endswith(".zarr"):
+            self.increment(parent, "zarr_files")
+            self.increment(parent, "zarr_size", size)
+
+    @classmethod
+    def from_index(cls, username, user_tsv_file):
+        """Separated from from_data for easier testing."""
+        data = cls._iter_file_metadata(user_tsv_file)
+        return cls.from_data(username, data)
+
+    @classmethod
+    def from_data(cls, root, data: Iterable[Tuple[str, str, str, str]]):
+        """
+        Build DirectoryStats from an iterable of (filepath, size, created, modified).
+        Assumes a depth-first listing.
+        """
+        instance = cls(root=root)
+        previous_parent = ""
+
+        for filepath, size, _, _ in data:
+            parent = os.path.dirname(filepath)
+
+            instance.increment(parent, "file_count")
+            instance.increment(parent, "total_size", int(size))
+            instance.inc_if_bids(parent, filepath)
+            instance.inc_if_nwb(parent, filepath, int(size))
+            instance.inc_if_zarr(parent, filepath, int(size))
+            instance.inc_if_usercache(parent, filepath, int(size))
+
+            if previous_parent == parent:
+                continue
+            # Going deeper
+            elif not previous_parent or os.path.dirname(parent) == previous_parent:
+                previous_parent = parent
+                continue
+            else:  # Done with this directory
+                instance.propagate_dir(parent, previous_parent)
+                previous_parent = parent
+
+        # Final propagation to ensure the root directory gets counts
+        if previous_parent:  # No previous_parent means no data
+            leading_dir = previous_parent.split(os.sep)[0] or "/"
+            instance.propagate_dir(leading_dir, previous_parent)
+
+        return instance
+
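+    # A minimal sketch of the expected input: _iter_file_metadata() below yields
+    # rows of an index TSV whose columns look like (illustrative values):
+    #   alice/data/scan.nwb<TAB>1234<TAB>Sun Dec  1 09:00:00 2024<TAB>Mon Dec  2 10:00:00 2024
+    # Comment lines beginning with '#' (the header and trailing metadata) are skipped.
+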
+    @staticmethod
+    def _iter_file_metadata(file_path):
+        """
+        Reads a tsv and returns an iterable that yields one row of file
+        metadata at a time, excluding comments.
+        """
+        file_path = Path(file_path)
+        with file_path.open(mode="r", newline="", encoding="utf-8") as file:
+            reader = csv.reader(file, delimiter="\t")
+            for row in reader:
+                # Skip empty lines or lines starting with '#'
+                if not row or row[0].startswith("#"):
+                    continue
+                yield row
+
+    @property
+    def summary(self):
+        return self[self.root]
+
+    def __repr__(self):
+        """Cleaner representation for debugging."""
+        return "\n".join([f"{path}: {dict(counts)}" for path, counts in self.items()])
+
+
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    pattern = f"{INPUT_DIR}/*-index.tsv"
+    outfile_path = Path(OUTPUT_DIR, TOTALS_OUTPUT_FILE)
+    output_stats = {}
+    for user_index_path in glob.iglob(pattern):
+        filename = os.path.basename(user_index_path)
+        username = filename.removesuffix("-index.tsv")
+        print(f"Starting {username}")
+        full_stats = DirectoryStats.from_index(username, user_index_path)
+        output_stats[username] = full_stats.summary
+
+    with outfile_path.open(mode="w", encoding="utf-8") as totals_file:
+        json.dump(output_stats, totals_file, indent=2)
+
+    print(f"Success: report written to {outfile_path}")
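+
+# The totals file written by main() maps each username to its summary counters,
+# e.g. (illustrative values only):
+#   {"alice": {"total_size": 123456, "file_count": 42, "nwb_files": 3, ...}}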
("a/.cache/nested/y", 3456, "2024-12-01", "2024-12-02"), + ("a/b/notcache", 3456, "2024-12-01", "2024-12-02"), + ] + stats = DirectoryStats.from_data("a", sample_data) + self.assertEqual(stats["a"]["user_cache_file_count"], 3) + self.assertEqual(stats["a"]["user_cache_size"], 3456 * 3) + self.assertEqual(stats["a/.cache"]["user_cache_file_count"], 3) + self.assertEqual(stats["a/.cache"]["user_cache_size"], 3456 * 3) + self.assertEqual(stats["a/b"]["user_cache_file_count"], 0) + self.assertEqual(stats["a/b"]["user_cache_size"], 0) + + def test_generate_statistics(self): + sample_data = [ + ("a/b/file3.txt", 3456, "2024-12-01", "2024-12-02"), + ("a/b/c/file1.txt", 1234, "2024-12-01", "2024-12-02"), + ("a/b/c/file2.txt", 2345, "2024-12-01", "2024-12-02"), + ("a/b/c/d/file4.txt", 4567, "2024-12-01", "2024-12-02"), + ("a/e/file3.txt", 5678, "2024-12-01", "2024-12-02"), + ("a/e/f/file1.txt", 6789, "2024-12-01", "2024-12-02"), + ("a/e/f/file2.txt", 7890, "2024-12-01", "2024-12-02"), + ("a/e/f/g/file4.txt", 8901, "2024-12-01", "2024-12-02"), + ] + stats = DirectoryStats.from_data("a", sample_data) + self.assertEqual(stats["a/b/c/d"]["file_count"], 1) + self.assertEqual(stats["a/b/c"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 4) + self.assertEqual(stats["a/e/f/g"]["file_count"], 1) + self.assertEqual(stats["a/e/f"]["file_count"], 3) + self.assertEqual(stats["a/e"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 8) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "test": + unittest.main( + argv=sys.argv[:1] + ) # Run tests if "test" is provided as an argument + else: + main() diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh new file mode 100755 index 00000000..326c0029 --- /dev/null +++ b/.github/scripts/cleanup-ec2.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -eu + +# Load environment variables from the file if they are not already set +ENV_FILE=".ec2-session.env" +if [ -f "$ENV_FILE" ]; then + echo "Loading environment variables from $ENV_FILE..." + source "$ENV_FILE" +else + echo "Warning: Environment file $ENV_FILE not found." +fi + +# Ensure required environment variables are set +if [ -z "$INSTANCE_ID" ]; then + echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +if [ -z "$ALLOC_ID" ]; then + echo "Error: ALLOC_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Terminate EC2 instance +echo "Terminating EC2 instance with ID: $INSTANCE_ID..." +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --no-cli-pager; then + echo "Instance termination initiated. Waiting for the instance to terminate..." + if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then + echo "Instance $INSTANCE_ID has been successfully terminated." + else + echo "Warning: Instance $INSTANCE_ID may not have terminated correctly." + fi +else + echo "Warning: Failed to terminate instance $INSTANCE_ID. It may already be terminated." +fi + +# Release Elastic IP +echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..." 
+
+# Release Elastic IP
+echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..."
+if aws ec2 release-address --allocation-id "$ALLOC_ID"; then
+    echo "Elastic IP with Allocation ID $ALLOC_ID has been successfully released."
+else
+    echo "Warning: Failed to release Elastic IP with Allocation ID $ALLOC_ID. It may already be released."
+fi
+
+# Cleanup environment file
+if [ -f "$ENV_FILE" ]; then
+    echo "Removing environment file $ENV_FILE..."
+    rm -f "$ENV_FILE"
+fi
+
+echo "Cleanup complete."
diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py
new file mode 100755
index 00000000..4410db2a
--- /dev/null
+++ b/.github/scripts/create-file-index.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+import os
+import csv
+import time
+import sys
+from datetime import datetime
+from pathlib import Path
+
+OUTPUT_DIR = "/home/ec2-user/hub-user-indexes"
+
+
+class MetadataWriter:
+    def __init__(self, output_path, error_path):
+        self.output_path = Path(output_path)
+        self.error_path = Path(error_path)
+        self.start_time = None
+        self.end_time = None
+        self.meta = {
+            "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "duration": None,
+            "total_files": 0,
+            "total_size": 0,
+        }
+        self.outfile = None
+        self.errfile = None
+        self.outwriter = None
+        self.errwriter = None
+
+    def start(self):
+        """Initialize the metadata and open the files for writing."""
+        self.start_time = time.time()
+        self.outfile = self.output_path.open(mode="w", newline="", encoding="utf-8")
+        self.errfile = self.error_path.open(mode="w", newline="", encoding="utf-8")
+        self.outwriter = csv.writer(self.outfile, delimiter="\t")
+        # Header matches the columns written by write_row()
+        self.outwriter.writerow(["#file_name", "file_size", "created", "modified"])
+        self.errwriter = csv.writer(self.errfile, delimiter="\t")
+
+    def write_row(self, file_name, file_size, created, modified, error):
+        """Write data for a file."""
+        if not (self.outwriter and self.errwriter):
+            raise RuntimeError("Writers not initialized.")
+        if error is not None:
+            self.errwriter.writerow([file_name, error])
+        else:
+            self.outwriter.writerow([file_name, file_size, created, modified])
+            self.meta["total_size"] += file_size
+
+        self.meta["total_files"] += 1
+
+    def finish(self):
+        """Finalize metadata, write it to the file, and close the files."""
+        if not (self.outwriter and self.errwriter):
+            raise RuntimeError("Writers not initialized.")
+        self.end_time = time.time()
+        self.meta["duration"] = self.end_time - self.start_time
+
+        self.outfile.write("\n# Execution Metadata\n")
+        for key, value in self.meta.items():
+            self.outfile.write(f"# {key}: {value}\n")
+
+        self.outfile.close()
+        self.errfile.close()
+        print(
+            f"Directory {self.output_path} complete, "
+            f"Duration: {self.meta['duration']:.2f}s, "
+            f"Total Files: {self.meta['total_files']}, "
+            f"Total Size: {self.meta['total_size']}"
+        )
+
+    def get_meta(self):
+        """Return the meta-metadata dictionary."""
+        return self.meta
+
+
+def directory_index(directory):
+    for root, dirs, files in os.walk(directory):
+        for name in files:
+            filepath = os.path.join(root, name)
+            try:
+                stat_result = os.stat(filepath, follow_symlinks=False)
+            except Exception as e:
+                size = modified = created = None
+                error = str(e)
+            else:
+                size = stat_result.st_size
+                modified = time.ctime(stat_result.st_mtime)
+                created = time.ctime(stat_result.st_ctime)
+                error = None
+            yield filepath, size, modified, created, error
+
+
+# Ensure the script is called with the required arguments
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <directory>")
+        sys.exit(1)
+
+    # We assume this directory is a user homedir
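+    # Illustrative invocation (hypothetical paths): running
+    #   ./create-file-index.py /mnt/efs/home/alice
+    # writes /home/ec2-user/hub-user-indexes/alice-index.tsv
+    # and /home/ec2-user/hub-user-indexes/alice-errors.tsv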
+    path_to_index = sys.argv[1]
+    # Use the final path component as the username (robust to a trailing slash)
+    username = os.path.basename(os.path.normpath(path_to_index))
+
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    output_file = f"{OUTPUT_DIR}/{username}-index.tsv"
+    error_file = f"{OUTPUT_DIR}/{username}-errors.tsv"
+
+    file_index = MetadataWriter(output_file, error_file)
+    file_index.start()
+
+    for filename, size, modified, created, error in directory_index(path_to_index):
+        relative_filename = f"{username}/{os.path.relpath(filename, path_to_index)}"
+        file_index.write_row(relative_filename, size, created, modified, error)
+
+    file_index.finish()
diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh
new file mode 100755
index 00000000..0f8615fa
--- /dev/null
+++ b/.github/scripts/launch-ec2.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+
+set -eu
+
+# Check for AWS CLI and credentials
+if ! command -v aws &>/dev/null; then
+    echo "Error: AWS CLI is not installed. Please install it and configure your credentials."
+    exit 1
+fi
+
+if ! aws sts get-caller-identity &>/dev/null; then
+    echo "Error: Unable to access AWS. Ensure your credentials are configured correctly."
+    exit 1
+fi
+
+# Set variables
+AWS_REGION="us-east-2"
+# TODO: document that this key needs to be created
+KEY_NAME="dandihub-gh-actions"
+# TODO: create if it does not exist
+# Allow gh-actions to ssh into the EC2 job instance from anywhere
+SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e"
+# TODO: retrieve the subnet ID (public, created by dandi-hub eks-dandihub-public-us-east-2a)
+SUBNET_ID="subnet-0f544cca61ccd2804"
+AMI_ID="ami-0c80e2b6ccb9ad6d1"
+EFS_ID="fs-02aac16c4c6c2dc27"
+LOCAL_SCRIPTS_DIR=".github/scripts"
+REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts"
+MOUNT_POINT="/mnt/efs"
+ENV_FILE=".ec2-session.env"
+
+# Ensure the environment file is writable
+echo "# Environment variables for EC2 session" > "$ENV_FILE"
+echo "# Auto-generated by launch script on $(date)" >> "$ENV_FILE"
+
+# Run EC2 instance
+echo "Launching EC2 instance..."
+export INSTANCE_ID=$(aws ec2 run-instances \
+    --image-id "$AMI_ID" \
+    --count 1 \
+    --instance-type t3.micro \
+    --key-name "$KEY_NAME" \
+    --security-group-ids "$SECURITY_GROUP_ID" \
+    --subnet-id "$SUBNET_ID" \
+    --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \
+    --query 'Instances[0].InstanceId' \
+    --output text)
+
+if [ -z "$INSTANCE_ID" ]; then
+    echo "Error: Failed to launch EC2 instance."
+    exit 1
+fi
+echo "Instance ID: $INSTANCE_ID"
+echo "export INSTANCE_ID=$INSTANCE_ID" >> "$ENV_FILE"
+
+# Wait for instance to initialize
+echo "Waiting for instance to reach status OK..."
+aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"
+
+# Allocate Elastic IP
+echo "Allocating Elastic IP..."
+export ALLOC_ID=$(aws ec2 allocate-address \
+    --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=dandihub-gh-actions-eip}]" \
+    --query 'AllocationId' \
+    --output text)
+
+if [ -z "$ALLOC_ID" ]; then
+    echo "Error: Failed to allocate Elastic IP."
+    exit 1
+fi
+echo "Elastic IP Allocation ID: $ALLOC_ID"
+echo "export ALLOC_ID=$ALLOC_ID" >> "$ENV_FILE"
+
+# Associate Elastic IP with instance
+echo "Associating Elastic IP with instance..."
+export EIP_ASSOC=$(aws ec2 associate-address \
+    --instance-id "$INSTANCE_ID" \
+    --allocation-id "$ALLOC_ID" \
+    --query 'AssociationId' \
+    --output text)
+
+if [ -z "$EIP_ASSOC" ]; then
+    echo "Error: Failed to associate Elastic IP."
+    exit 1
+fi
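+
+# Optional sanity check (illustrative): confirm the association took effect
+#   aws ec2 describe-addresses --allocation-ids "$ALLOC_ID" \
+#     --query 'Addresses[0].InstanceId' --output text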
+
+# Get Elastic IP address
+export PUBLIC_IP=$(aws ec2 describe-addresses \
+    --allocation-ids "$ALLOC_ID" \
+    --query 'Addresses[0].PublicIp' \
+    --output text)
+
+echo "Elastic IP Address: $PUBLIC_IP"
+echo "export PUBLIC_IP=$PUBLIC_IP" >> "$ENV_FILE"
+
+# Upload scripts to EC2 instance
+# (test the scp directly; under `set -e` a separate `$?` check is never reached)
+echo "Uploading scripts to EC2 instance..."
+if scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \
+    "$LOCAL_SCRIPTS_DIR/calculate-directory-stats.py" "$LOCAL_SCRIPTS_DIR/create-file-index.py" \
+    ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/"; then
+    echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance."
+else
+    echo "Error: Failed to upload scripts to the instance."
+    exit 1
+fi
+
+# TODO: automate
+# The eks-dandihub-efs sg is created by the dandi-hub install.
+# That sg needs to accept incoming 2049 from the sg created for this EC2 instance.
+# sg-061d875722e569724 - eks-dandihub-efs
+# aws ec2 authorize-security-group-ingress \
+#   --group-id sg-061d875722e569724 \
+#   --protocol tcp \
+#   --port 2049 \
+#   --source-group $SECURITY_GROUP_ID
+
+echo "Installing dependencies..."
+ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \
+    "sudo yum install -y amazon-efs-utils pip parallel && \
+     pip install con-duct"
+
+echo "Mounting EFS on the EC2 instance..."
+ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \
+    "sudo mkdir -p $MOUNT_POINT && \
+     sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \
+     echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \
+     echo 'EFS mounted at $MOUNT_POINT'"
+
+# Output SSH command for convenience
+echo "To connect to your instance, use:"
+echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP"
+
+echo "Environment variables saved to $ENV_FILE."
+echo "Run 'source $ENV_FILE' to restore the environment variables."
diff --git a/README.md b/README.md
index 5af6edf5..fd1c8bc4 100644
--- a/README.md
+++ b/README.md
@@ -397,3 +397,53 @@ Notable objects:
   When Jupyterhub user pods are scheduled and sufficient Nodes are not available, Karpenter creates a NodeClaim and then interacts with AWS to spin up machines.
   - `nodeclaims`: Create a node from one of the Karpenter Nodepools. (This is where spot/on-demand is configured for user-pods).
+
+## Monitoring Disk Usage
+
+DANDI Hub provides persistent storage to each user, but over time the stored data can become expensive.
+
+To run a job that gathers disk usage per user, start by configuring the `aws` CLI (make sure the `AWS_PROFILE` env var is set).
+You will also need to set `EC2_SSH_KEY` to the location of the PEM file for the dandihub-gh-actions keypair (see asmacdo).
+
+Launch an EC2 instance with the appropriate tools and access:
+
+```sh
+./.github/scripts/launch-ec2.sh
+```
+
+NOTE: If this does not succeed, the security group may have changed, and the extra rules necessary for this instance will need to be put back into place.
+  On the SG for `eks-dandihub-efs`, add an inbound rule for NFS pointing to the SG of the EC2 instance.
+
+When the script completes, it will print instructions to ssh into the instance.
+
+Once logged into the instance, it is recommended to start a screen session:
+
+```sh
+screen -S create-file-index
+```
+
+Next, navigate to the EFS directory that contains the user homedirs and create a file index for each user:
+
+```sh
+cd /mnt/efs/home/
+parallel -j 8 ~/scripts/create-file-index.py ::: *
+```
+
+Once finished, navigate to the output location of the file index script and generate the totals:
+
+```sh
+cd /home/ec2-user/hub-user-indexes
+~/scripts/calculate-directory-stats.py
+```
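+
+For a quick look at the heaviest users, a `jq` query like the following can sort the totals (assuming `jq` is available; illustrative only):
+
+```sh
+jq -r 'to_entries | sort_by(.value.total_size) | reverse | .[0:10][] | "\(.key)\t\(.value.total_size)"' all_users_total.json
+```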
+
+Log out of the EC2 instance, and pull the totals locally:
+
+```sh
+scp -i "$EC2_SSH_KEY" ec2-user@"$PUBLIC_IP":/home/ec2-user/hub-user-reports/all_users_total.json .
+```
+
+Finally, remove the EC2 instance:
+
+```sh
+./.github/scripts/cleanup-ec2.sh
+```
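+
+To confirm nothing was left running, a query like this (illustrative) should come back empty:
+
+```sh
+aws ec2 describe-instances \
+  --filters "Name=tag:Name,Values=dandihub-gh-actions" "Name=instance-state-name,Values=pending,running" \
+  --query 'Reservations[].Instances[].InstanceId' --output text
+```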