diff --git a/infra/marin-big-run.yaml b/infra/marin-big-run.yaml index 8245a5d5a5..287961c983 100644 --- a/infra/marin-big-run.yaml +++ b/infra/marin-big-run.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-central2-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-central2 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-cluster-template.yaml b/infra/marin-cluster-template.yaml index 4d32bba7f8..c98c4e6a90 100644 --- a/infra/marin-cluster-template.yaml +++ b/infra/marin-cluster-template.yaml @@ -53,6 +53,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker {{REGION}}-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount {{BUCKET}} /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -83,7 +88,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-eu-west4-a.yaml b/infra/marin-eu-west4-a.yaml index dd59606db7..5e4e283d4f 100644 --- a/infra/marin-eu-west4-a.yaml +++ b/infra/marin-eu-west4-a.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker europe-west4-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-eu-west4 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-eu-west4.yaml b/infra/marin-eu-west4.yaml index 5c1a1471fc..eb8ec32dc6 100644 --- a/infra/marin-eu-west4.yaml +++ b/infra/marin-eu-west4.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker europe-west4-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-eu-west4 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-central1.yaml b/infra/marin-us-central1.yaml index 2c71b8fe8a..21defd1ad6 100644 --- a/infra/marin-us-central1.yaml +++ b/infra/marin-us-central1.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-central1-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-central1 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-central2-staging.yaml b/infra/marin-us-central2-staging.yaml index 267a47a7b9..917dfa71ce 100644 --- a/infra/marin-us-central2-staging.yaml +++ b/infra/marin-us-central2-staging.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-central2-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-central2 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-central2.yaml b/infra/marin-us-central2.yaml index 614375a938..74a1a6dc2e 100644 --- a/infra/marin-us-central2.yaml +++ b/infra/marin-us-central2.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-central2-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-central2 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-east1.yaml b/infra/marin-us-east1.yaml index 8d92057027..e97767c3b9 100644 --- a/infra/marin-us-east1.yaml +++ b/infra/marin-us-east1.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-east1-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-east1 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-east5-a.yaml b/infra/marin-us-east5-a.yaml index 7319dee422..6f219c371b 100644 --- a/infra/marin-us-east5-a.yaml +++ b/infra/marin-us-east5-a.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-east5-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-east5 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-east5.yaml b/infra/marin-us-east5.yaml index cafc9120fc..d2bbccafc7 100644 --- a/infra/marin-us-east5.yaml +++ b/infra/marin-us-east5.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-east5-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-east5 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/infra/marin-us-west4.yaml b/infra/marin-us-west4.yaml index f9f4e4361f..345a2238cb 100644 --- a/infra/marin-us-west4.yaml +++ b/infra/marin-us-west4.yaml @@ -57,6 +57,11 @@ docker: initialization_commands: - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f) - yes | gcloud auth configure-docker us-west4-docker.pkg.dev + - which gcsfuse || (sudo apt-get update && sudo apt-get install -y curl gnupg && sudo mkdir -p /etc/apt/keyrings && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /etc/apt/keyrings/cloud.google.gpg && echo "deb [signed-by=/etc/apt/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-$(. /etc/os-release && echo $VERSION_CODENAME) main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null && sudo apt-get update && sudo apt-get install -y gcsfuse) + - sudo sh -c "grep -q '^user_allow_other$' /etc/fuse.conf 2>/dev/null || echo user_allow_other >> /etc/fuse.conf" + - if [ -e /tmp/gcsfuse_mount ] && [ ! -d /tmp/gcsfuse_mount ]; then rm -f /tmp/gcsfuse_mount; fi + - mkdir -p /tmp/gcsfuse_mount + - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --dir-mode 0777 --file-mode 0666 -o allow_other --only-dir gcsfuse_mount marin-us-west4 /tmp/gcsfuse_mount || true # always run this because ray doesn't run with sudo - sudo usermod -aG docker $USER # we want to launch docker containers from inside docker, which means we need to loosen the permissions on the docker @@ -87,7 +92,9 @@ setup_commands: - echo 'export TPU_MIN_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_STDERR_LOG_LEVEL=3' >> $HOME/.bashrc - echo 'export TPU_LOG_DIR=disabled' >> $HOME/.bashrc - - gcsfuse --implicit-dirs --client-protocol grpc --cache-dir /dev/shm --file-cache-max-size-mb 160000 --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true + # gcsfuse runs on the host (mounted at /tmp/gcsfuse_mount); expose it at the conventional in-container path. + - if [ -e /opt/gcsfuse_mount ] && [ ! -L /opt/gcsfuse_mount ]; then sudo rm -rf /opt/gcsfuse_mount; fi + - sudo ln -sfn /tmp/gcsfuse_mount /opt/gcsfuse_mount worker_setup_commands: # delete any old ray session data diff --git a/scripts/ray/cluster.py b/scripts/ray/cluster.py index 7a453d8e30..fcbb95e407 100755 --- a/scripts/ray/cluster.py +++ b/scripts/ray/cluster.py @@ -452,11 +452,18 @@ class Context: config_obj: RayClusterConfig | None = None +def _maybe_add_ray_verbose(ctx: Context, cmd_args: list[str]) -> list[str]: + """Add `-v` to Ray CLI commands when cluster.py verbose mode is enabled.""" + if ctx.verbose: + return [*cmd_args[:2], "-v", *cmd_args[2:]] + return cmd_args + + # Context object to pass global options between commands @click.group() @click.option("--config", help="Path to Ray cluster config file (infra/marin-*.yaml)") @click.option("--cluster", help="Cluster name to connect to") -@click.option("--verbose", is_flag=True, help="Enable verbose logging") +@click.option("--verbose", is_flag=True, help="Enable verbose logging (also passes `-v` to Ray cluster commands).") @click.pass_context def cli(ctx, config, cluster, verbose): """Marin cluster management CLI.""" @@ -506,7 +513,7 @@ def start_cluster(ctx): print() print(f"Starting cluster {config_obj.cluster_name}...") - subprocess.run(["ray", "up", "-y", config_path], check=True) + subprocess.run(_maybe_add_ray_verbose(ctx.obj, ["ray", "up", "-y", config_path]), check=True) @cli.command("stop-cluster") @@ -518,11 +525,11 @@ def stop_cluster(ctx): print("Error: --config required for cluster commands", file=sys.stderr) sys.exit(1) - _stop_cluster_internal(config_obj, config_path) + _stop_cluster_internal(ctx.obj, config_obj, config_path) print("Cluster stopped successfully!") -def _stop_cluster_internal(config_obj: RayClusterConfig, config_path: str): +def _stop_cluster_internal(ctx: Context, config_obj: RayClusterConfig, config_path: str): """Terminate a Ray cluster. N.B. We terminate the Ray coordinator node first to avoid restarting any new TPUs while @@ -542,7 +549,10 @@ def _stop_cluster_internal(config_obj: RayClusterConfig, config_path: str): print(f"Terminated {len(terminated_tpus)} TPUs") print(f"Cleaning up Ray cluster state for {config_obj.cluster_name}...") - subprocess.run(["ray", "down", "-y", config_path], check=False) # check=False since instances may already be gone + subprocess.run( + _maybe_add_ray_verbose(ctx, ["ray", "down", "-y", config_path]), + check=False, # check=False since instances may already be gone + ) @cli.command("restart-cluster") @@ -579,10 +589,13 @@ def restart_cluster(ctx, preserve_jobs): print("Proceeding with cluster restart without job preservation.") print("Stopping cluster...") - _stop_cluster_internal(config_obj, config_path) + _stop_cluster_internal(ctx.obj, config_obj, config_path) print("Starting cluster...") - subprocess.run(["ray", "up", "-y", "--no-config-cache", config_path], check=True) + subprocess.run( + _maybe_add_ray_verbose(ctx.obj, ["ray", "up", "-y", "--no-config-cache", config_path]), + check=True, + ) if preserve_jobs: print("Restoring jobs...") @@ -668,7 +681,7 @@ def ssh_connect(ctx, target, project, zone, extra_args): @click.pass_context def ssh_head(ctx, extra_args): """SSH to cluster head node using ray attach.""" - cmd_args = ["ray", "attach", ctx.obj.config_file] + cmd_args = _maybe_add_ray_verbose(ctx.obj, ["ray", "attach", ctx.obj.config_file]) if extra_args: cmd_args.extend(["--", *extra_args]) subprocess.run(cmd_args, check=True) @@ -887,7 +900,10 @@ def auth(ctx, secret: str | None, copy: bool, open_browser: bool): def show_logs(ctx, tail): """View cluster logs.""" log_command = f"tail -n {tail} -f /tmp/ray/session_latest/logs/monitor*" - subprocess.run(["ray", "exec", ctx.obj.config_file, log_command], check=True) + subprocess.run( + _maybe_add_ray_verbose(ctx.obj, ["ray", "exec", ctx.obj.config_file, log_command]), + check=True, + ) def main():