From d099ba806ddeb555a46fcbdc9dd9ba0ad5c11bec Mon Sep 17 00:00:00 2001 From: Broderick Gardner Date: Fri, 18 Oct 2024 09:41:54 -0600 Subject: [PATCH 1/2] Create /.google_hpc_firstrun sentinel file This signals the "firstrun" systemd service to run for first-boot VM configuration. --- ansible/roles/common/tasks/main.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index a6412078..362f8442 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -47,3 +47,12 @@ template: src: ld.so.conf.d/usr-local.conf.j2 dest: /etc/ld.so.conf.d/usr-local.conf + +- name: Mark image for first boot + copy: + content: "" + dest: /.google_hpc_firstrun + force: false + group: root + owner: root + mode: 0660 From a7470bd3838bed0736130b756e3908c34d7f9285 Mon Sep 17 00:00:00 2001 From: Broderick Gardner Date: Mon, 21 Oct 2024 11:00:57 -0600 Subject: [PATCH 2/2] Update image references for Slurm-GCP 6.8.x --- docs/faq.md | 2 +- docs/images.md | 20 +++++++++---------- docs/tpu.md | 8 ++++---- .../modules/slurm_instance_template/main.tf | 2 +- .../modules/slurm_nodeset_tpu/README_TF.md | 2 +- .../modules/slurm_nodeset_tpu/main.tf | 2 +- .../modules/slurm_nodeset_tpu/variables.tf | 2 +- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 832cf466..abd62390 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -227,7 +227,7 @@ across all instances and allows easy user control with By default, the [slurm_cluster](../terraform/slurm_cluster/README.md) terraform module uses the latest Slurm image family (e.g. -`slurm-gcp-6-7-hpc-rocky-linux-8`). As new Slurm image families are released, +`slurm-gcp-6-8-hpc-rocky-linux-8`). As new Slurm image families are released, coenciding with periodic Slurm releases, the terraform module will be updated to track the newest image family by setting it as the new default. 
This update can be considered a breaking change. diff --git a/docs/images.md b/docs/images.md index 32975d7a..182f0635 100644 --- a/docs/images.md +++ b/docs/images.md @@ -72,21 +72,21 @@ For the [TPU](./glossary.md#tpu) nodes docker images are also released. | Project | Image Family | Arch | Status | | :------------------: | :---------------------------------- | :----- | :------------- | -| schedmd-slurm-public | slurm-gcp-6-7-debian-11 | x86_64 | Supported | -| schedmd-slurm-public | slurm-gcp-6-7-hpc-rocky-linux-8 | x86_64 | Supported | -| schedmd-slurm-public | slurm-gcp-6-7-ubuntu-2004-lts | x86_64 | Supported | -| schedmd-slurm-public | slurm-gcp-6-7-ubuntu-2204-lts-arm64 | ARM64 | Supported | +| schedmd-slurm-public | slurm-gcp-6-8-debian-11 | x86_64 | Supported | +| schedmd-slurm-public | slurm-gcp-6-8-hpc-rocky-linux-8 | x86_64 | Supported | +| schedmd-slurm-public | slurm-gcp-6-8-ubuntu-2004-lts | x86_64 | Supported | +| schedmd-slurm-public | slurm-gcp-6-8-ubuntu-2204-lts-arm64 | ARM64 | Supported | ### Published Docker Image Family | Project | Image Family | Status | | :------------------: | :-------------------------- | :-------- | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.12.1 | Supported | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.13.0 | Supported | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.13.1 | Supported | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.14.0 | Supported | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.14.1 | Supported | -| schedmd-slurm-public | tpu:slurm-gcp-6-7-tf-2.15.0 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.12.1 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.13.0 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.13.1 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.14.0 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.14.1 | Supported | +| schedmd-slurm-public | tpu:slurm-gcp-6-8-tf-2.15.0 | Supported | ## Custom Image diff 
--git a/docs/tpu.md b/docs/tpu.md index 43b528c6..ac91472b 100644 --- a/docs/tpu.md +++ b/docs/tpu.md @@ -72,10 +72,10 @@ state we will also include if it is tested or not. | Project | Image Family | Arch | TPU Status | | :------------------: | :---------------------------------- | :----- | :---------- | -| schedmd-slurm-public | slurm-gcp-6-7-debian-11 | x86_64 | Untested | -| schedmd-slurm-public | slurm-gcp-6-7-hpc-rocky-linux-8 | x86_64 | Tested | -| schedmd-slurm-public | slurm-gcp-6-7-ubuntu-2004-lts | x86_64 | Untested | -| schedmd-slurm-public | slurm-gcp-6-7-ubuntu-2204-lts-arm64 | ARM64 | Untested | +| schedmd-slurm-public | slurm-gcp-6-8-debian-11 | x86_64 | Untested | +| schedmd-slurm-public | slurm-gcp-6-8-hpc-rocky-linux-8 | x86_64 | Tested | +| schedmd-slurm-public | slurm-gcp-6-8-ubuntu-2004-lts | x86_64 | Untested | +| schedmd-slurm-public | slurm-gcp-6-8-ubuntu-2204-lts-arm64 | ARM64 | Untested | ## Terraform diff --git a/terraform/slurm_cluster/modules/slurm_instance_template/main.tf b/terraform/slurm_cluster/modules/slurm_instance_template/main.tf index c53edc79..501fe4c6 100644 --- a/terraform/slurm_cluster/modules/slurm_instance_template/main.tf +++ b/terraform/slurm_cluster/modules/slurm_instance_template/main.tf @@ -47,7 +47,7 @@ locals { source_image_family = ( var.source_image_family != "" && var.source_image_family != null ? var.source_image_family - : "slurm-gcp-6-7-hpc-rocky-linux-8" + : "slurm-gcp-6-8-hpc-rocky-linux-8" ) source_image_project = ( var.source_image_project != "" && var.source_image_project != null diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md index bff71dff..6c84e80d 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md @@ -47,7 +47,7 @@ No modules. 
|------|-------------|------|---------|:--------:| | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf- | `string` | `""` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-8:tf- | `string` | `""` | no | | [enable\_public\_ip](#input\_enable\_public\_ip) | Enables IP address to access the Internet. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of nodes allowed in this partition to be created dynamically. | `number` | `0` | no | diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf index de93e997..1bb0add7 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf @@ -68,7 +68,7 @@ locals { service_account = var.service_account != null ? var.service_account : local.service_account preserve_tpu = local.can_preempt ? var.preserve_tpu : false data_disks = var.data_disks - docker_image = var.docker_image != "" ? var.docker_image : "us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-7:tf-${var.tf_version}" + docker_image = var.docker_image != "" ? var.docker_image : "us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-8:tf-${var.tf_version}" subnetwork = local.snetwork network_storage = var.network_storage } diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf index 3fa3030a..a174f4af 100644 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf +++ b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf @@ -51,7 +51,7 @@ variable "accelerator_config" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-8:tf-" type = string default = "" }