diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 62cd667789..839fff3415 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,11 +196,11 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | reserve_ip_addresses | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | reserve_ip_addresses | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | reserve_ip_addresses | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | reserve_ip_addresses | | [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | | [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.0 | @@ -265,7 +265,7 @@ limitations under the License. | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork_self_link = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | | [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 2262acf718..918b5163d0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,7 +36,8 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + # source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=reserve_ip_addresses" project_id = var.project_id region = var.region @@ -76,7 +77,7 @@ module "slurm_controller_template" { source_image = local.source_image # requires source_image_logic.tf # spot = TODO: add support for spot (?) - subnetwork = var.subnetwork_self_link + subnetwork_self_link = var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) # termination_action = TODO: add support for termination_action (?) @@ -92,21 +93,22 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + # source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + source = "github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=reserve_ip_addresses" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false hostname = "${local.slurm_cluster_name}-controller" instance_template = module.slurm_controller_template.self_link - project_id = var.project_id - region = var.region - slurm_cluster_name = local.slurm_cluster_name - slurm_instance_role = "controller" - static_ips = var.static_ips - subnetwork = var.subnetwork_self_link - zone = var.zone - metadata = var.metadata + project_id = var.project_id + region = var.region + slurm_cluster_name = local.slurm_cluster_name + slurm_instance_role = "controller" + static_ips = var.static_ips + subnetwork_self_link = var.subnetwork_self_link + zone = var.zone + metadata = var.metadata labels = merge(local.labels, local.files_cs_labels) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 103c224046..0ac83b8b50 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,8 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + # source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=reserve_ip_addresses" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -50,14 +51,15 @@ module "slurm_login_template" { source_image_project = each.value.source_image_project source_image = each.value.source_image spot = each.value.spot - subnetwork = each.value.subnetwork + subnetwork_self_link = each.value.subnetwork_self_link tags = concat([local.slurm_cluster_name], each.value.tags) termination_action = each.value.termination_action } # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + # source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + source = "github.com/wiktorn/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=reserve_ip_addresses" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config @@ -72,10 +74,10 @@ module "slurm_login_instance" { labels = merge(each.value.labels, local.files_cs_labels) num_instances = each.value.num_instances - region = each.value.region - static_ips = each.value.static_ips - subnetwork = each.value.subnetwork - zone = each.value.zone + region = each.value.region + static_ips = each.value.static_ips + subnetwork_self_link = each.value.subnetwork_self_link + zone = each.value.zone # trigger replacement of login nodes when the controller instance is replaced replace_trigger = module.slurm_controller_instance.instances_self_links[0] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 44ed33f994..1227edcab7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -120,13 +120,12 @@ variable "login_nodes" { ipv6_access_config = optional(list(object({ network_tier = string })), []) - network = optional(string) - network_ip = optional(string, "") - nic_type = optional(string) - queue_count = optional(number) - stack_type = optional(string) - subnetwork = optional(string) - subnetwork_project = optional(string) + network = optional(string) + network_ip = optional(string, "") + nic_type = optional(string) + queue_count = optional(number) + stack_type = optional(string) + subnetwork_self_link = optional(string) })), []) bandwidth_tier = optional(string, "platform_default") can_ip_forward = optional(bool, false) @@ -163,7 +162,7 @@ variable "login_nodes" { source_image_project = optional(string) source_image = optional(string) static_ips = optional(list(string), []) - subnetwork = string + subnetwork_self_link = string spot = optional(bool, false) tags = optional(list(string), []) zone = optional(string) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index ab3e45fe9c..4ddc99a344 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -85,8 +85,8 @@ locals { static_ips = var.static_ips bandwidth_tier = var.bandwidth_tier - subnetwork = var.subnetwork_self_link - tags = var.tags + subnetwork_self_link = var.subnetwork_self_link + tags = var.tags } }