From 4ce431ca990bd8cf1281b82d4f3184424650df8d Mon Sep 17 00:00:00 2001 From: Abbas Mohamed <69826914+abbas1902@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:27:18 -0800 Subject: [PATCH] Remove terraform modules (#237) --- terraform/_network/README.md | 40 -- terraform/_network/README_TF.md | 64 --- terraform/_network/main.tf | 84 ---- terraform/_network/outputs.tf | 30 -- terraform/_network/variables.tf | 105 ---- terraform/_network/versions.tf | 19 - .../modules/_instance_template/README_TF.md | 107 ---- .../modules/_instance_template/main.tf | 207 -------- .../modules/_instance_template/outputs.tf | 36 -- .../modules/_instance_template/variables.tf | 367 -------------- .../modules/_instance_template/versions.tf | 33 -- .../modules/_slurm_instance/README.md | 28 -- .../modules/_slurm_instance/README_TF.md | 83 ---- .../modules/_slurm_instance/main.tf | 154 ------ .../modules/_slurm_instance/outputs.tf | 41 -- .../modules/_slurm_instance/variables.tf | 161 ------ .../modules/_slurm_instance/versions.tf | 35 -- .../modules/slurm_controller_hybrid/README.md | 52 -- .../slurm_controller_hybrid/README_TF.md | 86 ---- .../modules/slurm_controller_hybrid/main.tf | 203 -------- .../slurm_controller_hybrid/outputs.tf | 37 -- .../slurm_controller_hybrid/variables.tf | 75 --- .../slurm_controller_hybrid/versions.tf | 42 -- .../slurm_controller_instance/README.md | 68 --- .../slurm_controller_instance/README_TF.md | 79 --- .../modules/slurm_controller_instance/main.tf | 140 ------ .../slurm_controller_instance/outputs.tf | 60 --- .../slurm_controller_instance/variables.tf | 144 ------ .../slurm_controller_instance/versions.tf | 34 -- .../modules/slurm_destroy_nodes/README.md | 59 --- .../modules/slurm_destroy_nodes/README_TF.md | 59 --- .../modules/slurm_destroy_nodes/main.tf | 95 ---- .../modules/slurm_destroy_nodes/outputs.tf | 15 - .../modules/slurm_destroy_nodes/variables.tf | 59 --- .../modules/slurm_destroy_nodes/versions.tf | 30 -- .../slurm_destroy_resource_policies/README.md | 58 --- .../README_TF.md | 58 --- .../slurm_destroy_resource_policies/main.tf | 93 ---- .../outputs.tf | 15 - .../variables.tf | 58 --- .../versions.tf | 30 -- .../modules/slurm_files/README.md | 200 -------- .../modules/slurm_files/README_TF.md | 119 ----- .../slurm_files/files/external_epilog.sh | 18 - .../slurm_files/files/external_prolog.sh | 18 - .../slurm_files/files/setup_external.sh | 113 ----- .../slurm_cluster/modules/slurm_files/main.tf | 297 ----------- .../modules/slurm_files/outputs.tf | 60 --- .../modules/slurm_files/variables.tf | 467 ------------------ .../modules/slurm_files/versions.tf | 37 -- .../modules/slurm_instance_template/README.md | 72 --- .../slurm_instance_template/README_TF.md | 97 ---- .../modules/slurm_instance_template/main.tf | 165 ------- .../slurm_instance_template/outputs.tf | 40 -- .../slurm_instance_template/variables.tf | 388 --------------- .../slurm_instance_template/versions.tf | 30 -- .../modules/slurm_login_instance/README.md | 60 --- .../modules/slurm_login_instance/README_TF.md | 63 --- .../modules/slurm_login_instance/main.tf | 56 --- .../modules/slurm_login_instance/outputs.tf | 20 - .../modules/slurm_login_instance/variables.tf | 145 ------ .../modules/slurm_login_instance/versions.tf | 26 - .../modules/slurm_nodeset/README.md | 23 - .../modules/slurm_nodeset/README_TF.md | 70 --- .../modules/slurm_nodeset/main.tf | 69 --- .../modules/slurm_nodeset/outputs.tf | 25 - .../modules/slurm_nodeset/variables.tf | 193 -------- .../modules/slurm_nodeset/versions.tf | 30 -- .../modules/slurm_nodeset_dyn/README.md | 24 - .../modules/slurm_nodeset_dyn/README_TF.md | 54 -- .../modules/slurm_nodeset_dyn/main.tf | 32 -- .../modules/slurm_nodeset_dyn/outputs.tf | 25 - .../modules/slurm_nodeset_dyn/variables.tf | 30 -- .../modules/slurm_nodeset_dyn/versions.tf | 26 - .../modules/slurm_nodeset_tpu/README.md | 23 - .../modules/slurm_nodeset_tpu/README_TF.md | 73 --- .../modules/slurm_nodeset_tpu/main.tf | 121 ----- .../modules/slurm_nodeset_tpu/outputs.tf | 30 -- .../modules/slurm_nodeset_tpu/variables.tf | 158 ------ .../modules/slurm_nodeset_tpu/versions.tf | 30 -- .../modules/slurm_partition/README.md | 137 ----- .../modules/slurm_partition/README_TF.md | 64 --- .../modules/slurm_partition/main.tf | 61 --- .../modules/slurm_partition/outputs.tf | 25 - .../modules/slurm_partition/variables.tf | 137 ----- .../modules/slurm_partition/versions.tf | 30 -- terraform/slurm_firewall_rules/README.md | 60 --- terraform/slurm_firewall_rules/README_TF.md | 55 --- .../examples/simple/Makefile | 36 -- .../examples/simple/README.md | 39 -- .../examples/simple/README_TF.md | 53 -- .../examples/simple/example.tfvars | 19 - .../examples/simple/main.tf | 31 -- .../examples/simple/outputs.tf | 15 - .../examples/simple/variables.tf | 25 - .../examples/simple/versions.tf | 26 - terraform/slurm_firewall_rules/main.tf | 119 ----- terraform/slurm_firewall_rules/outputs.tf | 20 - terraform/slurm_firewall_rules/variables.tf | 97 ---- terraform/slurm_firewall_rules/versions.tf | 19 - terraform/slurm_sa_iam/README.md | 66 --- terraform/slurm_sa_iam/README_TF.md | 59 --- .../slurm_sa_iam/examples/simple/Makefile | 36 -- .../slurm_sa_iam/examples/simple/README.md | 38 -- .../slurm_sa_iam/examples/simple/README_TF.md | 51 -- .../examples/simple/example.tfvars | 19 - .../slurm_sa_iam/examples/simple/main.tf | 27 - .../slurm_sa_iam/examples/simple/outputs.tf | 19 - .../slurm_sa_iam/examples/simple/variables.tf | 25 - .../slurm_sa_iam/examples/simple/versions.tf | 26 - terraform/slurm_sa_iam/main.tf | 84 ---- terraform/slurm_sa_iam/outputs.tf | 23 - terraform/slurm_sa_iam/variables.tf | 46 -- terraform/slurm_sa_iam/versions.tf | 30 -- 114 files changed, 8387 deletions(-) delete mode 100644 terraform/_network/README.md delete mode 100644 terraform/_network/README_TF.md delete mode 100644 terraform/_network/main.tf delete mode 100644 terraform/_network/outputs.tf delete mode 100644 terraform/_network/variables.tf delete mode 100644 terraform/_network/versions.tf delete mode 100644 terraform/slurm_cluster/modules/_instance_template/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/_instance_template/main.tf delete mode 100644 terraform/slurm_cluster/modules/_instance_template/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/_instance_template/variables.tf delete mode 100644 terraform/slurm_cluster/modules/_instance_template/versions.tf delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/README.md delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/main.tf delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/variables.tf delete mode 100644 terraform/slurm_cluster/modules/_slurm_instance/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_hybrid/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_controller_instance/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_nodes/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_destroy_resource_policies/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_files/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_files/README_TF.md delete mode 100755 terraform/slurm_cluster/modules/slurm_files/files/external_epilog.sh delete mode 100755 terraform/slurm_cluster/modules/slurm_files/files/external_prolog.sh delete mode 100755 terraform/slurm_cluster/modules/slurm_files/files/setup_external.sh delete mode 100644 terraform/slurm_cluster/modules/slurm_files/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_files/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_files/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_files/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_instance_template/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_login_instance/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_dyn/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_nodeset_tpu/versions.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/README.md delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/README_TF.md delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/main.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/outputs.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/variables.tf delete mode 100644 terraform/slurm_cluster/modules/slurm_partition/versions.tf delete mode 100644 terraform/slurm_firewall_rules/README.md delete mode 100644 terraform/slurm_firewall_rules/README_TF.md delete mode 100644 terraform/slurm_firewall_rules/examples/simple/Makefile delete mode 100644 terraform/slurm_firewall_rules/examples/simple/README.md delete mode 100644 terraform/slurm_firewall_rules/examples/simple/README_TF.md delete mode 100644 terraform/slurm_firewall_rules/examples/simple/example.tfvars delete mode 100644 terraform/slurm_firewall_rules/examples/simple/main.tf delete mode 100644 terraform/slurm_firewall_rules/examples/simple/outputs.tf delete mode 100644 terraform/slurm_firewall_rules/examples/simple/variables.tf delete mode 100644 terraform/slurm_firewall_rules/examples/simple/versions.tf delete mode 100644 terraform/slurm_firewall_rules/main.tf delete mode 100644 terraform/slurm_firewall_rules/outputs.tf delete mode 100644 terraform/slurm_firewall_rules/variables.tf delete mode 100644 terraform/slurm_firewall_rules/versions.tf delete mode 100644 terraform/slurm_sa_iam/README.md delete mode 100644 terraform/slurm_sa_iam/README_TF.md delete mode 100644 terraform/slurm_sa_iam/examples/simple/Makefile delete mode 100644 terraform/slurm_sa_iam/examples/simple/README.md delete mode 100644 terraform/slurm_sa_iam/examples/simple/README_TF.md delete mode 100644 terraform/slurm_sa_iam/examples/simple/example.tfvars delete mode 100644 terraform/slurm_sa_iam/examples/simple/main.tf delete mode 100644 terraform/slurm_sa_iam/examples/simple/outputs.tf delete mode 100644 terraform/slurm_sa_iam/examples/simple/variables.tf delete mode 100644 terraform/slurm_sa_iam/examples/simple/versions.tf delete mode 100644 terraform/slurm_sa_iam/main.tf delete mode 100644 terraform/slurm_sa_iam/outputs.tf delete mode 100644 terraform/slurm_sa_iam/variables.tf delete mode 100644 terraform/slurm_sa_iam/versions.tf diff --git a/terraform/_network/README.md b/terraform/_network/README.md deleted file mode 100644 index eb466fde..00000000 --- a/terraform/_network/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Module: Network - - - -- [Module: Network](#module-network) - - [Overview](#overview) - - [Dependencies](#dependencies) - - [TerraformUser](#terraformuser) - - [Required](#required) - - [Module API](#module-api) - - - -## Overview - -This module creates a network and a nat and router for each specified -subnetwork. - -> **NOTE:** This module is intended for example purposes. For general usage, -> please consider using: -> -> - [terraform-google-modules/network/google](https://registry.terraform.io/modules/terraform-google-modules/network/google/latest) -> - [terraform-google-modules/cloud-router/google](https://registry.terraform.io/modules/terraform-google-modules/cloud-router/google/latest) -> - [terraform-google-modules/cloud-nat/google](https://registry.terraform.io/modules/terraform-google-modules/cloud-nat/google/latest) - -## Dependencies - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [Compute Engine API](../../docs/glossary.md#compute-engine) is enabled. - -### TerraformUser - -#### Required - -- Compute Network Admin (`roles/compute.networkAdmin`) - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/_network/README_TF.md b/terraform/_network/README_TF.md deleted file mode 100644 index a116fe64..00000000 --- a/terraform/_network/README_TF.md +++ /dev/null @@ -1,64 +0,0 @@ -# \_network - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [nat](#module\_nat) | terraform-google-modules/cloud-nat/google | ~> 2.0 | -| [network](#module\_network) | terraform-google-modules/network/google | ~> 4.0 | -| [router](#module\_router) | terraform-google-modules/cloud-router/google | ~> 1.0 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [auto\_create\_subnetworks](#input\_auto\_create\_subnetworks) | When set to true, the network is created in 'auto subnet mode' and it will
create a subnet for each region automatically across the 10.128.0.0/9
address range. When set to false, the network is created in 'custom subnet mode'
so the user can explicitly connect subnetwork resources. | `bool` | `false` | no | -| [delete\_default\_internet\_gateway\_routes](#input\_delete\_default\_internet\_gateway\_routes) | If set, ensure that all routes within the network specified whose names begin
with 'default-route' and with a next hop of 'default-internet-gateway' are
deleted. | `bool` | `false` | no | -| [description](#input\_description) | An optional description of this resource. The resource must be recreated to modify this field. | `string` | `""` | no | -| [firewall\_rules](#input\_firewall\_rules) | List of additional firewall rules. | `list(map(string))` | `[]` | no | -| [mtu](#input\_mtu) | The network MTU. Must be a value between 1460 and 1500 inclusive. If set to 0
(meaning MTU is unset), the network will default to 1460 automatically. | `number` | `0` | no | -| [network\_name](#input\_network\_name) | The name of the network being created. | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | The ID of the project where this VPC will be created. | `string` | n/a | yes | -| [routes](#input\_routes) | List of routes being created in this VPC. | `list(map(string))` | `[]` | no | -| [routing\_mode](#input\_routing\_mode) | The network routing mode (default 'GLOBAL') | `string` | `"GLOBAL"` | no | -| [secondary\_ranges](#input\_secondary\_ranges) | Secondary ranges that will be used in some of the subnets |
map(list(object({
range_name = string,
ip_cidr_range = string
})))
| `{}` | no | -| [shared\_vpc\_host](#input\_shared\_vpc\_host) | Makes this project a Shared VPC host if 'true' (default 'false') | `bool` | `false` | no | -| [subnets](#input\_subnets) | The list of subnets being created. | `list(map(string))` | `[]` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [nat](#output\_nat) | NAT details. | -| [network](#output\_network) | Network details. | -| [router](#output\_router) | Router details. | - diff --git a/terraform/_network/main.tf b/terraform/_network/main.tf deleted file mode 100644 index 1d1b64f9..00000000 --- a/terraform/_network/main.tf +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# NETWORK # -########### - -module "network" { - source = "terraform-google-modules/network/google" - version = "~> 4.0" - - project_id = var.project_id - description = var.description - network_name = var.network_name - routing_mode = var.routing_mode - shared_vpc_host = var.shared_vpc_host - mtu = var.mtu - auto_create_subnetworks = var.auto_create_subnetworks - delete_default_internet_gateway_routes = var.delete_default_internet_gateway_routes - - subnets = var.subnets - secondary_ranges = var.secondary_ranges - - routes = var.routes - - firewall_rules = var.firewall_rules -} - -########## -# ROUTER # -########## - -module "router" { - source = "terraform-google-modules/cloud-router/google" - version = "~> 1.0" - - for_each = module.network.subnets - - name = "${each.value.name}-router" - project = var.project_id - region = each.value.region - network = module.network.network.network_id -} - -####### -# NAT # -####### - -module "nat" { - source = "terraform-google-modules/cloud-nat/google" - version = "~> 2.0" - - for_each = module.router - - name = "${var.network_name}-nat" - project_id = var.project_id - region = each.value.router.region - router = each.value.router.name - - source_subnetwork_ip_ranges_to_nat = "LIST_OF_SUBNETWORKS" - - subnetworks = [ - { - name = each.key - source_ip_ranges_to_nat = ["PRIMARY_IP_RANGE"] - secondary_ip_range_names = ["LIST_OF_SECONDARY_IP_RANGES"] - }, - ] - - log_config_filter = "ERRORS_ONLY" -} diff --git a/terraform/_network/outputs.tf b/terraform/_network/outputs.tf deleted file mode 100644 index 026162cf..00000000 --- a/terraform/_network/outputs.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "network" { - description = "Network details." - value = module.network -} - -output "router" { - description = "Router details." - value = module.router -} - -output "nat" { - description = "NAT details." - value = module.nat -} diff --git a/terraform/_network/variables.tf b/terraform/_network/variables.tf deleted file mode 100644 index 1c8894e7..00000000 --- a/terraform/_network/variables.tf +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# GENERAL # -########### - -variable "project_id" { - type = string - description = "The ID of the project where this VPC will be created." -} - -variable "network_name" { - type = string - description = "The name of the network being created." -} - -variable "routing_mode" { - type = string - default = "GLOBAL" - description = "The network routing mode (default 'GLOBAL')" -} - -variable "shared_vpc_host" { - type = bool - description = "Makes this project a Shared VPC host if 'true' (default 'false')" - default = false -} - -variable "subnets" { - type = list(map(string)) - description = "The list of subnets being created." - default = [] -} - -variable "secondary_ranges" { - type = map(list(object({ - range_name = string, - ip_cidr_range = string - }))) - description = "Secondary ranges that will be used in some of the subnets" - default = {} -} - -variable "routes" { - type = list(map(string)) - description = "List of routes being created in this VPC." - default = [] -} - -variable "firewall_rules" { - type = list(map(string)) - description = "List of additional firewall rules." - default = [] -} - -variable "delete_default_internet_gateway_routes" { - type = bool - description = < -Copyright 2019 Google LLC -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >=0.13.0 | -| [google](#requirement\_google) | >= 3.88 | -| [google-beta](#requirement\_google-beta) | >= 6.13.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.88 | -| [google-beta](#provider\_google-beta) | >= 6.13.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [google-beta_google_compute_instance_template.tpl](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_instance_template) | resource | -| [google_project.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name |
list(object({
disk_name = string
device_name = string
auto_delete = bool
boot = bool
disk_size_gb = number
disk_type = string
disk_labels = map(string)
}))
| `[]` | no | -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
| `[]` | no | -| [alias\_ip\_range](#input\_alias\_ip\_range) | An array of alias IP ranges for this network interface. Can only be specified for network interfaces on subnet-mode networks.
ip\_cidr\_range: The IP CIDR range represented by this alias IP range. This IP CIDR range must belong to the specified subnetwork and cannot contain IP addresses reserved by system or used by other network interfaces. At the time of writing only a netmask (e.g. /24) may be supplied, with a CIDR format resulting in an API error.
subnetwork\_range\_name: The subnetwork secondary range name specifying the secondary range from which to allocate the IP CIDR range for this alias IP range. If left unspecified, the primary range of the subnetwork will be used. |
object({
ip_cidr_range = string
subnetwork_range_name = string
})
| `null` | no | -| [auto\_delete](#input\_auto\_delete) | Whether or not the boot disk should be auto-deleted | `string` | `"true"` | no | -| [automatic\_restart](#input\_automatic\_restart) | (Optional) Specifies whether the instance should be automatically restarted if it is terminated by Compute Engine (not terminated by a user). | `bool` | `true` | no | -| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example | `string` | `"false"` | no | -| [disk\_encryption\_key](#input\_disk\_encryption\_key) | The id of the encryption key that is stored in Google Cloud KMS to use to encrypt all the disks on this instance | `string` | `null` | no | -| [disk\_labels](#input\_disk\_labels) | Labels to be assigned to boot disk, provided as a map | `map(string)` | `{}` | no | -| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `string` | `"100"` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard | `string` | `"pd-standard"` | no | -| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Whether to enable the Confidential VM configuration on the instance. Note that the instance image must support Confidential VMs. See https://cloud.google.com/compute/docs/images | `bool` | `false` | no | -| [enable\_nested\_virtualization](#input\_enable\_nested\_virtualization) | Defines whether the instance should have nested virtualization enabled. | `bool` | `false` | no | -| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Whether to enable the Shielded VM configuration on the instance. Note that the instance image must support Shielded VMs. See https://cloud.google.com/compute/docs/images | `bool` | `false` | no | -| [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See https://cloud.google.com/compute/docs/gpus more details |
object({
type = string
count = number
})
| `null` | no | -| [instance\_termination\_action](#input\_instance\_termination\_action) | Which action to take when Compute Engine preempts the VM. Value can be: 'STOP', 'DELETE'. The default value is 'STOP'.
See https://cloud.google.com/compute/docs/instances/spot for more details. | `string` | `"STOP"` | no | -| [ipv6\_access\_config](#input\_ipv6\_access\_config) | IPv6 access configurations. Currently a max of 1 IPv6 access configuration is supported. If not specified, the instance will have no external IPv6 Internet access. |
list(object({
network_tier = string
}))
| `[]` | no | -| [labels](#input\_labels) | Labels, provided as a map | `map(string)` | `{}` | no | -| [machine\_type](#input\_machine\_type) | Machine type to create, e.g. n1-standard-1 | `string` | `"n1-standard-1"` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list: https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [name\_prefix](#input\_name\_prefix) | Name prefix for the instance template | `string` | `"default-instance-template"` | no | -| [network](#input\_network) | The name or self\_link of the network to attach this interface to. Use network attribute for Legacy or Auto subnetted networks and subnetwork for custom subnetted networks. | `string` | `""` | no | -| [network\_ip](#input\_network\_ip) | Private IP address to assign to the instance if desired. | `string` | `""` | no | -| [nic\_type](#input\_nic\_type) | The type of vNIC to be used on this interface. Possible values: GVNIC, VIRTIO\_NET. | `string` | `null` | no | -| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy | `string` | `"MIGRATE"` | no | -| [preemptible](#input\_preemptible) | Allow the instance to be preempted | `bool` | `false` | no | -| [project\_id](#input\_project\_id) | The GCP project ID | `string` | `null` | no | -| [region](#input\_region) | Region where the instance template should be created. | `string` | `null` | no | -| [resource\_policies](#input\_resource\_policies) | A list of self\_links of resource policies to attach to the instance.
Currently a max of 1 resource policy is supported. | `list(string)` | `null` | no | -| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#service_account. |
object({
email = optional(string)
scopes = set(string)
})
| n/a | yes | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Not used unless enable\_shielded\_vm is true. Shielded VM configuration for the instance. |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | -| [source\_image](#input\_source\_image) | Source disk image. If neither source\_image nor source\_image\_family is specified, defaults to the latest public CentOS image. | `string` | `""` | no | -| [source\_image\_family](#input\_source\_image\_family) | Source image family. If neither source\_image nor source\_image\_family is specified, defaults to the latest public CentOS image. | `string` | `"centos-7"` | no | -| [source\_image\_project](#input\_source\_image\_project) | Project where the source image comes from. The default project contains CentOS images. | `string` | `"centos-cloud"` | no | -| [spot](#input\_spot) | Provision as a SPOT preemptible instance.
See https://cloud.google.com/compute/docs/instances/spot for more details. | `bool` | `false` | no | -| [stack\_type](#input\_stack\_type) | The stack type for this network interface to identify whether the IPv6 feature is enabled or not. Values are `IPV4_IPV6` or `IPV4_ONLY`. Default behavior is equivalent to IPV4\_ONLY. | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | User startup script to run when instances spin up | `string` | `""` | no | -| [subnetwork](#input\_subnetwork) | The name of the subnetwork to attach this interface to. The subnetwork must exist in the same region this instance will be created in. Either network or subnetwork must be provided. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The ID of the project in which the subnetwork belongs. If it is not provided, the provider project is used. | `string` | `null` | no | -| [tags](#input\_tags) | Network tags, provided as a list | `list(string)` | `[]` | no | -| [threads\_per\_core](#input\_threads\_per\_core) | The number of threads per physical core. To disable simultaneous multithreading (SMT) set this to 1. | `number` | `null` | no | -| [total\_egress\_bandwidth\_tier](#input\_total\_egress\_bandwidth\_tier) | Network bandwidth tier. Note: machine\_type must be a supported type. Values are 'TIER\_1' or 'DEFAULT'.
See https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration for details. | `string` | `"DEFAULT"` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [name](#output\_name) | Name of instance template | -| [self\_link](#output\_self\_link) | Self-link of instance template | -| [service\_account](#output\_service\_account) | value | -| [tags](#output\_tags) | Tags that will be associated with instance(s) | - diff --git a/terraform/slurm_cluster/modules/_instance_template/main.tf b/terraform/slurm_cluster/modules/_instance_template/main.tf deleted file mode 100644 index e49fa25b..00000000 --- a/terraform/slurm_cluster/modules/_instance_template/main.tf +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Copyright 2019 Google LLC - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -######### -# Locals -######### - -locals { - source_image = var.source_image != "" ? var.source_image : "centos-7-v20201112" - source_image_family = var.source_image_family != "" ? var.source_image_family : "centos-7" - source_image_project = var.source_image_project != "" ? var.source_image_project : "centos-cloud" - - boot_disk = [ - { - source_image = var.source_image != "" ? format("${local.source_image_project}/${local.source_image}") : format("${local.source_image_project}/${local.source_image_family}") - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - disk_labels = var.disk_labels - auto_delete = var.auto_delete - boot = "true" - }, - ] - - all_disks = concat(local.boot_disk, var.additional_disks) - - # NOTE: Even if all the shielded_instance_config or confidential_instance_config - # values are false, if the config block exists and an unsupported image is chosen, - # the apply will fail so we use a single-value array with the default value to - # initialize the block only if it is enabled. - shielded_vm_configs = var.enable_shielded_vm ? [true] : [] - - gpu_enabled = var.gpu != null - alias_ip_range_enabled = var.alias_ip_range != null - preemptible = var.preemptible || var.spot - on_host_maintenance = ( - local.preemptible || var.enable_confidential_vm || local.gpu_enabled - ? "TERMINATE" - : var.on_host_maintenance - ) - automatic_restart = ( - # must be false when preemptible is true - local.preemptible ? false : var.automatic_restart - ) - - nic_type = var.total_egress_bandwidth_tier == "TIER_1" ? "GVNIC" : var.nic_type -} - -data "google_project" "this" { - project_id = var.project_id -} - -#################### -# Instance Template -#################### -resource "google_compute_instance_template" "tpl" { - provider = google-beta - name_prefix = "${var.name_prefix}-" - project = var.project_id - machine_type = var.machine_type - labels = var.labels - metadata = var.metadata - tags = var.tags - can_ip_forward = var.can_ip_forward - metadata_startup_script = var.startup_script - region = var.region - min_cpu_platform = var.min_cpu_platform - resource_policies = var.resource_policies - - service_account { - email = coalesce(var.service_account.email, "${data.google_project.this.number}-compute@developer.gserviceaccount.com") - scopes = lookup(var.service_account, "scopes", null) - } - - dynamic "disk" { - for_each = local.all_disks - content { - auto_delete = lookup(disk.value, "auto_delete", null) - boot = lookup(disk.value, "boot", null) - device_name = lookup(disk.value, "device_name", null) - disk_name = lookup(disk.value, "disk_name", null) - disk_size_gb = lookup(disk.value, "disk_size_gb", lookup(disk.value, "disk_type", null) == "local-ssd" ? "375" : null) - disk_type = lookup(disk.value, "disk_type", null) - interface = lookup(disk.value, "interface", lookup(disk.value, "disk_type", null) == "local-ssd" ? "NVME" : null) - mode = lookup(disk.value, "mode", null) - source = lookup(disk.value, "source", null) - source_image = lookup(disk.value, "source_image", null) - type = lookup(disk.value, "disk_type", null) == "local-ssd" ? "SCRATCH" : "PERSISTENT" - labels = lookup(disk.value, "disk_labels", null) - - dynamic "disk_encryption_key" { - for_each = compact([var.disk_encryption_key == null ? null : 1]) - content { - kms_key_self_link = var.disk_encryption_key - } - } - } - } - - network_interface { - network = var.network - subnetwork = var.subnetwork - subnetwork_project = var.subnetwork_project - network_ip = try(coalesce(var.network_ip), null) - nic_type = local.nic_type - stack_type = var.stack_type - dynamic "access_config" { - for_each = var.access_config - content { - nat_ip = access_config.value.nat_ip - network_tier = access_config.value.network_tier - } - } - dynamic "ipv6_access_config" { - for_each = var.ipv6_access_config - content { - network_tier = ipv6_access_config.value.network_tier - } - } - dynamic "alias_ip_range" { - for_each = local.alias_ip_range_enabled ? [var.alias_ip_range] : [] - content { - ip_cidr_range = alias_ip_range.value.ip_cidr_range - subnetwork_range_name = alias_ip_range.value.subnetwork_range_name - } - } - } - - dynamic "network_interface" { - for_each = var.additional_networks - content { - network = network_interface.value.network - subnetwork = network_interface.value.subnetwork - subnetwork_project = network_interface.value.subnetwork_project - network_ip = try(coalesce(network_interface.value.network_ip), null) - nic_type = try(coalesce(network_interface.value.nic_type), null) - dynamic "access_config" { - for_each = network_interface.value.access_config - content { - nat_ip = access_config.value.nat_ip - network_tier = access_config.value.network_tier - } - } - dynamic "ipv6_access_config" { - for_each = network_interface.value.ipv6_access_config - content { - network_tier = ipv6_access_config.value.network_tier - } - } - } - } - - network_performance_config { - total_egress_bandwidth_tier = coalesce(var.total_egress_bandwidth_tier, "DEFAULT") - } - - lifecycle { - create_before_destroy = "true" - } - - scheduling { - preemptible = local.preemptible - provisioning_model = local.preemptible ? "SPOT" : "STANDARD" - automatic_restart = local.automatic_restart - on_host_maintenance = local.on_host_maintenance - instance_termination_action = local.preemptible ? var.instance_termination_action : null - } - - advanced_machine_features { - enable_nested_virtualization = var.enable_nested_virtualization - threads_per_core = var.threads_per_core - } - - dynamic "shielded_instance_config" { - for_each = local.shielded_vm_configs - content { - enable_secure_boot = lookup(var.shielded_instance_config, "enable_secure_boot", shielded_instance_config.value) - enable_vtpm = lookup(var.shielded_instance_config, "enable_vtpm", shielded_instance_config.value) - enable_integrity_monitoring = lookup(var.shielded_instance_config, "enable_integrity_monitoring", shielded_instance_config.value) - } - } - - confidential_instance_config { - enable_confidential_compute = var.enable_confidential_vm - } - - dynamic "guest_accelerator" { - for_each = local.gpu_enabled ? [var.gpu] : [] - content { - type = guest_accelerator.value.type - count = guest_accelerator.value.count - } - } -} diff --git a/terraform/slurm_cluster/modules/_instance_template/outputs.tf b/terraform/slurm_cluster/modules/_instance_template/outputs.tf deleted file mode 100644 index 6241a1cd..00000000 --- a/terraform/slurm_cluster/modules/_instance_template/outputs.tf +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright 2018 Google LLC - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "self_link" { - description = "Self-link of instance template" - value = google_compute_instance_template.tpl.self_link -} - -output "name" { - description = "Name of instance template" - value = google_compute_instance_template.tpl.name -} - -output "tags" { - description = "Tags that will be associated with instance(s)" - value = google_compute_instance_template.tpl.tags -} - -output "service_account" { - description = "value" - value = google_compute_instance_template.tpl.service_account[0] -} diff --git a/terraform/slurm_cluster/modules/_instance_template/variables.tf b/terraform/slurm_cluster/modules/_instance_template/variables.tf deleted file mode 100644 index 2ae31e8f..00000000 --- a/terraform/slurm_cluster/modules/_instance_template/variables.tf +++ /dev/null @@ -1,367 +0,0 @@ -/** - * Copyright 2019 Google LLC - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "The GCP project ID" - default = null -} - -variable "name_prefix" { - description = "Name prefix for the instance template" - type = string - default = "default-instance-template" -} - -variable "machine_type" { - description = "Machine type to create, e.g. n1-standard-1" - type = string - default = "n1-standard-1" -} - -variable "min_cpu_platform" { - description = "Specifies a minimum CPU platform. Applicable values are the friendly names of CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list: https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform" - type = string - default = null -} - -variable "can_ip_forward" { - description = "Enable IP forwarding, for NAT instances for example" - type = string - default = "false" -} - -variable "tags" { - type = list(string) - description = "Network tags, provided as a list" - default = [] -} - -variable "labels" { - type = map(string) - description = "Labels, provided as a map" - default = {} -} - -variable "preemptible" { - type = bool - description = "Allow the instance to be preempted" - default = false -} - -variable "spot" { - description = <<-EOD - Provision as a SPOT preemptible instance. - See https://cloud.google.com/compute/docs/instances/spot for more details. - EOD - type = bool - default = false -} - -variable "instance_termination_action" { - description = <<-EOD - Which action to take when Compute Engine preempts the VM. Value can be: 'STOP', 'DELETE'. The default value is 'STOP'. - See https://cloud.google.com/compute/docs/instances/spot for more details. - EOD - type = string - default = "STOP" -} - -variable "automatic_restart" { - type = bool - description = "(Optional) Specifies whether the instance should be automatically restarted if it is terminated by Compute Engine (not terminated by a user)." - default = true -} - -variable "on_host_maintenance" { - type = string - description = "Instance availability Policy" - default = "MIGRATE" -} - -variable "region" { - type = string - description = "Region where the instance template should be created." - default = null -} - -variable "enable_nested_virtualization" { - type = bool - description = "Defines whether the instance should have nested virtualization enabled." - default = false -} - -variable "threads_per_core" { - description = "The number of threads per physical core. To disable simultaneous multithreading (SMT) set this to 1." - type = number - default = null -} - -####### -# disk -####### -variable "source_image" { - description = "Source disk image. If neither source_image nor source_image_family is specified, defaults to the latest public CentOS image." - type = string - default = "" -} - -variable "source_image_family" { - description = "Source image family. If neither source_image nor source_image_family is specified, defaults to the latest public CentOS image." - type = string - default = "centos-7" -} - -variable "source_image_project" { - description = "Project where the source image comes from. The default project contains CentOS images." - type = string - default = "centos-cloud" -} - -variable "disk_size_gb" { - description = "Boot disk size in GB" - type = string - default = "100" -} - -variable "disk_type" { - description = "Boot disk type, can be either pd-ssd, local-ssd, or pd-standard" - type = string - default = "pd-standard" -} - -variable "disk_labels" { - description = "Labels to be assigned to boot disk, provided as a map" - type = map(string) - default = {} -} - -variable "disk_encryption_key" { - description = "The id of the encryption key that is stored in Google Cloud KMS to use to encrypt all the disks on this instance" - type = string - default = null -} - -variable "auto_delete" { - description = "Whether or not the boot disk should be auto-deleted" - type = string - default = "true" -} - -variable "additional_disks" { - description = "List of maps of additional disks. See https://www.terraform.io/docs/providers/google/r/compute_instance_template#disk_name" - type = list(object({ - disk_name = string - device_name = string - auto_delete = bool - boot = bool - disk_size_gb = number - disk_type = string - disk_labels = map(string) - })) - default = [] -} - -#################### -# network_interface -#################### -variable "network" { - description = "The name or self_link of the network to attach this interface to. Use network attribute for Legacy or Auto subnetted networks and subnetwork for custom subnetted networks." - type = string - default = "" -} - -variable "nic_type" { - description = "The type of vNIC to be used on this interface. Possible values: GVNIC, VIRTIO_NET." - type = string - default = null -} - -variable "subnetwork" { - description = "The name of the subnetwork to attach this interface to. The subnetwork must exist in the same region this instance will be created in. Either network or subnetwork must be provided." - type = string - default = "" -} - -variable "subnetwork_project" { - description = "The ID of the project in which the subnetwork belongs. If it is not provided, the provider project is used." - type = string - default = null -} - -variable "network_ip" { - description = "Private IP address to assign to the instance if desired." - type = string - default = "" -} - -variable "stack_type" { - description = "The stack type for this network interface to identify whether the IPv6 feature is enabled or not. Values are `IPV4_IPV6` or `IPV4_ONLY`. Default behavior is equivalent to IPV4_ONLY." - type = string - default = null -} - -variable "additional_networks" { - description = "Additional network interface details for GCE, if any." - default = [] - type = list(object({ - network = string - subnetwork = string - subnetwork_project = string - network_ip = string - nic_type = string - access_config = list(object({ - nat_ip = string - network_tier = string - })) - ipv6_access_config = list(object({ - network_tier = string - })) - })) -} - -variable "total_egress_bandwidth_tier" { - description = < - -- [Module: Slurm Instance](#module-slurm-instance) - - [Overview](#overview) - - [Module API](#module-api) - - - -## Overview - -This module creates a [compute instance](../../../../docs/glossary.md#vm) from -[instance template](../../../../docs/glossary.md#instance-template) for a -[Slurm cluster](../slurm_cluster/README.md). - -> **NOTE:** This module is only intended to be used by Slurm modules. For -> general usage, please consider using: -> -> - [terraform-google-modules/vm/google//modules/compute_instance](https://registry.terraform.io/modules/terraform-google-modules/vm/google/latest/submodules/compute_instance). - -> **WARNING:** The source image is not modified. Make sure to use a compatible -> source image. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/_slurm_instance/README_TF.md b/terraform/slurm_cluster/modules/_slurm_instance/README_TF.md deleted file mode 100644 index febf151c..00000000 --- a/terraform/slurm_cluster/modules/_slurm_instance/README_TF.md +++ /dev/null @@ -1,83 +0,0 @@ -# \_slurm_instance - - -Copyright (C) SchedMD LLC. -Copyright 2018 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.43 | -| [local](#requirement\_local) | ~> 2.0 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.43 | -| [local](#provider\_local) | ~> 2.0 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [google_compute_instance_from_template.slurm_instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource | -| [null_resource.replace_trigger](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [google_compute_instance_template.base](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_instance_template) | data source | -| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | -| [local_file.startup](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [add\_hostname\_suffix](#input\_add\_hostname\_suffix) | Adds a suffix to the hostname | `bool` | `true` | no | -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | -| [hostname](#input\_hostname) | Hostname of instances | `string` | `""` | no | -| [hostname\_suffix\_separator](#input\_hostname\_suffix\_separator) | Separator character to compose hostname when add\_hostname\_suffix is set to true. | `string` | `"-"` | no | -| [instance\_template](#input\_instance\_template) | Instance template self\_link used to create compute instances | `string` | n/a | yes | -| [labels](#input\_labels) | Labels, provided as a map. Merged and takes precedence over labels on instance template | `map(string)` | `{}` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [network](#input\_network) | Network to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [num\_instances](#input\_num\_instances) | Number of instances to create. This value is ignored if static\_ips is provided. | `number` | `1` | no | -| [project\_id](#input\_project\_id) | The GCP project ID | `string` | `null` | no | -| [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | -| [replace\_trigger](#input\_replace\_trigger) | Trigger value to replace the instances. | `string` | `""` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [slurm\_instance\_role](#input\_slurm\_instance\_role) | Slurm instance type. Must be one of: controller; login; compute. | `string` | `null` | no | -| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances | `list(string)` | `[]` | no | -| [subnetwork](#input\_subnetwork) | Subnet to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to | `string` | `null` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be spread across available zones in the region. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [available\_zones](#output\_available\_zones) | List of available zones in region | -| [instances\_details](#output\_instances\_details) | List of all details for compute instances | -| [instances\_self\_links](#output\_instances\_self\_links) | List of self-links for compute instances | -| [names](#output\_names) | List of available zones in region | -| [slurm\_instances](#output\_slurm\_instances) | List of all resource objects for compute instances | - diff --git a/terraform/slurm_cluster/modules/_slurm_instance/main.tf b/terraform/slurm_cluster/modules/_slurm_instance/main.tf deleted file mode 100644 index f7706d3e..00000000 --- a/terraform/slurm_cluster/modules/_slurm_instance/main.tf +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * Copyright 2018 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - hostname = var.hostname == "" ? "default" : var.hostname - num_instances = length(var.static_ips) == 0 ? var.num_instances : length(var.static_ips) - - # local.static_ips is the same as var.static_ips with a dummy element appended - # at the end of the list to work around "list does not have any elements so cannot - # determine type" error when var.static_ips is empty - static_ips = concat(var.static_ips, ["NOT_AN_IP"]) -} - -################# -# LOCALS: SLURM # -################# - -locals { - network_interfaces = [for index in range(local.num_instances) : - concat([ - { - access_config = var.access_config - alias_ip_range = [] - ipv6_access_config = [] - network = var.network - network_ip = length(var.static_ips) == 0 ? "" : element(local.static_ips, index) - nic_type = null - queue_count = null - stack_type = null - subnetwork = var.subnetwork - subnetwork_project = var.subnetwork_project - } - ], - var.additional_networks - ) - ] - - slurm_instance_role = lower(var.slurm_instance_role) - - scripts_dir = abspath("${path.module}/../../../../scripts") -} - -################ -# DATA SOURCES # -################ - -data "google_compute_zones" "available" { - project = var.project_id - region = var.region -} - -data "google_compute_instance_template" "base" { - project = var.project_id - name = var.instance_template -} - -data "local_file" "startup" { - filename = abspath("${local.scripts_dir}/startup.sh") -} - -############# -# INSTANCES # -############# -resource "null_resource" "replace_trigger" { - triggers = { - trigger = var.replace_trigger - } -} - -resource "google_compute_instance_from_template" "slurm_instance" { - count = local.num_instances - name = var.add_hostname_suffix ? format("%s%s%s", local.hostname, var.hostname_suffix_separator, format("%03d", count.index + 1)) : local.hostname - project = var.project_id - zone = var.zone == null ? data.google_compute_zones.available.names[count.index % length(data.google_compute_zones.available.names)] : var.zone - - allow_stopping_for_update = true - - dynamic "network_interface" { - for_each = local.network_interfaces[count.index] - iterator = nic - content { - dynamic "access_config" { - for_each = nic.value.access_config - content { - nat_ip = access_config.value.nat_ip - network_tier = access_config.value.network_tier - } - } - dynamic "alias_ip_range" { - for_each = nic.value.alias_ip_range - content { - ip_cidr_range = alias_ip_range.value.ip_cidr_range - subnetwork_range_name = alias_ip_range.value.subnetwork_range_name - } - } - dynamic "ipv6_access_config" { - for_each = nic.value.ipv6_access_config - iterator = access_config - content { - network_tier = access_config.value.network_tier - } - } - network = nic.value.network - network_ip = nic.value.network_ip - nic_type = nic.value.nic_type - queue_count = nic.value.queue_count - subnetwork = nic.value.subnetwork - subnetwork_project = nic.value.subnetwork_project - } - } - - source_instance_template = data.google_compute_instance_template.base.self_link - - # Slurm - labels = merge( - data.google_compute_instance_template.base.labels, - var.labels, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - ) - metadata = merge( - data.google_compute_instance_template.base.metadata, - var.metadata, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - startup-script = data.local_file.startup.content - }, - ) - - lifecycle { - replace_triggered_by = [null_resource.replace_trigger.id] - } -} diff --git a/terraform/slurm_cluster/modules/_slurm_instance/outputs.tf b/terraform/slurm_cluster/modules/_slurm_instance/outputs.tf deleted file mode 100644 index 4eba78a7..00000000 --- a/terraform/slurm_cluster/modules/_slurm_instance/outputs.tf +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * Copyright 2018 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "slurm_instances" { - description = "List of all resource objects for compute instances" - value = google_compute_instance_from_template.slurm_instance -} - -output "instances_self_links" { - description = "List of self-links for compute instances" - value = google_compute_instance_from_template.slurm_instance[*].self_link -} - -output "instances_details" { - description = "List of all details for compute instances" - value = google_compute_instance_from_template.slurm_instance[*] -} - -output "available_zones" { - description = "List of available zones in region" - value = data.google_compute_zones.available.names -} - -output "names" { - description = "List of available zones in region" - value = google_compute_instance_from_template.slurm_instance[*].name -} diff --git a/terraform/slurm_cluster/modules/_slurm_instance/variables.tf b/terraform/slurm_cluster/modules/_slurm_instance/variables.tf deleted file mode 100644 index 697d5c4b..00000000 --- a/terraform/slurm_cluster/modules/_slurm_instance/variables.tf +++ /dev/null @@ -1,161 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * Copyright 2018 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "The GCP project ID" - default = null -} - -variable "network" { - description = "Network to deploy to. Only one of network or subnetwork should be specified." - type = string - default = "" -} - -variable "subnetwork" { - description = "Subnet to deploy to. Only one of network or subnetwork should be specified." - type = string - default = "" -} - -variable "subnetwork_project" { - description = "The project that subnetwork belongs to" - type = string - default = null -} - -variable "hostname" { - description = "Hostname of instances" - type = string - default = "" -} - -variable "add_hostname_suffix" { - description = "Adds a suffix to the hostname" - type = bool - default = true -} - -variable "additional_networks" { - description = "Additional network interface details for GCE, if any." - default = [] - type = list(object({ - access_config = optional(list(object({ - nat_ip = string - network_tier = string - })), []) - alias_ip_range = optional(list(object({ - ip_cidr_range = string - subnetwork_range_name = string - })), []) - ipv6_access_config = optional(list(object({ - network_tier = string - })), []) - network = optional(string) - network_ip = optional(string, "") - nic_type = optional(string) - queue_count = optional(number) - stack_type = optional(string) - subnetwork = optional(string) - subnetwork_project = optional(string) - })) - nullable = false -} - -variable "static_ips" { - description = "List of static IPs for VM instances" - type = list(string) - default = [] -} - -variable "access_config" { - description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." - type = list(object({ - nat_ip = string - network_tier = string - })) - default = [] -} - -variable "num_instances" { - description = "Number of instances to create. This value is ignored if static_ips is provided." - type = number - default = 1 -} - -variable "instance_template" { - description = "Instance template self_link used to create compute instances" - type = string -} - -variable "region" { - description = "Region where the instances should be created." - type = string - default = null -} - -variable "zone" { - description = "Zone where the instances should be created. If not specified, instances will be spread across available zones in the region." - type = string - default = null -} - -variable "hostname_suffix_separator" { - description = "Separator character to compose hostname when add_hostname_suffix is set to true." - type = string - default = "-" -} - -variable "metadata" { - type = map(string) - description = "Metadata, provided as a map" - default = {} -} - -variable "labels" { - type = map(string) - description = "Labels, provided as a map. Merged and takes precedence over labels on instance template" - default = {} -} - -######### -# SLURM # -######### - -variable "slurm_instance_role" { - description = "Slurm instance type. Must be one of: controller; login; compute." - type = string - default = null - - validation { - condition = contains(["controller", "login", "compute"], lower(var.slurm_instance_role)) - error_message = "Must be one of: controller; login; compute." - } -} - -variable "slurm_cluster_name" { - description = "Cluster name, used for resource naming." - type = string -} - - -variable "replace_trigger" { - description = "Trigger value to replace the instances." - type = string - default = "" -} diff --git a/terraform/slurm_cluster/modules/_slurm_instance/versions.tf b/terraform/slurm_cluster/modules/_slurm_instance/versions.tf deleted file mode 100644 index 293a1ef8..00000000 --- a/terraform/slurm_cluster/modules/_slurm_instance/versions.tf +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * Copyright 2018 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 3.43" - } - local = { - source = "hashicorp/local" - version = "~> 2.0" - } - null = { - source = "hashicorp/null" - version = "~> 3.0" - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_controller_hybrid/README.md b/terraform/slurm_cluster/modules/slurm_controller_hybrid/README.md deleted file mode 100644 index 3c5ead9a..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_hybrid/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Module: Slurm Controller Hybrid - -[FAQ](../../../../docs/faq.md) | -[Troubleshooting](../../../../docs/troubleshooting.md) | -[Glossary](../../../../docs/glossary.md) - - - -- [Module: Slurm Controller Hybrid](#module-slurm-controller-hybrid) - - [Overview](#overview) - - [Usage](#usage) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md).. This -module creates a -[null_resource](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) -to manage the task generating all files and cloud resources required to support -a hybrid environment in [GCP](../../../../docs/glossary.md#gcp). - -## Usage - -See [examples](../../examples/slurm_controller_hybrid/) directory for sample -usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_controller_hybrid" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/modules/slurm_controller_hybrid?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - output_dir = "/etc/slurm" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_controller_hybrid/README_TF.md b/terraform/slurm_cluster/modules/slurm_controller_hybrid/README_TF.md deleted file mode 100644 index 04389d4e..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_hybrid/README_TF.md +++ /dev/null @@ -1,86 +0,0 @@ -# slurm_controller_hybrid - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | -| [jinja](#requirement\_jinja) | ~> 1.15.0 | -| [local](#requirement\_local) | ~> 2.0 | -| [null](#requirement\_null) | ~> 3.0 | -| [random](#requirement\_random) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [jinja](#provider\_jinja) | ~> 1.15.0 | -| [local](#provider\_local) | ~> 2.0 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | ../slurm_destroy_nodes | n/a | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | ../slurm_destroy_resource_policies | n/a | - -## Resources - -| Name | Type | -|------|------| -| [local_file.conf_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.config_yaml](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.resume_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.slurmcmd_service](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.slurmcmd_timer](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.slurmsync_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.startup_sh](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.suspend_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [local_file.util_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | -| [null_resource.setup_hybrid](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [jinja_template.slurmcmd_service](https://registry.terraform.io/providers/NikolaLohinski/jinja/latest/docs/data-sources/template) | data source | -| [jinja_template.slurmcmd_timer](https://registry.terraform.io/providers/NikolaLohinski/jinja/latest/docs/data-sources/template) | data source | -| [local_file.conf_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.resume_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.setup_hybrid_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.slurmsync_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.startup_sh](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.suspend_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.util_py](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [config](#input\_config) | Cluster configuration. Use 'module.slurm\_files.config' as value. | `any` | n/a | yes | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and script dependencies.

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | -| [slurm\_user](#input\_slurm\_user) | Name of the slurm user.
Defaults to "slurm". | `string` | `"slurm"` | no | -| [slurmcmd\_timeout](#input\_slurmcmd\_timeout) | The wait time between slurmcmd service runs in seconds.
It default to 30. | `number` | `30` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [cloud\_logging\_filter](#output\_cloud\_logging\_filter) | Cloud Logging filter to find startup errors. | -| [output\_dir](#output\_output\_dir) | Directory where configuration files are written to. | -| [slurm\_cluster\_name](#output\_slurm\_cluster\_name) | Cluster name for resource naming and slurm accounting. | - diff --git a/terraform/slurm_cluster/modules/slurm_controller_hybrid/main.tf b/terraform/slurm_cluster/modules/slurm_controller_hybrid/main.tf deleted file mode 100644 index ab64c5a9..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_hybrid/main.tf +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - scripts_dir = abspath("${path.module}/../../../../scripts") - slurmcmd_template_dir = abspath("${path.module}/../../../../ansible/roles/slurmcmd/templates") - - template_context = { - service_user = var.slurm_user - service_path = local.install_dir - slurmcmd_timeout = var.slurmcmd_timeout - } - - output_dir = var.config.output_dir - install_dir = var.config.install_dir -} - -################ -# DATA: SCRIPT # -################ - -data "local_file" "setup_hybrid_py" { - filename = abspath("${local.scripts_dir}/setup_hybrid.py") -} - -data "local_file" "resume_py" { - filename = abspath("${local.scripts_dir}/resume.py") -} - -data "local_file" "suspend_py" { - filename = abspath("${local.scripts_dir}/suspend.py") -} - -data "local_file" "util_py" { - filename = abspath("${local.scripts_dir}/util.py") -} - -data "local_file" "conf_py" { - filename = abspath("${local.scripts_dir}/conf.py") -} - -data "local_file" "slurmsync_py" { - filename = abspath("${local.scripts_dir}/slurmsync.py") -} - -data "local_file" "startup_sh" { - filename = abspath("${local.scripts_dir}/startup.sh") -} - -########### -# SCRIPTS # -########### - -resource "local_file" "resume_py" { - content = data.local_file.resume_py.content - filename = abspath("${local.output_dir}/resume.py") - - file_permission = "0700" -} - -resource "local_file" "suspend_py" { - content = data.local_file.suspend_py.content - filename = abspath("${local.output_dir}/suspend.py") - - file_permission = "0700" -} - -resource "local_file" "util_py" { - content = data.local_file.util_py.content - filename = abspath("${local.output_dir}/util.py") - - file_permission = "0700" -} - -resource "local_file" "conf_py" { - content = data.local_file.conf_py.content - filename = abspath("${local.output_dir}/conf.py") - - file_permission = "0700" -} - -resource "local_file" "slurmsync_py" { - content = data.local_file.slurmsync_py.content - filename = abspath("${local.output_dir}/slurmsync.py") - - file_permission = "0700" -} - -resource "local_file" "startup_sh" { - content = data.local_file.startup_sh.content - filename = abspath("${local.output_dir}/startup.sh") - - file_permission = "0700" -} - -data "jinja_template" "slurmcmd_service" { - template = abspath("${local.slurmcmd_template_dir}/slurmcmd.service.j2") - context { - type = "yaml" - data = yamlencode(local.template_context) - } -} -resource "local_file" "slurmcmd_service" { - content = data.jinja_template.slurmcmd_service.result - filename = abspath("${local.output_dir}/slurmcmd.service") - - file_permission = "0644" -} - -data "jinja_template" "slurmcmd_timer" { - template = abspath("${local.slurmcmd_template_dir}/slurmcmd.timer.j2") - context { - type = "yaml" - data = yamlencode(local.template_context) - } -} -resource "local_file" "slurmcmd_timer" { - content = data.jinja_template.slurmcmd_timer.result - filename = abspath("${local.output_dir}/slurmcmd.timer") - - file_permission = "0644" -} - -########## -# CONFIG # -########## - -resource "local_file" "config_yaml" { - filename = abspath("${local.output_dir}/config.yaml") - content = yamlencode(var.config) - - file_permission = "0600" -} - -######### -# SETUP # -######### - -resource "null_resource" "setup_hybrid" { - triggers = merge({ - scripts_dir = local.scripts_dir - config_dir = local.output_dir - config = local_file.config_yaml.content - config_path = local_file.config_yaml.filename - script_path = data.local_file.setup_hybrid_py.filename - }, - ) - - provisioner "local-exec" { - working_dir = self.triggers.scripts_dir - environment = { - SLURM_CONFIG_YAML = self.triggers.config_path - } - command = self.triggers.script_path - } -} - -################# -# DESTROY NODES # -################# - -# Destroy all compute nodes on `terraform destroy` -module "cleanup_compute_nodes" { - source = "../slurm_destroy_nodes" - - count = var.enable_cleanup_compute ? 1 : 0 - - project_id = var.project_id - slurm_cluster_name = var.slurm_cluster_name - when_destroy = true -} - -############################# -# DESTROY RESOURCE POLICIES # -############################# - -# Destroy all resource policies on `terraform destroy` -module "cleanup_resource_policies" { - source = "../slurm_destroy_resource_policies" - - count = var.enable_cleanup_compute ? 1 : 0 - - slurm_cluster_name = var.slurm_cluster_name - project_id = var.project_id - when_destroy = true -} diff --git a/terraform/slurm_cluster/modules/slurm_controller_hybrid/outputs.tf b/terraform/slurm_cluster/modules/slurm_controller_hybrid/outputs.tf deleted file mode 100644 index 89bc59ea..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_hybrid/outputs.tf +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "slurm_cluster_name" { - description = "Cluster name for resource naming and slurm accounting." - value = var.slurm_cluster_name -} - -output "output_dir" { - description = "Directory where configuration files are written to." - value = local.output_dir -} - -########## -# GOOGLE # -########## - -output "cloud_logging_filter" { - description = "Cloud Logging filter to find startup errors." - value = <<-EOT - resource.type="gce_instance" - logName=("projects/${var.project_id}/logs/slurm_resume" OR "projects/${var.project_id}/logs/slurm_suspend" OR "projects/${var.project_id}/logs/slurm_sync" OR "projects/${var.project_id}/logs/slurmctld" OR "projects/${var.project_id}/logs/slurmd" OR "projects/${var.project_id}/logs/slurmdbd") OR (logName=("projects/${var.project_id}/logs/syslog") AND jsonPayload.message=~"google_metadata_script_runner") - EOT -} diff --git a/terraform/slurm_cluster/modules/slurm_controller_hybrid/variables.tf b/terraform/slurm_cluster/modules/slurm_controller_hybrid/variables.tf deleted file mode 100644 index 28b1580f..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_hybrid/variables.tf +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# GENERAL # -########### - -variable "project_id" { - type = string - description = "Project ID to create resources in." -} - -######### -# SLURM # -######### - -variable "slurm_cluster_name" { - type = string - description = "Cluster name, used for resource naming and slurm accounting." - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." - } -} - -variable "config" { - description = "Cluster configuration. Use 'module.slurm_files.config' as value." - type = any -} - -variable "enable_cleanup_compute" { - description = < - -- [Module: Slurm Controller Instance](#module-slurm-controller-instance) - - [Overview](#overview) - - [Usage](#usage) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). This -module creates a Slurm controller instance from -[instance template](../../../../docs/glossary.md#instance-template). Certain -properties from the -[instance template](../../../../docs/glossary.md#instance-template) will be -overridden when instanceated as a [VM](../../../../docs/glossary.md#vm). - -It is recommended to pass in an -[instance template](../../../../docs/glossary.md#instance-template) generated by -the [slurm_instance_template](../slurm_instance_template/README.md) module. - -The controller is responsible for managing compute instances defined by multiple -[slurm_partition](../slurm_partition/README.md). - -The controller instance run [slurmctld](../../../../docs/glossary.md#slurmctld), -[slurmdbd](../../../../docs/glossary.md#slurmdbd), and -[slurmrestd](../../../../docs/glossary.md#slurmrestd). - -## Usage - -See [examples](../../examples/slurm_controller_instance/) directory for sample -usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_controller_instance" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm/cluster/modules/slurm_controller_instance?ref=v5.0.0" - - project_id = "" - - region = "us-central1" - subnetwork = "default" - - instance_template = "" - - slurm_cluster_name = "" - slurm_cluster_id = "" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_controller_instance/README_TF.md b/terraform/slurm_cluster/modules/slurm_controller_instance/README_TF.md deleted file mode 100644 index 614fb88b..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_instance/README_TF.md +++ /dev/null @@ -1,79 +0,0 @@ -# slurm_controller_instance - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 4.83.0 | -| [local](#requirement\_local) | ~> 2.0 | -| [random](#requirement\_random) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 4.83.0 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | ../slurm_destroy_nodes | n/a | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | ../slurm_destroy_resource_policies | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | ../_slurm_instance | n/a | - -## Resources - -| Name | Type | -|------|------| -| [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | -| [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | -| [google_compute_instance_template.controller_template](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_instance_template) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
* server\_ip : Address of the database server.
* user : The user to access the database as.
* password : The password, given the user, to access the given database. (sensitive)
* db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and script dependencies.

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_public\_ip](#input\_enable\_public\_ip) | Enables IP address to access the Internet. | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Instance template self\_link used to create compute instances. | `string` | n/a | yes | -| [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [network](#input\_network) | Network to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [network\_tier](#input\_network\_tier) | The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED\_STANDARD or STANDARD.
Ignored if enable\_public\_ip is false. | `string` | `"STANDARD"` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | The cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | -| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork](#input\_subnetwork) | Subnet to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [cloud\_logging\_filter](#output\_cloud\_logging\_filter) | Cloud Logging filter to find startup errors. | -| [cloudsql\_secret](#output\_cloudsql\_secret) | Cloudsql secret by URI. | -| [instances\_self\_links](#output\_instances\_self\_links) | Controller instance resource. | -| [slurm\_cluster\_name](#output\_slurm\_cluster\_name) | Cluster name for resource naming and slurm accounting. | -| [slurm\_controller\_instance](#output\_slurm\_controller\_instance) | Controller instance module. | -| [slurm\_controller\_instances](#output\_slurm\_controller\_instances) | Controller instance resource. | - diff --git a/terraform/slurm_cluster/modules/slurm_controller_instance/main.tf b/terraform/slurm_cluster/modules/slurm_controller_instance/main.tf deleted file mode 100644 index c7d96128..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_instance/main.tf +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - region = ( - length(regexall("/regions/([^/]*)", var.subnetwork)) > 0 - ? flatten(regexall("/regions/([^/]*)", var.subnetwork))[0] - : var.region - ) - - service_account_email = ( - var.cloudsql != null - ? data.google_compute_instance_template.controller_template[0].service_account[0].email - : null - ) - - access_config = { - nat_ip = null - network_tier = var.network_tier - } -} - -################## -# DATA: TEMPLATE # -################## - -data "google_compute_instance_template" "controller_template" { - count = var.cloudsql != null ? 1 : 0 - - name = var.instance_template -} - -############ -# INSTANCE # -############ - -module "slurm_controller_instance" { - source = "../_slurm_instance" - - access_config = var.enable_public_ip ? [local.access_config] : [] - add_hostname_suffix = false - hostname = "${var.slurm_cluster_name}-controller" - instance_template = var.instance_template - network = var.network - project_id = var.project_id - region = local.region - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = "controller" - static_ips = var.static_ips - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork - zone = var.zone - - metadata = var.metadata - - depends_on = [ - # Ensure nodes are destroyed before controller is - module.cleanup_compute_nodes[0], - ] -} - -##################### -# SECRETS: CLOUDSQL # -##################### - -resource "google_secret_manager_secret" "cloudsql" { - count = var.cloudsql != null ? 1 : 0 - - secret_id = "${var.slurm_cluster_name}-slurm-secret-cloudsql" - - replication { - auto {} - } - - labels = { - slurm_cluster_name = var.slurm_cluster_name - } -} - -resource "google_secret_manager_secret_version" "cloudsql_version" { - count = var.cloudsql != null ? 1 : 0 - - secret = google_secret_manager_secret.cloudsql[0].id - secret_data = jsonencode(var.cloudsql) -} - -resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { - count = var.cloudsql != null ? 1 : 0 - - secret_id = google_secret_manager_secret.cloudsql[0].id - role = "roles/secretmanager.secretAccessor" - member = "serviceAccount:${local.service_account_email}" -} - -################# -# DESTROY NODES # -################# - -# Destroy all compute nodes on `terraform destroy` -module "cleanup_compute_nodes" { - source = "../slurm_destroy_nodes" - - count = var.enable_cleanup_compute ? 1 : 0 - - slurm_cluster_name = var.slurm_cluster_name - project_id = var.project_id - when_destroy = true -} - -############################# -# DESTROY RESOURCE POLICIES # -############################# - -# Destroy all resource policies on `terraform destroy` -module "cleanup_resource_policies" { - source = "../slurm_destroy_resource_policies" - - count = var.enable_cleanup_compute ? 1 : 0 - - slurm_cluster_name = var.slurm_cluster_name - project_id = var.project_id - when_destroy = true -} diff --git a/terraform/slurm_cluster/modules/slurm_controller_instance/outputs.tf b/terraform/slurm_cluster/modules/slurm_controller_instance/outputs.tf deleted file mode 100644 index 2639ca6c..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_instance/outputs.tf +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -############ -# INSTANCE # -############ - -output "slurm_controller_instance" { - description = "Controller instance module." - value = module.slurm_controller_instance -} - -output "slurm_controller_instances" { - description = "Controller instance resource." - value = module.slurm_controller_instance.slurm_instances -} - -output "instances_self_links" { - description = "Controller instance resource." - value = module.slurm_controller_instance.instances_self_links -} - -######### -# SLURM # -######### - -output "slurm_cluster_name" { - description = "Cluster name for resource naming and slurm accounting." - value = var.slurm_cluster_name -} - -output "cloudsql_secret" { - description = "Cloudsql secret by URI." - value = one(google_secret_manager_secret_version.cloudsql_version[*].id) -} - -########## -# GOOGLE # -########## - -output "cloud_logging_filter" { - description = "Cloud Logging filter to find startup errors." - value = <<-EOT - resource.type="gce_instance" - logName=("projects/${var.project_id}/logs/slurm_resume" OR "projects/${var.project_id}/logs/slurm_suspend" OR "projects/${var.project_id}/logs/slurm_sync" OR "projects/${var.project_id}/logs/slurmctld" OR "projects/${var.project_id}/logs/slurmd" OR "projects/${var.project_id}/logs/slurmdbd") OR (logName=("projects/${var.project_id}/logs/syslog") AND jsonPayload.message=~"google_metadata_script_runner") - EOT -} diff --git a/terraform/slurm_cluster/modules/slurm_controller_instance/variables.tf b/terraform/slurm_cluster/modules/slurm_controller_instance/variables.tf deleted file mode 100644 index f07839d3..00000000 --- a/terraform/slurm_cluster/modules/slurm_controller_instance/variables.tf +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "Project ID to create resources in." -} - -########### -# NETWORK # -########### - -variable "network" { - type = string - description = "Network to deploy to. Only one of network or subnetwork should be specified." - default = "" -} - -variable "subnetwork" { - type = string - description = "Subnet to deploy to. Only one of network or subnetwork should be specified." - default = "" -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." - default = null -} - -variable "region" { - type = string - description = "Region where the instances should be created." - default = null -} - -############ -# INSTANCE # -############ - -variable "instance_template" { - type = string - description = "Instance template self_link used to create compute instances." -} - -variable "static_ips" { - type = list(string) - description = "List of static IPs for VM instances." - default = [] -} - -variable "enable_public_ip" { - description = "Enables IP address to access the Internet." - type = bool - default = false -} - -variable "network_tier" { - type = string - description = <<-EOD - The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED_STANDARD or STANDARD. - Ignored if enable_public_ip is false. - EOD - default = "STANDARD" - - validation { - condition = var.network_tier == null ? true : contains(["PREMIUM", "FIXED_STANDARD", "STANDARD"], var.network_tier) - error_message = "Allow values are: 'PREMIUM', 'FIXED_STANDARD', 'STANDARD'." - } -} - -variable "zone" { - type = string - description = < - -- [Module: Slurm Destroy Nodes](#module-slurm-destroy-nodes) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Module API](#module-api) - - - -## Overview - -This module creates a -[null_resource](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) -to manage the task of destroying [VM](../../../../docs/glossary.md#vm) instances -that are labeled with the input [slurm_cluster_id](./README_TF.md#inputs). It -can be configured with triggers that will cause it to re-run the task when -infrastructure changes. - -## Usage - -See [examples](../../examples/slurm_destroy_nodes/) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_destroy_nodes" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=v5.0.0" - - slurm_cluster_name = "" - - project_id = "" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -- [GCP Cloud SDK](https://cloud.google.com/sdk/downloads) is installed. -- [Python](../../../../docs/glossary.md#python) is installed. - - Required Version: `>= 3.6.0, < 4.0.0` -- [Pip](../../../../docs/glossary.md#pip) packages are installed. - - `pip3 install -r ../../../scripts/requirements.txt --user` - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_destroy_nodes/README_TF.md b/terraform/slurm_cluster/modules/slurm_destroy_nodes/README_TF.md deleted file mode 100644 index da99b9ec..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_nodes/README_TF.md +++ /dev/null @@ -1,59 +0,0 @@ -# slurm_destroy_nodes - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [local](#requirement\_local) | ~> 2.0 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [local](#provider\_local) | ~> 2.0 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.destroy_nodes_on_create](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.destroy_nodes_on_destroy](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [local_file.destroy_nodes](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [exclude\_list](#input\_exclude\_list) | Exclude destruction of these compute nodes, by instance name. | `list(string)` | `[]` | no | -| [project\_id](#input\_project\_id) | The project ID | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Destroy compute nodes labeled with this slurm\_cluster\_name. | `string` | n/a | yes | -| [target\_list](#input\_target\_list) | Target destruction of these compute nodes, by instance name. | `list(string)` | `[]` | no | -| [triggers](#input\_triggers) | Additional Terraform triggers. | `map(string)` | `{}` | no | -| [when\_destroy](#input\_when\_destroy) | Run only on `terraform destroy`? | `bool` | `false` | no | - -## Outputs - -No outputs. - diff --git a/terraform/slurm_cluster/modules/slurm_destroy_nodes/main.tf b/terraform/slurm_cluster/modules/slurm_destroy_nodes/main.tf deleted file mode 100644 index 546e95f7..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_nodes/main.tf +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - scripts_dir = abspath("${path.module}/../../../../scripts") - - destroy_nodes = abspath("${local.scripts_dir}/destroy_nodes.py") -} - -######## -# DATA # -######## - -data "local_file" "destroy_nodes" { - filename = local.destroy_nodes -} - -######################### -# DESTROY NODES: CREATE # -######################### - -resource "null_resource" "destroy_nodes_on_create" { - count = var.when_destroy ? 0 : 1 - - triggers = merge( - var.triggers, - { - scripts_dir = local.scripts_dir - script_path = data.local_file.destroy_nodes.filename - slurm_cluster_name = var.slurm_cluster_name - project_id = var.project_id - } - ) - - provisioner "local-exec" { - working_dir = self.triggers.scripts_dir - command = < 0 - error_message = "The slurm_cluster_name must not be empty." - } -} - -variable "project_id" { - description = "The project ID" - type = string - - validation { - condition = length(var.project_id) > 0 - error_message = "The project_id must not be empty." - } -} - -variable "triggers" { - description = "Additional Terraform triggers." - type = map(string) - default = {} -} - -variable "target_list" { - description = "Target destruction of these compute nodes, by instance name." - type = list(string) - default = [] -} - -variable "exclude_list" { - description = "Exclude destruction of these compute nodes, by instance name." - type = list(string) - default = [] -} - -variable "when_destroy" { - description = "Run only on `terraform destroy`?" - type = bool - default = false -} diff --git a/terraform/slurm_cluster/modules/slurm_destroy_nodes/versions.tf b/terraform/slurm_cluster/modules/slurm_destroy_nodes/versions.tf deleted file mode 100644 index 94a89d9a..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_nodes/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - local = { - source = "hashicorp/local" - version = "~> 2.0" - } - null = { - source = "hashicorp/null" - version = "~> 3.0" - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README.md b/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README.md deleted file mode 100644 index c5da045d..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Module: Slurm Destroy Resource Policies - -[FAQ](../../../../docs/faq.md) | -[Troubleshooting](../../../../docs/troubleshooting.md) | -[Glossary](../../../../docs/glossary.md) - - - -- [Module: Slurm Destroy Resource Policies](#module-slurm-destroy-resource-policies) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Module API](#module-api) - - - -## Overview - -This module creates a -[null_resource](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) -to manage the task of destroying resource policies. It can be configured with -triggers that will cause it re-run the task when infrastructure changes. - -## Usage - -See [examples](../../examples/slurm_destroy_resource_policies/) directory for -sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_destroy_resource_policies" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=v5.0.0" - - partition_name = "" - slurm_cluster_name = "" - project_id = "" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -- [GCP Cloud SDK](https://cloud.google.com/sdk/downloads) is installed. -- [Python](../../../../docs/glossary.md#python) is installed. - - Required Version: `>= 3.6.0, < 4.0.0` -- [Pip](../../../../docs/glossary.md#pip) packages are installed. - - `pip3 install -r ../../../scripts/requirements.txt --user` - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README_TF.md b/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README_TF.md deleted file mode 100644 index 5611f030..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/README_TF.md +++ /dev/null @@ -1,58 +0,0 @@ -# slurm_destroy_resource_policies - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [local](#requirement\_local) | ~> 2.0 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [local](#provider\_local) | ~> 2.0 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.destroy_resource_policies_on_create](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.destroy_resource_policies_on_destroy](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [local_file.destroy_resource_policies](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [partition\_name](#input\_partition\_name) | Partition name. | `string` | `""` | no | -| [project\_id](#input\_project\_id) | The project ID | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, for resource filtering. | `string` | n/a | yes | -| [triggers](#input\_triggers) | Additional Terraform triggers. | `map(string)` | `{}` | no | -| [when\_destroy](#input\_when\_destroy) | Run only on `terraform destroy`? | `bool` | `false` | no | - -## Outputs - -No outputs. - diff --git a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/main.tf b/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/main.tf deleted file mode 100644 index c041d115..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/main.tf +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - scripts_dir = abspath("${path.module}/../../../../scripts") - - destroy_resource_policies = abspath("${local.scripts_dir}/destroy_resource_policies.py") -} - -######## -# DATA # -######## - -data "local_file" "destroy_resource_policies" { - filename = local.destroy_resource_policies -} - -##################################### -# DESTROY RESOURCE POLICIES: CREATE # -##################################### - -resource "null_resource" "destroy_resource_policies_on_create" { - count = var.when_destroy ? 0 : 1 - - triggers = merge( - var.triggers, - { - scripts_dir = local.scripts_dir - script_path = data.local_file.destroy_resource_policies.filename - slurm_cluster_name = var.slurm_cluster_name - project_id = var.project_id - partition_name = var.partition_name - } - ) - - provisioner "local-exec" { - working_dir = self.triggers.scripts_dir - command = < 0 - error_message = "The project_id must not be empty." - } -} - -variable "partition_name" { - description = "Partition name." - type = string - default = "" - - validation { - condition = length(var.partition_name) > 0 ? can(regex("^[a-z](?:[a-z0-9]*)$", var.partition_name)) : true - error_message = "Variable 'partition_name' must be a match of regex '^[a-z](?:[a-z0-9]*)$'." - } -} - -variable "triggers" { - description = "Additional Terraform triggers." - type = map(string) - default = {} -} - -variable "when_destroy" { - description = "Run only on `terraform destroy`?" - type = bool - default = false -} diff --git a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/versions.tf b/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/versions.tf deleted file mode 100644 index 94a89d9a..00000000 --- a/terraform/slurm_cluster/modules/slurm_destroy_resource_policies/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - local = { - source = "hashicorp/local" - version = "~> 2.0" - } - null = { - source = "hashicorp/null" - version = "~> 3.0" - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_files/README.md b/terraform/slurm_cluster/modules/slurm_files/README.md deleted file mode 100644 index 12a88619..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/README.md +++ /dev/null @@ -1,200 +0,0 @@ -# Module: Slurm Cluster - -[FAQ](../../docs/faq.md) | [Troubleshooting](../../docs/troubleshooting.md) | -[Glossary](../../docs/glossary.md) - - - -- [Module: Slurm Cluster](#module-slurm-cluster) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Software](#software) - - [Required](#required) - - [Optional](#optional) - - [TerraformUser](#terraformuser) - - [Required](#required-1) - - [Optional](#optional-1) - - [Controller SA](#controller-sa) - - [Required](#required-2) - - [Optional](#optional-2) - - [Compute SA](#compute-sa) - - [Optional](#optional-3) - - [Login SA](#login-sa) - - [Optional](#optional-4) - - [Module API](#module-api) - - - -## Overview - -This module creates a [Slurm](../../docs/glossary.md#slurm) cluster on -[GCP](../../docs/glossary.md#gcp). There are two modes of operation: cloud; and -hybrid. Cloud mode will create a VM controller. Hybrid mode will generate -`cloud.conf` and `cloud_gres.conf` files to be included in the on-prem -configuration files, while managing a `config.yaml` file for internal module -use. - -Partitions define what compute resources are available to the controller so it -may allocate jobs. Slurm will resume/create compute instances as needed to run -allocated jobs and will suspend/terminate the instances after they are no longer -needed (e.g. IDLE for SuspendTimeout duration). Static nodes are persistent; -they are exempt from being suspended/terminated under normal conditions. Dynamic -nodes are burstable; they will scale up and down with workload. - -> **WARNING:** Destroying the controller before it has suspended/terminated all -> static and dynamic node instances and supporting resources (e.g. placement -> groups, subscription) will leave those resources orphaned unless cleanup -> options are enabled (.e.g `enable_cleanup_compute`, -> `enable_cleanup_subscriptions`). - -## Usage - -See [examples](./examples/slurm_cluster/) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_cluster" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - # ... omitted ... -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../docs/glossary.md#terraform-registry), the version -> must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -### Software - -Certain software must be installed on the local machine or APIs enabled in -[GCP](../../docs/glossary.md#gcp) for -[TerraformUser](../../docs/glossary.md#terraformuser) to be able to use this -module. - -#### Required - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [GCP Cloud SDK](https://cloud.google.com/sdk/downloads) is installed. -- [Compute Engine API](../../docs/glossary.md#compute-engine) is enabled. - -#### Optional - -- [Python](../../docs/glossary.md#python) is installed. - - Required Version: `>= 3.6.0, < 4.0.0` - - Required when any of: - - `enable_hybrid=true` - - `enable_cleanup_compute=true` - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` -- [Pip](../../../docs/glossary.md#pip) packages are installed. - - Required when any of: - - `enable_hybrid=true` - - `enable_cleanup_compute=true` - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` - - `pip3 install -r ../../scripts/requirements.txt --user` -- [Private Google Access](../../docs/glossary.md#private-google-access) is - enabled. - - Required when any instances only have internal IPs. -- [Secret Manager API](../../docs/glossary.md#secret-manager) is enabled. - - Required when `cloudsql != null`. -- [Pub/Sub API](../../docs/glossary.md#pubsub) is enabled. - - Required when any of: - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` -- [Bigquery API](../../docs/glossary.md#bigquery) is enabled. - - Required when `enable_bigquery_load=true`. - -### TerraformUser - -[TerraformUser](../../docs/glossary.md#terraformuser) authenticates with -credentials to [Google Cloud](../../docs/glossary.md#gcp). It is recommended to -create a principal [IAM](../../docs/glossary.md#iam) for this user and associate -[roles](../../docs/glossary.md#iam-roles) to them. Optionally, the TerraformUser -can operate through a [service account](../../docs/glossary.md#service-account). - -#### Required - -- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) - -#### Optional - -- Pub/Sub Admin (`roles/pubsub.admin`) - - Required when `enable_reconfigure=true`. -- Secret Manager Admin (`roles/secretmanager.admin`) - - Required when `cloudsql != null`. -- Service Account User (`roles/iam.serviceAccountUser`) - - Required when [TerraformUser](../../docs/glossary.md#terraformuser) is using - an [service account](../../docs/glossary.md#service-account) to - authenticate. - -### Controller SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the controller -[instance template](../../docs/glossary.md#instance-template) for -[slurm_controller_instance](../slurm_controller_instance/). - -#### Required - -- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) -- Compute Instance Admin (beta) (`roles/compute.instanceAdmin`) -- Service Account User (`roles/iam.serviceAccountUser`) - -#### Optional - -- BigQuery Data Editor (`roles/bigquery.dataEditor`) - - Required when `enable_bigquery_load=true`. -- Cloud SQL Editor (`roles/cloudsql.editor`) - - Required when all of: - - `cloudsql != null` - - Communicating to CloudSQL instance -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. -- Pub/Sub Admin (`roles/pubsub.admin`) - - Required when `enable_reconfigure=true`. - -### Compute SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the compute -[instance templates](../../docs/glossary.md#instance-template) created by -[slurm_partition](../slurm_partition/). - -#### Optional - -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. - -### Login SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the login -[instance templates](../../docs/glossary.md#instance-template) created by -[slurm_partition](../slurm_partition/). - -#### Optional - -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_files/README_TF.md b/terraform/slurm_cluster/modules/slurm_files/README_TF.md deleted file mode 100644 index e3bf9134..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/README_TF.md +++ /dev/null @@ -1,119 +0,0 @@ -# bucket_files - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.3 | -| [archive](#requirement\_archive) | ~> 2.0 | -| [google](#requirement\_google) | >= 3.53 | -| [local](#requirement\_local) | ~> 2.0 | -| [random](#requirement\_random) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [archive](#provider\_archive) | ~> 2.0 | -| [google](#provider\_google) | >= 3.53 | -| [local](#provider\_local) | ~> 2.0 | -| [random](#provider\_random) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [google_storage_bucket_object.compute_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.controller_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.devel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | -| [archive_file.slurm_gcp_devel_zip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | -| [google_storage_bucket.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/storage_bucket) | data source | -| [local_file.external_epilog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.external_prolog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.setup_external](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | -| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | -| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | -| [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | -| [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be ran on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [controller\_startup\_scripts](#input\_controller\_startup\_scripts) | List of scripts to be ran on controller VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
* /usr/local/etc/slurm
* /etc/munge
* /home
* /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | -| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | -| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/v5/tools/prologs-epilogs/README.md | `bool` | `false` | no | -| [enable\_hybrid](#input\_enable\_hybrid) | Enables use of hybrid controller mode. When true, controller\_hybrid\_config will
be used instead of controller\_instance\_config and will disable login instances. | `bool` | `false` | no | -| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | -| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": null
}
| no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | -| [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | -| [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | -| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | -| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | -| [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | -| [network\_storage](#input\_network\_storage) | Storage to mounted on all instances.
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Cluster nodenets, as a list. | `list(any)` | `[]` | no | -| [nodeset\_dyn](#input\_nodeset\_dyn) | Cluster nodenets (dynamic), as a list. | `list(any)` | `[]` | no | -| [nodeset\_startup\_scripts](#input\_nodeset\_startup\_scripts) | List of scripts to be ran on compute VM startup in the specific nodeset. |
map(list(object({
filename = string
content = string
})))
| `{}` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Cluster nodenets (TPU), as a list. | `list(any)` | `[]` | no | -| [output\_dir](#input\_output\_dir) | Directory where this module will write its files to. These files include:
cloud.conf; cloud\_gres.conf; config.yaml; resume.py; suspend.py; and util.py. | `string` | `null` | no | -| [partitions](#input\_partitions) | Cluster partitions as a list. | `list(any)` | `[]` | no | -| [project\_id](#input\_project\_id) | The GCP project ID. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | The cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | -| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | -| [slurm\_control\_addr](#input\_slurm\_control\_addr) | The IP address or a name by which the address can be identified.

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | -| [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | -| [slurm\_control\_host\_port](#input\_slurm\_control\_host\_port) | The port number that the Slurm controller, slurmctld, listens to for work.

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldPort | `string` | `"6818"` | no | -| [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | -| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | -| [config](#output\_config) | Cluster configuration. | -| [nodeset](#output\_nodeset) | Cluster nodesets. | -| [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | -| [nodeset\_tpu](#output\_nodeset\_tpu) | Cluster nodesets (TPU). | -| [partitions](#output\_partitions) | Cluster partitions. | -| [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | - diff --git a/terraform/slurm_cluster/modules/slurm_files/files/external_epilog.sh b/terraform/slurm_cluster/modules/slurm_files/files/external_epilog.sh deleted file mode 100755 index db514fc9..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/files/external_epilog.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [[ -x /opt/apps/adm/slurm/slurm_epilog ]]; then - exec /opt/apps/adm/slurm/slurm_epilog -fi diff --git a/terraform/slurm_cluster/modules/slurm_files/files/external_prolog.sh b/terraform/slurm_cluster/modules/slurm_files/files/external_prolog.sh deleted file mode 100755 index 37a91bb1..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/files/external_prolog.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [[ -x /opt/apps/adm/slurm/slurm_prolog ]]; then - exec /opt/apps/adm/slurm/slurm_prolog -fi diff --git a/terraform/slurm_cluster/modules/slurm_files/files/setup_external.sh b/terraform/slurm_cluster/modules/slurm_files/files/setup_external.sh deleted file mode 100755 index c21f7cbd..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/files/setup_external.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -o pipefail - -SLURM_EXTERNAL_ROOT="/opt/apps/adm/slurm" -SLURM_MUX_FILE="slurm_mux" - -mkdir -p "${SLURM_EXTERNAL_ROOT}" -mkdir -p "${SLURM_EXTERNAL_ROOT}/logs" -mkdir -p "${SLURM_EXTERNAL_ROOT}/etc" - -# create common prolog / epilog "multiplex" script -if [ ! -f "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" ]; then - # indentation matters in EOT below; do not blindly edit! - cat <<'EOT' >"${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -CMD="${0##*/}" -# Locate script -BASE=$(readlink -f $0) -BASE=${BASE%/*} - -export CLUSTER_ADM_BASE=${BASE} - -# Source config file if it exists for extra DEBUG settings -# used below -SLURM_MUX_CONF=${CLUSTER_ADM_BASE}/etc/slurm_mux.conf -if [[ -r ${SLURM_MUX_CONF} ]]; then - source ${SLURM_MUX_CONF} -fi - -# Setup logging if configured and directory exists -LOGFILE="/dev/null" -if [[ -d ${DEBUG_SLURM_MUX_LOG_DIR} && ${DEBUG_SLURM_MUX_ENABLE_LOG} == "yes" ]]; then - LOGFILE="${DEBUG_SLURM_MUX_LOG_DIR}/${CMD}-${SLURM_SCRIPT_CONTEXT}-job-${SLURMD_NODENAME}.log" - exec >>${LOGFILE} 2>&1 -fi - -# Global scriptlets -for SCRIPTLET in ${BASE}/${SLURM_SCRIPT_CONTEXT}.d/*.${SLURM_SCRIPT_CONTEXT}; do - if [[ -x ${SCRIPTLET} ]]; then - echo "Running ${SCRIPTLET}" - ${SCRIPTLET} $@ >>${LOGFILE} 2>&1 - fi -done - -# Per partition scriptlets -for SCRIPTLET in ${BASE}/partition-${SLURM_JOB_PARTITION}-${SLURM_SCRIPT_CONTEXT}.d/*.${SLURM_SCRIPT_CONTEXT}; do - if [[ -x ${SCRIPTLET} ]]; then - echo "Running ${SCRIPTLET}" - ${SCRIPTLET} $@ >>${LOGFILE} 2>&1 - fi -done -EOT -fi - -# ensure proper permissions on slurm_mux script -chmod 0755 "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" - -# create default slurm_mux configuration file -if [ ! -f "${SLURM_EXTERNAL_ROOT}/etc/slurm_mux.conf" ]; then - cat <<'EOT' >"${SLURM_EXTERNAL_ROOT}/etc/slurm_mux.conf" -# these settings are intended for temporary debugging purposes only; leaving -# them enabled will write files for each job to a shared NFS directory without -# any automated cleanup -DEBUG_SLURM_MUX_LOG_DIR=/opt/apps/adm/slurm/logs -DEBUG_SLURM_MUX_ENABLE_LOG=no -EOT -fi - -# create epilog symbolic link -if [ ! -L "${SLURM_EXTERNAL_ROOT}/slurm_epilog" ]; then - cd ${SLURM_EXTERNAL_ROOT} - # delete existing file if necessary - rm -f slurm_epilog - ln -s ${SLURM_MUX_FILE} slurm_epilog - cd - >/dev/null -fi - -# create prolog symbolic link -if [ ! -L "${SLURM_EXTERNAL_ROOT}/slurm_prolog" ]; then - cd ${SLURM_EXTERNAL_ROOT} - # delete existing file if necessary - rm -f slurm_prolog - ln -s ${SLURM_MUX_FILE} slurm_prolog - cd - >/dev/null -fi diff --git a/terraform/slurm_cluster/modules/slurm_files/main.tf b/terraform/slurm_cluster/modules/slurm_files/main.tf deleted file mode 100644 index 896c17dc..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/main.tf +++ /dev/null @@ -1,297 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - scripts_dir = abspath("${path.module}/../../../../scripts") - - bucket_dir = coalesce(var.bucket_dir, format("%s-files", var.slurm_cluster_name)) -} - -######## -# DATA # -######## - -data "google_storage_bucket" "this" { - name = var.bucket_name -} - -########## -# RANDOM # -########## - -resource "random_uuid" "cluster_id" { -} - -################## -# CLUSTER CONFIG # -################## - -locals { - config = { - enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins - enable_bigquery_load = var.enable_bigquery_load - cloudsql_secret = var.cloudsql_secret - cluster_id = random_uuid.cluster_id.result - project = var.project_id - slurm_cluster_name = var.slurm_cluster_name - bucket_path = local.bucket_path - enable_debug_logging = var.enable_debug_logging - extra_logging_flags = var.extra_logging_flags - - # storage - disable_default_mounts = var.disable_default_mounts - network_storage = var.network_storage - login_network_storage = var.enable_hybrid ? null : var.login_network_storage - - # timeouts - controller_startup_scripts_timeout = var.enable_hybrid ? null : var.controller_startup_scripts_timeout - compute_startup_scripts_timeout = var.compute_startup_scripts_timeout - login_startup_scripts_timeout = var.enable_hybrid ? null : var.login_startup_scripts_timeout - munge_mount = local.munge_mount - - # slurm conf - prolog_scripts = [for k, v in google_storage_bucket_object.prolog_scripts : k] - epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] - cloud_parameters = var.cloud_parameters - partitions = local.partitions - nodeset = local.nodeset - nodeset_dyn = local.nodeset_dyn - nodeset_tpu = local.nodeset_tpu - - # hybrid - hybrid = var.enable_hybrid - google_app_cred_path = var.enable_hybrid ? local.google_app_cred_path : null - output_dir = var.enable_hybrid ? local.output_dir : null - install_dir = var.enable_hybrid ? local.install_dir : null - slurm_control_host = var.enable_hybrid ? var.slurm_control_host : null - slurm_control_host_port = var.enable_hybrid ? local.slurm_control_host_port : null - slurm_control_addr = var.enable_hybrid ? var.slurm_control_addr : null - slurm_bin_dir = var.enable_hybrid ? local.slurm_bin_dir : null - slurm_log_dir = var.enable_hybrid ? local.slurm_log_dir : null - - # config files templates - slurmdbd_conf_tpl = file(coalesce(var.slurmdbd_conf_tpl, "${local.etc_dir}/slurmdbd.conf.tpl")) - slurm_conf_tpl = file(coalesce(var.slurm_conf_tpl, "${local.etc_dir}/slurm.conf.tpl")) - cgroup_conf_tpl = file(coalesce(var.cgroup_conf_tpl, "${local.etc_dir}/cgroup.conf.tpl")) - jobsubmit_lua_tpl = file(coalesce(var.job_submit_lua_tpl, "${local.etc_dir}/job_submit.lua.tpl")) - - # Providers - endpoint_versions = var.endpoint_versions - } - - config_yaml = "config.yaml" - config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) - - partitions = { for p in var.partitions[*].partition : p.partition_name => p } - - nodeset = { for n in var.nodeset[*].nodeset : n.nodeset_name => n } - nodeset_dyn = { for n in var.nodeset_dyn[*].nodeset : n.nodeset_name => n } - nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } - - x_nodeset = toset([for k, v in local.nodeset : v.nodeset_name]) - x_nodeset_dyn = toset([for k, v in local.nodeset_dyn : v.nodeset_name]) - x_nodeset_tpu = toset([for k, v in local.nodeset_tpu : v.nodeset_name]) - x_nodeset_overlap = setintersection([], local.x_nodeset, local.x_nodeset_dyn, local.x_nodeset_tpu) - - etc_dir = abspath("${path.module}/../../../../etc") - - bucket_path = format("%s/%s", data.google_storage_bucket.this.url, local.bucket_dir) - - slurm_control_host_port = coalesce(var.slurm_control_host_port, "6818") - - google_app_cred_path = var.google_app_cred_path != null ? abspath(var.google_app_cred_path) : null - slurm_bin_dir = var.slurm_bin_dir != null ? abspath(var.slurm_bin_dir) : null - slurm_log_dir = var.slurm_log_dir != null ? abspath(var.slurm_log_dir) : null - - munge_mount = var.enable_hybrid ? { - server_ip = lookup(var.munge_mount, "server_ip", coalesce(var.slurm_control_addr, var.slurm_control_host)) - remote_mount = lookup(var.munge_mount, "remote_mount", "/etc/munge/") - fs_type = lookup(var.munge_mount, "fs_type", "nfs") - mount_options = lookup(var.munge_mount, "mount_options", "") - } : null - - output_dir = can(coalesce(var.output_dir)) ? abspath(var.output_dir) : abspath(".") - install_dir = can(coalesce(var.install_dir)) ? abspath(var.install_dir) : local.output_dir -} - -resource "google_storage_bucket_object" "config" { - bucket = data.google_storage_bucket.this.name - name = local.config_yaml_bucket - content = yamlencode(local.config) -} - -######### -# DEVEL # -######### - -locals { - build_dir = abspath("${path.module}/../../../../build") - - slurm_gcp_devel_zip = "slurm-gcp-devel.zip" - slurm_gcp_devel_zip_bucket = format("%s/%s", local.bucket_dir, local.slurm_gcp_devel_zip) -} - -data "archive_file" "slurm_gcp_devel_zip" { - count = var.enable_devel ? 1 : 0 - - output_path = "${local.build_dir}/${local.slurm_gcp_devel_zip}" - type = "zip" - source_dir = local.scripts_dir - - excludes = flatten([ - "config.yaml", - "Pipfile", - fileset(local.scripts_dir, "__pycache__/*"), - fileset(local.scripts_dir, "*.log"), - fileset(local.scripts_dir, "*.cache"), - fileset(local.scripts_dir, "*.lock"), - ]) - -} - -resource "google_storage_bucket_object" "devel" { - count = var.enable_devel ? 1 : 0 - - bucket = var.bucket_name - name = local.slurm_gcp_devel_zip_bucket - source = data.archive_file.slurm_gcp_devel_zip[0].output_path -} - - -########### -# SCRIPTS # -########### - -resource "google_storage_bucket_object" "controller_startup_scripts" { - for_each = { - for x in local.controller_startup_scripts - : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x - } - - bucket = var.bucket_name - name = format("%s/slurm-controller-script-%s", local.bucket_dir, each.key) - content = each.value.content -} - -resource "google_storage_bucket_object" "compute_startup_scripts" { - for_each = { - for x in var.compute_startup_scripts - : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x - } - - bucket = var.bucket_name - name = format("%s/slurm-compute-script-%s", local.bucket_dir, each.key) - content = each.value.content -} - -resource "google_storage_bucket_object" "nodeset_startup_scripts" { - for_each = { for x in flatten([ - for nodeset, scripts in var.nodeset_startup_scripts - : [for s in scripts - : { - content = s.content, - name = format("slurm-nodeset-%s-script-%s", nodeset, replace(basename(s.filename), "/[^a-zA-Z0-9-_]/", "_")) } - ]]) : x.name => x.content } - - bucket = var.bucket_name - name = format("%s/%s", local.bucket_dir, each.key) - content = each.value -} - -resource "google_storage_bucket_object" "login_startup_scripts" { - for_each = { - for x in var.login_startup_scripts - : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x - } - - bucket = var.bucket_name - name = format("%s/slurm-login-script-%s", local.bucket_dir, each.key) - content = each.value.content -} - -resource "google_storage_bucket_object" "prolog_scripts" { - for_each = { - for x in local.prolog_scripts - : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x - } - - bucket = var.bucket_name - name = format("%s/slurm-prolog-script-%s", local.bucket_dir, each.key) - content = each.value.content - source = each.value.source -} - -resource "google_storage_bucket_object" "epilog_scripts" { - for_each = { - for x in local.epilog_scripts - : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x - } - - bucket = var.bucket_name - name = format("%s/slurm-epilog-script-%s", local.bucket_dir, each.key) - content = each.value.content - source = each.value.source -} - -################################ -# DATA: EXTERNAL PROLOG/EPILOG # -################################ - -data "local_file" "external_epilog" { - filename = "${path.module}/files/external_epilog.sh" -} - -data "local_file" "external_prolog" { - filename = "${path.module}/files/external_prolog.sh" -} - -data "local_file" "setup_external" { - filename = "${path.module}/files/setup_external.sh" -} - -locals { - checksum = md5(join("", flatten([ - google_storage_bucket_object.config.md5hash, - [for f in google_storage_bucket_object.devel : f.md5hash], - [for k, f in google_storage_bucket_object.controller_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.compute_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.nodeset_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.login_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.prolog_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.epilog_scripts : f.md5hash] - ]))) - - external_epilog = [{ - filename = "z_external_epilog.sh" - content = data.local_file.external_epilog.content - source = null - }] - external_prolog = [{ - filename = "z_external_prolog.sh" - content = data.local_file.external_prolog.content - source = null - }] - setup_external = [{ - filename = "z_setup_external.sh" - content = data.local_file.setup_external.content - }] - - prolog_scripts = var.enable_external_prolog_epilog ? concat(local.external_prolog, var.prolog_scripts) : var.prolog_scripts - epilog_scripts = var.enable_external_prolog_epilog ? concat(local.external_epilog, var.epilog_scripts) : var.epilog_scripts - controller_startup_scripts = var.enable_external_prolog_epilog ? concat(local.setup_external, var.controller_startup_scripts) : var.controller_startup_scripts - - -} diff --git a/terraform/slurm_cluster/modules/slurm_files/outputs.tf b/terraform/slurm_cluster/modules/slurm_files/outputs.tf deleted file mode 100644 index 3b680b50..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/outputs.tf +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "slurm_bucket_path" { - description = "GCS Bucket URI of Slurm cluster file storage." - value = local.bucket_path -} - -output "config" { - description = "Cluster configuration." - value = local.config - - precondition { - condition = var.enable_hybrid ? can(coalesce(var.slurm_control_host)) : true - error_message = "Input slurm_control_host is required." - } - - precondition { - condition = length(local.x_nodeset_overlap) == 0 - error_message = "All nodeset names must be unique among all nodeset types." - } -} - -output "partitions" { - description = "Cluster partitions." - value = lookup(local.config, "partitions", null) -} - -output "nodeset" { - description = "Cluster nodesets." - value = lookup(local.config, "nodeset", null) -} - -output "nodeset_dyn" { - description = "Cluster nodesets (dynamic)." - value = lookup(local.config, "nodeset_dyn", null) -} - -output "nodeset_tpu" { - description = "Cluster nodesets (TPU)." - value = lookup(local.config, "nodeset_tpu", null) -} - -output "checksum" { - description = "Checksum of all files written to the bucket." - value = local.checksum -} diff --git a/terraform/slurm_cluster/modules/slurm_files/variables.tf b/terraform/slurm_cluster/modules/slurm_files/variables.tf deleted file mode 100644 index 1cffbb30..00000000 --- a/terraform/slurm_cluster/modules/slurm_files/variables.tf +++ /dev/null @@ -1,467 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "bucket_name" { - description = <<-EOD - Name of GCS bucket to use. - EOD - type = string -} - -variable "bucket_dir" { - description = "Bucket directory for cluster files to be put into." - type = string - default = null -} - -variable "enable_devel" { - type = bool - description = "Enables development mode. Not for production use." - default = false -} - -variable "enable_debug_logging" { - type = bool - description = "Enables debug logging mode. Not for production use." - default = false -} - -variable "extra_logging_flags" { - type = map(bool) - description = "The list of extra flags for the logging system to use. See the logging_flags variable in scripts/util.py to get the list of supported log flags." - default = {} -} - -variable "project_id" { - description = "The GCP project ID." - type = string -} - -######### -# SLURM # -######### - -variable "slurm_cluster_name" { - type = string - description = "The cluster name, used for resource naming and slurm accounting." - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." - } -} - -variable "enable_slurm_gcp_plugins" { - description = < - -- [Module: Slurm Instance Template](#module-slurm-instance-template) - - [Overview](#overview) - - [Usage](#usage) - - [Service Account](#service-account) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). This -module creates an -[instance template](../../../../docs/glossary.md#instance-template) intended to -be used by [slurm_controller_instance](../slurm_controller_instance/README.md), -and [slurm_login_instance](../slurm_login_instance/README.md), and -[slurm_partition](../slurm_partition/README.md). - -> **NOTE:** [slurm_cluster_name](./README_TF.md#inputs) is appended to network -> [tags](./README_TF.md#inputs). - -## Usage - -See [examples](../../examples/slurm_instance_template/) directory for sample -usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_instance_template" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - network = "default" - service_account = { - email = "" - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -### Service Account - -It is recommended to generate a -[service account](../../../../docs/glossary.md#service-account) via -[slurm_sa_iam](../slurm_sa_iam/). - -Otherwise reference [slurm_sa_iam](../slurm_sa_iam/README.md#service-accounts) -to create a self managed compute -[service account](../../../../docs/glossary.md#service-account) and -[IAM](../../../../docs/glossary.md#iam). - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_instance_template/README_TF.md b/terraform/slurm_cluster/modules/slurm_instance_template/README_TF.md deleted file mode 100644 index b91fd281..00000000 --- a/terraform/slurm_cluster/modules/slurm_instance_template/README_TF.md +++ /dev/null @@ -1,97 +0,0 @@ -# slurm_instance_template - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | -| [local](#requirement\_local) | ~> 2.0 | - -## Providers - -| Name | Version | -|------|---------| -| [local](#provider\_local) | ~> 2.0 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [instance\_template](#module\_instance\_template) | ../_instance_template | n/a | - -## Resources - -| Name | Type | -|------|------| -| [local_file.startup](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
}))
| `[]` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `virtio_enabled` setting will only enable VirtioNet and will not enable TIER\_1.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | -| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | -| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | -| [disk\_labels](#input\_disk\_labels) | Labels to be assigned to boot disk, provided as a map. | `map(string)` | `{}` | no | -| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `100` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-standard"` | no | -| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | -| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | -| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | -| [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus more details.
* type : the GPU type
* count : number of GPUs |
object({
type = string
count = number
})
| `null` | no | -| [labels](#input\_labels) | Labels, provided as a map | `map(string)` | `{}` | no | -| [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"n1-standard-1"` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | -| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [name\_prefix](#input\_name\_prefix) | Prefix for template resource. | `string` | `"default"` | no | -| [network](#input\_network) | The name or self\_link of the network to attach this interface to. Use network
attribute for Legacy or Auto subnetted networks and subnetwork for custom
subnetted networks. | `string` | `null` | no | -| [network\_ip](#input\_network\_ip) | Private IP address to assign to the instance if desired. | `string` | `""` | no | -| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy | `string` | `"MIGRATE"` | no | -| [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [region](#input\_region) | Region where the instance template should be created. | `string` | `null` | no | -| [resource\_policies](#input\_resource\_policies) | A list of self\_links of resource policies to attach to the instance.
Currently a max of 1 resource policy is supported. | `list(string)` | `null` | no | -| [service\_account](#input\_service\_account) | Service account to attach to the instances. See
'main.tf:local.service\_account' for the default. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
* enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
* enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
* enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | -| [slurm\_bucket\_path](#input\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [slurm\_instance\_role](#input\_slurm\_instance\_role) | Slurm instance type. Must be one of: controller; login; compute; or null. | `string` | `null` | no | -| [source\_image](#input\_source\_image) | Source disk image. | `string` | `""` | no | -| [source\_image\_family](#input\_source\_image\_family) | Source image family. | `string` | `""` | no | -| [source\_image\_project](#input\_source\_image\_project) | Project where the source image comes from. If it is not provided, the provider project is used. | `string` | `""` | no | -| [spot](#input\_spot) | Provision as a SPOT preemptible instance.
See https://cloud.google.com/compute/docs/instances/spot for more details. | `bool` | `false` | no | -| [subnetwork](#input\_subnetwork) | The name of the subnetwork to attach this interface to. The subnetwork must
exist in the same region this instance will be created in. Either network or
subnetwork must be provided. | `string` | `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The ID of the project in which the subnetwork belongs. If it is not provided, the provider project is used. | `string` | `null` | no | -| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | -| [termination\_action](#input\_termination\_action) | Which action to take when Compute Engine preempts the VM. Value can be: 'STOP', 'DELETE'. The default value is 'STOP'.
See https://cloud.google.com/compute/docs/instances/spot for more details. | `string` | `"STOP"` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [instance\_template](#output\_instance\_template) | Instance template details | -| [name](#output\_name) | Name of instance template | -| [self\_link](#output\_self\_link) | Self\_link of instance template | -| [service\_account](#output\_service\_account) | Service account object, includes email and scopes. | -| [tags](#output\_tags) | Tags that will be associated with instance(s) | - diff --git a/terraform/slurm_cluster/modules/slurm_instance_template/main.tf b/terraform/slurm_cluster/modules/slurm_instance_template/main.tf deleted file mode 100644 index 501fe4c6..00000000 --- a/terraform/slurm_cluster/modules/slurm_instance_template/main.tf +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - scripts_dir = abspath("${path.module}/../../../../scripts") - - additional_disks = [ - for disk in var.additional_disks : { - disk_name = disk.disk_name - device_name = disk.device_name - auto_delete = disk.auto_delete - boot = disk.boot - disk_size_gb = disk.disk_size_gb - disk_type = disk.disk_type - disk_labels = merge( - disk.disk_labels, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - ) - } - ] - - service_account = { - email = try(var.service_account.email, null) - scopes = try(var.service_account.scopes, ["https://www.googleapis.com/auth/cloud-platform"]) - } - - source_image_family = ( - var.source_image_family != "" && var.source_image_family != null - ? var.source_image_family - : "slurm-gcp-6-8-hpc-rocky-linux-8" - ) - source_image_project = ( - var.source_image_project != "" && var.source_image_project != null - ? var.source_image_project - : "projects/schedmd-slurm-public/global/images/family" - ) - - source_image = ( - var.source_image != null - ? var.source_image - : "" - ) - - slurm_instance_role = var.slurm_instance_role != null ? lower(var.slurm_instance_role) : null - - name_prefix = ( - local.slurm_instance_role != null - ? "${var.slurm_cluster_name}-${local.slurm_instance_role}-${var.name_prefix}" - : "${var.slurm_cluster_name}-${var.name_prefix}" - ) - - total_egress_bandwidth_tier = var.bandwidth_tier == "tier_1_enabled" ? "TIER_1" : "DEFAULT" - - nic_type_map = { - platform_default = null - virtio_enabled = "VIRTIO_NET" - gvnic_enabled = "GVNIC" - tier_1_enabled = "GVNIC" - } - nic_type = lookup(local.nic_type_map, var.bandwidth_tier, null) -} - -######## -# DATA # -######## - -data "local_file" "startup" { - filename = abspath("${local.scripts_dir}/startup.sh") -} - -############ -# TEMPLATE # -############ - -module "instance_template" { - source = "../_instance_template" - - project_id = var.project_id - - # Network - can_ip_forward = var.can_ip_forward - network_ip = var.network_ip - network = var.network - nic_type = local.nic_type - region = var.region - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork - tags = var.tags - total_egress_bandwidth_tier = local.total_egress_bandwidth_tier - additional_networks = var.additional_networks - access_config = var.access_config - - # Instance - machine_type = var.machine_type - min_cpu_platform = var.min_cpu_platform - name_prefix = local.name_prefix - gpu = var.gpu - service_account = local.service_account - shielded_instance_config = var.shielded_instance_config - threads_per_core = var.disable_smt ? 1 : null - enable_confidential_vm = var.enable_confidential_vm - enable_shielded_vm = var.enable_shielded_vm - preemptible = var.preemptible - spot = var.spot - on_host_maintenance = var.on_host_maintenance - labels = merge( - var.labels, - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - ) - instance_termination_action = var.termination_action - - # Metadata - startup_script = data.local_file.startup.content - metadata = merge( - var.metadata, - { - enable-oslogin = upper(var.enable_oslogin) - slurm_bucket_path = var.slurm_bucket_path - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - ) - - # Image - source_image_project = local.source_image_project - source_image_family = local.source_image_family - source_image = local.source_image - - # Disk - disk_type = var.disk_type - disk_size_gb = var.disk_size_gb - auto_delete = var.disk_auto_delete - disk_labels = merge( - { - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = local.slurm_instance_role - }, - var.disk_labels, - ) - additional_disks = local.additional_disks - resource_policies = var.resource_policies -} diff --git a/terraform/slurm_cluster/modules/slurm_instance_template/outputs.tf b/terraform/slurm_cluster/modules/slurm_instance_template/outputs.tf deleted file mode 100644 index 4e984965..00000000 --- a/terraform/slurm_cluster/modules/slurm_instance_template/outputs.tf +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "instance_template" { - description = "Instance template details" - value = module.instance_template -} - -output "self_link" { - description = "Self_link of instance template" - value = module.instance_template.self_link -} - -output "name" { - description = "Name of instance template" - value = module.instance_template.name -} - -output "tags" { - description = "Tags that will be associated with instance(s)" - value = module.instance_template.tags -} - -output "service_account" { - description = "Service account object, includes email and scopes." - value = module.instance_template.service_account -} diff --git a/terraform/slurm_cluster/modules/slurm_instance_template/variables.tf b/terraform/slurm_cluster/modules/slurm_instance_template/variables.tf deleted file mode 100644 index 2a0cc810..00000000 --- a/terraform/slurm_cluster/modules/slurm_instance_template/variables.tf +++ /dev/null @@ -1,388 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# GENERAL # -########### - -variable "project_id" { - type = string - description = "Project ID to create resources in." -} - -variable "on_host_maintenance" { - type = string - description = "Instance availability Policy" - default = "MIGRATE" -} - -variable "labels" { - type = map(string) - description = "Labels, provided as a map" - default = {} -} - -variable "enable_oslogin" { - type = bool - description = < - -- [Module: Slurm Login Instance](#module-slurm-login-instance) - - [Overview](#overview) - - [Usage](#usage) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). This -module creates a Slurm login instance from -[instance template](../../../../docs/glossary.md#instance-template). Certain -properties from the -[instance template](../../../../docs/glossary.md#instance-template) will be -overridden when instanceated as a [VM](../../../../docs/glossary.md#vm). - -It is recommended to pass in an -[instance template](../../../../docs/glossary.md#instance-template) generated by -the [slurm_instance_template](../slurm_instance_template/README.md) module. - -## Usage - -See [examples](../../examples/slurm_login_instance/) directory for sample -usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_login_instance" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=v5.0.0" - - project_id = "" - - region = "us-central1" - subnetwork = "default" - - instance_template = "" - - slurm_cluster_name = "" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_login_instance/README_TF.md b/terraform/slurm_cluster/modules/slurm_login_instance/README_TF.md deleted file mode 100644 index 82e12469..00000000 --- a/terraform/slurm_cluster/modules/slurm_login_instance/README_TF.md +++ /dev/null @@ -1,63 +0,0 @@ -# slurm_login_instance - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | ../_slurm_instance | n/a | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | -| [enable\_public\_ip](#input\_enable\_public\_ip) | Enables IP address to access the Internet. | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Instance template self\_link used to create compute instances. | `string` | n/a | yes | -| [network](#input\_network) | Network to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [network\_tier](#input\_network\_tier) | The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED\_STANDARD or STANDARD.
Ignored if enable\_public\_ip is false. | `string` | `"STANDARD"` | no | -| [num\_instances](#input\_num\_instances) | Number of instances to create. This value is ignored if static\_ips is provided. | `number` | `1` | no | -| [project\_id](#input\_project\_id) | The GCP project ID | `string` | `null` | no | -| [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork](#input\_subnetwork) | Subnet to deploy to. Only one of network or subnetwork should be specified. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [suffix](#input\_suffix) | Login name suffix. | `string` | `"frontend"` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [slurm\_login\_instance](#output\_slurm\_login\_instance) | Login instance details | - diff --git a/terraform/slurm_cluster/modules/slurm_login_instance/main.tf b/terraform/slurm_cluster/modules/slurm_login_instance/main.tf deleted file mode 100644 index 903e642c..00000000 --- a/terraform/slurm_cluster/modules/slurm_login_instance/main.tf +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - region = ( - length(regexall("/regions/([^/]*)", var.subnetwork)) > 0 - ? flatten(regexall("/regions/([^/]*)", var.subnetwork))[0] - : var.region - ) - - access_config = { - nat_ip = null - network_tier = var.network_tier - } -} - -############ -# INSTANCE # -############ - -module "slurm_login_instance" { - source = "../_slurm_instance" - - access_config = var.enable_public_ip ? [local.access_config] : [] - additional_networks = var.additional_networks - add_hostname_suffix = true - hostname = "${var.slurm_cluster_name}-login-${var.suffix}" - instance_template = var.instance_template - network = var.network - num_instances = var.num_instances - project_id = var.project_id - region = local.region - slurm_cluster_name = var.slurm_cluster_name - slurm_instance_role = "login" - static_ips = var.static_ips - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork - zone = var.zone -} diff --git a/terraform/slurm_cluster/modules/slurm_login_instance/outputs.tf b/terraform/slurm_cluster/modules/slurm_login_instance/outputs.tf deleted file mode 100644 index 57d7b5f8..00000000 --- a/terraform/slurm_cluster/modules/slurm_login_instance/outputs.tf +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "slurm_login_instance" { - description = "Login instance details" - value = module.slurm_login_instance -} diff --git a/terraform/slurm_cluster/modules/slurm_login_instance/variables.tf b/terraform/slurm_cluster/modules/slurm_login_instance/variables.tf deleted file mode 100644 index a3704ca7..00000000 --- a/terraform/slurm_cluster/modules/slurm_login_instance/variables.tf +++ /dev/null @@ -1,145 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "The GCP project ID" - default = null -} - -########### -# NETWORK # -########### - -variable "additional_networks" { - description = "Additional network interface details for GCE, if any." - default = [] - type = list(object({ - access_config = optional(list(object({ - nat_ip = string - network_tier = string - })), []) - alias_ip_range = optional(list(object({ - ip_cidr_range = string - subnetwork_range_name = string - })), []) - ipv6_access_config = optional(list(object({ - network_tier = string - })), []) - network = optional(string) - network_ip = optional(string, "") - nic_type = optional(string) - queue_count = optional(number) - stack_type = optional(string) - subnetwork = optional(string) - subnetwork_project = optional(string) - })) - nullable = false -} - -variable "network" { - type = string - description = "Network to deploy to. Only one of network or subnetwork should be specified." - default = "" -} - -variable "subnetwork" { - type = string - description = "Subnet to deploy to. Only one of network or subnetwork should be specified." - default = "" -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." - default = null -} - -variable "region" { - type = string - description = "Region where the instances should be created." - default = null -} - -############ -# INSTANCE # -############ - -variable "instance_template" { - type = string - description = "Instance template self_link used to create compute instances." -} - -variable "static_ips" { - type = list(string) - description = "List of static IPs for VM instances." - default = [] -} - -variable "enable_public_ip" { - description = "Enables IP address to access the Internet." - type = bool - default = false -} - -variable "network_tier" { - type = string - description = <<-EOD - The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED_STANDARD or STANDARD. - Ignored if enable_public_ip is false. - EOD - default = "STANDARD" - - validation { - condition = var.network_tier == null ? true : contains(["PREMIUM", "FIXED_STANDARD", "STANDARD"], var.network_tier) - error_message = "Allow values are: 'PREMIUM', 'FIXED_STANDARD', 'STANDARD'." - } -} - -variable "num_instances" { - type = number - description = "Number of instances to create. This value is ignored if static_ips is provided." - default = 1 -} - -variable "zone" { - type = string - description = < - -- [Module: Slurm Nodeset](#module-slurm-nodeset) - - [Overview](#overview) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). It -creates a Slurm nodeset for [slurm_partition](../slurm_partition/README.md). - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_nodeset/README_TF.md b/terraform/slurm_cluster/modules/slurm_nodeset/README_TF.md deleted file mode 100644 index 4e1c9dc9..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset/README_TF.md +++ /dev/null @@ -1,70 +0,0 @@ -# slurm_nodeset - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.2 | -| [google](#requirement\_google) | >= 3.53 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.53 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.nodeset](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `virtio_enabled` setting will only enable VirtioNet and will not enable TIER\_1.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | -| [enable\_placement](#input\_enable\_placement) | Enables compact placement policy for instances.
Use compact policies when you want VMs to be located close to each other for low network latency between the VMs.
See https://cloud.google.com/compute/docs/instances/define-instance-placement for details. | `bool` | `false` | no | -| [enable\_public\_ip](#input\_enable\_public\_ip) | Enables IP address to access the Internet. | `bool` | `false` | no | -| [instance\_template\_self\_link](#input\_instance\_template\_self\_link) | Instance template self\_link used to create compute instances. | `string` | n/a | yes | -| [maintenance\_interval](#input\_maintenance\_interval) | Sets the maintenance interval for instances in this nodeset.
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#maintenance_interval. | `string` | `""` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [network\_tier](#input\_network\_tier) | The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED\_STANDARD or STANDARD.
Ignored if enable\_public\_ip is false. | `string` | `"STANDARD"` | no | -| [node\_conf](#input\_node\_conf) | Slurm node configuration, as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_NODE-CONFIGURATION for details. | `map(string)` | `{}` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of nodes allowed in this partition to be created dynamically. | `number` | `0` | no | -| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | -| [nodeset\_name](#input\_nodeset\_name) | Name of Slurm nodeset. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Sets reservation affinity for instances created from this nodeset. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The subnetwork self\_link to attach instances to. | `string` | n/a | yes | -| [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | -| [zones](#input\_zones) | Nodes will only be created in the listed zones.
If none are given, all available zones for the region will be allowed.
NOTE: Machine Type and GPU availability may vary with zone. | `set(string)` | `[]` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [nodeset](#output\_nodeset) | Nodeset details. | -| [nodeset\_name](#output\_nodeset\_name) | Nodeset name. | - diff --git a/terraform/slurm_cluster/modules/slurm_nodeset/main.tf b/terraform/slurm_cluster/modules/slurm_nodeset/main.tf deleted file mode 100644 index 6975efdf..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset/main.tf +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - zones = setintersection(toset(data.google_compute_zones.available.names), var.zones) -} - -######## -# DATA # -######## - -data "google_compute_zones" "available" { - project = length(regexall("projects/([^/]*)", var.subnetwork_self_link)) > 0 ? flatten(regexall("projects/([^/]*)", var.subnetwork_self_link))[0] : null - region = length(regexall("/regions/([^/]*)", var.subnetwork_self_link)) > 0 ? flatten(regexall("/regions/([^/]*)", var.subnetwork_self_link))[0] : null -} - -########### -# NODESET # -########### - -locals { - nodeset = { - nodeset_name = var.nodeset_name - node_conf = var.node_conf - instance_template = var.instance_template_self_link - node_count_dynamic_max = var.node_count_dynamic_max - node_count_static = var.node_count_static - subnetwork = var.subnetwork_self_link - zone_target_shape = var.zone_target_shape - zone_policy_allow = length(local.zones) > 0 ? setintersection(toset(data.google_compute_zones.available.names), local.zones) : toset(data.google_compute_zones.available.names) - zone_policy_deny = length(local.zones) > 0 ? setsubtract(toset(data.google_compute_zones.available.names), local.zones) : toset([]) - # Additional Features - reservation_name = var.reservation_name - maintenance_interval = var.maintenance_interval - enable_placement = var.enable_placement - enable_public_ip = var.enable_public_ip - network_tier = var.network_tier - network_storage = var.network_storage - } -} - -resource "null_resource" "nodeset" { - triggers = { - nodeset = sha256(jsonencode(local.nodeset)) - } - lifecycle { - precondition { - condition = sum([var.node_count_dynamic_max, var.node_count_static]) > 0 - error_message = "Sum of node_count_dynamic_max and node_count_static must be > 0." - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset/outputs.tf b/terraform/slurm_cluster/modules/slurm_nodeset/outputs.tf deleted file mode 100644 index d329c9b4..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset/outputs.tf +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "nodeset_name" { - description = "Nodeset name." - value = local.nodeset.nodeset_name -} - -output "nodeset" { - description = "Nodeset details." - value = local.nodeset -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset/variables.tf b/terraform/slurm_cluster/modules/slurm_nodeset/variables.tf deleted file mode 100644 index c21cd348..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset/variables.tf +++ /dev/null @@ -1,193 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "nodeset_name" { - description = "Name of Slurm nodeset." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,14})$", var.nodeset_name)) - error_message = "Variable 'nodeset_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,14})$'." - } -} - -variable "node_conf" { - description = < 0 - error_message = "Must be a self link." - } -} - -variable "subnetwork_self_link" { - description = "The subnetwork self_link to attach instances to." - type = string - - validation { - condition = length(regexall("projects/([^/]*)", var.subnetwork_self_link)) > 0 && length(regexall("/regions/([^/]*)", var.subnetwork_self_link)) > 0 - error_message = "Must be a self link." - } -} - -variable "zones" { - description = <<-EOD - Nodes will only be created in the listed zones. - If none are given, all available zones for the region will be allowed. - NOTE: Machine Type and GPU availability may vary with zone. - EOD - type = set(string) - default = [] -} - -variable "zone_target_shape" { - description = <= 0 - error_message = "Value must be >= 0." - } -} - -variable "node_count_dynamic_max" { - description = "Maximum number of nodes allowed in this partition to be created dynamically." - type = number - default = 0 - - validation { - condition = var.node_count_dynamic_max >= 0 - error_message = "Value must be >= 0." - } -} - -variable "reservation_name" { - description = <<-EOD - Sets reservation affinity for instances created from this nodeset. - EOD - type = string - default = null -} - -variable "maintenance_interval" { - description = <<-EOD - Sets the maintenance interval for instances in this nodeset. - See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#maintenance_interval. - EOD - type = string - default = "" - nullable = false - - validation { - condition = contains(["", "PERIODIC"], var.maintenance_interval) - error_message = "var.maintenance_interval must be the empty string or \"PERIODIC\"" - } -} - -variable "enable_placement" { - description = <<-EOD - Enables compact placement policy for instances. - Use compact policies when you want VMs to be located close to each other for low network latency between the VMs. - See https://cloud.google.com/compute/docs/instances/define-instance-placement for details. - EOD - type = bool - default = false -} - -variable "enable_public_ip" { - description = "Enables IP address to access the Internet." - type = bool - default = false -} - -variable "network_tier" { - type = string - description = <<-EOD - The networking tier used for configuring this instance. This field can take the following values: PREMIUM, FIXED_STANDARD or STANDARD. - Ignored if enable_public_ip is false. - EOD - default = "STANDARD" - - validation { - condition = var.network_tier == null ? true : contains(["PREMIUM", "FIXED_STANDARD", "STANDARD"], var.network_tier) - error_message = "Allow values are: 'PREMIUM', 'FIXED_STANDARD', 'STANDARD'." - } -} - -# TODO: either remove or make use of this variable -# tflint-ignore: terraform_unused_declarations -variable "bandwidth_tier" { - description = <<-EOD - Tier 1 bandwidth increases the maximum egress bandwidth for VMs. - Using the `virtio_enabled` setting will only enable VirtioNet and will not enable TIER_1. - Using the `tier_1_enabled` setting will enable both gVNIC and TIER_1 higher bandwidth networking. - Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER_1. - Note that TIER_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. - EOD - type = string - default = "platform_default" - - validation { - condition = contains(["platform_default", "virtio_enabled", "gvnic_enabled", "tier_1_enabled"], var.bandwidth_tier) - error_message = "Allowed values for bandwidth_tier are 'platform_default', 'virtio_enabled', 'gvnic_enabled', or 'tier_1_enabled'." - } -} - -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on nodes." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - })) - default = [] -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset/versions.tf b/terraform/slurm_cluster/modules/slurm_nodeset/versions.tf deleted file mode 100644 index 7e73f1d5..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.2" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 3.53" - } - null = { - source = "hashicorp/null" - version = "~> 3.0" - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README.md b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README.md deleted file mode 100644 index 98b11b8b..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Module: Slurm Nodeset (Dynamic) - -[FAQ](../../../../docs/faq.md) | -[Troubleshooting](../../../../docs/troubleshooting.md) | -[Glossary](../../../../docs/glossary.md) - - - -- [Module: Slurm Nodeset (Dynamic)](#module-slurm-nodeset-dynamic) - - [Overview](#overview) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). It -creates a Slurm dynamic nodeset for -[slurm_partition](../slurm_partition/README.md). - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README_TF.md b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README_TF.md deleted file mode 100644 index 36e27026..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/README_TF.md +++ /dev/null @@ -1,54 +0,0 @@ -# slurm_nodeset_dyn - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.2 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.nodeset](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [nodeset\_feature](#input\_nodeset\_feature) | Nodeset feature for dynamic registration. | `string` | n/a | yes | -| [nodeset\_name](#input\_nodeset\_name) | Name of Slurm nodeset. | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [nodeset](#output\_nodeset) | Nodeset details. | -| [nodeset\_name](#output\_nodeset\_name) | Nodeset name. | - diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/main.tf b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/main.tf deleted file mode 100644 index 3bd8d93f..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/main.tf +++ /dev/null @@ -1,32 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# NODESET # -########### - -locals { - nodeset = { - nodeset_name = var.nodeset_name - nodeset_feature = var.nodeset_feature - } -} - -resource "null_resource" "nodeset" { - triggers = { - nodeset = sha256(jsonencode(local.nodeset)) - } -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/outputs.tf b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/outputs.tf deleted file mode 100644 index d329c9b4..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/outputs.tf +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "nodeset_name" { - description = "Nodeset name." - value = local.nodeset.nodeset_name -} - -output "nodeset" { - description = "Nodeset details." - value = local.nodeset -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/variables.tf b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/variables.tf deleted file mode 100644 index 46744210..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/variables.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "nodeset_name" { - description = "Name of Slurm nodeset." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,14})$", var.nodeset_name)) - error_message = "Variable 'nodeset_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,14})$'." - } -} - -variable "nodeset_feature" { - description = "Nodeset feature for dynamic registration." - type = string -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/versions.tf b/terraform/slurm_cluster/modules/slurm_nodeset_dyn/versions.tf deleted file mode 100644 index 5f6aec48..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_dyn/versions.tf +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.2" - - required_providers { - null = { - source = "hashicorp/null" - version = "~> 3.0" - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README.md b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README.md deleted file mode 100644 index 9f2b9b6c..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Module: Slurm Nodeset (TPU) - -[FAQ](../../../../docs/faq.md) | -[Troubleshooting](../../../../docs/troubleshooting.md) | -[Glossary](../../../../docs/glossary.md) - - - -- [Module: Slurm Nodeset (TPU)](#module-slurm-nodeset-tpu) - - [Overview](#overview) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). It -creates a Slurm TPU nodeset for [slurm_partition](../slurm_partition/README.md). - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md deleted file mode 100644 index 6c84e80d..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/README_TF.md +++ /dev/null @@ -1,73 +0,0 @@ -# slurm_nodeset_tpu - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.2 | -| [google](#requirement\_google) | >= 3.53 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.53 | -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.nodeset_tpu](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [google_compute_subnetwork.nodeset_subnetwork](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | -| [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf- | `string` | `""` | no | -| [enable\_public\_ip](#input\_enable\_public\_ip) | Enables IP address to access the Internet. | `bool` | `false` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of nodes allowed in this partition to be created dynamically. | `number` | `0` | no | -| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | -| [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. Not needed if you use accelerator\_config | `string` | `null` | no | -| [nodeset\_name](#input\_nodeset\_name) | Name of Slurm nodeset. | `string` | n/a | yes | -| [preemptible](#input\_preemptible) | Specify whether TPU-vms in this nodeset are preemtible, see https://cloud.google.com/tpu/docs/preemptible for details. | `bool` | `false` | no | -| [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `true` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [reserved](#input\_reserved) | Specify whether TPU-vms in this nodeset are created under a reservation. | `bool` | `false` | no | -| [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm.
If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [subnetwork](#input\_subnetwork) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | n/a | yes | -| [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | n/a | yes | -| [zone](#input\_zone) | Nodes will only be created in this zone. Check https://cloud.google.com/tpu/docs/regions-zones to get zones with TPU-vm in it. | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [nodeset](#output\_nodeset) | Nodeset details. | -| [nodeset\_name](#output\_nodeset\_name) | Nodeset name. | -| [service\_account](#output\_service\_account) | Service account object, includes email and scopes. | - diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf deleted file mode 100644 index 1bb0add7..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/main.tf +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# NODESET # -########### - -locals { - node_conf_hw = { - Mem334CPU96 = { - CPUs = 96 - Boards = 1 - Sockets = 2 - CoresPerSocket = 24 - ThreadsPerCore = 2 - RealMemory = 307200 - } - Mem400CPU240 = { - CPUs = 240 - Boards = 1 - Sockets = 2 - CoresPerSocket = 60 - ThreadsPerCore = 2 - RealMemory = 400000 - } - } - node_conf_mappings = { - "v2" = local.node_conf_hw.Mem334CPU96 - "v3" = local.node_conf_hw.Mem334CPU96 - "v4" = local.node_conf_hw.Mem400CPU240 - } - simple_nodes = ["v2-8", "v3-8", "v4-8"] -} - -locals { - snetwork = data.google_compute_subnetwork.nodeset_subnetwork.name - region = join("-", slice(split("-", var.zone), 0, 2)) - tpu_fam = var.accelerator_config.version != "" ? lower(var.accelerator_config.version) : split("-", var.node_type)[0] - #If subnetwork is specified and it does not have private_ip_google_access, we need to have public IPs on the TPU - #if no subnetwork is specified, the default one will be used, this does not have private_ip_google_access so we need public IPs too - pub_need = !data.google_compute_subnetwork.nodeset_subnetwork.private_ip_google_access - can_preempt = var.node_type != null ? contains(local.simple_nodes, var.node_type) : false - nodeset_tpu = { - nodeset_name = var.nodeset_name - node_conf = local.node_conf_mappings[local.tpu_fam] - node_type = var.node_type - accelerator_config = var.accelerator_config - tf_version = var.tf_version - preemptible = local.can_preempt ? var.preemptible : false - reserved = var.reserved - node_count_dynamic_max = var.node_count_dynamic_max - node_count_static = var.node_count_static - enable_public_ip = var.enable_public_ip - zone = var.zone - service_account = var.service_account != null ? var.service_account : local.service_account - preserve_tpu = local.can_preempt ? var.preserve_tpu : false - data_disks = var.data_disks - docker_image = var.docker_image != "" ? var.docker_image : "us-docker.pkg.dev/schedmd-slurm-public/tpu/slurm-gcp-6-8:tf-${var.tf_version}" - subnetwork = local.snetwork - network_storage = var.network_storage - } - - service_account = { - email = try(var.service_account.email, null) - scopes = try(var.service_account.scopes, ["https://www.googleapis.com/auth/cloud-platform"]) - } -} - -data "google_compute_subnetwork" "nodeset_subnetwork" { - name = var.subnetwork - region = local.region - project = var.project_id - - self_link = ( - length(regexall("/projects/([^/]*)", var.subnetwork)) > 0 - && length(regexall("/regions/([^/]*)", var.subnetwork)) > 0 - ? var.subnetwork - : null - ) -} - -resource "null_resource" "nodeset_tpu" { - triggers = { - nodeset = sha256(jsonencode(local.nodeset_tpu)) - } - lifecycle { - precondition { - condition = sum([var.node_count_dynamic_max, var.node_count_static]) > 0 - error_message = "Sum of node_count_dynamic_max and node_count_static must be > 0." - } - precondition { - condition = !(var.preemptible && var.reserved) - error_message = "Nodeset cannot be preemptible and reserved at the same time." - } - precondition { - condition = !(var.subnetwork == null && !var.enable_public_ip) - error_message = "Using the default subnetwork for the TPU nodeset requires enable_public_ip set to true." - } - precondition { - condition = !(var.subnetwork != null && (local.pub_need && !var.enable_public_ip)) - error_message = "The subnetwork specified does not have Private Google Access enabled. This is required when enable_public_ip is set to false." - } - precondition { - condition = !(var.node_type == null && (var.accelerator_config.topology == "" && var.accelerator_config.version == "")) - error_message = "Either a node type or an accelerator_config must be provided." - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/outputs.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/outputs.tf deleted file mode 100644 index fce700d5..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/outputs.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "nodeset_name" { - description = "Nodeset name." - value = local.nodeset_tpu.nodeset_name -} - -output "nodeset" { - description = "Nodeset details." - value = local.nodeset_tpu -} - -output "service_account" { - description = "Service account object, includes email and scopes." - value = local.service_account -} diff --git a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf b/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf deleted file mode 100644 index a174f4af..00000000 --- a/terraform/slurm_cluster/modules/slurm_nodeset_tpu/variables.tf +++ /dev/null @@ -1,158 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "nodeset_name" { - description = "Name of Slurm nodeset." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,14})$", var.nodeset_name)) - error_message = "Variable 'nodeset_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,14})$'." - } -} - -variable "node_type" { - description = "Specify a node type to base the vm configuration upon it. Not needed if you use accelerator_config" - type = string - default = null -} - -variable "accelerator_config" { - description = "Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details." - type = object({ - topology = string - version = string - }) - default = { - topology = "" - version = "" - } - validation { - condition = var.accelerator_config.version == "" ? true : contains(["V2", "V3", "V4"], upper(var.accelerator_config.version)) - error_message = "accelerator_config.version must be one of [\"V2\", \"V3\", \"V4\"]" - } - validation { - condition = var.accelerator_config.topology == "" ? true : can(regex("^[1-9]x[1-9](x[1-9])?$", var.accelerator_config.topology)) - error_message = "accelerator_config.topology must be a valid topology, like 2x2 4x4x4 4x2x4 etc..." - } -} - -variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf-" - type = string - default = "" -} - -variable "tf_version" { - description = "Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details." - type = string -} - -variable "zone" { - description = "Nodes will only be created in this zone. Check https://cloud.google.com/tpu/docs/regions-zones to get zones with TPU-vm in it." - type = string - - validation { - condition = can(coalesce(var.zone)) - error_message = "Zone cannot be null or empty." - } -} - -variable "preemptible" { - description = "Specify whether TPU-vms in this nodeset are preemtible, see https://cloud.google.com/tpu/docs/preemptible for details." - type = bool - default = false -} - -variable "reserved" { - description = "Specify whether TPU-vms in this nodeset are created under a reservation." - type = bool - default = false -} - -variable "preserve_tpu" { - description = "Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted" - type = bool - default = true -} - -variable "node_count_static" { - description = "Number of nodes to be statically created." - type = number - default = 0 - - validation { - condition = var.node_count_static >= 0 - error_message = "Value must be >= 0." - } -} - -variable "node_count_dynamic_max" { - description = "Maximum number of nodes allowed in this partition to be created dynamically." - type = number - default = 0 - - validation { - condition = var.node_count_dynamic_max >= 0 - error_message = "Value must be >= 0." - } -} - -variable "enable_public_ip" { - description = "Enables IP address to access the Internet." - type = bool - default = false -} - -variable "data_disks" { - type = list(string) - description = "The data disks to include in the TPU node" - default = [] -} - -variable "subnetwork" { - description = "The name of the subnetwork to attach the TPU-vm of this nodeset to." - type = string -} - -variable "service_account" { - type = object({ - email = string - scopes = set(string) - }) - description = < - -- [Module: Slurm Partition](#module-slurm-partition) - - [Overview](#overview) - - [Usage](#usage) - - [Service Account](#service-account) - - [Dependencies](#dependencies) - - [Module API](#module-api) - - - -## Overview - -This is a submodule of [slurm_cluster](../../../slurm_cluster/README.md). It -creates a Slurm partition for -[slurm_controller_instance](../slurm_controller_instance/) or -[slurm_controller_hybrid](../slurm_controller_hybrid/). - -Conceptutally, a Slurm partition is a queue that is associated with compute -resources, limits, and access controls. Users submit jobs to one or more -partitions to have their jobs be completed against requested resources within -their allotted limits and access. - -This module defines a partition and its resources -- most notably, compute -nodes. Sets of compute nodes reside within a partition. Each set of compute -nodes must resolve to an -[instance template](../../../../docs/glossary.md#instance-template). Either the -[instance template](../../../../docs/glossary.md#instance-template) is: created -by definition -- module creates an -[instance template](../../../../docs/glossary.md#instance-template) using subset -of input parameters; or by the -[self link](../../../../docs/glossary.md#self-link) of an -[instance template](../../../../docs/glossary.md#instance-template) that is -managed outside of this module. Additionally, there are compute node parameters -that will override certain properties of the -[instance template](../../../../docs/glossary.md#instance-template) when -instanceated as a [VM](../../../../docs/glossary.md#vm). - -Compute instances created by -[slurm_controller_instance](../slurm_controller_instance/README.md), using this -partition, run [slurmd](../../../../docs/glossary.md#slurmd) and -[slurmstepd](../../../../docs/glossary.md#slurmstepd). - -## Usage - -See [examples](../../examples/slurm_partition/) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_partition" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - partition_name = "debug" - partition_nodes = { - count_static = 0 - count_dynamic = 10 - group_name = "test" - node_conf = {} - - # Template by Definition - additional_disks = [] - can_ip_forward = false - disable_smt = false - disk_auto_delete = true - disk_labels = {} - disk_size_gb = null - disk_type = null - enable_confidential_vm = false - enable_oslogin = true - enable_shielded_vm = false - gpu = null - labels = {} - machine_type = "n1-standard-1" - metadata = {} - min_cpu_platform = null - on_host_maintenance = null - preemptible = false - service_account = { - email = "" - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } - shielded_instance_config = null - source_image_family = null - source_image_project = null - source_image = null - tags = [] - - # Template by Source - instance_template = null - } - region = "us-central1" - subnetwork = "default" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../../../docs/glossary.md#terraform-registry), the -> version must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -### Service Account - -It is recommended to generate a `compute` type -[service account](../../../../docs/glossary.md#service-account) via -[slurm_sa_iam](../../../slurm_sa_iam/README.md). - -Otherwise reference -[compute service account and IAM](../../../slurm_sa_iam/README.md#compute) to -create a self managed compute -[service account](../../../../docs/glossary.md#service-account) and -[IAM](../../../../docs/glossary.md#iam). - -## Dependencies - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [Compute Engine API](../../../../docs/glossary.md#compute-engine) is enabled. -- [Python](../../../../docs/glossary.md#python) is installed. - - Required Version: `>= 3.6.0, < 4.0.0` -- [Pip](../../../../docs/glossary.md#pip) packages are installed. - - `pip3 install -r ../../../scripts/requirements.txt --user` - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_cluster/modules/slurm_partition/README_TF.md b/terraform/slurm_cluster/modules/slurm_partition/README_TF.md deleted file mode 100644 index e3f1d9e9..00000000 --- a/terraform/slurm_cluster/modules/slurm_partition/README_TF.md +++ /dev/null @@ -1,64 +0,0 @@ -# slurm_partition - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.2 | -| [google](#requirement\_google) | >= 3.53 | -| [null](#requirement\_null) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [null](#provider\_null) | ~> 3.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [null_resource.partition](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [default](#input\_default) | If this is true, jobs submitted without a partition specification will utilize this partition.
This sets 'Default' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Default for details. | `bool` | `false` | no | -| [enable\_job\_exclusive](#input\_enable\_job\_exclusive) | Enables job exclusivity. A job will run exclusively on the scheduled nodes. | `bool` | `false` | no | -| [network\_storage](#input\_network\_storage) | Storage to mounted on all instances in this partition.
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Raw options to pass to 'mount'. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | -| [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | -| [partition\_name](#input\_partition\_name) | Name of Slurm partition. | `string` | n/a | yes | -| [partition\_nodeset](#input\_partition\_nodeset) | Slurm nodesets by name, as a list of string. | `set(string)` | `[]` | no | -| [partition\_nodeset\_dyn](#input\_partition\_nodeset\_dyn) | Slurm nodesets (dynamic) by name, as a list of string. | `set(string)` | `[]` | no | -| [partition\_nodeset\_tpu](#input\_partition\_nodeset\_tpu) | Slurm nodesets (tpu) by name, as a list of string. | `set(string)` | `[]` | no | -| [resume\_timeout](#input\_resume\_timeout) | Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'ResumeTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout_1 for details. | `number` | `300` | no | -| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend. | `number` | `300` | no | -| [suspend\_timeout](#input\_suspend\_timeout) | Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'SuspendTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details. | `number` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [partition](#output\_partition) | Partition for slurm controller. | -| [partition\_name](#output\_partition\_name) | Partition for slurm controller. | - diff --git a/terraform/slurm_cluster/modules/slurm_partition/main.tf b/terraform/slurm_cluster/modules/slurm_partition/main.tf deleted file mode 100644 index 83556377..00000000 --- a/terraform/slurm_cluster/modules/slurm_partition/main.tf +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - has_node = length(var.partition_nodeset) > 0 - has_dyn = length(var.partition_nodeset_dyn) > 0 - has_tpu = length(var.partition_nodeset_tpu) > 0 -} - -locals { - partition_conf = merge({ - "Default" = var.default ? "YES" : null - "ResumeTimeout" = var.resume_timeout != null ? var.resume_timeout : (local.has_tpu ? 600 : 300) - "SuspendTime" = var.suspend_time < 0 ? "INFINITE" : var.suspend_time - "SuspendTimeout" = var.suspend_timeout != null ? var.suspend_timeout : (local.has_tpu ? 240 : 120) - }, var.partition_conf) - - partition = { - partition_name = var.partition_name - partition_conf = local.partition_conf - partition_nodeset = var.partition_nodeset - partition_nodeset_dyn = var.partition_nodeset_dyn - partition_nodeset_tpu = var.partition_nodeset_tpu - network_storage = var.network_storage - # Options - enable_job_exclusive = var.enable_job_exclusive - } -} - -resource "null_resource" "partition" { - triggers = { - partition = sha256(jsonencode(local.partition)) - } - lifecycle { - precondition { - condition = local.has_node || local.has_dyn || local.has_tpu - error_message = "Partition must contain at least one type of nodeset." - } - precondition { - condition = ((!local.has_node || !local.has_dyn) && local.has_tpu) || ((local.has_node || local.has_dyn) && !local.has_tpu) - error_message = "Partition cannot contain TPU and non-TPU nodesets." - } - } -} diff --git a/terraform/slurm_cluster/modules/slurm_partition/outputs.tf b/terraform/slurm_cluster/modules/slurm_partition/outputs.tf deleted file mode 100644 index c1af11a7..00000000 --- a/terraform/slurm_cluster/modules/slurm_partition/outputs.tf +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "partition_name" { - description = "Partition for slurm controller." - value = local.partition.partition_name -} - -output "partition" { - description = "Partition for slurm controller." - value = local.partition -} diff --git a/terraform/slurm_cluster/modules/slurm_partition/variables.tf b/terraform/slurm_cluster/modules/slurm_partition/variables.tf deleted file mode 100644 index 36553ded..00000000 --- a/terraform/slurm_cluster/modules/slurm_partition/variables.tf +++ /dev/null @@ -1,137 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "partition_name" { - description = "Name of Slurm partition." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]*)$", var.partition_name)) - error_message = "Variable 'partition_name' must be a match of regex '^[a-z](?:[a-z0-9]*)$'." - } -} - -variable "partition_conf" { - description = < 0 - error_message = "Value must be > 0." - } -} - -variable "suspend_time" { - description = <<-EOD - Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram. - This sets 'SuspendTime' in partition_conf. - See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details. - NOTE: use value -1 to exclude partition from suspend. - EOD - type = number - default = 300 - - validation { - condition = var.suspend_time >= -1 - error_message = "Value must be >= -1." - } -} - -variable "suspend_timeout" { - description = <<-EOD - Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown. - If null is given, then a smart default will be chosen depending on nodesets in partition. - This sets 'SuspendTimeout' in partition_conf. - See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details. - EOD - type = number - default = null - - validation { - condition = var.suspend_timeout == null ? true : var.suspend_timeout > 0 - error_message = "Value must be > 0." - } -} - -variable "enable_job_exclusive" { - description = < - -- [Module: Slurm Firewall Rules](#module-slurm-firewall-rules) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [TerraformUser](#terraformuser) - - [Required](#required) - - [Module API](#module-api) - - - -## Overview - -This module creates [firewall rules](../../docs/glossary.md#firewall-rules) to -support [Slurm](../../docs/glossary.md#slurm) cluster communication. - -## Usage - -See [examples](./examples/) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_firewall_rules" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_firewall_rules?ref=v5.0.0" - - project_id = "" - network_name = "default" - slurm_cluster_name = "" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../docs/glossary.md#terraform-registry), the version -> must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [Compute Engine API](../../docs/glossary.md#compute-engine) is enabled. - -### TerraformUser - -#### Required - -- Compute Security Admin (`roles/compute.securityAdmin`) -- Service Account Admin (`roles/iam.serviceAccountAdmin`) - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_firewall_rules/README_TF.md b/terraform/slurm_firewall_rules/README_TF.md deleted file mode 100644 index 322891cc..00000000 --- a/terraform/slurm_firewall_rules/README_TF.md +++ /dev/null @@ -1,55 +0,0 @@ -# slurm_firewall_rules - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [firewall\_rules](#module\_firewall\_rules) | terraform-google-modules/network/google//modules/firewall-rules | ~> 4.0 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [network\_name](#input\_network\_name) | Name of the network this set of firewall rules applies to. | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | Project ID of the project that holds the network. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | -| [source\_service\_accounts](#input\_source\_service\_accounts) | If source service accounts are specified, the firewall will apply only to
traffic originating from an instance with a service account in this list. Source
service accounts cannot be used to control traffic to an instance's external IP
address because service accounts are associated with an instance, not an IP
address. sourceRanges can be set at the same time as sourceServiceAccounts. If
both are set, the firewall will apply to traffic that has source IP address
within sourceRanges OR the source IP belongs to an instance with service account
listed in sourceServiceAccount. The connection does not need to match both
properties for the firewall to apply. sourceServiceAccounts cannot be used at
the same time as sourceTags or targetTags. | `list(string)` | `null` | no | -| [source\_tags](#input\_source\_tags) | If source tags are specified, the firewall will apply only to traffic with
source IP that belongs to a tag listed in source tags. Source tags cannot
be used to control traffic to an instance's external IP address. Because tags
are associated with an instance, not an IP address. One or both of
sourceRanges and sourceTags may be set. If both properties are set, the firewall
will apply to traffic that has source IP address within sourceRanges OR the
source IP that belongs to a tag listed in the sourceTags property. The
connection does not need to match both properties for the firewall to apply. | `list(string)` | `[]` | no | -| [target\_service\_accounts](#input\_target\_service\_accounts) | A list of service accounts indicating sets of instances located in the network
that may make network connections as specified in allowed[].
targetServiceAccounts cannot be used at the same time as targetTags or
sourceTags. If neither targetServiceAccounts nor targetTags are specified, the
firewall rule applies to all instances on the specified network. | `list(string)` | `null` | no | -| [target\_tags](#input\_target\_tags) | A list of instance tags indicating sets of instances located in the network that
may make network connections as specified in allowed[]. If no targetTags are
specified, the firewall rule applies to all instances on the specified network. | `list(string)` | `[]` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [firewall\_rules](#output\_firewall\_rules) | The created firewall rule resources | - diff --git a/terraform/slurm_firewall_rules/examples/simple/Makefile b/terraform/slurm_firewall_rules/examples/simple/Makefile deleted file mode 100644 index 13e967e2..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -# Two options: -# (1) Use `make [COMMAND] TVFARS=example.tfvars` -# (2) Use `make [COMMAND]` and example.auto.tfvars -TFVARS= - -.PHONY: init -init: - terraform init - -.PHONY: validate -validate: - terraform validate - -.PHONY: plan -plan: init validate - ifeq ($(strip $(TFVARS)),) - terraform plan -out terraform.tfplan - else - terraform plan -var-file=$(TFVARS) -out terraform.tfplan - endif - -.PHONY: apply -apply: init validate - ifeq ($(strip $(TFVARS)),) - terraform apply -auto-approve - else - terraform apply -var-file=$(TFVARS) -auto-approve - endif - -.PHONY: destroy -destroy: init validate - ifeq ($(strip $(TFVARS)),) - terraform destroy -auto-approve - else - terraform destroy -var-file=$(TFVARS) -auto-approve - endif diff --git a/terraform/slurm_firewall_rules/examples/simple/README.md b/terraform/slurm_firewall_rules/examples/simple/README.md deleted file mode 100644 index 6008397a..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Example: Simple Slurm Firewall Rules - -[FAQ](../../../../docs/faq.md) | [Glossary](../../../../docs/glossary.md) - - - -- [Example: Simple Slurm Firewall Rules](#example-simple-slurm-firewall-rules) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Example API](#example-api) - - - -## Overview - -This example creates a -[slurm_firewall_rules](../../../modules/slurm_firewall_rules/). - -## Usage - -Modify [example.tfvars](./example.tfvars) with required and desired values. - -Then perform the following commands on the root directory: - -- `terraform init` to get the plugins -- `terraform plan -var-file=example.tfvars` to see the infrastructure plan -- `terraform apply -var-file=example.tfvars` to apply the infrastructure build -- `terraform destroy -var-file=example.tfvars` to destroy the built - infrastructure - -## Dependencies - -- [Compute Engine API](../../../../docs/glossary.md#compute-engine) is enabled. - -## Example API - -For the terraform example API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_firewall_rules/examples/simple/README_TF.md b/terraform/slurm_firewall_rules/examples/simple/README_TF.md deleted file mode 100644 index 95531888..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/README_TF.md +++ /dev/null @@ -1,53 +0,0 @@ -# simple - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.53 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_firewall\_rules](#module\_slurm\_firewall\_rules) | ../../../slurm_firewall_rules | n/a | - -## Resources - -| Name | Type | -|------|------| -| [google_compute_network.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_network) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [project\_id](#input\_project\_id) | The ID of the project where this VPC will be created. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | - -## Outputs - -No outputs. - diff --git a/terraform/slurm_firewall_rules/examples/simple/example.tfvars b/terraform/slurm_firewall_rules/examples/simple/example.tfvars deleted file mode 100644 index a017bacb..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/example.tfvars +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -project_id = "" - -slurm_cluster_name = "simple" diff --git a/terraform/slurm_firewall_rules/examples/simple/main.tf b/terraform/slurm_firewall_rules/examples/simple/main.tf deleted file mode 100644 index bf6a93dd..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/main.tf +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -provider "google" { - project = var.project_id -} - -data "google_compute_network" "default" { - name = "default" -} - -module "slurm_firewall_rules" { - source = "../../../slurm_firewall_rules" - - project_id = var.project_id - network_name = data.google_compute_network.default.self_link - slurm_cluster_name = var.slurm_cluster_name -} diff --git a/terraform/slurm_firewall_rules/examples/simple/outputs.tf b/terraform/slurm_firewall_rules/examples/simple/outputs.tf deleted file mode 100644 index d7d55bcc..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/outputs.tf +++ /dev/null @@ -1,15 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ diff --git a/terraform/slurm_firewall_rules/examples/simple/variables.tf b/terraform/slurm_firewall_rules/examples/simple/variables.tf deleted file mode 100644 index 1c01cc59..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/variables.tf +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "The ID of the project where this VPC will be created." -} - -variable "slurm_cluster_name" { - description = "Cluster name, used for resource naming." - type = string -} diff --git a/terraform/slurm_firewall_rules/examples/simple/versions.tf b/terraform/slurm_firewall_rules/examples/simple/versions.tf deleted file mode 100644 index adc989fe..00000000 --- a/terraform/slurm_firewall_rules/examples/simple/versions.tf +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 3.53" - } - } -} diff --git a/terraform/slurm_firewall_rules/main.tf b/terraform/slurm_firewall_rules/main.tf deleted file mode 100644 index 4441a9f5..00000000 --- a/terraform/slurm_firewall_rules/main.tf +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - target_tags = concat([var.slurm_cluster_name], var.target_tags) - - firewall_rules = [ - { - name = "${var.slurm_cluster_name}-allow-ssh-ingress" - direction = "INGRESS" - ranges = ["0.0.0.0/0"] - source_tags = var.source_tags - source_service_accounts = var.source_service_accounts - target_tags = local.target_tags - target_service_accounts = var.target_service_accounts - allow = [ - { - protocol = "tcp" - ports = ["22"] - }, - ] - log_config = { - metadata = "INCLUDE_ALL_METADATA" - } - }, - { - name = "${var.slurm_cluster_name}-allow-iap-ingress" - direction = "INGRESS" - ranges = ["35.235.240.0/20"] - source_tags = var.source_tags - source_service_accounts = var.source_service_accounts - target_tags = local.target_tags - target_service_accounts = var.target_service_accounts - allow = [ - { - protocol = "tcp" - ports = ["22", "8642", "6842"] - }, - ] - log_config = { - metadata = "INCLUDE_ALL_METADATA" - } - }, - { - name = "${var.slurm_cluster_name}-allow-internal-ingress" - direction = "INGRESS" - ranges = ["0.0.0.0/0"] - source_tags = var.source_tags - source_service_accounts = var.source_service_accounts - target_tags = local.target_tags - target_service_accounts = var.target_service_accounts - allow = [ - { - protocol = "icmp" - ports = [] - }, - { - protocol = "tcp" - ports = ["0-65535"] - }, - { - protocol = "udp" - ports = ["0-65535"] - }, - ] - log_config = { - metadata = "INCLUDE_ALL_METADATA" - } - }, - ] - - rules = [ - for f in local.firewall_rules : { - name = f.name - direction = f.direction - priority = lookup(f, "priority", null) - description = lookup(f, "description", null) - ranges = lookup(f, "ranges", null) - source_tags = lookup(f, "source_tags", null) - source_service_accounts = lookup(f, "source_service_accounts", null) - target_tags = lookup(f, "target_tags", null) - target_service_accounts = lookup(f, "target_service_accounts", null) - allow = lookup(f, "allow", []) - deny = lookup(f, "deny", []) - log_config = lookup(f, "log_config", null) - } - ] -} - -################## -# FIREWALL RULES # -################## - -module "firewall_rules" { - source = "terraform-google-modules/network/google//modules/firewall-rules" - version = "~> 4.0" - - project_id = var.project_id - network_name = var.network_name - - rules = local.rules -} diff --git a/terraform/slurm_firewall_rules/outputs.tf b/terraform/slurm_firewall_rules/outputs.tf deleted file mode 100644 index 6fafbfa5..00000000 --- a/terraform/slurm_firewall_rules/outputs.tf +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "firewall_rules" { - value = module.firewall_rules.firewall_rules - description = "The created firewall rule resources" -} diff --git a/terraform/slurm_firewall_rules/variables.tf b/terraform/slurm_firewall_rules/variables.tf deleted file mode 100644 index 0f5731da..00000000 --- a/terraform/slurm_firewall_rules/variables.tf +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# GENERAL # -########### - -variable "project_id" { - type = string - description = "Project ID of the project that holds the network." -} - -variable "network_name" { - type = string - description = "Name of the network this set of firewall rules applies to." -} - -variable "slurm_cluster_name" { - type = string - description = "Cluster name, used for resource naming." - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." - } -} - -############## -# SLURM RULE # -############## - -variable "source_tags" { - type = list(string) - description = < - -- [Module: Slurm SA and IAM](#module-slurm-sa-and-iam) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [TerraformUser](#terraformuser) - - [Required](#required) - - [Module API](#module-api) - - - -## Overview - -This module can create three different sets of -[service accounts](../../docs/glossary.md#service-account), -[IAM Roles](../../docs/glossary.md#iam-roles), and -[access scopes](../../docs/glossary.md#access-scopes): controller; login; -compute. These [service account](../../docs/glossary.md#service-account) sets -are intended to be passed to other sections of the Slurm cluster configuration -to define [instances templates](../../docs/glossary.md#instance-template). - -## Usage - -See [examples](../../examples/slurm_sa_iam) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_sa_iam" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_sa_iam?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - account_type = "controller" -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../docs/glossary.md#terraform-registry), the version -> must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [IAM API](../../docs/glossary.md#iam) is enabled. - -### TerraformUser - -#### Required - -- Project IAM Admin (`roles/resourcemanager.projectIamAdmin`) - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_sa_iam/README_TF.md b/terraform/slurm_sa_iam/README_TF.md deleted file mode 100644 index d0fa4733..00000000 --- a/terraform/slurm_sa_iam/README_TF.md +++ /dev/null @@ -1,59 +0,0 @@ -# slurm_sa_iam - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | -| [random](#requirement\_random) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.53 | -| [random](#provider\_random) | ~> 3.0 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_member\_roles](#module\_slurm\_member\_roles) | terraform-google-modules/iam/google//modules/member_iam | ~> 7.0 | - -## Resources - -| Name | Type | -|------|------| -| [google_service_account.slurm_service_account](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/service_account) | resource | -| [random_string.suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [account\_type](#input\_account\_type) | Account to create. May be one of: controller; login; or compute. | `string` | `"controller"` | no | -| [project\_id](#input\_project\_id) | Project ID of the project that holds the network. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [service\_account](#output\_service\_account) | Service account object. | - diff --git a/terraform/slurm_sa_iam/examples/simple/Makefile b/terraform/slurm_sa_iam/examples/simple/Makefile deleted file mode 100644 index 13e967e2..00000000 --- a/terraform/slurm_sa_iam/examples/simple/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -# Two options: -# (1) Use `make [COMMAND] TVFARS=example.tfvars` -# (2) Use `make [COMMAND]` and example.auto.tfvars -TFVARS= - -.PHONY: init -init: - terraform init - -.PHONY: validate -validate: - terraform validate - -.PHONY: plan -plan: init validate - ifeq ($(strip $(TFVARS)),) - terraform plan -out terraform.tfplan - else - terraform plan -var-file=$(TFVARS) -out terraform.tfplan - endif - -.PHONY: apply -apply: init validate - ifeq ($(strip $(TFVARS)),) - terraform apply -auto-approve - else - terraform apply -var-file=$(TFVARS) -auto-approve - endif - -.PHONY: destroy -destroy: init validate - ifeq ($(strip $(TFVARS)),) - terraform destroy -auto-approve - else - terraform destroy -var-file=$(TFVARS) -auto-approve - endif diff --git a/terraform/slurm_sa_iam/examples/simple/README.md b/terraform/slurm_sa_iam/examples/simple/README.md deleted file mode 100644 index 3ac8d0af..00000000 --- a/terraform/slurm_sa_iam/examples/simple/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Example: Simple Slurm SA IAM - -[FAQ](../../../../docs/faq.md) | [Glossary](../../../../docs/glossary.md) - - - -- [Example: Simple Slurm SA IAM](#example-simple-slurm-sa-iam) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Example API](#example-api) - - - -## Overview - -This example creates a [slurm_sa_iam](../../../slurm_sa_iam/README.md). - -## Usage - -Modify [example.tfvars](./example.tfvars) with required and desired values. - -Then perform the following commands on the root directory: - -- `terraform init` to get the plugins -- `terraform plan -var-file=example.tfvars` to see the infrastructure plan -- `terraform apply -var-file=example.tfvars` to apply the infrastructure build -- `terraform destroy -var-file=example.tfvars` to destroy the built - infrastructure - -## Dependencies - -- [Compute Engine API](../../../../docs/glossary.md#compute-engine) is enabled. - -## Example API - -For the terraform example API reference, please see -[README_TF.md](./README_TF.md). diff --git a/terraform/slurm_sa_iam/examples/simple/README_TF.md b/terraform/slurm_sa_iam/examples/simple/README_TF.md deleted file mode 100644 index 7dc78145..00000000 --- a/terraform/slurm_sa_iam/examples/simple/README_TF.md +++ /dev/null @@ -1,51 +0,0 @@ -# simple - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.0 | -| [google](#requirement\_google) | >= 3.53 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_sa\_iam](#module\_slurm\_sa\_iam) | ../../../slurm_sa_iam | n/a | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [project\_id](#input\_project\_id) | The ID of the project where this VPC will be created. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming. | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [controller\_service\_account](#output\_controller\_service\_account) | n/a | - diff --git a/terraform/slurm_sa_iam/examples/simple/example.tfvars b/terraform/slurm_sa_iam/examples/simple/example.tfvars deleted file mode 100644 index a017bacb..00000000 --- a/terraform/slurm_sa_iam/examples/simple/example.tfvars +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -project_id = "" - -slurm_cluster_name = "simple" diff --git a/terraform/slurm_sa_iam/examples/simple/main.tf b/terraform/slurm_sa_iam/examples/simple/main.tf deleted file mode 100644 index ed048403..00000000 --- a/terraform/slurm_sa_iam/examples/simple/main.tf +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -provider "google" { - project = var.project_id -} - -module "slurm_sa_iam" { - source = "../../../slurm_sa_iam" - - project_id = var.project_id - slurm_cluster_name = var.slurm_cluster_name - account_type = "controller" -} diff --git a/terraform/slurm_sa_iam/examples/simple/outputs.tf b/terraform/slurm_sa_iam/examples/simple/outputs.tf deleted file mode 100644 index 8256fafd..00000000 --- a/terraform/slurm_sa_iam/examples/simple/outputs.tf +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "controller_service_account" { - value = module.slurm_sa_iam.service_account -} diff --git a/terraform/slurm_sa_iam/examples/simple/variables.tf b/terraform/slurm_sa_iam/examples/simple/variables.tf deleted file mode 100644 index 1c01cc59..00000000 --- a/terraform/slurm_sa_iam/examples/simple/variables.tf +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "project_id" { - type = string - description = "The ID of the project where this VPC will be created." -} - -variable "slurm_cluster_name" { - description = "Cluster name, used for resource naming." - type = string -} diff --git a/terraform/slurm_sa_iam/examples/simple/versions.tf b/terraform/slurm_sa_iam/examples/simple/versions.tf deleted file mode 100644 index adc989fe..00000000 --- a/terraform/slurm_sa_iam/examples/simple/versions.tf +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 3.53" - } - } -} diff --git a/terraform/slurm_sa_iam/main.tf b/terraform/slurm_sa_iam/main.tf deleted file mode 100644 index cf9cfa73..00000000 --- a/terraform/slurm_sa_iam/main.tf +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########## -# LOCALS # -########## - -locals { - roles = { - controller = [ - "roles/bigquery.dataEditor", - "roles/cloudsql.editor", - "roles/compute.instanceAdmin.v1", - "roles/compute.instanceAdmin", # Beta - "roles/iam.serviceAccountUser", - "roles/logging.logWriter", - "roles/monitoring.metricWriter", - "roles/tpu.admin", - ] - compute = [ - "roles/logging.logWriter", - "roles/monitoring.metricWriter", - ] - login = [ - "roles/logging.logWriter", - "roles/monitoring.metricWriter", - ] - } - - account = { - (var.account_type) = local.roles[var.account_type] - } -} - -########## -# RANDOM # -########## - -resource "random_string" "suffix" { - length = 8 - upper = false - special = false -} - -################### -# SERVICE ACCOUNT # -################### - -resource "google_service_account" "slurm_service_account" { - for_each = var.account_type != null ? local.account : local.roles - - account_id = "${var.slurm_cluster_name}-${each.key}-${random_string.suffix.result}" - display_name = "${var.slurm_cluster_name}-${each.key} Slurm SA IAM" - project = var.project_id -} - -####### -# IAM # -####### - -module "slurm_member_roles" { - source = "terraform-google-modules/iam/google//modules/member_iam" - version = "~> 7.0" - - for_each = var.account_type != null ? local.account : local.roles - - service_account_address = google_service_account.slurm_service_account[each.key].email - prefix = "serviceAccount" - project_id = var.project_id - project_roles = each.value -} diff --git a/terraform/slurm_sa_iam/outputs.tf b/terraform/slurm_sa_iam/outputs.tf deleted file mode 100644 index 086ddc49..00000000 --- a/terraform/slurm_sa_iam/outputs.tf +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "service_account" { - description = "Service account object." - value = { - email = google_service_account.slurm_service_account[var.account_type].email - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } -} diff --git a/terraform/slurm_sa_iam/variables.tf b/terraform/slurm_sa_iam/variables.tf deleted file mode 100644 index ede71064..00000000 --- a/terraform/slurm_sa_iam/variables.tf +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -########### -# GENERAL # -########### - -variable "project_id" { - description = "Project ID of the project that holds the network." - type = string -} - -variable "slurm_cluster_name" { - description = "Cluster name, used for resource naming." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." - } -} - -variable "account_type" { - description = "Account to create. May be one of: controller; login; or compute." - type = string - default = "controller" - - validation { - condition = ( - contains(["controller", "login", "compute"], lower(var.account_type))) - error_message = "Must be one of: controller; login; compute; or null." - } -} diff --git a/terraform/slurm_sa_iam/versions.tf b/terraform/slurm_sa_iam/versions.tf deleted file mode 100644 index 75ef837b..00000000 --- a/terraform/slurm_sa_iam/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -terraform { - required_version = "~> 1.0" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 3.53" - } - random = { - source = "hashicorp/random" - version = "~> 3.0" - } - } -}