From f23d52f0847fb48a25572eaabedd2945ee942f41 Mon Sep 17 00:00:00 2001 From: Michael Fraenkel Date: Fri, 6 Dec 2024 18:01:45 -0700 Subject: [PATCH] feat: Node pools can enable fast_socket (#2200) --- README.md | 1 + autogen/main/README.md | 1 + autogen/main/cluster.tf.tmpl | 13 +++++++++++++ cluster.tf | 19 +++++++++++++++++++ .../README.md | 1 + .../cluster.tf | 19 +++++++++++++++++++ modules/beta-private-cluster/README.md | 1 + modules/beta-private-cluster/cluster.tf | 19 +++++++++++++++++++ .../README.md | 1 + .../cluster.tf | 19 +++++++++++++++++++ modules/beta-public-cluster/README.md | 1 + modules/beta-public-cluster/cluster.tf | 19 +++++++++++++++++++ .../private-cluster-update-variant/README.md | 1 + .../private-cluster-update-variant/cluster.tf | 19 +++++++++++++++++++ modules/private-cluster/README.md | 1 + modules/private-cluster/cluster.tf | 19 +++++++++++++++++++ 16 files changed, 154 insertions(+) diff --git a/README.md b/README.md index 43b77ca60e..669b727864 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/autogen/main/README.md b/autogen/main/README.md index 9ccbf7736d..222bd22e14 100644 --- a/autogen/main/README.md +++ b/autogen/main/README.md @@ -205,6 +205,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/autogen/main/cluster.tf.tmpl b/autogen/main/cluster.tf.tmpl index bd4ad3a9ed..b493d04f7c 100644 --- a/autogen/main/cluster.tf.tmpl +++ b/autogen/main/cluster.tf.tmpl @@ -537,6 +537,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -930,6 +937,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/cluster.tf b/cluster.tf index 0986e8da75..d53799e500 100644 --- a/cluster.tf +++ b/cluster.tf @@ -418,6 +418,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -641,6 +648,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -932,6 +945,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/beta-private-cluster-update-variant/README.md b/modules/beta-private-cluster-update-variant/README.md index fc76c4c903..5d97be4c68 100644 --- a/modules/beta-private-cluster-update-variant/README.md +++ b/modules/beta-private-cluster-update-variant/README.md @@ -371,6 +371,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/beta-private-cluster-update-variant/cluster.tf b/modules/beta-private-cluster-update-variant/cluster.tf index df27a1fb6a..d4f320e369 100644 --- a/modules/beta-private-cluster-update-variant/cluster.tf +++ b/modules/beta-private-cluster-update-variant/cluster.tf @@ -456,6 +456,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -799,6 +806,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -1104,6 +1117,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/beta-private-cluster/README.md b/modules/beta-private-cluster/README.md index 8fbf18847d..1077d7fc8a 100644 --- a/modules/beta-private-cluster/README.md +++ b/modules/beta-private-cluster/README.md @@ -349,6 +349,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/beta-private-cluster/cluster.tf b/modules/beta-private-cluster/cluster.tf index c53aada6ec..29f11bb7ee 100644 --- a/modules/beta-private-cluster/cluster.tf +++ b/modules/beta-private-cluster/cluster.tf @@ -456,6 +456,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -718,6 +725,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -1022,6 +1035,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/beta-public-cluster-update-variant/README.md b/modules/beta-public-cluster-update-variant/README.md index 15b039f123..bbb58d12f4 100644 --- a/modules/beta-public-cluster-update-variant/README.md +++ b/modules/beta-public-cluster-update-variant/README.md @@ -357,6 +357,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/beta-public-cluster-update-variant/cluster.tf b/modules/beta-public-cluster-update-variant/cluster.tf index 471865dc73..b20fa74835 100644 --- a/modules/beta-public-cluster-update-variant/cluster.tf +++ b/modules/beta-public-cluster-update-variant/cluster.tf @@ -456,6 +456,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -769,6 +776,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -1074,6 +1087,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/beta-public-cluster/README.md b/modules/beta-public-cluster/README.md index 29bb2dde89..c07bc9581a 100644 --- a/modules/beta-public-cluster/README.md +++ b/modules/beta-public-cluster/README.md @@ -335,6 +335,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/beta-public-cluster/cluster.tf b/modules/beta-public-cluster/cluster.tf index d11ca4684d..d76574eb39 100644 --- a/modules/beta-public-cluster/cluster.tf +++ b/modules/beta-public-cluster/cluster.tf @@ -456,6 +456,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -688,6 +695,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -992,6 +1005,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/private-cluster-update-variant/README.md b/modules/private-cluster-update-variant/README.md index 8710b871cf..4339bacbb1 100644 --- a/modules/private-cluster-update-variant/README.md +++ b/modules/private-cluster-update-variant/README.md @@ -353,6 +353,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/private-cluster-update-variant/cluster.tf b/modules/private-cluster-update-variant/cluster.tf index 4675138c1a..fa03099a0c 100644 --- a/modules/private-cluster-update-variant/cluster.tf +++ b/modules/private-cluster-update-variant/cluster.tf @@ -418,6 +418,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -751,6 +758,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -1043,6 +1056,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { diff --git a/modules/private-cluster/README.md b/modules/private-cluster/README.md index 1fb8f3c332..753b3fd4c4 100644 --- a/modules/private-cluster/README.md +++ b/modules/private-cluster/README.md @@ -331,6 +331,7 @@ The node_pools variable takes the following parameters: | disk_size_gb | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB | 100 | Optional | | disk_type | Type of the disk attached to each node (e.g. 'pd-standard' or 'pd-ssd') | pd-standard | Optional | | effect | Effect for the taint | | Required | +| enable_fast_socket | Enable the NCCL Fast Socket feature. `enable_gvnic` must also be enabled. | null | Optional | | enable_gcfs | Google Container File System (gcfs) has to be enabled for image streaming to be active. Needs image_type to be set to COS_CONTAINERD. | false | Optional | | enable_gvnic | gVNIC (GVE) is an alternative to the virtIO-based ethernet driver. Needs a Container-Optimized OS node image. | false | Optional | | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional | diff --git a/modules/private-cluster/cluster.tf b/modules/private-cluster/cluster.tf index 0107dbd8a3..3ca8117ae6 100644 --- a/modules/private-cluster/cluster.tf +++ b/modules/private-cluster/cluster.tf @@ -418,6 +418,13 @@ resource "google_container_cluster" "primary" { } } + dynamic "fast_socket" { + for_each = lookup(var.node_pools[0], "enable_fast_socket", null) != null ? [var.node_pools[0].enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } + dynamic "kubelet_config" { for_each = length(setintersection( keys(var.node_pools[0]), @@ -671,6 +678,12 @@ resource "google_container_node_pool" "pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content { @@ -962,6 +975,12 @@ resource "google_container_node_pool" "windows_pools" { enabled = gvnic.value } } + dynamic "fast_socket" { + for_each = lookup(each.value, "enable_fast_socket", null) != null ? [each.value.enable_fast_socket] : [] + content { + enabled = fast_socket.value + } + } dynamic "reservation_affinity" { for_each = lookup(each.value, "queued_provisioning", false) || lookup(each.value, "consume_reservation_type", "") != "" ? [each.value] : [] content {