diff --git a/.changelog/44894.txt b/.changelog/44894.txt
new file mode 100644
index 000000000000..55e190793ce5
--- /dev/null
+++ b/.changelog/44894.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+resource/aws_eks_node_group: Add `max_parallel_nodes_repaired_count`, `max_parallel_nodes_repaired_percentage`, `max_unhealthy_node_threshold_count`, `max_unhealthy_node_threshold_percentage`, and `node_repair_config_overrides` to the `node_repair_config` schema
+```
diff --git a/internal/service/eks/node_group.go b/internal/service/eks/node_group.go
index 025e646b75bf..01c5189b16d8 100644
--- a/internal/service/eks/node_group.go
+++ b/internal/service/eks/node_group.go
@@ -156,6 +156,64 @@ func resourceNodeGroup() *schema.Resource {
                             Optional: true,
                             Default:  false,
                         },
+                        "max_parallel_nodes_repaired_count": {
+                            Type:         schema.TypeInt,
+                            Optional:     true,
+                            ValidateFunc: validation.IntAtLeast(1),
+                            ConflictsWith: []string{
+                                "node_repair_config.0.max_parallel_nodes_repaired_percentage",
+                            },
+                        },
+                        "max_parallel_nodes_repaired_percentage": {
+                            Type:         schema.TypeInt,
+                            Optional:     true,
+                            ValidateFunc: validation.IntBetween(1, 100),
+                            ConflictsWith: []string{
+                                "node_repair_config.0.max_parallel_nodes_repaired_count",
+                            },
+                        },
+                        "max_unhealthy_node_threshold_count": {
+                            Type:         schema.TypeInt,
+                            Optional:     true,
+                            ValidateFunc: validation.IntAtLeast(1),
+                            ConflictsWith: []string{
+                                "node_repair_config.0.max_unhealthy_node_threshold_percentage",
+                            },
+                        },
+                        "max_unhealthy_node_threshold_percentage": {
+                            Type:         schema.TypeInt,
+                            Optional:     true,
+                            ValidateFunc: validation.IntBetween(1, 100),
+                            ConflictsWith: []string{
+                                "node_repair_config.0.max_unhealthy_node_threshold_count",
+                            },
+                        },
+                        "node_repair_config_overrides": {
+                            Type:     schema.TypeList,
+                            Optional: true,
+                            Elem: &schema.Resource{
+                                Schema: map[string]*schema.Schema{
+                                    "min_repair_wait_time_mins": {
+                                        Type:         schema.TypeInt,
+                                        Required:     true,
+                                        ValidateFunc: validation.IntAtLeast(1),
+                                    },
+                                    "node_monitoring_condition": {
+                                        Type:     schema.TypeString,
+                                        Required: true,
+                                    },
+                                    "node_unhealthy_reason": {
+                                        Type:     schema.TypeString,
+                                        Required: true,
+                                    },
+                                    "repair_action": {
+                                        Type:             schema.TypeString,
+                                        Required:         true,
+                                        ValidateDiagFunc: enum.Validate[types.RepairAction](),
+                                    },
+                                },
+                            },
+                        },
                     },
                 },
             },
@@ -955,9 +1013,66 @@ func expandNodeRepairConfig(tfMap map[string]any) *types.NodeRepairConfig {
         apiObject.Enabled = aws.Bool(v)
     }
 
+    if v, ok := tfMap["max_parallel_nodes_repaired_count"].(int); ok && v != 0 {
+        apiObject.MaxParallelNodesRepairedCount = aws.Int32(int32(v))
+    }
+
+    if v, ok := tfMap["max_parallel_nodes_repaired_percentage"].(int); ok && v != 0 {
+        apiObject.MaxParallelNodesRepairedPercentage = aws.Int32(int32(v))
+    }
+
+    if v, ok := tfMap["max_unhealthy_node_threshold_count"].(int); ok && v != 0 {
+        apiObject.MaxUnhealthyNodeThresholdCount = aws.Int32(int32(v))
+    }
+
+    if v, ok := tfMap["max_unhealthy_node_threshold_percentage"].(int); ok && v != 0 {
+        apiObject.MaxUnhealthyNodeThresholdPercentage = aws.Int32(int32(v))
+    }
+
+    if v, ok := tfMap["node_repair_config_overrides"].([]any); ok && len(v) > 0 {
+        apiObject.NodeRepairConfigOverrides = expandNodeRepairConfigOverrides(v)
+    }
+
     return apiObject
 }
 
+func expandNodeRepairConfigOverrides(tfList []any) []types.NodeRepairConfigOverrides {
+    if len(tfList) == 0 {
+        return nil
+    }
+
+    var apiObjects []types.NodeRepairConfigOverrides
+
+    for _, tfMapRaw := range tfList {
+        tfMap, ok := tfMapRaw.(map[string]any)
+        if !ok {
+            continue
+        }
+
+        apiObject := types.NodeRepairConfigOverrides{}
+
+        if v, ok := tfMap["min_repair_wait_time_mins"].(int); ok {
ok := tfMap["min_repair_wait_time_mins"].(int); ok { + apiObject.MinRepairWaitTimeMins = aws.Int32(int32(v)) + } + + if v, ok := tfMap["node_monitoring_condition"].(string); ok && v != "" { + apiObject.NodeMonitoringCondition = aws.String(v) + } + + if v, ok := tfMap["node_unhealthy_reason"].(string); ok && v != "" { + apiObject.NodeUnhealthyReason = aws.String(v) + } + + if v, ok := tfMap["repair_action"].(string); ok && v != "" { + apiObject.RepairAction = types.RepairAction(v) + } + + apiObjects = append(apiObjects, apiObject) + } + + return apiObjects +} + func expandUpdateLabelsPayload(ctx context.Context, oldLabelsMap, newLabelsMap any) *types.UpdateLabelsPayload { // EKS Labels operate similarly to keyvaluetags oldLabels := tftags.New(ctx, oldLabelsMap) @@ -1069,9 +1184,59 @@ func flattenNodeRepairConfig(apiObject *types.NodeRepairConfig) map[string]any { tfMap[names.AttrEnabled] = aws.ToBool(v) } + if v := apiObject.MaxParallelNodesRepairedCount; v != nil { + tfMap["max_parallel_nodes_repaired_count"] = aws.ToInt32(v) + } + + if v := apiObject.MaxParallelNodesRepairedPercentage; v != nil { + tfMap["max_parallel_nodes_repaired_percentage"] = aws.ToInt32(v) + } + + if v := apiObject.MaxUnhealthyNodeThresholdCount; v != nil { + tfMap["max_unhealthy_node_threshold_count"] = aws.ToInt32(v) + } + + if v := apiObject.MaxUnhealthyNodeThresholdPercentage; v != nil { + tfMap["max_unhealthy_node_threshold_percentage"] = aws.ToInt32(v) + } + + if v := apiObject.NodeRepairConfigOverrides; v != nil { + tfMap["node_repair_config_overrides"] = flattenNodeRepairConfigOverrides(v) + } + return tfMap } +func flattenNodeRepairConfigOverrides(apiObjects []types.NodeRepairConfigOverrides) []any { + if len(apiObjects) == 0 { + return nil + } + + var tfList []any + + for _, apiObject := range apiObjects { + tfMap := make(map[string]any) + + if v := apiObject.MinRepairWaitTimeMins; v != nil { + tfMap["min_repair_wait_time_mins"] = aws.ToInt32(v) + } + + if v := apiObject.NodeMonitoringCondition; v != nil { + tfMap["node_monitoring_condition"] = aws.ToString(v) + } + + if v := apiObject.NodeUnhealthyReason; v != nil { + tfMap["node_unhealthy_reason"] = aws.ToString(v) + } + + tfMap["repair_action"] = string(apiObject.RepairAction) + + tfList = append(tfList, tfMap) + } + + return tfList +} + func flattenNodegroupUpdateConfig(apiObject *types.NodegroupUpdateConfig) map[string]any { if apiObject == nil { return nil diff --git a/internal/service/eks/node_group_test.go b/internal/service/eks/node_group_test.go index a16bd5d91040..6cb7424b9318 100644 --- a/internal/service/eks/node_group_test.go +++ b/internal/service/eks/node_group_test.go @@ -506,7 +506,7 @@ func TestAccEKSNodeGroup_LaunchTemplate_version(t *testing.T) { }) } -func TestAccEKSNodeGroup_nodeRepairConfig(t *testing.T) { +func TestAccEKSNodeGroup_nodeRepairConfig_basic(t *testing.T) { ctx := acctest.Context(t) var nodeGroup1 types.Nodegroup rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix) @@ -519,7 +519,7 @@ func TestAccEKSNodeGroup_nodeRepairConfig(t *testing.T) { CheckDestroy: testAccCheckNodeGroupDestroy(ctx), Steps: []resource.TestStep{ { - Config: testAccNodeGroupConfig_nodeRepairConfig(rName), + Config: testAccNodeGroupConfig_nodeRepairConfigBasic(rName), Check: resource.ComposeTestCheckFunc( testAccCheckNodeGroupExists(ctx, resourceName, &nodeGroup1), resource.TestCheckResourceAttr(resourceName, "node_repair_config.#", "1"), @@ -535,6 +535,106 @@ func TestAccEKSNodeGroup_nodeRepairConfig(t *testing.T) { }) } +func 
+func TestAccEKSNodeGroup_nodeRepairConfig_counts(t *testing.T) {
+    ctx := acctest.Context(t)
+    var nodeGroup1 types.Nodegroup
+    rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
+    resourceName := "aws_eks_node_group.test"
+
+    resource.ParallelTest(t, resource.TestCase{
+        PreCheck:                 func() { acctest.PreCheck(ctx, t); testAccPreCheck(ctx, t) },
+        ErrorCheck:               acctest.ErrorCheck(t, names.EKSServiceID),
+        ProtoV5ProviderFactories: acctest.ProtoV5ProviderFactories,
+        CheckDestroy:             testAccCheckNodeGroupDestroy(ctx),
+        Steps: []resource.TestStep{
+            {
+                Config: testAccNodeGroupConfig_nodeRepairConfigCounts(rName),
+                Check: resource.ComposeTestCheckFunc(
+                    testAccCheckNodeGroupExists(ctx, resourceName, &nodeGroup1),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.#", "1"),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.enabled", acctest.CtTrue),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.max_parallel_nodes_repaired_count", "2"),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.max_unhealthy_node_threshold_count", "3"),
+                ),
+            },
+            {
+                ResourceName:      resourceName,
+                ImportState:       true,
+                ImportStateVerify: true,
+            },
+        },
+    })
+}
+
+func TestAccEKSNodeGroup_nodeRepairConfig_percentages(t *testing.T) {
+    ctx := acctest.Context(t)
+    var nodeGroup1 types.Nodegroup
+    rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
+    resourceName := "aws_eks_node_group.test"
+
+    resource.ParallelTest(t, resource.TestCase{
+        PreCheck:                 func() { acctest.PreCheck(ctx, t); testAccPreCheck(ctx, t) },
+        ErrorCheck:               acctest.ErrorCheck(t, names.EKSServiceID),
+        ProtoV5ProviderFactories: acctest.ProtoV5ProviderFactories,
+        CheckDestroy:             testAccCheckNodeGroupDestroy(ctx),
+        Steps: []resource.TestStep{
+            {
+                Config: testAccNodeGroupConfig_nodeRepairConfigPercentages(rName),
+                Check: resource.ComposeTestCheckFunc(
+                    testAccCheckNodeGroupExists(ctx, resourceName, &nodeGroup1),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.#", "1"),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.enabled", acctest.CtTrue),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.max_parallel_nodes_repaired_percentage", "25"),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.max_unhealthy_node_threshold_percentage", "40"),
+                ),
+            },
+            {
+                ResourceName:      resourceName,
+                ImportState:       true,
+                ImportStateVerify: true,
+            },
+        },
+    })
+}
+
+func TestAccEKSNodeGroup_nodeRepairConfig_overrides(t *testing.T) {
+    ctx := acctest.Context(t)
+    var nodeGroup1 types.Nodegroup
+    rName := sdkacctest.RandomWithPrefix(acctest.ResourcePrefix)
+    resourceName := "aws_eks_node_group.test"
+
+    resource.ParallelTest(t, resource.TestCase{
+        PreCheck:                 func() { acctest.PreCheck(ctx, t); testAccPreCheck(ctx, t) },
+        ErrorCheck:               acctest.ErrorCheck(t, names.EKSServiceID),
+        ProtoV5ProviderFactories: acctest.ProtoV5ProviderFactories,
+        CheckDestroy:             testAccCheckNodeGroupDestroy(ctx),
+        Steps: []resource.TestStep{
+            {
+                Config: testAccNodeGroupConfig_nodeRepairConfigOverrides(rName),
+                Check: resource.ComposeTestCheckFunc(
+                    testAccCheckNodeGroupExists(ctx, resourceName, &nodeGroup1),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.#", "1"),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.enabled", acctest.CtTrue),
+                    resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.#", "2"),
"node_repair_config.0.node_repair_config_overrides.0.min_repair_wait_time_mins", "30"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.0.node_monitoring_condition", "NodeNotReady"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.0.node_unhealthy_reason", "NetworkUnavailable"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.0.repair_action", "Replace"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.1.min_repair_wait_time_mins", "60"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.1.node_monitoring_condition", "NodeNotReady"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.1.node_unhealthy_reason", "AutoScalingGroupNotFound"), + resource.TestCheckResourceAttr(resourceName, "node_repair_config.0.node_repair_config_overrides.1.repair_action", "Reboot"), + ), + }, + { + ResourceName: resourceName, + ImportState: true, + ImportStateVerify: true, + }, + }, + }) +} + func TestAccEKSNodeGroup_releaseVersion(t *testing.T) { ctx := acctest.Context(t) var nodeGroup1, nodeGroup2 types.Nodegroup @@ -2117,7 +2217,7 @@ resource "aws_eks_node_group" "test" { `, rName, taintKey1, taintValue1, taintEffect1, taintKey2, taintValue2, taintEffect2)) } -func testAccNodeGroupConfig_nodeRepairConfig(rName string) string { +func testAccNodeGroupConfig_nodeRepairConfigBasic(rName string) string { return acctest.ConfigCompose(testAccNodeGroupConfig_base(rName), fmt.Sprintf(` resource "aws_eks_node_group" "test" { cluster_name = aws_eks_cluster.test.name @@ -2145,6 +2245,108 @@ resource "aws_eks_node_group" "test" { `, rName)) } +func testAccNodeGroupConfig_nodeRepairConfigCounts(rName string) string { + return acctest.ConfigCompose(testAccNodeGroupConfig_base(rName), fmt.Sprintf(` +resource "aws_eks_node_group" "test" { + cluster_name = aws_eks_cluster.test.name + node_group_name = %[1]q + node_role_arn = aws_iam_role.node.arn + subnet_ids = aws_subnet.test[*].id + + scaling_config { + desired_size = 1 + max_size = 3 + min_size = 1 + } + + node_repair_config { + enabled = true + max_parallel_nodes_repaired_count = 2 + max_unhealthy_node_threshold_count = 3 + } + + depends_on = [ + aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node-AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node-AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodeMinimalPolicy, + ] +} +`, rName)) +} + +func testAccNodeGroupConfig_nodeRepairConfigPercentages(rName string) string { + return acctest.ConfigCompose(testAccNodeGroupConfig_base(rName), fmt.Sprintf(` +resource "aws_eks_node_group" "test" { + cluster_name = aws_eks_cluster.test.name + node_group_name = %[1]q + node_role_arn = aws_iam_role.node.arn + subnet_ids = aws_subnet.test[*].id + + scaling_config { + desired_size = 1 + max_size = 3 + min_size = 1 + } + + node_repair_config { + enabled = true + max_parallel_nodes_repaired_percentage = 25 + max_unhealthy_node_threshold_percentage = 40 + } + + depends_on = [ + aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node-AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node-AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodeMinimalPolicy, + ] +} +`, 
+`, rName))
+}
+
+func testAccNodeGroupConfig_nodeRepairConfigOverrides(rName string) string {
+    return acctest.ConfigCompose(testAccNodeGroupConfig_base(rName), fmt.Sprintf(`
+resource "aws_eks_node_group" "test" {
+  cluster_name    = aws_eks_cluster.test.name
+  node_group_name = %[1]q
+  node_role_arn   = aws_iam_role.node.arn
+  subnet_ids      = aws_subnet.test[*].id
+
+  scaling_config {
+    desired_size = 1
+    max_size     = 3
+    min_size     = 1
+  }
+
+  node_repair_config {
+    enabled = true
+
+    node_repair_config_overrides {
+      min_repair_wait_time_mins = 30
+      node_monitoring_condition = "NodeNotReady"
+      node_unhealthy_reason     = "NetworkUnavailable"
+      repair_action             = "Replace"
+    }
+
+    node_repair_config_overrides {
+      min_repair_wait_time_mins = 60
+      node_monitoring_condition = "NodeNotReady"
+      node_unhealthy_reason     = "AutoScalingGroupNotFound"
+      repair_action             = "Reboot"
+    }
+  }
+
+  depends_on = [
+    aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodePolicy,
+    aws_iam_role_policy_attachment.node-AmazonEKS_CNI_Policy,
+    aws_iam_role_policy_attachment.node-AmazonEC2ContainerRegistryReadOnly,
+    aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodeMinimalPolicy,
+  ]
+}
+`, rName))
+}
+
 func testAccNodeGroupConfig_update1(rName string) string {
     return acctest.ConfigCompose(testAccNodeGroupConfig_base(rName), fmt.Sprintf(`
 resource "aws_eks_node_group" "test" {
diff --git a/website/docs/r/eks_node_group.html.markdown b/website/docs/r/eks_node_group.html.markdown
index 07905d34c7e2..0f878622ee65 100644
--- a/website/docs/r/eks_node_group.html.markdown
+++ b/website/docs/r/eks_node_group.html.markdown
@@ -169,7 +169,19 @@ The following arguments are optional:
 
 ### node_repair_config Configuration Block
 
-* `enabled` - (Required) Specifies whether to enable node auto repair for the node group. Node auto repair is disabled by default.
+* `enabled` - (Optional) Specifies whether to enable node auto repair for the node group. Defaults to `false` (node auto repair is disabled by default).
+* `max_parallel_nodes_repaired_count` - (Optional) Maximum number of nodes that can be repaired in parallel, expressed as a count of unhealthy nodes. Conflicts with `max_parallel_nodes_repaired_percentage`.
+* `max_parallel_nodes_repaired_percentage` - (Optional) Maximum number of nodes that can be repaired in parallel, expressed as a percentage of unhealthy nodes. Conflicts with `max_parallel_nodes_repaired_count`.
+* `max_unhealthy_node_threshold_count` - (Optional) Count threshold of unhealthy nodes, above which node auto repair actions will stop. Conflicts with `max_unhealthy_node_threshold_percentage`.
+* `max_unhealthy_node_threshold_percentage` - (Optional) Percentage threshold of unhealthy nodes, above which node auto repair actions will stop. Conflicts with `max_unhealthy_node_threshold_count`.
+* `node_repair_config_overrides` - (Optional) Granular overrides for specific repair actions. See [`node_repair_config_overrides`](#node_repair_config_overrides-configuration-block) below for details.
+
+### node_repair_config_overrides Configuration Block
+
+* `min_repair_wait_time_mins` - (Required) Minimum time in minutes to wait before attempting to repair a node with the specified `node_monitoring_condition` and `node_unhealthy_reason`.
+* `node_monitoring_condition` - (Required) Unhealthy condition reported by the node monitoring agent that this override applies to.
+* `node_unhealthy_reason` - (Required) Reason reported by the node monitoring agent that this override applies to.
+* `repair_action` - (Required) Repair action to take for nodes when all of the specified conditions are met. Valid values are those accepted by the EKS `RepairAction` type, such as `Replace` and `Reboot`.
 
 ### remote_access Configuration Block
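
For reference, the new arguments compose as in the following minimal configuration sketch, which mirrors the acceptance test configurations above. It is illustrative only and not part of the diff: the referenced cluster, IAM role, and subnet resources are assumed to exist elsewhere in the configuration.

```terraform
resource "aws_eks_node_group" "example" {
  cluster_name    = aws_eks_cluster.example.name
  node_group_name = "example"
  node_role_arn   = aws_iam_role.node.arn
  subnet_ids      = aws_subnet.example[*].id

  scaling_config {
    desired_size = 1
    max_size     = 3
    min_size     = 1
  }

  node_repair_config {
    enabled = true

    # Count-based limits; the *_percentage variants may be used instead,
    # but the count and percentage forms of the same setting conflict.
    max_parallel_nodes_repaired_count  = 2
    max_unhealthy_node_threshold_count = 3

    # Optional granular override for a specific unhealthy condition/reason.
    node_repair_config_overrides {
      min_repair_wait_time_mins = 30
      node_monitoring_condition = "NodeNotReady"
      node_unhealthy_reason     = "NetworkUnavailable"
      repair_action             = "Replace"
    }
  }
}
```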