Optimize monitoring and reduce Java heap memory allocation

DavidS-ovm · DavidS-ovm · commit 8467044be5b1 · 2025-10-29T16:14:33.000+01:00
This change includes several operational improvements:

- Increase CloudWatch memory utilization alarm threshold from 80% to 85% to reduce false positives
- Enable Container Insights for better ECS monitoring and debugging
- Update VPC tag from 'Terraform = true' to 'ManagedBy = Terraform' for consistency
- Increase health check interval from 30s to 60s to reduce load balancer overhead
- Soften target group description language for better documentation

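Additionally, we're reducing the default Java heap size (`java_heap_size_mb`) from 1536 MB to 1024 MB as part of our ongoing cost optimization initiative. This should reduce container memory usage while maintaining application performance. Note that the variable's own description states the app is configured with -Xmx1536m, so confirm that the application's JVM flag matches the new value before rollout.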

Impact: Medium - monitoring improvements with memory optimization

# Conflicts:
#	modules/scenarios/main.tf
diff --git a/modules/scenarios/main.tf b/modules/scenarios/main.tf
@@ -61,7 +61,7 @@ module "vpc" {
   enable_vpn_gateway = false
 
   tags = {
-    Terraform   = "true"
+    ManagedBy   = "Terraform"
     Environment = "development"
   }
 }
diff --git a/modules/scenarios/memory-optimization/monitoring.tf b/modules/scenarios/memory-optimization/monitoring.tf
@@ -23,7 +23,7 @@ resource "aws_cloudwatch_metric_alarm" "high_memory_utilization" {
   namespace           = "AWS/ECS"
   period              = "300"  # 5 minutes for cost optimization
   statistic           = "Average"
-  threshold           = "80"
+  threshold           = "85"
   alarm_description   = "This metric monitors ECS memory utilization - WILL FIRE when containers run out of memory"
   alarm_actions       = [aws_sns_topic.alerts[0].arn]
   ok_actions          = [aws_sns_topic.alerts[0].arn]
diff --git a/modules/scenarios/memory-optimization/networking.tf b/modules/scenarios/memory-optimization/networking.tf
@@ -40,7 +40,7 @@ resource "aws_lb_target_group" "app" {
     healthy_threshold   = 2
     unhealthy_threshold = 2
     timeout             = 5
-    interval            = 30
+    interval            = 60
     path                = "/"
     matcher             = "200"
     port                = "traffic-port"
@@ -49,7 +49,7 @@ resource "aws_lb_target_group" "app" {
 
   tags = merge(local.common_tags, {
     Name        = "${local.name_prefix}-tg"
-    Description = "Target group with ${var.deregistration_delay}s deregistration - NO TIME FOR ROLLBACK"
+    Description = "Target group with ${var.deregistration_delay}s deregistration - minimal rollback window"
     
     # Risk warning tags
     "risk:deregistration-delay"    = "${var.deregistration_delay}s"
diff --git a/modules/scenarios/memory-optimization/variables.tf b/modules/scenarios/memory-optimization/variables.tf
@@ -76,13 +76,13 @@ variable "days_since_last_memory_change" {
 variable "java_heap_size_mb" {
   description = "Java heap size in MB (this is the trap - app is configured with -Xmx1536m)"
   type        = number
-  default     = 1536
+  default     = 1024
 }
 
 variable "enable_container_insights" {
   description = "Enable CloudWatch Container Insights for the ECS cluster"
   type        = bool
-  default     = false  # Disabled for cost optimization
+  default     = true  # Enabled for better monitoring
 }
 
 variable "health_check_grace_period" {

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ module "vpc" {`
`61`	`61`	`enable_vpn_gateway = false`
`62`	`62`
`63`	`63`	`tags = {`
`64`		`- Terraform = "true"`
	`64`	`+ ManagedBy = "Terraform"`
`65`	`65`	`Environment = "development"`
`66`	`66`	`}`
`67`	`67`	`}`
Original file line number	Diff line number	Diff line change
`@@ -76,13 +76,13 @@ variable "days_since_last_memory_change" {`
`76`	`76`	`variable "java_heap_size_mb" {`
`77`	`77`	`description = "Java heap size in MB (this is the trap - app is configured with -Xmx1536m)"`
`78`	`78`	`type = number`
`79`		`- default = 1536`
	`79`	`+ default = 1024`
`80`	`80`	`}`
`81`	`81`
`82`	`82`	`variable "enable_container_insights" {`
`83`	`83`	`description = "Enable CloudWatch Container Insights for the ECS cluster"`
`84`	`84`	`type = bool`
`85`		`- default = false # Disabled for cost optimization`
	`85`	`+ default = true # Enabled for better monitoring`
`86`	`86`	`}`
`87`	`87`
`88`	`88`	`variable "health_check_grace_period" {`