From 6b9f1756df6db4756bcb48c602f601e901a4e75a Mon Sep 17 00:00:00 2001
From: Jiri Sveceny
Date: Thu, 25 Sep 2025 11:05:01 +0200
Subject: [PATCH 1/3] Traefik ingress running on api nodes

---
 iac/provider-gcp/nomad/jobs/ingress.hcl | 83 +++++++++++++++++++++++++
 iac/provider-gcp/nomad/main.tf          | 20 ++++++
 2 files changed, 103 insertions(+)
 create mode 100644 iac/provider-gcp/nomad/jobs/ingress.hcl

diff --git a/iac/provider-gcp/nomad/jobs/ingress.hcl b/iac/provider-gcp/nomad/jobs/ingress.hcl
new file mode 100644
index 0000000000..6f34441a6c
--- /dev/null
+++ b/iac/provider-gcp/nomad/jobs/ingress.hcl
@@ -0,0 +1,83 @@
+job "ingress" {
+  datacenters = ["${gcp_zone}"]
+  node_pool   = "${node_pool}"
+  type        = "system"
+  priority    = 90
+
+  group "ingress" {
+    network {
+      port "ingress" {
+        static = "${ingress_port}"
+      }
+
+      port "control" {
+        static = "${control_port}"
+      }
+    }
+
+# https://developer.hashicorp.com/nomad/docs/job-specification/update
+%{ if update_stanza }
+    update {
+      max_parallel = 1 # Update only 1 node at a time
+    }
+%{ endif }
+
+    // todo: health check
+    service {
+      port     = "ingress"
+      name     = "ingress"
+      provider = "nomad"
+    }
+
+    task "ingress" {
+      driver = "docker"
+
+      # If we need more than 30s we will need to update the max_kill_timeout in nomad
+      # https://developer.hashicorp.com/nomad/docs/configuration/client#max_kill_timeout
+      %{ if update_stanza }
+      kill_timeout = "24h"
+      %{ endif }
+
+      kill_signal = "SIGTERM"
+
+      config {
+        network_mode = "host"
+        image        = "traefik:v3.5"
+        ports        = ["control", "ingress"]
+        args = [
+          # Entry-points that are set internally by Traefik
+          "--entrypoints.web.address=:${ingress_port}",
+          "--entrypoints.traefik.address=:${control_port}",
+
+          # Traefik internals (logging, metrics, ...)
+          "--api.dashboard=true",
+          "--api.insecure=false",
+
+          "--accesslog=true",
+          "--ping=true",
+          "--ping.entryPoint=web",
+          "--metrics=true",
+          "--metrics.prometheus=true",
+          "--metrics.prometheus.entryPoint=traefik",
+
+          # Traefik Nomad provider
+          "--providers.nomad=true",
+          "--providers.nomad.endpoint.address=${nomad_endpoint}",
+          "--providers.nomad.endpoint.token=${nomad_token}",
+
+          # Traefik Consul provider
+          "--providers.consulcatalog=true",
+          "--providers.consulcatalog.exposedByDefault=false",
+          "--providers.consulcatalog.endpoint.address=${consul_endpoint}",
+          "--providers.consulcatalog.endpoint.token=${consul_token}",
+        ]
+      }
+
+      resources {
+        memory_max = ${memory_mb * 1.5}
+        memory     = ${memory_mb}
+        cpu        = ${cpu_count * 1000}
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/iac/provider-gcp/nomad/main.tf b/iac/provider-gcp/nomad/main.tf
index 24bf4f944e..eb0eeb2ca5 100644
--- a/iac/provider-gcp/nomad/main.tf
+++ b/iac/provider-gcp/nomad/main.tf
@@ -68,6 +68,26 @@ resource "docker_image" "db_migrator_image" {
   platform = "linux/amd64/v8"
 }
 
+resource "nomad_job" "ingress" {
+  jobspec = templatefile("${path.module}/jobs/ingress.hcl",
+    {
+      update_stanza = var.api_machine_count > 1
+      cpu_count     = var.api_resources_cpu_count
+      memory_mb     = var.api_resources_memory_mb
+      node_pool     = var.api_node_pool
+      gcp_zone      = var.gcp_zone
+
+      ingress_port = 8800
+      control_port = 8900
+
+      nomad_endpoint = "http://localhost:4646"
+      nomad_token    = var.nomad_acl_token_secret
+
+      consul_token    = var.consul_acl_token_secret
+      consul_endpoint = "http://localhost:8500"
+    })
+}
+
 resource "nomad_job" "api" {
   jobspec = templatefile("${path.module}/jobs/api.hcl", {
     update_stanza = var.api_machine_count > 1

From b44e506fec69d13027dc259cea44370476451cf3 Mon Sep 17 00:00:00 2001
From: Jiri Sveceny
Date: Thu, 25 Sep 2025 12:45:35 +0200
Subject: [PATCH 2/3] ingress routing, removed port mapping for other services

---
 iac/provider-gcp/nomad-cluster/main.tf        |  39 ++++---
 .../nomad-cluster/network/main.tf             | 108 ++++++++----------
 .../nomad-cluster/network/variables.tf        |  65 ++++++-----
 .../nomad-cluster/nodepool-api.tf             |  21 +---
 .../nomad-cluster/nodepool-build.tf           |   5 -
 .../nomad-cluster/nodepool-clickhouse.tf      |   5 -
 iac/provider-gcp/nomad/jobs/ingress.hcl       |  18 ++-
 iac/provider-gcp/nomad/main.tf                |   4 +-
 8 files changed, 128 insertions(+), 137 deletions(-)

diff --git a/iac/provider-gcp/nomad-cluster/main.tf b/iac/provider-gcp/nomad-cluster/main.tf
index a7a1ac909d..a2e88b4a0a 100644
--- a/iac/provider-gcp/nomad-cluster/main.tf
+++ b/iac/provider-gcp/nomad-cluster/main.tf
@@ -92,15 +92,15 @@ module "network" {
 
   gcp_project_id = var.gcp_project_id
 
-  api_port                  = var.api_port
-  docker_reverse_proxy_port = var.docker_reverse_proxy_port
-  network_name              = var.network_name
-  domain_name               = var.domain_name
-  additional_domains        = var.additional_domains
+  // api_port                  = var.api_port
+  // docker_reverse_proxy_port = var.docker_reverse_proxy_port
+  network_name       = var.network_name
+  domain_name        = var.domain_name
+  additional_domains = var.additional_domains
 
-  client_instance_group    = google_compute_region_instance_group_manager.client_pool.instance_group
-  client_proxy_port        = var.edge_proxy_port
-  client_proxy_health_port = var.edge_api_port
+  client_instance_group = google_compute_region_instance_group_manager.client_pool.instance_group
+  //client_proxy_port        = var.edge_proxy_port
+  //client_proxy_health_port = var.edge_api_port
 
   api_instance_group   = google_compute_instance_group_manager.api_pool.instance_group
   build_instance_group = google_compute_instance_group_manager.build_pool.instance_group
@@ -115,14 +115,23 @@ module "network" {
   labels = var.labels
   prefix = var.prefix
 
-  additional_api_path_rules = [
-    for service in var.additional_api_services : {
-      paths      = service.paths
-      service_id = service.service_id
-    }
-  ]
+  //additional_api_path_rules = [
+  //  for service in var.additional_api_services : {
+  //    paths = service.paths
+  //    service_id = service.service_id
+  //  }
+  //]
 
-  additional_ports = [for service in var.additional_api_services : service.api_node_group_port]
+  //additional_ports = [for service in var.additional_api_services : service.api_node_group_port]
+
+
+  ingress = {
+    port_name = "ingress"
+    port      = 8800
+    health_port_name = "ingress-health"
+    health_port      = 8900
+    health_path      = "/ping"
+  }
 }
 
 module "filestore" {
diff --git a/iac/provider-gcp/nomad-cluster/network/main.tf b/iac/provider-gcp/nomad-cluster/network/main.tf
index ea46f7b29e..483225a8f2 100644
--- a/iac/provider-gcp/nomad-cluster/network/main.tf
+++ b/iac/provider-gcp/nomad-cluster/network/main.tf
@@ -20,12 +20,16 @@ locals {
   parts        = split(".", var.domain_name)
   is_subdomain = length(local.parts) > 2
 
+  // Take everything except last 2 parts
   subdomain   = local.is_subdomain ? join(".", slice(local.parts, 0, length(local.parts) - 2)) : ""
+  // Take last 2 parts (1 dot)
   root_domain = local.is_subdomain ? join(".", slice(local.parts, length(local.parts) - 2, length(local.parts))) : var.domain_name
 
   backends = {
+
+    /*
     session = {
       protocol                        = "HTTP"
      port                            = var.client_proxy_port.port
      port_name                       = var.client_proxy_port.name
      timeout_sec                     = 86400
@@ -66,6 +70,32 @@ locals {
       }
       groups = [{ group = var.build_instance_group }]
     }
+*/
+
+
+
+
+
+
+
+
+
+    ingress = {
+      protocol                        = "HTTP"
+      port                            = var.ingress.port
+      port_name                       = var.ingress.port_name
+      timeout_sec                     = 86400
+      connection_draining_timeout_sec = 1
+      http_health_check = {
+        request_path       = var.ingress.health_path
+        port               = var.ingress.health_port
+        timeout_sec        = 3
+        check_interval_sec = 3
+      }
+      groups = [{ group = var.api_instance_group }]
+    }
+
     nomad = {
       protocol = "HTTP"
       port     = 80
@@ -78,6 +108,7 @@ locals {
       }
       groups = [{ group = var.server_instance_group }]
     }
+
     consul = {
       protocol = "HTTP"
       port     = 80
@@ -251,17 +282,7 @@ resource "google_certificate_manager_certificate_map_entry" "subdomains_map_entr
 # Load balancers
 resource "google_compute_url_map" "orch_map" {
   name            = "${var.prefix}orch-map"
-  default_service = google_compute_backend_service.default["nomad"].self_link
-
-  host_rule {
-    hosts        = concat(["api.${var.domain_name}"], [for d in var.additional_domains : "api.${d}"])
-    path_matcher = "api-paths"
-  }
-
-  host_rule {
-    hosts        = concat(["docker.${var.domain_name}"], [for d in var.additional_domains : "docker.${d}"])
-    path_matcher = "docker-reverse-proxy-paths"
-  }
+  default_service = google_compute_backend_service.default["ingress"].self_link
 
   host_rule {
     hosts        = concat(["nomad.${var.domain_name}"], [for d in var.additional_domains : "nomad.${d}"])
@@ -273,33 +294,10 @@ resource "google_compute_url_map" "orch_map" {
     hosts        = concat(["consul.${var.domain_name}"], [for d in var.additional_domains : "consul.${d}"])
     path_matcher = "consul-paths"
   }
 
-  host_rule {
-    hosts        = concat(["*.${var.domain_name}"], [for d in var.additional_domains : "*.${d}"])
-    path_matcher = "session-paths"
-  }
-
-  path_matcher {
-    name            = "api-paths"
-    default_service = google_compute_backend_service.default["api"].self_link
-
-    dynamic "path_rule" {
-      for_each = var.additional_api_path_rules
-      content {
-        paths   = path_rule.value.paths
-        service = path_rule.value.service_id
-      }
-    }
-  }
-
-  path_matcher {
-    name            = "docker-reverse-proxy-paths"
-    default_service = google_compute_backend_service.default["docker-reverse-proxy"].self_link
-  }
-
-  path_matcher {
-    name            = "session-paths"
-    default_service = google_compute_backend_service.default["session"].self_link
-  }
+  //host_rule {
+  //  hosts        = concat(["*.${var.domain_name}"], [for d in var.additional_domains : "*.${d}"])
+  //  path_matcher = "ingress-paths"
+  //}
 
   path_matcher {
     name            = "nomad-paths"
     default_service = google_compute_backend_service.default["nomad"].self_link
@@ -322,6 +320,11 @@ resource "google_compute_url_map" "orch_map" {
     name            = "consul-paths"
     default_service = google_compute_backend_service.default["consul"].self_link
   }
+
+  //path_matcher {
+  //  name            = "ingress-paths"
+  //  default_service = google_compute_backend_service.default["ingress"].self_link
+  //}
 }
 
 ### IPv4 block ###
@@ -526,32 +529,10 @@ resource "google_compute_firewall" "default-hc" {
     }
   }
 
-  dynamic "allow" {
-    for_each = toset(var.additional_ports)
-
-    content {
-      protocol = "tcp"
-      ports    = [allow.value]
-    }
-  }
-}
-
-resource "google_compute_firewall" "client_proxy_firewall_ingress" {
-  name    = "${var.prefix}${var.cluster_tag_name}-client-proxy-firewall-ingress"
-  network = var.network_name
-
   allow {
     protocol = "tcp"
-    ports    = ["3002"]
+    ports    = [var.ingress.port]
   }
-
-  priority = 999
-
-  direction   = "INGRESS"
-  target_tags = [var.cluster_tag_name]
-  # Load balancer health check IP ranges
-  # https://cloud.google.com/load-balancing/docs/health-check-concepts
-  source_ranges = ["130.211.0.0/22", "35.191.0.0/16"]
 }
 
 resource "google_compute_firewall" "logs_collector_firewall_ingress" {
@@ -629,6 +610,9 @@ resource "google_compute_firewall" "orch_firewall_egress" {
   target_tags = [var.cluster_tag_name]
 }
 
+/*
+todo
+
 # Security policy
 
 resource "google_compute_security_policy_rule" "api-throttling-api-key" {
@@ -760,7 +744,7 @@ resource "google_compute_security_policy_rule" "disable-consul" {
     }
   }
 }
-
+*/
 
 resource "google_compute_security_policy" "disable-bots-log-collector" {
   name = "disable-bots-log-collector"
diff --git a/iac/provider-gcp/nomad-cluster/network/variables.tf b/iac/provider-gcp/nomad-cluster/network/variables.tf
index ea7c9c6b9f..ae6b50ba98 100644
--- a/iac/provider-gcp/nomad-cluster/network/variables.tf
+++ b/iac/provider-gcp/nomad-cluster/network/variables.tf
@@ -32,34 +32,14 @@ variable "cloudflare_api_token_secret_name" {
   type = string
 }
 
-variable "api_port" {
+variable "ingress" {
   type = object({
-    name        = string
-    port        = number
-    health_path = string
-  })
-}
+    port_name = string
+    port      = number
 
-variable "docker_reverse_proxy_port" {
-  type = object({
-    name        = string
-    port        = number
-    health_path = string
-  })
-}
-
-variable "client_proxy_health_port" {
-  type = object({
-    name = string
-    port = number
-    path = string
-  })
-}
-
-variable "client_proxy_port" {
-  type = object({
-    name = string
-    port = number
+    health_port_name = string
+    health_port      = number
+    health_path      = string
   })
 }
 
@@ -103,6 +83,7 @@ variable "labels" {
   type = map(string)
 }
 
+/*
 variable "additional_api_path_rules" {
   description = "Additional path rules to add to the load balancer routing."
   type = list(object({
     paths      = list(string)
     service_id = string
   }))
 }
 
 variable "additional_ports" {
   description = "Additional ports to expose on the load balancer."
   type        = list(number)
 }
+
+variable "api_port" {
+  type = object({
+    name        = string
+    port        = number
+    health_path = string
+  })
+}
+
+variable "docker_reverse_proxy_port" {
+  type = object({
+    name        = string
+    port        = number
+    health_path = string
+  })
+}
+
+variable "client_proxy_health_port" {
+  type = object({
+    name = string
+    port = number
+    path = string
+  })
+}
+
+variable "client_proxy_port" {
+  type = object({
+    name = string
+    port = number
+  })
+}
+*/
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-api.tf b/iac/provider-gcp/nomad-cluster/nodepool-api.tf
index a8e24ac9e1..b2f03811b5 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-api.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-api.tf
@@ -52,26 +52,13 @@ resource "google_compute_instance_group_manager" "api_pool" {
   }
 
   named_port {
-    name = var.edge_api_port.name
-    port = var.edge_api_port.port
+    name = "ingress"
+    port = 8800
   }
 
   named_port {
-    name = var.edge_proxy_port.name
-    port = var.edge_proxy_port.port
-  }
-
-  named_port {
-    name = var.api_port.name
-    port = var.api_port.port
-  }
-
-  dynamic "named_port" {
-    for_each = local.api_additional_ports
-    content {
-      name = "${var.prefix}${named_port.value.name}"
-      port = named_port.value.port
-    }
+    name = "ingress-health"
+    port = 8900
   }
 
   auto_healing_policies {
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-build.tf b/iac/provider-gcp/nomad-cluster/nodepool-build.tf
index efe3f1b5a0..44b9fa7a02 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-build.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-build.tf
@@ -50,11 +50,6 @@ resource "google_compute_instance_group_manager" "build_pool" {
     instance_template = google_compute_instance_template.build.id
   }
 
-  named_port {
-    name = var.docker_reverse_proxy_port.name
-    port = var.docker_reverse_proxy_port.port
-  }
-
   auto_healing_policies {
     health_check      = google_compute_health_check.build_nomad_check.id
     initial_delay_sec = 600
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf b/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf
index 0229473925..8159f313fb 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-clickhouse.tf
@@ -44,11 +44,6 @@ resource "google_compute_instance_group_manager" "clickhouse_pool" {
     instance_template = google_compute_instance_template.clickhouse.id
   }
 
-  named_port {
-    name = var.clickhouse_health_port.name
-    port = var.clickhouse_health_port.port
-  }
-
   auto_healing_policies {
     health_check      = google_compute_health_check.clickhouse_nomad_check.id
     initial_delay_sec = 600
diff --git a/iac/provider-gcp/nomad/jobs/ingress.hcl b/iac/provider-gcp/nomad/jobs/ingress.hcl
index 6f34441a6c..b675fd4f2e 100644
--- a/iac/provider-gcp/nomad/jobs/ingress.hcl
+++ b/iac/provider-gcp/nomad/jobs/ingress.hcl
@@ -22,11 +22,19 @@ job "ingress" {
     }
 %{ endif }
 
-    // todo: health check
     service {
-      port     = "ingress"
-      name     = "ingress"
-      provider = "nomad"
+      port = "ingress"
+      name = "ingress"
+      task = "ingress"
+
+      check {
+        type     = "http"
+        name     = "health"
+        path     = "/ping"
+        interval = "3s"
+        timeout  = "3s"
+        port     = "${control_port}"
+      }
     }
 
     task "ingress" {
@@ -55,7 +63,7 @@ job "ingress" {
 
           "--accesslog=true",
           "--ping=true",
-          "--ping.entryPoint=web",
+          "--ping.entryPoint=traefik",
           "--metrics=true",
           "--metrics.prometheus=true",
           "--metrics.prometheus.entryPoint=traefik",
diff --git a/iac/provider-gcp/nomad/main.tf b/iac/provider-gcp/nomad/main.tf
index eb0eeb2ca5..1af774e097 100644
--- a/iac/provider-gcp/nomad/main.tf
+++ b/iac/provider-gcp/nomad/main.tf
@@ -72,8 +72,8 @@ resource "nomad_job" "ingress" {
   jobspec = templatefile("${path.module}/jobs/ingress.hcl",
     {
       update_stanza = var.api_machine_count > 1
-      cpu_count     = var.api_resources_cpu_count
-      memory_mb     = var.api_resources_memory_mb
+      cpu_count     = 1
+      memory_mb     = 512
       node_pool     = var.api_node_pool
       gcp_zone      = var.gcp_zone
 

From c12ffdf59ec715f25be18defa7cb6b5d30a5c7cd Mon Sep 17 00:00:00 2001
From: Jiri Sveceny
Date: Thu, 25 Sep 2025 12:45:57 +0200
Subject: [PATCH 3/3] routing for api, edge api, docker proxy, sandboxes

---
 iac/provider-gcp/nomad/jobs/api.hcl             |  9 ++++++---
 .../nomad/jobs/docker-reverse-proxy.hcl         | 13 +++++++++----
 iac/provider-gcp/nomad/jobs/edge.hcl            | 19 ++++++++++++++-----
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/iac/provider-gcp/nomad/jobs/api.hcl b/iac/provider-gcp/nomad/jobs/api.hcl
index b81bdeb0e2..714501ebf7 100644
--- a/iac/provider-gcp/nomad/jobs/api.hcl
+++ b/iac/provider-gcp/nomad/jobs/api.hcl
@@ -37,6 +37,12 @@ job "api" {
       port = "${port_number}"
       task = "start"
 
+      tags = [
+        "traefik.enable=true",
+        "traefik.http.routers.api.rule=Host(`api.e2b-jirka.dev`)",
+        "traefik.http.routers.api.priority=10"
+      ]
+
       check {
         type = "http"
         name = "health"
@@ -47,7 +53,6 @@ job "api" {
       }
     }
 
-%{ if update_stanza }
 # An update stanza to enable rolling updates of the service
   update {
     # The number of extra instances to run during the update
@@ -61,7 +66,6 @@ job "api" {
     # Whether to promote the canary if the rest of the group is not healthy
     auto_promote = true
   }
-%{ endif }
 
   task "start" {
     driver = "docker"
@@ -109,7 +113,6 @@ job "api" {
     }
 
     config {
-      network_mode = "host"
       image = "${api_docker_image}"
       ports = ["${port_name}"]
       args = [
diff --git a/iac/provider-gcp/nomad/jobs/docker-reverse-proxy.hcl b/iac/provider-gcp/nomad/jobs/docker-reverse-proxy.hcl
index 781ee7cbdd..075bdaac35 100644
--- a/iac/provider-gcp/nomad/jobs/docker-reverse-proxy.hcl
+++ b/iac/provider-gcp/nomad/jobs/docker-reverse-proxy.hcl
@@ -24,6 +24,12 @@ job "docker-reverse-proxy" {
       name = "docker-reverse-proxy"
       port = "${port_name}"
 
+      tags = [
+        "traefik.enable=true",
+        "traefik.http.routers.docker_reverse_proxy.rule=Host(`docker.e2b-jirka.dev`)",
+        "traefik.http.routers.docker_reverse_proxy.priority=12"
+      ]
+
       check {
         type = "http"
         name = "health"
@@ -53,10 +59,9 @@ job "docker-reverse-proxy" {
     }
 
     config {
-      network_mode = "host"
-      image        = "${image_name}"
-      ports        = ["${port_name}"]
-      args         = [
+      image = "${image_name}"
+      ports = ["${port_name}"]
+      args  = [
         "--port", "${port_number}",
       ]
     }
diff --git a/iac/provider-gcp/nomad/jobs/edge.hcl b/iac/provider-gcp/nomad/jobs/edge.hcl
index 3a473d2f6f..99ab9b8680 100644
--- a/iac/provider-gcp/nomad/jobs/edge.hcl
+++ b/iac/provider-gcp/nomad/jobs/edge.hcl
@@ -44,6 +44,12 @@ job "client-proxy" {
       name = "proxy"
       port = "${proxy_port_name}"
 
+      tags = [
+        "traefik.enable=true",
+        "traefik.http.routers.edge_proxy.rule=HostRegexp(`^.+\\.e2b-jirka\\.dev$`)",
+        "traefik.http.routers.edge_proxy.priority=1" // make it the lowest priority router as it will catch all requests
+      ]
+
       check {
         type = "http"
         name = "health"
@@ -58,6 +64,12 @@ job "client-proxy" {
       name = "edge-api"
       port = "${api_port}"
 
+      tags = [
+        "traefik.enable=true",
+        "traefik.http.routers.edge_api.rule=Host(`edge.e2b-jirka.dev`)",
+        "traefik.http.routers.edge_api.priority=11"
+      ]
+
       check {
         type = "http"
         name = "health"
@@ -68,7 +80,6 @@ job "client-proxy" {
       }
     }
 
-%{ if update_stanza }
 # An update stanza to enable rolling updates of the service
   update {
     # The number of extra instances to run during the update
@@ -84,7 +95,6 @@ job "client-proxy" {
     # Deadline for the update to be completed
     progress_deadline = "24h"
   }
-%{ endif }
 
   task "start" {
     driver = "docker"
@@ -137,9 +147,8 @@ job "client-proxy" {
     }
 
     config {
-      network_mode = "host"
-      image        = "${image_name}"
-      ports        = ["${proxy_port_name}", "${api_port_name}"]
+      image = "${image_name}"
+      ports = ["${proxy_port_name}", "${api_port_name}"]
     }
   }
 }
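
Note on the routing pattern used in PATCH 3: with "--providers.consulcatalog.exposedByDefault=false", Traefik only routes to services that opt in through tags, and among matching routers the one with the highest priority wins; the edge proxy's catch-all HostRegexp router uses priority 1 so any more specific Host rule takes precedence. A minimal sketch of how a further Nomad service could opt in to this ingress follows. The service name, port label, and hostname below are hypothetical examples for illustration only, not part of the patches above.

  service {
    name = "my-service"   # hypothetical service registered in Consul
    port = "http"         # hypothetical named port from the group's network block

    tags = [
      "traefik.enable=true",
      # Hostname is an example; the patches above route api.*, docker.*, edge.* and *.<domain>.
      "traefik.http.routers.my_service.rule=Host(`my-service.example.com`)",
      # Any priority above 1 takes precedence over the catch-all sandbox router.
      "traefik.http.routers.my_service.priority=20",
    ]
  }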