Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions iac/provider-gcp/nomad-cluster-disk-image/main.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,13 @@ build {
"echo 'net.netfilter.nf_conntrack_max = 2097152' | sudo tee -a /etc/sysctl.conf",
]
}

# Block GCE's gce-resolved.conf to prevent DNS conflicts with Consul.
# The divert is best-effort: a failure is reported on stderr but does not fail
# the image build, and the success message is only printed when dpkg-divert
# actually succeeded.
provisioner "shell" {
  inline = [
    "echo 'Blocking gce-resolved.conf to prevent DNS conflicts with Consul DNS'",
    "if sudo dpkg-divert --add --rename --divert /etc/systemd/resolved.conf.d/gce-resolved.conf.diverted /etc/systemd/resolved.conf.d/gce-resolved.conf; then echo 'dpkg-divert configured successfully'; else echo 'WARNING: dpkg-divert failed; gce-resolved.conf was not diverted' >&2; fi",
  ]
}
}
59 changes: 56 additions & 3 deletions iac/provider-gcp/nomad-cluster/scripts/start-client.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,16 @@ cat <<EOF >/etc/systemd/resolved.conf.d/consul.conf
[Resolve]
DNS=127.0.0.1:8600
DNSSEC=false
Domains=~consul
EOF
systemctl restart systemd-resolved
sync # Ensure file is written to disk

# GCP-specific fix: move GCE's systemd-resolved drop-in aside so that it does
# not compete with Consul DNS. Routing domains are not needed because Consul
# handles ALL DNS:
#   - .consul queries are served directly by Consul
#   - other queries are forwarded to GCE DNS via Consul's recursor config
gce_resolved_conf=/etc/systemd/resolved.conf.d/gce-resolved.conf
if test -f "$gce_resolved_conf"; then
  mv "$gce_resolved_conf" "$gce_resolved_conf.disabled"
fi

# Set up huge pages
# We are not enabling Transparent Huge Pages for now, as they are not swappable and may result in slowdowns + we are not using swap right now.
Expand Down Expand Up @@ -260,13 +267,59 @@ overcommitment_hugepages=$(remove_decimal $overcommitment_hugepages)
echo "- Allocating $overcommitment_hugepages huge pages ($overcommitment_hugepages_percentage%) for overcommitment"
echo $overcommitment_hugepages >/proc/sys/vm/nr_overcommit_hugepages

# Get GCE DNS server dynamically from metadata for Consul recursors
# This ensures we can resolve internet domains through Consul
# -f makes curl fail on HTTP errors instead of capturing the error body as the
# DNS address; the timeouts keep the script from hanging on the metadata call.
# Falls back to 169.254.169.254 when the request fails or returns nothing.
# NOTE(review): written as $GCE_DNS without braces; this file appears to be a
# Terraform template, where a braced form would need the $$ escape - confirm.
GCE_DNS=$(curl -sf --connect-timeout 2 --max-time 5 -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/dns-servers) || GCE_DNS=""
if [ -z "$GCE_DNS" ]; then
  GCE_DNS="169.254.169.254"
fi

# Start Consul first (in background) with GCE DNS as recursor
# This allows Consul to handle both .consul queries AND forward internet queries
# These variables are passed in via Terraform template interpolation
# (GCE_DNS is a shell variable, hence the $$ escape on the last argument).
# The whole invocation is a single backslash-continued command; only the final
# line carries the trailing & so that --recursor is part of the same command.
/opt/consul/bin/run-consul.sh --client \
  --consul-token "${CONSUL_TOKEN}" \
  --cluster-tag-name "${CLUSTER_TAG_NAME}" \
  --enable-gossip-encryption \
  --gossip-encryption-key "${CONSUL_GOSSIP_ENCRYPTION_KEY}" \
  --dns-request-token "${CONSUL_DNS_REQUEST_TOKEN}" \
  --recursor "$${GCE_DNS}" &

# Give Consul a moment to start its DNS server on port 8600
echo "- Waiting for Consul DNS to start on port 8600..."
for i in {1..10}; do
if nc -z 127.0.0.1 8600 2>/dev/null; then
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nslookup returns non-zero if the lookup fails, so it could be used here for a more meaningful check.

Suggested change
if nc -z 127.0.0.1 8600 2>/dev/null; then
if nslookup google.com; then

echo "- Consul DNS is ready (attempt $i/10)"
break
fi
if [ $i -eq 10 ]; then
echo "- ERROR: Consul DNS not responding after 10 seconds, exiting..."
exit 1
fi
sleep 1
done

# Restart systemd-resolved so that it picks up the Consul DNS configuration.
# This has to happen AFTER Consul is up; otherwise systemd-resolved marks
# 127.0.0.1:8600 as unreachable.
# Consul DNS (127.0.0.1:8600) is the ONLY DNS server configured in
# systemd-resolved: it answers .consul itself and sends everything else to
# GCE DNS through its recursor.
echo "[Configuring systemd-resolved for Consul DNS]"
echo "- Restarting systemd-resolved to apply Consul DNS config"
systemctl restart systemd-resolved
echo "- Waiting for systemd-resolved to settle"

# Poll until an external name resolves: at most 10 attempts, one second apart.
# The script exits with status 1 if the 10th attempt still fails.
echo "- Waiting for Systemd-resolved to start..."
attempt=1
until host google.com 2>/dev/null; do
  if [ $attempt -eq 10 ]; then
    echo "- ERROR: Systemd-resolved not responding after 10 seconds, exiting..."
    exit 1
  fi
  sleep 1
  attempt=$((attempt + 1))
done
echo "- DNS resolving is ready (attempt $attempt/10)"

# Drop any answers cached before the resolver configuration changed.
echo "- Flushing DNS caches"
resolvectl flush-caches

# Start the Nomad client in the background (trailing &).
# NOTE(review): CONSUL_TOKEN and NODE_POOL look like Terraform template
# interpolations, like the variables passed to the Consul invocation - confirm.
/opt/nomad/bin/run-nomad.sh --client --consul-token "${CONSUL_TOKEN}" --node-pool "${NODE_POOL}" &

Expand Down