Skip to content

Commit 173622f

Browse files
authored
Fix GCP Consul DNS resolution with two-layer protection (#1430)
1 parent c54080f commit 173622f

File tree

2 files changed

+65
-3
lines changed

2 files changed

+65
-3
lines changed

iac/provider-gcp/nomad-cluster-disk-image/main.pkr.hcl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,13 @@ build {
151151
"echo 'net.netfilter.nf_conntrack_max = 2097152' | sudo tee -a /etc/sysctl.conf",
152152
]
153153
}
154+
155+
# Block GCE's gce-resolved.conf to prevent DNS conflicts with Consul
156+
provisioner "shell" {
157+
inline = [
158+
"echo 'Blocking gce-resolved.conf to prevent DNS conflicts with Consul DNS'",
159+
"sudo dpkg-divert --add --rename --divert /etc/systemd/resolved.conf.d/gce-resolved.conf.diverted /etc/systemd/resolved.conf.d/gce-resolved.conf || true",
160+
"echo 'dpkg-divert configured successfully'",
161+
]
162+
}
154163
}

iac/provider-gcp/nomad-cluster/scripts/start-client.sh

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,16 @@ cat <<EOF >/etc/systemd/resolved.conf.d/consul.conf
184184
[Resolve]
185185
DNS=127.0.0.1:8600
186186
DNSSEC=false
187-
Domains=~consul
188187
EOF
189-
systemctl restart systemd-resolved
188+
sync # Ensure file is written to disk
189+
190+
# Remove GCE's DNS config to prevent it from competing with Consul DNS (GCP-specific fix)
191+
# We don't need routing domains since Consul handles ALL DNS:
192+
# - .consul queries: served directly by Consul
193+
# - other queries: forwarded to GCE DNS via Consul's recursor config
194+
if [ -f /etc/systemd/resolved.conf.d/gce-resolved.conf ]; then
195+
mv /etc/systemd/resolved.conf.d/gce-resolved.conf /etc/systemd/resolved.conf.d/gce-resolved.conf.disabled
196+
fi
190197

191198
# Set up huge pages
192199
# We are not enabling Transparent Huge Pages for now, as they are not swappable and may result in slowdowns + we are not using swap right now.
@@ -260,13 +267,59 @@ overcommitment_hugepages=$(remove_decimal $overcommitment_hugepages)
260267
echo "- Allocating $overcommitment_hugepages huge pages ($overcommitment_hugepages_percentage%) for overcommitment"
261268
echo $overcommitment_hugepages >/proc/sys/vm/nr_overcommit_hugepages
262269

270+
# Get GCE DNS server dynamically from metadata for Consul recursors
271+
# This ensures we can resolve internet domains through Consul
272+
# -f makes curl exit non-zero on HTTP errors so an error page is never used as the recursor (the fallback applies instead); --max-time keeps a stalled metadata call from hanging startup
GCE_DNS=$(curl -sf --max-time 5 -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/dns-servers || echo "169.254.169.254")
273+
274+
# Start Consul first (in background) with GCE DNS as recursor
275+
# This allows Consul to handle both .consul queries AND forward internet queries
263276
# These variables are passed in via Terraform template interpolation
264277
/opt/consul/bin/run-consul.sh --client \
265278
--consul-token "${CONSUL_TOKEN}" \
266279
--cluster-tag-name "${CLUSTER_TAG_NAME}" \
267280
--enable-gossip-encryption \
268281
--gossip-encryption-key "${CONSUL_GOSSIP_ENCRYPTION_KEY}" \
269-
--dns-request-token "${CONSUL_DNS_REQUEST_TOKEN}" &
282+
--dns-request-token "${CONSUL_DNS_REQUEST_TOKEN}" \
283+
--recursor "$${GCE_DNS}" &
284+
285+
# Give Consul a moment to start its DNS server on port 8600
286+
echo "- Waiting for Consul DNS to start on port 8600..."
287+
for i in {1..10}; do
288+
if nc -z 127.0.0.1 8600 2>/dev/null; then
289+
echo "- Consul DNS is ready (attempt $i/10)"
290+
break
291+
fi
292+
if [ $i -eq 10 ]; then
293+
echo "- ERROR: Consul DNS not responding after 10 seconds, exiting..."
294+
exit 1
295+
fi
296+
sleep 1
297+
done
298+
299+
# Now restart systemd-resolved to apply Consul DNS configuration
300+
# This must happen AFTER Consul starts, otherwise systemd-resolved marks 127.0.0.1:8600 as unreachable
301+
# Consul DNS (127.0.0.1:8600) is the ONLY DNS server configured in systemd-resolved
302+
# Consul handles ALL queries: .consul directly, everything else via recursor to GCE DNS
303+
echo "[Configuring systemd-resolved for Consul DNS]"
304+
echo "- Restarting systemd-resolved to apply Consul DNS config"
305+
systemctl restart systemd-resolved
306+
echo "- Waiting for systemd-resolved to settle"
307+
308+
# Wait until systemd-resolved can resolve external names through Consul DNS
309+
echo "- Waiting for Systemd-resolved to start..."
310+
for i in {1..10}; do
311+
if host google.com 2>/dev/null; then
312+
echo "- DNS resolving is ready (attempt $i/10)"
313+
break
314+
fi
315+
if [ $i -eq 10 ]; then
316+
echo "- ERROR: Systemd-resolved not responding after 10 seconds, exiting..."
317+
exit 1
318+
fi
319+
sleep 1
320+
done
321+
echo "- Flushing DNS caches"
322+
resolvectl flush-caches
270323

271324
/opt/nomad/bin/run-nomad.sh --client --consul-token "${CONSUL_TOKEN}" --node-pool "${NODE_POOL}" &
272325

0 commit comments

Comments
 (0)