@@ -184,9 +184,16 @@ cat <<EOF >/etc/systemd/resolved.conf.d/consul.conf
184184[Resolve]
185185DNS=127.0.0.1:8600
186186DNSSEC=false
187- Domains=~consul
188187EOF
189- systemctl restart systemd-resolved
188+ sync # Ensure file is written to disk
189+
190+ # Remove GCE's DNS config to prevent it from competing with Consul DNS (GCP-specific fix)
191+ # We don't need routing domains since Consul handles ALL DNS:
192+ # - .consul queries: served directly by Consul
193+ # - other queries: forwarded to GCE DNS via Consul's recursor config
194+ if [ -f /etc/systemd/resolved.conf.d/gce-resolved.conf ]; then
195+ mv /etc/systemd/resolved.conf.d/gce-resolved.conf /etc/systemd/resolved.conf.d/gce-resolved.conf.disabled
196+ fi
190197
191198# Set up huge pages
192199# We are not enabling Transparent Huge Pages for now: they are not swappable and may cause slowdowns, and we are not using swap right now.
@@ -260,13 +267,59 @@ overcommitment_hugepages=$(remove_decimal $overcommitment_hugepages)
260267echo " - Allocating $overcommitment_hugepages huge pages ($overcommitment_hugepages_percentage %) for overcommitment"
261268echo $overcommitment_hugepages > /proc/sys/vm/nr_overcommit_hugepages
262269
270+ # Get GCE DNS server dynamically from metadata for Consul recursors
271+ # This ensures we can resolve internet domains through Consul
272+ GCE_DNS=$(curl -s -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/dns-servers || echo "169.254.169.254")
273+
274+ # Start Consul first (in background) with GCE DNS as recursor
275+ # This allows Consul to handle both .consul queries AND forward internet queries
263276# These variables are passed in via Terraform template interpolation
264277/opt/consul/bin/run-consul.sh --client \
265278 --consul-token "${CONSUL_TOKEN}" \
266279 --cluster-tag-name "${CLUSTER_TAG_NAME}" \
267280 --enable-gossip-encryption \
268281 --gossip-encryption-key "${CONSUL_GOSSIP_ENCRYPTION_KEY}" \
269- --dns-request-token "${CONSUL_DNS_REQUEST_TOKEN}" &
282+ --dns-request-token "${CONSUL_DNS_REQUEST_TOKEN}" \
283+ --recursor "$${GCE_DNS}" &
284+
285+ # Poll once per second (up to 10 attempts) until Consul's DNS port 8600 accepts connections
286+ echo " - Waiting for Consul DNS to start on port 8600..."
287+ for i in {1..10}; do
288+ if nc -z 127.0.0.1 8600 2> /dev/null; then
289+ echo " - Consul DNS is ready (attempt $i /10)"
290+ break
291+ fi
292+ if [ $i -eq 10 ]; then
293+ echo " - ERROR: Consul DNS not responding after 10 seconds, exiting..."
294+ exit 1
295+ fi
296+ sleep 1
297+ done
298+
299+ # Now restart systemd-resolved to apply Consul DNS configuration
300+ # This must happen AFTER Consul starts, otherwise systemd-resolved marks 127.0.0.1:8600 as unreachable
301+ # Consul DNS (127.0.0.1:8600) is the ONLY DNS server configured in systemd-resolved
302+ # Consul handles ALL queries: .consul directly, everything else via recursor to GCE DNS
303+ echo " [Configuring systemd-resolved for Consul DNS]"
304+ echo " - Restarting systemd-resolved to apply Consul DNS config"
305+ systemctl restart systemd-resolved
306+ echo " - Waiting for systemd-resolved to settle"
307+
308+ # Wait until name resolution works again after the systemd-resolved restart (probed with `host google.com`)
309+ echo " - Waiting for Systemd-resolved to start..."
310+ for i in {1..10}; do
311+ if host google.com 2> /dev/null; then
312+ echo " - DNS resolving is ready (attempt $i /10)"
313+ break
314+ fi
315+ if [ $i -eq 10 ]; then
316+ echo " - ERROR: Systemd-resolved not responding after 10 seconds, exiting..."
317+ exit 1
318+ fi
319+ sleep 1
320+ done
321+ echo " - Flushing DNS caches"
322+ resolvectl flush-caches
270323
271324/opt/nomad/bin/run-nomad.sh --client --consul-token "${CONSUL_TOKEN}" --node-pool "${NODE_POOL}" &
272325
0 commit comments