Skip to content

Commit 399ac14

Browse files
committed
introduce tenant dns caching for CNAMES only
1 parent 750628b commit 399ac14

File tree

4 files changed

+123
-138
lines changed

4 files changed

+123
-138
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/balancerd/src/bin/balancerd.rs

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,10 @@ use std::path::PathBuf;
1818
use std::time::Duration;
1919

2020
use anyhow::Context;
21-
use hickory_resolver::{
22-
Resolver, config::*, name_server::TokioConnectionProvider, system_conf::read_system_conf,
23-
};
2421
use jsonwebtoken::DecodingKey;
2522
use mz_balancerd::{
2623
BUILD_INFO, BalancerConfig, BalancerResolver, BalancerService, CancellationResolver,
27-
FronteggResolver, SniResolver, create_default_resolver,
24+
FronteggResolver, SniResolver,
2825
};
2926
use mz_frontegg_auth::{
3027
Authenticator, AuthenticatorConfig, DEFAULT_REFRESH_DROP_FACTOR,
@@ -246,7 +243,7 @@ pub async fn run(args: ServiceArgs, tracing_handle: TracingHandle) -> Result<(),
246243

247244
(
248245
BalancerResolver::MultiTenant(
249-
create_default_resolver(),
246+
mz_balancerd::TenantDnsResolver::new(),
250247
FronteggResolver {
251248
auth,
252249
addr_template,
@@ -284,35 +281,8 @@ pub async fn run(args: ServiceArgs, tracing_handle: TracingHandle) -> Result<(),
284281
};
285282
drop(addrs);
286283

287-
// Create a resolver for static addresses with the same caching configuration
288-
let mut resolver_opts = ResolverOpts::default();
289-
resolver_opts.cache_size = 10000;
290-
resolver_opts.positive_max_ttl = Some(Duration::from_secs(10));
291-
resolver_opts.positive_min_ttl = Some(Duration::from_secs(9));
292-
resolver_opts.negative_min_ttl = Some(Duration::from_secs(1));
293-
294-
// Read system DNS configuration or fall back to defaults
295-
let (config, opts) = read_system_conf()
296-
.map(|(config, mut opts)| {
297-
// Override specific options while keeping system DNS servers
298-
opts.cache_size = resolver_opts.cache_size;
299-
opts.positive_max_ttl = resolver_opts.positive_max_ttl;
300-
opts.positive_min_ttl = resolver_opts.positive_min_ttl;
301-
opts.negative_min_ttl = resolver_opts.negative_min_ttl;
302-
(config, opts)
303-
})
304-
.unwrap_or_else(|err| {
305-
eprintln!("Failed to read system DNS configuration for static resolver, using defaults: {}", err);
306-
(ResolverConfig::default(), resolver_opts)
307-
});
308-
309284
(
310-
BalancerResolver::Static(
311-
Resolver::builder_with_config(config, TokioConnectionProvider::default())
312-
.with_options(opts)
313-
.build(),
314-
addr.clone(),
315-
),
285+
BalancerResolver::Static(mz_balancerd::TenantDnsResolver::new(), addr.clone()),
316286
CancellationResolver::Static(addr),
317287
)
318288
}

src/balancerd/src/lib.rs

Lines changed: 116 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ impl BalancerService {
356356
let port: u16 = port.parse().expect("unexpected port");
357357

358358
let https = HttpsBalancer {
359-
resolver: Arc::from(create_default_resolver()),
359+
resolver: Arc::from(TenantDnsResolver::new()),
360360
tls: https_tls,
361361
resolve_template: Arc::from(addr),
362362
port,
@@ -1049,7 +1049,7 @@ async fn cancel_request(
10491049
}
10501050

10511051
struct HttpsBalancer {
1052-
resolver: Arc<TokioResolver>,
1052+
resolver: Arc<TenantDnsResolver>,
10531053
tls: Option<ReloadingSslContext>,
10541054
resolve_template: Arc<str>,
10551055
port: u16,
@@ -1060,7 +1060,7 @@ struct HttpsBalancer {
10601060

10611061
impl HttpsBalancer {
10621062
async fn resolve(
1063-
resolver: &TokioResolver,
1063+
resolver: &TenantDnsResolver,
10641064
resolve_template: &str,
10651065
port: u16,
10661066
servername: Option<&str>,
@@ -1086,7 +1086,9 @@ impl HttpsBalancer {
10861086
let tenant = resolver.tenant(&addr).await;
10871087

10881088
// Now do the regular ip lookup, regardless of if there was a CNAME.
1089-
let envd_addr = lookup_with_resolver(resolver, &format!("{addr}:{port}")).await?;
1089+
let envd_addr = resolver
1090+
.resolve_tenant_from_sni_host(&format!("{addr}:{port}"))
1091+
.await?;
10901092

10911093
Ok(ResolvedAddr {
10921094
addr: envd_addr,
@@ -1100,21 +1102,15 @@ trait ResolverExt {
11001102
async fn tenant(&self, addr: &str) -> Option<String>;
11011103
}
11021104

1103-
impl ResolverExt for TokioResolver {
1105+
impl ResolverExt for TenantDnsResolver {
11041106
/// Finds the tenant of a DNS address. Errors or lack of cname resolution here are ok, because
11051107
/// this is only used for metrics.
11061108
async fn tenant(&self, addr: &str) -> Option<String> {
11071109
debug!("resolving tenant for {:?}", addr);
1108-
// Lookup the CNAME. If there's a CNAME, find the tenant.
1109-
let lookup = self.lookup(addr, RecordType::CNAME).await;
1110-
if let Ok(lookup) = lookup {
1111-
for record in lookup.iter() {
1112-
if let Some(cname) = record.as_cname() {
1113-
let cname = cname.to_string();
1114-
debug!("cname: {cname}");
1115-
return extract_tenant_from_cname(&cname);
1116-
}
1117-
}
1110+
// Lookup the CNAME using the caching resolver
1111+
if let Ok(Some(cname)) = self.resolve_cname(addr).await {
1112+
debug!("cname: {cname}");
1113+
return extract_tenant_from_cname(&cname);
11181114
}
11191115
None
11201116
}
@@ -1262,8 +1258,8 @@ impl<T: AsyncRead + AsyncWrite + Unpin + Send> ClientStream for T {}
12621258

12631259
#[derive(Debug)]
12641260
pub enum BalancerResolver {
1265-
Static(TokioResolver, String),
1266-
MultiTenant(TokioResolver, FronteggResolver, Option<SniResolver>),
1261+
Static(TenantDnsResolver, String),
1262+
MultiTenant(TenantDnsResolver, FronteggResolver, Option<SniResolver>),
12671263
}
12681264

12691265
impl BalancerResolver {
@@ -1309,7 +1305,7 @@ impl BalancerResolver {
13091305
let sni_addr = sni_addr_template.replace("{}", servername);
13101306
let tenant = dns_resolver.tenant(&sni_addr).await;
13111307
let sni_addr = format!("{sni_addr}:{port}");
1312-
let addr = lookup_with_resolver(dns_resolver, &sni_addr).await?;
1308+
let addr = dns_resolver.resolve_tenant_from_sni_host(&sni_addr).await?;
13131309
if tenant.is_some() {
13141310
debug!("SNI header found for tenant {:?}", tenant);
13151311
}
@@ -1340,7 +1336,7 @@ impl BalancerResolver {
13401336

13411337
let addr =
13421338
addr_template.replace("{}", &auth_session.tenant_id().to_string());
1343-
let addr = lookup_with_resolver(dns_resolver, &addr).await?;
1339+
let addr = dns_resolver.resolve_tenant_from_sni_host(&addr).await?;
13441340
let tenant = Some(auth_session.tenant_id().to_string());
13451341
if tenant.is_some() {
13461342
debug!("SNI header NOT found for tenant {:?}", tenant);
@@ -1363,7 +1359,7 @@ impl BalancerResolver {
13631359
}
13641360
BalancerResolver::Static(resolver, addr) => {
13651361
// Use the shared resolver instance for caching benefits
1366-
let addr = lookup_with_resolver(resolver, addr).await?;
1362+
let addr = resolver.resolve_tenant_from_sni_host(addr).await?;
13671363
Ok(ResolvedAddr {
13681364
addr,
13691365
password: None,
@@ -1408,85 +1404,119 @@ pub fn create_default_resolver() -> TokioResolver {
14081404
.build()
14091405
}
14101406

1411-
/// Follows CNAME chain to resolve a hostname to IP addresses, ensuring each step is cached.
1412-
///
1413-
/// hickory's `lookup_ip` only caches the final step when CNAMEs are involved. This function
1414-
/// explicitly queries for CNAME records at each step, following the chain until we reach a
1415-
/// hostname that resolves to A records. Each query is cached individually by hickory.
1407+
/// Creates a resolver with caching disabled for DNS lookups.
1408+
pub fn create_non_caching_resolver() -> TokioResolver {
1409+
let mut resolver_opts = ResolverOpts::default();
1410+
resolver_opts.cache_size = 0; // Disable caching
1411+
resolver_opts.ip_strategy = LookupIpStrategy::Ipv4thenIpv6;
1412+
1413+
// Read system DNS configuration or fall back to defaults
1414+
let (config, opts) = read_system_conf()
1415+
.map(|(config, mut opts)| {
1416+
// Override specific options while keeping system DNS servers
1417+
opts.cache_size = resolver_opts.cache_size;
1418+
(config, opts)
1419+
})
1420+
.unwrap_or_else(|err| {
1421+
warn!(
1422+
"Failed to read system DNS configuration, using defaults: {}",
1423+
err
1424+
);
1425+
(ResolverConfig::default(), resolver_opts)
1426+
});
1427+
1428+
Resolver::builder_with_config(config, TokioConnectionProvider::default())
1429+
.with_options(opts)
1430+
.build()
1431+
}
1432+
1433+
/// A resolver that uses separate caching and non-caching resolvers for different record types.
14161434
///
1417-
/// Returns the final hostname and its A record lookup response.
1418-
async fn follow_cname_chain(
1419-
resolver: &TokioResolver,
1420-
initial_host: &str,
1421-
) -> Result<hickory_resolver::lookup_ip::LookupIp, anyhow::Error> {
1422-
let mut current_host = initial_host.to_string();
1423-
let max_hops = 10; // Prevent infinite loops in case of misconfigured DNS
1424-
1425-
for hop in 0..max_hops {
1426-
// Explicitly query for CNAME records to ensure this step is cached
1427-
match resolver.lookup(&current_host, RecordType::CNAME).await {
1435+
/// This struct contains:
1436+
/// - A caching resolver for CNAME lookups
1437+
/// - A non-caching resolver for A record lookups
1438+
#[derive(Debug)]
1439+
pub struct TenantDnsResolver {
1440+
caching_resolver: TokioResolver,
1441+
non_caching_resolver: TokioResolver,
1442+
}
1443+
1444+
impl TenantDnsResolver {
1445+
/// Creates a new TenantDnsResolver with default caching and non-caching resolvers.
1446+
pub fn new() -> Self {
1447+
Self {
1448+
caching_resolver: create_default_resolver(),
1449+
non_caching_resolver: create_non_caching_resolver(),
1450+
}
1451+
}
1452+
1453+
/// Resolves a CNAME record using the caching resolver.
1454+
/// Returns the CNAME target hostname, or None if no CNAME exists.
1455+
pub async fn resolve_cname(&self, hostname: &str) -> Result<Option<String>, anyhow::Error> {
1456+
match self
1457+
.caching_resolver
1458+
.lookup(hostname, RecordType::CNAME)
1459+
.await
1460+
{
14281461
Ok(cname_response) => {
14291462
// Check if we got a CNAME record back
14301463
if let Some(cname_record) = cname_response.iter().next() {
14311464
if let Some(cname_data) = cname_record.as_cname() {
1432-
// Follow the CNAME to the next hostname
1433-
let next_host = cname_data.to_string();
1434-
tracing::debug!(
1435-
hop = hop,
1436-
from = %current_host,
1437-
to = %next_host,
1438-
"Following CNAME"
1439-
);
1440-
current_host = next_host;
1441-
continue;
1465+
return Ok(Some(cname_data.to_string()));
14421466
}
14431467
}
1444-
// No CNAME found, fall through to A record lookup
1445-
break;
1446-
}
1447-
Err(_) => {
1448-
// No CNAME record exists, this is likely the final hostname
1449-
break;
1468+
Ok(None)
14501469
}
1470+
Err(_) => Ok(None),
14511471
}
14521472
}
14531473

1454-
// Now lookup A records for the final hostname in the chain
1455-
// We don't want to cache this
1456-
let response = resolver
1457-
.lookup_ip(&current_host)
1458-
.await
1459-
.with_context(|| format!("Failed to resolve A record for hostname: {}", current_host))?;
1474+
/// Resolves A records without caching.
1475+
/// Returns a LookupIp containing all A records for the hostname.
1476+
pub async fn resolve_arec_no_cache(
1477+
&self,
1478+
hostname: &str,
1479+
) -> Result<hickory_resolver::lookup_ip::LookupIp, anyhow::Error> {
1480+
self.non_caching_resolver
1481+
.lookup_ip(hostname)
1482+
.await
1483+
.with_context(|| format!("Failed to resolve A record for hostname: {}", hostname))
1484+
}
14601485

1461-
Ok(response)
1462-
// Ok((current_host, a_response))
1463-
}
1486+
/// Returns the first IP address resolved from the provided hostname using the DualResolver.
1487+
///
1488+
/// CNAMEs are resolved with caching, while A records are resolved without caching.
1489+
async fn resolve_tenant_from_sni_host(
1490+
self: &TenantDnsResolver,
1491+
name: &str,
1492+
) -> Result<SocketAddr, anyhow::Error> {
1493+
// Parse out the hostname and port from name (format: "hostname:port")
1494+
let (host, port_str) = name
1495+
.rsplit_once(':')
1496+
.ok_or_else(|| anyhow::anyhow!("Invalid address format, expected 'hostname:port'"))?;
1497+
let port: u16 = port_str
1498+
.parse()
1499+
.with_context(|| format!("Invalid port in address: {}", name))?;
1500+
1501+
// Resolve initial CNAME with caching if it exists...
1502+
// these are generally static and we can generally ignore TTLS
1503+
//
1504+
// Resolve arec from host without caching... these can change at anytime
1505+
// in generally we should ignore TTLs.
1506+
let ip = if let Some(resolved_host) = self.resolve_cname(host).await? {
1507+
self.resolve_arec_no_cache(&resolved_host).await?
1508+
} else {
1509+
self.resolve_arec_no_cache(host).await?
1510+
};
1511+
1512+
// Extract IP address from A records
1513+
let maybe_socket_addr = ip.iter().find_map(|r| Some(SocketAddr::new(r, port)));
14641514

1465-
/// Returns the first IP address resolved from the provided hostname using the hickory resolver for caching.
1466-
async fn lookup_with_resolver(
1467-
resolver: &TokioResolver,
1468-
name: &str,
1469-
) -> Result<SocketAddr, anyhow::Error> {
1470-
// Parse out the hostname and port from name (format: "hostname:port")
1471-
let (host, port_str) = name
1472-
.rsplit_once(':')
1473-
.ok_or_else(|| anyhow::anyhow!("Invalid address format, expected 'hostname:port'"))?;
1474-
let port: u16 = port_str
1475-
.parse()
1476-
.with_context(|| format!("Invalid port in address: {}", name))?;
1477-
1478-
// Follow the CNAME chain and get A records, ensuring each step is cached
1479-
let ip_response = follow_cname_chain(resolver, host).await?;
1480-
1481-
// Extract IP address from A records
1482-
let maybe_socket_addr = ip_response
1483-
.iter()
1484-
.find_map(|r| Some(SocketAddr::new(r, port)));
1485-
1486-
if let Some(addr) = maybe_socket_addr {
1487-
Ok(addr)
1488-
} else {
1489-
anyhow::bail!("No A records found in DNS response")
1515+
if let Some(addr) = maybe_socket_addr {
1516+
Ok(addr)
1517+
} else {
1518+
anyhow::bail!("No A records found in DNS response")
1519+
}
14901520
}
14911521
}
14921522

0 commit comments

Comments
 (0)