20 changes: 20 additions & 0 deletions dns-monitoring/datadog-monitor-dns-down-critical.json
@@ -0,0 +1,20 @@
{
  "name": "DNS Server Down - CRITICAL - PageDuty",

⚠️ Potential issue | 🟡 Minor

Typo in monitor name: "PageDuty" → "PagerDuty".

This appears in the Datadog UI and in PagerDuty alert titles.

🐛 Proposed fix
-  "name": "DNS Server Down - CRITICAL - PageDuty",
+  "name": "DNS Server Down - CRITICAL - PagerDuty",
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"name": "DNS Server Down - CRITICAL - PageDuty",
"name": "DNS Server Down - CRITICAL - PagerDuty",
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dns-monitoring/datadog-monitor-dns-down-critical.json` at line 2, Fix the
typo in the Datadog monitor name by changing the "name" field value that
currently reads "DNS Server Down - CRITICAL - PageDuty" to "DNS Server Down -
CRITICAL - PagerDuty"; locate the JSON "name" key in
datadog-monitor-dns-down-critical.json and update the string so the UI and alert
titles use the correct "PagerDuty" spelling.

  "type": "query alert",
  "query": "max(last_15m):max:pihole.dns_up{*} by {server} < 1",
  "message": "CRITICAL: DNS server {{server.name}} has been down for 15 minutes. Triggering PagerDuty.\n\n@pagerduty",
  "tags": ["dns", "pihole", "critical"],
  "options": {
    "notify_audit": false,
    "locked": false,
    "require_full_window": true,
    "notify_no_data": false,
    "renotify_interval": 15,
    "evaluation_delay": 30,
    "new_group_delay": 30,
    "include_tags": false,
    "thresholds": {
      "critical": 1
    }
  }
}
21 changes: 21 additions & 0 deletions dns-monitoring/datadog-monitor-dns-down-warning.json
@@ -0,0 +1,21 @@
{
  "name": "DNS Server Down - WARNING",
  "type": "query alert",
  "query": "max(last_5m):max:pihole.dns_up{*} by {server} < 1",
  "message": "DNS server {{server.name}} has been down for 5 minutes. \n\n@slack-ops",
  "tags": ["dns", "pihole", "warning"],
  "options": {
    "notify_audit": false,
    "locked": false,
    "require_full_window": true,
    "notify_no_data": false,
    "renotify_interval": 30,
    "evaluation_delay": 30,
    "new_group_delay": 30,
    "include_tags": false,
    "thresholds": {
      "critical": 1,
Contributor

@cubic-dev-ai cubic-dev-ai bot Feb 23, 2026


P2: warning and critical thresholds are identical, so this monitor will always go critical and never emit a warning state. Either remove the warning threshold or adjust the monitor to reflect critical severity to avoid misleading alert severity.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At dns-monitoring/datadog-monitor-dns-down-warning.json, line 17:

<comment>`warning` and `critical` thresholds are identical, so this monitor will always go critical and never emit a warning state. Either remove the warning threshold or adjust the monitor to reflect critical severity to avoid misleading alert severity.</comment>

<file context>
@@ -0,0 +1,21 @@
+    "new_group_delay": 30,
+    "include_tags": false,
+    "thresholds": {
+      "critical": 1,
+      "warning": 1
+    }
</file context>

      "warning": 1
    }
Comment on lines +16 to +19

⚠️ Potential issue | 🟠 Major

Warning and critical thresholds are identical — the WARNING state is unreachable.

There are two threshold types for metric alerts: critical and warning. Critical is defined in the query, but can also be specified in this option. Warning threshold can only be specified using the thresholds option. When both are 1 on a < 1 query, the monitor jumps straight from OK to CRITICAL; the WARNING bucket is never entered. If the intent is to fire a Slack warning on a shorter window (5 min here vs. 15 min on the critical monitor), remove the warning key from thresholds and let the monitor use only the critical threshold:

🐛 Proposed fix
     "thresholds": {
-      "critical": 1,
-      "warning": 1
+      "critical": 1
     }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dns-monitoring/datadog-monitor-dns-down-warning.json` around lines 16 - 19,
The monitor's thresholds object currently sets "critical" and "warning" both to
1, making the WARNING state unreachable; remove the "warning" key from the
"thresholds" object so the monitor relies solely on the critical threshold
defined in the query (leave "critical": 1 if needed) — update the JSON where the
"thresholds" object appears to delete the "warning" entry and keep only the
"critical" entry to allow the shorter-window Slack warning monitor to behave as
intended.

  }
}
21 changes: 21 additions & 0 deletions dns-monitoring/datadog-monitor-dns-external-latency.json
@@ -0,0 +1,21 @@
{
  "name": "DNS External Latency High",
  "type": "query alert",
  "query": "max(last_5m):max:dns_external_latency{*} > 200",
Contributor

@cubic-dev-ai cubic-dev-ai bot Feb 23, 2026


P2: Query threshold (200) conflicts with critical threshold (500). Datadog monitors require the query threshold to match the critical threshold; otherwise the monitor definition is invalid or behaves unexpectedly. Align the query threshold with the critical value.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At dns-monitoring/datadog-monitor-dns-external-latency.json, line 4:

<comment>Query threshold (200) conflicts with critical threshold (500). Datadog monitors require the query threshold to match the critical threshold; otherwise the monitor definition is invalid or behaves unexpectedly. Align the query threshold with the critical value.</comment>

<file context>
@@ -0,0 +1,21 @@
+{
+  "name": "DNS External Latency High",
+  "type": "query alert",
+  "query": "max(last_5m):max:dns_external_latency{*} > 200",
+  "message": "External DNS latency is above 200ms. Check internet connectivity or upstream DNS.\n\n@slack-ops",
+  "tags": ["dns", "external", "latency"],
</file context>


⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

find . -name "datadog-monitor-dns-external-latency.json" -type f

Repository: NickJLange/mkrasberry

Length of output: 123


🏁 Script executed:

# Once file is located, read its full content to check thresholds
cat -n dns-monitoring/datadog-monitor-dns-external-latency.json


Length of output: 804


🏁 Script executed:

# Also check the internal latency monitor mentioned for comparison
find . -name "datadog-monitor-dns-latency*.json" -type f | head -5


Length of output: 122


🏁 Script executed:

cat -n dns-monitoring/datadog-monitor-dns-latency-warning.json


Length of output: 783


🌐 Web query:

Datadog monitor API query threshold critical threshold validation requirement

💡 Result:

When creating/updating a Datadog query-based monitor via the Monitor API, you must define the threshold twice:

  1. In the query string, as part of the comparison, e.g. ... > 200 (the query format is ... operator #). [1]
  2. In options.thresholds.critical, and it must match the numeric value used in the query, otherwise /api/v1/monitor/validate (and monitor create/update) can fail with a validation error like “Alert threshold … does not match that used in the query …”. [1][4]

Notes:

  • warning (and recovery thresholds like critical_recovery, warning_recovery) are optional extras, but critical is the key threshold used for the main alert condition. [1][2][3]
  • For anomaly monitors, Datadog explicitly documents that the critical threshold in options.thresholds should match the threshold used in the query. [2]

Sources: Datadog Monitor API docs (query format + validate examples + thresholds object) [1], Datadog anomaly monitor threshold requirement [2], Datadog recovery thresholds via API [3], community-reported validation error when query/thresholds differ [4].


Query threshold must match the critical threshold for Datadog API validation.

The Datadog Monitor API requires the threshold value in the query string to exactly match options.thresholds.critical, or the monitor creation/update will fail with a validation error. Currently, the query uses > 200 (matching the warning threshold) while thresholds.critical is set to 500.

🐛 Proposed fix
-  "query": "max(last_5m):max:dns_external_latency{*} > 200",
+  "query": "max(last_5m):max:dns_external_latency{*} > 500",

The same issue exists at lines 16–19 in datadog-monitor-dns-latency-warning.json where the query uses > 100 but the critical threshold is 500.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dns-monitoring/datadog-monitor-dns-external-latency.json` at line 4, The
monitor query's threshold (> 200) does not match the configured critical
threshold (options.thresholds.critical = 500), causing Datadog API validation to
fail; update the "query" field in datadog-monitor-dns-external-latency.json to
use > 500 so it exactly matches options.thresholds.critical, and make the
analogous change in datadog-monitor-dns-latency-warning.json (change > 100 to >
500) so each "query" string matches its respective options.thresholds.critical
value.

  "message": "External DNS latency is above 200ms. Check internet connectivity or upstream DNS.\n\n@slack-ops",
  "tags": ["dns", "external", "latency"],
  "options": {
    "notify_audit": false,
    "locked": false,
    "require_full_window": true,
    "notify_no_data": false,
    "renotify_interval": 60,
    "evaluation_delay": 30,
    "new_group_delay": 30,
    "include_tags": false,
    "thresholds": {
      "critical": 500,
      "warning": 200
    }
  }
}
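The validation rule discussed in the comments above — the numeric threshold at the end of the query string must equal `options.thresholds.critical` — can be checked locally before submitting a monitor to the API. A minimal sketch (the `check_thresholds` helper is hypothetical, not part of any Datadog client):

```python
import json
import re

def check_thresholds(monitor: dict) -> bool:
    """Return True if the numeric threshold at the end of the query
    matches options.thresholds.critical (the Datadog validation rule)."""
    match = re.search(r"[<>]=?\s*([0-9.]+)\s*$", monitor["query"])
    if not match:
        return False
    query_threshold = float(match.group(1))
    critical = float(monitor["options"]["thresholds"]["critical"])
    return query_threshold == critical

# The external-latency monitor from this PR: the query fires at > 200
# but thresholds.critical is 500, so this check should fail.
monitor = json.loads("""{
  "query": "max(last_5m):max:dns_external_latency{*} > 200",
  "options": {"thresholds": {"critical": 500, "warning": 200}}
}""")
print(check_thresholds(monitor))  # → False
```

Running this over each JSON file in `dns-monitoring/` before calling the API would catch both mismatches flagged in this review.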
21 changes: 21 additions & 0 deletions dns-monitoring/datadog-monitor-dns-latency-warning.json
@@ -0,0 +1,21 @@
{
  "name": "DNS Latency High - WARNING",
  "type": "query alert",
  "query": "max(last_5m):max:dns_internal_latency{*} > 100",
Contributor

@cubic-dev-ai cubic-dev-ai bot Feb 23, 2026


P2: The query threshold should match the critical threshold (critical is defined in the query for metric alerts). With > 100 in the query but critical: 500, the monitor will alert at 100 or fail validation. Update the query to use the critical threshold and keep warning in thresholds.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At dns-monitoring/datadog-monitor-dns-latency-warning.json, line 4:

<comment>The query threshold should match the critical threshold (critical is defined in the query for metric alerts). With `> 100` in the query but `critical: 500`, the monitor will alert at 100 or fail validation. Update the query to use the critical threshold and keep warning in `thresholds`.</comment>

<file context>
@@ -0,0 +1,21 @@
+{
+  "name": "DNS Latency High - WARNING",
+  "type": "query alert",
+  "query": "max(last_5m):max:dns_internal_latency{*} > 100",
+  "message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops",
+  "tags": ["dns", "latency", "warning"],
</file context>


⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

cat -n dns-monitoring/datadog-monitor-dns-latency-warning.json


Length of output: 783


🌐 Web query:

Datadog API alert threshold query parameter validation HTTP 400

💡 Result:

An HTTP 400 from the Datadog Monitors API during “alert threshold … validation” is most commonly caused by a mismatch between the threshold embedded in the query string and the thresholds you send in options.thresholds.

What to check / fix

  1. Your query must include operator #
    Datadog monitor queries are expected to end with an operator and a numeric threshold (for example ... > 5). The API docs explicitly describe this query format. [1]

  2. For anomaly monitors, the threshold in the query must match options.thresholds.critical
    Datadog documents that for anomaly monitors the critical threshold should match the threshold used in the query (and related window fields must match too—see #3). If they differ, validation can fail with 400. [2]
    A commonly seen error is: “Alert threshold (n) does not match that used in the query (n2).” [3]

  3. For anomaly monitors, also match the alert window fields
    If you’re creating an anomaly monitor:

    • the query’s alert_window='...' must match options.threshold_windows.trigger_window
    • you must set options.threshold_windows correctly (trigger + recovery)
      Datadog calls this out in the anomaly monitor API docs. [2]
  4. Use the “validate monitor” endpoint to pinpoint the exact validation failure
    Before creating/updating, call Datadog’s validate monitor endpoint with the same body; it’s designed to return validation errors for the query/options you’re sending. [1]

Sources: Datadog Monitors API reference (query format + validate endpoint) [1], Datadog anomaly monitor API requirements (threshold + window matching) [2], example of the threshold-mismatch 400 error message in the wild [3].


🏁 Script executed:

fd -t f "datadog-monitor.*\.json" dns-monitoring/ | head -20


Length of output: 284


🏁 Script executed:

git ls-files dns-monitoring/ | grep -i monitor


Length of output: 284


🏁 Script executed:

cat -n dns-monitoring/datadog-monitor-dns-down-critical.json


Length of output: 795


🏁 Script executed:

cat -n dns-monitoring/datadog-monitor-dns-down-warning.json


Length of output: 776


🏁 Script executed:

cat -n dns-monitoring/datadog-monitor-dns-external-latency.json


Length of output: 804


Query threshold must match thresholds.critical or the Datadog API will reject this monitor with HTTP 400.

The critical threshold value must match the threshold in the query string; a mismatch causes the Datadog API to return HTTP 400 with "The value provided for parameter 'query' is invalid".

Here the query fires at > 100 but thresholds.critical is 500. Since this is a WARNING alert and the intent is to alert at 100 ms, change thresholds.critical to 100:

🐛 Proposed fix
    "thresholds": {
      "critical": 100,

Note: The same threshold mismatch exists in datadog-monitor-dns-external-latency.json (query > 200, critical 500).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dns-monitoring/datadog-monitor-dns-latency-warning.json` at line 4, The
monitor's query uses a 100ms threshold ("query":
"max(last_5m):max:dns_internal_latency{*} > 100") but the monitor payload's
thresholds.critical is set to 500; update thresholds.critical to 100 so the
query and thresholds.critical match for datadog-monitor-dns-latency-warning.json
(and likewise change thresholds.critical to 200 in
datadog-monitor-dns-external-latency.json to match its query > 200); edit the
JSON key "thresholds" -> "critical" in the monitor objects to the matching
numeric values.

  "message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops",
Comment on lines +4 to +5

⚠️ Potential issue | 🟠 Major

{{server.name}} won't resolve — query has no by {server} grouping.

If you configure tags or dimensions in your query, these values are available for every group evaluated in the multi alert to dynamically fill in notifications with useful context. Without by {server} in the query the template variable renders empty. Add the grouping to match the pattern used in the other monitors:

🐛 Proposed fix
-  "query": "max(last_5m):max:dns_internal_latency{*} > 500",
+  "query": "max(last_5m):max:dns_internal_latency{*} by {server} > 500",
📝 Committable suggestion


Suggested change
"query": "max(last_5m):max:dns_internal_latency{*} > 100",
"message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops",
"query": "max(last_5m):max:dns_internal_latency{*} by {server} > 100",
"message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops",
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@dns-monitoring/datadog-monitor-dns-latency-warning.json` around lines 4 - 5,
The monitor query uses a template variable {{server.name}} but the query
"max(last_5m):max:dns_internal_latency{*} > 100" lacks the required grouping, so
add the grouping to the query (e.g., include "by {server}" in the aggregation)
so the multi-alert will evaluate per-server and the {{server.name}} template in
the message can resolve; update the "query" string accordingly to match the
pattern used in other monitors.

  "tags": ["dns", "latency", "warning"],
  "options": {
    "notify_audit": false,
    "locked": false,
    "require_full_window": true,
    "notify_no_data": false,
    "renotify_interval": 60,
    "evaluation_delay": 30,
    "new_group_delay": 30,
    "include_tags": false,
    "thresholds": {
      "critical": 500,
      "warning": 100
    }
  }
}
2 changes: 2 additions & 0 deletions openspec/changes/dns-keepalived-analysis/.openspec.yaml
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-02-21

⚠️ Potential issue | 🟡 Minor

Consider quoting the created date to ensure consistent string parsing.

YAML 1.1 (used by PyYAML's default Loader, for example) treats bare YYYY-MM-DD values as native date objects rather than strings. If the consuming tooling reads this field and expects a string, it may encounter a type mismatch or serialization quirk. Quoting the value avoids ambiguity across parsers.

✏️ Suggested fix
-created: 2026-02-21
+created: "2026-02-21"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openspec/changes/dns-keepalived-analysis/.openspec.yaml` at line 2, The YAML
`created` value is unquoted and may be parsed as a date by some YAML loaders;
update the `.openspec.yaml` entry so the `created` field is a quoted string
(e.g., change created: 2026-02-21 to created: "2026-02-21") to ensure consistent
string parsing across YAML parsers and consumers referencing the `created` key.
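The parsing difference behind this finding is easy to demonstrate; a minimal sketch, assuming PyYAML is installed:

```python
import datetime
import yaml  # PyYAML; assumed available

# A bare YYYY-MM-DD scalar is resolved to a native date by YAML 1.1 loaders.
bare = yaml.safe_load("created: 2026-02-21")
print(type(bare["created"]))    # <class 'datetime.date'>

# Quoting the value forces a plain string.
quoted = yaml.safe_load('created: "2026-02-21"')
print(type(quoted["created"]))  # <class 'str'>
```

Whether this matters depends on whether the OpenSpec tooling expects a string or a date for `created`.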

187 changes: 187 additions & 0 deletions openspec/changes/dns-keepalived-analysis/design.md
@@ -0,0 +1,187 @@
## Context

### Current DNS Infrastructure

**Wisconsin (marsAlpha, marsBeta)**
- Network: 192.168.20.0/24
- IPs: 192.168.20.250, 192.168.20.251

**New York (terraOmega, terraPhi)**
- Network: 192.168.100.0/24
- IPs: 192.168.100.112, 192.168.100.110

**Miyagi (alphaCenA, alphaCenB)**
- Network: 192.168.3.0/24
- IPs: 192.168.3.100, 192.168.3.101

Each server runs:
- Pi-hole container (DNS sinkhole on port 53)
- Unbound (upstream resolver on port 5335)

**Current client DNS configuration:**
- Clients point to individual DNS servers or round-robin via DHCP
- No automatic failover; if a server goes down, clients fail until manual intervention

### Constraints
- Want to minimize changes
- Brief downtime is acceptable when I'm at the keyboard
- Prefer open-source, simple solutions
- Current monitoring and restart playbook should continue to work

## Goals / Non-Goals

**Goals:**
- Understand network changes required for DNS HA
- Document failover behavior and timing
- Identify common pitfalls and how to avoid them
- Create implementation roadmap

**Non-Goals:**
- Do NOT implement HA (this is analysis only)
- Do NOT change client configurations unless necessary
- Do NOT modify existing monitoring/alerting

## Decisions

### D1: HA Approach Options

**Option A: Keepalived/VRRP (Recommended for HA)**

| Aspect | Details |
|--------|---------|
| How it works | Virtual IP (VIP) floats between servers using VRRP protocol |
| Failover | Automatic, typically 3-5 seconds |
| Network changes | Need to allow VRRP multicast, may need switch config |
| Complexity | Medium - keepalived daemon on each DNS server |
| Pros | True HA, automatic failover, widely used |
| Cons | Requires network coordination, ARP cache issues |

**Option B: HSRP (Cisco proprietary)**

| Aspect | Details |
|--------|---------|
| How it works | Cisco's proprietary failover protocol |
| Failover | Automatic |
| Network changes | Requires Cisco equipment |
| Complexity | Medium |
| Pros | Integrated with Cisco gear |
| Cons | Proprietary, not applicable to home network |

**Option C: DNS Round-Robin (Simpler, no HA)**

| Aspect | Details |
|--------|---------|
| How it works | Multiple A records for same hostname |
| Failover | None - clients get all IPs, try sequentially |
| Network changes | None |
| Complexity | Low |
| Pros | Simple, no daemon, no network changes |
| Cons | No automatic failover, clients may cache failed IP |
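As a concrete illustration of Option C, round-robin is nothing more than multiple A records for one name; a sketch in BIND zone-file syntax (the hostname `dns.home.lan` is illustrative, not from this repo):

```text
; both Wisconsin resolvers answer for the same name (hostname illustrative)
dns.home.lan.  300  IN  A  192.168.20.250
dns.home.lan.  300  IN  A  192.168.20.251
```

Clients receive both addresses and choose one; nothing detects or skips a dead server, which is the "no automatic failover" con above.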

**Option D: Client-side failover (Alternative)**

| Aspect | Details |
|--------|---------|
| How it works | Configure multiple DNS servers in DHCP/client |
| Failover | Client-dependent (some OSes retry automatically) |
| Network changes | None |
| Complexity | Low |
| Pros | No infrastructure changes |
| Cons | Inconsistent behavior across clients |

### D2: Recommended Approach

**Recommendation: Option A (Keepalived) per region**

- Each region (Wisconsin, New York, Miyagi) gets its own VIP
- Pair servers within each region (e.g., marsAlpha ↔ marsBeta)
- VIP becomes the DNS address for that region
- If one server fails, VIP moves to other within seconds

### D3: Network Architecture

**VIP Address Plan (per region):**
- Reserve .2 for DNS VIP in each subnet
- DHCP range: .11 - .254 (leaving .2 for VIP, .10 for infrastructure)

```

⚠️ Potential issue | 🟡 Minor

Add a language specifier to the fenced code block.

The code fence at this line has no language identifier, triggering a markdownlint MD040 warning. Use text or plain to silence it.

✏️ Proposed fix
-```
+```text
 Wisconsin (192.168.20.0/24):
🧰 Tools
🪛 markdownlint-cli2 (0.21.0)

[warning] 107-107: Fenced code blocks should have a language specified

(MD040, fenced-code-language)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openspec/changes/dns-keepalived-analysis/design.md` at line 107, Add a
language specifier to the fenced code block that contains the line "Wisconsin
(192.168.20.0/24):" to silence markdownlint MD040; replace the opening ``` with
```text (or ```plain) so the block starts as ```text and preserves the existing
content, ensuring the fence around "Wisconsin (192.168.20.0/24):" is updated
accordingly.

Wisconsin (192.168.20.0/24):
  marsAlpha (192.168.20.250) ←→ marsBeta (192.168.20.251)
  VIP: 192.168.20.2
  DHCP: 192.168.20.11 - 192.168.20.254

New York (192.168.100.0/24):
  terraOmega (192.168.100.112) ←→ terraPhi (192.168.100.110)
  VIP: 192.168.100.2
  DHCP: 192.168.100.11 - 192.168.100.254

Miyagi (192.168.3.0/24):
  alphaCenA (192.168.3.100) ←→ alphaCenB (192.168.3.101)
  VIP: 192.168.3.2
  DHCP: 192.168.3.11 - 192.168.3.254
```

Clients would use regional VIP (.2) as their DNS server.
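The per-region pairing could look like the following keepalived sketch for the Wisconsin pair; every value here (interface name, router ID, priorities, password) is illustrative, not a tested configuration:

```text
# /etc/keepalived/keepalived.conf on marsAlpha (MASTER; interface name assumed)
vrrp_instance DNS_VIP {
    state MASTER
    interface eth0           # assumed NIC name
    virtual_router_id 53
    priority 150             # backup (marsBeta) would use e.g. 100
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass changeme   # placeholder
    }
    virtual_ipaddress {
        192.168.20.2/24
    }
}
```

marsBeta would carry the same instance with `state BACKUP` and a lower `priority`; New York and Miyagi would repeat the pattern with their own VIPs and router IDs.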

## Risks / Trade-offs

### Network Risks

- **[Risk]** VRRP multicast blocked by switch
- **Mitigation**: Check switch allows VRRP (multicast 224.0.0.18), or configure unicast

- **[Risk]** ARP cache on switches/clients points to wrong server after failover
- **Mitigation**: Set lower `advertisement_intvl` and use `garp_master_delay`

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🌐 Web query:

keepalived vrrp_instance advert_int advertisement_intvl parameter

💡 Result:

In Keepalived the VRRP advertisement interval for a vrrp_instance is configured with:

  • advert_int <seconds> — how often the MASTER sends VRRP advertisements (can be fractional, e.g. 0.92). All routers participating in the same VRRP instance should use the same value; using different values is considered a protocol deviation. [1]
    Example:
    vrrp_instance VI_1 {
      advert_int 1
    }
    
    [1][2]

There is no advertisement_intvl (or similarly named) vrrp_instance directive in Keepalived; the correct Keepalived parameter is advert_int. [1][2]

Sources:
[1] Keepalived manpage (keepalived.org)
[2] Keepalived configuration synopsis (keepalived.org)


🏁 Script executed:

# First, let's locate and read the file mentioned in the review
find . -name "design.md" -path "*dns-keepalived*" 2>/dev/null | head -5


Length of output: 117


🏁 Script executed:

# Read the file around line 134 to confirm the context
sed -n '130,140p' ./openspec/changes/dns-keepalived-analysis/design.md


Length of output: 651


Correct the keepalived parameter name from advertisement_intvl to advert_int.

The standard keepalived vrrp_instance option for the advertisement interval is advert_int, not advertisement_intvl. Using the wrong name in a configuration would be silently ignored, leaving the interval at its default.

✏️ Proposed fix
-  - **Mitigation**: Set lower `advertisement_intvl` and use `garp_master_delay`
+  - **Mitigation**: Set lower `advert_int` and use `garp_master_delay`
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openspec/changes/dns-keepalived-analysis/design.md` at line 134, Replace the
incorrect keepalived option name `advertisement_intvl` with the correct
`advert_int` in the document text and any examples referencing the vrrp_instance
advertisement interval; update the sentence "Set lower `advertisement_intvl` and
use `garp_master_delay`" to "Set lower `advert_int` and use `garp_master_delay`"
and scan the file for any other occurrences of `advertisement_intvl` to correct
to `advert_int`.


- **[Risk]** Both servers claim VIP (split-brain)
- **Mitigation**: Use `priority` to designate master, `preempt` to ensure master always holds VIP

- **[Risk]** Network segmentation blocks VRRP packets
- **Mitigation**: Ensure firewall allows VRRP (protocol 112) between DNS servers
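The firewall mitigation above could be sketched as iptables rules; the peer address is illustrative and the exact form depends on the firewall in use:

```text
# permit VRRP (IP protocol 112) from the peer and the VRRP multicast group
iptables -A INPUT -p 112 -s 192.168.20.251 -j ACCEPT
iptables -A INPUT -p 112 -d 224.0.0.18 -j ACCEPT
```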

### Operational Risks

- **[Risk]** Keepalived misconfiguration causes failover storms
- **Mitigation**: Test failover in maintenance window, set appropriate timeouts

- **[Risk]** Failover doesn't trigger DNS service restart
- **Mitigation**: Monitor both master and backup, verify DNS service on both

- **[Risk]** Upstream Unbound not configured for VIP
  - **Mitigation**: Unbound binds only to localhost by default; add the VIP (or 0.0.0.0) to its `interface:` settings and verify on both servers

### Performance Trade-offs

- **Latency**: Minor increase during failover (seconds)
- **Complexity**: Additional daemon to monitor and maintain
- **Monitoring**: Need to monitor keepalived state, not just DNS

## Implementation Roadmap

### Phase 1: Network Preparation (Research)
- [ ] Verify network equipment allows VRRP
- [ ] Choose VIP addresses for each region
- [ ] Test VRRP between pairs

### Phase 2: Keepalived Configuration
- [ ] Install keepalived on each DNS server
- [ ] Configure master/backup pairs per region
- [ ] Add firewall rules for VRRP

### Phase 3: DNS Service Integration
- [ ] Ensure pihole/unbound binds to VIP
- [ ] Update monitoring to track keepalived state
- [ ] Update restart playbook for HA (restart VIP holder)

### Phase 4: Client Migration
- [ ] Update DHCP to serve regional VIPs
- [ ] Test failover manually
- [ ] Monitor for issues

## Open Questions

1. ~~VIP addresses~~ - **RESOLVED**: Using .2 for each regional subnet
2. ~~Network equipment~~ - **RESOLVED**: Ubiquiti and commodity switches between servers - need VRRP compatibility check
3. ~~Client migration~~ - **RESOLVED**: Migrate clients gradually
4. ~~Cross-region failover~~ - **RESOLVED**: No - too complex
5. ~~Monitoring~~ - **RESOLVED**: Yes - alerts for keepalived state changes
30 changes: 30 additions & 0 deletions openspec/changes/dns-keepalived-analysis/proposal.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Why

The current DNS infrastructure (Pi-hole + Unbound) has no high availability. If a DNS server fails, clients lose DNS resolution until manual intervention. This is a single point of failure that impacts all services. We need to understand the architectural changes required to add DNS HA using keepalived/HSRP and/or round-robin DNS.

## What Changes

This is a **planning/analysis** change to document the requirements, risks, and architectural impact of adding DNS high availability.

- Document current DNS infrastructure and dependencies
- Analyze keepalived/VRRP/HSRP options for DNS failover
- Analyze DNS round-robin load balancing as an alternative
- Document network changes required (VIP, ARP, etc.)
- Identify common pitfalls and failure modes
- Create implementation roadmap

## Capabilities

### New Capabilities
- `dns-ha-analysis`: Analysis document covering keepalived/HSRP/round-robin options, network requirements, failover behavior, and common pitfalls for DNS HA deployment.

### Modified Capabilities
- (None - this is a new analysis)

## Impact

- **Network**: May require VIP (Virtual IP) configuration, network switch changes for ARP
- **DNS Servers**: Keepalived daemon, additional network config
- **Clients**: May need updated DNS client configurations to use VIP
- **Monitoring**: Update monitoring to track HA state
- **Existing Playbooks**: Restart playbook should continue to work
---
## ADDED Requirements

### Requirement: Network compatibility analysis
The analysis SHALL document whether the existing network infrastructure supports VRRP/keepalived.

#### Scenario: VRRP multicast allowed
- **WHEN** network switches allow VRRP multicast (224.0.0.18)
- **THEN** keepalived can operate in full multicast mode

#### Scenario: VRRP multicast blocked
- **WHEN** network switches block VRRP multicast
- **THEN** keepalived can be configured for unicast mode

#### Scenario: VRRP not supported
- **WHEN** network equipment does not support VRRP
- **THEN** alternative HA solutions (round-robin DNS or client-side) must be used
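When the switches drop the multicast, keepalived's unicast mode pins adverts to the peer's real address. A sketch, with the node addresses as assumed placeholders from the Wisconsin plan (state, priority, and advert settings as in the multicast configuration):

```conf
vrrp_instance dns_vip {
    # state/interface/priority/advert_int unchanged from the multicast config
    unicast_src_ip 192.168.20.11    # this node's real address (assumed)
    unicast_peer {
        192.168.20.12               # the other node's real address (assumed)
    }
}
```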

### Requirement: Failover behavior specification
The analysis SHALL document expected failover timing and behavior.

#### Scenario: Primary server fails
- **WHEN** the primary DNS server becomes unreachable
- **THEN** the VIP should migrate to backup within 5 seconds

#### Scenario: Primary recovers
- **WHEN** the primary DNS server becomes available again
- **THEN** the VIP should migrate back to primary based on preemption policy

#### Scenario: Both servers fail
- **WHEN** all DNS servers in a region are down
- **THEN** clients should time out and fall back to their secondary DNS server (if configured)
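The 5-second target can be checked against the VRRPv2 timers (RFC 3768, which keepalived implements): a backup declares the master dead after three missed adverts plus a priority-dependent skew. A small sketch of the arithmetic:

```python
# Estimate VRRP failover detection time per RFC 3768 (VRRPv2, as used by keepalived).
def master_down_interval(advert_int: float, backup_priority: int) -> float:
    """Seconds a backup waits without adverts before claiming the VIP."""
    skew_time = (256 - backup_priority) / 256.0
    return 3 * advert_int + skew_time

# With keepalived's default 1s advert interval and a backup priority of 100,
# detection takes ~3.6s -- inside the 5-second failover target above.
print(round(master_down_interval(1.0, 100), 3))  # → 3.609
```

Note this is detection time only; VIP takeover and gratuitous ARP propagation add a little on top, which is why `advert_int 1` leaves useful headroom under the target.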

### Requirement: Network address planning
The analysis SHALL specify VIP addresses for each region.

#### Scenario: Wisconsin VIP
- **WHEN** planning Wisconsin DNS HA
- **THEN** use 192.168.20.2 as VIP, reserve in DHCP (range: .11-.254)

#### Scenario: New York VIP
- **WHEN** planning New York DNS HA
- **THEN** use 192.168.100.2 as VIP, reserve in DHCP (range: .11-.254)

#### Scenario: Miyagi VIP
- **WHEN** planning Miyagi DNS HA
- **THEN** use 192.168.3.2 as VIP, reserve in DHCP (range: .11-.254)
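Since Pi-hole's DHCP server is dnsmasq, the reservation amounts to keeping the pool above the VIP and handing the VIP out as the resolver. A hedged sketch for the Wisconsin subnet (the drop-in filename is hypothetical; the other regions follow the same pattern on their subnets):

```conf
# /etc/dnsmasq.d/10-dhcp-wisconsin.conf (hypothetical filename) -- Pi-hole DHCP
# Pool starts at .11, so the .2 VIP can never be leased to a client.
dhcp-range=192.168.20.11,192.168.20.254,24h
# Advertise the VIP as the DNS server instead of either node's real address.
dhcp-option=option:dns-server,192.168.20.2
```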

### Requirement: Common pitfalls documentation
The analysis SHALL document known pitfalls and mitigations.

#### Scenario: ARP cache stale
- **WHEN** failover occurs
- **THEN** client ARP caches may point to wrong MAC for VIP
- **MITIGATION**: Use a low advertisement interval and gratuitous ARP settings (`garp_master_delay`, `garp_master_repeat`) so the new master refreshes client ARP caches promptly

#### Scenario: Split-brain
- **WHEN** both servers claim VIP
- **THEN** DNS responses are inconsistent
- **MITIGATION**: Set distinct priorities and consistent preempt settings on both nodes; verify VRRP adverts actually pass between them, since split-brain is usually a symptom of blocked adverts

#### Scenario: Firewall blocks VRRP
- **WHEN** firewalls between servers block VRRP protocol
- **THEN** failover cannot occur
- **MITIGATION**: Allow IP protocol 112 (VRRP) and multicast to 224.0.0.18 between the servers
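On the DNS hosts themselves, the mitigation is two accept rules. A sketch in nftables syntax — the table and chain names are assumptions to be adapted to whatever ruleset already exists on the servers:

```conf
# nftables sketch (assumed table/chain names; merge into the existing ruleset).
# VRRP is IP protocol 112 and advertises to the 224.0.0.18 multicast group.
table inet filter {
    chain input {
        type filter hook input priority 0; policy drop;
        ip protocol 112 accept        # VRRP adverts from the peer
        ip daddr 224.0.0.18 accept    # VRRP multicast group
    }
}
```

In unicast mode the multicast rule is unnecessary, but protocol 112 must still be permitted between the two peers.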

### Requirement: Implementation roadmap
The analysis SHALL provide a phased implementation plan.

#### Scenario: Phase 1 - Network prep
- **WHEN** beginning HA implementation
- **THEN** verify network equipment (Ubiquiti + commodity switches) allows VRRP

#### Scenario: Phase 2 - Keepalived config
- **WHEN** network is ready
- **THEN** install and configure keepalived on each server pair

#### Scenario: Phase 3 - Integration
- **WHEN** keepalived is running
- **THEN** integrate with DNS service and monitoring

#### Scenario: Phase 4 - Client migration
- **WHEN** HA is tested
- **THEN** update DHCP to serve VIPs to clients gradually