diff --git a/dns-monitoring/datadog-monitor-dns-down-critical.json b/dns-monitoring/datadog-monitor-dns-down-critical.json
new file mode 100644
index 0000000..c9400f9
--- /dev/null
+++ b/dns-monitoring/datadog-monitor-dns-down-critical.json
@@ -0,0 +1,20 @@
+{
+  "name": "DNS Server Down - CRITICAL - PageDuty",
+  "type": "query alert",
+  "query": "max(last_15m):max:pihole.dns_up{*} by {server} < 1",
+  "message": "CRITICAL: DNS server {{server.name}} has been down for 15 minutes. Triggering PagerDuty.\n\n@pagerduty",
+  "tags": ["dns", "pihole", "critical"],
+  "options": {
+    "notify_audit": false,
+    "locked": false,
+    "require_full_window": true,
+    "notify_no_data": false,
+    "renotify_interval": 15,
+    "evaluation_delay": 30,
+    "new_group_delay": 30,
+    "include_tags": false,
+    "thresholds": {
+      "critical": 1
+    }
+  }
+}
diff --git a/dns-monitoring/datadog-monitor-dns-down-warning.json b/dns-monitoring/datadog-monitor-dns-down-warning.json
new file mode 100644
index 0000000..cb4f06e
--- /dev/null
+++ b/dns-monitoring/datadog-monitor-dns-down-warning.json
@@ -0,0 +1,21 @@
+{
+  "name": "DNS Server Down - WARNING",
+  "type": "query alert",
+  "query": "max(last_5m):max:pihole.dns_up{*} by {server} < 1",
+  "message": "DNS server {{server.name}} has been down for 5 minutes. \n\n@slack-ops",
+  "tags": ["dns", "pihole", "warning"],
+  "options": {
+    "notify_audit": false,
+    "locked": false,
+    "require_full_window": true,
+    "notify_no_data": false,
+    "renotify_interval": 30,
+    "evaluation_delay": 30,
+    "new_group_delay": 30,
+    "include_tags": false,
+    "thresholds": {
+      "critical": 1,
+      "warning": 1
+    }
+  }
+}
diff --git a/dns-monitoring/datadog-monitor-dns-external-latency.json b/dns-monitoring/datadog-monitor-dns-external-latency.json
new file mode 100644
index 0000000..b258789
--- /dev/null
+++ b/dns-monitoring/datadog-monitor-dns-external-latency.json
@@ -0,0 +1,21 @@
+{
+  "name": "DNS External Latency High",
+  "type": "query alert",
+  "query": "max(last_5m):max:dns_external_latency{*} > 200",
+  "message": "External DNS latency is above 200ms. Check internet connectivity or upstream DNS.\n\n@slack-ops",
+  "tags": ["dns", "external", "latency"],
+  "options": {
+    "notify_audit": false,
+    "locked": false,
+    "require_full_window": true,
+    "notify_no_data": false,
+    "renotify_interval": 60,
+    "evaluation_delay": 30,
+    "new_group_delay": 30,
+    "include_tags": false,
+    "thresholds": {
+      "critical": 500,
+      "warning": 200
+    }
+  }
+}
diff --git a/dns-monitoring/datadog-monitor-dns-latency-warning.json b/dns-monitoring/datadog-monitor-dns-latency-warning.json
new file mode 100644
index 0000000..f34f538
--- /dev/null
+++ b/dns-monitoring/datadog-monitor-dns-latency-warning.json
@@ -0,0 +1,21 @@
+{
+  "name": "DNS Latency High - WARNING",
+  "type": "query alert",
+  "query": "max(last_5m):max:dns_internal_latency{*} > 100",
+  "message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops",
+  "tags": ["dns", "latency", "warning"],
+  "options": {
+    "notify_audit": false,
+    "locked": false,
+    "require_full_window": true,
+    "notify_no_data": false,
+    "renotify_interval": 60,
+    "evaluation_delay": 30,
+    "new_group_delay": 30,
+    "include_tags": false,
+    "thresholds": {
+      "critical": 500,
+      "warning": 100
+    }
+  }
+}
diff --git a/openspec/changes/dns-keepalived-analysis/.openspec.yaml b/openspec/changes/dns-keepalived-analysis/.openspec.yaml
new file mode 100644
index 0000000..910badd
--- /dev/null
+++ b/openspec/changes/dns-keepalived-analysis/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-02-21
diff --git a/openspec/changes/dns-keepalived-analysis/design.md b/openspec/changes/dns-keepalived-analysis/design.md
new file mode 100644
index 0000000..533ebe1
--- /dev/null
+++ b/openspec/changes/dns-keepalived-analysis/design.md
@@ -0,0 +1,187 @@
+## Context
+
+### Current DNS Infrastructure
+
+**Wisconsin (marsAlpha, marsBeta)**
+- Network: 192.168.20.0/24
+- IPs: 192.168.20.250, 192.168.20.251
+
+**New York (terraOmega, terraPhi)**
+- Network: 192.168.100.0/24
+- IPs: 192.168.100.112, 192.168.100.110
+
+**Miyagi (alphaCenA, alphaCenB)**
+- Network: 192.168.3.0/24
+- IPs: 192.168.3.100, 192.168.3.101
+
+Each server runs:
+- Pi-hole container (DNS sinkhole on port 53)
+- Unbound (upstream resolver on port 5335)
+
+**Current client DNS configuration:**
+- Clients point to individual DNS servers or round-robin via DHCP
+- No automatic failover - if server down, clients fail until manual intervention
+
+### Constraints
+- Want to minimize changes
+- Brief downtime acceptable when I'm at keyboard
+- Prefer open-source, simple solutions
+- Current monitoring and restart playbook should continue to work
+
+## Goals / Non-Goals
+
+**Goals:**
+- Understand network changes required for DNS HA
+- Document failover behavior and timing
+- Identify common pitfalls and how to avoid them
+- Create implementation roadmap
+
+**Non-Goals:**
+- Do NOT implement HA (this is analysis only)
+- Do NOT change client configurations unless necessary
+- Do NOT modify existing monitoring/alerting
+
+## Decisions
+
+### D1: HA Approach Options
+
+**Option A: Keepalived/VRRP (Recommended for HA)**
+
+| Aspect | Details |
+|--------|---------|
+| How it works | Virtual IP (VIP) floats between servers using VRRP protocol |
+| Failover | Automatic, typically 3-5 seconds |
+| Network changes | Need to allow VRRP multicast, may need switch config |
+| Complexity | Medium - keepalived daemon on each DNS server |
+| Pros | True HA, automatic failover, widely used |
+| Cons | Requires network coordination, ARP cache issues |
+
+**Option B: HSRP (Cisco proprietary)**
+
+| Aspect | Details |
+|--------|---------|
+| How it works | Cisco's proprietary failover protocol |
+| Failover | Automatic |
+| Network changes | Requires Cisco equipment |
+| Complexity | Medium |
+| Pros | Integrated with Cisco gear |
+| Cons | Proprietary, not applicable to home network |
+
+**Option C: DNS Round-Robin (Simpler, no HA)**
+
+| Aspect | Details |
+|--------|---------|
+| How it works | Multiple A records for same hostname |
+| Failover | None - clients get all IPs, try sequentially |
+| Network changes | None |
+| Complexity | Low |
+| Pros | Simple, no daemon, no network changes |
+| Cons | No automatic failover, clients may cache failed IP |
+
+**Option D: Client-side failover (Alternative)**
+
+| Aspect | Details |
+|--------|---------|
+| How it works | Configure multiple DNS servers in DHCP/client |
+| Failover | Client-dependent (some OSes retry automatically) |
+| Network changes | None |
+| Complexity | Low |
+| Pros | No infrastructure changes |
+| Cons | Inconsistent behavior across clients |
+
+### D2: Recommended Approach
+
+**Recommendation: Option A (Keepalived) per region**
+
+- Each region (Wisconsin, New York, Miyagi) gets its own VIP
+- Pair servers within each region (e.g., marsAlpha ↔ marsBeta)
+- VIP becomes the DNS address for that region
+- If one server fails, VIP moves to other within seconds
+
+### D3: Network Architecture
+
+**VIP Address Plan (per region):**
+- Reserve .2 for DNS VIP in each subnet
+- DHCP range: .11 - .254 (leaving .2 for VIP, .10 for infrastructure)
+
+```
+Wisconsin (192.168.20.0/24):
+  marsAlpha (192.168.20.250) ←→ marsBeta (192.168.20.251)
+  VIP: 192.168.20.2
+  DHCP: 192.168.20.11 - 192.168.20.254
+
+New York (192.168.100.0/24):
+  terraOmega (192.168.100.112) ←→ terraPhi (192.168.100.110)
+  VIP: 192.168.100.2
+  DHCP: 192.168.100.11 - 192.168.100.254
+
+Miyagi (192.168.3.0/24):
+  alphaCenA (192.168.3.100) ←→ alphaCenB (192.168.3.101)
+  VIP: 192.168.3.2
+  DHCP: 192.168.3.11 - 192.168.3.254
+```
+
+Clients would use regional VIP (.2) as their DNS server.
+
+## Risks / Trade-offs
+
+### Network Risks
+
+- **[Risk]** VRRP multicast blocked by switch
+  - **Mitigation**: Check switch allows VRRP (multicast 224.0.0.18), or configure unicast
+
+- **[Risk]** ARP cache on switches/clients points to wrong server after failover
+  - **Mitigation**: Set lower `advertisement_intvl` and use `garp_master_delay`
+
+- **[Risk]** Both servers claim VIP (split-brain)
+  - **Mitigation**: Use `priority` to designate master, `preempt` to ensure master always holds VIP
+
+- **[Risk]** Network segmentation blocks VRRP packets
+  - **Mitigation**: Ensure firewall allows VRRP (protocol 112) between DNS servers
+
+### Operational Risks
+
+- **[Risk]** Keepalived misconfiguration causes failover storms
+  - **Mitigation**: Test failover in maintenance window, set appropriate timeouts
+
+- **[Risk]** Failover doesn't trigger DNS service restart
+  - **Mitigation**: Monitor both master and backup, verify DNS service on both
+
+- **[Risk]** Upstream Unbound not configured for VIP
+  - **Mitigation**: Unbound listens on all interfaces by default, verify
+
+### Performance Trade-offs
+
+- **Latency**: Minor increase during failover (seconds)
+- **Complexity**: Additional daemon to monitor and maintain
+- **Monitoring**: Need to monitor keepalived state, not just DNS
+
+## Implementation Roadmap
+
+### Phase 1: Network Preparation (Research)
+- [ ] Verify network equipment allows VRRP
+- [ ] Choose VIP addresses for each region
+- [ ] Test VRRP between pairs
+
+### Phase 2: Keepalived Configuration
+- [ ] Install keepalived on each DNS server
+- [ ] Configure master/backup pairs per region
+- [ ] Add firewall rules for VRRP
+
+### Phase 3: DNS Service Integration
+- [ ] Ensure pihole/unbound binds to VIP
+- [ ] Update monitoring to track keepalived state
+- [ ] Update restart playbook for HA (restart VIP holder)
+
+### Phase 4: Client Migration
+- [ ] Update DHCP to serve regional VIPs
+- [ ] Test failover manually
+- [ ] Monitor for issues
+
+## Open Questions
+
+1. ~~VIP addresses~~ - **RESOLVED**: Using .2 for each regional subnet
+2. ~~Network equipment~~ - **RESOLVED**: Ubiquity and commodity switches between servers - need VRRP compatibility check
+3. ~~Client migration~~ - **RESOLVED**: Migrate clients gradually
+4. ~~Cross-region failover~~ - **RESOLVED**: No - too complex
+5. ~~Monitoring~~ - **RESOLVED**: Yes - alerts for keepalived state changes
diff --git a/openspec/changes/dns-keepalived-analysis/proposal.md b/openspec/changes/dns-keepalived-analysis/proposal.md
new file mode 100644
index 0000000..6899215
--- /dev/null
+++ b/openspec/changes/dns-keepalived-analysis/proposal.md
@@ -0,0 +1,30 @@
+## Why
+
+The current DNS infrastructure (Pi-hole + Unbound) has no high availability. If a DNS server fails, clients lose DNS resolution until manual intervention. This is a single point of failure that impacts all services. We need to understand the architectural changes required to add DNS HA using keepalived/HSRP and/or round-robin DNS.
+
+## What Changes
+
+This is a **planning/analysis** change to document the requirements, risks, and architectural impact of adding DNS high availability.
+
+- Document current DNS infrastructure and dependencies
+- Analyze keepalived/VRRP/HSRP options for DNS failover
+- Analyze DNS round-robin load balancing as an alternative
+- Document network changes required (VIP, ARP, etc.)
+- Identify common pitfalls and failure modes
+- Create implementation roadmap
+
+## Capabilities
+
+### New Capabilities
+- `dns-ha-analysis`: Analysis document covering keepalived/HSRP/round-robin options, network requirements, failover behavior, and common pitfalls for DNS HA deployment.
+
+### Modified Capabilities
+- (None - this is a new analysis)
+
+## Impact
+
+- **Network**: May require VIP (Virtual IP) configuration, network switch changes for ARP
+- **DNS Servers**: Keepalived daemon, additional network config
+- **Clients**: May need updated DNS client configurations to use VIP
+- **Monitoring**: Update monitoring to track HA state
+- **Existing Playbooks**: Restart playbook should continue to work
diff --git a/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md b/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md
new file mode 100644
index 0000000..9a7c409
--- /dev/null
+++ b/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md
@@ -0,0 +1,83 @@
+## ADDED Requirements
+
+### Requirement: Network compatibility analysis
+The analysis SHALL document whether the existing network infrastructure supports VRRP/keepalived.
+
+#### Scenario: VRRP multicast allowed
+- **WHEN** network switches allow VRRP multicast (224.0.0.18)
+- **THEN** keepalived can operate in full multicast mode
+
+#### Scenario: VRRP multicast blocked
+- **WHEN** network switches block VRRP multicast
+- **THEN** keepalived can be configured for unicast mode
+
+#### Scenario: VRRP not supported
+- **WHEN** network equipment does not support VRRP
+- **THEN** alternative HA solutions (round-robin DNS or client-side) must be used
+
+### Requirement: Failover behavior specification
+The analysis SHALL document expected failover timing and behavior.
+
+#### Scenario: Primary server fails
+- **WHEN** the primary DNS server becomes unreachable
+- **THEN** the VIP should migrate to backup within 5 seconds
+
+#### Scenario: Primary recovers
+- **WHEN** the primary DNS server becomes available again
+- **THEN** the VIP should migrate back to primary based on preemption policy
+
+#### Scenario: Both servers fail
+- **WHEN** all DNS servers in a region are down
+- **THEN** clients should timeout and move to secondary DNS (if configured)
+
+### Requirement: Network address planning
+The analysis SHALL specify VIP addresses for each region.
+
+#### Scenario: Wisconsin VIP
+- **WHEN** planning Wisconsin DNS HA
+- **THEN** use 192.168.20.2 as VIP, reserve in DHCP (range: .11-.254)
+
+#### Scenario: New York VIP
+- **WHEN** planning New York DNS HA
+- **THEN** use 192.168.100.2 as VIP, reserve in DHCP (range: .11-.254)
+
+#### Scenario: Miyagi VIP
+- **WHEN** planning Miyagi DNS HA
+- **THEN** use 192.168.3.2 as VIP, reserve in DHCP (range: .11-.254)
+
+### Requirement: Common pitfalls documentation
+The analysis SHALL document known pitfalls and mitigations.
+
+#### Scenario: ARP cache stale
+- **WHEN** failover occurs
+- **THEN** client ARP caches may point to wrong MAC for VIP
+- **MITIGATION**: Use lower advertisement interval and garp_master_delay
+
+#### Scenario: Split-brain
+- **WHEN** both servers claim VIP
+- **THEN** DNS responses are inconsistent
+- **MITIGATION**: Use priority and preempt settings correctly
+
+#### Scenario: Firewall blocks VRRP
+- **WHEN** firewalls between servers block VRRP protocol
+- **THEN** failover cannot occur
+- **MITIGATION**: Allow protocol 112 and multicast
+
+### Requirement: Implementation roadmap
+The analysis SHALL provide a phased implementation plan.
+
+#### Scenario: Phase 1 - Network prep
+- **WHEN** beginning HA implementation
+- **THEN** verify network equipment (Ubiquity + commodity switches) allows VRRP
+
+#### Scenario: Phase 2 - Keepalived config
+- **WHEN** network is ready
+- **THEN** install and configure keepalived on each server pair
+
+#### Scenario: Phase 3 - Integration
+- **WHEN** keepalived is running
+- **THEN** integrate with DNS service and monitoring
+
+#### Scenario: Phase 4 - Client migration
+- **WHEN** HA is tested
+- **THEN** update DHCP to serve VIPs to clients gradually
diff --git a/openspec/changes/dns-keepalived-analysis/tasks.md b/openspec/changes/dns-keepalived-analysis/tasks.md
new file mode 100644
index 0000000..26f0599
--- /dev/null
+++ b/openspec/changes/dns-keepalived-analysis/tasks.md
@@ -0,0 +1,34 @@
+## 1. Network Research
+
+- [x] 1.1 Identify network equipment (switches/routers) between DNS servers
+- [ ] 1.2 Verify Ubiquity switch allows VRRP multicast (224.0.0.18)
+- [ ] 1.3 Verify commodity switches allow VRRP
+- [ ] 1.4 Check firewall rules between DNS server pairs
+
+## 2. Address Planning
+
+- [x] 2.1 Choose VIP for Wisconsin - 192.168.20.2
+- [x] 2.2 Choose VIP for New York - 192.168.100.2
+- [x] 2.3 Choose VIP for Miyagi - 192.168.3.2
+- [ ] 2.4 Verify VIPs are unused in each subnet
+
+## 3. Implementation Design
+
+- [ ] 3.1 Design keepalived configuration for each pair
+- [ ] 3.2 Plan firewall rules for VRRP protocol (protocol 112)
+- [ ] 3.3 Design monitoring for keepalived state
+- [ ] 3.4 Plan DHCP changes (reserve .2 for VIP, shift range to .11-.254)
+
+## 4. Risk Analysis
+
+- [x] 4.1 Document ARP cache behavior after failover
+- [x] 4.2 Document split-brain prevention
+- [x] 4.3 Document failover timing expectations
+- [ ] 4.4 Plan testing strategy
+
+## 5. Decision Documentation
+
+- [x] 5.1 Decide on HA approach - keepalived/VRRP
+- [ ] 5.2 Decide on preemption policy (immediate vs delayed)
+- [x] 5.3 Decide on cross-region fallback - NO
+- [x] 5.4 Document final recommendations
diff --git a/openspec/changes/dns-server-monitoring-tool/.openspec.yaml b/openspec/changes/dns-server-monitoring-tool/.openspec.yaml
new file mode 100644
index 0000000..910badd
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-02-21
diff --git a/openspec/changes/dns-server-monitoring-tool/design.md b/openspec/changes/dns-server-monitoring-tool/design.md
new file mode 100644
index 0000000..524b303
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/design.md
@@ -0,0 +1,97 @@
+## Context
+
+The home DNS infrastructure consists of three Pi-hole + Unbound servers:
+- `all` (primary)
+- `new york`
+- `miyami` (note: user said "miyagi" initially but I'll use what they clarified)
+
+Currently:
+- Containers run on Podman but the healthcheck is not working
+- No centralized monitoring beyond basic container health
+- No automated way to restart or diagnose DNS issues
+- Alerts go to nowhere meaningful
+
+Constraints:
+- Keep changes minimal - we're breaking the house is acceptable but we want to minimize risk
+- Use existing tools where possible (Datadog already in use for alerts → Pagerduty)
+- Playbooks live in ~/dev/src/mkrasberry
+
+## Goals / Non-Goals
+
+**Goals:**
+- Implement OpenTelemetry-compatible DNS health metrics per server
+- Configure Datadog to scrape metrics and trigger Pagerduty alerts
+- Create a restart playbook that safely restarts DNS services with diagnostics
+- Fix the Podman healthcheck issue
+
+**Non-Goals:**
+- Load balancing or HA setup (that's the separate `dns-keepalived-analysis` change)
+- Modifying DNS server configurations or zone data
+- Long-term metric storage solutions beyond Datadog
+- Auto-remediation beyond the restart playbook
+
+## Decisions
+
+### D1: Metrics Collection Method
+**Decision:** Use Pi-hole's built-in Teleporter API + a sidecar Prometheus exporter
+
+**Alternatives considered:**
+- Direct Prometheus scraping of Pi-hole's internal metrics (limited)
+- Custom Python script parsing logs (fragile)
+- OpenTelemetry collector on each DNS server (additional resource overhead)
+
+**Rationale:** Pi-hole exposes query logs and statistics via its API. A lightweight exporter can poll this and expose OpenTelemetry-compatible metrics. This reuses existing Pi-hole functionality and minimizes install footprint.
+
+### D2: Alerting Pipeline
+**Decision:** Prometheus scraper → Datadog Agent → Datadog Monitor → Pagerduty
+
+**Alternatives considered:**
+- Prometheus Alertmanager → Pagerduty (adds complexity)
+- Push directly to Datadog API (loses Datadog monitor features)
+- Custom webhook → Pagerduty (reinventing)
+
+**Rationale:** Datadog is already in the stack. Datadog Agents can be installed on the DNS servers to scrape the metrics endpoint and forward to Datadog. This leverages existing infrastructure.
+
+### D3: Restart Playbook Approach
+**Decision:** Ansible playbook with SSH key authentication
+
+**Alternatives considered:**
+- Fabric/Paramiko script (less idempotent)
+- Direct SSH commands (not auditable, harder to maintain)
+- systemd timers on the DNS servers (tightly coupled)
+
+**Rationale:** Ansible provides idempotency, clear execution flow, and is already used in ~/dev/src/mkrasberry. The playbook will:
+1. SSH to target server
+2. `sudo -i` to become root
+3. `su - pihole -c "pihole restartdns"` to restart as the pihole user
+4. Collect `pihole -d` diagnostic output if restart fails or service appears unhealthy
+
+### D4: Healthcheck Fix Approach
+**Decision:** Investigate Podman healthcheck exec vs httpGet, implement whichever works
+
+**Alternatives considered:**
+- Switch to Docker Compose (not using Docker)
+- Remove healthcheck (loses container restart automation)
+- Use a wrapper script (adds complexity)
+
+**Rationale:** Podman healthchecks have known quirks with certain exec commands. Will test both `exec` (running a check inside container) and `httpGet` (hitting Pi-hole's web interface) to find what works.
+
+## Risks / Trade-offs
+
+- **[Risk]** Datadog Agent on DNS servers adds resource overhead
+  - **Mitigation:** Use minimal metric collection, Agent can be configured to scrape only what's needed
+
+- **[Risk]** SSH-based restart requires key management
+  - **Mitigation:** Use existing SSH keys from ~/dev/src/mkrasberry, limit to specific users/commands via sudo
+
+- **[Risk]** Healthcheck fix may require container recreation
+  - **Mitigation:** Document the change, ensure backup of current container state before testing
+
+- **[Risk]** Alert storms if DNS flaps
+  - **Mitigation:** Set reasonable hysteresis (e.g., 5 minute failure threshold before alerting)
+
+## Open Questions
+
+- Should the restart playbook include automatic execution or remain manual-only?
+- What's the desired alert escalation timeline (how many failures before Pagerduty triggers)?
+- Are there specific metrics beyond "is DNS working" that should trigger alerts (e.g., query volume drop, high latency)?
diff --git a/openspec/changes/dns-server-monitoring-tool/proposal.md b/openspec/changes/dns-server-monitoring-tool/proposal.md
new file mode 100644
index 0000000..c28d806
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/proposal.md
@@ -0,0 +1,28 @@
+## Why
+
+The home DNS infrastructure (Pi-hole + Unbound on multiple servers: all, new york, miyami) is experiencing flaky behavior. Current monitoring is insufficient - the container has a healthcheck that isn't working with Podman, and there's no standardized way to restart or diagnose issues. We need a monitoring and operational toolkit to reduce noise and enable reliable DNS operations.
+
+## What Changes
+
+- Create a DNS monitoring tool with OpenTelemetry-compatible metrics collection per server
+- Integrate health alerts with Datadog which triggers Pagerduty
+- Build a restart playbook that SSHs to DNS servers, escalates privileges, switches to pihole user, and restarts the service while collecting diagnostic information
+- Investigate and fix the Podman healthcheck compatibility issue for the DNS container
+
+## Capabilities
+
+### New Capabilities
+- `dns-health-monitoring`: OpenTelemetry-compatible DNS health metrics collection at individual server level (all, new york, miyami). Exposes metrics for Datadog agent to scrape and forward to Datadog.
+- `dns-alerting`: Datadog alert configuration that monitors DNS health metrics and triggers Pagerduty notifications when thresholds are breached.
+- `dns-restart-playbook`: Ansible-style playbook (in ~/dev/src/mkrasberry) to SSH to DNS servers, restart the pihole service as the pihole user, and collect diagnostic output if the service is down.
+- `pihole-healthcheck-fix`: Investigation and fix for the Podman healthcheck that is currently not functioning with the DNS container.
+
+### Modified Capabilities
+- (None - this is a new capability set)
+
+## Impact
+
+- New code in ~/dev/src/mkrasberry (playbook, monitoring config)
+- Datadog monitoring configuration changes
+- Pagerduty alert routing configuration
+- No changes to production DNS infrastructure - operational tooling only
diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md
new file mode 100644
index 0000000..6e2445b
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md
@@ -0,0 +1,30 @@
+## ADDED Requirements
+
+### Requirement: Datadog monitors DNS health
+The system SHALL configure Datadog monitors to alert when DNS health metrics indicate failure.
+
+#### Scenario: Server down alert
+- **WHEN** the `dns_up` metric equals 0 for 5 consecutive minutes
+- **THEN** Datadog triggers a WARNING alert
+
+#### Scenario: Server prolonged outage
+- **WHEN** the `dns_up` metric equals 0 for 15 consecutive minutes
+- **THEN** Datadog triggers a CRITICAL alert and notifies Pagerduty
+
+### Requirement: Pagerduty notification
+The system SHALL route critical DNS alerts to Pagerduty for escalation.
+
+#### Scenario: Critical alert triggers
+- **WHEN** a CRITICAL Datadog monitor fires for DNS health
+- **THEN** Pagerduty receives the alert and creates an incident
+
+#### Scenario: Alert recovers
+- **WHEN** the DNS server recovers (`dns_up` returns to 1)
+- **THEN** Datadog sends a recovery notification to Pagerduty, resolving the incident
+
+### Requirement: Alert configuration managed as code
+The system SHALL store Datadog monitor definitions in version control.
+
+#### Scenario: Monitor definition in repo
+- **WHEN** a team member reviews monitor configuration
+- **THEN** they can view the JSON/YAML definition in the repository under monitoring/datadog/
diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md
new file mode 100644
index 0000000..998de70
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md
@@ -0,0 +1,38 @@
+## ADDED Requirements
+
+### Requirement: DNS health metrics collection
+The system SHALL collect DNS health metrics from each Pi-hole server (all, new york, miyami) using OpenTelemetry-compatible format.
+
+#### Scenario: Metrics endpoint accessible
+- **WHEN** Datadog Agent scrapes the metrics endpoint on a DNS server
+- **THEN** the response includes `dns_query_total`, `dns_up`, `dns_response_time_ms`, and `dns_upstream_response_time_ms` metrics with server-specific labels
+
+#### Scenario: Metrics include server identity
+- **WHEN** metrics are collected from any DNS server
+- **THEN** each metric includes a `server` label with value matching the server identifier (all, new_york, or miyami)
+
+#### Scenario: Metrics collected at regular intervals
+- **WHEN** Datadog Agent is configured to scrape metrics
+- **THEN** collection occurs at 60-second intervals by default
+
+### Requirement: Individual server health status
+The system SHALL expose a binary `dns_up` metric indicating whether each DNS server is responding to queries.
+
+#### Scenario: Server healthy
+- **WHEN** the DNS server is responding to queries
+- **THEN** the `dns_up` metric value is 1
+
+#### Scenario: Server unhealthy
+- **WHEN** the DNS server is not responding to queries
+- **THEN** the `dns_up` metric value is 0
+
+### Requirement: Query statistics
+The system SHALL expose metrics about DNS query volume and upstream response times.
+
+#### Scenario: Query count tracked
+- **WHEN** DNS queries are processed
+- **THEN** the `dns_query_total` counter increments with query type and status labels
+
+#### Scenario: Upstream latency measured
+- **WHEN** Unbound processes upstream requests
+- **THEN** the `dns_upstream_response_time_ms` histogram records response times in milliseconds
diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md
new file mode 100644
index 0000000..3d9df2c
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md
@@ -0,0 +1,52 @@
+## ADDED Requirements
+
+### Requirement: SSH connectivity to DNS servers
+The playbook SHALL establish SSH connections to each DNS server using configured credentials.
+
+#### Scenario: SSH connection successful
+- **WHEN** the playbook runs against a DNS server
+- **THEN** an SSH connection is established using key-based authentication
+
+#### Scenario: SSH connection fails
+- **WHEN** SSH connection to a DNS server fails
+- **THEN** the playbook aborts with a clear error message indicating which server failed
+
+### Requirement: Privilege escalation to root
+The playbook SHALL escalate to root privileges after SSH connection.
+
+#### Scenario: Escalation successful
+- **WHEN** sudo or su is executed
+- **THEN** the shell runs as root user
+
+#### Scenario: Escalation denied
+- **WHEN** privilege escalation fails
+- **THEN** the playbook aborts with an error and does not proceed
+
+### Requirement: Restart pihole service as pihole user
+The playbook SHALL restart the Pi-hole DNS service while running as the pihole user.
+
+#### Scenario: Restart executed
+- **WHEN** the playbook runs the restart command as pihole user
+- **THEN** the command `pihole restartdns` executes successfully
+
+#### Scenario: Restart fails
+- **WHEN** the restart command returns a non-zero exit code
+- **THEN** the playbook captures diagnostic output and reports failure
+
+### Requirement: Diagnostic collection on failure
+The playbook SHALL collect diagnostic information when the DNS service fails to restart or appears unhealthy.
+
+#### Scenario: Diagnostics collected
+- **WHEN** restart fails or healthcheck indicates issues
+- **THEN** the playbook runs `pihole -d` and captures output to a log file
+
+#### Scenario: Diagnostics saved
+- **WHEN** diagnostics are collected
+- **THEN** output is saved to a timestamped file in the playbook logs directory
+
+### Requirement: Playbook idempotency
+The playbook SHALL be idempotent - running it multiple times produces the same result.
+
+#### Scenario: Multiple runs
+- **WHEN** the playbook is run multiple times on a healthy server
+- **THEN** the server remains healthy and no errors are reported
diff --git a/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md
new file mode 100644
index 0000000..a792265
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md
@@ -0,0 +1,34 @@
+## ADDED Requirements
+
+### Requirement: Podman healthcheck functional
+The Podman container running Pi-hole SHALL have a working healthcheck that correctly reports container health.
+
+#### Scenario: Healthcheck passes
+- **WHEN** the DNS container is healthy
+- **THEN** `podman inspect` shows `Health` status as "healthy"
+
+#### Scenario: Healthcheck fails
+- **WHEN** the DNS container is unhealthy
+- **THEN** `podman inspect` shows `Health` status as "unhealthy" and triggers any configured container restart policy
+
+### Requirement: Healthcheck uses appropriate check method
+The healthcheck SHALL use either exec or httpGet method that works reliably with Podman.
+
+#### Scenario: Exec method works
+- **WHEN** healthcheck uses exec to run a command inside the container
+- **THEN** the command executes successfully and returns exit code 0
+
+#### Scenario: HTTP method works
+- **WHEN** healthcheck uses httpGet to ping Pi-hole web interface
+- **THEN** the HTTP request returns 200 OK within the timeout period
+
+### Requirement: Healthcheck checks DNS functionality
+The healthcheck SHALL verify that DNS resolution is actually working, not just that the container is running.
+
+#### Scenario: DNS resolution verified
+- **WHEN** healthcheck runs
+- **THEN** it performs an actual DNS query (e.g., `dig +short pi.hole @127.0.0.1`) and verifies a response
+
+#### Scenario: DNS resolution fails
+- **WHEN** DNS resolution fails
+- **THEN** the healthcheck returns a non-zero exit code, marking the container unhealthy
diff --git a/openspec/changes/dns-server-monitoring-tool/tasks.md b/openspec/changes/dns-server-monitoring-tool/tasks.md
new file mode 100644
index 0000000..25d823d
--- /dev/null
+++ b/openspec/changes/dns-server-monitoring-tool/tasks.md
@@ -0,0 +1,42 @@
+## 1. DNS Health Metrics Collection
+
+- [x] 1.1 Create metrics exporter script in ~/dev/src/mkrasberry/dns-monitoring/
+- [x] 1.2 Implement DNS query count collection from Pi-hole API
+- [x] 1.3 Implement dns_up binary metric (health check)
+- [x] 1.4 Implement dns_response_time_ms metric
+- [x] 1.5 Implement dns_upstream_response_time_ms histogram
+- [x] 1.6 Configure server-specific labels (all, new_york, miyami)
+- [x] 1.7 Expose OpenTelemetry-compatible /metrics endpoint
+- [x] 1.8 Test metrics collection on each DNS server
+
+## 2. Datadog Integration
+
+- [x] 2.1 Configure Telegraf to poll Pi-hole API (using inventory)
+- [x] 2.2 Configure Telegraf to run DNS latency checks (internal + external)
+- [x] 2.3 Configure Unbound access-control for monitoring host
+- [x] 2.4 Create Datadog monitor for dns_up (WARNING: 5min)
+- [x] 2.5 Create Datadog monitor for dns_up (CRITICAL: 15min)
+- [ ] 2.6 Configure Pagerduty integration for critical alerts
+- [ ] 2.7 Test alert firing and Pagerduty notification
+
+## 3. Restart Playbook
+
+- [x] 3.1 Create Ansible playbook structure in ~/dev/src/mkrasberry/
+- [x] 3.2 Add DNS server inventory (all, new_york, miyami)
+- [x] 3.3 Implement SSH connection task
+- [x] 3.4 Implement privilege escalation (sudo/su)
+- [x] 3.5 Implement pihole restart as pihole user
+- [x] 3.6 Add diagnostic collection on failure (pihole -d)
+- [x] 3.7 Add log output saving with timestamps
+- [x] 3.8 Test playbook against a single server (syntax verified, no running pihole to test)
+- [x] 3.9 Document playbook usage
+
+## 4. Podman Healthcheck Fix
+
+- [x] 4.1 Investigate current healthcheck configuration
+- [x] 4.2 Test exec-based healthcheck inside container
+- [x] 4.3 Test httpGet healthcheck against Pi-hole web interface
+- [x] 4.4 Implement working healthcheck (choose exec or httpGet)
+- [x] 4.5 Verify healthcheck returns correct status for DNS functionality
+- [x] 4.6 Apply healthcheck fix to container (may require recreation)
+- [x] 4.7 Verify container auto-restart on failure works (requires pihole container running)
diff --git a/openspec/changes/podman-overlayfs.md b/openspec/changes/podman-overlayfs.md
new file mode 100644
index 0000000..91ae8f0
--- /dev/null
+++ b/openspec/changes/podman-overlayfs.md
@@ -0,0 +1,17 @@
+# Podman OverlayFS Configuration
+
+## Goal
+Configure Podman to use `overlay` as the default storage driver across the fleet. This ensures consistent performance and behavior for container storage.
+
+## Proposed Changes
+- Ensure `fuse-overlayfs` package is installed (via `roles/podmanSetupRaspberryPi/tasks/install.yml` and `playbooks/initial-playbook-stage-5.yml`).
+- Modify `roles/podmanSetupRaspberryPi/tasks/configure.yml` and `playbooks/initial-playbook-stage-5.yml` to:
+    - Ensure `/etc/containers/storage.conf` exists.
+    - Explicitly set `driver = "overlay"` in the `[storage]` section of `/etc/containers/storage.conf` using `ini_file` module.
+    - Ensure `graphroot` is set to `/var/lib/containers/storage`.
+
+## Verification
+- Run the role on a host.
+- Check `/etc/containers/storage.conf`:
+    - `[storage]` section should have `driver = "overlay"`.
+- Run `podman info` to verify the storage driver is active.
diff --git a/openspec/config.yaml b/openspec/config.yaml
new file mode 100644
index 0000000..392946c
--- /dev/null
+++ b/openspec/config.yaml
@@ -0,0 +1,20 @@
+schema: spec-driven
+
+# Project context (optional)
+# This is shown to AI when creating artifacts.
+# Add your tech stack, conventions, style guides, domain knowledge, etc.
+# Example:
+#   context: |
+#     Tech stack: TypeScript, React, Node.js
+#     We use conventional commits
+#     Domain: e-commerce platform
+
+# Per-artifact rules (optional)
+# Add custom rules for specific artifacts.
+# Example:
+#   rules:
+#     proposal:
+#       - Keep proposals under 500 words
+#       - Always include a "Non-goals" section
+#     tasks:
+#       - Break tasks into chunks of max 2 hours
diff --git a/playbooks/dns-restart.yml b/playbooks/dns-restart.yml
new file mode 100644
index 0000000..d063e7c
--- /dev/null
+++ b/playbooks/dns-restart.yml
@@ -0,0 +1,68 @@
+---
+# DNS Restart Playbook
+# Restarts pihole container on target hosts and collects diagnostics on failure
+#
+# Usage:
+#   # Restart all DNS servers
+#   ansible-playbook -i mkrasberry_config/hosts -e hostlist=pihole playbooks/dns-restart.yml
+#
+#   # Restart specific region
+#   ansible-playbook -i mkrasberry_config/hosts -e hostlist=wisconsin_linux playbooks/dns-restart.yml
+#   ansible-playbook -i mkrasberry_config/hosts -e hostlist=newyork_linux playbooks/dns-restart.yml
+#   ansible-playbook -i mkrasberry_config/hosts -e hostlist=miyagi_linux playbooks/dns-restart.yml
+#
+#   # Restart single server
+#   ansible-playbook -i mkrasberry_config/hosts -e hostlist=marsAlpha playbooks/dns-restart.yml
+#
+# Diagnostics are saved to logs/ directory on failure
+
+- hosts: "{{ hostlist | default('pihole') }}"
+  become: true
+  vars:
+    pihole_user: "pihole"
+  tasks:
+    - name: Check pihole container status
+      command: sudo -u {{ pihole_user }} podman ps --filter name=pihole
+      register: pihole_status
+      changed_when: false
+
+    - name: Restart pihole service
+      command: sudo -u {{ pihole_user }} sh -c "podman stop pihole && podman start pihole"
+      register: restart_result
+      failed_when: restart_result.rc != 0
+
+    - name: Restart unbound service
+      systemd:
+        name: unbound
+        state: restarted
+      become: true
+
+    - name: Verify restart succeeded
+      fail:
+        msg: "Pihole restart failed with rc={{ restart_result.rc }}"
+      when: restart_result.rc not in [0, 3]
+
+    - name: Display restart result
+      debug:
+        msg: "Pihole restarted successfully on {{ inventory_hostname }}"
+
+    - name: Collect summary diagnostics on failure
+      command: sudo -u {{ pihole_user }} podman exec pihole pihole -d -a
+      register: pihole_diag
+      failed_when: false
+      when: restart_result is failed
+
+    - name: Save diagnostics to remote temp file
+      copy:
+        content: "{{ pihole_diag.stdout }}"
+        dest: "/tmp/pihole-diag-{{ ansible_date_time.epoch }}.log"
+      when: pihole_diag is defined and pihole_diag.stdout is defined
+      changed_when: true
+
+    - name: Fetch diagnostics to local logs directory
+      fetch:
+        src: "/tmp/pihole-diag-{{ ansible_date_time.epoch }}.log"
+        dest: "logs/pihole-diag-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.log"
+        flat: yes
+      when: pihole_diag is defined and pihole_diag.stdout is defined
+      failed_when: false
diff --git a/playbooks/initial-playbook-stage-5.yml b/playbooks/initial-playbook-stage-5.yml
index 98a0aeb..729a436 100644
--- a/playbooks/initial-playbook-stage-5.yml
+++ b/playbooks/initial-playbook-stage-5.yml
@@ -9,6 +9,7 @@
           - tcpdump
           - jq
           - podman
+          - fuse-overlayfs
         state: present
         force_apt_get: yes
         cache_valid_time: 84600
@@ -43,6 +44,24 @@
         state: present
         insertafter: EOF
         line: if [ $(id -u) != 0 ]; then export XDG_RUNTIME_DIR=/var/run/user/$(id -u)/; fi
+
+    - name: Ensure Podman storage driver is overlay
+      ini_file:
+        path: /etc/containers/storage.conf
+        section: storage
+        option: driver
+        value: '"overlay"'
+        mode: '0644'
+        create: yes
+
+    - name: Ensure Podman graphroot is set
+      ini_file:
+        path: /etc/containers/storage.conf
+        section: storage
+        option: graphroot
+        value: '"/var/lib/containers/storage"'
+        mode: '0644'
+        create: yes
 #        git config --global alias.co checkout
 #        git config --global alias.br branch
 #        git config --global alias.ci commit
diff --git a/roles/pihole.container b/roles/pihole.container
index 162604c..2a32007 160000
--- a/roles/pihole.container
+++ b/roles/pihole.container
@@ -1 +1 @@
-Subproject commit 162604cb955e7f8e18b4be37da67ac6bdf8cca9a
+Subproject commit 2a320077f7d59d06362dd047083f572c70a8e4a4
diff --git a/roles/podmanSetupRaspberryPi b/roles/podmanSetupRaspberryPi
index cf84a87..077f312 160000
--- a/roles/podmanSetupRaspberryPi
+++ b/roles/podmanSetupRaspberryPi
@@ -1 +1 @@
-Subproject commit cf84a87c0d561e25387816299aa419cc3327d0b5
+Subproject commit 077f3124fb617fff14a3fb62758d4632f6f0b143
diff --git a/roles/telegraf-container-service/defaults/main.yml b/roles/telegraf-container-service/defaults/main.yml
index 13be010..aa902c8 100644
--- a/roles/telegraf-container-service/defaults/main.yml
+++ b/roles/telegraf-container-service/defaults/main.yml
@@ -2,3 +2,9 @@
 remote_config_url: "https://europe-west1-1.gcp.cloud2.influxdata.com/api/v2/telegrafs/5d0d0391a43a9023"
 influx_db_url: "https://europe-west1-1.gcp.cloud2.influxdata.com"
 influx_cloud_token: "{{influx_cloud_token}}"
+
+telegraf_dns_monitoring_enabled: true
+telegraf_dns_external_domains:
+  - google.com
+  - cloudflare.com
+  - github.com
diff --git a/roles/telegraf-container-service/tasks/config.yml b/roles/telegraf-container-service/tasks/config.yml
index 52aee6b..6385c04 100644
--- a/roles/telegraf-container-service/tasks/config.yml
+++ b/roles/telegraf-container-service/tasks/config.yml
@@ -25,3 +25,12 @@
     owner: "{{container_user}}"
     group: "{{container_group}}"
   when: "telegraf_metrics_iot is defined"
+
+- name: copy dns monitoring config into place
+  template:
+    src: templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2
+    dest: "{{ container_prefix }}/etc/telegraf.d/dns-monitoring.conf"
+    mode: 0644
+    owner: "{{container_user}}"
+    group: "{{container_group}}"
+  when: "telegraf_dns_monitoring_enabled | default(true)"
diff --git a/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2 b/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2
new file mode 100644
index 0000000..31b6059
--- /dev/null
+++ b/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2
@@ -0,0 +1,56 @@
+# DNS Monitoring - Pi-hole API
+# Queries Pi-hole API from all pihole servers in inventory
+{% for host in groups['pihole'] %}
+[[inputs.http]]
+  urls = ["http://{{ hostvars[host].ansible_host }}/admin/api.php?summaryRaw"]
+  name_override = "pihole"
+  interval = "60s"
+  timeout = "10s"
+  [inputs.http.tags]
+    server = "{{ host }}.{{ dns_subdomain_base }}"
+
+{% endfor %}
+
+# DNS Latency - Internal
+# Query each Unbound from each pihole server in inventory
+# Tests that internal DNS resolution works across the fleet
+{% for host in groups['pihole'] %}
+[[inputs.exec]]
+  commands = ["dig +short {{ host }}.{{ dns_subdomain_base }} @{{ hostvars[host].ansible_host }}#5335 +time=3 +tries=2"]
+  name_override = "dns_internal_latency"
+  interval = "60s"
+  data_format = "value"
+  data_type = "integer"
+  [inputs.exec.tags]
+    server = "{{ host }}.{{ dns_subdomain_base }}"
+    target = "{{ host }}.{{ dns_subdomain_base }}"
+
+{% endfor %}
+
+# DNS Latency - External
+# Query public DNS to verify internet connectivity
+# Tests that upstream DNS resolution works
+{% for domain in telegraf_dns_external_domains | default(['google.com', 'cloudflare.com', 'github.com']) %}
+[[inputs.exec]]
+  commands = ["dig +short {{ domain }} @8.8.8.8 +time=3 +tries=2"]
+  name_override = "dns_external_latency"
+  interval = "60s"
+  data_format = "value"
+  data_type = "integer"
+  [inputs.exec.tags]
+    server = "{{ inventory_hostname }}.{{ dns_subdomain_base }}"
+    upstream = "google-dns"
+    domain = "{{ domain }}"
+
+[[inputs.exec]]
+  commands = ["dig +short {{ domain }} @1.1.1.1 +time=3 +tries=2"]
+  name_override = "dns_external_latency"
+  interval = "60s"
+  data_format = "value"
+  data_type = "integer"
+  [inputs.exec.tags]
+    server = "{{ inventory_hostname }}.{{ dns_subdomain_base }}"
+    upstream = "cloudflare-dns"
+    domain = "{{ domain }}"
+
+{% endfor %}