diff --git a/dns-monitoring/datadog-monitor-dns-down-critical.json b/dns-monitoring/datadog-monitor-dns-down-critical.json new file mode 100644 index 0000000..c9400f9 --- /dev/null +++ b/dns-monitoring/datadog-monitor-dns-down-critical.json @@ -0,0 +1,20 @@ +{ + "name": "DNS Server Down - CRITICAL - PageDuty", + "type": "query alert", + "query": "max(last_15m):max:pihole.dns_up{*} by {server} < 1", + "message": "CRITICAL: DNS server {{server.name}} has been down for 15 minutes. Triggering PagerDuty.\n\n@pagerduty", + "tags": ["dns", "pihole", "critical"], + "options": { + "notify_audit": false, + "locked": false, + "require_full_window": true, + "notify_no_data": false, + "renotify_interval": 15, + "evaluation_delay": 30, + "new_group_delay": 30, + "include_tags": false, + "thresholds": { + "critical": 1 + } + } +} diff --git a/dns-monitoring/datadog-monitor-dns-down-warning.json b/dns-monitoring/datadog-monitor-dns-down-warning.json new file mode 100644 index 0000000..cb4f06e --- /dev/null +++ b/dns-monitoring/datadog-monitor-dns-down-warning.json @@ -0,0 +1,21 @@ +{ + "name": "DNS Server Down - WARNING", + "type": "query alert", + "query": "max(last_5m):max:pihole.dns_up{*} by {server} < 1", + "message": "DNS server {{server.name}} has been down for 5 minutes. \n\n@slack-ops", + "tags": ["dns", "pihole", "warning"], + "options": { + "notify_audit": false, + "locked": false, + "require_full_window": true, + "notify_no_data": false, + "renotify_interval": 30, + "evaluation_delay": 30, + "new_group_delay": 30, + "include_tags": false, + "thresholds": { + "critical": 1, + "warning": 1 + } + } +} diff --git a/dns-monitoring/datadog-monitor-dns-external-latency.json b/dns-monitoring/datadog-monitor-dns-external-latency.json new file mode 100644 index 0000000..b258789 --- /dev/null +++ b/dns-monitoring/datadog-monitor-dns-external-latency.json @@ -0,0 +1,21 @@ +{ + "name": "DNS External Latency High", + "type": "query alert", + "query": "max(last_5m):max:dns_external_latency{*} > 200", + "message": "External DNS latency is above 200ms. Check internet connectivity or upstream DNS.\n\n@slack-ops", + "tags": ["dns", "external", "latency"], + "options": { + "notify_audit": false, + "locked": false, + "require_full_window": true, + "notify_no_data": false, + "renotify_interval": 60, + "evaluation_delay": 30, + "new_group_delay": 30, + "include_tags": false, + "thresholds": { + "critical": 500, + "warning": 200 + } + } +} diff --git a/dns-monitoring/datadog-monitor-dns-latency-warning.json b/dns-monitoring/datadog-monitor-dns-latency-warning.json new file mode 100644 index 0000000..f34f538 --- /dev/null +++ b/dns-monitoring/datadog-monitor-dns-latency-warning.json @@ -0,0 +1,21 @@ +{ + "name": "DNS Latency High - WARNING", + "type": "query alert", + "query": "max(last_5m):max:dns_internal_latency{*} > 100", + "message": "DNS latency on {{server.name}} is above 100ms for 5 minutes.\n\n@slack-ops", + "tags": ["dns", "latency", "warning"], + "options": { + "notify_audit": false, + "locked": false, + "require_full_window": true, + "notify_no_data": false, + "renotify_interval": 60, + "evaluation_delay": 30, + "new_group_delay": 30, + "include_tags": false, + "thresholds": { + "critical": 500, + "warning": 100 + } + } +} diff --git a/openspec/changes/dns-keepalived-analysis/.openspec.yaml b/openspec/changes/dns-keepalived-analysis/.openspec.yaml new file mode 100644 index 0000000..910badd --- /dev/null +++ b/openspec/changes/dns-keepalived-analysis/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-02-21 diff --git a/openspec/changes/dns-keepalived-analysis/design.md b/openspec/changes/dns-keepalived-analysis/design.md new file mode 100644 index 0000000..533ebe1 --- /dev/null +++ b/openspec/changes/dns-keepalived-analysis/design.md @@ -0,0 +1,187 @@ +## Context + +### Current DNS Infrastructure + +**Wisconsin (marsAlpha, marsBeta)** +- Network: 192.168.20.0/24 +- IPs: 192.168.20.250, 192.168.20.251 + +**New York (terraOmega, terraPhi)** +- Network: 192.168.100.0/24 +- IPs: 192.168.100.112, 192.168.100.110 + +**Miyagi (alphaCenA, alphaCenB)** +- Network: 192.168.3.0/24 +- IPs: 192.168.3.100, 192.168.3.101 + +Each server runs: +- Pi-hole container (DNS sinkhole on port 53) +- Unbound (upstream resolver on port 5335) + +**Current client DNS configuration:** +- Clients point to individual DNS servers or round-robin via DHCP +- No automatic failover - if server down, clients fail until manual intervention + +### Constraints +- Want to minimize changes +- Brief downtime acceptable when I'm at keyboard +- Prefer open-source, simple solutions +- Current monitoring and restart playbook should continue to work + +## Goals / Non-Goals + +**Goals:** +- Understand network changes required for DNS HA +- Document failover behavior and timing +- Identify common pitfalls and how to avoid them +- Create implementation roadmap + +**Non-Goals:** +- Do NOT implement HA (this is analysis only) +- Do NOT change client configurations unless necessary +- Do NOT modify existing monitoring/alerting + +## Decisions + +### D1: HA Approach Options + +**Option A: Keepalived/VRRP (Recommended for HA)** + +| Aspect | Details | +|--------|---------| +| How it works | Virtual IP (VIP) floats between servers using VRRP protocol | +| Failover | Automatic, typically 3-5 seconds | +| Network changes | Need to allow VRRP multicast, may need switch config | +| Complexity | Medium - keepalived daemon on each DNS server | +| Pros | True HA, automatic failover, widely used | +| Cons | Requires network coordination, ARP cache issues | + +**Option B: HSRP (Cisco proprietary)** + +| Aspect | Details | +|--------|---------| +| How it works | Cisco's proprietary failover protocol | +| Failover | Automatic | +| Network changes | Requires Cisco equipment | +| Complexity | Medium | +| Pros | Integrated with Cisco gear | +| Cons | Proprietary, not applicable to home network | + +**Option C: DNS Round-Robin (Simpler, no HA)** + +| Aspect | Details | +|--------|---------| +| How it works | Multiple A records for same hostname | +| Failover | None - clients get all IPs, try sequentially | +| Network changes | None | +| Complexity | Low | +| Pros | Simple, no daemon, no network changes | +| Cons | No automatic failover, clients may cache failed IP | + +**Option D: Client-side failover (Alternative)** + +| Aspect | Details | +|--------|---------| +| How it works | Configure multiple DNS servers in DHCP/client | +| Failover | Client-dependent (some OSes retry automatically) | +| Network changes | None | +| Complexity | Low | +| Pros | No infrastructure changes | +| Cons | Inconsistent behavior across clients | + +### D2: Recommended Approach + +**Recommendation: Option A (Keepalived) per region** + +- Each region (Wisconsin, New York, Miyagi) gets its own VIP +- Pair servers within each region (e.g., marsAlpha ↔ marsBeta) +- VIP becomes the DNS address for that region +- If one server fails, VIP moves to other within seconds + +### D3: Network Architecture + +**VIP Address Plan (per region):** +- Reserve .2 for DNS VIP in each subnet +- DHCP range: .11 - .254 (leaving .2 for VIP, .10 for infrastructure) + +``` +Wisconsin (192.168.20.0/24): + marsAlpha (192.168.20.250) ←→ marsBeta (192.168.20.251) + VIP: 192.168.20.2 + DHCP: 192.168.20.11 - 192.168.20.254 + +New York (192.168.100.0/24): + terraOmega (192.168.100.112) ←→ terraPhi (192.168.100.110) + VIP: 192.168.100.2 + DHCP: 192.168.100.11 - 192.168.100.254 + +Miyagi (192.168.3.0/24): + alphaCenA (192.168.3.100) ←→ alphaCenB (192.168.3.101) + VIP: 192.168.3.2 + DHCP: 192.168.3.11 - 192.168.3.254 +``` + +Clients would use regional VIP (.2) as their DNS server. + +## Risks / Trade-offs + +### Network Risks + +- **[Risk]** VRRP multicast blocked by switch + - **Mitigation**: Check switch allows VRRP (multicast 224.0.0.18), or configure unicast + +- **[Risk]** ARP cache on switches/clients points to wrong server after failover + - **Mitigation**: Set lower `advertisement_intvl` and use `garp_master_delay` + +- **[Risk]** Both servers claim VIP (split-brain) + - **Mitigation**: Use `priority` to designate master, `preempt` to ensure master always holds VIP + +- **[Risk]** Network segmentation blocks VRRP packets + - **Mitigation**: Ensure firewall allows VRRP (protocol 112) between DNS servers + +### Operational Risks + +- **[Risk]** Keepalived misconfiguration causes failover storms + - **Mitigation**: Test failover in maintenance window, set appropriate timeouts + +- **[Risk]** Failover doesn't trigger DNS service restart + - **Mitigation**: Monitor both master and backup, verify DNS service on both + +- **[Risk]** Upstream Unbound not configured for VIP + - **Mitigation**: Unbound listens on all interfaces by default, verify + +### Performance Trade-offs + +- **Latency**: Minor increase during failover (seconds) +- **Complexity**: Additional daemon to monitor and maintain +- **Monitoring**: Need to monitor keepalived state, not just DNS + +## Implementation Roadmap + +### Phase 1: Network Preparation (Research) +- [ ] Verify network equipment allows VRRP +- [ ] Choose VIP addresses for each region +- [ ] Test VRRP between pairs + +### Phase 2: Keepalived Configuration +- [ ] Install keepalived on each DNS server +- [ ] Configure master/backup pairs per region +- [ ] Add firewall rules for VRRP + +### Phase 3: DNS Service Integration +- [ ] Ensure pihole/unbound binds to VIP +- [ ] Update monitoring to track keepalived state +- [ ] Update restart playbook for HA (restart VIP holder) + +### Phase 4: Client Migration +- [ ] Update DHCP to serve regional VIPs +- [ ] Test failover manually +- [ ] Monitor for issues + +## Open Questions + +1. ~~VIP addresses~~ - **RESOLVED**: Using .2 for each regional subnet +2. ~~Network equipment~~ - **RESOLVED**: Ubiquity and commodity switches between servers - need VRRP compatibility check +3. ~~Client migration~~ - **RESOLVED**: Migrate clients gradually +4. ~~Cross-region failover~~ - **RESOLVED**: No - too complex +5. ~~Monitoring~~ - **RESOLVED**: Yes - alerts for keepalived state changes diff --git a/openspec/changes/dns-keepalived-analysis/proposal.md b/openspec/changes/dns-keepalived-analysis/proposal.md new file mode 100644 index 0000000..6899215 --- /dev/null +++ b/openspec/changes/dns-keepalived-analysis/proposal.md @@ -0,0 +1,30 @@ +## Why + +The current DNS infrastructure (Pi-hole + Unbound) has no high availability. If a DNS server fails, clients lose DNS resolution until manual intervention. This is a single point of failure that impacts all services. We need to understand the architectural changes required to add DNS HA using keepalived/HSRP and/or round-robin DNS. + +## What Changes + +This is a **planning/analysis** change to document the requirements, risks, and architectural impact of adding DNS high availability. + +- Document current DNS infrastructure and dependencies +- Analyze keepalived/VRRP/HSRP options for DNS failover +- Analyze DNS round-robin load balancing as an alternative +- Document network changes required (VIP, ARP, etc.) +- Identify common pitfalls and failure modes +- Create implementation roadmap + +## Capabilities + +### New Capabilities +- `dns-ha-analysis`: Analysis document covering keepalived/HSRP/round-robin options, network requirements, failover behavior, and common pitfalls for DNS HA deployment. + +### Modified Capabilities +- (None - this is a new analysis) + +## Impact + +- **Network**: May require VIP (Virtual IP) configuration, network switch changes for ARP +- **DNS Servers**: Keepalived daemon, additional network config +- **Clients**: May need updated DNS client configurations to use VIP +- **Monitoring**: Update monitoring to track HA state +- **Existing Playbooks**: Restart playbook should continue to work diff --git a/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md b/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md new file mode 100644 index 0000000..9a7c409 --- /dev/null +++ b/openspec/changes/dns-keepalived-analysis/specs/dns-ha-analysis/spec.md @@ -0,0 +1,83 @@ +## ADDED Requirements + +### Requirement: Network compatibility analysis +The analysis SHALL document whether the existing network infrastructure supports VRRP/keepalived. + +#### Scenario: VRRP multicast allowed +- **WHEN** network switches allow VRRP multicast (224.0.0.18) +- **THEN** keepalived can operate in full multicast mode + +#### Scenario: VRRP multicast blocked +- **WHEN** network switches block VRRP multicast +- **THEN** keepalived can be configured for unicast mode + +#### Scenario: VRRP not supported +- **WHEN** network equipment does not support VRRP +- **THEN** alternative HA solutions (round-robin DNS or client-side) must be used + +### Requirement: Failover behavior specification +The analysis SHALL document expected failover timing and behavior. + +#### Scenario: Primary server fails +- **WHEN** the primary DNS server becomes unreachable +- **THEN** the VIP should migrate to backup within 5 seconds + +#### Scenario: Primary recovers +- **WHEN** the primary DNS server becomes available again +- **THEN** the VIP should migrate back to primary based on preemption policy + +#### Scenario: Both servers fail +- **WHEN** all DNS servers in a region are down +- **THEN** clients should timeout and move to secondary DNS (if configured) + +### Requirement: Network address planning +The analysis SHALL specify VIP addresses for each region. + +#### Scenario: Wisconsin VIP +- **WHEN** planning Wisconsin DNS HA +- **THEN** use 192.168.20.2 as VIP, reserve in DHCP (range: .11-.254) + +#### Scenario: New York VIP +- **WHEN** planning New York DNS HA +- **THEN** use 192.168.100.2 as VIP, reserve in DHCP (range: .11-.254) + +#### Scenario: Miyagi VIP +- **WHEN** planning Miyagi DNS HA +- **THEN** use 192.168.3.2 as VIP, reserve in DHCP (range: .11-.254) + +### Requirement: Common pitfalls documentation +The analysis SHALL document known pitfalls and mitigations. + +#### Scenario: ARP cache stale +- **WHEN** failover occurs +- **THEN** client ARP caches may point to wrong MAC for VIP +- **MITIGATION**: Use lower advertisement interval and garp_master_delay + +#### Scenario: Split-brain +- **WHEN** both servers claim VIP +- **THEN** DNS responses are inconsistent +- **MITIGATION**: Use priority and preempt settings correctly + +#### Scenario: Firewall blocks VRRP +- **WHEN** firewalls between servers block VRRP protocol +- **THEN** failover cannot occur +- **MITIGATION**: Allow protocol 112 and multicast + +### Requirement: Implementation roadmap +The analysis SHALL provide a phased implementation plan. + +#### Scenario: Phase 1 - Network prep +- **WHEN** beginning HA implementation +- **THEN** verify network equipment (Ubiquity + commodity switches) allows VRRP + +#### Scenario: Phase 2 - Keepalived config +- **WHEN** network is ready +- **THEN** install and configure keepalived on each server pair + +#### Scenario: Phase 3 - Integration +- **WHEN** keepalived is running +- **THEN** integrate with DNS service and monitoring + +#### Scenario: Phase 4 - Client migration +- **WHEN** HA is tested +- **THEN** update DHCP to serve VIPs to clients gradually diff --git a/openspec/changes/dns-keepalived-analysis/tasks.md b/openspec/changes/dns-keepalived-analysis/tasks.md new file mode 100644 index 0000000..26f0599 --- /dev/null +++ b/openspec/changes/dns-keepalived-analysis/tasks.md @@ -0,0 +1,34 @@ +## 1. Network Research + +- [x] 1.1 Identify network equipment (switches/routers) between DNS servers +- [ ] 1.2 Verify Ubiquity switch allows VRRP multicast (224.0.0.18) +- [ ] 1.3 Verify commodity switches allow VRRP +- [ ] 1.4 Check firewall rules between DNS server pairs + +## 2. Address Planning + +- [x] 2.1 Choose VIP for Wisconsin - 192.168.20.2 +- [x] 2.2 Choose VIP for New York - 192.168.100.2 +- [x] 2.3 Choose VIP for Miyagi - 192.168.3.2 +- [ ] 2.4 Verify VIPs are unused in each subnet + +## 3. Implementation Design + +- [ ] 3.1 Design keepalived configuration for each pair +- [ ] 3.2 Plan firewall rules for VRRP protocol (protocol 112) +- [ ] 3.3 Design monitoring for keepalived state +- [ ] 3.4 Plan DHCP changes (reserve .2 for VIP, shift range to .11-.254) + +## 4. Risk Analysis + +- [x] 4.1 Document ARP cache behavior after failover +- [x] 4.2 Document split-brain prevention +- [x] 4.3 Document failover timing expectations +- [ ] 4.4 Plan testing strategy + +## 5. Decision Documentation + +- [x] 5.1 Decide on HA approach - keepalived/VRRP +- [ ] 5.2 Decide on preemption policy (immediate vs delayed) +- [x] 5.3 Decide on cross-region fallback - NO +- [x] 5.4 Document final recommendations diff --git a/openspec/changes/dns-server-monitoring-tool/.openspec.yaml b/openspec/changes/dns-server-monitoring-tool/.openspec.yaml new file mode 100644 index 0000000..910badd --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-02-21 diff --git a/openspec/changes/dns-server-monitoring-tool/design.md b/openspec/changes/dns-server-monitoring-tool/design.md new file mode 100644 index 0000000..524b303 --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/design.md @@ -0,0 +1,97 @@ +## Context + +The home DNS infrastructure consists of three Pi-hole + Unbound servers: +- `all` (primary) +- `new york` +- `miyami` (note: user said "miyagi" initially but I'll use what they clarified) + +Currently: +- Containers run on Podman but the healthcheck is not working +- No centralized monitoring beyond basic container health +- No automated way to restart or diagnose DNS issues +- Alerts go to nowhere meaningful + +Constraints: +- Keep changes minimal - we're breaking the house is acceptable but we want to minimize risk +- Use existing tools where possible (Datadog already in use for alerts → Pagerduty) +- Playbooks live in ~/dev/src/mkrasberry + +## Goals / Non-Goals + +**Goals:** +- Implement OpenTelemetry-compatible DNS health metrics per server +- Configure Datadog to scrape metrics and trigger Pagerduty alerts +- Create a restart playbook that safely restarts DNS services with diagnostics +- Fix the Podman healthcheck issue + +**Non-Goals:** +- Load balancing or HA setup (that's the separate `dns-keepalived-analysis` change) +- Modifying DNS server configurations or zone data +- Long-term metric storage solutions beyond Datadog +- Auto-remediation beyond the restart playbook + +## Decisions + +### D1: Metrics Collection Method +**Decision:** Use Pi-hole's built-in Teleporter API + a sidecar Prometheus exporter + +**Alternatives considered:** +- Direct Prometheus scraping of Pi-hole's internal metrics (limited) +- Custom Python script parsing logs (fragile) +- OpenTelemetry collector on each DNS server (additional resource overhead) + +**Rationale:** Pi-hole exposes query logs and statistics via its API. A lightweight exporter can poll this and expose OpenTelemetry-compatible metrics. This reuses existing Pi-hole functionality and minimizes install footprint. + +### D2: Alerting Pipeline +**Decision:** Prometheus scraper → Datadog Agent → Datadog Monitor → Pagerduty + +**Alternatives considered:** +- Prometheus Alertmanager → Pagerduty (adds complexity) +- Push directly to Datadog API (loses Datadog monitor features) +- Custom webhook → Pagerduty (reinventing) + +**Rationale:** Datadog is already in the stack. Datadog Agents can be installed on the DNS servers to scrape the metrics endpoint and forward to Datadog. This leverages existing infrastructure. + +### D3: Restart Playbook Approach +**Decision:** Ansible playbook with SSH key authentication + +**Alternatives considered:** +- Fabric/Paramiko script (less idempotent) +- Direct SSH commands (not auditable, harder to maintain) +- systemd timers on the DNS servers (tightly coupled) + +**Rationale:** Ansible provides idempotency, clear execution flow, and is already used in ~/dev/src/mkrasberry. The playbook will: +1. SSH to target server +2. `sudo -i` to become root +3. `su - pihole -c "pihole restartdns"` to restart as the pihole user +4. Collect `pihole -d` diagnostic output if restart fails or service appears unhealthy + +### D4: Healthcheck Fix Approach +**Decision:** Investigate Podman healthcheck exec vs httpGet, implement whichever works + +**Alternatives considered:** +- Switch to Docker Compose (not using Docker) +- Remove healthcheck (loses container restart automation) +- Use a wrapper script (adds complexity) + +**Rationale:** Podman healthchecks have known quirks with certain exec commands. Will test both `exec` (running a check inside container) and `httpGet` (hitting Pi-hole's web interface) to find what works. + +## Risks / Trade-offs + +- **[Risk]** Datadog Agent on DNS servers adds resource overhead + - **Mitigation:** Use minimal metric collection, Agent can be configured to scrape only what's needed + +- **[Risk]** SSH-based restart requires key management + - **Mitigation:** Use existing SSH keys from ~/dev/src/mkrasberry, limit to specific users/commands via sudo + +- **[Risk]** Healthcheck fix may require container recreation + - **Mitigation:** Document the change, ensure backup of current container state before testing + +- **[Risk]** Alert storms if DNS flaps + - **Mitigation:** Set reasonable hysteresis (e.g., 5 minute failure threshold before alerting) + +## Open Questions + +- Should the restart playbook include automatic execution or remain manual-only? +- What's the desired alert escalation timeline (how many failures before Pagerduty triggers)? +- Are there specific metrics beyond "is DNS working" that should trigger alerts (e.g., query volume drop, high latency)? diff --git a/openspec/changes/dns-server-monitoring-tool/proposal.md b/openspec/changes/dns-server-monitoring-tool/proposal.md new file mode 100644 index 0000000..c28d806 --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/proposal.md @@ -0,0 +1,28 @@ +## Why + +The home DNS infrastructure (Pi-hole + Unbound on multiple servers: all, new york, miyami) is experiencing flaky behavior. Current monitoring is insufficient - the container has a healthcheck that isn't working with Podman, and there's no standardized way to restart or diagnose issues. We need a monitoring and operational toolkit to reduce noise and enable reliable DNS operations. + +## What Changes + +- Create a DNS monitoring tool with OpenTelemetry-compatible metrics collection per server +- Integrate health alerts with Datadog which triggers Pagerduty +- Build a restart playbook that SSHs to DNS servers, escalates privileges, switches to pihole user, and restarts the service while collecting diagnostic information +- Investigate and fix the Podman healthcheck compatibility issue for the DNS container + +## Capabilities + +### New Capabilities +- `dns-health-monitoring`: OpenTelemetry-compatible DNS health metrics collection at individual server level (all, new york, miyami). Exposes metrics for Datadog agent to scrape and forward to Datadog. +- `dns-alerting`: Datadog alert configuration that monitors DNS health metrics and triggers Pagerduty notifications when thresholds are breached. +- `dns-restart-playbook`: Ansible-style playbook (in ~/dev/src/mkrasberry) to SSH to DNS servers, restart the pihole service as the pihole user, and collect diagnostic output if the service is down. +- `pihole-healthcheck-fix`: Investigation and fix for the Podman healthcheck that is currently not functioning with the DNS container. + +### Modified Capabilities +- (None - this is a new capability set) + +## Impact + +- New code in ~/dev/src/mkrasberry (playbook, monitoring config) +- Datadog monitoring configuration changes +- Pagerduty alert routing configuration +- No changes to production DNS infrastructure - operational tooling only diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md new file mode 100644 index 0000000..6e2445b --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-alerting/spec.md @@ -0,0 +1,30 @@ +## ADDED Requirements + +### Requirement: Datadog monitors DNS health +The system SHALL configure Datadog monitors to alert when DNS health metrics indicate failure. + +#### Scenario: Server down alert +- **WHEN** the `dns_up` metric equals 0 for 5 consecutive minutes +- **THEN** Datadog triggers a WARNING alert + +#### Scenario: Server prolonged outage +- **WHEN** the `dns_up` metric equals 0 for 15 consecutive minutes +- **THEN** Datadog triggers a CRITICAL alert and notifies Pagerduty + +### Requirement: Pagerduty notification +The system SHALL route critical DNS alerts to Pagerduty for escalation. + +#### Scenario: Critical alert triggers +- **WHEN** a CRITICAL Datadog monitor fires for DNS health +- **THEN** Pagerduty receives the alert and creates an incident + +#### Scenario: Alert recovers +- **WHEN** the DNS server recovers (`dns_up` returns to 1) +- **THEN** Datadog sends a recovery notification to Pagerduty, resolving the incident + +### Requirement: Alert configuration managed as code +The system SHALL store Datadog monitor definitions in version control. + +#### Scenario: Monitor definition in repo +- **WHEN** a team member reviews monitor configuration +- **THEN** they can view the JSON/YAML definition in the repository under monitoring/datadog/ diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md new file mode 100644 index 0000000..998de70 --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-health-monitoring/spec.md @@ -0,0 +1,38 @@ +## ADDED Requirements + +### Requirement: DNS health metrics collection +The system SHALL collect DNS health metrics from each Pi-hole server (all, new york, miyami) using OpenTelemetry-compatible format. + +#### Scenario: Metrics endpoint accessible +- **WHEN** Datadog Agent scrapes the metrics endpoint on a DNS server +- **THEN** the response includes `dns_query_total`, `dns_up`, `dns_response_time_ms`, and `dns_upstream_response_time_ms` metrics with server-specific labels + +#### Scenario: Metrics include server identity +- **WHEN** metrics are collected from any DNS server +- **THEN** each metric includes a `server` label with value matching the server identifier (all, new_york, or miyami) + +#### Scenario: Metrics collected at regular intervals +- **WHEN** Datadog Agent is configured to scrape metrics +- **THEN** collection occurs at 60-second intervals by default + +### Requirement: Individual server health status +The system SHALL expose a binary `dns_up` metric indicating whether each DNS server is responding to queries. + +#### Scenario: Server healthy +- **WHEN** the DNS server is responding to queries +- **THEN** the `dns_up` metric value is 1 + +#### Scenario: Server unhealthy +- **WHEN** the DNS server is not responding to queries +- **THEN** the `dns_up` metric value is 0 + +### Requirement: Query statistics +The system SHALL expose metrics about DNS query volume and upstream response times. + +#### Scenario: Query count tracked +- **WHEN** DNS queries are processed +- **THEN** the `dns_query_total` counter increments with query type and status labels + +#### Scenario: Upstream latency measured +- **WHEN** Unbound processes upstream requests +- **THEN** the `dns_upstream_response_time_ms` histogram records response times in milliseconds diff --git a/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md new file mode 100644 index 0000000..3d9df2c --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/specs/dns-restart-playbook/spec.md @@ -0,0 +1,52 @@ +## ADDED Requirements + +### Requirement: SSH connectivity to DNS servers +The playbook SHALL establish SSH connections to each DNS server using configured credentials. + +#### Scenario: SSH connection successful +- **WHEN** the playbook runs against a DNS server +- **THEN** an SSH connection is established using key-based authentication + +#### Scenario: SSH connection fails +- **WHEN** SSH connection to a DNS server fails +- **THEN** the playbook aborts with a clear error message indicating which server failed + +### Requirement: Privilege escalation to root +The playbook SHALL escalate to root privileges after SSH connection. + +#### Scenario: Escalation successful +- **WHEN** sudo or su is executed +- **THEN** the shell runs as root user + +#### Scenario: Escalation denied +- **WHEN** privilege escalation fails +- **THEN** the playbook aborts with an error and does not proceed + +### Requirement: Restart pihole service as pihole user +The playbook SHALL restart the Pi-hole DNS service while running as the pihole user. + +#### Scenario: Restart executed +- **WHEN** the playbook runs the restart command as pihole user +- **THEN** the command `pihole restartdns` executes successfully + +#### Scenario: Restart fails +- **WHEN** the restart command returns a non-zero exit code +- **THEN** the playbook captures diagnostic output and reports failure + +### Requirement: Diagnostic collection on failure +The playbook SHALL collect diagnostic information when the DNS service fails to restart or appears unhealthy. + +#### Scenario: Diagnostics collected +- **WHEN** restart fails or healthcheck indicates issues +- **THEN** the playbook runs `pihole -d` and captures output to a log file + +#### Scenario: Diagnostics saved +- **WHEN** diagnostics are collected +- **THEN** output is saved to a timestamped file in the playbook logs directory + +### Requirement: Playbook idempotency +The playbook SHALL be idempotent - running it multiple times produces the same result. + +#### Scenario: Multiple runs +- **WHEN** the playbook is run multiple times on a healthy server +- **THEN** the server remains healthy and no errors are reported diff --git a/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md b/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md new file mode 100644 index 0000000..a792265 --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/specs/pihole-healthcheck-fix/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: Podman healthcheck functional +The Podman container running Pi-hole SHALL have a working healthcheck that correctly reports container health. + +#### Scenario: Healthcheck passes +- **WHEN** the DNS container is healthy +- **THEN** `podman inspect` shows `Health` status as "healthy" + +#### Scenario: Healthcheck fails +- **WHEN** the DNS container is unhealthy +- **THEN** `podman inspect` shows `Health` status as "unhealthy" and triggers any configured container restart policy + +### Requirement: Healthcheck uses appropriate check method +The healthcheck SHALL use either exec or httpGet method that works reliably with Podman. + +#### Scenario: Exec method works +- **WHEN** healthcheck uses exec to run a command inside the container +- **THEN** the command executes successfully and returns exit code 0 + +#### Scenario: HTTP method works +- **WHEN** healthcheck uses httpGet to ping Pi-hole web interface +- **THEN** the HTTP request returns 200 OK within the timeout period + +### Requirement: Healthcheck checks DNS functionality +The healthcheck SHALL verify that DNS resolution is actually working, not just that the container is running. + +#### Scenario: DNS resolution verified +- **WHEN** healthcheck runs +- **THEN** it performs an actual DNS query (e.g., `dig +short pi.hole @127.0.0.1`) and verifies a response + +#### Scenario: DNS resolution fails +- **WHEN** DNS resolution fails +- **THEN** the healthcheck returns a non-zero exit code, marking the container unhealthy diff --git a/openspec/changes/dns-server-monitoring-tool/tasks.md b/openspec/changes/dns-server-monitoring-tool/tasks.md new file mode 100644 index 0000000..25d823d --- /dev/null +++ b/openspec/changes/dns-server-monitoring-tool/tasks.md @@ -0,0 +1,42 @@ +## 1. DNS Health Metrics Collection + +- [x] 1.1 Create metrics exporter script in ~/dev/src/mkrasberry/dns-monitoring/ +- [x] 1.2 Implement DNS query count collection from Pi-hole API +- [x] 1.3 Implement dns_up binary metric (health check) +- [x] 1.4 Implement dns_response_time_ms metric +- [x] 1.5 Implement dns_upstream_response_time_ms histogram +- [x] 1.6 Configure server-specific labels (all, new_york, miyami) +- [x] 1.7 Expose OpenTelemetry-compatible /metrics endpoint +- [x] 1.8 Test metrics collection on each DNS server + +## 2. Datadog Integration + +- [x] 2.1 Configure Telegraf to poll Pi-hole API (using inventory) +- [x] 2.2 Configure Telegraf to run DNS latency checks (internal + external) +- [x] 2.3 Configure Unbound access-control for monitoring host +- [x] 2.4 Create Datadog monitor for dns_up (WARNING: 5min) +- [x] 2.5 Create Datadog monitor for dns_up (CRITICAL: 15min) +- [ ] 2.6 Configure Pagerduty integration for critical alerts +- [ ] 2.7 Test alert firing and Pagerduty notification + +## 3. Restart Playbook + +- [x] 3.1 Create Ansible playbook structure in ~/dev/src/mkrasberry/ +- [x] 3.2 Add DNS server inventory (all, new_york, miyami) +- [x] 3.3 Implement SSH connection task +- [x] 3.4 Implement privilege escalation (sudo/su) +- [x] 3.5 Implement pihole restart as pihole user +- [x] 3.6 Add diagnostic collection on failure (pihole -d) +- [x] 3.7 Add log output saving with timestamps +- [x] 3.8 Test playbook against a single server (syntax verified, no running pihole to test) +- [x] 3.9 Document playbook usage + +## 4. Podman Healthcheck Fix + +- [x] 4.1 Investigate current healthcheck configuration +- [x] 4.2 Test exec-based healthcheck inside container +- [x] 4.3 Test httpGet healthcheck against Pi-hole web interface +- [x] 4.4 Implement working healthcheck (choose exec or httpGet) +- [x] 4.5 Verify healthcheck returns correct status for DNS functionality +- [x] 4.6 Apply healthcheck fix to container (may require recreation) +- [x] 4.7 Verify container auto-restart on failure works (requires pihole container running) diff --git a/openspec/changes/podman-overlayfs.md b/openspec/changes/podman-overlayfs.md new file mode 100644 index 0000000..91ae8f0 --- /dev/null +++ b/openspec/changes/podman-overlayfs.md @@ -0,0 +1,17 @@ +# Podman OverlayFS Configuration + +## Goal +Configure Podman to use `overlay` as the default storage driver across the fleet. This ensures consistent performance and behavior for container storage. + +## Proposed Changes +- Ensure `fuse-overlayfs` package is installed (via `roles/podmanSetupRaspberryPi/tasks/install.yml` and `playbooks/initial-playbook-stage-5.yml`). +- Modify `roles/podmanSetupRaspberryPi/tasks/configure.yml` and `playbooks/initial-playbook-stage-5.yml` to: + - Ensure `/etc/containers/storage.conf` exists. + - Explicitly set `driver = "overlay"` in the `[storage]` section of `/etc/containers/storage.conf` using `ini_file` module. + - Ensure `graphroot` is set to `/var/lib/containers/storage`. + +## Verification +- Run the role on a host. +- Check `/etc/containers/storage.conf`: + - `[storage]` section should have `driver = "overlay"`. +- Run `podman info` to verify the storage driver is active. diff --git a/openspec/config.yaml b/openspec/config.yaml new file mode 100644 index 0000000..392946c --- /dev/null +++ b/openspec/config.yaml @@ -0,0 +1,20 @@ +schema: spec-driven + +# Project context (optional) +# This is shown to AI when creating artifacts. +# Add your tech stack, conventions, style guides, domain knowledge, etc. +# Example: +# context: | +# Tech stack: TypeScript, React, Node.js +# We use conventional commits +# Domain: e-commerce platform + +# Per-artifact rules (optional) +# Add custom rules for specific artifacts. +# Example: +# rules: +# proposal: +# - Keep proposals under 500 words +# - Always include a "Non-goals" section +# tasks: +# - Break tasks into chunks of max 2 hours diff --git a/playbooks/dns-restart.yml b/playbooks/dns-restart.yml new file mode 100644 index 0000000..d063e7c --- /dev/null +++ b/playbooks/dns-restart.yml @@ -0,0 +1,68 @@ +--- +# DNS Restart Playbook +# Restarts pihole container on target hosts and collects diagnostics on failure +# +# Usage: +# # Restart all DNS servers +# ansible-playbook -i mkrasberry_config/hosts -e hostlist=pihole playbooks/dns-restart.yml +# +# # Restart specific region +# ansible-playbook -i mkrasberry_config/hosts -e hostlist=wisconsin_linux playbooks/dns-restart.yml +# ansible-playbook -i mkrasberry_config/hosts -e hostlist=newyork_linux playbooks/dns-restart.yml +# ansible-playbook -i mkrasberry_config/hosts -e hostlist=miyagi_linux playbooks/dns-restart.yml +# +# # Restart single server +# ansible-playbook -i mkrasberry_config/hosts -e hostlist=marsAlpha playbooks/dns-restart.yml +# +# Diagnostics are saved to logs/ directory on failure + +- hosts: "{{ hostlist | default('pihole') }}" + become: true + vars: + pihole_user: "pihole" + tasks: + - name: Check pihole container status + command: sudo -u {{ pihole_user }} podman ps --filter name=pihole + register: pihole_status + changed_when: false + + - name: Restart pihole service + command: sudo -u {{ pihole_user }} sh -c "podman stop pihole && podman start pihole" + register: restart_result + failed_when: restart_result.rc != 0 + + - name: Restart unbound service + systemd: + name: unbound + state: restarted + become: true + + - name: Verify restart succeeded + fail: + msg: "Pihole restart failed with rc={{ restart_result.rc }}" + when: restart_result.rc not in [0, 3] + + - name: Display restart result + debug: + msg: "Pihole restarted successfully on {{ inventory_hostname }}" + + - name: Collect summary diagnostics on failure + command: sudo -u {{ pihole_user }} podman exec pihole pihole -d -a + register: pihole_diag + failed_when: false + when: restart_result is failed + + - name: Save diagnostics to remote temp file + copy: + content: "{{ pihole_diag.stdout }}" + dest: "/tmp/pihole-diag-{{ ansible_date_time.epoch }}.log" + when: pihole_diag is defined and pihole_diag.stdout is defined + changed_when: true + + - name: Fetch diagnostics to local logs directory + fetch: + src: "/tmp/pihole-diag-{{ ansible_date_time.epoch }}.log" + dest: "logs/pihole-diag-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.log" + flat: yes + when: pihole_diag is defined and pihole_diag.stdout is defined + failed_when: false diff --git a/playbooks/initial-playbook-stage-5.yml b/playbooks/initial-playbook-stage-5.yml index 98a0aeb..729a436 100644 --- a/playbooks/initial-playbook-stage-5.yml +++ b/playbooks/initial-playbook-stage-5.yml @@ -9,6 +9,7 @@ - tcpdump - jq - podman + - fuse-overlayfs state: present force_apt_get: yes cache_valid_time: 84600 @@ -43,6 +44,24 @@ state: present insertafter: EOF line: if [ $(id -u) != 0 ]; then export XDG_RUNTIME_DIR=/var/run/user/$(id -u)/; fi + + - name: Ensure Podman storage driver is overlay + ini_file: + path: /etc/containers/storage.conf + section: storage + option: driver + value: '"overlay"' + mode: '0644' + create: yes + + - name: Ensure Podman graphroot is set + ini_file: + path: /etc/containers/storage.conf + section: storage + option: graphroot + value: '"/var/lib/containers/storage"' + mode: '0644' + create: yes # git config --global alias.co checkout # git config --global alias.br branch # git config --global alias.ci commit diff --git a/roles/pihole.container b/roles/pihole.container index 162604c..2a32007 160000 --- a/roles/pihole.container +++ b/roles/pihole.container @@ -1 +1 @@ -Subproject commit 162604cb955e7f8e18b4be37da67ac6bdf8cca9a +Subproject commit 2a320077f7d59d06362dd047083f572c70a8e4a4 diff --git a/roles/podmanSetupRaspberryPi b/roles/podmanSetupRaspberryPi index cf84a87..077f312 160000 --- a/roles/podmanSetupRaspberryPi +++ b/roles/podmanSetupRaspberryPi @@ -1 +1 @@ -Subproject commit cf84a87c0d561e25387816299aa419cc3327d0b5 +Subproject commit 077f3124fb617fff14a3fb62758d4632f6f0b143 diff --git a/roles/telegraf-container-service/defaults/main.yml b/roles/telegraf-container-service/defaults/main.yml index 13be010..aa902c8 100644 --- a/roles/telegraf-container-service/defaults/main.yml +++ b/roles/telegraf-container-service/defaults/main.yml @@ -2,3 +2,9 @@ remote_config_url: "https://europe-west1-1.gcp.cloud2.influxdata.com/api/v2/telegrafs/5d0d0391a43a9023" influx_db_url: "https://europe-west1-1.gcp.cloud2.influxdata.com" influx_cloud_token: "{{influx_cloud_token}}" + +telegraf_dns_monitoring_enabled: true +telegraf_dns_external_domains: + - google.com + - cloudflare.com + - github.com diff --git a/roles/telegraf-container-service/tasks/config.yml b/roles/telegraf-container-service/tasks/config.yml index 52aee6b..6385c04 100644 --- a/roles/telegraf-container-service/tasks/config.yml +++ b/roles/telegraf-container-service/tasks/config.yml @@ -25,3 +25,12 @@ owner: "{{container_user}}" group: "{{container_group}}" when: "telegraf_metrics_iot is defined" + +- name: copy dns monitoring config into place + template: + src: templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2 + dest: "{{ container_prefix }}/etc/telegraf.d/dns-monitoring.conf" + mode: 0644 + owner: "{{container_user}}" + group: "{{container_group}}" + when: "telegraf_dns_monitoring_enabled | default(true)" diff --git a/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2 b/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2 new file mode 100644 index 0000000..31b6059 --- /dev/null +++ b/roles/telegraf-container-service/templates/etc/telegraf/telegraf.d/dns-monitoring.conf.j2 @@ -0,0 +1,56 @@ +# DNS Monitoring - Pi-hole API +# Queries Pi-hole API from all pihole servers in inventory +{% for host in groups['pihole'] %} +[[inputs.http]] + urls = ["http://{{ hostvars[host].ansible_host }}/admin/api.php?summaryRaw"] + name_override = "pihole" + interval = "60s" + timeout = "10s" + [inputs.http.tags] + server = "{{ host }}.{{ dns_subdomain_base }}" + +{% endfor %} + +# DNS Latency - Internal +# Query each Unbound from each pihole server in inventory +# Tests that internal DNS resolution works across the fleet +{% for host in groups['pihole'] %} +[[inputs.exec]] + commands = ["dig +short {{ host }}.{{ dns_subdomain_base }} @{{ hostvars[host].ansible_host }}#5335 +time=3 +tries=2"] + name_override = "dns_internal_latency" + interval = "60s" + data_format = "value" + data_type = "integer" + [inputs.exec.tags] + server = "{{ host }}.{{ dns_subdomain_base }}" + target = "{{ host }}.{{ dns_subdomain_base }}" + +{% endfor %} + +# DNS Latency - External +# Query public DNS to verify internet connectivity +# Tests that upstream DNS resolution works +{% for domain in telegraf_dns_external_domains | default(['google.com', 'cloudflare.com', 'github.com']) %} +[[inputs.exec]] + commands = ["dig +short {{ domain }} @8.8.8.8 +time=3 +tries=2"] + name_override = "dns_external_latency" + interval = "60s" + data_format = "value" + data_type = "integer" + [inputs.exec.tags] + server = "{{ inventory_hostname }}.{{ dns_subdomain_base }}" + upstream = "google-dns" + domain = "{{ domain }}" + +[[inputs.exec]] + commands = ["dig +short {{ domain }} @1.1.1.1 +time=3 +tries=2"] + name_override = "dns_external_latency" + interval = "60s" + data_format = "value" + data_type = "integer" + [inputs.exec.tags] + server = "{{ inventory_hostname }}.{{ dns_subdomain_base }}" + upstream = "cloudflare-dns" + domain = "{{ domain }}" + +{% endfor %}