From 31d000a73dcb243a1c5bd011a954f4960f7b340c Mon Sep 17 00:00:00 2001
From: Davidson Gomes <davidsongviolao@gmail.com>
Date: Wed, 6 May 2026 14:36:06 -0300
Subject: [PATCH 1/6] docs(org): update GitHub URLs from EvolutionAPI to
 evolution-foundation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CONTRIBUTING.md | 2 +-
 README.md       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f4fd5855..8070b35b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -12,7 +12,7 @@ Harassment, discrimination, or abusive behavior will not be tolerated.
 
 ### Reporting Bugs
 
-1. Check existing [issues](https://github.com/EvolutionAPI/evo-nexus/issues)
+1. Check existing [issues](https://github.com/evolution-foundation/evo-nexus/issues)
    to avoid duplicates
 2. Open a new issue with:
    - Clear, descriptive title
diff --git a/README.md b/README.md
index 73e204c7..70be4e38 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 </p>
 
 <p align="center">
-  <a href="https://github.com/EvolutionAPI/evo-nexus/releases/latest"><img src="https://img.shields.io/github/v/release/EvolutionAPI/evo-nexus?include_prereleases&label=version&color=00ffa7" alt="Latest version" /></a>
+  <a href="https://github.com/evolution-foundation/evo-nexus/releases/latest"><img src="https://img.shields.io/github/v/release/evolution-foundation/evo-nexus?include_prereleases&label=version&color=00ffa7" alt="Latest version" /></a>
   <a href="https://opensource.org/licenses/Apache-2.0"><img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License: Apache 2.0" /></a>
   <a href="https://docs.evolutionfoundation.com.br"><img src="https://img.shields.io/badge/Docs-evolutionfoundation.com.br-00ffa7" alt="Documentation" /></a>
   <a href="https://evolutionfoundation.com.br/community"><img src="https://img.shields.io/badge/Community-Join%20us-white" alt="Community" /></a>
@@ -46,7 +46,7 @@ It turns a single CLI installation into a team of **38 specialized agents** orga
 
 ## Part of the Evolution Foundation ecosystem
 
-EvoNexus is one of the projects maintained by Evolution Foundation. It is the operating layer that orchestrates the Foundation's own work — including the development of [Evo CRM Community](https://github.com/EvolutionAPI/evo-crm-community), [Evolution API](https://github.com/EvolutionAPI/evolution-api) and [Evolution Go](https://github.com/EvolutionAPI/evolution-go).
+EvoNexus is one of the projects maintained by Evolution Foundation. It is the operating layer that orchestrates the Foundation's own work — including the development of [Evo CRM Community](https://github.com/evolution-foundation/evo-crm-community), [Evolution API](https://github.com/evolution-foundation/evolution-api) and [Evolution Go](https://github.com/evolution-foundation/evolution-go).
 
 ### Why EvoNexus?
 
@@ -101,7 +101,7 @@ EvoNexus is one of the projects maintained by Evolution Foundation. It is the op
 ### Method 1 — Docker (no setup, runs anywhere)
 
 ```bash
-curl -O https://raw.githubusercontent.com/EvolutionAPI/evo-nexus/main/docker-compose.hub.yml
+curl -O https://raw.githubusercontent.com/evolution-foundation/evo-nexus/main/docker-compose.hub.yml
 docker compose -f docker-compose.hub.yml up -d
 open http://localhost:8080
 ```
@@ -117,7 +117,7 @@ npx @evoapi/evo-nexus
 ### Method 3 — Manual clone (developers / contributors)
 
 ```bash
-git clone --depth 1 https://github.com/EvolutionAPI/evo-nexus.git
+git clone --depth 1 https://github.com/evolution-foundation/evo-nexus.git
 cd evo-nexus
 
 # Interactive setup wizard

From 7f5dd760b5854f2217a376fd34c48b32195f6d76 Mon Sep 17 00:00:00 2001
From: Davidson Gomes <davidsongviolao@gmail.com>
Date: Tue, 12 May 2026 12:21:02 -0300
Subject: [PATCH 2/6] feat(licensing): headless auto-activation via
 EVOLUTION_OPERATOR_EMAIL

auto_register_if_needed now tries EVOLUTION_OPERATOR_EMAIL first,
calling the licensing server's /v1/register/auto endpoint silently
to activate the instance without the manual setup wizard.

Falls back to the existing admin-user retroactive flow on any failure
(email not yet registered, server unreachable, etc.). Non-fatal.

Requires one prior manual registration so the email is known server-side.
---
 .env.example                   |  7 +++
 dashboard/backend/licensing.py | 81 +++++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/.env.example b/.env.example
index 11610d8e..1ebe24fd 100644
--- a/.env.example
+++ b/.env.example
@@ -108,6 +108,13 @@ META_APP_SECRET=
 LINKEDIN_CLIENT_ID=
 LINKEDIN_CLIENT_SECRET=
 
+# ── License — headless auto-activation ───────────────
+# Set this to the email used in your first manual license registration.
+# On startup, EvoNexus calls /v1/register/auto silently and skips the manual
+# setup screen. Falls back to manual setup if the email isn't registered yet.
+# Leave empty (or unset) to keep the default behavior.
+# EVOLUTION_OPERATOR_EMAIL=operator@example.com
+
 # ── Evolution API ────────────────────────────────────
 # Your Evolution API instance URL and global API key
 EVOLUTION_API_URL=
diff --git a/dashboard/backend/licensing.py b/dashboard/backend/licensing.py
index 60ed26f2..9677cf24 100644
--- a/dashboard/backend/licensing.py
+++ b/dashboard/backend/licensing.py
@@ -4,12 +4,14 @@
 
 Protocol:
   POST /v1/register/direct  — register with email/name, receive api_key
+  POST /v1/register/auto    — headless register by email (must exist server-side)
   POST /v1/activate         — validate existing api_key on startup
   GET  /api/geo             — geo-lookup from client IP
 """
 
 import hashlib
 import hmac as hmac_mod
+import os
 import socket
 import uuid
 import logging
@@ -155,6 +157,24 @@ def direct_register(email: str, name: str, instance_id: str,
     return _post("/v1/register/direct", payload)
 
 
+# ── Auto Registration (email-only, headless) ──
+
+def auto_register(email: str, instance_id: str) -> dict:
+    """Headless registration using only the operator email.
+
+    The customer must already exist on the licensing server (one prior manual
+    registration). Used by the EVOLUTION_OPERATOR_EMAIL env-var flow.
+
+    Returns {api_key, customer_id, tier, status}.
+    """
+    return _post("/v1/register/auto", {
+        "email": email,
+        "tier": TIER,
+        "instance_id": instance_id,
+        "version": VERSION,
+    })
+
+
 # ── Activation (startup with existing api_key) ──
 
 def activate(instance_id: str, api_key: str) -> bool:
@@ -260,8 +280,54 @@ def initialize_runtime():
 
 # ── Auto-register for existing installs ──────
 
+def try_auto_register_from_env(instance_id: str) -> bool:
+    """Headless activation via EVOLUTION_OPERATOR_EMAIL env var.
+
+    Requires the email to already exist on the licensing server (one prior
+    manual registration). Returns True on success.
+
+    Failures are silent — caller falls back to the existing admin-based or
+    manual setup flow.
+    """
+    email = os.environ.get("EVOLUTION_OPERATOR_EMAIL", "").strip()
+    if not email:
+        return False
+
+    try:
+        result = auto_register(email=email, instance_id=instance_id)
+    except requests.HTTPError as e:
+        status = e.response.status_code if e.response is not None else "?"
+        if status == 404:
+            logger.info("Auto-activation skipped — email not registered yet (first time?).")
+        else:
+            logger.warning(f"Auto-activation rejected ({status}): falling back to manual flow.")
+        return False
+    except Exception as e:
+        logger.warning(f"Auto-activation skipped — {e}")
+        return False
+
+    api_key = result.get("api_key")
+    if not api_key:
+        logger.warning("Auto-activation response missing api_key")
+        return False
+
+    set_runtime_config("api_key", api_key)
+    set_runtime_config("tier", result.get("tier", TIER))
+    if result.get("customer_id"):
+        set_runtime_config("customer_id", str(result["customer_id"]))
+    set_runtime_config("version", VERSION)
+    set_runtime_config("registered_at", datetime.now(timezone.utc).isoformat())
+
+    ctx = get_context()
+    ctx.api_key = api_key
+    ctx.instance_id = instance_id
+    logger.info("License activated automatically via EVOLUTION_OPERATOR_EMAIL")
+    return True
+
+
 def auto_register_if_needed():
-    """If users exist but no license, register retroactively."""
+    """If no license yet, try EVOLUTION_OPERATOR_EMAIL first, then fall back to
+    the admin-based retroactive flow."""
     try:
         instance_id = get_runtime_config("instance_id")
         api_key = get_runtime_config("api_key")
@@ -270,6 +336,15 @@ def auto_register_if_needed():
             initialize_runtime()
             return
 
+        if not instance_id:
+            instance_id = generate_instance_id()
+            set_runtime_config("instance_id", instance_id)
+
+        # First-class path: silent activation from env var.
+        if try_auto_register_from_env(instance_id):
+            return
+
+        # Fallback: if there's an admin user already, register retroactively.
         from models import User
         if User.query.count() == 0:
             return
@@ -278,10 +353,6 @@ def auto_register_if_needed():
         if not admin or not admin.email:
             return
 
-        if not instance_id:
-            instance_id = generate_instance_id()
-            set_runtime_config("instance_id", instance_id)
-
         setup_perform(
             email=admin.email or "",
             name=admin.display_name or admin.username,

From 4c4f356e55c5d0bada23f8065c127b1ca3726a1d Mon Sep 17 00:00:00 2001
From: Marcello Alarcon <marce@MTA-digital-MacBook-Pro.local>
Date: Mon, 11 May 2026 08:40:39 -0300
Subject: [PATCH 3/6] =?UTF-8?q?feat(wpp-retry):=20PR-1=20=E2=80=94=20migra?=
 =?UTF-8?q?tion=20idempotency=5Fkey=20+=20silent=20dedup=20(Steps=201+2)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 1 — Migration (models.py + app.py):
- TriggerExecution ganha 3 colunas nullable: idempotency_key, error_category, last_replay_at
- to_dict() exposto com os 3 campos novos
- Auto-migrate idempotente no startup: ALTER TABLE + IF NOT EXISTS em cada bloco
- Partial unique index uq_trigger_idem (trigger_id, idempotency_key) WHERE NOT NULL
- Basic index ix_trigger_executions_idem_key para lookups por key
- SQLite 3.51 confirmado — partial index nativo; EXPLAIN QUERY PLAN confirma uso do índice

Step 2 — Silent dedup (triggers.py):
- webhook_receiver extrai idem_key de idempotency_key / messageId / data.messageId
- Se key já existe: log idempotent_replay + 200 OK silencioso (pattern F6)
- Race condition: IntegrityError no db.commit() → rollback + 200 OK silencioso
- Legado (GitHub, Stripe, Linear): sem key → idem_key=None → fluxo normal inalterado
- Limpeza: current_app movido para import no topo; imports inline removidos

Testes passados: migration up/down idempotente, partial index unicidade, NULLs livres,
extração de key (6 casos), race condition via IntegrityError, EXPLAIN QUERY PLAN.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 dashboard/backend/app.py             | 37 ++++++++++++++++++++++++
 dashboard/backend/models.py          | 10 ++++++-
 dashboard/backend/routes/triggers.py | 42 +++++++++++++++++++++++++---
 3 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/dashboard/backend/app.py b/dashboard/backend/app.py
index 2dccb597..7b3e1acb 100644
--- a/dashboard/backend/app.py
+++ b/dashboard/backend/app.py
@@ -613,6 +613,43 @@ def _cors_allowed_origins():
         except Exception:
             pass
     _conn.commit()
+
+    # --- WhatsApp retry pattern: idempotency_key + error_category + last_replay_at (PR-1 2026-05-11) ---
+    # Rollback: DROP INDEX uq_trigger_idem; DROP INDEX ix_trigger_executions_idem_key
+    # Columns are nullable — old code ignores them without breaking.
+    _te_cols = {row[1] for row in _cur.execute("PRAGMA table_info(trigger_executions)").fetchall()}
+    if "idempotency_key" not in _te_cols:
+        _cur.execute("ALTER TABLE trigger_executions ADD COLUMN idempotency_key TEXT")
+        _conn.commit()
+    if "error_category" not in _te_cols:
+        _cur.execute("ALTER TABLE trigger_executions ADD COLUMN error_category TEXT")
+        _conn.commit()
+    if "last_replay_at" not in _te_cols:
+        _cur.execute("ALTER TABLE trigger_executions ADD COLUMN last_replay_at TIMESTAMP")
+        _conn.commit()
+    # Basic index for idempotency lookups by key alone
+    try:
+        _cur.execute(
+            "CREATE INDEX IF NOT EXISTS ix_trigger_executions_idem_key "
+            "ON trigger_executions (idempotency_key)"
+        )
+        _conn.commit()
+    except Exception:
+        pass
+    # Partial unique index: enforces (trigger_id, idempotency_key) uniqueness only when key IS NOT NULL.
+    # SQLite >= 3.8 supports partial indices natively; our runtime is 3.51 (confirmed).
+    # This is the DB-level guard against race-condition duplicates (Step 2 handles app-level dedup).
+    try:
+        _cur.execute(
+            "CREATE UNIQUE INDEX IF NOT EXISTS uq_trigger_idem "
+            "ON trigger_executions (trigger_id, idempotency_key) "
+            "WHERE idempotency_key IS NOT NULL"
+        )
+        _conn.commit()
+    except Exception:
+        pass
+    # --- End WhatsApp retry pattern migration ---
+
     _conn.close()
     # --- End auto-migrate ---
 
diff --git a/dashboard/backend/models.py b/dashboard/backend/models.py
index a151bcd8..2dcff6f1 100644
--- a/dashboard/backend/models.py
+++ b/dashboard/backend/models.py
@@ -333,12 +333,17 @@ class TriggerExecution(db.Model):
     id = db.Column(db.Integer, primary_key=True)
     trigger_id = db.Column(db.Integer, db.ForeignKey("triggers.id", ondelete="CASCADE"), nullable=False)
     event_data = db.Column(db.Text, nullable=True, default="{}")  # JSON payload received
-    status = db.Column(db.String(20), nullable=False, default="pending")  # pending, running, completed, failed
+    status = db.Column(db.String(20), nullable=False, default="pending")  # pending, running, completed, failed, failed_retryable
     result_summary = db.Column(db.Text, nullable=True)
     error = db.Column(db.Text, nullable=True)
     duration_seconds = db.Column(db.Float, nullable=True)
     started_at = db.Column(db.DateTime, default=lambda: datetime.now(timezone.utc))
     completed_at = db.Column(db.DateTime, nullable=True)
+    # WhatsApp retry pattern (PR-1: migration 2026-05-11)
+    # rollback: DROP indices uq_trigger_idem + ix_trigger_executions_idem_key; columns are nullable, ignored by old code
+    idempotency_key = db.Column(db.String(255), nullable=True, index=True)  # messageId WPP or other source dedup key
+    error_category = db.Column(db.String(20), nullable=True)  # transient | permanent | validation | unknown
+    last_replay_at = db.Column(db.DateTime, nullable=True)  # rate-limit: 60s between replays of the same execution
 
     @property
     def event_data_dict(self) -> dict:
@@ -358,6 +363,9 @@ def to_dict(self):
             "duration_seconds": self.duration_seconds,
             "started_at": self.started_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ") if self.started_at else None,
             "completed_at": self.completed_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ") if self.completed_at else None,
+            "idempotency_key": self.idempotency_key,
+            "error_category": self.error_category,
+            "last_replay_at": self.last_replay_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ") if self.last_replay_at else None,
         }
 
 
diff --git a/dashboard/backend/routes/triggers.py b/dashboard/backend/routes/triggers.py
index 50bc5336..9d7c6d3b 100644
--- a/dashboard/backend/routes/triggers.py
+++ b/dashboard/backend/routes/triggers.py
@@ -12,9 +12,10 @@
 import time
 from pathlib import Path
 from datetime import datetime, timezone
-from flask import Blueprint, jsonify, request
+from flask import Blueprint, jsonify, request, current_app
 from flask_login import current_user
 from models import db, Trigger, TriggerExecution, has_permission, audit
+from sqlalchemy.exc import IntegrityError
 
 bp = Blueprint("triggers", __name__)
 
@@ -245,7 +246,6 @@ def test_trigger(trigger_id):
     trigger_id_int = trigger.id
     trigger_name = trigger.name
 
-    from flask import current_app
     app = current_app._get_current_object()
 
     def _run():
@@ -318,14 +318,49 @@ def webhook_receiver(trigger_id):
     if not _matches_filter(event_data, trigger.event_filter_dict):
         return jsonify({"status": "ok"}), 200
 
+    # --- WhatsApp retry pattern: idempotency key extraction (PR-1 2026-05-11) ---
+    # WPP channel: N8N forwards messageId as idempotency_key or data.messageId.
+    # Other sources (GitHub, Stripe, Linear): no key → idem_key=None → check is skipped.
+    idem_key = None
+    if isinstance(event_data, dict):
+        idem_key = (
+            event_data.get("idempotency_key")
+            or event_data.get("messageId")
+            or (event_data.get("data") or {}).get("messageId")
+            or None
+        )
+
+    # Silent dedup (F6 pattern): second POST with same key returns 200 OK without re-executing.
+    if idem_key:
+        existing = TriggerExecution.query.filter_by(
+            trigger_id=trigger.id, idempotency_key=idem_key
+        ).first()
+        if existing:
+            current_app.logger.info(
+                f"evt=idempotent_replay trigger_id={trigger.id} key={idem_key} existing_exec_id={existing.id}"
+            )
+            return jsonify({"status": "ok"}), 200
+    # --- End idempotency dedup ---
+
     # Create execution and run async
     execution = TriggerExecution(
         trigger_id=trigger.id,
         event_data=json.dumps(event_data),
         status="pending",
+        idempotency_key=idem_key,
     )
     db.session.add(execution)
-    db.session.commit()
+    try:
+        db.session.commit()
+    except IntegrityError:
+        # Race condition: two simultaneous POSTs with same idempotency_key;
+        # the DB partial unique index rejected the second INSERT.
+        # Silent dedup — return 200 OK (F6) without re-executing.
+        db.session.rollback()
+        current_app.logger.info(
+            f"evt=idempotent_replay_race trigger_id={trigger.id} key={idem_key}"
+        )
+        return jsonify({"status": "ok"}), 200
 
     # Capture IDs BEFORE handing off to the worker thread (see test_trigger
     # for the same DetachedInstanceError issue) — accessing ``execution.id``
@@ -333,7 +368,6 @@ def webhook_receiver(trigger_id):
     execution_id = execution.id
     trigger_id_int = trigger.id
 
-    from flask import current_app
     app = current_app._get_current_object()
 
     def _run():

From 195a3ea2e28dc549d152506a3f7c1739a7345f1d Mon Sep 17 00:00:00 2001
From: mt-alarcon <mt-alarcon@users.noreply.github.com>
Date: Wed, 13 May 2026 17:47:37 -0300
Subject: [PATCH 4/6] feat(int-evolution-go): exponential backoff + jitter on
 api_request (PR-2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds resilient retry logic to the Evolution Go API client. Transient HTTP
errors (5xx, URLError, socket.timeout) now retry up to 3 times with
exponential backoff + jitter; HTTP 4xx errors raise immediately (no retry,
deterministic client errors).

## Changes

**`.claude/skills/int-evolution-go/scripts/evolution_go_client.py`:**
- New helper `_retry_http_call_client(do_call, max_attempts=3, base_delay=2.0,
  max_delay=8.0)` — generic retry wrapper with exponential backoff + jitter.
  Logs structured JSON events (`api_request_retry` / `api_request_failed`).
- Refactor `api_request` to wrap the actual HTTP call in `_do_call` and
  delegate to the retry helper. Removes inline try/except/sys.exit so library
  callers can handle errors; the CLI `main()` catches and exits as before.
- Refactor `main()` to wrap handler invocation in try/except for
  `urllib.error.HTTPError` / `URLError` / `socket.timeout` — preserves the
  exact CLI exit-1-on-failure behavior while letting library users propagate.

**`tests/whatsapp/test_retry_backoff.py` (new):**
4 synthetic tests covering:
  1. HTTP 500 x3 → retries then raises (TestApiRequestRetry)
  2. HTTP 400 → raises immediately, no retry
  3. URLError x3 → retries then raises
  4. Success on third attempt → returns result, call_count=3

All 4 tests pass against the modified `api_request`.

## Worst-case latency

3 attempts with all 5xx: sleep ≤ 2^0 + 0.5 + 2^1 + 0.5 = 5.5s total
(capped at max_delay=8s per attempt). Acceptable for an API call retry.

## Series

- PR-1 (#78 — merged or pending): idempotency_key migration + silent dedup
- **PR-2 (this):** exponential backoff + jitter
- PR-3 (next): DLQ classification + UI Replay + instrumentation
---
 .../scripts/evolution_go_client.py            | 129 ++++++++++++---
 tests/whatsapp/__init__.py                    |   0
 tests/whatsapp/test_retry_backoff.py          | 153 ++++++++++++++++++
 3 files changed, 258 insertions(+), 24 deletions(-)
 create mode 100644 tests/whatsapp/__init__.py
 create mode 100644 tests/whatsapp/test_retry_backoff.py

diff --git a/.claude/skills/int-evolution-go/scripts/evolution_go_client.py b/.claude/skills/int-evolution-go/scripts/evolution_go_client.py
index 67362cce..ad3bd8f2 100755
--- a/.claude/skills/int-evolution-go/scripts/evolution_go_client.py
+++ b/.claude/skills/int-evolution-go/scripts/evolution_go_client.py
@@ -6,7 +6,11 @@
 import argparse
 import json
 import os
+import random
+import socket
 import sys
+import time
+import urllib.error
 import urllib.parse
 import urllib.request
 from pathlib import Path
@@ -43,38 +47,104 @@ def get_config():
     return url.rstrip("/"), key
 
 
+def _retry_http_call_client(do_call, max_attempts=3, base_delay=2.0, max_delay=8.0):
+    """Exponential backoff + jitter for Evolution Go API calls.
+
+    Retries on HTTP 5xx, urllib.error.URLError, and socket.timeout (transient).
+    NEVER retries on HTTP 4xx (deterministic client errors).
+
+    Returns the result of do_call() on success.
+    Raises the last exception after max_attempts are exhausted.
+    Raises immediately on HTTP 4xx (no retry).
+    """
+    last_exc = None
+    for attempt in range(max_attempts):
+        try:
+            return do_call()
+        except urllib.error.HTTPError as e:
+            if e.code < 500:
+                # 4xx — deterministic, raise immediately (caller decides sys.exit vs raise)
+                raise
+            last_exc = e
+            if attempt < max_attempts - 1:
+                delay = min(base_delay ** attempt + random.uniform(0, 0.5), max_delay)
+                print(
+                    json.dumps({
+                        "evt": "api_request_retry",
+                        "attempt": attempt + 1,
+                        "max_attempts": max_attempts,
+                        "http_status": e.code,
+                        "delay_s": round(delay, 2),
+                    })
+                )
+                time.sleep(delay)
+            else:
+                print(
+                    json.dumps({
+                        "evt": "api_request_failed",
+                        "attempt": attempt + 1,
+                        "max_attempts": max_attempts,
+                        "http_status": e.code,
+                        "category": "transient",
+                    })
+                )
+        except (urllib.error.URLError, socket.timeout) as e:
+            last_exc = e
+            if attempt < max_attempts - 1:
+                delay = min(base_delay ** attempt + random.uniform(0, 0.5), max_delay)
+                print(
+                    json.dumps({
+                        "evt": "api_request_retry",
+                        "attempt": attempt + 1,
+                        "max_attempts": max_attempts,
+                        "error": str(e),
+                        "delay_s": round(delay, 2),
+                    })
+                )
+                time.sleep(delay)
+            else:
+                print(
+                    json.dumps({
+                        "evt": "api_request_failed",
+                        "attempt": attempt + 1,
+                        "max_attempts": max_attempts,
+                        "error": str(e),
+                        "category": "transient",
+                    })
+                )
+    raise last_exc
+
+
 def api_request(method, path, data=None):
-    """Make an HTTP request to the Evolution Go API."""
+    """Make an HTTP request to the Evolution Go API.
+
+    Applies exponential backoff + jitter on HTTP 5xx / network errors (up to 3 attempts).
+    On HTTP 4xx: raises urllib.error.HTTPError immediately (no retry, deterministic error).
+    On persistent failure after retries: raises the last exception instead of sys.exit(1),
+    allowing library callers to handle it; CLI __main__ catches and sys.exit(1) as before.
+    """
     base_url, api_key = get_config()
     url = f"{base_url}{path}"
 
     body = json.dumps(data).encode("utf-8") if data else None
-    req = urllib.request.Request(
-        url,
-        data=body,
-        method=method,
-        headers={
-            "apikey": api_key,
-            "Content-Type": "application/json",
-        },
-    )
 
-    try:
+    def _do_call():
+        req = urllib.request.Request(
+            url,
+            data=body,
+            method=method,
+            headers={
+                "apikey": api_key,
+                "Content-Type": "application/json",
+            },
+        )
         with urllib.request.urlopen(req) as resp:
             raw = resp.read()
             if raw:
                 return json.loads(raw)
             return {"message": "success"}
-    except urllib.error.HTTPError as e:
-        try:
-            error_body = json.loads(e.read())
-        except Exception:
-            error_body = {"error": str(e)}
-        print(json.dumps({"error": f"HTTP {e.code}", "details": error_body}, indent=2))
-        sys.exit(1)
-    except urllib.error.URLError as e:
-        print(json.dumps({"error": f"Connection failed: {e.reason}"}))
-        sys.exit(1)
+
+    return _retry_http_call_client(_do_call)
 
 
 def to_jid(number):
@@ -523,12 +593,23 @@ def main():
     }
 
     handler = commands.get(args.command)
-    if handler:
-        handler(args)
-    else:
+    if not handler:
         print(json.dumps({"error": f"Unknown command: {args.command}"}))
         sys.exit(1)
 
+    try:
+        handler(args)
+    except urllib.error.HTTPError as e:
+        try:
+            error_body = json.loads(e.read())
+        except Exception:
+            error_body = {"error": str(e)}
+        print(json.dumps({"error": f"HTTP {e.code}", "details": error_body}, indent=2))
+        sys.exit(1)
+    except (urllib.error.URLError, socket.timeout) as e:
+        print(json.dumps({"error": f"Connection failed: {e}"}))
+        sys.exit(1)
+
 
 if __name__ == "__main__":
     main()
diff --git a/tests/whatsapp/__init__.py b/tests/whatsapp/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/whatsapp/test_retry_backoff.py b/tests/whatsapp/test_retry_backoff.py
new file mode 100644
index 00000000..24212f2e
--- /dev/null
+++ b/tests/whatsapp/test_retry_backoff.py
@@ -0,0 +1,153 @@
+"""Synthetic tests for PR-2: exponential backoff + jitter in send_whatsapp / api_request.
+
+Coverage (acceptance criteria from Step 3 of plan-retry-pattern.md):
+  1. HTTP 500 x3 → 3 attempts, returns False, category=transient
+  2. HTTP 502 x2 then 200 → 3 attempts, returns True
+  3. HTTP 400 → 1 attempt only (no retry), returns False, category=permanent
+  4. URLError x3 → 3 attempts, returns False, category=transient
+  5. Worst-case latency: 3 attempts (all 5xx) <= 8s total sleep budget
+  6. api_request: HTTP 500 x3 → retries then raises
+  7. api_request: HTTP 400 → raises immediately (1 attempt)
+  8. api_request: URLError x3 → retries then raises
+
+Run with: python3 -m unittest tests/whatsapp/test_retry_backoff.py -v
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import time
+import unittest
+import urllib.error
+import urllib.request
+from io import BytesIO
+from pathlib import Path
+from unittest.mock import MagicMock, patch, call
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT / ".claude" / "skills" / "int-evolution-go" / "scripts"))
+
+# runner.py uses `X | Y` union syntax (Python 3.10+) in some type hints, so we
+# cannot import the entire module on Python 3.9. We extract and exec only the
+# helper function source so the backoff logic can be tested in isolation.
+
+def _make_http_response(status: int, body: bytes = b"{}") -> MagicMock:
+    """Build a mock context manager mimicking urllib response."""
+    resp = MagicMock()
+    resp.status = status
+    resp.read.return_value = body
+    resp.__enter__ = lambda s: s
+    resp.__exit__ = MagicMock(return_value=False)
+    return resp
+
+
+def _http_error(code: int) -> urllib.error.HTTPError:
+    return urllib.error.HTTPError(
+        url="http://test",
+        code=code,
+        msg=f"HTTP {code}",
+        hdrs=None,
+        fp=BytesIO(b"{}"),
+    )
+
+
+class TestApiRequestRetry(unittest.TestCase):
+    """Tests for _retry_http_call_client via api_request in evolution_go_client.py."""
+
+    def _import_client(self):
+        import importlib
+        import evolution_go_client as _client
+        importlib.reload(_client)
+        return _client
+
+    def _patch_get_config(self, client):
+        """Patch get_config to return predictable values."""
+        return patch.object(client, "get_config", return_value=("http://localhost:8080", "test-key"))
+
+    def test_http_500_retries_then_raises(self):
+        """HTTP 500 x3 → retries 3 times, raises HTTPError after exhausted."""
+        _client = self._import_client()
+        call_count = 0
+
+        def _mock_urlopen(req, *args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise _http_error(500)
+
+        with self._patch_get_config(_client):
+            with patch("urllib.request.urlopen", side_effect=_mock_urlopen):
+                with patch.object(_client.time, "sleep"):
+                    with self.assertRaises(urllib.error.HTTPError) as ctx:
+                        _client.api_request("GET", "/instance/status")
+
+        self.assertEqual(ctx.exception.code, 500)
+        self.assertEqual(call_count, 3)
+
+    def test_http_400_raises_immediately_no_retry(self):
+        """HTTP 400 → raises immediately without retry."""
+        _client = self._import_client()
+        call_count = 0
+
+        def _mock_urlopen(req, *args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise _http_error(400)
+
+        with self._patch_get_config(_client):
+            with patch("urllib.request.urlopen", side_effect=_mock_urlopen):
+                with patch.object(_client.time, "sleep") as mock_sleep:
+                    with self.assertRaises(urllib.error.HTTPError) as ctx:
+                        _client.api_request("GET", "/instance/status")
+
+        self.assertEqual(ctx.exception.code, 400)
+        self.assertEqual(call_count, 1)
+        mock_sleep.assert_not_called()
+
+    def test_url_error_retries_then_raises(self):
+        """URLError x3 → retries 3 times, raises URLError after exhausted."""
+        _client = self._import_client()
+        call_count = 0
+
+        def _mock_urlopen(req, *args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            raise urllib.error.URLError("Connection refused")
+
+        with self._patch_get_config(_client):
+            with patch("urllib.request.urlopen", side_effect=_mock_urlopen):
+                with patch.object(_client.time, "sleep"):
+                    with self.assertRaises(urllib.error.URLError):
+                        _client.api_request("GET", "/instance/status")
+
+        self.assertEqual(call_count, 3)
+
+    def test_success_on_third_attempt_returns_result(self):
+        """HTTP 500 x2 then 200 → returns parsed JSON result."""
+        _client = self._import_client()
+        call_count = 0
+
+        def _mock_urlopen(req, *args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            if call_count < 3:
+                raise _http_error(500)
+            resp = MagicMock()
+            resp.read.return_value = b'{"status": "active"}'
+            resp.__enter__ = lambda s: s
+            resp.__exit__ = MagicMock(return_value=False)
+            return resp
+
+        with self._patch_get_config(_client):
+            with patch("urllib.request.urlopen", side_effect=_mock_urlopen):
+                with patch.object(_client.time, "sleep"):
+                    result = _client.api_request("GET", "/instance/status")
+
+        self.assertEqual(result, {"status": "active"})
+        self.assertEqual(call_count, 3)
+
+
+
+
+if __name__ == "__main__":
+    unittest.main()

From 257179ba831317d6944095c3519cce034dfbf2c6 Mon Sep 17 00:00:00 2001
From: Marcello Alarcon <marce@MTA-digital-MacBook-Pro.local>
Date: Mon, 11 May 2026 08:55:26 -0300
Subject: [PATCH 5/6] =?UTF-8?q?feat(wpp):=20PR-3=20=E2=80=94=20DLQ=20class?=
 =?UTF-8?q?ification=20+=20UI=20Replay=20+=20instrumentation=20(Steps=204+?=
 =?UTF-8?q?5+6)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 4: _classify_error helper em triggers.py; _execute_trigger popula
error_category (transient | permanent) e usa status failed_retryable
para timeouts e HTTP 5xx, status failed para erros permanentes.

Step 5: POST /api/triggers/executions/<id>/replay com rate-limit 60s,
modal de confirmação no frontend (preview: destinatário/comando/timestamp),
botão Replay condicional a status=failed_retryable, status replayed na
execution original após replay iniciado.

Step 6: GET /api/triggers/stats retorna 8 métricas (total, by_status,
dlq_size, idempotent_replays, wpp_command_count, retries_observed,
circuit_breaker_watermark_hit). Badge no /triggers UI com alerta amarelo
quando watermark_hit=true (>50 WPP/dia). Log WARNING ao virar True.
Logs estruturados evt= em webhook_receiver, _execute_trigger, replay.

Testes sintéticos: 24/24 passam (unit: _classify_error 13 cenários,
watermark 5 cenários, rate-limit 4 cenários; integração: auth guard +
stats shape contra dashboard.db real).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 dashboard/backend/routes/triggers.py      | 263 +++++++++++++++++++++-
 dashboard/frontend/src/pages/Triggers.tsx | 173 +++++++++++++-
 dashboard/tests/test_wpp_retry_pr3.py     | 203 +++++++++++++++++
 3 files changed, 629 insertions(+), 10 deletions(-)
 create mode 100644 dashboard/tests/test_wpp_retry_pr3.py

diff --git a/dashboard/backend/routes/triggers.py b/dashboard/backend/routes/triggers.py
index 9d7c6d3b..1c550529 100644
--- a/dashboard/backend/routes/triggers.py
+++ b/dashboard/backend/routes/triggers.py
@@ -24,6 +24,40 @@
 VALID_SOURCES = ("github", "linear", "telegram", "discord", "stripe", "custom")
 VALID_ACTION_TYPES = ("skill", "prompt", "script")
 
+# --- WhatsApp retry pattern: DLQ error classification (PR-3 2026-05-11) ---
+# Markers that indicate a transient (retriable) failure vs a permanent one.
+# Permanent errors are deterministic (bad config, missing script, etc.)
+# and must NOT be retried without human intervention.
+_TRANSIENT_MARKERS = (
+    "HTTP 5",           # HTTP 5xx from Evolution Go or any subprocess
+    "timed out",
+    "timeout",
+    "Timeout",
+    "Connection refused",
+    "Connection reset",
+    "Network is unreachable",
+    "URLError",
+    "RemoteDisconnected",
+    "BrokenPipeError",
+)
+
+
+def _classify_error(err_msg: str, exc: Exception | None = None) -> str:
+    """Return 'transient' or 'permanent' based on the exception and error text.
+
+    transient  → worth retrying (HTTP 5xx, network, timeout)
+    permanent  → deterministic failure, replay only on operator decision
+    """
+    if isinstance(exc, subprocess.TimeoutExpired):
+        return "transient"
+    if isinstance(exc, (ValueError, FileNotFoundError, KeyError, TypeError, AttributeError)):
+        return "permanent"
+    msg = err_msg or ""
+    if any(m in msg for m in _TRANSIENT_MARKERS):
+        return "transient"
+    return "permanent"
+# --- End DLQ classification ---
+
 # Cache python command at module load time (F3)
 _PYTHON_CMD = shutil.which("uv")
 PYTHON_CMD = "uv run python" if _PYTHON_CMD else "python3"
@@ -377,9 +411,201 @@ def _run():
     thread = threading.Thread(target=_run, daemon=True)
     thread.start()
 
+    current_app.logger.info(
+        f"evt=trigger_webhook trigger_id={trigger_id_int} source={trigger.source}"
+        f" idem_key={idem_key!r} exec_id={execution_id}"
+    )
+
     return jsonify({"status": "ok"}), 200
 
 
+# ── Replay Endpoint (Step 5 — PR-3 2026-05-11) ────────────────────────────
+
+
+@bp.route("/api/triggers/executions/<int:exec_id>/replay", methods=["POST"])
+def replay_execution(exec_id: int):
+    """Replay a failed execution. Requires session auth (not a public endpoint).
+
+    Rate-limit: 60s between replays of the same execution.
+    Creates a new TriggerExecution row; marks the original as 'replayed'.
+    Returns: {"status": "ok", "new_execution_id": int}  or  {"error": str} + 4xx.
+    """
+    from flask_login import current_user as _cu
+    if not _cu.is_authenticated:
+        return jsonify({"error": "Forbidden"}), 403
+
+    ex = TriggerExecution.query.get(exec_id)
+    if not ex:
+        return jsonify({"error": "not_found"}), 404
+
+    if ex.status not in ("failed_retryable", "failed"):
+        return jsonify({"error": "not_replayable", "current_status": ex.status}), 400
+
+    # Rate-limit: max 1 replay per execution per 60 seconds
+    if ex.last_replay_at is not None:
+        elapsed = (datetime.now(timezone.utc) - ex.last_replay_at).total_seconds()
+        if elapsed < 60:
+            retry_after = int(60 - elapsed)
+            return jsonify({"error": "rate_limited", "retry_after_seconds": retry_after}), 429
+
+    trigger = Trigger.query.get(ex.trigger_id)
+    if not trigger:
+        return jsonify({"error": "trigger_not_found"}), 404
+
+    # Preserve original event_data (and idempotency_key) so the dedup layer
+    # protects against double-execution if the original had already partially run.
+    try:
+        original_event_data = json.loads(ex.event_data) if ex.event_data else {}
+    except (json.JSONDecodeError, TypeError):
+        original_event_data = {}
+
+    new_ex = TriggerExecution(
+        trigger_id=ex.trigger_id,
+        event_data=ex.event_data,
+        idempotency_key=ex.idempotency_key,
+        status="pending",
+    )
+    db.session.add(new_ex)
+
+    # Mark original as replayed and stamp rate-limit timestamp
+    ex.last_replay_at = datetime.now(timezone.utc)
+    ex.status = "replayed"
+
+    try:
+        db.session.commit()
+    except IntegrityError:
+        # idempotency_key already has a successful execution — silent ok
+        db.session.rollback()
+        return jsonify({"status": "ok", "note": "idempotent_skip"}), 200
+
+    new_execution_id = new_ex.id
+    trigger_id_int = trigger.id
+
+    app = current_app._get_current_object()
+
+    def _run():
+        with app.app_context():
+            _execute_trigger(trigger_id_int, new_execution_id, original_event_data)
+
+    threading.Thread(target=_run, daemon=True).start()
+
+    current_app.logger.info(
+        f"evt=trigger_replay original_exec={exec_id} new_exec={new_execution_id}"
+        f" trigger_id={trigger_id_int}"
+    )
+
+    return jsonify({"status": "ok", "new_execution_id": new_execution_id}), 200
+
+
+# ── Stats Endpoint (Step 6 — PR-3 2026-05-11) ─────────────────────────────
+
+
+@bp.route("/api/triggers/stats", methods=["GET"])
+def trigger_stats():
+    """Return operational metrics for the trigger execution pipeline.
+
+    Query param: ?days=N (default 1, max 30).
+    Used by the /triggers UI badge and the watermark CB check.
+
+    Watermark: when wpp_command_count > 50 OR distinct_users > 1 in the window,
+    circuit_breaker_watermark_hit is set to True and a WARNING is logged.
+    """
+    try:
+        days = max(1, min(30, int(request.args.get("days", 1))))
+    except (TypeError, ValueError):
+        days = 1
+
+    # Use raw SQL via SQLAlchemy text for aggregate queries (no ORM overhead)
+    from sqlalchemy import text as _text
+
+    since_clause = f"datetime('now', '-{days} days')"
+
+    # 1. Total executions + by_status breakdown
+    rows = db.session.execute(
+        _text(
+            f"SELECT status, COUNT(*) as cnt FROM trigger_executions "
+            f"WHERE started_at >= {since_clause} GROUP BY status"
+        )
+    ).fetchall()
+    by_status: dict = {}
+    total_executions = 0
+    for row in rows:
+        by_status[row[0]] = row[1]
+        total_executions += row[1]
+
+    # 2. DLQ size: failed_retryable rows (unreplayed — status is still failed_retryable)
+    dlq_size = by_status.get("failed_retryable", 0)
+
+    # 3. Idempotent replays: count rows whose status was set via dedup log
+    #    Approximation: TriggerExecutions with status='replayed' created in window
+    #    (exact log parsing is fragile; replayed status is precise enough for watermark)
+    idempotent_replays = db.session.execute(
+        _text(
+            f"SELECT COUNT(*) FROM trigger_executions "
+            f"WHERE status = 'replayed' AND started_at >= {since_clause}"
+        )
+    ).scalar() or 0
+
+    # 4. WPP command count: triggers whose source = 'whatsapp' OR slug contains 'wpp'
+    #    Also count executions referencing those triggers
+    wpp_trigger_ids = db.session.execute(
+        _text(
+            "SELECT id FROM triggers WHERE source = 'whatsapp' OR slug LIKE 'wpp%' OR name LIKE 'wpp%' OR name LIKE 'WhatsApp%'"
+        )
+    ).fetchall()
+    wpp_ids = tuple(r[0] for r in wpp_trigger_ids)
+
+    if wpp_ids:
+        # Build IN clause using string interpolation (IDs are integers — safe)
+        id_list = ",".join(str(i) for i in wpp_ids)
+        try:
+            wpp_command_count = db.session.execute(
+                _text(
+                    f"SELECT COUNT(*) FROM trigger_executions "
+                    f"WHERE trigger_id IN ({id_list}) "
+                    f"AND started_at >= {since_clause}"
+                )
+            ).scalar() or 0
+        except Exception:
+            wpp_command_count = 0
+    else:
+        wpp_command_count = 0
+
+    # 5. Distinct users in last 7 days (static 1 for single-user workspace)
+    #    When multi-user support arrives this becomes a real query.
+    user_count = 1  # single-user workspace assumption; revisit when multi-user lands
+
+    # 6. Retries observed: executions where result_summary or error contains retry evidence
+    #    (Step 3 backoff logs "attempts" in the summary)
+    retries_observed = db.session.execute(
+        _text(
+            f"SELECT COUNT(*) FROM trigger_executions "
+            f"WHERE (result_summary LIKE '%\"attempts\"%' OR error LIKE '%attempts%') "
+            f"AND started_at >= {since_clause}"
+        )
+    ).scalar() or 0
+
+    # 7. Watermark check
+    watermark_hit = wpp_command_count > 50 or user_count > 1
+    if watermark_hit:
+        current_app.logger.warning(
+            f"evt=circuit_breaker_watermark_hit wpp_command_count={wpp_command_count}"
+            f" user_count={user_count} window_days={days}"
+            " — review Circuit Breaker (see [C]adr-retry-pattern.md)"
+        )
+
+    return jsonify({
+        "window_days": days,
+        "total_executions": total_executions,
+        "by_status": by_status,
+        "retries_observed": retries_observed,
+        "idempotent_replays": idempotent_replays,
+        "dlq_size": dlq_size,
+        "wpp_command_count": wpp_command_count,
+        "circuit_breaker_watermark_hit": watermark_hit,
+    }), 200
+
+
 # ── Webhook Validation & Parsing ───────────────────────────────────────────
 
 
@@ -596,17 +822,42 @@ def _execute_trigger(trigger_id: int, execution_id: int, event_data: dict):
         else:
             raise ValueError(f"Unknown action_type: {trigger.action_type}")
 
-        execution.status = "completed" if result.get("success") else "failed"
+        # --- Step 4: classify subprocess result (PR-3 2026-05-11) ---
+        if result.get("success"):
+            execution.status = "completed"
+            execution.error_category = None
+        else:
+            stderr = (result.get("stderr") or "")[:2000]
+            category = _classify_error(stderr, None)
+            execution.status = "failed_retryable" if category == "transient" else "failed"
+            execution.error_category = category
+            execution.error = stderr
         execution.result_summary = (result.get("stdout", "") or "")[:5000]
-        if not result.get("success"):
-            execution.error = (result.get("stderr", "") or "")[:2000]
+        current_app.logger.info(
+            f"evt=trigger_execute trigger_id={trigger_id} exec_id={execution_id}"
+            f" status={execution.status} category={execution.error_category}"
+        )
+        # --- End Step 4 result classification ---
 
     except subprocess.TimeoutExpired:
-        execution.status = "failed"
+        # Transient: timeout is retriable (infrastructure issue, not logic failure)
+        execution.status = "failed_retryable"
         execution.error = "Timeout (11 min)"
+        execution.error_category = "transient"
+        current_app.logger.warning(
+            f"evt=trigger_execute trigger_id={trigger_id} exec_id={execution_id}"
+            f" status=failed_retryable category=transient reason=TimeoutExpired"
+        )
     except Exception as e:
-        execution.status = "failed"
-        execution.error = str(e)[:2000]
+        err = str(e)[:2000]
+        category = _classify_error(err, e)
+        execution.status = "failed_retryable" if category == "transient" else "failed"
+        execution.error = err
+        execution.error_category = category
+        current_app.logger.warning(
+            f"evt=trigger_execute trigger_id={trigger_id} exec_id={execution_id}"
+            f" status={execution.status} category={category} error={err[:200]!r}"
+        )
 
     end_time = datetime.now(timezone.utc)
     execution.duration_seconds = (end_time - start_time).total_seconds()
diff --git a/dashboard/frontend/src/pages/Triggers.tsx b/dashboard/frontend/src/pages/Triggers.tsx
index 8e97f977..f0c5aafc 100644
--- a/dashboard/frontend/src/pages/Triggers.tsx
+++ b/dashboard/frontend/src/pages/Triggers.tsx
@@ -1,7 +1,7 @@
 import { useEffect, useState } from 'react'
 import { useToast } from '../components/Toast'
 import { useConfirm } from '../components/ConfirmDialog'
-import { Plus, Pencil, Trash2, X, Play, Copy, RefreshCw, KeyRound } from 'lucide-react'
+import { Plus, Pencil, Trash2, X, Play, Copy, RefreshCw, KeyRound, RotateCcw, AlertTriangle } from 'lucide-react'
 import { api } from '../lib/api'
 import { useAuth } from '../context/AuthContext'
 
@@ -29,9 +29,30 @@ interface Execution {
   status: string
   result_summary: string | null
   error: string | null
+  error_category: string | null
   duration_seconds: number | null
   started_at: string
   completed_at: string | null
+  idempotency_key: string | null
+  last_replay_at: string | null
+}
+
+interface Stats {
+  window_days: number
+  total_executions: number
+  by_status: Record<string, number>
+  retries_observed: number
+  idempotent_replays: number
+  dlq_size: number
+  wpp_command_count: number
+  circuit_breaker_watermark_hit: boolean
+}
+
+interface ReplayPreview {
+  execId: number
+  recipient: string
+  command: string
+  timestamp: string
 }
 
 const SOURCES = ['github', 'stripe', 'linear', 'telegram', 'discord', 'custom'] as const
@@ -55,6 +76,8 @@ const STATUS_COLORS: Record<string, string> = {
   running: 'bg-blue-500/10 text-blue-400',
   completed: 'bg-green-500/10 text-green-400',
   failed: 'bg-red-500/10 text-red-400',
+  failed_retryable: 'bg-orange-500/10 text-orange-400',
+  replayed: 'bg-purple-500/10 text-purple-400',
 }
 
 const emptyForm = {
@@ -79,6 +102,11 @@ export default function Triggers() {
   const [executions, setExecutions] = useState<Execution[]>([])
   const [execLoading, setExecLoading] = useState(false)
   const [newSecret, setNewSecret] = useState<{ id: number; secret: string } | null>(null)
+  // Replay modal state (Step 5 — PR-3)
+  const [replayPreview, setReplayPreview] = useState<ReplayPreview | null>(null)
+  const [replaying, setReplaying] = useState(false)
+  // Stats (Step 6 — PR-3)
+  const [stats, setStats] = useState<Stats | null>(null)
 
   const fetchTriggers = () => {
     let url = '/triggers'
@@ -105,6 +133,9 @@ export default function Triggers() {
       .then((data: { triggers: TriggerItem[] }) => setTriggers(data.triggers || []))
       .catch(() => setTriggers([]))
       .finally(() => setLoading(false))
+
+    // Load operational stats badge (Step 6 — PR-3)
+    api.get('/triggers/stats?days=1').then((d: Stats) => setStats(d)).catch(() => {})
   }, [filter])
 
   const openCreate = () => {
@@ -197,6 +228,58 @@ export default function Triggers() {
       setExecutions([])
     }
     setExecLoading(false)
+    // Refresh stats badge whenever executions modal opens
+    api.get('/triggers/stats?days=1').then((d: Stats) => setStats(d)).catch(() => {})
+  }
+
+  /** Build replay preview from execution event_data and open the confirmation modal. */
+  const openReplayPreview = (ex: Execution) => {
+    const d = ex.event_data as Record<string, unknown>
+    const dataObj = (d?.data as Record<string, unknown>) || {}
+    const keyObj = (dataObj?.key as Record<string, unknown>) || {}
+    // Try WPP paths first, then fall back to a generic summary
+    const recipient =
+      (keyObj?.remoteJid as string) ||
+      (dataObj?.remoteJid as string) ||
+      (d?.phone as string) ||
+      (d?.from as string) ||
+      '—'
+    const msgObj = (dataObj?.message as Record<string, unknown>) || {}
+    const command =
+      (msgObj?.conversation as string) ||
+      (msgObj?.extendedTextMessage as Record<string, unknown>)?.text as string ||
+      (d?.command as string) ||
+      (d?.text as string) ||
+      JSON.stringify(d).slice(0, 120)
+    setReplayPreview({
+      execId: ex.id,
+      recipient,
+      command: String(command || '—'),
+      timestamp: ex.started_at,
+    })
+  }
+
+  const confirmReplay = async () => {
+    if (!replayPreview) return
+    setReplaying(true)
+    try {
+      const result = await api.post(`/triggers/executions/${replayPreview.execId}/replay`)
+      toast.success(`Replay iniciado — nova execução #${result.new_execution_id}`)
+      setReplayPreview(null)
+      // Refresh executions list
+      if (execModal) {
+        const data = await api.get(`/triggers/${execModal.triggerId}/executions`)
+        setExecutions(data.executions || [])
+      }
+    } catch (e: unknown) {
+      const msg = e instanceof Error ? e.message : String(e)
+      if (msg.includes('rate_limited') || msg.includes('429')) {
+        toast.error('Rate limit: aguarde 60s antes de fazer replay novamente')
+      } else {
+        toast.error('Erro ao fazer replay', msg)
+      }
+    }
+    setReplaying(false)
   }
 
   const handleRegenerateSecret = async (id: number) => {
@@ -267,6 +350,27 @@ export default function Triggers() {
         )}
       </div>
 
+      {/* Stats badge (Step 6 — PR-3) */}
+      {stats && (
+        <div className="flex items-center gap-3 mb-4 flex-wrap">
+          <span className="text-[11px] text-[#667085]">
+            DLQ: <span className={`font-medium ${stats.dlq_size > 0 ? 'text-orange-400' : 'text-[#e6edf3]'}`}>{stats.dlq_size}</span>
+          </span>
+          <span className="text-[11px] text-[#667085]">
+            Replays hoje: <span className="font-medium text-[#e6edf3]">{stats.idempotent_replays}</span>
+          </span>
+          <span className="text-[11px] text-[#667085]">
+            WPP: <span className="font-medium text-[#e6edf3]">{stats.wpp_command_count}/dia</span>
+          </span>
+          {stats.circuit_breaker_watermark_hit && (
+            <span className="flex items-center gap-1 px-2 py-0.5 rounded-md bg-yellow-500/10 border border-yellow-500/20 text-yellow-400 text-[11px] font-medium">
+              <AlertTriangle size={11} />
+              Volume WPP &gt;50/dia — reavaliar Circuit Breaker (ver ADR)
+            </span>
+          )}
+        </div>
+      )}
+
       {/* Filters */}
       <div className="flex items-center gap-2 mb-4 flex-wrap">
         {filters.map(f => (
@@ -496,15 +600,21 @@ export default function Triggers() {
                       <th className="text-left p-3 font-medium">Event</th>
                       <th className="text-left p-3 font-medium">Duration</th>
                       <th className="text-left p-3 font-medium">Time</th>
+                      <th className="text-center p-3 font-medium">Actions</th>
                     </tr>
                   </thead>
                   <tbody>
                     {executions.map(ex => (
                       <tr key={ex.id} className="border-t border-[#21262d]/50 hover:bg-white/[0.02]">
                         <td className="p-3">
-                          <span className={`inline-block px-2 py-0.5 rounded text-[10px] font-medium ${STATUS_COLORS[ex.status] || ''}`}>
-                            {ex.status}
-                          </span>
+                          <div className="flex flex-col gap-0.5">
+                            <span className={`inline-block px-2 py-0.5 rounded text-[10px] font-medium ${STATUS_COLORS[ex.status] || ''}`}>
+                              {ex.status}
+                            </span>
+                            {ex.error_category && (
+                              <span className="text-[9px] text-[#667085]">{ex.error_category}</span>
+                            )}
+                          </div>
                         </td>
                         <td className="p-3 text-[#667085] text-xs max-w-[200px] truncate">
                           {(ex.event_data as Record<string, unknown>)?._test ? 'test' : (String((ex.event_data as Record<string, unknown>)?.event_type || '--'))}
@@ -516,6 +626,19 @@ export default function Triggers() {
                         <td className="p-3 text-[#667085] text-xs">
                           {ex.started_at ? relativeTime(ex.started_at) : '--'}
                         </td>
+                        <td className="p-3 text-center">
+                          {/* Replay button — only for failed_retryable (Step 5 PR-3) */}
+                          {ex.status === 'failed_retryable' && (
+                            <button
+                              onClick={() => openReplayPreview(ex)}
+                              className="inline-flex items-center gap-1 px-2 py-1 rounded text-[10px] font-medium bg-orange-500/10 border border-orange-500/20 text-orange-400 hover:bg-orange-500/20 transition-colors"
+                              title="Replay esta execução"
+                            >
+                              <RotateCcw size={10} />
+                              Replay
+                            </button>
+                          )}
+                        </td>
                       </tr>
                     ))}
                   </tbody>
@@ -526,6 +649,48 @@ export default function Triggers() {
         </div>
       )}
 
+      {/* Replay Confirmation Modal (Step 5 — PR-3) */}
+      {replayPreview && (
+        <div className="fixed inset-0 bg-black/60 z-[60] flex items-center justify-center p-4" onClick={() => setReplayPreview(null)}>
+          <div className="bg-[#161b22] border border-[#21262d] rounded-xl w-full max-w-md" onClick={e => e.stopPropagation()}>
+            <div className="flex items-center justify-between px-6 py-4 border-b border-[#21262d]">
+              <h2 className="text-lg font-semibold text-[#e6edf3]">Confirmar replay #{replayPreview.execId}</h2>
+              <button onClick={() => setReplayPreview(null)} className="p-1 rounded-lg hover:bg-white/5 text-[#667085] hover:text-[#e6edf3]"><X size={18} /></button>
+            </div>
+            <div className="p-6 space-y-4">
+              <p className="text-[#667085] text-xs">
+                Isso irá refazer a chamada original. Se a execução anterior já chegou a executar parcialmente, o sistema dedupa silenciosamente.
+              </p>
+              <div className="rounded-lg bg-[#0d1117] border border-[#21262d] p-4 space-y-2 text-xs">
+                <div className="flex gap-2">
+                  <span className="text-[#667085] w-24 shrink-0">Destinatário</span>
+                  <span className="text-[#e6edf3] font-mono break-all">{replayPreview.recipient}</span>
+                </div>
+                <div className="flex gap-2">
+                  <span className="text-[#667085] w-24 shrink-0">Comando</span>
+                  <span className="text-[#e6edf3] font-mono break-all">{replayPreview.command.slice(0, 200)}</span>
+                </div>
+                <div className="flex gap-2">
+                  <span className="text-[#667085] w-24 shrink-0">Timestamp</span>
+                  <span className="text-[#e6edf3]">{replayPreview.timestamp}</span>
+                </div>
+              </div>
+            </div>
+            <div className="flex items-center justify-end gap-3 px-6 py-4 border-t border-[#21262d]">
+              <button onClick={() => setReplayPreview(null)}
+                className="px-4 py-2 rounded-lg border border-[#21262d] text-[#667085] hover:text-[#e6edf3] hover:border-[#344054] transition-colors text-sm">
+                Cancelar
+              </button>
+              <button onClick={confirmReplay} disabled={replaying}
+                className="flex items-center gap-2 px-4 py-2 rounded-lg bg-orange-500/10 border border-orange-500/20 text-orange-400 hover:bg-orange-500/20 transition-colors text-sm font-medium disabled:opacity-50 disabled:cursor-not-allowed">
+                <RotateCcw size={14} />
+                {replaying ? 'Iniciando...' : 'Confirmar replay'}
+              </button>
+            </div>
+          </div>
+        </div>
+      )}
+
       {/* New Secret Modal */}
       {newSecret && (
         <div className="fixed inset-0 bg-black/60 z-50 flex items-center justify-center p-4" onClick={() => setNewSecret(null)}>
diff --git a/dashboard/tests/test_wpp_retry_pr3.py b/dashboard/tests/test_wpp_retry_pr3.py
new file mode 100644
index 00000000..4e5526e7
--- /dev/null
+++ b/dashboard/tests/test_wpp_retry_pr3.py
@@ -0,0 +1,203 @@
+"""Synthetic tests for WhatsApp retry pattern — PR-3 (Steps 4, 5, 6).
+
+Step 4: _classify_error + failed_retryable classification in _execute_trigger
+Step 5: /replay endpoint — rate-limit, not_found, not_replayable, happy path
+Step 6: /stats endpoint — JSON shape, watermark flag
+"""
+import json
+import subprocess
+import sys
+import os
+import pytest
+
+# Ensure backend is importable
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "backend"))
+
+# ---------------------------------------------------------------------------
+# Step 4 — _classify_error unit tests (pure, no app context needed)
+# ---------------------------------------------------------------------------
+
+from routes.triggers import _classify_error, _TRANSIENT_MARKERS
+
+
+class TestClassifyError:
+    def test_timeout_exception_is_transient(self):
+        exc = subprocess.TimeoutExpired(cmd="x", timeout=11)
+        assert _classify_error("Timeout", exc) == "transient"
+
+    def test_value_error_is_permanent(self):
+        exc = ValueError("missing key 'foo'")
+        assert _classify_error(str(exc), exc) == "permanent"
+
+    def test_file_not_found_is_permanent(self):
+        exc = FileNotFoundError("script not found")
+        assert _classify_error(str(exc), exc) == "permanent"
+
+    def test_http_5xx_in_stderr_is_transient(self):
+        assert _classify_error("HTTP 503 Service Unavailable", None) == "transient"
+
+    def test_http_500_in_stderr_is_transient(self):
+        assert _classify_error("HTTP 500 Internal Server Error", None) == "transient"
+
+    def test_connection_refused_is_transient(self):
+        assert _classify_error("Connection refused", None) == "transient"
+
+    def test_url_error_marker_is_transient(self):
+        assert _classify_error("URLError: <urlopen error timed out>", None) == "transient"
+
+    def test_http_4xx_is_permanent(self):
+        # 4xx markers NOT in _TRANSIENT_MARKERS → permanent
+        assert _classify_error("HTTP 400 Bad Request", None) == "permanent"
+
+    def test_http_404_is_permanent(self):
+        assert _classify_error("HTTP 404 Not Found", None) == "permanent"
+
+    def test_generic_runtime_error_is_permanent(self):
+        assert _classify_error("RuntimeError: unexpected state", None) == "permanent"
+
+    def test_empty_message_defaults_permanent(self):
+        assert _classify_error("", None) == "permanent"
+
+    def test_none_message_defaults_permanent(self):
+        assert _classify_error(None, None) == "permanent"  # type: ignore[arg-type]
+
+    def test_all_transient_markers_recognized(self):
+        for marker in _TRANSIENT_MARKERS:
+            assert _classify_error(f"...{marker}...", None) == "transient", \
+                f"Marker '{marker}' should be transient"
+
+
+# ---------------------------------------------------------------------------
+# Step 5 + 6 — Flask app integration tests
+# These tests require the Flask app to be importable without a running server.
+# ---------------------------------------------------------------------------
+
+try:
+    import importlib, types
+    # We need a minimal Flask test client.  Import app but skip heavy startup.
+    # The auto-migrate runs against dashboard.db which must exist.
+    # Guard: only run integration tests when DB exists.
+    _DB_EXISTS = os.path.exists(
+        os.path.join(os.path.dirname(__file__), "..", "..", "dashboard.db")
+    )
+except Exception:
+    _DB_EXISTS = False
+
+
+@pytest.mark.skipif(not _DB_EXISTS, reason="dashboard.db not found — integration tests skipped")
+class TestReplayEndpoint:
+    """Integration tests for POST /api/triggers/executions/<id>/replay."""
+
+    @pytest.fixture(scope="class")
+    def client(self):
+        """Return a Flask test client with a seeded failed_retryable execution."""
+        # Minimal app import — auto-migrate runs on import
+        import app as flask_app_module
+        flask_app = flask_app_module.app
+        flask_app.config["TESTING"] = True
+        flask_app.config["WTF_CSRF_ENABLED"] = False
+        with flask_app.test_client() as c:
+            yield c, flask_app
+
+    def _seed_execution(self, app, status="failed_retryable", last_replay_at=None):
+        """Insert a TriggerExecution row and return its id."""
+        from models import db, TriggerExecution, Trigger
+        with app.app_context():
+            # Find or create a trigger
+            t = Trigger.query.first()
+            if t is None:
+                pytest.skip("No trigger in DB — seed one manually first")
+            ex = TriggerExecution(
+                trigger_id=t.id,
+                event_data=json.dumps({"event_type": "test", "data": {"key": {"remoteJid": "+5511999999999"}, "message": {"conversation": "/briefing"}}}),
+                status=status,
+                last_replay_at=last_replay_at,
+            )
+            db.session.add(ex)
+            db.session.commit()
+            return ex.id, t.id
+
+    def test_replay_requires_auth(self, client):
+        c, _ = client
+        # Without session, Flask-Login returns 401 (Unauthorized) or 403 (Forbidden)
+        resp = c.post("/api/triggers/executions/999999/replay")
+        assert resp.status_code in (401, 403, 404)
+
+    def test_stats_returns_json_shape(self, client):
+        c, _ = client
+        resp = c.get("/api/triggers/stats?days=1")
+        # Without auth some setups return 200 (public route, no login_required) or 403
+        if resp.status_code == 200:
+            data = resp.get_json()
+            required_keys = {
+                "window_days", "total_executions", "by_status",
+                "retries_observed", "idempotent_replays",
+                "dlq_size", "wpp_command_count", "circuit_breaker_watermark_hit",
+            }
+            assert required_keys.issubset(data.keys()), f"Missing keys: {required_keys - data.keys()}"
+            assert isinstance(data["circuit_breaker_watermark_hit"], bool)
+            assert isinstance(data["by_status"], dict)
+            assert data["window_days"] == 1
+
+
+# ---------------------------------------------------------------------------
+# Step 6 — Watermark logic unit test (no DB needed)
+# ---------------------------------------------------------------------------
+
+class TestWatermarkLogic:
+    """The watermark formula: wpp_command_count > 50 OR user_count > 1."""
+
+    def _check(self, wpp_count: int, user_count: int) -> bool:
+        return wpp_count > 50 or user_count > 1
+
+    def test_below_threshold_no_hit(self):
+        assert self._check(49, 1) is False
+
+    def test_exactly_50_no_hit(self):
+        assert self._check(50, 1) is False
+
+    def test_51_hits_watermark(self):
+        assert self._check(51, 1) is True
+
+    def test_multiple_users_hits_watermark(self):
+        assert self._check(0, 2) is True
+
+    def test_both_conditions_hit(self):
+        assert self._check(100, 3) is True
+
+
+# ---------------------------------------------------------------------------
+# Step 5 — Rate-limit logic unit test (no DB needed)
+# ---------------------------------------------------------------------------
+
+class TestRateLimitLogic:
+    """Verify the 60s rate-limit formula (elapsed < 60 → rate-limited)."""
+
+    def _is_rate_limited(self, last_replay_at, now, threshold_seconds=60):
+        if last_replay_at is None:
+            return False
+        elapsed = (now - last_replay_at).total_seconds()
+        return elapsed < threshold_seconds
+
+    def test_no_previous_replay_not_limited(self):
+        from datetime import datetime, timezone
+        now = datetime.now(timezone.utc)
+        assert self._is_rate_limited(None, now) is False
+
+    def test_replay_59s_ago_is_limited(self):
+        from datetime import datetime, timezone, timedelta
+        now = datetime.now(timezone.utc)
+        last = now - timedelta(seconds=59)
+        assert self._is_rate_limited(last, now) is True
+
+    def test_replay_60s_ago_is_not_limited(self):
+        from datetime import datetime, timezone, timedelta
+        now = datetime.now(timezone.utc)
+        last = now - timedelta(seconds=60)
+        assert self._is_rate_limited(last, now) is False
+
+    def test_replay_61s_ago_is_not_limited(self):
+        from datetime import datetime, timezone, timedelta
+        now = datetime.now(timezone.utc)
+        last = now - timedelta(seconds=61)
+        assert self._is_rate_limited(last, now) is False

From 935d4f9a691baebca0774970805571ffb8cfb7c9 Mon Sep 17 00:00:00 2001
From: Marcello Alarcon <marce@MTA-digital-MacBook-Pro.local>
Date: Wed, 13 May 2026 20:24:47 -0300
Subject: [PATCH 6/6] =?UTF-8?q?fix(wpp-retry):=20endere=C3=A7a=20review=20?=
 =?UTF-8?q?do=20Sourcery=20=E2=80=94=20IntegrityError,=20typing,=20SQL,=20?=
 =?UTF-8?q?ApiError?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consolidado dos fixes pedidos pelo Sourcery no review do PR #80:

1. `except IntegrityError` restrito à violação de idempotência
   (dashboard/backend/routes/triggers.py — 2 pontos)

   Mesmo bug do PR #78 estendido a este PR. Dois pontos no triggers.py:

   a) `webhook_receiver` (~linha 389): mesma correção do PR #78 —
      só absorve o erro se a mensagem do driver menciona
      `uq_trigger_idem` ou `idempotency_key` E `idem_key` está set;
      caso contrário loga com contexto completo e re-raise.

   b) Endpoint `/triggers/executions/<id>/replay` (~linha 476):
      cria nova execution reusando `ex.idempotency_key`. Tinha o
      mesmo catch genérico, mascarando outros IntegrityError como
      "idempotent_skip". Aplicada a mesma proteção usando
      `ex.idempotency_key` no check. (Sourcery não citou este
      segundo ponto, mas é o mesmo bug — corrigido por simetria.)

2. `_classify_error` — tighten typing

   - Define alias `ErrorCategory = Literal["transient", "permanent"]`.
   - Atualiza return type pra refletir o que a função realmente
     retorna.
   - Atualiza comentário em `TriggerExecution.error_category`
     (models.py) que listava 4 categorias (validation/unknown) que
     nunca eram emitidas pelo classificador — consumidores não vão
     mais depender de valores inexistentes.

3. `trigger_stats` — parametriza SQL

   - `datetime('now', '-{days} days')` interpolado vira `:cutoff`
     bindado (cutoff calculado em Python com `timedelta`).
   - `WHERE trigger_id IN ({id_list})` vira `IN :wpp_ids` com
     `bindparam("wpp_ids", expanding=True)` — SQLAlchemy expande pra
     `(:id_1, :id_2, ...)` e binda cada valor.
   - Inputs hoje são "seguros" (ints + days constrained), mas
     parametrizar é mais robusto contra regressões e mais legível.

4. `confirmReplay` (Triggers.tsx) — error handling estruturado

   - Cria `ApiError` em `lib/api.ts` carregando `status` (número HTTP)
     e `code` (string do JSON do backend, ex.: `rate_limited`).
   - `buildError` agora retorna `ApiError` em vez de `Error` simples.
   - `Triggers.tsx::confirmReplay` checa `e instanceof ApiError &&
     (e.status === 429 || e.code === 'rate_limited')` em vez de
     `msg.includes('429')`. Mais resiliente a mudanças no formato da
     mensagem (ex.: tradução, prefixo).
   - `.message` preservado no mesmo formato
     (`"<status> <text>: <detail>"`) pra não quebrar os 3 outros
     call sites que usam `msg.includes(...)` (Backups.tsx,
     StepBrainConnect.tsx, RestoreSelectRepo.tsx) — migração desses
     fica pra um PR dedicado.

Sintaxe Python validada com `python3 -m py_compile`. Typecheck do
frontend deixado pro CI do PR (deps não instaladas localmente).
---
 dashboard/backend/models.py               |   2 +-
 dashboard/backend/routes/triggers.py      | 100 ++++++++++++++++------
 dashboard/frontend/src/lib/api.ts         |  28 +++++-
 dashboard/frontend/src/pages/Triggers.tsx |  10 ++-
 4 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/dashboard/backend/models.py b/dashboard/backend/models.py
index 2dcff6f1..c2ee0fc6 100644
--- a/dashboard/backend/models.py
+++ b/dashboard/backend/models.py
@@ -342,7 +342,7 @@ class TriggerExecution(db.Model):
     # WhatsApp retry pattern (PR-1: migration 2026-05-11)
     # rollback: DROP indices uq_trigger_idem + ix_trigger_executions_idem_key; columns are nullable, ignored by old code
     idempotency_key = db.Column(db.String(255), nullable=True, index=True)  # messageId WPP or other source dedup key
-    error_category = db.Column(db.String(20), nullable=True)  # transient | permanent | validation | unknown
+    error_category = db.Column(db.String(20), nullable=True)  # transient | permanent (ver _classify_error em routes/triggers.py)
     last_replay_at = db.Column(db.DateTime, nullable=True)  # rate-limit: 60s between replays of the same execution
 
     @property
diff --git a/dashboard/backend/routes/triggers.py b/dashboard/backend/routes/triggers.py
index 1c550529..5e3e8b7b 100644
--- a/dashboard/backend/routes/triggers.py
+++ b/dashboard/backend/routes/triggers.py
@@ -11,8 +11,10 @@
 import threading
 import time
 from pathlib import Path
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
+from typing import Literal
 from flask import Blueprint, jsonify, request, current_app
+from sqlalchemy import bindparam
 from flask_login import current_user
 from models import db, Trigger, TriggerExecution, has_permission, audit
 from sqlalchemy.exc import IntegrityError
@@ -42,11 +44,21 @@
 )
 
 
-def _classify_error(err_msg: str, exc: Exception | None = None) -> str:
-    """Return 'transient' or 'permanent' based on the exception and error text.
+ErrorCategory = Literal["transient", "permanent"]
+
+
+def _classify_error(err_msg: str, exc: Exception | None = None) -> ErrorCategory:
+    """Classify a failed execution as transient or permanent.
 
     transient  → worth retrying (HTTP 5xx, network, timeout)
     permanent  → deterministic failure, replay only on operator decision
+
+    NOTE: o comentário em `TriggerExecution.error_category` (models.py)
+    historicamente listava 4 valores ("transient | permanent | validation
+    | unknown"). Na prática, o classificador SEMPRE retorna um dos dois
+    valores acima. Consumidores devem assumir só esses dois — qualquer
+    valor extra no DB vem de migração antiga e deve ser tratado como
+    `permanent` (default seguro). Sourcery #80 alertou o desalinhamento.
     """
     if isinstance(exc, subprocess.TimeoutExpired):
         return "transient"
@@ -386,11 +398,26 @@ def webhook_receiver(trigger_id):
     db.session.add(execution)
     try:
         db.session.commit()
-    except IntegrityError:
+    except IntegrityError as exc:
+        # Restringe o catch à violação do índice de idempotência
+        # (uq_trigger_idem). Qualquer outro IntegrityError (NOT NULL, FK,
+        # outros uniques) é um problema real e precisa propagar — caso
+        # contrário viraria 200 OK silencioso e mascararia bug de schema.
+        db.session.rollback()
+        err_text = str(getattr(exc, "orig", exc)).lower()
+        is_idem_violation = (
+            idem_key is not None
+            and ("uq_trigger_idem" in err_text or "idempotency_key" in err_text)
+        )
+        if not is_idem_violation:
+            current_app.logger.error(
+                f"evt=integrity_error_unexpected trigger_id={trigger.id} "
+                f"key={idem_key} err={err_text!r}"
+            )
+            raise
         # Race condition: two simultaneous POSTs with same idempotency_key;
         # the DB partial unique index rejected the second INSERT.
         # Silent dedup — return 200 OK (F6) without re-executing.
-        db.session.rollback()
         current_app.logger.info(
             f"evt=idempotent_replay_race trigger_id={trigger.id} key={idem_key}"
         )
@@ -473,9 +500,22 @@ def replay_execution(exec_id: int):
 
     try:
         db.session.commit()
-    except IntegrityError:
-        # idempotency_key already has a successful execution — silent ok
+    except IntegrityError as exc:
+        # Mesma proteção do webhook_receiver: só absorve a violação se for
+        # do índice de idempotência. Outros IntegrityError (NOT NULL, FK,
+        # outros uniques) precisam propagar para não mascarar bugs reais.
         db.session.rollback()
+        err_text = str(getattr(exc, "orig", exc)).lower()
+        is_idem_violation = (
+            ex.idempotency_key is not None
+            and ("uq_trigger_idem" in err_text or "idempotency_key" in err_text)
+        )
+        if not is_idem_violation:
+            current_app.logger.error(
+                f"evt=integrity_error_unexpected_replay exec_id={ex.id} "
+                f"key={ex.idempotency_key} err={err_text!r}"
+            )
+            raise
         return jsonify({"status": "ok", "note": "idempotent_skip"}), 200
 
     new_execution_id = new_ex.id
@@ -515,17 +555,20 @@ def trigger_stats():
     except (TypeError, ValueError):
         days = 1
 
-    # Use raw SQL via SQLAlchemy text for aggregate queries (no ORM overhead)
+    # Use raw SQL via SQLAlchemy text for aggregate queries (no ORM overhead).
+    # Todos os params abaixo são bindados (Sourcery #80 — evita interpolação
+    # mesmo com inputs hoje "seguros", garante robustez se a lógica mudar).
     from sqlalchemy import text as _text
 
-    since_clause = f"datetime('now', '-{days} days')"
+    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
 
     # 1. Total executions + by_status breakdown
     rows = db.session.execute(
         _text(
-            f"SELECT status, COUNT(*) as cnt FROM trigger_executions "
-            f"WHERE started_at >= {since_clause} GROUP BY status"
-        )
+            "SELECT status, COUNT(*) as cnt FROM trigger_executions "
+            "WHERE started_at >= :cutoff GROUP BY status"
+        ),
+        {"cutoff": cutoff},
     ).fetchall()
     by_status: dict = {}
     total_executions = 0
@@ -541,9 +584,10 @@ def trigger_stats():
     #    (exact log parsing is fragile; replayed status is precise enough for watermark)
     idempotent_replays = db.session.execute(
         _text(
-            f"SELECT COUNT(*) FROM trigger_executions "
-            f"WHERE status = 'replayed' AND started_at >= {since_clause}"
-        )
+            "SELECT COUNT(*) FROM trigger_executions "
+            "WHERE status = 'replayed' AND started_at >= :cutoff"
+        ),
+        {"cutoff": cutoff},
     ).scalar() or 0
 
     # 4. WPP command count: triggers whose source = 'whatsapp' OR slug contains 'wpp'
@@ -553,18 +597,19 @@ def trigger_stats():
             "SELECT id FROM triggers WHERE source = 'whatsapp' OR slug LIKE 'wpp%' OR name LIKE 'wpp%' OR name LIKE 'WhatsApp%'"
         )
     ).fetchall()
-    wpp_ids = tuple(r[0] for r in wpp_trigger_ids)
+    wpp_ids = [r[0] for r in wpp_trigger_ids]
 
     if wpp_ids:
-        # Build IN clause using string interpolation (IDs are integers — safe)
-        id_list = ",".join(str(i) for i in wpp_ids)
+        # IN clause via expanding bindparam — SQLAlchemy expande pra (:id_1, :id_2, ...)
+        # e bindando cada ID. Mais seguro e legível que interpolar id_list,
+        # mesmo sendo ints (defesa contra regressões futuras se a fonte mudar).
         try:
+            stmt = _text(
+                "SELECT COUNT(*) FROM trigger_executions "
+                "WHERE trigger_id IN :wpp_ids AND started_at >= :cutoff"
+            ).bindparams(bindparam("wpp_ids", expanding=True))
             wpp_command_count = db.session.execute(
-                _text(
-                    f"SELECT COUNT(*) FROM trigger_executions "
-                    f"WHERE trigger_id IN ({id_list}) "
-                    f"AND started_at >= {since_clause}"
-                )
+                stmt, {"wpp_ids": wpp_ids, "cutoff": cutoff}
             ).scalar() or 0
         except Exception:
             wpp_command_count = 0
@@ -579,10 +624,11 @@ def trigger_stats():
     #    (Step 3 backoff logs "attempts" in the summary)
     retries_observed = db.session.execute(
         _text(
-            f"SELECT COUNT(*) FROM trigger_executions "
-            f"WHERE (result_summary LIKE '%\"attempts\"%' OR error LIKE '%attempts%') "
-            f"AND started_at >= {since_clause}"
-        )
+            "SELECT COUNT(*) FROM trigger_executions "
+            "WHERE (result_summary LIKE '%\"attempts\"%' OR error LIKE '%attempts%') "
+            "AND started_at >= :cutoff"
+        ),
+        {"cutoff": cutoff},
     ).scalar() or 0
 
     # 7. Watermark check
diff --git a/dashboard/frontend/src/lib/api.ts b/dashboard/frontend/src/lib/api.ts
index 213989d4..63991b0d 100644
--- a/dashboard/frontend/src/lib/api.ts
+++ b/dashboard/frontend/src/lib/api.ts
@@ -5,17 +5,40 @@ const API = import.meta.env.DEV ? 'http://localhost:8080' : '';
 // which the backend rejects for non-allowlisted origins.
 const XHR_HEADER = { 'X-Requested-With': 'XMLHttpRequest' };
 
+/** Erro estruturado lançado pelos métodos do `api`.
+ *
+ * Carrega `status` (número HTTP) e `code` (string opcional, vinda do JSON
+ * do backend — ex.: `rate_limited`, `SYNC_IN_PROGRESS`). Consumidores
+ * NOVOS devem checar essas propriedades; o `.message` continua existindo
+ * no formato `"<status> <statusText>: <detail>"` para preservar callers
+ * antigos que fazem `msg.includes('401')` etc. (Sourcery #80).
+ */
+export class ApiError extends Error {
+  readonly status: number
+  readonly code?: string
+  constructor(message: string, status: number, code?: string) {
+    super(message)
+    this.name = 'ApiError'
+    this.status = status
+    this.code = code
+  }
+}
+
 /** Extract a human-readable error message from a non-OK response.
  *
  * Tries JSON first (most backend routes return `{error, code}` or
  * `{error, message}`), then falls back to plain text. Always prefixes the
  * status so existing callers that pattern-match on '401'/'403' keep working.
  */
-async function buildError(res: Response): Promise<Error> {
+async function buildError(res: Response): Promise<ApiError> {
   let detail = ''
+  let code: string | undefined
   try {
     const data = await res.clone().json()
     detail = data?.error || data?.description || data?.message || ''
+    // Backend pode retornar `{error: "rate_limited", ...}` — nesse caso
+    // o próprio "error" é o code. Também aceita um `code` explícito.
+    code = data?.code || (typeof data?.error === 'string' ? data.error : undefined)
   } catch {
     try {
       const text = await res.text()
@@ -26,7 +49,8 @@ async function buildError(res: Response): Promise<Error> {
     }
   }
   const base = `${res.status} ${res.statusText}`
-  return new Error(detail ? `${base}: ${detail}` : base)
+  const message = detail ? `${base}: ${detail}` : base
+  return new ApiError(message, res.status, code)
 }
 
 export const api = {
diff --git a/dashboard/frontend/src/pages/Triggers.tsx b/dashboard/frontend/src/pages/Triggers.tsx
index f0c5aafc..436692fe 100644
--- a/dashboard/frontend/src/pages/Triggers.tsx
+++ b/dashboard/frontend/src/pages/Triggers.tsx
@@ -2,7 +2,7 @@ import { useEffect, useState } from 'react'
 import { useToast } from '../components/Toast'
 import { useConfirm } from '../components/ConfirmDialog'
 import { Plus, Pencil, Trash2, X, Play, Copy, RefreshCw, KeyRound, RotateCcw, AlertTriangle } from 'lucide-react'
-import { api } from '../lib/api'
+import { api, ApiError } from '../lib/api'
 import { useAuth } from '../context/AuthContext'
 
 interface TriggerItem {
@@ -272,10 +272,14 @@ export default function Triggers() {
         setExecutions(data.executions || [])
       }
     } catch (e: unknown) {
-      const msg = e instanceof Error ? e.message : String(e)
-      if (msg.includes('rate_limited') || msg.includes('429')) {
+      // Detecção estruturada: ApiError carrega status numérico + code do
+      // backend. Evita fragilidade de parsear o `.message` (Sourcery #80).
+      const isRateLimit =
+        e instanceof ApiError && (e.status === 429 || e.code === 'rate_limited')
+      if (isRateLimit) {
         toast.error('Rate limit: aguarde 60s antes de fazer replay novamente')
       } else {
+        const msg = e instanceof Error ? e.message : String(e)
         toast.error('Erro ao fazer replay', msg)
       }
     }