langwatch · drewdrewthis · Dec 12, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.cursor/commands/ci-wait-verify-fix.md b/.cursor/commands/ci-wait-verify-fix.md
@@ -0,0 +1,45 @@
+## Wait for CI and Fix Failures
+
+Execute this workflow after pushing to a PR branch.
+
+### Step 1: Run the CI Wait Script
+
+```bash
+./scripts/ci-wait.sh
+```
+
+Use `timeout: 660000` (11 min) and `required_permissions: ["network"]`.
+
+**Exit codes:**
+- `0` = All workflows passed → Done
+- `1` = One or more workflows failed → Continue to Step 2
+- `2` = Timeout → Report to user, ask how to proceed
+
+### Step 2: Fetch Failed Logs
+
+For each failed workflow from the script output:
+
+```bash
+gh run view <databaseId> --log-failed
+```
+
+### Step 3: Analyze and Fix
+
+1. Parse the log output to identify the root cause (test failure, lint error, type error, build error)
+2. Locate the relevant file(s) and line(s)
+3. Apply the minimal fix
+4. Commit with message: `fix: resolve CI failure - <brief description>`
+
+### Step 4: Push and Retry
+
+```bash
+git push
+```
+
+Return to Step 1. After a maximum of 3 retry attempts, stop and report to the user.
+
+### Constraints
+
+- Do not introduce new functionality while fixing
+- Keep fixes minimal and focused on the specific failure
+- If the failure is unclear or requires design decisions, stop and ask the user
diff --git a/.cursor/commands/review-and-learn.md b/.cursor/commands/review-and-learn.md
@@ -0,0 +1,5 @@
+# Review and Learn
+
+Review the conversation history and learn from the mistakes.
+
+Record each mistake in a table in the `AGENTS.md` file with two columns: Anti-pattern | Correct Behavior
diff --git a/.cursor/commands/review-and-verify.md b/.cursor/commands/review-and-verify.md
@@ -0,0 +1,3 @@
+# Review and Verify
+
+Re-read `AGENTS.md` and verify your work follows the guidelines.
diff --git a/.cursor/rules/AGENTS.mdc b/.cursor/rules/AGENTS.mdc
@@ -0,0 +1,5 @@
+---
+alwaysApply: true
+---
+
+Read `AGENTS.md` - non-negotiable.
diff --git a/.cursor/rules/absolutes.mdc b/.cursor/rules/absolutes.mdc
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,9 @@
+# Guidelines
+
+## Follow best practices
+
+- SRP (most important)
+- SOLID
+- YAGNI
+- KISS
+- Clean Code
diff --git a/TESTING.md b/TESTING.md
@@ -0,0 +1,66 @@
+# Testing Philosophy
+
+## Hierarchy
+
+| Level           | Purpose                       | Mocking                  |
+| --------------- | ----------------------------- | ------------------------ |
+| **E2E**         | Happy paths via real examples | None                     |
+| **Integration** | Edge cases, error handling    | External boundaries only |
+| **Unit**        | Pure logic, branches          | All collaborators        |
+
+### Language-Specific Patterns
+
+| Language   | E2E             | Integration             | Unit             | Location     |
+| ---------- | --------------- | ----------------------- | ---------------- | ------------ |
+| TypeScript | `*.e2e.test.ts` | `*.integration.test.ts` | `*.unit.test.ts` | `__tests__/` |
+| Python     | `test_*_e2e.py` | `test_*_integration.py` | `test_*.py`      | `tests/`     |
+| Go         | `*_e2e_test.go` | `*_integration_test.go` | `*_test.go`      | same package |
+
+## Workflow
+
+1. **Spec first**: Write a `.feature` file in `specs/`. Use tags: `@e2e`, `@integration`, `@unit`.
+2. **Challenge**: LLM/reviewer challenges missing edge cases before implementation.
+3. **Examples drive E2E**: Working examples in `examples/` are wrapped by e2e tests.
+4. **Implement**: Outside-in test-driven development (TDD). Red → Green → Refactor.
+
+## Decision Tree
+
+```text
+Is this a happy path demonstrating SDK usage?
+  → E2E (wrap an example)
+
+Does it test orchestration between internal modules or external API behavior?
+  → Integration (mock external boundaries)
+
+Is it pure logic or a single class in isolation?
+  → Unit (mock collaborators)
+
+Is it a regression from production?
+  → Add test at the LOWEST sufficient level (unit > integration > e2e)
+```
+
+## Scenario Design
+
+Each scenario should test **one invariant**. When deciding whether to extend an existing scenario or create a new one:
+
+- **Extend** (add `And`/`But`): The new assertion is a natural consequence of the same behavior
+- **New scenario**: The assertion tests a distinct invariant that could fail independently
+
+Example: "Cache returns stale data" and "Cache key includes version" are orthogonal invariants — separate scenarios. If one fails, you immediately know which contract broke.
+
+## What We Don't Test
+
+- Type definitions
+- Simple pass-throughs with no logic
+- Third-party library internals
+- Constants/config (unless dynamic)
+
+## Regression Policy
+
+Edge cases not covered upfront are handled via regression tests. When a bug is found:
+
+1. Reproduce with a failing test
+2. Add test at the lowest sufficient level
+3. Fix and verify green
+
+This keeps the suite lean while ensuring real failures never recur.
diff --git a/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts b/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts
@@ -96,7 +96,7 @@ describe("Scenario Expert Agent (Realtime API)", () => {
           // Judge with audio transcription
           scenario.judgeAgent({
             criteria: [
-              "Agent explains what LangWatch Scenario is",
+              "Agent explains what Scenario is or how it helps test AI agents",
               "Agent is helpful and informative",
             ],
           })

diff --git a/javascript/src/agents/judge/__tests__/judge-span-digest-formatter.test.ts b/javascript/src/agents/judge/__tests__/judge-span-digest-formatter.test.ts
@@ -34,10 +34,9 @@ const formatter = new JudgeSpanDigestFormatter();
 describe("JudgeSpanDigestFormatter", () => {
   describe("when no spans", () => {
     it("returns empty digest marker", () => {
-      expect(formatter.format([])).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        No spans recorded."
-      `);
+      expect(formatter.format([])).toMatchInlineSnapshot(
+        `"No spans recorded."`
+      );
     });
   });
 
@@ -56,8 +55,7 @@ describe("JudgeSpanDigestFormatter", () => {
       });
 
       expect(formatter.format([span])).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 1 | Total Duration: 500ms
+        "Spans: 1 | Total Duration: 500ms
 
         [1] 2023-11-14T22:13:20.000Z llm.chat (500ms)
             gen_ai.prompt: Hello
@@ -92,8 +90,7 @@ describe("JudgeSpanDigestFormatter", () => {
       ];
 
       expect(formatter.format(spans)).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 3 | Total Duration: 2.05s
+        "Spans: 3 | Total Duration: 2.05s
 
         [1] 2023-11-14T22:13:20.000Z first (200ms)
 
@@ -131,8 +128,7 @@ describe("JudgeSpanDigestFormatter", () => {
       ];
 
       expect(formatter.format(spans)).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 3 | Total Duration: 1.00s
+        "Spans: 3 | Total Duration: 1.00s
 
         [1] 2023-11-14T22:13:20.000Z agent.run (1.00s)
 
@@ -168,8 +164,7 @@ describe("JudgeSpanDigestFormatter", () => {
       ];
 
       expect(formatter.format(spans)).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 3 | Total Duration: 2.00s
+        "Spans: 3 | Total Duration: 2.00s
 
         [1] 2023-11-14T22:13:20.000Z root (2.00s)
 
@@ -198,8 +193,7 @@ describe("JudgeSpanDigestFormatter", () => {
       });
 
       expect(formatter.format([span])).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 1 | Total Duration: 100ms
+        "Spans: 1 | Total Duration: 100ms
 
         [1] 2023-11-14T22:13:20.000Z llm.chat (100ms)
             gen_ai.prompt: What is the weather in Paris?
@@ -231,8 +225,7 @@ describe("JudgeSpanDigestFormatter", () => {
       ];
 
       expect(formatter.format(spans)).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 2 | Total Duration: 300ms
+        "Spans: 2 | Total Duration: 300ms
 
         [1] 2023-11-14T22:13:20.000Z successful.operation (100ms)
 
@@ -265,8 +258,7 @@ describe("JudgeSpanDigestFormatter", () => {
       });
 
       expect(formatter.format([span])).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 1 | Total Duration: 1.00s
+        "Spans: 1 | Total Duration: 1.00s
 
         [1] 2023-11-14T22:13:20.000Z llm.stream (1.00s)
             [event] token.generated
@@ -296,8 +288,7 @@ describe("JudgeSpanDigestFormatter", () => {
       });
 
       expect(formatter.format([span])).toMatchInlineSnapshot(`
-        "=== OPENTELEMETRY TRACES ===
-        Spans: 1 | Total Duration: 100ms
+        "Spans: 1 | Total Duration: 100ms
 
         [1] 2023-11-14T22:13:20.000Z test (100ms)
             relevant.attribute: should-appear
@@ -331,7 +322,7 @@ describe("JudgeSpanDigestFormatter", () => {
       expect(result).toContain(longContent);
       expect(result).toContain("[DUPLICATE - SEE ABOVE]");
       expect(result.indexOf(longContent)).toBeLessThan(
-        result.indexOf("[DUPLICATE - SEE ABOVE]"),
+        result.indexOf("[DUPLICATE - SEE ABOVE]")
       );
     });
 

diff --git a/javascript/src/agents/judge/judge-span-digest-formatter.ts b/javascript/src/agents/judge/judge-span-digest-formatter.ts
@@ -36,7 +36,7 @@ export class JudgeSpanDigestFormatter {
 
     if (spans.length === 0) {
       this.logger.debug("No spans to format");
-      return "=== OPENTELEMETRY TRACES ===\nNo spans recorded.";
+      return "No spans recorded.";
     }
 
     const sortedSpans = this.sortByStartTime(spans);
@@ -49,7 +49,6 @@ export class JudgeSpanDigestFormatter {
     });
 
     const lines: string[] = [
-      "=== OPENTELEMETRY TRACES ===",
       `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
         totalDuration
       )}`,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Review and Verify

		Re-read `AGENTS.md` and verify your work follows the guidelines.