diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index c33cad1..2b7aa61 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -7,6 +7,7 @@ import { registerAutonomyCommands } from './cli/commands/autonomy.js'; import { registerConversationCommands } from './cli/commands/conversation.js'; import { registerDispatchCommands } from './cli/commands/dispatch.js'; import { registerMcpCommands } from './cli/commands/mcp.js'; +import { registerSafetyCommands } from './cli/commands/safety.js'; import { registerTriggerCommands } from './cli/commands/trigger.js'; import { addWorkspaceOption, @@ -2214,6 +2215,12 @@ registerTriggerCommands(program, DEFAULT_ACTOR); registerConversationCommands(program, DEFAULT_ACTOR); +// ============================================================================ +// safety +// ============================================================================ + +registerSafetyCommands(program, DEFAULT_ACTOR); + // ============================================================================ // onboarding // ============================================================================ diff --git a/packages/cli/src/cli/commands/safety.ts b/packages/cli/src/cli/commands/safety.ts new file mode 100644 index 0000000..6a0f79f --- /dev/null +++ b/packages/cli/src/cli/commands/safety.ts @@ -0,0 +1,144 @@ +import { Command } from 'commander'; +import * as workgraph from '@versatly/workgraph-kernel'; +import { + addWorkspaceOption, + resolveWorkspacePath, + runCommand, +} from '../core.js'; + +export function registerSafetyCommands(program: Command, defaultActor: string): void { + const safetyCmd = program + .command('safety') + .description('Continuous operations safety rails (rate limit, circuit breaker, kill switch)'); + + addWorkspaceOption( + safetyCmd + .command('status') + .description('Show current safety configuration and runtime state') + .option('--json', 'Emit structured JSON output'), + ).action((opts) => + runCommand( + opts, + () => { + const workspacePath = resolveWorkspacePath(opts); + return workgraph.safety.getSafetyStatus(workspacePath); + }, + (result) => [ + `Blocked: ${result.blocked ? 'yes' : 'no'}`, + ...(result.reasons.length > 0 ? result.reasons.map((reason) => `Reason: ${reason}`) : []), + `Kill switch: ${result.config.killSwitch.engaged ? 'engaged' : 'released'}`, + `Rate limit: enabled=${result.config.rateLimit.enabled} window=${result.config.rateLimit.windowSeconds}s max=${result.config.rateLimit.maxOperations} used=${result.config.runtime.rateLimitOperations}`, + `Circuit breaker: enabled=${result.config.circuitBreaker.enabled} state=${result.config.runtime.circuitState} failures=${result.config.runtime.consecutiveFailures}`, + `Updated at: ${result.config.updatedAt}`, + ], + ), + ); + + addWorkspaceOption( + safetyCmd + .command('pause') + .description('Engage kill switch to pause autonomous operations') + .option('-a, --actor ', 'Actor', defaultActor) + .option('--reason ', 'Optional pause reason') + .option('--json', 'Emit structured JSON output'), + ).action((opts) => + runCommand( + opts, + () => { + const workspacePath = resolveWorkspacePath(opts); + return { + config: workgraph.safety.pauseSafetyOperations(workspacePath, opts.actor, opts.reason), + }; + }, + (result) => [ + 'Safety kill switch engaged.', + `Reason: ${String(result.config.killSwitch.reason ?? 'none')}`, + `Updated at: ${result.config.updatedAt}`, + ], + ), + ); + + addWorkspaceOption( + safetyCmd + .command('resume') + .description('Release kill switch and resume autonomous operations') + .option('-a, --actor ', 'Actor', defaultActor) + .option('--json', 'Emit structured JSON output'), + ).action((opts) => + runCommand( + opts, + () => { + const workspacePath = resolveWorkspacePath(opts); + return { + config: workgraph.safety.resumeSafetyOperations(workspacePath, opts.actor), + }; + }, + (result) => [ + 'Safety kill switch released.', + `Updated at: ${result.config.updatedAt}`, + ], + ), + ); + + addWorkspaceOption( + safetyCmd + .command('reset') + .description('Reset safety runtime counters and circuit state') + .option('-a, --actor ', 'Actor', defaultActor) + .option('--full', 'Also clear kill switch state') + .option('--json', 'Emit structured JSON output'), + ).action((opts) => + runCommand( + opts, + () => { + const workspacePath = resolveWorkspacePath(opts); + return { + config: workgraph.safety.resetSafetyRails(workspacePath, { + actor: opts.actor, + clearKillSwitch: !!opts.full, + }), + }; + }, + (result) => [ + 'Safety runtime reset complete.', + `Circuit state: ${result.config.runtime.circuitState}`, + `Rate limit used: ${result.config.runtime.rateLimitOperations}`, + `Kill switch: ${result.config.killSwitch.engaged ? 'engaged' : 'released'}`, + ], + ), + ); + + addWorkspaceOption( + safetyCmd + .command('log') + .description('Show recent safety events from ledger') + .option('--count ', 'Number of entries', '20') + .option('--json', 'Emit structured JSON output'), + ).action((opts) => + runCommand( + opts, + () => { + const workspacePath = resolveWorkspacePath(opts); + const parsedCount = Number.parseInt(String(opts.count), 10); + const count = Number.isFinite(parsedCount) ? Math.max(0, parsedCount) : 20; + return { + entries: workgraph.safety.listSafetyEvents(workspacePath, { count }), + count, + }; + }, + (result) => { + if (result.entries.length === 0) return ['No safety events found.']; + return result.entries.map((entry) => { + const eventName = readEventName(entry); + return `${entry.ts} ${eventName} actor=${entry.actor}`; + }); + }, + ), + ); +} + +function readEventName(entry: workgraph.LedgerEntry): string { + const data = entry.data as Record | undefined; + const event = data?.event; + return typeof event === 'string' && event.trim().length > 0 ? event : 'safety.unknown'; +} diff --git a/packages/kernel/src/index.ts b/packages/kernel/src/index.ts index f00a014..7a5d60b 100644 --- a/packages/kernel/src/index.ts +++ b/packages/kernel/src/index.ts @@ -39,6 +39,7 @@ export * as triggerEngine from './trigger-engine.js'; export * as trigger from './trigger.js'; export * as autonomy from './autonomy.js'; export * as autonomyDaemon from './autonomy-daemon.js'; +export * as safety from './safety.js'; export * as commandCenter from './command-center.js'; export * as board from './board.js'; export * as agent from './agent.js'; diff --git a/packages/kernel/src/safety.test.ts b/packages/kernel/src/safety.test.ts new file mode 100644 index 0000000..b23100b --- /dev/null +++ b/packages/kernel/src/safety.test.ts @@ -0,0 +1,248 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { + SAFETY_CONFIG_FILE, + ensureSafetyConfig, + evaluateSafety, + getSafetyStatus, + listSafetyEvents, + pauseSafetyOperations, + recordOperationOutcome, + resetSafetyRails, + resumeSafetyOperations, + runWithSafetyRails, + updateSafetyConfig, + loadSafetyConfig, +} from './safety.js'; + +let workspacePath: string; + +beforeEach(() => { + workspacePath = fs.mkdtempSync(path.join(os.tmpdir(), 'wg-safety-')); +}); + +afterEach(() => { + fs.rmSync(workspacePath, { recursive: true, force: true }); +}); + +describe('safety rails', () => { + it('creates default .workgraph/safety.yaml when missing', () => { + const config = ensureSafetyConfig(workspacePath); + expect(fs.existsSync(path.join(workspacePath, SAFETY_CONFIG_FILE))).toBe(true); + expect(config.rateLimit.enabled).toBe(true); + expect(config.circuitBreaker.enabled).toBe(true); + expect(config.killSwitch.engaged).toBe(false); + }); + + it('blocks operations when rate limit is exceeded and unblocks after window reset', () => { + updateSafetyConfig(workspacePath, 'ops-admin', { + rateLimit: { + enabled: true, + maxOperations: 2, + windowSeconds: 60, + }, + circuitBreaker: { + enabled: false, + }, + }); + + const baseNow = new Date('2026-03-06T10:00:00.000Z'); + const first = evaluateSafety(workspacePath, { + actor: 'auto-1', + operation: 'autonomy.cycle', + now: baseNow, + consume: true, + }); + const second = evaluateSafety(workspacePath, { + actor: 'auto-1', + operation: 'autonomy.cycle', + now: baseNow, + consume: true, + }); + const blocked = evaluateSafety(workspacePath, { + actor: 'auto-1', + operation: 'autonomy.cycle', + now: baseNow, + consume: true, + }); + const afterWindow = evaluateSafety(workspacePath, { + actor: 'auto-1', + operation: 'autonomy.cycle', + now: new Date(baseNow.getTime() + 61_000), + consume: true, + }); + + expect(first.allowed).toBe(true); + expect(second.allowed).toBe(true); + expect(blocked.allowed).toBe(false); + expect(blocked.reasons.join(' ')).toContain('Rate limit exceeded'); + expect(afterWindow.allowed).toBe(true); + }); + + it('opens circuit breaker after repeated failures and closes after cooldown + successful probe', () => { + updateSafetyConfig(workspacePath, 'ops-admin', { + rateLimit: { + enabled: false, + }, + circuitBreaker: { + enabled: true, + failureThreshold: 2, + cooldownSeconds: 30, + halfOpenMaxOperations: 1, + }, + }); + + const t0 = new Date('2026-03-06T11:00:00.000Z'); + evaluateSafety(workspacePath, { actor: 'auto-2', operation: 'autonomy.run', now: t0, consume: true }); + recordOperationOutcome(workspacePath, { + actor: 'auto-2', + operation: 'autonomy.run', + success: false, + error: 'first failure', + now: t0, + }); + + evaluateSafety(workspacePath, { actor: 'auto-2', operation: 'autonomy.run', now: t0, consume: true }); + recordOperationOutcome(workspacePath, { + actor: 'auto-2', + operation: 'autonomy.run', + success: false, + error: 'second failure', + now: t0, + }); + + const blocked = evaluateSafety(workspacePath, { + actor: 'auto-2', + operation: 'autonomy.run', + now: new Date(t0.getTime() + 1_000), + consume: true, + }); + expect(blocked.allowed).toBe(false); + expect(blocked.reasons.join(' ')).toContain('Circuit breaker open'); + + const probeTime = new Date(t0.getTime() + 31_000); + const probe = evaluateSafety(workspacePath, { + actor: 'auto-2', + operation: 'autonomy.run', + now: probeTime, + consume: true, + }); + expect(probe.allowed).toBe(true); + + recordOperationOutcome(workspacePath, { + actor: 'auto-2', + operation: 'autonomy.run', + success: true, + now: probeTime, + }); + + const status = getSafetyStatus(workspacePath, new Date(t0.getTime() + 32_000)); + expect(status.config.runtime.circuitState).toBe('closed'); + }); + + it('enforces kill switch pause/resume and writes ledger events', () => { + pauseSafetyOperations(workspacePath, 'ops-admin', 'manual incident response'); + const pausedDecision = evaluateSafety(workspacePath, { + actor: 'auto-3', + operation: 'autonomy.run', + consume: false, + }); + expect(pausedDecision.allowed).toBe(false); + expect(pausedDecision.reasons.join(' ')).toContain('Kill switch engaged'); + + resumeSafetyOperations(workspacePath, 'ops-admin'); + const resumedDecision = evaluateSafety(workspacePath, { + actor: 'auto-3', + operation: 'autonomy.run', + consume: false, + }); + expect(resumedDecision.allowed).toBe(true); + + const events = listSafetyEvents(workspacePath, { count: 10 }); + const eventNames = events.map((entry) => String(entry.data?.event ?? '')); + expect(eventNames).toContain('safety.kill_switch.engaged'); + expect(eventNames).toContain('safety.kill_switch.released'); + }); + + it('resets runtime counters and can clear kill switch', () => { + updateSafetyConfig(workspacePath, 'ops-admin', { + rateLimit: { + enabled: true, + maxOperations: 1, + windowSeconds: 600, + }, + circuitBreaker: { + enabled: true, + failureThreshold: 1, + cooldownSeconds: 600, + halfOpenMaxOperations: 1, + }, + }); + pauseSafetyOperations(workspacePath, 'ops-admin', 'maintenance'); + + evaluateSafety(workspacePath, { + actor: 'auto-4', + operation: 'autonomy.run', + consume: true, + }); + recordOperationOutcome(workspacePath, { + actor: 'auto-4', + operation: 'autonomy.run', + success: false, + error: 'failure before reset', + }); + + const reset = resetSafetyRails(workspacePath, { + actor: 'ops-admin', + clearKillSwitch: true, + }); + expect(reset.runtime.consecutiveFailures).toBe(0); + expect(reset.runtime.circuitState).toBe('closed'); + expect(reset.runtime.rateLimitOperations).toBe(0); + expect(reset.killSwitch.engaged).toBe(false); + }); + + it('guards operation execution via runWithSafetyRails', async () => { + updateSafetyConfig(workspacePath, 'ops-admin', { + rateLimit: { + enabled: false, + }, + circuitBreaker: { + enabled: true, + failureThreshold: 1, + cooldownSeconds: 120, + halfOpenMaxOperations: 1, + }, + }); + + await expect(runWithSafetyRails( + workspacePath, + { + actor: 'auto-5', + operation: 'autonomy.dispatch', + }, + () => { + throw new Error('adapter failed'); + }, + )).rejects.toThrow('adapter failed'); + + let invoked = false; + await expect(runWithSafetyRails( + workspacePath, + { + actor: 'auto-5', + operation: 'autonomy.dispatch', + }, + () => { + invoked = true; + return 'ok'; + }, + )).rejects.toThrow('Safety rails blocked'); + expect(invoked).toBe(false); + + const config = loadSafetyConfig(workspacePath); + expect(config.runtime.circuitState).toBe('open'); + }); +}); diff --git a/packages/kernel/src/safety.ts b/packages/kernel/src/safety.ts new file mode 100644 index 0000000..0c0dad7 --- /dev/null +++ b/packages/kernel/src/safety.ts @@ -0,0 +1,710 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import YAML from 'yaml'; +import * as ledger from './ledger.js'; +import type { LedgerEntry } from './types.js'; + +export const SAFETY_CONFIG_FILE = '.workgraph/safety.yaml'; +const SAFETY_LEDGER_TARGET = SAFETY_CONFIG_FILE; +const SAFETY_LEDGER_TYPE = 'safety'; +const SAFETY_VERSION = 1; +const DEFAULT_ACTOR = 'system:safety'; + +export type WorkgraphSafetyCircuitState = 'closed' | 'open' | 'half-open'; + +export interface WorkgraphSafetyRateLimitConfig { + enabled: boolean; + windowSeconds: number; + maxOperations: number; +} + +export interface WorkgraphSafetyCircuitBreakerConfig { + enabled: boolean; + failureThreshold: number; + cooldownSeconds: number; + halfOpenMaxOperations: number; +} + +export interface WorkgraphSafetyKillSwitchConfig { + engaged: boolean; + reason?: string; + engagedAt?: string; + engagedBy?: string; +} + +export interface WorkgraphSafetyRuntimeState { + rateLimitWindowStartedAt: string; + rateLimitOperations: number; + circuitState: WorkgraphSafetyCircuitState; + consecutiveFailures: number; + openedAt?: string; + halfOpenOperations: number; + lastFailureAt?: string; + lastFailureReason?: string; +} + +export interface WorkgraphSafetyConfig { + version: number; + updatedAt: string; + rateLimit: WorkgraphSafetyRateLimitConfig; + circuitBreaker: WorkgraphSafetyCircuitBreakerConfig; + killSwitch: WorkgraphSafetyKillSwitchConfig; + runtime: WorkgraphSafetyRuntimeState; +} + +export interface WorkgraphSafetyConfigPatch { + rateLimit?: Partial; + circuitBreaker?: Partial; +} + +export interface WorkgraphSafetyEvaluateOptions { + actor: string; + operation: string; + now?: Date; + consume?: boolean; + logAllowed?: boolean; +} + +export interface WorkgraphSafetyDecision { + allowed: boolean; + reasons: string[]; + config: WorkgraphSafetyConfig; + cooldownRemainingSeconds: number; + windowRemainingSeconds: number; +} + +export interface WorkgraphSafetyOutcomeOptions { + actor: string; + operation: string; + success: boolean; + error?: string; + now?: Date; +} + +export interface WorkgraphSafetyResetOptions { + actor: string; + clearKillSwitch?: boolean; +} + +export interface WorkgraphSafetyStatus { + blocked: boolean; + reasons: string[]; + config: WorkgraphSafetyConfig; + cooldownRemainingSeconds: number; + windowRemainingSeconds: number; +} + +export interface WorkgraphSafetyEventQueryOptions { + count?: number; +} + +interface WorkgraphSafetyEvaluationSnapshot { + reasons: string[]; + cooldownRemainingSeconds: number; + windowRemainingSeconds: number; +} + +export function safetyConfigPath(workspacePath: string): string { + return path.join(workspacePath, SAFETY_CONFIG_FILE); +} + +export function ensureSafetyConfig(workspacePath: string): WorkgraphSafetyConfig { + const targetPath = safetyConfigPath(workspacePath); + if (fs.existsSync(targetPath)) { + return loadSafetyConfig(workspacePath); + } + const nowIso = new Date().toISOString(); + const created = buildDefaultSafetyConfig(nowIso); + writeSafetyConfig(workspacePath, created); + return created; +} + +export function loadSafetyConfig(workspacePath: string): WorkgraphSafetyConfig { + const targetPath = safetyConfigPath(workspacePath); + if (!fs.existsSync(targetPath)) { + return ensureSafetyConfig(workspacePath); + } + let parsed: unknown; + try { + parsed = YAML.parse(fs.readFileSync(targetPath, 'utf-8')); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Failed to parse ${SAFETY_CONFIG_FILE}: ${message}`); + } + return normalizeSafetyConfig(parsed, new Date().toISOString()); +} + +export function updateSafetyConfig( + workspacePath: string, + actor: string, + patch: WorkgraphSafetyConfigPatch, +): WorkgraphSafetyConfig { + const nowIso = new Date().toISOString(); + const current = loadSafetyConfig(workspacePath); + const merged = normalizeSafetyConfig({ + ...current, + rateLimit: { + ...current.rateLimit, + ...patch.rateLimit, + }, + circuitBreaker: { + ...current.circuitBreaker, + ...patch.circuitBreaker, + }, + updatedAt: nowIso, + }, nowIso); + writeSafetyConfig(workspacePath, merged); + appendSafetyEvent(workspacePath, actor, 'safety.config.updated', { + rateLimit: merged.rateLimit, + circuitBreaker: merged.circuitBreaker, + }); + return merged; +} + +export function pauseSafetyOperations( + workspacePath: string, + actor: string, + reason?: string, +): WorkgraphSafetyConfig { + const nowIso = new Date().toISOString(); + const config = loadSafetyConfig(workspacePath); + config.killSwitch.engaged = true; + config.killSwitch.engagedAt = nowIso; + config.killSwitch.engagedBy = normalizeActor(actor); + config.killSwitch.reason = normalizeOptionalString(reason) ?? 'Paused manually'; + config.updatedAt = nowIso; + writeSafetyConfig(workspacePath, config); + appendSafetyEvent(workspacePath, actor, 'safety.kill_switch.engaged', { + reason: config.killSwitch.reason, + }); + return config; +} + +export function resumeSafetyOperations(workspacePath: string, actor: string): WorkgraphSafetyConfig { + const nowIso = new Date().toISOString(); + const config = loadSafetyConfig(workspacePath); + config.killSwitch.engaged = false; + config.killSwitch.reason = undefined; + config.killSwitch.engagedAt = undefined; + config.killSwitch.engagedBy = undefined; + config.updatedAt = nowIso; + writeSafetyConfig(workspacePath, config); + appendSafetyEvent(workspacePath, actor, 'safety.kill_switch.released'); + return config; +} + +export function resetSafetyRails( + workspacePath: string, + options: WorkgraphSafetyResetOptions, +): WorkgraphSafetyConfig { + const nowIso = new Date().toISOString(); + const config = loadSafetyConfig(workspacePath); + config.runtime = buildDefaultRuntimeState(nowIso); + if (options.clearKillSwitch) { + config.killSwitch = { engaged: false }; + } + config.updatedAt = nowIso; + writeSafetyConfig(workspacePath, config); + appendSafetyEvent(workspacePath, options.actor, 'safety.reset', { + clearKillSwitch: options.clearKillSwitch === true, + }); + return config; +} + +export function evaluateSafety( + workspacePath: string, + options: WorkgraphSafetyEvaluateOptions, +): WorkgraphSafetyDecision { + const actor = normalizeActor(options.actor); + const operation = normalizeOperation(options.operation); + const now = options.now ?? new Date(); + const nowIso = now.toISOString(); + const nowMs = now.getTime(); + + const config = loadSafetyConfig(workspacePath); + const transitionEvents: string[] = []; + let changed = applyTimeBasedTransitions(config, nowMs, nowIso, transitionEvents); + const snapshot = evaluateConfigSnapshot(config, nowMs); + if (snapshot.reasons.length === 0 && options.consume !== false) { + if (config.rateLimit.enabled) { + config.runtime.rateLimitOperations += 1; + } + if (config.circuitBreaker.enabled && config.runtime.circuitState === 'half-open') { + config.runtime.halfOpenOperations += 1; + } + config.updatedAt = nowIso; + changed = true; + } + + if (changed) { + writeSafetyConfig(workspacePath, config); + } + + for (const eventName of transitionEvents) { + appendSafetyEvent(workspacePath, actor, eventName, { + operation, + }); + } + + if (snapshot.reasons.length > 0) { + appendSafetyEvent(workspacePath, actor, 'safety.blocked', { + operation, + reasons: snapshot.reasons, + circuitState: config.runtime.circuitState, + rateLimitOperations: config.runtime.rateLimitOperations, + }); + } else if (options.logAllowed === true) { + appendSafetyEvent(workspacePath, actor, 'safety.allowed', { + operation, + }); + } + + return { + allowed: snapshot.reasons.length === 0, + reasons: snapshot.reasons, + cooldownRemainingSeconds: snapshot.cooldownRemainingSeconds, + windowRemainingSeconds: snapshot.windowRemainingSeconds, + config, + }; +} + +export function recordOperationOutcome( + workspacePath: string, + options: WorkgraphSafetyOutcomeOptions, +): WorkgraphSafetyConfig { + const actor = normalizeActor(options.actor); + const operation = normalizeOperation(options.operation); + const now = options.now ?? new Date(); + const nowIso = now.toISOString(); + const nowMs = now.getTime(); + + const config = loadSafetyConfig(workspacePath); + const transitionEvents: string[] = []; + let changed = applyTimeBasedTransitions(config, nowMs, nowIso, transitionEvents); + + if (options.success) { + const hadFailures = config.runtime.consecutiveFailures > 0 + || !!config.runtime.lastFailureAt + || !!config.runtime.lastFailureReason; + if (hadFailures) { + config.runtime.consecutiveFailures = 0; + config.runtime.lastFailureAt = undefined; + config.runtime.lastFailureReason = undefined; + changed = true; + } + if (config.runtime.circuitState !== 'closed') { + config.runtime.circuitState = 'closed'; + config.runtime.openedAt = undefined; + config.runtime.halfOpenOperations = 0; + transitionEvents.push('safety.circuit.closed'); + changed = true; + } + } else { + config.runtime.lastFailureAt = nowIso; + config.runtime.lastFailureReason = normalizeOptionalString(options.error); + changed = true; + if (config.circuitBreaker.enabled) { + if (config.runtime.circuitState === 'half-open') { + config.runtime.circuitState = 'open'; + config.runtime.openedAt = nowIso; + config.runtime.halfOpenOperations = 0; + config.runtime.consecutiveFailures = config.circuitBreaker.failureThreshold; + transitionEvents.push('safety.circuit.opened'); + } else { + config.runtime.consecutiveFailures += 1; + if (config.runtime.consecutiveFailures >= config.circuitBreaker.failureThreshold) { + if (config.runtime.circuitState !== 'open') { + transitionEvents.push('safety.circuit.opened'); + } + config.runtime.circuitState = 'open'; + config.runtime.openedAt = nowIso; + config.runtime.halfOpenOperations = 0; + } + } + } else { + config.runtime.circuitState = 'closed'; + config.runtime.openedAt = undefined; + config.runtime.halfOpenOperations = 0; + } + } + + if (changed) { + config.updatedAt = nowIso; + writeSafetyConfig(workspacePath, config); + } + + for (const eventName of transitionEvents) { + appendSafetyEvent(workspacePath, actor, eventName, { + operation, + consecutiveFailures: config.runtime.consecutiveFailures, + }); + } + + appendSafetyEvent( + workspacePath, + actor, + options.success ? 'safety.operation.succeeded' : 'safety.operation.failed', + { + operation, + ...(options.error ? { error: options.error } : {}), + }, + ); + + return config; +} + +export async function runWithSafetyRails( + workspacePath: string, + options: Omit, + operation: () => Promise | T, +): Promise { + const decision = evaluateSafety(workspacePath, { + ...options, + consume: true, + }); + if (!decision.allowed) { + throw new Error(`Safety rails blocked "${options.operation}": ${decision.reasons.join('; ')}`); + } + try { + const result = await operation(); + recordOperationOutcome(workspacePath, { + actor: options.actor, + operation: options.operation, + success: true, + }); + return result; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + recordOperationOutcome(workspacePath, { + actor: options.actor, + operation: options.operation, + success: false, + error: message, + }); + throw error; + } +} + +export function getSafetyStatus(workspacePath: string, now: Date = new Date()): WorkgraphSafetyStatus { + const snapshotConfig = cloneSafetyConfig(loadSafetyConfig(workspacePath)); + applyTimeBasedTransitions(snapshotConfig, now.getTime(), now.toISOString(), []); + const snapshot = evaluateConfigSnapshot(snapshotConfig, now.getTime()); + return { + blocked: snapshot.reasons.length > 0, + reasons: snapshot.reasons, + cooldownRemainingSeconds: snapshot.cooldownRemainingSeconds, + windowRemainingSeconds: snapshot.windowRemainingSeconds, + config: snapshotConfig, + }; +} + +export function listSafetyEvents( + workspacePath: string, + options: WorkgraphSafetyEventQueryOptions = {}, +): LedgerEntry[] { + const allSafetyEntries = ledger.readAll(workspacePath).filter((entry) => isSafetyEntry(entry)); + const count = normalizeNonNegativeInt(options.count, 20); + if (count === 0) return []; + return allSafetyEntries.slice(-count); +} + +function writeSafetyConfig(workspacePath: string, config: WorkgraphSafetyConfig): void { + const targetPath = safetyConfigPath(workspacePath); + const directory = path.dirname(targetPath); + if (!fs.existsSync(directory)) { + fs.mkdirSync(directory, { recursive: true }); + } + fs.writeFileSync(targetPath, YAML.stringify(config), 'utf-8'); +} + +function appendSafetyEvent( + workspacePath: string, + actor: string, + event: string, + data: Record = {}, +): LedgerEntry { + return ledger.append( + workspacePath, + normalizeActor(actor), + 'update', + SAFETY_LEDGER_TARGET, + SAFETY_LEDGER_TYPE, + { + event, + ...data, + }, + ); +} + +function applyTimeBasedTransitions( + config: WorkgraphSafetyConfig, + nowMs: number, + nowIso: string, + eventNames: string[], +): boolean { + let changed = false; + + if (config.rateLimit.enabled) { + const windowStartMs = parseTimestamp(config.runtime.rateLimitWindowStartedAt); + const elapsedMs = windowStartMs === null ? Number.POSITIVE_INFINITY : nowMs - windowStartMs; + if (!Number.isFinite(elapsedMs) || elapsedMs < 0 || elapsedMs >= (config.rateLimit.windowSeconds * 1000)) { + config.runtime.rateLimitWindowStartedAt = nowIso; + config.runtime.rateLimitOperations = 0; + eventNames.push('safety.rate_limit.window_reset'); + changed = true; + } + } else if (config.runtime.rateLimitOperations !== 0) { + config.runtime.rateLimitOperations = 0; + changed = true; + } + + if (!config.circuitBreaker.enabled) { + if ( + config.runtime.circuitState !== 'closed' + || config.runtime.openedAt !== undefined + || config.runtime.halfOpenOperations !== 0 + || config.runtime.consecutiveFailures !== 0 + ) { + config.runtime.circuitState = 'closed'; + config.runtime.openedAt = undefined; + config.runtime.halfOpenOperations = 0; + config.runtime.consecutiveFailures = 0; + changed = true; + } + } else if (config.runtime.circuitState === 'open') { + const openedAtMs = parseTimestamp(config.runtime.openedAt); + const elapsedMs = openedAtMs === null ? Number.POSITIVE_INFINITY : nowMs - openedAtMs; + if (!Number.isFinite(elapsedMs) || elapsedMs < 0 || elapsedMs >= (config.circuitBreaker.cooldownSeconds * 1000)) { + config.runtime.circuitState = 'half-open'; + config.runtime.halfOpenOperations = 0; + config.runtime.openedAt = undefined; + eventNames.push('safety.circuit.half_open'); + changed = true; + } + } + + if (changed) { + config.updatedAt = nowIso; + } + return changed; +} + +function evaluateConfigSnapshot( + config: WorkgraphSafetyConfig, + nowMs: number, +): WorkgraphSafetyEvaluationSnapshot { + const reasons: string[] = []; + let cooldownRemainingSeconds = 0; + let windowRemainingSeconds = 0; + + if (config.killSwitch.engaged) { + const reason = config.killSwitch.reason ?? 'Kill switch engaged'; + reasons.push(`Kill switch engaged: ${reason}`); + } + + if (config.circuitBreaker.enabled) { + if (config.runtime.circuitState === 'open') { + const openedAtMs = parseTimestamp(config.runtime.openedAt); + if (openedAtMs !== null) { + const elapsedSeconds = Math.floor((nowMs - openedAtMs) / 1000); + cooldownRemainingSeconds = Math.max(0, config.circuitBreaker.cooldownSeconds - elapsedSeconds); + } + reasons.push( + cooldownRemainingSeconds > 0 + ? `Circuit breaker open (${cooldownRemainingSeconds}s cooldown remaining)` + : 'Circuit breaker open', + ); + } else if ( + config.runtime.circuitState === 'half-open' + && config.runtime.halfOpenOperations >= config.circuitBreaker.halfOpenMaxOperations + ) { + reasons.push('Circuit breaker half-open probe limit reached'); + } + } + + if (config.rateLimit.enabled) { + const windowStartMs = parseTimestamp(config.runtime.rateLimitWindowStartedAt); + if (windowStartMs !== null) { + const elapsedSeconds = Math.floor((nowMs - windowStartMs) / 1000); + windowRemainingSeconds = Math.max(0, config.rateLimit.windowSeconds - elapsedSeconds); + } + if (config.runtime.rateLimitOperations >= config.rateLimit.maxOperations) { + reasons.push( + `Rate limit exceeded (${config.runtime.rateLimitOperations}/${config.rateLimit.maxOperations})`, + ); + } + } + + return { + reasons, + cooldownRemainingSeconds, + windowRemainingSeconds, + }; +} + +function buildDefaultSafetyConfig(nowIso: string): WorkgraphSafetyConfig { + return { + version: SAFETY_VERSION, + updatedAt: nowIso, + rateLimit: { + enabled: true, + windowSeconds: 60, + maxOperations: 120, + }, + circuitBreaker: { + enabled: true, + failureThreshold: 3, + cooldownSeconds: 120, + halfOpenMaxOperations: 1, + }, + killSwitch: { + engaged: false, + }, + runtime: buildDefaultRuntimeState(nowIso), + }; +} + +function buildDefaultRuntimeState(nowIso: string): WorkgraphSafetyRuntimeState { + return { + rateLimitWindowStartedAt: nowIso, + rateLimitOperations: 0, + circuitState: 'closed', + consecutiveFailures: 0, + halfOpenOperations: 0, + }; +} + +function normalizeSafetyConfig(input: unknown, nowIso: string): WorkgraphSafetyConfig { + const defaults = buildDefaultSafetyConfig(nowIso); + const source = asRecord(input); + const rateLimitInput = asRecord(source.rateLimit); + const circuitInput = asRecord(source.circuitBreaker); + const killSwitchInput = asRecord(source.killSwitch); + const runtimeInput = asRecord(source.runtime); + + const rateLimit: WorkgraphSafetyRateLimitConfig = { + enabled: readBoolean(rateLimitInput.enabled) ?? defaults.rateLimit.enabled, + windowSeconds: normalizePositiveInt(rateLimitInput.windowSeconds, defaults.rateLimit.windowSeconds), + maxOperations: normalizePositiveInt(rateLimitInput.maxOperations, defaults.rateLimit.maxOperations), + }; + + const circuitBreaker: WorkgraphSafetyCircuitBreakerConfig = { + enabled: readBoolean(circuitInput.enabled) ?? defaults.circuitBreaker.enabled, + failureThreshold: normalizePositiveInt(circuitInput.failureThreshold, defaults.circuitBreaker.failureThreshold), + cooldownSeconds: normalizePositiveInt(circuitInput.cooldownSeconds, defaults.circuitBreaker.cooldownSeconds), + halfOpenMaxOperations: normalizePositiveInt( + circuitInput.halfOpenMaxOperations, + defaults.circuitBreaker.halfOpenMaxOperations, + ), + }; + + const killSwitch: WorkgraphSafetyKillSwitchConfig = { + engaged: readBoolean(killSwitchInput.engaged) ?? defaults.killSwitch.engaged, + reason: normalizeOptionalString(killSwitchInput.reason), + engagedAt: normalizeOptionalString(killSwitchInput.engagedAt), + engagedBy: normalizeOptionalString(killSwitchInput.engagedBy), + }; + if (!killSwitch.engaged) { + killSwitch.reason = undefined; + killSwitch.engagedAt = undefined; + killSwitch.engagedBy = undefined; + } + + const circuitState = normalizeCircuitState(runtimeInput.circuitState) ?? defaults.runtime.circuitState; + const runtime: WorkgraphSafetyRuntimeState = { + rateLimitWindowStartedAt: normalizeOptionalString(runtimeInput.rateLimitWindowStartedAt) ?? nowIso, + rateLimitOperations: normalizeNonNegativeInt(runtimeInput.rateLimitOperations, 0), + circuitState, + consecutiveFailures: normalizeNonNegativeInt(runtimeInput.consecutiveFailures, 0), + openedAt: normalizeOptionalString(runtimeInput.openedAt), + halfOpenOperations: normalizeNonNegativeInt(runtimeInput.halfOpenOperations, 0), + lastFailureAt: normalizeOptionalString(runtimeInput.lastFailureAt), + lastFailureReason: normalizeOptionalString(runtimeInput.lastFailureReason), + }; + if (runtime.circuitState !== 'open') runtime.openedAt = undefined; + if (runtime.circuitState === 'closed') runtime.halfOpenOperations = 0; + + return { + version: normalizePositiveInt(source.version, defaults.version), + updatedAt: normalizeOptionalString(source.updatedAt) ?? nowIso, + rateLimit, + circuitBreaker, + killSwitch, + runtime, + }; +} + +function isSafetyEntry(entry: LedgerEntry): boolean { + if (entry.type === SAFETY_LEDGER_TYPE) return true; + if (entry.target === SAFETY_LEDGER_TARGET) return true; + const data = asRecord(entry.data); + const event = normalizeOptionalString(data.event); + return event?.startsWith('safety.') ?? false; +} + +function parseTimestamp(value: string | undefined): number | null { + if (!value) return null; + const ms = Date.parse(value); + return Number.isFinite(ms) ? ms : null; +} + +function normalizeCircuitState(value: unknown): WorkgraphSafetyCircuitState | undefined { + const normalized = normalizeOptionalString(value)?.toLowerCase(); + if (normalized === 'closed' || normalized === 'open' || normalized === 'half-open') { + return normalized; + } + return undefined; +} + +function cloneSafetyConfig(config: WorkgraphSafetyConfig): WorkgraphSafetyConfig { + return JSON.parse(JSON.stringify(config)) as WorkgraphSafetyConfig; +} + +function normalizeOperation(operation: string): string { + const normalized = normalizeOptionalString(operation); + return normalized ?? 'autonomy.operation'; +} + +function normalizeActor(actor: string): string { + const normalized = normalizeOptionalString(actor); + return normalized ?? DEFAULT_ACTOR; +} + +function normalizeOptionalString(value: unknown): string | undefined { + if (typeof value !== 'string') return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function asRecord(value: unknown): Record { + if (!value || typeof value !== 'object' || Array.isArray(value)) return {}; + return value as Record; +} + +function readBoolean(value: unknown): boolean | undefined { + if (typeof value === 'boolean') return value; + if (typeof value === 'string') { + const normalized = value.trim().toLowerCase(); + if (normalized === 'true' || normalized === '1' || normalized === 'yes') return true; + if (normalized === 'false' || normalized === '0' || normalized === 'no') return false; + } + return undefined; +} + +function normalizePositiveInt(value: unknown, fallback: number): number { + if (typeof value === 'number' && Number.isInteger(value) && value > 0) return value; + if (typeof value === 'string') { + const parsed = Number.parseInt(value, 10); + if (Number.isInteger(parsed) && parsed > 0) return parsed; + } + return fallback; +} + +function normalizeNonNegativeInt(value: unknown, fallback: number): number { + if (typeof value === 'number' && Number.isInteger(value) && value >= 0) return value; + if (typeof value === 'string') { + const parsed = Number.parseInt(value, 10); + if (Number.isInteger(parsed) && parsed >= 0) return parsed; + } + return fallback; +}